29 files changed, 8807 insertions, 0 deletions
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
new file mode 100644
index 000000000..642769e7c
--- /dev/null
+++ b/pkg/sentry/vfs/BUILD
@@ -0,0 +1,100 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+    name = "epoll_interest_list",
+    out = "epoll_interest_list.go",
+    package = "vfs",
+    prefix = "epollInterest",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*epollInterest",
+        "Linker": "*epollInterest",
+    },
+)
+
+go_template_instance(
+    name = "event_list",
+    out = "event_list.go",
+    package = "vfs",
+    prefix = "event",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*Event",
+        "Linker": "*Event",
+    },
+)
+
+go_library(
+    name = "vfs",
+    srcs = [
+        "anonfs.go",
+        "context.go",
+        "debug.go",
+        "dentry.go",
+        "device.go",
+        "epoll.go",
+        "epoll_interest_list.go",
+        "event_list.go",
+        "file_description.go",
+        "file_description_impl_util.go",
+        "filesystem.go",
+        "filesystem_impl_util.go",
+        "filesystem_type.go",
+        "inotify.go",
+        "lock.go",
+        "mount.go",
+        "mount_unsafe.go",
+        "options.go",
+        "pathname.go",
+        "permissions.go",
+        "resolving_path.go",
+        "vfs.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/binary",
+        "//pkg/context",
+        "//pkg/fd",
+        "//pkg/fdnotifier",
+        "//pkg/fspath",
+        "//pkg/gohacks",
+        "//pkg/log",
+        "//pkg/safemem",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/uniqueid",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+go_test(
+    name = "vfs_test",
+    size = "small",
+    srcs = [
+        "file_description_impl_util_test.go",
+        "mount_test.go",
+    ],
+    library = ":vfs",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/sentry/contexttest",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
new file mode 100644
index 000000000..4b9faf2ea
--- /dev/null
+++ b/pkg/sentry/vfs/README.md
@@ -0,0 +1,195 @@
+# The gVisor Virtual Filesystem
+
+THIS PACKAGE IS CURRENTLY EXPERIMENTAL AND NOT READY OR ENABLED FOR PRODUCTION
+USE. For the filesystem implementation currently used by gVisor, see the `fs`
+package.
+
+## Implementation Notes
+
+### Reference Counting
+
+Filesystem, Dentry, Mount, MountNamespace, and FileDescription are all
+reference-counted. Mount and MountNamespace are exclusively VFS-managed; when
+their reference count reaches zero, VFS releases their resources. Filesystem and
+FileDescription management is shared between VFS and filesystem implementations;
+when their reference count reaches zero, VFS notifies the implementation by
+calling `FilesystemImpl.Release()` or `FileDescriptionImpl.Release()`
+respectively and then releases VFS-owned resources. Dentries are exclusively
+managed by filesystem implementations; reference count changes are abstracted
+through DentryImpl, which should release resources when reference count reaches
+zero.
+
+Filesystem references are held by:
+
+-   Mount: Each referenced Mount holds a reference on the mounted Filesystem.
+
+Dentry references are held by:
+
+-   FileDescription: Each referenced FileDescription holds a reference on the
+    Dentry through which it was opened, via `FileDescription.vd.dentry`.
+
+-   Mount: Each referenced Mount holds a reference on its mount point and on the
+    mounted filesystem root. The mount point is mutable (`mount(MS_MOVE)`).
+
+Mount references are held by:
+
+-   FileDescription: Each referenced FileDescription holds a reference on the
+    Mount on which it was opened, via `FileDescription.vd.mount`.
+
+-   Mount: Each referenced Mount holds a reference on its parent, which is the
+    mount containing its mount point.
+
+-   VirtualFilesystem: A reference is held on each Mount that has been connected
+    to a mount point, but not yet unmounted.
+
+MountNamespace and FileDescription references are held by users of VFS. The
+expectation is that each `kernel.Task` holds a reference on its corresponding
+MountNamespace, and each file descriptor holds a reference on its represented
+FileDescription.
+
+Notes:
+
+-   Dentries do not hold a reference on their owning Filesystem. Instead, all
+    uses of a Dentry occur in the context of a Mount, which holds a reference on
+    the relevant Filesystem (see e.g. the VirtualDentry type). As a corollary,
+    when releasing references on both a Dentry and its corresponding Mount, the
+    Dentry's reference must be released first (because releasing the Mount's
+    reference may release the last reference on the Filesystem, whose state may
+    be required to release the Dentry reference).
+
+### The Inheritance Pattern
+
+Filesystem, Dentry, and FileDescription are all concepts featuring both state
+that must be shared between VFS and filesystem implementations, and operations
+that are implementation-defined. To facilitate this, each of these three
+concepts follows the same pattern, shown below for Dentry:
+
+```go
+// Dentry represents a node in a filesystem tree.
+type Dentry struct {
+  // VFS-required dentry state.
+  parent *Dentry
+  // ...
+
+  // impl is the DentryImpl associated with this Dentry. impl is immutable.
+  // This should be the last field in Dentry.
+  impl DentryImpl
+}
+
+// Init must be called before first use of d.
+func (d *Dentry) Init(impl DentryImpl) {
+  d.impl = impl
+}
+
+// Impl returns the DentryImpl associated with d.
+func (d *Dentry) Impl() DentryImpl {
+  return d.impl
+}
+
+// DentryImpl contains implementation-specific details of a Dentry.
+// Implementations of DentryImpl should contain their associated Dentry by
+// value as their first field.
+type DentryImpl interface {
+  // VFS-required implementation-defined dentry operations.
+  IncRef()
+  // ...
+}
+```
+
+This construction, which is essentially a type-safe analogue to Linux's
+`container_of` pattern, has the following properties:
+
+-   VFS works almost exclusively with pointers to Dentry rather than DentryImpl
+    interface objects, such as in the type of `Dentry.parent`. This avoids
+    interface method calls (which are somewhat expensive to perform, and defeat
+    inlining and escape analysis), reduces the size of VFS types (since an
+    interface object is two pointers in size), and allows pointers to be loaded
+    and stored atomically using `sync/atomic`. Implementation-defined behavior
+    is accessed via `Dentry.impl` when required.
+
+-   Filesystem implementations can access the implementation-defined state
+    associated with objects of VFS types by type-asserting or type-switching
+    (e.g. `Dentry.Impl().(*myDentry)`). Type assertions to a concrete type
+    require only an equality comparison of the interface object's type pointer
+    to a static constant, and are consequently very fast.
+
+-   Filesystem implementations can access the VFS state associated with objects
+    of implementation-defined types directly.
+
+-   VFS and implementation-defined state for a given type occupy the same
+    object, minimizing memory allocations and maximizing memory locality. `impl`
+    is the last field in `Dentry`, and `Dentry` is the first field in
+    `DentryImpl` implementations, for similar reasons: this tends to cause
+    fetching of the `Dentry.impl` interface object to also fetch `DentryImpl`
+    fields, either because they are in the same cache line or via next-line
+    prefetching.
+
+## Future Work
+
+-   Most `mount(2)` features, and unmounting, are incomplete.
+
+-   VFS1 filesystems are not directly compatible with VFS2. It may be possible
+    to implement shims that implement `vfs.FilesystemImpl` for
+    `fs.MountNamespace`, `vfs.DentryImpl` for `fs.Dirent`, and
+    `vfs.FileDescriptionImpl` for `fs.File`, which may be adequate for
+    filesystems that are not performance-critical (e.g. sysfs); however, it is
+    not clear that this will be less effort than simply porting the filesystems
+    in question. Practically speaking, the following filesystems will probably
+    need to be ported or made compatible through a shim to evaluate filesystem
+    performance on realistic workloads:
+
+    -   devfs/procfs/sysfs, which will realistically be necessary to execute
+        most applications. (Note that procfs and sysfs do not support hard
+        links, so they do not require the complexity of separate inode objects.
+        Also note that Linux's /dev is actually a variant of tmpfs called
+        devtmpfs.)
+
+    -   tmpfs. This should be relatively straightforward: copy/paste memfs,
+        store regular file contents in pgalloc-allocated memory instead of
+        `[]byte`, and add support for file timestamps. (In fact, it probably
+        makes more sense to convert memfs to tmpfs and not keep the former.)
+
+    -   A remote filesystem, either lisafs (if it is ready by the time that
+        other benchmarking prerequisites are) or v9fs (aka 9P, aka gofers).
+
+    -   epoll files.
+
+    Filesystems that will need to be ported before switching to VFS2, but can
+    probably be skipped for early testing:
+
+    -   overlayfs, which is needed for (at least) synthetic mount points.
+
+    -   Support for host ttys.
+
+    -   timerfd files.
+
+    Filesystems that can probably be dropped:
+
+    -   ashmem, which is far too incomplete to use.
+
+    -   binder, which is similarly far too incomplete to use.
+
+-   Save/restore. For instance, it is unclear if the current implementation of
+    the `state` package supports the inheritance pattern described above.
+
+-   Many features that were previously implemented by VFS must now be
+    implemented by individual filesystems (though, in most cases, this should
+    consist of calls to hooks or libraries provided by `vfs` or other packages).
+    This includes, but is not necessarily limited to:
+
+    -   Block and character device special files
+
+    -   Inotify
+
+    -   File locking
+
+    -   `O_ASYNC`
+
+-   Reference counts in the `vfs` package do not use the `refs` package since
+    `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference
+    count, resulting in considerable cache bloat. 24 bytes of this overhead is
+    for weak reference support, which has poor performance and will not be used
+    by VFS2. The remaining 40 bytes is to store a descriptive string and stack
+    trace for reference leak checking; we can support reference leak checking
+    without incurring this space overhead by including the applicable
+    information directly in finalizers for applicable types.
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
new file mode 100644
index 000000000..641e3e502
--- /dev/null
+++ b/pkg/sentry/vfs/anonfs.go
@@ -0,0 +1,314 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// NewAnonVirtualDentry builds a VirtualDentry carrying the synthetic name
+// name, mirroring Linux's fs/anon_inodes.c:anon_inode_getfile(). References
+// are taken on the returned VirtualDentry.
+func (vfs *VirtualFilesystem) NewAnonVirtualDentry(name string) VirtualDentry {
+	ad := &anonDentry{name: name}
+	ad.vfsd.Init(ad)
+	// Only the mount needs a real reference; anonDentry no-ops refcounting.
+	anonMnt := vfs.anonMount
+	anonMnt.IncRef()
+
+	return VirtualDentry{
+		mount:  anonMnt,
+		dentry: &ad.vfsd,
+	}
+}
+
+const (
+	anonfsBlockSize = usermem.PageSize // via fs/libfs.c:pseudo_fs_fill_super()
+
+	// Mode, UID, and GID reported for every anonfs file.
+	anonFileMode = 0600 // permission bits only; having no file type is correct
+	anonFileUID  = auth.RootKUID
+	anonFileGID  = auth.RootKGID
+)
+
+// anonFilesystemType is the stateless FilesystemType of anonFilesystem.
+type anonFilesystemType struct{}
+
+// GetFilesystem implements FilesystemType.GetFilesystem; it always panics.
+func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *auth.Credentials, string, GetFilesystemOptions) (*Filesystem, *Dentry, error) {
+	panic("cannot instantiate an anon filesystem")
+}
+
+// Name implements FilesystemType.Name.
+func (anonFilesystemType) Name() string {
+	return "none"
+}
+
+// anonFilesystem is the implementation of FilesystemImpl that backs
+// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
+//
+// Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl
+// methods that would require an anonDentry to be a directory return ENOTDIR.
+type anonFilesystem struct {
+	vfsfs Filesystem
+
+	devMinor uint32 // reported by StatAt as Statx.DevMinor
+}
+
+type anonDentry struct {
+	vfsd Dentry
+
+	name string // synthetic name; PrependPath renders it as "anon_inode:<name>"
+}
+
+// Release implements FilesystemImpl.Release. It is a no-op.
+func (fs *anonFilesystem) Release() {
+}
+
+// Sync implements FilesystemImpl.Sync. It does nothing and always succeeds.
+func (fs *anonFilesystem) Sync(ctx context.Context) error {
+	return nil
+}
+
+// AccessAt implements FilesystemImpl.AccessAt.
+func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error {
+	if rp.Done() {
+		return GenericCheckPermissions(creds, ats, anonFileMode, anonFileUID, anonFileGID)
+	}
+	return syserror.ENOTDIR
+}
+
+// GetDentryAt implements FilesystemImpl.GetDentryAt. It fails with ENOTDIR
+// if path components remain in rp or if the caller requires the result to be
+// searchable.
+func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) {
+	if !rp.Done() || opts.CheckSearchable {
+		return nil, syserror.ENOTDIR
+	}
+	// No reference is taken here because anonDentry no-ops refcounting.
+	start := rp.Start()
+	return start, nil
+}
+
+// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt.
+func (fs *anonFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) {
+	if rp.Final() {
+		// anonDentry no-ops refcounting, so no reference is taken.
+		return rp.Start(), nil
+	}
+	return nil, syserror.ENOTDIR
+}
+
+// LinkAt implements FilesystemImpl.LinkAt. It always fails: ENOTDIR or EPERM.
+func (fs *anonFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
+	if rp.Final() {
+		return syserror.EPERM
+	}
+	return syserror.ENOTDIR
+}
+
+// MkdirAt implements FilesystemImpl.MkdirAt. It always fails: ENOTDIR or EPERM.
+func (fs *anonFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error {
+	if rp.Final() {
+		return syserror.EPERM
+	}
+	return syserror.ENOTDIR
+}
+
+// MknodAt implements FilesystemImpl.MknodAt. It always fails: ENOTDIR or EPERM.
+func (fs *anonFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error {
+	if rp.Final() {
+		return syserror.EPERM
+	}
+	return syserror.ENOTDIR
+}
+
+// OpenAt implements FilesystemImpl.OpenAt. It always fails: ENOTDIR or ENODEV.
+func (fs *anonFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) {
+	if rp.Done() {
+		return nil, syserror.ENODEV
+	}
+	return nil, syserror.ENOTDIR
+}
+
+// ReadlinkAt implements FilesystemImpl.ReadlinkAt. It always fails.
+func (fs *anonFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) {
+	if rp.Done() {
+		return "", syserror.EINVAL
+	}
+	return "", syserror.ENOTDIR
+}
+
+// RenameAt implements FilesystemImpl.RenameAt. It always fails.
+func (fs *anonFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error {
+	if rp.Final() {
+		return syserror.EPERM
+	}
+	return syserror.ENOTDIR
+}
+
+// RmdirAt implements FilesystemImpl.RmdirAt. It always fails: ENOTDIR or EPERM.
+func (fs *anonFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error {
+	if rp.Final() {
+		return syserror.EPERM
+	}
+	return syserror.ENOTDIR
+}
+
+// SetStatAt implements FilesystemImpl.SetStatAt.
+func (fs *anonFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error {
+	if rp.Done() {
+		// Linux lets anon_inode_inode's metadata be set, and the change is
+		// visible to all users of anon_inode_inode. Here such changes are
+		// silently ignored.
+		return nil
+	}
+	return syserror.ENOTDIR
+}
+
+// StatAt implements FilesystemImpl.StatAt.
+func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) {
+	if !rp.Done() {
+		return linux.Statx{}, syserror.ENOTDIR
+	}
+	// See fs/anon_inodes.c:anon_inode_init() => fs/libfs.c:alloc_anon_inode().
+	return linux.Statx{
+		Mask:     linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
+		Blksize:  anonfsBlockSize,
+		Nlink:    1,
+		UID:      uint32(anonFileUID),
+		GID:      uint32(anonFileGID),
+		Mode:     anonFileMode,
+		Ino:      1,
+		Size:     0,
+		Blocks:   0,
+		DevMajor: linux.UNNAMED_MAJOR,
+		DevMinor: fs.devMinor,
+	}, nil
+}
+
+// StatFSAt implements FilesystemImpl.StatFSAt.
+func (fs *anonFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) {
+	if !rp.Done() {
+		return linux.Statfs{}, syserror.ENOTDIR
+	}
+	return linux.Statfs{
+		Type:      linux.ANON_INODE_FS_MAGIC,
+		BlockSize: anonfsBlockSize,
+	}, nil
+}
+
+// SymlinkAt implements FilesystemImpl.SymlinkAt. It always fails.
+func (fs *anonFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error {
+	if rp.Final() {
+		return syserror.EPERM
+	}
+	return syserror.ENOTDIR
+}
+
+// UnlinkAt implements FilesystemImpl.UnlinkAt. It always fails.
+func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error {
+	if rp.Final() {
+		return syserror.EPERM
+	}
+	return syserror.ENOTDIR
+}
+
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. It always fails.
+func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	if rp.Final() {
+		if err := GenericCheckPermissions(rp.Credentials(), MayWrite, anonFileMode, anonFileUID, anonFileGID); err != nil {
+			return nil, err
+		}
+		return nil, syserror.ECONNREFUSED
+	}
+	return nil, syserror.ENOTDIR
+}
+
+// ListxattrAt implements FilesystemImpl.ListxattrAt. The list is always empty.
+func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
+	if rp.Done() {
+		return nil, nil
+	}
+	return nil, syserror.ENOTDIR
+}
+
+// GetxattrAt implements FilesystemImpl.GetxattrAt. It always fails.
+func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) {
+	if rp.Done() {
+		return "", syserror.ENOTSUP
+	}
+	return "", syserror.ENOTDIR
+}
+
+// SetxattrAt implements FilesystemImpl.SetxattrAt. It always fails.
+func (fs *anonFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
+	if rp.Done() {
+		return syserror.EPERM
+	}
+	return syserror.ENOTDIR
+}
+
+// RemovexattrAt implements FilesystemImpl.RemovexattrAt. It always fails.
+func (fs *anonFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
+	if rp.Done() {
+		return syserror.EPERM
+	}
+	return syserror.ENOTDIR
+}
+
+// PrependPath implements FilesystemImpl.PrependPath using a synthetic name.
+func (fs *anonFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
+	b.PrependComponent(fmt.Sprintf("anon_inode:%s", vd.dentry.impl.(*anonDentry).name))
+	return PrependPathSyntheticError{}
+}
+
+// IncRef implements DentryImpl.IncRef. anonDentry keeps no reference count.
+func (d *anonDentry) IncRef() {
+	// no-op
+}
+
+// TryIncRef implements DentryImpl.TryIncRef. It always reports success.
+func (d *anonDentry) TryIncRef() bool {
+	return true
+}
+
+// DecRef implements DentryImpl.DecRef. anonDentry keeps no reference count.
+func (d *anonDentry) DecRef() {
+	// no-op
+}
+
+// InotifyWithParent implements DentryImpl.InotifyWithParent. It is a no-op.
+//
+// Although Linux technically supports inotify on pseudo filesystems (inotify
+// is implemented at the vfs layer), it is not particularly useful. It is left
+// unimplemented until someone actually needs it.
+func (d *anonDentry) InotifyWithParent(events, cookie uint32, et EventType) {}
+
+// Watches implements DentryImpl.Watches. It always returns nil.
+func (d *anonDentry) Watches() *Watches {
+	return nil
+}
+
+// OnZeroWatches implements DentryImpl.OnZeroWatches. It is a no-op.
+func (d *anonDentry) OnZeroWatches() {}
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
new file mode 100644
index 000000000..c9e724fef
--- /dev/null
+++ b/pkg/sentry/vfs/context.go
@@ -0,0 +1,75 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+)
+
+// contextID is this package's type for context.Context.Value keys.
+type contextID int
+
+const (
+	// CtxMountNamespace is a Context.Value key for a *MountNamespace.
+	CtxMountNamespace contextID = iota
+
+	// CtxRoot is a Context.Value key for a VFS root, held as a VirtualDentry.
+	CtxRoot
+)
+
+// MountNamespaceFromContext returns the MountNamespace used by ctx, or nil
+// if ctx is not associated with a MountNamespace. A reference is taken on
+// the returned MountNamespace.
+func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
+	v := ctx.Value(CtxMountNamespace)
+	if v == nil {
+		return nil
+	}
+	return v.(*MountNamespace)
+}
+
+// RootFromContext returns the VFS root used by ctx, with a reference taken on
+// it, or a zero-value VirtualDentry if ctx does not have a specific VFS root.
+func RootFromContext(ctx context.Context) VirtualDentry {
+	v := ctx.Value(CtxRoot)
+	if v == nil {
+		return VirtualDentry{}
+	}
+	return v.(VirtualDentry)
+}
+
+type rootContext struct {
+	context.Context
+	root VirtualDentry // returned by Value for the CtxRoot key
+}
+
+// WithRoot returns a context that wraps ctx and reports root as its CtxRoot
+// value; lookups of all other keys are served by ctx.
+func WithRoot(ctx context.Context, root VirtualDentry) context.Context {
+	rc := new(rootContext)
+	rc.Context, rc.root = ctx, root
+	return rc
+}
+
+// Value implements Context.Value. A lookup of CtxRoot returns rc.root after
+// taking a reference on it; every other key is delegated to the wrapped
+// Context.
+func (rc rootContext) Value(key interface{}) interface{} {
+	if key != CtxRoot {
+		return rc.Context.Value(key)
+	}
+	rc.root.IncRef()
+	return rc.root
+}
diff --git a/pkg/sentry/vfs/debug.go b/pkg/sentry/vfs/debug.go
new file mode 100644
index 000000000..0ed20f249
--- /dev/null
+++ b/pkg/sentry/vfs/debug.go
@@ -0,0 +1,22 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+const (
+	// checkInvariants, when true, enables runtime checks of invariants
+	// expected by the vfs package. It is normally false because VFS is often
+	// a hot path.
+	checkInvariants = false
+)
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
new file mode 100644
index 000000000..cea3e6955
--- /dev/null
+++ b/pkg/sentry/vfs/dentry.go
@@ -0,0 +1,324 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Dentry represents a node in a Filesystem tree at which a file exists.
+//
+// Dentries are reference-counted. Unless otherwise specified, all Dentry
+// methods require that a reference is held.
+//
+// Dentry is loosely analogous to Linux's struct dentry, but:
+//
+// - VFS does not associate Dentries with inodes. gVisor interacts primarily
+// with filesystems that are accessed through filesystem APIs (as opposed to
+// raw block devices); many such APIs support only paths and file descriptors,
+// and not inodes. Furthermore, when parties outside the scope of VFS can
+// rename inodes on such filesystems, VFS generally cannot "follow" the rename,
+// both due to synchronization issues and because it may not even be able to
+// name the destination path; this implies that it would in fact be incorrect
+// for Dentries to be associated with inodes on such filesystems. Consequently,
+// operations that are inode operations in Linux are FilesystemImpl methods
+// and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do
+// support inodes may store appropriate state in implementations of DentryImpl.
+//
+// - VFS does not require that Dentries are instantiated for all paths accessed
+// through VFS, only those that are tracked beyond the scope of a single
+// Filesystem operation. This includes file descriptions, mount points, mount
+// roots, process working directories, and chroots. This avoids instantiation
+// of Dentries for operations on mutable remote filesystems that can't actually
+// cache any state in the Dentry.
+//
+// - VFS does not track filesystem structure (i.e. relationships between
+// Dentries), since both the relevant state and synchronization are
+// filesystem-specific.
+//
+// - For the reasons above, VFS is not directly responsible for managing Dentry
+// lifetime. Dentry reference counts only indicate the extent to which VFS
+// requires Dentries to exist; Filesystems may elect to cache or discard
+// Dentries with zero references.
+//
+// +stateify savable
+type Dentry struct {
+	// mu synchronizes deletion/invalidation and mounting over this Dentry.
+	mu sync.Mutex `state:"nosave"`
+
+	// dead is true if the file represented by this Dentry has been deleted (by
+	// CommitDeleteDentry or CommitRenameReplaceDentry) or invalidated (by
+	// InvalidateDentry). dead is protected by mu.
+	dead bool
+
+	// mounts is the number of Mounts for which this Dentry is Mount.point.
+	// mounts is accessed using atomic memory operations.
+	mounts uint32
+
+	// impl is the DentryImpl associated with this Dentry, set by Init. impl is
+	// immutable. This should be the last field in Dentry.
+	impl DentryImpl
+}
+
+// Init sets d's DentryImpl to impl; it must be called before first use of d.
+func (d *Dentry) Init(impl DentryImpl) {
+	d.impl = impl
+}
+
+// Impl returns the DentryImpl associated with d, as set by Init.
+func (d *Dentry) Impl() DentryImpl {
+	return d.impl
+}
+
+// DentryImpl contains implementation details for a Dentry. Implementations of
+// DentryImpl should contain their associated Dentry by value as their first
+// field.
+type DentryImpl interface {
+	// IncRef increments the Dentry's reference count. A Dentry with a non-zero
+	// reference count must remain coherent with the state of the filesystem.
+	IncRef()
+
+	// TryIncRef increments the Dentry's reference count and returns true. If
+	// the reference count is zero, TryIncRef may instead do nothing and return
+	// false; it may also succeed, provided it can restore the guarantee that
+	// the Dentry is coherent with the state of the filesystem.
+	//
+	// TryIncRef does not require that a reference is held on the Dentry.
+	TryIncRef() bool
+
+	// DecRef decrements the Dentry's reference count.
+	DecRef()
+
+	// InotifyWithParent notifies all watches on the targets represented by this
+	// dentry and its parent. The parent's watches are notified first, followed
+	// by this dentry's.
+	//
+	// InotifyWithParent automatically adds the IN_ISDIR flag for dentries
+	// representing directories.
+	//
+	// Note that the events may not actually propagate up to the user, depending
+	// on the event masks.
+	InotifyWithParent(events, cookie uint32, et EventType)
+
+	// Watches returns the set of inotify watches for the file corresponding to
+	// the Dentry. Dentries that are hard links to the same underlying file
+	// share the same watches.
+	//
+	// Watches may return nil if the dentry belongs to a FilesystemImpl that
+	// does not support inotify. If an implementation returns a non-nil watch
+	// set, it must always return a non-nil watch set. Likewise, if an
+	// implementation returns a nil watch set, it must always return a nil watch
+	// set.
+	//
+	// The caller does not need to hold a reference on the dentry.
+	Watches() *Watches
+
+	// OnZeroWatches is called whenever the number of watches on a dentry drops
+	// to zero. This is needed by some FilesystemImpls (e.g. gofer) to manage
+	// dentry lifetime.
+	//
+	// The caller does not need to hold a reference on the dentry. OnZeroWatches
+	// may acquire inotify locks, so to prevent deadlock, no inotify locks should
+	// be held by the caller.
+	OnZeroWatches()
+}
+
+// IncRef increments d's reference count via d's DentryImpl.
+func (d *Dentry) IncRef() {
+	d.impl.IncRef()
+}
+
+// TryIncRef increments d's reference count and returns true, or, if the count
+// is zero, may do nothing and return false. No reference on d is required.
+func (d *Dentry) TryIncRef() bool {
+	return d.impl.TryIncRef()
+}
+
+// DecRef decrements d's reference count via d's DentryImpl.
+func (d *Dentry) DecRef() {
+	d.impl.DecRef()
+}
+
+// IsDead reports whether d has been deleted or invalidated by its filesystem.
+func (d *Dentry) IsDead() bool {
+	d.mu.Lock()
+	dead := d.dead
+	d.mu.Unlock()
+	return dead
+}
+
+func (d *Dentry) isMounted() bool {
+	return atomic.LoadUint32(&d.mounts) > 0 // true if any Mount has d as its point
+}
+
+// InotifyWithParent notifies all watches on the targets represented by d and
+// its parent of events; the parent's watches are notified before d's.
+func (d *Dentry) InotifyWithParent(events, cookie uint32, et EventType) {
+	d.impl.InotifyWithParent(events, cookie, et)
+}
+
+// Watches returns the set of inotify watches associated with d, or nil if d
+// belongs to a FilesystemImpl that does not support inotify.
+//
+// The caller does not need to hold a reference on d.
+func (d *Dentry) Watches() *Watches {
+	return d.impl.Watches()
+}
+
+// OnZeroWatches runs the DentryImpl's cleanup hook for the moment the number
+// of watches on a dentry drops to zero.
+func (d *Dentry) OnZeroWatches() {
+	impl := d.impl
+	impl.OnZeroWatches()
+}
+
+// The following functions are exported so that filesystem implementations can
+// use them. The vfs package, and users of VFS, should not call these
+// functions.
+
+// PrepareDeleteDentry must be called before attempting to delete the file
+// represented by d. It fails with EBUSY if d is a mount point in mntns.
+//
+// On success it returns with d.mu locked, which blocks attempts to mount over
+// d; the caller must then call AbortDeleteDentry or CommitDeleteDentry
+// (depending on the deletion's outcome) to unlock it.
+func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error {
+	vfs.mountMu.Lock()
+	defer vfs.mountMu.Unlock()
+	if mntns.mountpoints[d] != 0 {
+		return syserror.EBUSY
+	}
+	// d.mu is acquired while mountMu is still held and is deliberately left
+	// locked on return.
+	d.mu.Lock()
+	return nil
+}
+
+// AbortDeleteDentry must be called after a successful PrepareDeleteDentry
+// when the deletion fails. It releases d.mu, which PrepareDeleteDentry left
+// locked, without marking d dead.
+func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) {
+	d.mu.Unlock()
+}
+
+// CommitDeleteDentry must be called after a successful PrepareDeleteDentry
+// when the deletion succeeds. It marks d dead, releases d.mu (left locked by
+// PrepareDeleteDentry), and, if d is still mounted over, unmounts everything
+// that uses it as a mount point.
+func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
+	d.dead = true
+	d.mu.Unlock()
+	if !d.isMounted() {
+		return
+	}
+	vfs.forgetDeadMountpoint(d)
+}
+
+// InvalidateDentry is called when d stops representing the file it used to,
+// for reasons outside of VFS' control (e.g. d is the local state of a file on
+// a remote filesystem, and that file has already been deleted remotely).
+//
+// d is marked dead under d.mu; if d is mounted over, the mounts using it are
+// then torn down.
+func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) {
+	d.mu.Lock()
+	d.dead = true
+	d.mu.Unlock()
+	if !d.isMounted() {
+		return
+	}
+	vfs.forgetDeadMountpoint(d)
+}
+
+// PrepareRenameDentry must be called before attempting to rename the file
+// represented by from. A non-nil to represents the file that the rename will
+// replace or exchange. EBUSY is returned if either dentry is a mount point in
+// mntns.
+//
+// On success, from.mu (and to.mu, if to is not nil) are left locked; the
+// caller must call AbortRenameDentry, CommitRenameReplaceDentry, or
+// CommitRenameExchangeDentry, depending on the rename's outcome, to unlock
+// them.
+//
+// Preconditions: If to is not nil, it must be a child Dentry from the same
+// Filesystem. from != to.
+func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error {
+	vfs.mountMu.Lock()
+	defer vfs.mountMu.Unlock()
+	if mntns.mountpoints[from] != 0 {
+		return syserror.EBUSY
+	}
+	if to != nil && mntns.mountpoints[to] != 0 {
+		return syserror.EBUSY
+	}
+	// Lock order: to.mu (when present) before from.mu, both taken while
+	// mountMu is held.
+	if to != nil {
+		to.mu.Lock()
+	}
+	from.mu.Lock()
+	return nil
+}
+
+// AbortRenameDentry must be called after a successful PrepareRenameDentry
+// when the rename fails. It releases from.mu and, when to is not nil, to.mu.
+func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) {
+	from.mu.Unlock()
+	if to == nil {
+		return
+	}
+	to.mu.Unlock()
+}
+
+// CommitRenameReplaceDentry must be called after the file represented by from
+// has been renamed without RENAME_EXCHANGE. A non-nil to represents the file
+// that from replaced: it is marked dead, unlocked, and any mounts using it as
+// a mount point are torn down.
+//
+// Preconditions: PrepareRenameDentry was previously called on from and to.
+func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, to *Dentry) {
+	from.mu.Unlock()
+	if to == nil {
+		return
+	}
+	to.dead = true
+	to.mu.Unlock()
+	if !to.isMounted() {
+		return
+	}
+	vfs.forgetDeadMountpoint(to)
+}
+
+// CommitRenameExchangeDentry must be called after the files represented by
+// from and to have been swapped by rename(RENAME_EXCHANGE). Neither dentry
+// is marked dead; both mutexes left locked by PrepareRenameDentry are
+// released.
+//
+// Preconditions: PrepareRenameDentry was previously called on from and to.
+func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) {
+	from.mu.Unlock()
+	to.mu.Unlock()
+}
+
+// forgetDeadMountpoint is called when a mount point is deleted or invalidated
+// to umount all mounts using it in all other mount namespaces.
+//
+// The recursive unmounts are performed with vfs.mountMu held and inside a
+// vfs.mounts.seq write section. The references collected along the way are
+// only dropped after both have been released.
+//
+// forgetDeadMountpoint is analogous to Linux's
+// fs/namespace.c:__detach_mounts().
+func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) {
+	var pendingVDs []VirtualDentry
+	var pendingMounts []*Mount
+
+	vfs.mountMu.Lock()
+	vfs.mounts.seq.BeginWrite()
+	for m := range vfs.mountpoints[d] {
+		pendingVDs, pendingMounts = vfs.umountRecursiveLocked(m, &umountRecursiveOptions{}, pendingVDs, pendingMounts)
+	}
+	vfs.mounts.seq.EndWrite()
+	vfs.mountMu.Unlock()
+
+	// Drop the deferred references now that no mount locks are held.
+	for _, vd := range pendingVDs {
+		vd.DecRef()
+	}
+	for i := range pendingMounts {
+		pendingMounts[i].DecRef()
+	}
+}
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
new file mode 100644
index 000000000..1e9dffc8f
--- /dev/null
+++ b/pkg/sentry/vfs/device.go
@@ -0,0 +1,132 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// DeviceKind indicates whether a device is a block or character device. It
+// forms part of the key (together with the major and minor numbers) under
+// which devices are registered.
+type DeviceKind uint32
+
+const (
+	// BlockDevice indicates a block device. It is the zero value of
+	// DeviceKind.
+	BlockDevice DeviceKind = iota
+
+	// CharDevice indicates a character device.
+	CharDevice
+)
+
+// String implements fmt.Stringer.String. It yields "block" or "character"
+// for the two defined kinds, and a diagnostic string containing the numeric
+// value for anything else.
+func (kind DeviceKind) String() string {
+	if kind == BlockDevice {
+		return "block"
+	}
+	if kind == CharDevice {
+		return "character"
+	}
+	return fmt.Sprintf("invalid device kind %d", kind)
+}
+
+// devTuple identifies a device by kind and (major, minor) device number. It
+// is the key type of the VirtualFilesystem's device registry.
+type devTuple struct {
+	kind  DeviceKind
+	major uint32
+	minor uint32
+}
+
+// A Device backs device special files. Devices are made available through
+// VirtualFilesystem.RegisterDevice and opened through
+// VirtualFilesystem.OpenDeviceSpecialFile.
+type Device interface {
+	// Open returns a FileDescription representing this device. mnt and d
+	// are the mount and dentry of the device special file being opened, and
+	// opts are the options of the open request.
+	Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error)
+}
+
+// registeredDevice is the value stored in the device registry: a Device
+// together with a copy of the options it was registered with.
+//
+// +stateify savable
+type registeredDevice struct {
+	dev  Device
+	opts RegisterDeviceOptions
+}
+
+// RegisterDeviceOptions contains options to
+// VirtualFilesystem.RegisterDevice(). RegisterDevice stores a copy of the
+// options alongside the registered Device.
+//
+// +stateify savable
+type RegisterDeviceOptions struct {
+	// GroupName is the name shown for this device registration in
+	// /proc/devices. If GroupName is empty, this registration will not be
+	// shown in /proc/devices.
+	GroupName string
+}
+
+// RegisterDevice registers dev in vfs under the given kind and major/minor
+// device numbers, storing a copy of *opts with it.
+//
+// An error is returned, and the registry is left unchanged, if a device is
+// already registered for the same (kind, major, minor) tuple.
+//
+// Preconditions: opts must not be nil.
+func (vfs *VirtualFilesystem) RegisterDevice(kind DeviceKind, major, minor uint32, dev Device, opts *RegisterDeviceOptions) error {
+	key := devTuple{kind: kind, major: major, minor: minor}
+
+	vfs.devicesMu.Lock()
+	defer vfs.devicesMu.Unlock()
+
+	existing, found := vfs.devices[key]
+	if found {
+		return fmt.Errorf("%s device number (%d, %d) is already registered to device type %T", kind, major, minor, existing.dev)
+	}
+	rd := &registeredDevice{dev: dev, opts: *opts}
+	vfs.devices[key] = rd
+	return nil
+}
+
+// OpenDeviceSpecialFile looks up the device registered for (kind, major,
+// minor) and returns the FileDescription produced by its Open method.
+//
+// ENXIO is returned if no such device is registered. The device registry
+// read lock is held for the duration of the call, including the device's
+// Open.
+//
+// Preconditions: opts must not be nil.
+func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mount, d *Dentry, kind DeviceKind, major, minor uint32, opts *OpenOptions) (*FileDescription, error) {
+	key := devTuple{kind: kind, major: major, minor: minor}
+	vfs.devicesMu.RLock()
+	defer vfs.devicesMu.RUnlock()
+	if rd, ok := vfs.devices[key]; ok {
+		return rd.dev.Open(ctx, mnt, d, *opts)
+	}
+	return nil, syserror.ENXIO
+}
+
+// GetAnonBlockDevMinor allocates and returns an unused minor device number for
+// an "anonymous" block device with major number UNNAMED_MAJOR.
+//
+// The search starts at vfs.anonBlockDevMinorNext and proceeds upwards through
+// maxDevMinor inclusive. EMFILE is returned if every minor in that range is
+// already in use.
+func (vfs *VirtualFilesystem) GetAnonBlockDevMinor() (uint32, error) {
+	// maxDevMinor is the largest minor number that may be handed out (the
+	// largest value representable in 20 bits). It is an inclusive bound,
+	// matching the range Linux's get_anon_bdev() allocates from.
+	const maxDevMinor = (1 << 20) - 1
+
+	vfs.anonBlockDevMinorMu.Lock()
+	defer vfs.anonBlockDevMinorMu.Unlock()
+	for minor := vfs.anonBlockDevMinorNext; minor <= maxDevMinor; minor++ {
+		if _, used := vfs.anonBlockDevMinor[minor]; used {
+			continue
+		}
+		vfs.anonBlockDevMinor[minor] = struct{}{}
+		// Everything below minor+1 is now known to be in use, so later
+		// searches can start there.
+		vfs.anonBlockDevMinorNext = minor + 1
+		return minor, nil
+	}
+	return 0, syserror.EMFILE
+}
+
+// PutAnonBlockDevMinor releases a minor device number previously returned by
+// GetAnonBlockDevMinor. If the released number is below the current search
+// start, the search start is lowered to it so that it can be reused.
+func (vfs *VirtualFilesystem) PutAnonBlockDevMinor(minor uint32) {
+	vfs.anonBlockDevMinorMu.Lock()
+	defer vfs.anonBlockDevMinorMu.Unlock()
+
+	delete(vfs.anonBlockDevMinor, minor)
+	if vfs.anonBlockDevMinorNext > minor {
+		vfs.anonBlockDevMinorNext = minor
+	}
+}
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
new file mode 100644
index 000000000..599c3131c
--- /dev/null
+++ b/pkg/sentry/vfs/epoll.go
@@ -0,0 +1,383 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// epollCycleMu serializes attempts to register EpollInstances with other
+// EpollInstances in order to check for cycles. AddInterest acquires it only
+// when the file being added is itself an EpollInstance, and then holds it
+// until AddInterest returns.
+var epollCycleMu sync.Mutex
+
+// EpollInstance represents an epoll instance, as described by epoll(7).
+//
+// Lock order, as used by the methods below: interestMu is acquired before mu
+// whenever both are held.
+type EpollInstance struct {
+	// vfsfd is the FileDescription for this instance, embedded by value;
+	// NewEpollInstanceFD returns a pointer to it.
+	vfsfd FileDescription
+	FileDescriptionDefaultImpl
+	DentryMetadataFileDescriptionImpl
+	NoLockFD
+
+	// q holds waiters on this EpollInstance.
+	q waiter.Queue
+
+	// interest is the set of file descriptors that are registered with the
+	// EpollInstance for monitoring. interest is protected by interestMu.
+	interestMu sync.Mutex
+	interest   map[epollInterestKey]*epollInterest
+
+	// mu protects fields in registered epollInterests. It is also held
+	// around every access to ready below.
+	mu sync.Mutex
+
+	// ready is the set of file descriptors that may be "ready" for I/O. Note
+	// that this must be an ordered list, not a map: "If more than maxevents
+	// file descriptors are ready when epoll_wait() is called, then successive
+	// epoll_wait() calls will round robin through the set of ready file
+	// descriptors. This behavior helps avoid starvation scenarios, where a
+	// process fails to notice that additional file descriptors are ready
+	// because it focuses on a set of file descriptors that are already known
+	// to be ready." - epoll_wait(2)
+	ready epollInterestList
+}
+
+// epollInterestKey identifies a registration within an EpollInstance: the
+// pair of a FileDescription and the file descriptor number it was registered
+// under. It is the key type of EpollInstance.interest.
+type epollInterestKey struct {
+	// file is the registered FileDescription. No reference is held on file;
+	// instead, when the last reference is dropped, FileDescription.DecRef()
+	// removes the FileDescription from all EpollInstances. file is immutable.
+	file *FileDescription
+
+	// num is the file descriptor number with which this entry was registered.
+	// num is immutable.
+	num int32
+}
+
+// epollInterest represents an EpollInstance's interest in a file descriptor.
+// It also serves as the waiter callback for its own waiter entry (see
+// Callback).
+type epollInterest struct {
+	// epoll is the owning EpollInstance. epoll is immutable.
+	epoll *EpollInstance
+
+	// key is the file to which this epollInterest applies. key is immutable.
+	key epollInterestKey
+
+	// waiter is registered with key.file.
+	//
+	// NOTE(review): this comment previously read "entry is protected by
+	// epoll.mu", presumably meaning waiter. In the code visible here, waiter
+	// is registered and unregistered with epoll.interestMu held - confirm
+	// which lock is intended.
+	waiter waiter.Entry
+
+	// mask is the event mask associated with this registration, including
+	// flags EPOLLET and EPOLLONESHOT. mask is protected by epoll.mu.
+	mask uint32
+
+	// ready is true if epollInterestEntry is linked into epoll.ready. ready
+	// and epollInterestEntry are protected by epoll.mu.
+	ready bool
+	epollInterestEntry
+
+	// userData is the struct epoll_event::data associated with this
+	// epollInterest. userData is protected by epoll.mu.
+	userData [2]int32
+}
+
+// NewEpollInstanceFD creates a new epoll instance backed by an anonymous
+// "[eventpoll]" dentry and returns its FileDescription. A reference is taken
+// on the returned FileDescription.
+//
+// The description is opened O_RDWR with positional reads and writes denied
+// and with metadata operations routed through the dentry.
+func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) {
+	anonVD := vfs.NewAnonVirtualDentry("[eventpoll]")
+	// Init takes its own references on the mount and dentry, so the ones
+	// held by anonVD are dropped on return.
+	defer anonVD.DecRef()
+
+	epoll := &EpollInstance{
+		interest: make(map[epollInterestKey]*epollInterest),
+	}
+	fdOpts := FileDescriptionOptions{
+		DenyPRead:         true,
+		DenyPWrite:        true,
+		UseDentryMetadata: true,
+	}
+	err := epoll.vfsfd.Init(epoll, linux.O_RDWR, anonVD.Mount(), anonVD.Dentry(), &fdOpts)
+	if err != nil {
+		return nil, err
+	}
+	return &epoll.vfsfd, nil
+}
+
+// Release implements FileDescriptionImpl.Release.
+//
+// Every registered interest is detached from its file: the interest is
+// dropped from the file's epolls set and its waiter is unregistered. The
+// interest map itself is then discarded.
+func (ep *EpollInstance) Release() {
+	ep.interestMu.Lock()
+	defer ep.interestMu.Unlock()
+	for k, interest := range ep.interest {
+		polled := k.file
+		polled.epollMu.Lock()
+		delete(polled.epolls, interest)
+		polled.epollMu.Unlock()
+		polled.EventUnregister(&interest.waiter)
+	}
+	ep.interest = nil
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+//
+// An EpollInstance can only ever be readable: it reports waiter.EventIn if
+// the caller asked for EventIn and at least one entry on the ready list is
+// currently ready for some event in its own mask, and 0 otherwise.
+func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask {
+	if mask&waiter.EventIn == 0 {
+		return 0
+	}
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+	for epi := ep.ready.Front(); epi != nil; epi = epi.Next() {
+		want := waiter.EventMaskFromLinux(epi.mask)
+		if got := epi.key.file.Readiness(want); got&want != 0 {
+			return waiter.EventIn
+		}
+	}
+	return 0
+}
+
+// EventRegister implements waiter.Waitable.EventRegister by adding e, with
+// the given mask, to the instance's waiter queue.
+func (ep *EpollInstance) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	queue := &ep.q
+	queue.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister by removing e
+// from the instance's waiter queue.
+func (ep *EpollInstance) EventUnregister(e *waiter.Entry) {
+	queue := &ep.q
+	queue.EventUnregister(e)
+}
+
+// Seek implements FileDescriptionImpl.Seek.
+func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	// Linux: fs/eventpoll.c:eventpoll_fops.llseek == noop_llseek
+	return 0, nil
+}
+
+// AddInterest implements the semantics of EPOLL_CTL_ADD: it registers ep's
+// interest in file, identified by the descriptor number num, for the events
+// and user data carried by event.
+//
+// Errors:
+//
+// - ELOOP if file is itself an EpollInstance and registering it might create
+// a polling cycle or exceed the permitted nesting depth.
+//
+// - EEXIST if (file, num) is already registered with ep.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
+	// Check for cyclic polling if necessary.
+	subep, _ := file.impl.(*EpollInstance)
+	if subep != nil {
+		epollCycleMu.Lock()
+		// epollCycleMu must be locked for the rest of AddInterest to ensure
+		// that cyclic polling is not introduced after the check.
+		defer epollCycleMu.Unlock()
+		if subep.mightPoll(ep) {
+			return syserror.ELOOP
+		}
+	}
+
+	ep.interestMu.Lock()
+	defer ep.interestMu.Unlock()
+
+	// Fail if the key is already registered.
+	key := epollInterestKey{
+		file: file,
+		num:  num,
+	}
+	if _, ok := ep.interest[key]; ok {
+		return syserror.EEXIST
+	}
+
+	// Register interest in file. EPOLLERR and EPOLLHUP are always reported,
+	// whether or not the caller asked for them (epoll_ctl(2)). EPOLLRDHUP, by
+	// contrast, is only reported when explicitly requested, so it must not
+	// be forced into the mask.
+	mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP
+	epi := &epollInterest{
+		epoll:    ep,
+		key:      key,
+		mask:     mask,
+		userData: event.Data,
+	}
+	epi.waiter.Callback = epi
+	ep.interest[key] = epi
+	wmask := waiter.EventMaskFromLinux(mask)
+	file.EventRegister(&epi.waiter, wmask)
+
+	// Check if the file is already ready. This happens after registration
+	// so that an event arriving in between is not missed.
+	if file.Readiness(wmask)&wmask != 0 {
+		epi.Callback(nil)
+	}
+
+	// Add epi to file.epolls so that it is removed when the last
+	// FileDescription reference is dropped.
+	file.epollMu.Lock()
+	if file.epolls == nil {
+		file.epolls = make(map[*epollInterest]struct{})
+	}
+	file.epolls[epi] = struct{}{}
+	file.epollMu.Unlock()
+
+	return nil
+}
+
+func (ep *EpollInstance) mightPoll(ep2 *EpollInstance) bool {
+	return ep.mightPollRecursive(ep2, 4) // Linux: fs/eventpoll.c:EP_MAX_NESTS
+}
+
+// mightPollRecursive walks the epoll instances registered with ep, looking
+// for ep2. It returns true if ep2 is found, or if a nested epoll instance is
+// encountered when remainingRecursion is already zero. ep.interestMu is held
+// for the duration of the walk, including the recursive calls.
+func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursion int) bool {
+	ep.interestMu.Lock()
+	defer ep.interestMu.Unlock()
+	for k := range ep.interest {
+		child, isEpoll := k.file.impl.(*EpollInstance)
+		if !isEpoll {
+			continue
+		}
+		// Short-circuit order matters: the direct match and the depth limit
+		// are checked before recursing.
+		if child == ep2 || remainingRecursion == 0 || child.mightPollRecursive(ep2, remainingRecursion-1) {
+			return true
+		}
+	}
+	return false
+}
+
+// ModifyInterest implements the semantics of EPOLL_CTL_MOD: it replaces the
+// event mask and user data of the existing registration for (file, num) with
+// those carried by event.
+//
+// Errors:
+//
+// - ENOENT if (file, num) is not registered with ep.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
+	ep.interestMu.Lock()
+	defer ep.interestMu.Unlock()
+
+	// Fail if the key is not already registered.
+	epi, ok := ep.interest[epollInterestKey{
+		file: file,
+		num:  num,
+	}]
+	if !ok {
+		return syserror.ENOENT
+	}
+
+	// Update epi for the next call to ep.ReadEvents(). EPOLLERR and EPOLLHUP
+	// are always reported, whether or not the caller asked for them
+	// (epoll_ctl(2)). EPOLLRDHUP is only reported when explicitly requested,
+	// so it must not be forced into the mask.
+	mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP
+	ep.mu.Lock()
+	epi.mask = mask
+	epi.userData = event.Data
+	ep.mu.Unlock()
+
+	// Re-register with the new mask.
+	file.EventUnregister(&epi.waiter)
+	wmask := waiter.EventMaskFromLinux(mask)
+	file.EventRegister(&epi.waiter, wmask)
+
+	// Check if the file is already ready with the new mask.
+	if file.Readiness(wmask)&wmask != 0 {
+		epi.Callback(nil)
+	}
+
+	return nil
+}
+
+// DeleteInterest implements the semantics of EPOLL_CTL_DEL: it removes the
+// registration for (file, num) from ep. ENOENT is returned if no such
+// registration exists.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error {
+	ep.interestMu.Lock()
+	defer ep.interestMu.Unlock()
+
+	key := epollInterestKey{file: file, num: num}
+	epi, registered := ep.interest[key]
+	if !registered {
+		return syserror.ENOENT
+	}
+
+	// Stop the file from readying epi before forgetting about it.
+	file.EventUnregister(&epi.waiter)
+	ep.removeLocked(epi)
+
+	// Finally drop epi from the file's own bookkeeping.
+	file.epollMu.Lock()
+	delete(file.epolls, epi)
+	file.epollMu.Unlock()
+
+	return nil
+}
+
+// Callback implements waiter.EntryCallback.Callback.
+//
+// If epi is not already on its EpollInstance's ready list, it is appended to
+// it and waiters on the EpollInstance are notified of EventIn. The
+// notification is sent after the instance's mu has been released.
+func (epi *epollInterest) Callback(*waiter.Entry) {
+	owner := epi.epoll
+	owner.mu.Lock()
+	if epi.ready {
+		// Already queued; nothing new to report.
+		owner.mu.Unlock()
+		return
+	}
+	epi.ready = true
+	owner.ready.PushBack(epi)
+	owner.mu.Unlock()
+	owner.q.Notify(waiter.EventIn)
+}
+
+// removeLocked forgets epi: it is deleted from ep.interest and, if it is
+// currently linked into the ready list, unlinked from it.
+//
+// Preconditions: ep.interestMu must be locked.
+func (ep *EpollInstance) removeLocked(epi *epollInterest) {
+	delete(ep.interest, epi.key)
+
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+	if !epi.ready {
+		return
+	}
+	epi.ready = false
+	ep.ready.Remove(epi)
+}
+
+// ReadEvents reads up to len(events) ready events into events and returns the
+// number of events read.
+//
+// The ready list is walked from the front. Each visited entry is taken off
+// the list and its file's current readiness is re-checked against the entry's
+// mask; entries that are no longer ready are dropped from the list without
+// being reported. Reported entries are either left off the list (EPOLLONESHOT
+// or EPOLLET) or moved to its tail.
+//
+// Preconditions: len(events) != 0.
+func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
+	i := 0
+	// Hot path: avoid defer.
+	ep.mu.Lock()
+	var next *epollInterest
+	var requeue epollInterestList
+	for epi := ep.ready.Front(); epi != nil; epi = next {
+		// Capture the successor before epi is unlinked below.
+		next = epi.Next()
+		// Regardless of what else happens, epi is initially removed from the
+		// ready list.
+		ep.ready.Remove(epi)
+		wmask := waiter.EventMaskFromLinux(epi.mask)
+		ievents := epi.key.file.Readiness(wmask) & wmask
+		if ievents == 0 {
+			// Leave epi off the ready list.
+			epi.ready = false
+			continue
+		}
+		// Determine what we should do with epi.
+		switch {
+		case epi.mask&linux.EPOLLONESHOT != 0:
+			// Clear all events from the mask; they must be re-added by
+			// EPOLL_CTL_MOD.
+			epi.mask &= linux.EP_PRIVATE_BITS
+			fallthrough
+		case epi.mask&linux.EPOLLET != 0:
+			// Leave epi off the ready list.
+			epi.ready = false
+		default:
+			// Queue epi to be moved to the end of the ready list.
+			requeue.PushBack(epi)
+		}
+		// Report ievents.
+		events[i] = linux.EpollEvent{
+			Events: ievents.ToLinux(),
+			Data:   epi.userData,
+		}
+		i++
+		if i == len(events) {
+			break
+		}
+	}
+	// Entries still wanted go behind any entries that were not visited, so
+	// that successive calls round-robin through the ready set.
+	ep.ready.PushBackList(&requeue)
+	ep.mu.Unlock()
+	return i
+}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
new file mode 100644
index 000000000..0c42574db
--- /dev/null
+++ b/pkg/sentry/vfs/file_description.go
@@ -0,0 +1,837 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// A FileDescription represents an open file description, which is the entity
+// referred to by a file descriptor (POSIX.1-2017 3.258 "Open File
+// Description").
+//
+// FileDescriptions are reference-counted. Unless otherwise specified, all
+// FileDescription methods require that a reference is held.
+//
+// FileDescription is analogous to Linux's struct file.
+type FileDescription struct {
+	// refs is the reference count. refs is accessed using atomic memory
+	// operations.
+	refs int64
+
+	// flagsMu protects statusFlags and asyncHandler below.
+	flagsMu sync.Mutex
+
+	// statusFlags contains status flags, "initialized by open(2) and possibly
+	// modified by fcntl()" - fcntl(2). statusFlags can be read using atomic
+	// memory operations when it does not need to be synchronized with an
+	// access to asyncHandler.
+	statusFlags uint32
+
+	// asyncHandler handles O_ASYNC signal generation. It is set with the
+	// F_SETOWN or F_SETOWN_EX fcntls. For asyncHandler to be used, O_ASYNC must
+	// also be set by fcntl(2).
+	asyncHandler FileAsync
+
+	// epolls is the set of epollInterests registered for this FileDescription.
+	// epolls is protected by epollMu. It is allocated lazily and may be nil.
+	epollMu sync.Mutex
+	epolls  map[*epollInterest]struct{}
+
+	// vd is the filesystem location at which this FileDescription was opened.
+	// A reference is held on vd. vd is immutable.
+	vd VirtualDentry
+
+	// opts contains options passed to FileDescription.Init(). opts is
+	// immutable.
+	opts FileDescriptionOptions
+
+	// readable is MayReadFileWithOpenFlags(statusFlags). readable is
+	// immutable.
+	//
+	// readable is analogous to Linux's FMODE_READ.
+	readable bool
+
+	// writable is MayWriteFileWithOpenFlags(statusFlags). If writable is true,
+	// the FileDescription holds a write count on vd.mount. writable is
+	// immutable.
+	//
+	// writable is analogous to Linux's FMODE_WRITE.
+	writable bool
+
+	// usedLockBSD is non-zero if BSD locks were used on this
+	// FileDescription, in which case DecRef releases them via
+	// impl.UnlockBSD when the last reference is dropped. usedLockBSD is
+	// read using atomic memory operations.
+	usedLockBSD uint32
+
+	// impl is the FileDescriptionImpl associated with this FileDescription.
+	// impl is immutable. This should be the last field in FileDescription.
+	impl FileDescriptionImpl
+}
+
+// FileDescriptionOptions contains options to FileDescription.Init(). Init
+// stores a copy, which is later available through
+// FileDescription.Options().
+type FileDescriptionOptions struct {
+	// If AllowDirectIO is true, allow O_DIRECT to be set on the file.
+	// Otherwise, SetStatusFlags rejects O_DIRECT with EINVAL.
+	AllowDirectIO bool
+
+	// If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE.
+	DenyPRead bool
+
+	// If DenyPWrite is true, calls to FileDescription.PWrite() return
+	// ESPIPE.
+	DenyPWrite bool
+
+	// If UseDentryMetadata is true, calls to FileDescription methods that
+	// interact with file and filesystem metadata (Stat, SetStat, StatFS,
+	// Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
+	// the corresponding FilesystemImpl methods instead of the corresponding
+	// FileDescriptionImpl methods.
+	//
+	// UseDentryMetadata is intended for file descriptions that are implemented
+	// outside of individual filesystems, such as pipes, sockets, and device
+	// special files. FileDescriptions for which UseDentryMetadata is true may
+	// embed DentryMetadataFileDescriptionImpl to obtain appropriate
+	// implementations of FileDescriptionImpl methods that should not be
+	// called.
+	UseDentryMetadata bool
+}
+
+// FileCreationFlags are the set of flags passed to FileDescription.Init() but
+// omitted from FileDescription.StatusFlags(). Init masks them out of the
+// flags it stores, mirroring how Linux treats file.f_flags in
+// fs/open.c:do_dentry_open.
+const FileCreationFlags = linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC
+
+// Init must be called before first use of fd. On success it takes references
+// on mnt and d and leaves fd with a reference count of one. flags is the
+// initial file description flags, which is usually the full set of flags
+// passed to open(2).
+//
+// If flags request write access, a write count is first obtained on mnt via
+// CheckBeginWrite; its error, if any, is returned and fd is left untouched.
+//
+// Preconditions: opts must not be nil.
+func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error {
+	canWrite := MayWriteFileWithOpenFlags(flags)
+	if canWrite {
+		err := mnt.CheckBeginWrite()
+		if err != nil {
+			return err
+		}
+	}
+
+	fd.refs = 1
+
+	// "File creation flags" are stripped to mirror the behavior of
+	// file.f_flags in fs/open.c:do_dentry_open.
+	fd.statusFlags = flags &^ FileCreationFlags
+	fd.vd = VirtualDentry{mount: mnt, dentry: d}
+	mnt.IncRef()
+	d.IncRef()
+	fd.opts = *opts
+	fd.readable = MayReadFileWithOpenFlags(flags)
+	fd.writable = canWrite
+	fd.impl = impl
+	return nil
+}
+
+// IncRef increments fd's reference count.
+func (fd *FileDescription) IncRef() {
+	atomic.AddInt64(&fd.refs, 1)
+}
+
+// TryIncRef atomically increments fd's reference count and returns true,
+// unless the count has already dropped to zero (or below), in which case it
+// changes nothing and returns false. The increment is retried if the count
+// changes concurrently.
+//
+// TryIncRef does not require that a reference is held on fd.
+func (fd *FileDescription) TryIncRef() bool {
+	for {
+		cur := atomic.LoadInt64(&fd.refs)
+		switch {
+		case cur <= 0:
+			return false
+		case atomic.CompareAndSwapInt64(&fd.refs, cur, cur+1):
+			return true
+		}
+	}
+}
+
+// DecRef decrements fd's reference count. When the count reaches zero, fd is
+// torn down: it is unregistered from all epoll instances, any BSD lock it
+// used is released, the implementation's Release is called, the mount write
+// count (if any) and the reference on fd.vd are dropped, and the async
+// handler is unregistered and cleared.
+//
+// DecRef panics if it is called without holding a reference (i.e. the count
+// would become negative).
+func (fd *FileDescription) DecRef() {
+	if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
+		// Unregister fd from all epoll instances. The set is detached under
+		// epollMu and then processed without it.
+		fd.epollMu.Lock()
+		epolls := fd.epolls
+		fd.epolls = nil
+		fd.epollMu.Unlock()
+		for epi := range epolls {
+			ep := epi.epoll
+			ep.interestMu.Lock()
+			// Check that epi has not been concurrently unregistered by
+			// EpollInstance.DeleteInterest() or EpollInstance.Release().
+			if _, ok := ep.interest[epi.key]; ok {
+				fd.EventUnregister(&epi.waiter)
+				ep.removeLocked(epi)
+			}
+			ep.interestMu.Unlock()
+		}
+
+		// If BSD locks were used, release any lock that it may have acquired.
+		if atomic.LoadUint32(&fd.usedLockBSD) != 0 {
+			fd.impl.UnlockBSD(context.Background(), fd)
+		}
+
+		// Release implementation resources.
+		fd.impl.Release()
+		if fd.writable {
+			// Drop the write count taken by Init via CheckBeginWrite.
+			fd.vd.mount.EndWrite()
+		}
+		fd.vd.DecRef()
+		fd.flagsMu.Lock()
+		// TODO(gvisor.dev/issue/1663): We may need to unregister during save, as we do in VFS1.
+		if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil {
+			fd.asyncHandler.Unregister(fd)
+		}
+		fd.asyncHandler = nil
+		fd.flagsMu.Unlock()
+	} else if refs < 0 {
+		panic("FileDescription.DecRef() called without holding a reference")
+	}
+}
+
+// Refs returns a snapshot of fd's reference count, read atomically. The
+// value is inherently racy and is unsafe to rely on without external
+// synchronization.
+func (fd *FileDescription) Refs() int64 {
+	count := atomic.LoadInt64(&fd.refs)
+	return count
+}
+
+// Mount returns the mount on which fd was opened. No reference is taken on
+// the returned Mount.
+func (fd *FileDescription) Mount() *Mount {
+	mnt := fd.vd.mount
+	return mnt
+}
+
+// Dentry returns the dentry at which fd was opened. No reference is taken on
+// the returned Dentry.
+func (fd *FileDescription) Dentry() *Dentry {
+	dentry := fd.vd.dentry
+	return dentry
+}
+
+// VirtualDentry returns the (mount, dentry) location at which fd was opened.
+// No reference is taken on the returned VirtualDentry.
+func (fd *FileDescription) VirtualDentry() VirtualDentry {
+	location := fd.vd
+	return location
+}
+
+// Options returns a copy of the options that were passed to fd.Init().
+func (fd *FileDescription) Options() FileDescriptionOptions {
+	options := fd.opts
+	return options
+}
+
+// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
+// The flags are read atomically, without taking fd.flagsMu.
+func (fd *FileDescription) StatusFlags() uint32 {
+	flags := atomic.LoadUint32(&fd.statusFlags)
+	return flags
+}
+
+// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
+// Only O_APPEND, O_ASYNC, O_DIRECT, O_NOATIME and O_NONBLOCK can be changed;
+// all other bits of flags are ignored and the existing values kept.
+//
+// Errors:
+//
+// - EPERM if O_APPEND would be changed on a file marked append-only.
+//
+// - EPERM if O_NOATIME would be newly set and creds cannot act as the file's
+// owner (or the owner cannot be determined).
+//
+// - EINVAL if O_DIRECT is requested but fd was not initialized with
+// AllowDirectIO.
+//
+// - Any error returned by fd.Stat while performing the checks above.
+func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error {
+	// Compare Linux's fs/fcntl.c:setfl().
+	oldFlags := fd.StatusFlags()
+	// Linux documents this check as "O_APPEND cannot be cleared if the file is
+	// marked as append-only and the file is open for write", which would make
+	// sense. However, the check as actually implemented seems to be "O_APPEND
+	// cannot be changed if the file is marked as append-only".
+	if (flags^oldFlags)&linux.O_APPEND != 0 {
+		stat, err := fd.Stat(ctx, StatOptions{
+			// There is no mask bit for stx_attributes.
+			Mask: 0,
+			// Linux just reads inode::i_flags directly.
+			Sync: linux.AT_STATX_DONT_SYNC,
+		})
+		if err != nil {
+			return err
+		}
+		if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) {
+			return syserror.EPERM
+		}
+	}
+	if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) {
+		stat, err := fd.Stat(ctx, StatOptions{
+			Mask: linux.STATX_UID,
+			// Linux's inode_owner_or_capable() just reads inode::i_uid
+			// directly.
+			Sync: linux.AT_STATX_DONT_SYNC,
+		})
+		if err != nil {
+			return err
+		}
+		if stat.Mask&linux.STATX_UID == 0 {
+			return syserror.EPERM
+		}
+		if !CanActAsOwner(creds, auth.KUID(stat.UID)) {
+			return syserror.EPERM
+		}
+	}
+	if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO {
+		return syserror.EINVAL
+	}
+	// TODO(jamieliu): FileDescriptionImpl.SetOAsync()?
+	const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK
+	fd.flagsMu.Lock()
+	if fd.asyncHandler != nil {
+		// Use fd.statusFlags instead of oldFlags, which may have become outdated,
+		// to avoid double registering/unregistering.
+		if fd.statusFlags&linux.O_ASYNC == 0 && flags&linux.O_ASYNC != 0 {
+			fd.asyncHandler.Register(fd)
+		} else if fd.statusFlags&linux.O_ASYNC != 0 && flags&linux.O_ASYNC == 0 {
+			fd.asyncHandler.Unregister(fd)
+		}
+	}
+	// StatusFlags() reads statusFlags atomically without holding flagsMu, so
+	// the write must be atomic as well; a plain store would race with those
+	// readers. Holding flagsMu still serializes writers with each other and
+	// with asyncHandler updates.
+	atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags))
+	fd.flagsMu.Unlock()
+	return nil
+}
+
+// IsReadable reports whether fd was opened for reading, as determined once
+// by Init from the open flags.
+func (fd *FileDescription) IsReadable() bool {
+	readable := fd.readable
+	return readable
+}
+
+// IsWritable reports whether fd was opened for writing, as determined once
+// by Init from the open flags.
+func (fd *FileDescription) IsWritable() bool {
+	writable := fd.writable
+	return writable
+}
+
+// Impl returns the FileDescriptionImpl that was supplied to fd.Init().
+func (fd *FileDescription) Impl() FileDescriptionImpl {
+	impl := fd.impl
+	return impl
+}
+
+// FileDescriptionImpl contains implementation details for a FileDescription.
+// Implementations of FileDescriptionImpl should contain their associated
+// FileDescription by value as their first field.
+//
+// For all functions that return linux.Statx, Statx.Uid and Statx.Gid will
+// be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and
+// auth.KGID respectively).
+//
+// All methods may return errors not specified.
+//
+// FileDescriptionImpl is analogous to Linux's struct file_operations.
+type FileDescriptionImpl interface {
+	// Release is called when the associated FileDescription reaches zero
+	// references.
+	Release()
+
+	// OnClose is called when a file descriptor representing the
+	// FileDescription is closed. Note that returning a non-nil error does not
+	// prevent the file descriptor from being closed.
+	OnClose(ctx context.Context) error
+
+	// Stat returns metadata for the file represented by the FileDescription.
+	Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)
+
+	// SetStat updates metadata for the file represented by the
+	// FileDescription. Implementations are responsible for checking if the
+	// operation can be performed (see vfs.CheckSetStat() for common checks).
+	SetStat(ctx context.Context, opts SetStatOptions) error
+
+	// StatFS returns metadata for the filesystem containing the file
+	// represented by the FileDescription.
+	StatFS(ctx context.Context) (linux.Statfs, error)
+
+	// Allocate grows file represented by FileDescription to offset + length bytes.
+	// Only mode == 0 is supported currently.
+	Allocate(ctx context.Context, mode, offset, length uint64) error
+
+	// waiter.Waitable methods may be used to poll for I/O events.
+	waiter.Waitable
+
+	// PRead reads from the file into dst, starting at the given offset, and
+	// returns the number of bytes read. PRead is permitted to return partial
+	// reads with a nil error.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
+	//
+	// Preconditions: The FileDescription was opened for reading.
+	// FileDescriptionOptions.DenyPRead == false.
+	PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
+
+	// Read is similar to PRead, but does not specify an offset.
+	//
+	// For files with an implicit FileDescription offset (e.g. regular files),
+	// Read begins at the FileDescription offset, and advances the offset by
+	// the number of bytes read; note that POSIX 2.9.7 "Thread Interactions
+	// with Regular File Operations" requires that all operations that may
+	// mutate the FileDescription offset are serialized.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP.
+	//
+	// Preconditions: The FileDescription was opened for reading.
+	Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error)
+
+	// PWrite writes src to the file, starting at the given offset, and returns
+	// the number of bytes written. PWrite is permitted to return partial
+	// writes with a nil error.
+	//
+	// As in Linux (but not POSIX), if O_APPEND is in effect for the
+	// FileDescription, PWrite should ignore the offset and append data to the
+	// end of the file.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, PWrite returns
+	// EOPNOTSUPP.
+	//
+	// Preconditions: The FileDescription was opened for writing.
+	// FileDescriptionOptions.DenyPWrite == false.
+	PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
+
+	// Write is similar to PWrite, but does not specify an offset, which is
+	// implied as for Read.
+	//
+	// Write is a FileDescriptionImpl method, instead of a wrapper around
+	// PWrite that uses a FileDescription offset, to make it possible for
+	// remote filesystems to implement O_APPEND correctly (i.e. atomically with
+	// respect to writers outside the scope of VFS).
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP.
+	//
+	// Preconditions: The FileDescription was opened for writing.
+	Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error)
+
+	// IterDirents invokes cb on each entry in the directory represented by the
+	// FileDescription. If IterDirents has been called since the last call to
+	// Seek, it continues iteration from the end of the last call.
+	IterDirents(ctx context.Context, cb IterDirentsCallback) error
+
+	// Seek changes the FileDescription offset (assuming one exists) and
+	// returns its new value.
+	//
+	// For directories, if whence == SEEK_SET and offset == 0, the caller is
+	// rewinddir(), such that Seek "shall also cause the directory stream to
+	// refer to the current state of the corresponding directory" -
+	// POSIX.1-2017.
+	Seek(ctx context.Context, offset int64, whence int32) (int64, error)
+
+	// Sync requests that cached state associated with the file represented by
+	// the FileDescription is synchronized with persistent storage, and blocks
+	// until this is complete.
+	Sync(ctx context.Context) error
+
+	// ConfigureMMap mutates opts to implement mmap(2) for the file. Most
+	// implementations that support memory mapping can call
+	// GenericConfigureMMap with the appropriate memmap.Mappable.
+	ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error
+
+	// Ioctl implements the ioctl(2) syscall.
+	Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
+
+	// Listxattr returns all extended attribute names for the file.
+	Listxattr(ctx context.Context, size uint64) ([]string, error)
+
+	// Getxattr returns the value associated with the given extended attribute
+	// for the file.
+	Getxattr(ctx context.Context, opts GetxattrOptions) (string, error)
+
+	// Setxattr changes the value associated with the given extended attribute
+	// for the file.
+	Setxattr(ctx context.Context, opts SetxattrOptions) error
+
+	// Removexattr removes the given extended attribute from the file.
+	Removexattr(ctx context.Context, name string) error
+
+	// LockBSD tries to acquire a BSD-style advisory file lock.
+	LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error
+
+	// UnlockBSD releases a BSD-style advisory file lock.
+	UnlockBSD(ctx context.Context, uid lock.UniqueID) error
+
+	// LockPOSIX tries to acquire a POSIX-style advisory file lock.
+	LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, length uint64, whence int16, block lock.Blocker) error
+
+	// UnlockPOSIX releases a POSIX-style advisory file lock.
+	UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, length uint64, whence int16) error
+}
+
+// Dirent is one directory entry, mirroring Linux's struct linux_dirent64.
+type Dirent struct {
+	// Name is the entry's filename.
+	Name string
+
+	// Type is the entry's file type, a linux.DT_* constant.
+	Type uint8
+
+	// Ino is the entry's inode number.
+	Ino uint64
+
+	// NextOff is the offset of the *next* Dirent in the directory; that is,
+	// FileDescription.Seek(NextOff, SEEK_SET) (as called by seekdir(3)) will
+	// cause the next call to FileDescription.IterDirents() to yield the next
+	// Dirent. (The offset of the first Dirent in a directory is always 0.)
+	NextOff int64
+}
+
+// IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents.
+type IterDirentsCallback interface {
+	// Handle handles the given iterated Dirent. If Handle returns a non-nil
+	// error, FileDescriptionImpl.IterDirents must stop iteration and return
+	// that error; the next call to FileDescriptionImpl.IterDirents should
+	// restart with the same Dirent (i.e. the rejected entry is not skipped).
+	Handle(dirent Dirent) error
+}
+
+// IterDirentsCallbackFunc adapts a plain function with the semantics of
+// IterDirentsCallback.Handle so that it can be used as an IterDirentsCallback.
+type IterDirentsCallbackFunc func(dirent Dirent) error
+
+// Handle implements IterDirentsCallback.Handle by calling f with dirent.
+func (f IterDirentsCallbackFunc) Handle(dirent Dirent) error {
+	return f(dirent)
+}
+
+// OnClose is called when a file descriptor representing the FileDescription is
+// closed; it forwards to fd.impl.OnClose. Returning a non-nil error should not
+// prevent the file descriptor from being closed.
+func (fd *FileDescription) OnClose(ctx context.Context) error {
+	return fd.impl.OnClose(ctx)
+}
+
+// Stat returns metadata for the file represented by fd.
+//
+// When fd.opts.UseDentryMetadata is set, the request is answered by the
+// filesystem's StatAt on fd.vd instead of by the FileDescriptionImpl.
+func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+	if !fd.opts.UseDentryMetadata {
+		return fd.impl.Stat(ctx, opts)
+	}
+	// The path operation both starts at and is rooted at fd's own location.
+	vfsObj := fd.vd.mount.vfs
+	pop := PathOperation{
+		Root:  fd.vd,
+		Start: fd.vd,
+	}
+	rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &pop)
+	statx, statErr := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
+	vfsObj.putResolvingPath(rp)
+	return statx, statErr
+}
+
+// SetStat updates metadata for the file represented by fd.
+//
+// When fd.opts.UseDentryMetadata is set, the update is applied through the
+// filesystem's SetStatAt on fd.vd instead of through the FileDescriptionImpl.
+func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error {
+	if !fd.opts.UseDentryMetadata {
+		return fd.impl.SetStat(ctx, opts)
+	}
+	// The path operation both starts at and is rooted at fd's own location.
+	vfsObj := fd.vd.mount.vfs
+	pop := PathOperation{
+		Root:  fd.vd,
+		Start: fd.vd,
+	}
+	rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &pop)
+	setErr := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts)
+	vfsObj.putResolvingPath(rp)
+	return setErr
+}
+
+// StatFS returns metadata for the filesystem containing the file represented
+// by fd.
+//
+// When fd.opts.UseDentryMetadata is set, the request is answered by the
+// filesystem's StatFSAt on fd.vd instead of by the FileDescriptionImpl.
+func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+	if !fd.opts.UseDentryMetadata {
+		return fd.impl.StatFS(ctx)
+	}
+	// The path operation both starts at and is rooted at fd's own location.
+	vfsObj := fd.vd.mount.vfs
+	pop := PathOperation{
+		Root:  fd.vd,
+		Start: fd.vd,
+	}
+	rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &pop)
+	fsStat, fsErr := fd.vd.mount.fs.impl.StatFSAt(ctx, rp)
+	vfsObj.putResolvingPath(rp)
+	return fsStat, fsErr
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+//
+// It returns fd's I/O readiness as reported by fd.impl for the given mask.
+func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return fd.impl.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+//
+// It registers e with fd.impl for I/O readiness events in mask.
+func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fd.impl.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+//
+// It unregisters e from fd.impl's I/O readiness events.
+func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
+	fd.impl.EventUnregister(e)
+}
+
+// PRead reads from the file represented by fd into dst, starting at the given
+// offset, and returns the number of bytes read. PRead is permitted to return
+// partial reads with a nil error.
+//
+// It fails with ESPIPE if fd.opts.DenyPRead is set, and otherwise with EBADF
+// if fd was not opened for reading; the ESPIPE check takes precedence.
+func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	switch {
+	case fd.opts.DenyPRead:
+		return 0, syserror.ESPIPE
+	case !fd.readable:
+		return 0, syserror.EBADF
+	}
+	return fd.impl.PRead(ctx, dst, offset, opts)
+}
+
+// Read is similar to PRead, but does not specify an offset. It fails with
+// EBADF if fd was not opened for reading.
+func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+	if fd.readable {
+		return fd.impl.Read(ctx, dst, opts)
+	}
+	return 0, syserror.EBADF
+}
+
+// PWrite writes src to the file represented by fd, starting at the given
+// offset, and returns the number of bytes written. PWrite is permitted to
+// return partial writes with a nil error.
+//
+// It fails with ESPIPE if fd.opts.DenyPWrite is set, and otherwise with EBADF
+// if fd was not opened for writing; the ESPIPE check takes precedence.
+func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	switch {
+	case fd.opts.DenyPWrite:
+		return 0, syserror.ESPIPE
+	case !fd.writable:
+		return 0, syserror.EBADF
+	}
+	return fd.impl.PWrite(ctx, src, offset, opts)
+}
+
+// Write is similar to PWrite, but does not specify an offset. It fails with
+// EBADF if fd was not opened for writing.
+func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+	if fd.writable {
+		return fd.impl.Write(ctx, src, opts)
+	}
+	return 0, syserror.EBADF
+}
+
+// IterDirents invokes cb on each entry in the directory represented by fd. If
+// IterDirents has been called since the last call to Seek, it continues
+// iteration from the end of the last call. The call is forwarded to fd.impl.
+func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error {
+	return fd.impl.IterDirents(ctx, cb)
+}
+
+// Seek changes fd's offset (assuming one exists) and returns its new value.
+func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return fd.impl.Seek(ctx, offset, whence) // offset and whence are validated by the impl, not here
+}
+
+// Sync has the semantics of fsync(2); it is forwarded to fd.impl.Sync.
+func (fd *FileDescription) Sync(ctx context.Context) error {
+	return fd.impl.Sync(ctx)
+}
+
+// ConfigureMMap mutates opts to implement mmap(2) for the file represented by
+// fd, by delegating to fd.impl.ConfigureMMap.
+func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return fd.impl.ConfigureMMap(ctx, opts)
+}
+
+// Ioctl implements the ioctl(2) syscall by delegating to fd.impl.Ioctl.
+func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	return fd.impl.Ioctl(ctx, uio, args)
+}
+
+// Listxattr returns all extended attribute names for the file represented by
+// fd.
+//
+// If the size of the list (including a NUL terminating byte after every entry)
+// would exceed size, ERANGE may be returned. Note that implementations
+// are free to ignore size entirely and return without error). In all cases,
+// if size is 0, the list should be returned without error, regardless of size.
+func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size)
+		vfsObj.putResolvingPath(rp)
+		return names, err
+	}
+	names, err := fd.impl.Listxattr(ctx, size)
+	if err == syserror.ENOTSUP {
+		// Linux doesn't actually return ENOTSUP in this case; instead,
+		// fs/xattr.c:vfs_listxattr() falls back to allowing the security
+		// subsystem to return security extended attributes, which by default
+		// don't exist.
+		return nil, nil
+	}
+	return names, err
+}
+
+// Getxattr returns the value associated with the given extended attribute for
+// the file represented by fd.
+//
+// If the size of the return value exceeds opts.Size, ERANGE may be returned
+// (note that implementations are free to ignore opts.Size entirely and return
+// without error). In all cases, if opts.Size is 0, the value should be
+// returned without error, regardless of size.
+//
+// When fd.opts.UseDentryMetadata is set, the lookup goes through the
+// filesystem's GetxattrAt on fd.vd instead of the FileDescriptionImpl. opts is
+// passed on by value in either case.
+func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) {
+	if !fd.opts.UseDentryMetadata {
+		return fd.impl.Getxattr(ctx, *opts)
+	}
+	// The path operation both starts at and is rooted at fd's own location.
+	vfsObj := fd.vd.mount.vfs
+	pop := PathOperation{
+		Root:  fd.vd,
+		Start: fd.vd,
+	}
+	rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &pop)
+	value, getErr := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
+	vfsObj.putResolvingPath(rp)
+	return value, getErr
+}
+
+// Setxattr changes the value associated with the given extended attribute for
+// the file represented by fd.
+//
+// When fd.opts.UseDentryMetadata is set, the change goes through the
+// filesystem's SetxattrAt on fd.vd instead of the FileDescriptionImpl. opts is
+// passed on by value in either case.
+func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error {
+	if !fd.opts.UseDentryMetadata {
+		return fd.impl.Setxattr(ctx, *opts)
+	}
+	// The path operation both starts at and is rooted at fd's own location.
+	vfsObj := fd.vd.mount.vfs
+	pop := PathOperation{
+		Root:  fd.vd,
+		Start: fd.vd,
+	}
+	rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &pop)
+	setErr := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+	vfsObj.putResolvingPath(rp)
+	return setErr
+}
+
+// Removexattr removes the given extended attribute from the file represented
+// by fd.
+//
+// When fd.opts.UseDentryMetadata is set, the removal goes through the
+// filesystem's RemovexattrAt on fd.vd instead of the FileDescriptionImpl.
+func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
+	if !fd.opts.UseDentryMetadata {
+		return fd.impl.Removexattr(ctx, name)
+	}
+	// The path operation both starts at and is rooted at fd's own location.
+	vfsObj := fd.vd.mount.vfs
+	pop := PathOperation{
+		Root:  fd.vd,
+		Start: fd.vd,
+	}
+	rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &pop)
+	rmErr := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+	vfsObj.putResolvingPath(rp)
+	return rmErr
+}
+
+// SyncFS instructs the filesystem containing fd to execute the semantics of
+// syncfs(2), by calling Sync on the filesystem that fd's mount belongs to.
+func (fd *FileDescription) SyncFS(ctx context.Context) error {
+	return fd.vd.mount.fs.impl.Sync(ctx)
+}
+
+// MappedName implements memmap.MappingIdentity.MappedName.
+//
+// The name is fd's pathname relative to the root taken from ctx, as produced
+// by PathnameWithDeleted.
+func (fd *FileDescription) MappedName(ctx context.Context) string {
+	root := RootFromContext(ctx)
+	// The lookup error is deliberately dropped: whatever string came back is
+	// returned as-is.
+	name, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, root, fd.vd)
+	if !root.Ok() {
+		return name
+	}
+	// Drop the reference on the root obtained from ctx.
+	root.DecRef()
+	return name
+}
+
+// DeviceID implements memmap.MappingIdentity.DeviceID.
+//
+// It returns 0 if fd cannot be stat'ed.
+func (fd *FileDescription) DeviceID() uint64 {
+	statOpts := StatOptions{
+		// There is no STATX_DEV; we assume that Stat will return it if it's
+		// available regardless of mask.
+		Mask: 0,
+		// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_sb->s_dev
+		// directly.
+		Sync: linux.AT_STATX_DONT_SYNC,
+	}
+	statx, err := fd.Stat(context.Background(), statOpts)
+	if err == nil {
+		// The major number is narrowed to 16 bits before being combined with
+		// the minor number.
+		return uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor))
+	}
+	return 0
+}
+
+// InodeID implements memmap.MappingIdentity.InodeID.
+//
+// It returns 0 if fd cannot be stat'ed or if the result does not carry an
+// inode number.
+func (fd *FileDescription) InodeID() uint64 {
+	statOpts := StatOptions{
+		Mask: linux.STATX_INO,
+		// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly.
+		Sync: linux.AT_STATX_DONT_SYNC,
+	}
+	statx, err := fd.Stat(context.Background(), statOpts)
+	if err != nil {
+		return 0
+	}
+	if statx.Mask&linux.STATX_INO == 0 {
+		// Stat succeeded but did not fill in Ino.
+		return 0
+	}
+	return statx.Ino
+}
+
+// Msync implements memmap.MappingIdentity.Msync.
+func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
+	return fd.Sync(ctx) // mr is ignored; this is a plain fd.Sync
+}
+
+// LockBSD tries to acquire a BSD-style advisory file lock owned by fd itself.
+func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, blocker lock.Blocker) error {
+	atomic.StoreUint32(&fd.usedLockBSD, 1) // set before the attempt, whether or not it succeeds
+	return fd.impl.LockBSD(ctx, fd, lockType, blocker) // fd is passed as the lock.UniqueID
+}
+
+// UnlockBSD releases a BSD-style advisory file lock owned by fd itself.
+func (fd *FileDescription) UnlockBSD(ctx context.Context) error {
+	return fd.impl.UnlockBSD(ctx, fd) // fd is passed as the lock.UniqueID
+}
+
+// LockPOSIX tries to acquire a POSIX-style file range lock owned by uid.
+func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, end uint64, whence int16, block lock.Blocker) error {
+	return fd.impl.LockPOSIX(ctx, uid, t, start, end, whence, block) // NOTE(review): the impl interface names this argument length, not end - confirm
+}
+
+// UnlockPOSIX releases a POSIX-style file range lock owned by uid.
+func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, end uint64, whence int16) error {
+	return fd.impl.UnlockPOSIX(ctx, uid, start, end, whence) // NOTE(review): the impl interface names this argument length, not end - confirm
+}
+
+// A FileAsync sends signals to its owner when a registered w is ready for IO.
+// This is only implemented by pkg/sentry/fasync:FileAsync, but we unfortunately
+// need this interface to avoid circular dependencies.
+type FileAsync interface {
+	Register(w waiter.Waitable)   // start watching w
+	Unregister(w waiter.Waitable) // stop watching w
+}
+
+// AsyncHandler returns the FileAsync for fd, or nil if none has been set. The
+// field is read under fd.flagsMu.
+func (fd *FileDescription) AsyncHandler() FileAsync {
+	fd.flagsMu.Lock()
+	handler := fd.asyncHandler
+	fd.flagsMu.Unlock()
+	return handler
+}
+
+// SetAsyncHandler installs the FileAsync produced by newHandler if fd does not
+// have one yet, and returns fd's FileAsync either way. newHandler is only
+// called when a handler actually needs to be created, and it is called with
+// fd.flagsMu held.
+func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsync {
+	fd.flagsMu.Lock()
+	defer fd.flagsMu.Unlock()
+	if fd.asyncHandler != nil {
+		return fd.asyncHandler
+	}
+	handler := newHandler()
+	fd.asyncHandler = handler
+	// If O_ASYNC was already enabled before a handler existed, register now.
+	if fd.statusFlags&linux.O_ASYNC != 0 {
+		handler.Register(fd)
+	}
+	return handler
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
new file mode 100644
index 000000000..6b8b4ad49
--- /dev/null
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -0,0 +1,428 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"bytes"
+	"io"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// The following design pattern is strongly recommended for filesystem
+// implementations to adopt:
+//   - Have a local fileDescription struct (containing FileDescription) which
+//     embeds FileDescriptionDefaultImpl and overrides the default methods
+//     which are common to all fd implementations for that filesystem like
+//     StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc.
+//   - This should be embedded in all file description implementations as the
+//     first field by value.
+//   - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl.
+
+// FileDescriptionDefaultImpl may be embedded by implementations of
+// FileDescriptionImpl to obtain default behavior analogous to Linux's for all
+// methods except Release, Stat, SetStat, and the Lock*/Unlock* family.
+type FileDescriptionDefaultImpl struct{}
+
+// OnClose implements FileDescriptionImpl.OnClose analogously to
+// file_operations::flush == NULL in Linux: it does nothing and succeeds.
+func (FileDescriptionDefaultImpl) OnClose(ctx context.Context) error {
+	return nil
+}
+
+// StatFS implements FileDescriptionImpl.StatFS analogously to
+// super_operations::statfs == NULL in Linux: it fails with ENOSYS.
+func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, error) {
+	return linux.Statfs{}, syserror.ENOSYS
+}
+
+// Allocate implements FileDescriptionImpl.Allocate analogously to fallocate
+// called on a file type that Linux does not support it for (ENODEV).
+func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	return syserror.ENODEV
+}
+
+// Readiness implements waiter.Waitable.Readiness analogously to
+// file_operations::poll == NULL in Linux: always readable and writable.
+func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask {
+	// include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK
+	return waiter.EventIn | waiter.EventOut // mask is not consulted
+}
+
+// EventRegister implements waiter.Waitable.EventRegister analogously to
+// file_operations::poll == NULL in Linux: it is a no-op.
+func (FileDescriptionDefaultImpl) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister analogously to
+// file_operations::poll == NULL in Linux: it is a no-op.
+func (FileDescriptionDefaultImpl) EventUnregister(e *waiter.Entry) {
+}
+
+// PRead implements FileDescriptionImpl.PRead analogously to
+// file_operations::read == file_operations::read_iter == NULL in Linux (EINVAL).
+func (FileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	return 0, syserror.EINVAL
+}
+
+// Read implements FileDescriptionImpl.Read analogously to
+// file_operations::read == file_operations::read_iter == NULL in Linux (EINVAL).
+func (FileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+	return 0, syserror.EINVAL
+}
+
+// PWrite implements FileDescriptionImpl.PWrite analogously to
+// file_operations::write == file_operations::write_iter == NULL in Linux (EINVAL).
+func (FileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	return 0, syserror.EINVAL
+}
+
+// Write implements FileDescriptionImpl.Write analogously to
+// file_operations::write == file_operations::write_iter == NULL in Linux (EINVAL).
+func (FileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+	return 0, syserror.EINVAL
+}
+
+// IterDirents implements FileDescriptionImpl.IterDirents analogously to
+// file_operations::iterate == file_operations::iterate_shared == NULL in
+// Linux: it fails with ENOTDIR and never invokes cb.
+func (FileDescriptionDefaultImpl) IterDirents(ctx context.Context, cb IterDirentsCallback) error {
+	return syserror.ENOTDIR
+}
+
+// Seek implements FileDescriptionImpl.Seek analogously to
+// file_operations::llseek == NULL in Linux: it fails with ESPIPE.
+func (FileDescriptionDefaultImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Sync implements FileDescriptionImpl.Sync analogously to
+// file_operations::fsync == NULL in Linux: it fails with EINVAL.
+func (FileDescriptionDefaultImpl) Sync(ctx context.Context) error {
+	return syserror.EINVAL
+}
+
+// ConfigureMMap implements FileDescriptionImpl.ConfigureMMap analogously to
+// file_operations::mmap == NULL in Linux: it fails with ENODEV; opts is untouched.
+func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return syserror.ENODEV
+}
+
+// Ioctl implements FileDescriptionImpl.Ioctl analogously to
+// file_operations::unlocked_ioctl == NULL in Linux: it fails with ENOTTY.
+func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	return 0, syserror.ENOTTY
+}
+
+// Listxattr implements FileDescriptionImpl.Listxattr analogously to
+// inode_operations::listxattr == NULL in Linux.
+func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+	// This isn't exactly accurate; FileDescription.Listxattr turns ENOTSUP from here into an empty list.
+	return nil, syserror.ENOTSUP
+}
+
+// Getxattr implements FileDescriptionImpl.Getxattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux: it fails with ENOTSUP.
+func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) {
+	return "", syserror.ENOTSUP
+}
+
+// Setxattr implements FileDescriptionImpl.Setxattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux: it fails with ENOTSUP.
+func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+	return syserror.ENOTSUP
+}
+
+// Removexattr implements FileDescriptionImpl.Removexattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux: it fails with ENOTSUP.
+func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error {
+	return syserror.ENOTSUP
+}
+
+// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
+// FileDescriptionImpl that always represent directories to obtain
+// implementations of Allocate and the read/write methods that return EISDIR.
+type DirectoryFileDescriptionDefaultImpl struct{}
+
+// Allocate implements FileDescriptionImpl.Allocate by returning EISDIR.
+func (DirectoryFileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	return syserror.EISDIR
+}
+
+// PRead implements FileDescriptionImpl.PRead by returning EISDIR.
+func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	return 0, syserror.EISDIR
+}
+
+// Read implements FileDescriptionImpl.Read by returning EISDIR.
+func (DirectoryFileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+	return 0, syserror.EISDIR
+}
+
+// PWrite implements FileDescriptionImpl.PWrite by returning EISDIR.
+func (DirectoryFileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	return 0, syserror.EISDIR
+}
+
+// Write implements FileDescriptionImpl.Write by returning EISDIR.
+func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+	return 0, syserror.EISDIR
+}
+
+// DentryMetadataFileDescriptionImpl may be embedded by FileDescriptionImpls
+// for which FileDescriptionOptions.UseDentryMetadata is true to obtain Stat
+// and SetStat stubs that panic (FileDescription uses StatAt/SetStatAt instead).
+type DentryMetadataFileDescriptionImpl struct{}
+
+// Stat implements FileDescriptionImpl.Stat. It always panics.
+func (DentryMetadataFileDescriptionImpl) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+	panic("illegal call to DentryMetadataFileDescriptionImpl.Stat")
+}
+
+// SetStat implements FileDescriptionImpl.SetStat. It always panics.
+func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetStatOptions) error {
+	panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat")
+}
+
+// DynamicBytesSource represents a data source for a
+// DynamicBytesFileDescriptionImpl.
+type DynamicBytesSource interface {
+	// Generate writes the file's contents to buf; callers discard buf on error.
+	Generate(ctx context.Context, buf *bytes.Buffer) error
+}
+
+// StaticData implements DynamicBytesSource over a static string.
+type StaticData struct {
+	Data string // written verbatim to the buffer by Generate
+}
+
+// Generate implements DynamicBytesSource.Generate by appending s.Data to buf.
+func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString(s.Data)
+	return nil
+}
+
+// WritableDynamicBytesSource extends DynamicBytesSource to allow writes to the
+// underlying source.
+type WritableDynamicBytesSource interface {
+	DynamicBytesSource
+
+	// Write applies a write of src at offset to the source and returns the byte count.
+	Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error)
+}
+
+// DynamicBytesFileDescriptionImpl may be embedded by implementations of
+// FileDescriptionImpl that represent regular files whose contents are backed
+// by a bytes.Buffer that is regenerated when necessary, consistent with Linux's
+// fs/seq_file.c:single_open(); writes need a WritableDynamicBytesSource.
+//
+// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first
+// use.
+type DynamicBytesFileDescriptionImpl struct {
+	data     DynamicBytesSource // immutable
+	mu       sync.Mutex         // protects the following fields
+	buf      bytes.Buffer       // cached output of data.Generate
+	off      int64              // offset used and advanced by Read and Write
+	lastRead int64 // offset at which the last Read, PRead, or Seek ended
+}
+
+// SetDataSource must be called exactly once on fd before first use.
+func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) {
+	fd.data = data // assigned without taking fd.mu
+}
+
+// preadLocked copies the generated file contents, starting at offset, into
+// dst and returns the number of bytes copied. The cached buffer is
+// regenerated first if it is empty or if offset differs from where the
+// previous read or seek ended. It returns EINVAL for a negative offset and
+// io.EOF if offset is at or past the end of the generated contents. opts is
+// currently unused.
+//
+// Preconditions: fd.mu must be locked.
+func (fd *DynamicBytesFileDescriptionImpl) preadLocked(ctx context.Context, dst usermem.IOSequence, offset int64, opts *ReadOptions) (int64, error) {
+	if offset < 0 {
+		// A negative offset would make the bs[offset:] slice expression below
+		// panic; reject it the same way Seek does, without touching any state.
+		return 0, syserror.EINVAL
+	}
+	// Regenerate the buffer if it's empty, or before pread() at a new offset.
+	// Compare fs/seq_file.c:seq_read() => traverse().
+	switch {
+	case offset != fd.lastRead:
+		fd.buf.Reset()
+		fallthrough
+	case fd.buf.Len() == 0:
+		if err := fd.data.Generate(ctx, &fd.buf); err != nil {
+			// Drop whatever was partially generated.
+			fd.buf.Reset()
+			// fd.off is not updated in this case.
+			fd.lastRead = 0
+			return 0, err
+		}
+	}
+	bs := fd.buf.Bytes()
+	if offset >= int64(len(bs)) {
+		return 0, io.EOF
+	}
+	n, err := dst.CopyOut(ctx, bs[offset:])
+	// Remember where this read ended so that a sequential follow-up read can
+	// reuse the cached buffer.
+	fd.lastRead = offset + int64(n)
+	return int64(n), err
+}
+
+// PRead implements FileDescriptionImpl.PRead. It reads at the given offset
+// under fd.mu and leaves fd.off unchanged.
+func (fd *DynamicBytesFileDescriptionImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	fd.mu.Lock()
+	copied, readErr := fd.preadLocked(ctx, dst, offset, &opts)
+	fd.mu.Unlock()
+	return copied, readErr
+}
+
+// Read implements FileDescriptionImpl.Read. It reads at fd.off under fd.mu
+// and advances fd.off by the number of bytes read, even when an error is
+// returned alongside them.
+func (fd *DynamicBytesFileDescriptionImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+	fd.mu.Lock()
+	copied, readErr := fd.preadLocked(ctx, dst, fd.off, &opts)
+	fd.off += copied
+	fd.mu.Unlock()
+	return copied, readErr
+}
+
+// Seek implements FileDescriptionImpl.Seek. Only SEEK_SET and SEEK_CUR are
+// accepted, and the resulting offset must be non-negative; anything else
+// yields EINVAL. Seeking anywhere other than where the last read or seek
+// ended regenerates the contents immediately.
+func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	target := offset
+	if whence == linux.SEEK_CUR {
+		target += fd.off
+	} else if whence != linux.SEEK_SET {
+		// fs/seq_file:seq_lseek() rejects SEEK_END etc.
+		return 0, syserror.EINVAL
+	}
+	if target < 0 {
+		return 0, syserror.EINVAL
+	}
+	if target == fd.lastRead {
+		// The cached buffer is still positioned correctly; just move fd.off.
+		fd.off = target
+		return target, nil
+	}
+
+	// Regenerate the file's contents immediately. Compare
+	// fs/seq_file.c:seq_lseek() => traverse().
+	fd.buf.Reset()
+	if err := fd.data.Generate(ctx, &fd.buf); err != nil {
+		fd.buf.Reset()
+		fd.off, fd.lastRead = 0, 0
+		return 0, err
+	}
+	fd.lastRead = target
+	fd.off = target
+	return target, nil
+}
+
+// Preconditions: fd.mu must be locked.
+func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+	limit, err := CheckLimit(ctx, offset, src.NumBytes())
+	if err != nil {
+		return 0, err
+	}
+	src = src.TakeFirst64(limit)
+
+	writable, ok := fd.data.(WritableDynamicBytesSource)
+	if !ok {
+		return 0, syserror.EIO
+	}
+	n, err := writable.Write(ctx, src, offset)
+	if err != nil {
+		return 0, err
+	}
+
+	// Invalidate cached data that might exist prior to this call.
+	fd.buf.Reset()
+	return n, nil
+}
+
+// PWrite implements FileDescriptionImpl.PWrite. It writes at the given offset
+// under fd.mu and leaves fd.off unchanged.
+func (fd *DynamicBytesFileDescriptionImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	fd.mu.Lock()
+	written, writeErr := fd.pwriteLocked(ctx, src, offset, opts)
+	fd.mu.Unlock()
+	return written, writeErr
+}
+
+// Write implements FileDescriptionImpl.Write. It writes at fd.off under fd.mu
+// and advances fd.off by the number of bytes written.
+func (fd *DynamicBytesFileDescriptionImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+	fd.mu.Lock()
+	written, writeErr := fd.pwriteLocked(ctx, src, fd.off, opts)
+	fd.off += written
+	fd.mu.Unlock()
+	return written, writeErr
+}
+
+// GenericConfigureMMap may be used by most implementations of
+// FileDescriptionImpl.ConfigureMMap. It takes a reference on fd.
+func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error {
+	opts.Mappable = m
+	opts.MappingIdentity = fd
+	fd.IncRef() // presumably the reference held through opts.MappingIdentity - confirm
+	return nil
+}
+
+// LockFD implements FileDescriptionImpl.LockBSD and UnlockBSD on top of a
+// FileLocks. Caller must call Init() before any lock method is used.
+type LockFD struct {
+	locks *FileLocks // set by Init
+}
+
+// Init sets the FileLocks that fd's lock methods operate on.
+func (fd *LockFD) Init(locks *FileLocks) {
+	fd.locks = locks
+}
+
+// Locks returns the FileLocks passed to Init (nil if Init was not called).
+func (fd *LockFD) Locks() *FileLocks {
+	return fd.locks
+}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD using fd.locks.
+func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	return fd.locks.LockBSD(uid, t, block) // ctx is not used
+}
+
+// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD; it always returns nil.
+func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
+	fd.locks.UnlockBSD(uid)
+	return nil
+}
+
+// NoLockFD implements the Lock*/Unlock* portion of the FileDescriptionImpl
+// interface by returning ENOLCK from every method.
+type NoLockFD struct{}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD by returning ENOLCK.
+func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	return syserror.ENOLCK
+}
+
+// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD by returning ENOLCK.
+func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
+	return syserror.ENOLCK
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX by returning ENOLCK.
+func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return syserror.ENOLCK
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX by returning ENOLCK.
+func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return syserror.ENOLCK
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
new file mode 100644
index 000000000..3b7e1c273
--- /dev/null
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -0,0 +1,224 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"sync/atomic"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fileDescription is the common fd struct which a filesystem implementation
+// embeds in all of its file description implementations as required.
+type fileDescription struct {
+	vfsfd FileDescription
+	FileDescriptionDefaultImpl
+	NoLockFD // every lock operation on the test FDs returns ENOLCK
+}
+
+// genCount contains the number of times its DynamicBytesSource.Generate()
+// implementation has been called.
+type genCount struct {
+	count uint64 // accessed using atomic memory ops
+}
+
+// Generate implements DynamicBytesSource.Generate. It always returns nil.
+func (g *genCount) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%d", atomic.AddUint64(&g.count, 1)) // writes the incremented count in decimal
+	return nil
+}
+
+type storeData struct {
+	data string // content emitted by Generate and replaced by Write
+}
+
+var _ WritableDynamicBytesSource = (*storeData)(nil)
+
+// Generate implements DynamicBytesSource.Generate by writing d.data to buf.
+func (d *storeData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString(d.data)
+	return nil
+}
+
+// Write implements WritableDynamicBytesSource.Write by replacing d.data with the bytes copied in from src.
+func (d *storeData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	buf := make([]byte, src.NumBytes())
+	n, err := src.CopyIn(ctx, buf)
+	if err != nil {
+		return 0, err
+	}
+
+	d.data = string(buf[:n]) // offset is not consulted: the whole stored string is replaced
+	return 0, nil            // NOTE(review): reports 0 bytes written although n bytes were stored; confirm this is intended
+}
+
+// testFD is a FileDescriptionImpl representing a regular file backed by a DynamicBytesSource.
+type testFD struct {
+	fileDescription
+	DynamicBytesFileDescriptionImpl
+
+	data DynamicBytesSource // NOTE(review): never assigned in this file; newTestFD calls SetDataSource instead
+}
+
+func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription {
+	vd := vfsObj.NewAnonVirtualDentry("genCountFD")
+	defer vd.DecRef()
+	var fd testFD
+	fd.vfsfd.Init(&fd, statusFlags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{})
+	fd.DynamicBytesFileDescriptionImpl.SetDataSource(data)
+	return &fd.vfsfd
+}
+
+// Release implements FileDescriptionImpl.Release. It is a no-op.
+func (fd *testFD) Release() {
+}
+
+// Stat implements FileDescriptionImpl.Stat. opts is ignored and a zero
+// linux.Statx is always returned.
+func (fd *testFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+	// Note that Statx.Mask == 0 in the return value.
+	return linux.Statx{}, nil
+}
+
+// SetStat implements FileDescriptionImpl.SetStat. It always returns EPERM.
+func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error {
+	return syserror.EPERM
+}
+
+func TestGenCountFD(t *testing.T) {
+	ctx := contexttest.Context(t)
+
+	vfsObj := &VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
+	fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{})
+	defer fd.DecRef()
+
+	// The first read causes Generate to be called to fill the FD's buffer.
+	buf := make([]byte, 2)
+	ioseq := usermem.BytesIOSequence(buf)
+	n, err := fd.Read(ctx, ioseq, ReadOptions{})
+	if n != 1 || (err != nil && err != io.EOF) {
+		t.Fatalf("first Read: got (%d, %v), wanted (1, nil or EOF)", n, err)
+	}
+	if want := byte('1'); buf[0] != want {
+		t.Errorf("first Read: got byte %c, wanted %c", buf[0], want)
+	}
+
+	// A second read without seeking is still at EOF.
+	n, err = fd.Read(ctx, ioseq, ReadOptions{})
+	if n != 0 || err != io.EOF {
+		t.Fatalf("second Read: got (%d, %v), wanted (0, EOF)", n, err)
+	}
+
+	// Seeking to the beginning of the file causes it to be regenerated.
+	n, err = fd.Seek(ctx, 0, linux.SEEK_SET)
+	if n != 0 || err != nil {
+		t.Fatalf("Seek: got (%d, %v), wanted (0, nil)", n, err)
+	}
+	n, err = fd.Read(ctx, ioseq, ReadOptions{})
+	if n != 1 || (err != nil && err != io.EOF) {
+		t.Fatalf("Read after Seek: got (%d, %v), wanted (1, nil or EOF)", n, err)
+	}
+	if want := byte('2'); buf[0] != want {
+		t.Errorf("Read after Seek: got byte %c, wanted %c", buf[0], want)
+	}
+
+	// PRead at the beginning of the file also causes it to be regenerated.
+	n, err = fd.PRead(ctx, ioseq, 0, ReadOptions{})
+	if n != 1 || (err != nil && err != io.EOF) {
+		t.Fatalf("PRead: got (%d, %v), wanted (1, nil or EOF)", n, err)
+	}
+	if want := byte('3'); buf[0] != want {
+		t.Errorf("PRead: got byte %c, wanted %c", buf[0], want)
+	}
+
+	// Write and PWrite are expected to fail with EIO (genCount has no Write method).
+	if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EIO {
+		t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO)
+	}
+	if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EIO {
+		t.Errorf("PWrite: got err %v, wanted %v", err, syserror.EIO)
+	}
+}
+
+func TestWritable(t *testing.T) {
+	ctx := contexttest.Context(t)
+
+	vfsObj := &VirtualFilesystem{}
+	if err := vfsObj.Init(); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
+	fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"})
+	defer fd.DecRef()
+	// NOTE(review): every (n, err) check below joins its conditions with &&, so it fails only when BOTH values are unexpected; confirm || was not intended.
+	buf := make([]byte, 10)
+	ioseq := usermem.BytesIOSequence(buf)
+	if n, err := fd.Read(ctx, ioseq, ReadOptions{}); n != 4 && err != io.EOF {
+		t.Fatalf("Read: got (%v, %v), wanted (4, EOF)", n, err)
+	}
+	if want := "init"; want == string(buf) { // NOTE(review): this and the later content checks fail on equality and compare all 10 bytes of buf, so they can never fire; presumably want != string(buf[:n]) was meant
+		t.Fatalf("Read: got %v, wanted %v", string(buf), want)
+	}
+
+	// Test PWrite.
+	want := "write"
+	writeIOSeq := usermem.BytesIOSequence([]byte(want))
+	if n, err := fd.PWrite(ctx, writeIOSeq, 0, WriteOptions{}); int(n) != len(want) && err != nil {
+		t.Errorf("PWrite: got err (%v, %v), wanted (%v, nil)", n, err, len(want))
+	}
+	if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF {
+		t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want))
+	}
+	if want == string(buf) {
+		t.Fatalf("PRead: got %v, wanted %v", string(buf), want)
+	}
+
+	// Test Seek to 0 followed by Write.
+	want = "write2"
+	writeIOSeq = usermem.BytesIOSequence([]byte(want))
+	if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 && err != nil {
+		t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err)
+	}
+	if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); int(n) != len(want) && err != nil {
+		t.Errorf("Write: got err (%v, %v), wanted (%v, nil)", n, err, len(want))
+	}
+	if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF {
+		t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want))
+	}
+	if want == string(buf) {
+		t.Fatalf("PRead: got %v, wanted %v", string(buf), want)
+	}
+
+	// Test failure if offset != 0.
+	if n, err := fd.Seek(ctx, 1, linux.SEEK_SET); n != 0 && err != nil {
+		t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err)
+	}
+	if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); n != 0 && err != syserror.EINVAL {
+		t.Errorf("Write: got err (%v, %v), wanted (0, EINVAL)", n, err)
+	}
+	if n, err := fd.PWrite(ctx, writeIOSeq, 2, WriteOptions{}); n != 0 && err != syserror.EINVAL {
+		t.Errorf("PWrite: got err (%v, %v), wanted (0, EINVAL)", n, err)
+	}
+}
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
new file mode 100644
index 000000000..6bb9ca180
--- /dev/null
+++ b/pkg/sentry/vfs/filesystem.go
@@ -0,0 +1,556 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+)
+
+// A Filesystem is a tree of nodes represented by Dentries, which forms part of
+// a VirtualFilesystem.
+//
+// Filesystems are reference-counted. Unless otherwise specified, all
+// Filesystem methods require that a reference is held.
+//
+// Filesystem is analogous to Linux's struct super_block.
+//
+// +stateify savable
+type Filesystem struct {
+	// refs is the reference count. refs is accessed using atomic memory
+	// operations.
+	refs int64
+
+	// vfs is the VirtualFilesystem that uses this Filesystem. vfs is
+	// immutable.
+	vfs *VirtualFilesystem
+
+	// fsType is the FilesystemType of this Filesystem.
+	fsType FilesystemType
+
+	// impl is the FilesystemImpl associated with this Filesystem. impl is
+	// immutable. This should be the last field in Filesystem.
+	impl FilesystemImpl
+}
+
+// Init must be called before first use of fs.
+func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) {
+	fs.refs = 1 // fs starts out with a single reference
+	fs.vfs = vfsObj
+	fs.fsType = fsType
+	fs.impl = impl
+	vfsObj.filesystemsMu.Lock()
+	vfsObj.filesystems[fs] = struct{}{} // register fs; DecRef removes it again at zero references
+	vfsObj.filesystemsMu.Unlock()
+}
+
+// FilesystemType returns the FilesystemType that fs was initialized with.
+func (fs *Filesystem) FilesystemType() FilesystemType {
+	return fs.fsType
+}
+
+// VirtualFilesystem returns the VirtualFilesystem that fs was initialized with.
+func (fs *Filesystem) VirtualFilesystem() *VirtualFilesystem {
+	return fs.vfs
+}
+
+// Impl returns the FilesystemImpl that fs was initialized with.
+func (fs *Filesystem) Impl() FilesystemImpl {
+	return fs.impl
+}
+
+// IncRef increments fs' reference count, panicking if it was not positive.
+func (fs *Filesystem) IncRef() {
+	if atomic.AddInt64(&fs.refs, 1) <= 1 { // new count <= 1 means the old count was <= 0
+		panic("Filesystem.IncRef() called without holding a reference")
+	}
+}
+
+// TryIncRef increments fs' reference count and returns true. If fs' reference
+// count is not positive, TryIncRef does nothing and returns false.
+//
+// TryIncRef does not require that a reference is held on fs.
+func (fs *Filesystem) TryIncRef() bool {
+	for { // repeat until the compare-and-swap succeeds or no references remain
+		refs := atomic.LoadInt64(&fs.refs)
+		if refs <= 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+// DecRef decrements fs' reference count, releasing fs when it reaches zero.
+func (fs *Filesystem) DecRef() {
+	if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 {
+		fs.vfs.filesystemsMu.Lock()
+		delete(fs.vfs.filesystems, fs) // undo the registration done by Init
+		fs.vfs.filesystemsMu.Unlock()
+		fs.impl.Release()
+	} else if refs < 0 {
+		panic("Filesystem.DecRef() called without holding a reference")
+	}
+}
+
+// FilesystemImpl contains implementation details for a Filesystem.
+// Implementations of FilesystemImpl should contain their associated Filesystem
+// by value as their first field.
+//
+// All methods that take a ResolvingPath must resolve the path before
+// performing any other checks, including rejection of the operation if not
+// supported by the FilesystemImpl. This is because the final FilesystemImpl
+// (responsible for actually implementing the operation) isn't known until path
+// resolution is complete.
+//
+// Unless otherwise specified, FilesystemImpl methods are responsible for
+// performing permission checks. In many cases, vfs package functions in
+// permissions.go may be used to help perform these checks.
+//
+// When multiple specified error conditions apply to a given method call, the
+// implementation may return any applicable errno unless otherwise specified,
+// but returning the earliest error specified is preferable to maximize
+// compatibility with Linux.
+//
+// All methods may return errors not specified, notably including:
+//
+// - ENOENT if a required path component does not exist.
+//
+// - ENOTDIR if an intermediate path component is not a directory.
+//
+// - Errors from vfs-package functions (ResolvingPath.Resolve*(),
+// Mount.CheckBeginWrite(), permission-checking functions, etc.)
+//
+// For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid
+// should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID
+// and auth.KGID respectively).
+//
+// FilesystemImpl combines elements of Linux's struct super_operations and
+// struct inode_operations, for reasons described in the documentation for
+// Dentry.
+type FilesystemImpl interface {
+	// Release is called when the associated Filesystem reaches zero
+	// references.
+	Release()
+
+	// Sync "causes all pending modifications to filesystem metadata and cached
+	// file data to be written to the underlying [filesystem]", as by syncfs(2).
+	Sync(ctx context.Context) error
+
+	// AccessAt checks whether a user with creds can access the file at rp.
+	AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error
+
+	// GetDentryAt returns a Dentry representing the file at rp. A reference is
+	// taken on the returned Dentry.
+	//
+	// GetDentryAt does not correspond directly to a Linux syscall; it is used
+	// in the implementation of:
+	//
+	// - Syscalls that need to resolve two paths: link(), linkat().
+	//
+	// - Syscalls that need to refer to a filesystem position outside the
+	// context of a file description: chdir(), fchdir(), chroot(), mount(),
+	// umount().
+	GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error)
+
+	// GetParentDentryAt returns a Dentry representing the directory at the
+	// second-to-last path component in rp. (Note that, despite the name, this
+	// is not necessarily the parent directory of the file at rp, since the
+	// last path component in rp may be "." or "..".) A reference is taken on
+	// the returned Dentry.
+	//
+	// GetParentDentryAt does not correspond directly to a Linux syscall; it is
+	// used in the implementation of the rename() family of syscalls, which
+	// must resolve the parent directories of two paths.
+	//
+	// Preconditions: !rp.Done().
+	//
+	// Postconditions: If GetParentDentryAt returns a nil error, then
+	// rp.Final(). If GetParentDentryAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
+	GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error)
+
+	// LinkAt creates a hard link at rp representing the same file as vd. It
+	// does not take ownership of references on vd.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", LinkAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, LinkAt returns EEXIST.
+	//
+	// - If rp.MustBeDir(), LinkAt returns ENOENT.
+	//
+	// - If the directory in which the link would be created has been removed
+	// by RmdirAt or RenameAt, LinkAt returns ENOENT.
+	//
+	// - If rp.Mount != vd.Mount(), LinkAt returns EXDEV.
+	//
+	// - If vd represents a directory, LinkAt returns EPERM.
+	//
+	// - If vd represents a file for which all existing links have been
+	// removed, or a file created by open(O_TMPFILE|O_EXCL), LinkAt returns
+	// ENOENT. Equivalently, if vd represents a file with a link count of 0 not
+	// created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If LinkAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
+	LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error
+
+	// MkdirAt creates a directory at rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", MkdirAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, MkdirAt returns EEXIST.
+	//
+	// - If the directory in which the new directory would be created has been
+	// removed by RmdirAt or RenameAt, MkdirAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If MkdirAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
+	MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error
+
+	// MknodAt creates a regular file, device special file, or named pipe at
+	// rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", MknodAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, MknodAt returns EEXIST.
+	//
+	// - If rp.MustBeDir(), MknodAt returns ENOENT.
+	//
+	// - If the directory in which the file would be created has been removed
+	// by RmdirAt or RenameAt, MknodAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If MknodAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
+	MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error
+
+	// OpenAt returns a FileDescription providing access to the file at rp. A
+	// reference is taken on the returned FileDescription.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies O_TMPFILE and this feature is unsupported by
+	// the implementation, OpenAt returns EOPNOTSUPP. (All other unsupported
+	// features are silently ignored, consistently with Linux's open*(2).)
+	OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error)
+
+	// ReadlinkAt returns the target of the symbolic link at rp.
+	//
+	// Errors:
+	//
+	// - If the file at rp is not a symbolic link, ReadlinkAt returns EINVAL.
+	ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error)
+
+	// RenameAt renames the file named oldName in directory oldParentVD to rp.
+	// It does not take ownership of references on oldParentVD.
+	//
+	// Errors [1]:
+	//
+	// - If opts.Flags specifies unsupported options, RenameAt returns EINVAL.
+	//
+	// - If the last path component in rp is "." or "..", and opts.Flags
+	// contains RENAME_NOREPLACE, RenameAt returns EEXIST.
+	//
+	// - If the last path component in rp is "." or "..", and opts.Flags does
+	// not contain RENAME_NOREPLACE, RenameAt returns EBUSY.
+	//
+	// - If rp.Mount != oldParentVD.Mount(), RenameAt returns EXDEV.
+	//
+	// - If the renamed file is not a directory, and opts.MustBeDir is true,
+	// RenameAt returns ENOTDIR.
+	//
+	// - If renaming would replace an existing file and opts.Flags contains
+	// RENAME_NOREPLACE, RenameAt returns EEXIST.
+	//
+	// - If there is no existing file at rp and opts.Flags contains
+	// RENAME_EXCHANGE, RenameAt returns ENOENT.
+	//
+	// - If there is an existing non-directory file at rp, and rp.MustBeDir()
+	// is true, RenameAt returns ENOTDIR.
+	//
+	// - If the renamed file is not a directory, opts.Flags does not contain
+	// RENAME_EXCHANGE, and rp.MustBeDir() is true, RenameAt returns ENOTDIR.
+	// (This check is not subsumed by the check for directory replacement below
+	// since it applies even if there is no file to replace.)
+	//
+	// - If the renamed file is a directory, and the new parent directory of
+	// the renamed file is either the renamed directory or a descendant
+	// subdirectory of the renamed directory, RenameAt returns EINVAL.
+	//
+	// - If renaming would exchange the renamed file with an ancestor directory
+	// of the renamed file, RenameAt returns EINVAL.
+	//
+	// - If renaming would replace an ancestor directory of the renamed file,
+	// RenameAt returns ENOTEMPTY. (This check would be subsumed by the
+	// non-empty directory check below; however, this check takes place before
+	// the self-rename check.)
+	//
+	// - If the renamed file would replace or exchange with itself (i.e. the
+	// source and destination paths resolve to the same file), RenameAt returns
+	// nil, skipping the checks described below.
+	//
+	// - If the source or destination directory is not writable by the provider
+	// of rp.Credentials(), RenameAt returns EACCES.
+	//
+	// - If the renamed file is a directory, and renaming would replace a
+	// non-directory file, RenameAt returns ENOTDIR.
+	//
+	// - If the renamed file is not a directory, and renaming would replace a
+	// directory, RenameAt returns EISDIR.
+	//
+	// - If the new parent directory of the renamed file has been removed by
+	// RmdirAt or a preceding call to RenameAt, RenameAt returns ENOENT.
+	//
+	// - If the renamed file is a directory, it is not writable by the
+	// provider of rp.Credentials(), and the source and destination parent
+	// directories are different, RenameAt returns EACCES. (This is nominally
+	// required to change the ".." entry in the renamed directory.)
+	//
+	// - If renaming would replace a non-empty directory, RenameAt returns
+	// ENOTEMPTY.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink(). oldParentVD.Dentry() was obtained from a
+	// previous call to
+	// oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). oldName is
+	// not "." or "..".
+	//
+	// Postconditions: If RenameAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
+	//
+	// [1] "The worst of all namespace operations - renaming directory.
+	// "Perverted" doesn't even start to describe it. Somebody in UCB had a
+	// heck of a trip..." - fs/namei.c:vfs_rename()
+	RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error
+
+	// RmdirAt removes the directory at rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is ".", RmdirAt returns EINVAL.
+	//
+	// - If the last path component in rp is "..", RmdirAt returns ENOTEMPTY.
+	//
+	// - If no file exists at rp, RmdirAt returns ENOENT.
+	//
+	// - If the file at rp exists but is not a directory, RmdirAt returns
+	// ENOTDIR.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If RmdirAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
+	RmdirAt(ctx context.Context, rp *ResolvingPath) error
+
+	// SetStatAt updates metadata for the file at the given path. Implementations
+	// are responsible for checking if the operation can be performed
+	// (see vfs.CheckSetStat() for common checks).
+	//
+	// Errors:
+	//
+	// - If opts specifies unsupported options, SetStatAt returns EINVAL.
+	SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error
+
+	// StatAt returns metadata for the file at rp.
+	StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error)
+
+	// StatFSAt returns metadata for the filesystem containing the file at rp.
+	// (This method takes a path because a FilesystemImpl may consist of any
+	// number of constituent filesystems.)
+	StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error)
+
+	// SymlinkAt creates a symbolic link at rp referring to the given target.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", SymlinkAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, SymlinkAt returns EEXIST.
+	//
+	// - If rp.MustBeDir(), SymlinkAt returns ENOENT.
+	//
+	// - If the directory in which the symbolic link would be created has been
+	// removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If SymlinkAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
+	SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error
+
+	// UnlinkAt removes the file at rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", UnlinkAt returns
+	// EISDIR.
+	//
+	// - If no file exists at rp, UnlinkAt returns ENOENT.
+	//
+	// - If rp.MustBeDir(), and the file at rp exists and is not a directory,
+	// UnlinkAt returns ENOTDIR.
+	//
+	// - If the file at rp exists but is a directory, UnlinkAt returns EISDIR.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If UnlinkAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
+	UnlinkAt(ctx context.Context, rp *ResolvingPath) error
+
+	// ListxattrAt returns all extended attribute names for the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem,
+	// ListxattrAt returns ENOTSUP.
+	//
+	// - If the size of the list (including a NUL terminating byte after every
+	// entry) would exceed size, ERANGE may be returned (note that
+	// implementations are free to ignore size entirely and return without
+	// error). In all cases, if size is 0, the list should be returned without
+	// error, regardless of size.
+	ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
+
+	// GetxattrAt returns the value associated with the given extended
+	// attribute for the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem, GetxattrAt
+	// returns ENOTSUP.
+	//
+	// - If an extended attribute named opts.Name does not exist, ENODATA is
+	// returned.
+	//
+	// - If the size of the return value exceeds opts.Size, ERANGE may be
+	// returned (note that implementations are free to ignore opts.Size entirely
+	// and return without error). In all cases, if opts.Size is 0, the value
+	// should be returned without error, regardless of size.
+	GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error)
+
+	// SetxattrAt changes the value associated with the given extended
+	// attribute for the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem, SetxattrAt
+	// returns ENOTSUP.
+	//
+	// - If XATTR_CREATE is set in opts.Flag and opts.Name already exists,
+	// EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist,
+	// ENODATA is returned.
+	SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
+
+	// RemovexattrAt removes the given extended attribute from the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem,
+	// RemovexattrAt returns ENOTSUP.
+	//
+	// - If name does not exist, ENODATA is returned.
+	RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
+
+	// BoundEndpointAt returns the Unix socket endpoint bound at the path rp.
+	//
+	// Errors:
+	//
+	// - If the file does not have write permissions, then BoundEndpointAt
+	// returns EACCES.
+	//
+	// - If a non-socket file exists at rp, then BoundEndpointAt returns
+	// ECONNREFUSED.
+	BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error)
+
+	// PrependPath prepends a path from vd to vd.Mount().Root() to b.
+	//
+	// If vfsroot.Ok(), it is the contextual VFS root; if it is encountered
+	// before vd.Mount().Root(), PrependPath should stop prepending path
+	// components and return a PrependPathAtVFSRootError.
+	//
+	// If traversal of vd.Dentry()'s ancestors encounters an independent
+	// ("root") Dentry that is not vd.Mount().Root() (i.e. vd.Dentry() is not a
+	// descendant of vd.Mount().Root()), PrependPath should stop prepending
+	// path components and return a PrependPathAtNonMountRootError.
+	//
+	// Filesystems for which Dentries do not have meaningful paths may prepend
+	// an arbitrary descriptive string to b and then return a
+	// PrependPathSyntheticError.
+	//
+	// Most implementations can acquire the appropriate locks to ensure that
+	// Dentry.Name() and Dentry.Parent() are fixed for vd.Dentry() and all of
+	// its ancestors, then call GenericPrependPath.
+	//
+	// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
+	PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
+}
+
+// PrependPathAtVFSRootError is returned by implementations of
+// FilesystemImpl.PrependPath() when they encounter the contextual VFS root.
+type PrependPathAtVFSRootError struct{}
+
+// Error implements error.Error.
+func (PrependPathAtVFSRootError) Error() string {
+	return "vfs.FilesystemImpl.PrependPath() reached VFS root"
+}
+
+// PrependPathAtNonMountRootError is returned by implementations of
+// FilesystemImpl.PrependPath() when they encounter an independent ancestor
+// Dentry that is not the Mount root.
+type PrependPathAtNonMountRootError struct{}
+
+// Error implements error.Error.
+func (PrependPathAtNonMountRootError) Error() string {
+	return "vfs.FilesystemImpl.PrependPath() reached root other than Mount root"
+}
+
+// PrependPathSyntheticError is returned by implementations of
+// FilesystemImpl.PrependPath() for which prepended names do not represent real
+// paths.
+type PrependPathSyntheticError struct{}
+
+// Error implements error.Error.
+func (PrependPathSyntheticError) Error() string {
+	return "vfs.FilesystemImpl.PrependPath() prepended synthetic name"
+}
diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go
new file mode 100644
index 000000000..465e610e0
--- /dev/null
+++ b/pkg/sentry/vfs/filesystem_impl_util.go
@@ -0,0 +1,43 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"strings"
+)
+
+// GenericParseMountOptions splits str on commas and returns the resulting
+// options as a map. Each non-empty option is either "key=value", split at
+// its first '=', or a bare "key", which maps to the empty string. When a key
+// is repeated, the value from its last occurrence is kept. For example:
+//
+// str = "key0=value0,key1,key2=value2,key0=value3" -> map{'key0':'value3','key1':'','key2':'value2'}
+//
+// Because every comma is a separator, this function is unsuitable when values
+// may themselves contain commas, e.g. the mpol mount option for tmpfs(5).
+func GenericParseMountOptions(str string) map[string]string {
+	opts := make(map[string]string)
+	for _, field := range strings.Split(str, ",") {
+		if field == "" {
+			continue // skip empty options such as those produced by ",,"
+		}
+		key, value := field, ""
+		if eq := strings.IndexByte(field, '='); eq >= 0 {
+			key, value = field[:eq], field[eq+1:]
+		}
+		opts[key] = value
+	}
+	return opts
+}
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
new file mode 100644
index 000000000..f2298f7f6
--- /dev/null
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -0,0 +1,117 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// A FilesystemType constructs filesystems.
+//
+// FilesystemType is analogous to Linux's struct file_system_type.
+type FilesystemType interface {
+	// GetFilesystem returns a Filesystem configured by the given options,
+	// along with its mount root. A reference is taken on the returned
+	// Filesystem and Dentry.
+	GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error)
+
+	// Name returns the name of this FilesystemType.
+	Name() string
+}
+
+// GetFilesystemOptions contains options to FilesystemType.GetFilesystem.
+type GetFilesystemOptions struct {
+	// Data is the string passed as the 5th argument to mount(2), which is
+	// usually a comma-separated list of filesystem-specific mount options.
+	Data string
+
+	// InternalData holds opaque FilesystemType-specific data. There is
+	// intentionally no way for applications to specify InternalData; if it is
+	// not nil, the call to GetFilesystem originates from within the sentry.
+	InternalData interface{}
+}
+
+// +stateify savable
+type registeredFilesystemType struct {
+	fsType FilesystemType
+	opts   RegisterFilesystemTypeOptions
+}
+
+// RegisterFilesystemTypeOptions contains options to
+// VirtualFilesystem.RegisterFilesystem().
+type RegisterFilesystemTypeOptions struct {
+	// If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt()
+	// for which MountOptions.InternalMount == false to use this filesystem
+	// type.
+	AllowUserMount bool
+
+	// If AllowUserList is true, make this filesystem type visible in
+	// /proc/filesystems.
+	AllowUserList bool
+
+	// If RequiresDevice is true, indicate that mounting this filesystem
+	// requires a block device as the mount source in /proc/filesystems.
+	RequiresDevice bool
+}
+
+// RegisterFilesystemType registers the given FilesystemType in vfs with the
+// given name.
+func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) error {
+	vfs.fsTypesMu.Lock()
+	defer vfs.fsTypesMu.Unlock()
+	if existing, ok := vfs.fsTypes[name]; ok {
+		return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing.fsType)
+	}
+	vfs.fsTypes[name] = &registeredFilesystemType{
+		fsType: fsType,
+		opts:   *opts,
+	}
+	return nil
+}
+
+// MustRegisterFilesystemType is equivalent to RegisterFilesystemType but
+// panics on failure.
+func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) {
+	if err := vfs.RegisterFilesystemType(name, fsType, opts); err != nil {
+		panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err))
+	}
+}
+
+func (vfs *VirtualFilesystem) getFilesystemType(name string) *registeredFilesystemType {
+	vfs.fsTypesMu.RLock()
+	defer vfs.fsTypesMu.RUnlock()
+	return vfs.fsTypes[name]
+}
+
+// GenerateProcFilesystems emits the contents of /proc/filesystems for vfs to
+// buf.
+func (vfs *VirtualFilesystem) GenerateProcFilesystems(buf *bytes.Buffer) {
+	vfs.fsTypesMu.RLock()
+	defer vfs.fsTypesMu.RUnlock()
+	for name, rft := range vfs.fsTypes {
+		if !rft.opts.AllowUserList {
+			continue
+		}
+		var nodev string
+		if !rft.opts.RequiresDevice {
+			nodev = "nodev"
+		}
+		fmt.Fprintf(buf, "%s\t%s\n", nodev, name)
+	}
+}
diff --git a/pkg/sentry/vfs/g3doc/inotify.md b/pkg/sentry/vfs/g3doc/inotify.md
new file mode 100644
index 000000000..e7da49faa
--- /dev/null
+++ b/pkg/sentry/vfs/g3doc/inotify.md
@@ -0,0 +1,210 @@
+# Inotify
+
+Inotify is a mechanism for monitoring filesystem events in Linux--see
+inotify(7). An inotify instance can be used to monitor files and directories for
+modifications, creation/deletion, etc. The inotify API consists of system calls
+that create inotify instances (inotify_init/inotify_init1) and add/remove
+watches on files to an instance (inotify_add_watch/inotify_rm_watch). Events are
+generated from various places in the sentry, including the syscall layer, the
+vfs layer, the process fd table, and within each filesystem implementation. This
+document outlines the implementation details of inotify in VFS2.
+
+## Inotify Objects
+
+Inotify data structures are implemented in the vfs package.
+
+### vfs.Inotify
+
+Inotify instances are represented by vfs.Inotify objects, which implement
+vfs.FileDescriptionImpl. As in Linux, inotify fds are backed by a
+pseudo-filesystem (anonfs). Each inotify instance receives events from a set of
+vfs.Watch objects, which can be modified with inotify_add_watch(2) and
+inotify_rm_watch(2). An application can retrieve events by reading the inotify
+fd.
+
+### vfs.Watches
+
+The set of all watches held on a single file (i.e., the watch target) is stored
+in vfs.Watches. Each watch will belong to a different inotify instance (an
+instance can only have one watch on any watch target). The watches are stored in
+a map indexed by their vfs.Inotify owner’s id. Hard links and file descriptions
+to a single file will all share the same vfs.Watches. Activity on the target
+causes its vfs.Watches to generate notifications on its watches’ inotify
+instances.
+
+### vfs.Watch
+
+A single watch, owned by one inotify instance and applied to one watch target.
+Both the vfs.Inotify owner and vfs.Watches on the target will hold a vfs.Watch,
+which leads to some complicated locking behavior (see Lock Ordering). Whenever a
+watch is notified of an event on its target, it will queue events to its inotify
+instance for delivery to the user.
+
+### vfs.Event
+
+vfs.Event is a simple struct encapsulating all the fields for an inotify event.
+It is generated by vfs.Watches and forwarded to the watches' owners. It is
+serialized to the user during read(2) syscalls on the associated vfs.Inotify's
+fd.
+
+## Lock Ordering
+
+There are three locks related to the inotify implementation:
+
+*   Inotify.mu: the inotify instance lock.
+*   Inotify.evMu: the inotify event queue lock.
+*   Watches.mu: the watch set lock, used to protect the collection of watches on a target.
+
+The correct lock ordering for inotify code is:
+
+Inotify.mu -> Watches.mu -> Inotify.evMu.
+
+Note that we use a distinct lock to protect the inotify event queue. If we
+simply used Inotify.mu, we could simultaneously have locks being acquired in the
+order of Inotify.mu -> Watches.mu and Watches.mu -> Inotify.mu, which would
+cause deadlocks. For instance, adding a watch to an inotify instance would
+require locking Inotify.mu, and then adding the same watch to the target would
+cause Watches.mu to be held. At the same time, generating an event on the target
+would require Watches.mu to be held before iterating through each watch, and
+then notifying the owner of each watch would cause Inotify.mu to be held.
+
+See the vfs package comment to understand how inotify locks fit into the overall
+ordering of filesystem locks.
+
+## Watch Targets in Different Filesystem Implementations
+
+In Linux, watches reside on inodes at the virtual filesystem layer. As a result,
+all hard links and file descriptions on a single file will all share the same
+watch set. In VFS2, there is no common inode structure across filesystem types
+(some may not even have inodes), so we have to plumb inotify support through
+each specific filesystem implementation. Some of the technical considerations
+are outlined below.
+
+### Tmpfs
+
+For filesystems with inodes, like tmpfs, the design is quite similar to that of
+Linux, where watches reside on the inode.
+
+### Pseudo-filesystems
+
+Technically, because inotify is implemented at the vfs layer in Linux,
+pseudo-filesystems on top of kernfs support inotify passively. However, watches
+can only track explicit filesystem operations like read/write, open/close,
+mknod, etc., so watches on a target like /proc/self/fd will not generate events
+every time a new fd is added or removed. As of this writing, we leave inotify
+unimplemented in kernfs and anonfs; it does not seem particularly useful.
+
+### Gofer Filesystem (fsimpl/gofer)
+
+The gofer filesystem has several traits that make it difficult to support
+inotify:
+
+*   **There are no inodes.** A file is represented as a dentry that holds an
+    unopened p9 file (and possibly an open FID), through which the Sentry
+    interacts with the gofer.
+    *   *Solution:* Because there is no inode structure stored in the sandbox,
+        inotify watches must be held on the dentry. This would be an issue in
+        the presence of hard links, where multiple dentries would need to share
+        the same set of watches, but in VFS2, we do not support the internal
+        creation of hard links on gofer fs. As a result, we make the assumption
+        that every dentry corresponds to a unique inode. However, the next point
+        raises an issue with this assumption:
+*   **The Sentry cannot always be aware of hard links on the remote
+    filesystem.** There is no way for us to confirm whether two files on the
+    remote filesystem are actually links to the same inode. QIDs and inodes are
+    not always 1:1. The assumption that dentries and inodes are 1:1 is
+    inevitably broken if there are remote hard links that we cannot detect.
+    *   *Solution:* this is an issue with gofer fs in general, not only inotify,
+        and we will have to live with it.
+*   **Dentries can be cached, and then evicted.** Dentry lifetime does not
+    correspond to file lifetime. Because gofer fs is not entirely in-memory, the
+    absence of a dentry does not mean that the corresponding file does not
+    exist, nor does a dentry reaching zero references mean that the
+    corresponding file no longer exists. When a dentry reaches zero references,
+    it will be cached, in case the file at that path is needed again in the
+    future. However, the dentry may be evicted from the cache, which will cause
+    a new dentry to be created next time the same file path is used. The
+    existing watches will be lost.
+    *   *Solution:* When a dentry reaches zero references, do not cache it if it
+        has any watches, so we can avoid eviction/destruction. Note that if the
+        dentry was deleted or invalidated (d.vfsd.IsDead()), we should still
+        destroy it along with its watches. Additionally, when a dentry’s last
+        watch is removed, we cache it if it also has zero references. This way,
+        the dentry can eventually be evicted from memory if it is no longer
+        needed.
+*   **Dentries can be invalidated.** Another issue with dentry lifetime is that
+    the remote file at the file path represented may change from underneath the
+    dentry. In this case, the next time that the dentry is used, it will be
+    invalidated and a new dentry will replace it. In this case, it is not clear
+    what should be done with the watches on the old dentry.
+    *   *Solution:* Silently destroy the watches when invalidation occurs. We
+        have no way of knowing exactly what happened, when it happens. Inotify
+        instances on NFS files in Linux probably behave in a similar fashion,
+        since inotify is implemented at the vfs layer and is not aware of the
+        complexities of remote file systems.
+    *   An alternative would be to issue some kind of event upon invalidation,
+        e.g. a delete event, but this has several issues:
+    *   We cannot discern whether the remote file was invalidated because it was
+        moved, deleted, etc. This information is crucial, because these cases
+        should result in different events. Furthermore, the watches should only
+        be destroyed if the file has been deleted.
+    *   Moreover, the mechanism for detecting whether the underlying file has
+        changed is to check whether a new QID is given by the gofer. This may
+        result in false positives, e.g. suppose that the server closed and
+        re-opened the same file, which may result in a new QID.
+    *   Finally, the time of the event may be completely different from the time
+        of the file modification, since a dentry is not immediately notified
+        when the underlying file has changed. It would be quite unexpected to
+        receive the notification when invalidation was triggered, i.e. the next
+        time the file was accessed within the sandbox, because then the
+        read/write/etc. operation on the file would not result in the expected
+        event.
+    *   Another point in favor of the first solution: inotify in Linux can
+        already be lossy on local filesystems (one of the sacrifices made so
+        that filesystem performance isn’t killed), and it is lossy on NFS for
+        similar reasons to gofer fs. Therefore, it is better for inotify to be
+        silent than to emit incorrect notifications.
+*   **There may be external users of the remote filesystem.** We can only track
+    operations performed on the file within the sandbox. This is sufficient
+    under InteropModeExclusive, but whenever there are external users, the set
+    of actions we are aware of is incomplete.
+    *   *Solution:* We could either return an error or just issue a warning when
+        inotify is used without InteropModeExclusive. Although faulty, VFS1
+        allows it when the filesystem is shared, and Linux does the same for
+        remote filesystems (as mentioned above, inotify sits at the vfs level).
+
+## Dentry Interface
+
+For events that must be generated above the vfs layer, we provide the following
+DentryImpl methods to allow interactions with targets on any FilesystemImpl:
+
+*   **InotifyWithParent()** generates events on the dentry’s watches as well as
+    its parent’s.
+*   **Watches()** retrieves the watch set of the target represented by the
+    dentry. This is used to access and modify watches on a target.
+*   **OnZeroWatches()** performs cleanup tasks after the last watch is removed
+    from a dentry. This is needed by gofer fs, which must allow a watched dentry
+    to be cached once it has no more watches. Most implementations can just do
+    nothing. Note that OnZeroWatches() must be called after all inotify locks
+    are released to preserve lock ordering, since it may acquire
+    FilesystemImpl-specific locks.
+
+## IN_EXCL_UNLINK
+
+There are several options that can be set for a watch, specified as part of the
+mask in inotify_add_watch(2). In particular, IN_EXCL_UNLINK requires some
+additional support in each filesystem.
+
+A watch with IN_EXCL_UNLINK will not generate events for its target if it
+corresponds to a path that was unlinked. For instance, if an fd is opened on
+“foo/bar” and “foo/bar” is subsequently unlinked, any reads/writes/etc. on the
+fd will be ignored by watches on “foo” or “foo/bar” with IN_EXCL_UNLINK. This
+requires each DentryImpl to keep track of whether it has been unlinked, in order
+to determine whether events should be sent to watches with IN_EXCL_UNLINK.
+
+## IN_ONESHOT
+
+One-shot watches expire after generating a single event. When an event occurs,
+all one-shot watches on the target that successfully generated an event are
+removed. Lock ordering can cause the management of one-shot watches to be quite
+expensive; see Watches.Notify() for more information.
diff --git a/pkg/sentry/vfs/genericfstree/BUILD b/pkg/sentry/vfs/genericfstree/BUILD
new file mode 100644
index 000000000..d8fd92677
--- /dev/null
+++ b/pkg/sentry/vfs/genericfstree/BUILD
@@ -0,0 +1,16 @@
+load("//tools/go_generics:defs.bzl", "go_template")
+
+package(
+    default_visibility = ["//:sandbox"],
+    licenses = ["notice"],
+)
+
+go_template(
+    name = "generic_fstree",
+    srcs = [
+        "genericfstree.go",
+    ],
+    types = [
+        "Dentry",
+    ],
+)
diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go
new file mode 100644
index 000000000..8882fa84a
--- /dev/null
+++ b/pkg/sentry/vfs/genericfstree/genericfstree.go
@@ -0,0 +1,81 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package genericfstree provides tools for implementing vfs.FilesystemImpls
+// where a single statically-determined lock or set of locks is sufficient to
+// ensure that a Dentry's name and parent are contextually immutable.
+//
+// Clients using this package must use the go_template_instance rule in
+// tools/go_generics/defs.bzl to create an instantiation of this template
+// package, providing types to use in place of Dentry.
+package genericfstree
+
+import (
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// Dentry is a required type parameter that is a struct with the given fields.
+type Dentry struct {
+	// vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl.
+	vfsd vfs.Dentry
+
+	// parent is the parent of this Dentry in the filesystem's tree. If this
+	// Dentry is a filesystem root, parent is nil.
+	parent *Dentry
+
+	// name is the name of this Dentry in its parent. If this Dentry is a
+	// filesystem root, name is unspecified.
+	name string
+}
+
+// IsAncestorDentry returns true if d is an ancestor of d2; that is, d is
+// either d2's parent or an ancestor of d2's parent.
+func IsAncestorDentry(d, d2 *Dentry) bool {
+	for d2 != nil { // Stop at root, where d2.parent == nil.
+		if d2.parent == d {
+			return true
+		}
+		if d2.parent == d2 {
+			return false
+		}
+		d2 = d2.parent
+	}
+	return false
+}
+
+// ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d.
+func ParentOrSelf(d *Dentry) *Dentry {
+	if d.parent != nil {
+		return d.parent
+	}
+	return d
+}
+
+// PrependPath is a generic implementation of FilesystemImpl.PrependPath().
+func PrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath.Builder) error {
+	for {
+		if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
+			return vfs.PrependPathAtVFSRootError{}
+		}
+		if &d.vfsd == mnt.Root() {
+			return nil
+		}
+		if d.parent == nil {
+			return vfs.PrependPathAtNonMountRootError{}
+		}
+		b.PrependComponent(d.name)
+		d = d.parent
+	}
+}
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
new file mode 100644
index 000000000..c2e21ac5f
--- /dev/null
+++ b/pkg/sentry/vfs/inotify.go
@@ -0,0 +1,774 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"bytes"
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// inotifyEventBaseSize is the base size of linux's struct inotify_event. This
+// must be a power of 2 for rounding below.
+const inotifyEventBaseSize = 16
+
+// EventType defines different kinds of inotify events.
+//
+// The way events are labelled appears somewhat arbitrary, but they must match
+// Linux so that IN_EXCL_UNLINK behaves as it does in Linux.
+type EventType uint8
+
+// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and
+// FSNOTIFY_EVENT_INODE in Linux.
+const (
+	PathEvent  EventType = iota
+	InodeEvent EventType = iota
+)
+
+// Inotify represents an inotify instance created by inotify_init(2) or
+// inotify_init1(2). Inotify implements FileDescriptionImpl.
+//
+// +stateify savable
+type Inotify struct {
+	vfsfd FileDescription
+	FileDescriptionDefaultImpl
+	DentryMetadataFileDescriptionImpl
+	NoLockFD
+
+	// Unique identifier for this inotify instance. We don't just reuse the
+	// inotify fd because fds can be duped. These should not be exposed to the
+	// user, since we may aggressively reuse an id on S/R.
+	id uint64
+
+	// queue is used to notify interested parties when the inotify instance
+	// becomes readable or writable.
+	queue waiter.Queue `state:"nosave"`
+
+	// evMu *only* protects the events list. We need a separate lock while
+	// queuing events: using mu may violate lock ordering, since at that point
+	// the calling goroutine may already hold Watches.mu.
+	evMu sync.Mutex `state:"nosave"`
+
+	// A list of pending events for this inotify instance. Protected by evMu.
+	events eventList
+
+	// A scratch buffer, used to serialize inotify events. Allocate this
+	// ahead of time for the sake of performance. Protected by evMu.
+	scratch []byte
+
+	// mu protects the fields below.
+	mu sync.Mutex `state:"nosave"`
+
+	// nextWatchMinusOne is used to allocate watch descriptors on this Inotify
+	// instance. Note that Linux starts numbering watch descriptors from 1.
+	nextWatchMinusOne int32
+
+	// Map from watch descriptors to watch objects.
+	watches map[int32]*Watch
+}
+
+var _ FileDescriptionImpl = (*Inotify)(nil)
+
+// NewInotifyFD constructs a new Inotify instance.
+func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) {
+	// O_CLOEXEC affects file descriptors, so it must be handled outside of vfs.
+	flags &^= linux.O_CLOEXEC
+	if flags&^linux.O_NONBLOCK != 0 {
+		return nil, syserror.EINVAL
+	}
+
+	id := uniqueid.GlobalFromContext(ctx)
+	vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id))
+	defer vd.DecRef()
+	fd := &Inotify{
+		id:      id,
+		scratch: make([]byte, inotifyEventBaseSize),
+		watches: make(map[int32]*Watch),
+	}
+	if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+		UseDentryMetadata: true,
+		DenyPRead:         true,
+		DenyPWrite:        true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// Release implements FileDescriptionImpl.Release. Release removes all
+// watches and frees all resources for an inotify instance.
+func (i *Inotify) Release() {
+	var ds []*Dentry
+
+	// We need to hold i.mu to avoid a race with concurrent calls to
+	// Inotify.handleDeletion from Watches. There's no risk of Watches
+	// accessing this Inotify after the destructor ends, because we remove all
+	// references to it below.
+	i.mu.Lock()
+	for _, w := range i.watches {
+		// Remove references to the watch from the watches set on the target. We
+		// don't need to worry about the references from i.watches, since this
+		// file description is about to be destroyed.
+		d := w.target
+		ws := d.Watches()
+		// Watchable dentries should never return a nil watch set.
+		if ws == nil {
+			panic("Cannot remove watch from an unwatchable dentry")
+		}
+		ws.Remove(i.id)
+		if ws.Size() == 0 {
+			ds = append(ds, d)
+		}
+	}
+	i.mu.Unlock()
+
+	for _, d := range ds {
+		d.OnZeroWatches()
+	}
+}
+
+// Allocate implements FileDescriptionImpl.Allocate.
+func (i *Inotify) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	panic("Allocate should not be called on read-only inotify fds")
+}
+
+// EventRegister implements waiter.Waitable.
+func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	i.queue.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.
+func (i *Inotify) EventUnregister(e *waiter.Entry) {
+	i.queue.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+//
+// Readiness indicates whether there are pending events for an inotify instance.
+func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask {
+	ready := waiter.EventMask(0)
+
+	i.evMu.Lock()
+	defer i.evMu.Unlock()
+
+	if !i.events.Empty() {
+		ready |= waiter.EventIn
+	}
+
+	return mask & ready
+}
+
+// PRead implements FileDescriptionImpl.
+func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// PWrite implements FileDescriptionImpl.
+func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	return 0, syserror.ESPIPE
+}
+
+// Write implements FileDescriptionImpl.Write.
+func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+	return 0, syserror.EBADF
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+	if dst.NumBytes() < inotifyEventBaseSize {
+		return 0, syserror.EINVAL
+	}
+
+	i.evMu.Lock()
+	defer i.evMu.Unlock()
+
+	if i.events.Empty() {
+		// Nothing to read yet, tell caller to block.
+		return 0, syserror.ErrWouldBlock
+	}
+
+	var writeLen int64
+	for it := i.events.Front(); it != nil; {
+		// Advance `it` before the element is removed from the list, or else
+		// it.Next() will always be nil.
+		event := it
+		it = it.Next()
+
+		// Does the buffer have enough remaining space to hold the event we're
+		// about to write out?
+		if dst.NumBytes() < int64(event.sizeOf()) {
+			if writeLen > 0 {
+				// Buffer wasn't big enough for all pending events, but we did
+				// write some events out.
+				return writeLen, nil
+			}
+			return 0, syserror.EINVAL
+		}
+
+		// Linux always dequeues an available event as long as there's enough
+		// buffer space to copy it out, even if the copy below fails. Emulate
+		// this behaviour.
+		i.events.Remove(event)
+
+		// Buffer has enough space, copy event to the read buffer.
+		n, err := event.CopyTo(ctx, i.scratch, dst)
+		if err != nil {
+			return 0, err
+		}
+
+		writeLen += n
+		dst = dst.DropFirst64(n)
+	}
+	return writeLen, nil
+}
+
+// Ioctl implements FileDescriptionImpl.Ioctl.
+func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	switch args[1].Int() {
+	case linux.FIONREAD:
+		i.evMu.Lock()
+		defer i.evMu.Unlock()
+		var n uint32
+		for e := i.events.Front(); e != nil; e = e.Next() {
+			n += uint32(e.sizeOf())
+		}
+		var buf [4]byte
+		usermem.ByteOrder.PutUint32(buf[:], n)
+		_, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{})
+		return 0, err
+
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+func (i *Inotify) queueEvent(ev *Event) {
+	i.evMu.Lock()
+
+	// Check if we should coalesce the event we're about to queue with the last
+	// one currently in the queue. Events are coalesced if they are identical.
+	if last := i.events.Back(); last != nil {
+		if ev.equals(last) {
+			// "Coalesce" the two events by simply not queuing the new one. We
+			// don't need to raise a waiter.EventIn notification because no new
+			// data is available for reading.
+			i.evMu.Unlock()
+			return
+		}
+	}
+
+	i.events.PushBack(ev)
+
+	// Release mutex before notifying waiters because we don't control what they
+	// can do.
+	i.evMu.Unlock()
+
+	i.queue.Notify(waiter.EventIn)
+}
+
+// newWatchLocked creates and adds a new watch to target.
+//
+// Precondition: i.mu must be locked. ws must be the watch set for target d.
+func (i *Inotify) newWatchLocked(d *Dentry, ws *Watches, mask uint32) *Watch {
+	w := &Watch{
+		owner:  i,
+		wd:     i.nextWatchIDLocked(),
+		target: d,
+		mask:   mask,
+	}
+
+	// Hold the watch in this inotify instance as well as the watch set on the
+	// target.
+	i.watches[w.wd] = w
+	ws.Add(w)
+	return w
+}
+
+// nextWatchIDLocked allocates and returns a new watch descriptor.
+//
+// Precondition: i.mu must be locked.
+func (i *Inotify) nextWatchIDLocked() int32 {
+	i.nextWatchMinusOne++
+	return i.nextWatchMinusOne
+}
+
+// AddWatch constructs a new inotify watch and adds it to the target. It
+// returns the watch descriptor returned by inotify_add_watch(2).
+//
+// The caller must hold a reference on target.
+func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) {
+	// Note: Locking this inotify instance protects the result returned by
+	// Lookup() below. With the lock held, we know for sure the lookup result
+	// won't become stale because it's impossible for *this* instance to
+	// add/remove watches on target.
+	i.mu.Lock()
+	defer i.mu.Unlock()
+
+	ws := target.Watches()
+	if ws == nil {
+		// While Linux supports inotify watches on all filesystem types, watches on
+		// filesystems like kernfs are not generally useful, so we do not.
+		return 0, syserror.EPERM
+	}
+	// Does the target already have a watch from this inotify instance?
+	if existing := ws.Lookup(i.id); existing != nil {
+		newmask := mask
+		if mask&linux.IN_MASK_ADD != 0 {
+			// "Add (OR) events to watch mask for this pathname if it already
+			// exists (instead of replacing mask)." -- inotify(7)
+			newmask |= atomic.LoadUint32(&existing.mask)
+		}
+		atomic.StoreUint32(&existing.mask, newmask)
+		return existing.wd, nil
+	}
+
+	// No existing watch, create a new watch.
+	w := i.newWatchLocked(target, ws, mask)
+	return w.wd, nil
+}
+
+// RmWatch looks up an inotify watch for the given 'wd' and configures the
+// target to stop sending events to this inotify instance.
+func (i *Inotify) RmWatch(wd int32) error {
+	i.mu.Lock()
+
+	// Find the watch we were asked to remove.
+	w, ok := i.watches[wd]
+	if !ok {
+		i.mu.Unlock()
+		return syserror.EINVAL
+	}
+
+	// Remove the watch from this instance.
+	delete(i.watches, wd)
+
+	// Remove the watch from the watch target.
+	ws := w.target.Watches()
+	// AddWatch ensures that w.target has a non-nil watch set.
+	if ws == nil {
+		panic("Watched dentry cannot have nil watch set")
+	}
+	ws.Remove(w.OwnerID())
+	remaining := ws.Size()
+	i.mu.Unlock()
+
+	if remaining == 0 {
+		w.target.OnZeroWatches()
+	}
+
+	// Generate the event for the removal.
+	i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0))
+
+	return nil
+}
+
+// Watches is the collection of all inotify watches on a single file.
+//
+// +stateify savable
+type Watches struct {
+	// mu protects the fields below.
+	mu sync.RWMutex `state:"nosave"`
+
+	// ws is the map of active watches in this collection, keyed by the inotify
+	// instance id of the owner.
+	ws map[uint64]*Watch
+}
+
+// Size returns the number of watches held by w.
+func (w *Watches) Size() int {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	return len(w.ws)
+}
+
+// Lookup returns the watch owned by an inotify instance with the given id, or
+// nil if no such watch exists. It only reads the set, so it takes a read lock.
+//
+// Precondition: the inotify instance with the given id must be locked to
+// prevent the returned watch from being concurrently modified or replaced in
+// Inotify.watches.
+func (w *Watches) Lookup(id uint64) *Watch {
+	w.mu.RLock()
+	defer w.mu.RUnlock()
+	return w.ws[id]
+}
+
+// Add registers watch in this set, keyed by the id of its owning inotify
+// instance, allocating the backing map on first use.
+//
+// Precondition: the inotify instance that owns watch must be locked.
+func (w *Watches) Add(watch *Watch) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	id := watch.OwnerID()
+	if w.ws == nil {
+		w.ws = make(map[uint64]*Watch)
+	}
+	// An owner may hold at most one watch per target; a duplicate is a bug.
+	if _, dup := w.ws[id]; dup {
+		panic(fmt.Sprintf("Watch collision with ID %+v", id))
+	}
+	w.ws[id] = watch
+}
+
+// Remove removes the watch with the given id from this set of watches and
+// releases it. The caller is responsible for generating any watch removal
+// event, as appropriate. Removing an id that has no watch in this collection
+// is a no-op.
+//
+// Precondition: the inotify instance with the given id must be locked.
+func (w *Watches) Remove(id uint64) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	if w.ws == nil {
+		// This watch set is being destroyed. The thread executing the
+		// destructor is already in the process of deleting all our watches. We
+		// got here with no references on the target because we raced with the
+		// destructor notifying all the watch owners of destruction. See the
+		// comment in Watches.HandleDeletion for why this race exists.
+		return
+	}
+
+	// It is possible for w.Remove() to be called for the same watch multiple
+	// times. See the treatment of one-shot watches in Watches.Notify(). No
+	// existence check is needed: delete is a no-op when the key is already
+	// gone.
+	delete(w.ws, id)
+}
+
+// Notify queues a new event with watches in this set. Watches with
+// IN_EXCL_UNLINK are skipped if the event is coming from a child that has been
+// unlinked.
+func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlinked bool) {
+	var hasExpired bool
+	w.mu.RLock()
+	for _, watch := range w.ws {
+		if unlinked && watch.ExcludeUnlinked() && et == PathEvent {
+			continue
+		}
+		if watch.Notify(name, events, cookie) {
+			hasExpired = true
+		}
+	}
+	w.mu.RUnlock()
+
+	if hasExpired {
+		w.cleanupExpiredWatches()
+	}
+}
+
+// cleanupExpiredWatches removes every watch in w that has been marked expired.
+// It is relatively expensive and should only be called when such watches exist.
+func (w *Watches) cleanupExpiredWatches() {
+	// Inotify.mu (taken by RmWatch) must not be acquired while holding w.mu,
+	// so gather the expired watches first and remove them after unlocking.
+	var expired []*Watch
+	w.mu.RLock()
+	for _, candidate := range w.ws {
+		if atomic.LoadInt32(&candidate.expired) != 1 {
+			continue
+		}
+		expired = append(expired, candidate)
+	}
+	w.mu.RUnlock()
+	for _, watch := range expired {
+		watch.owner.RmWatch(watch.wd)
+	}
+}
+
+// HandleDeletion is called when the watch target is destroyed. Clear the
+// watch set, detach watches from the inotify instances they belong to, and
+// generate the appropriate events.
+func (w *Watches) HandleDeletion() {
+	w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */)
+
+	// As in Watches.Notify, we can't hold w.mu while acquiring Inotify.mu for
+	// the owner of each watch being deleted. Instead, atomically store the
+	// watches map in a local variable and set it to nil so we can iterate over
+	// it with the assurance that there will be no concurrent accesses.
+	var ws map[uint64]*Watch
+	w.mu.Lock()
+	ws = w.ws
+	w.ws = nil
+	w.mu.Unlock()
+
+	// Remove each watch from its owner's watch set, and generate a corresponding
+	// watch removal event.
+	for _, watch := range ws {
+		i := watch.owner
+		i.mu.Lock()
+		_, found := i.watches[watch.wd]
+		delete(i.watches, watch.wd)
+
+		// Release mutex before notifying waiters because we don't control what
+		// they can do.
+		i.mu.Unlock()
+
+		// If watch was not found, it was removed from the inotify instance before
+		// we could get to it, in which case we should not generate an event.
+		if found {
+			i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0))
+		}
+	}
+}
+
+// Watch represents a particular inotify watch created by inotify_add_watch.
+//
+// +stateify savable
+type Watch struct {
+	// Inotify instance which owns this watch.
+	//
+	// This field is immutable after creation.
+	owner *Inotify
+
+	// Descriptor for this watch. This is unique across an inotify instance.
+	//
+	// This field is immutable after creation.
+	wd int32
+
+	// target is a dentry representing the watch target. Its watch set contains this watch.
+	//
+	// This field is immutable after creation.
+	target *Dentry
+
+	// Events being monitored via this watch. Must be accessed with atomic
+	// memory operations.
+	mask uint32
+
+	// expired is set to 1 to indicate that this watch is a one-shot that has
+	// already sent a notification and therefore can be removed. Must be accessed
+	// with atomic memory operations.
+	expired int32
+}
+
+// OwnerID returns the id of the inotify instance (w.owner) that owns this watch.
+func (w *Watch) OwnerID() uint64 {
+	return w.owner.id
+}
+
+// ExcludeUnlinked reports whether this watch has IN_EXCL_UNLINK set, i.e.
+// whether it should stop receiving events that originate from a path that has
+// been unlinked. For example, if "foo/bar" is opened and then unlinked, watches
+// on "foo" and "foo/bar" with IN_EXCL_UNLINK may ignore operations on the fd.
+func (w *Watch) ExcludeUnlinked() bool {
+	mask := atomic.LoadUint32(&w.mask)
+	return mask&linux.IN_EXCL_UNLINK != 0
+}
+
+// Notify queues an event on this watch when 'events' intersects its mask. It
+// returns true only if an event was queued and the watch is one-shot, in which
+// case the watch is marked expired and should be deleted by the caller.
+func (w *Watch) Notify(name string, events uint32, cookie uint32) bool {
+	// An expired watch is a one-shot that is already being removed; a second
+	// event may still reach the target before the removal completes.
+	if atomic.LoadInt32(&w.expired) == 1 {
+		return false
+	}
+
+	watched := atomic.LoadUint32(&w.mask)
+	if watched&events == 0 {
+		// None of these events are being watched for.
+		return false
+	}
+
+	// Report the watched bits that matched, plus every control bit (anything
+	// outside IN_ALL_EVENTS), which cannot be masked out.
+	unmaskable := ^uint32(0) &^ linux.IN_ALL_EVENTS
+	matched := (unmaskable | watched) & events
+	w.owner.queueEvent(newEvent(w.wd, name, matched, cookie))
+
+	oneShot := watched&linux.IN_ONESHOT != 0
+	if oneShot {
+		atomic.StoreInt32(&w.expired, 1)
+	}
+	return oneShot
+}
+
+// Event represents a struct inotify_event from linux.
+//
+// +stateify savable
+type Event struct {
+	eventEntry
+
+	wd     int32
+	mask   uint32
+	cookie uint32
+
+	// len is computed from the name field and is set automatically by
+	// Event.setName. It should be 0 when no name is set; otherwise it is the
+	// length of the name slice.
+	len uint32
+
+	// The name field has special padding requirements and should only be set by
+	// calling Event.setName.
+	name []byte
+}
+
+// newEvent builds an Event for watch descriptor wd carrying the given event
+// mask and cookie. The name is attached via setName only when it is non-empty,
+// so nameless events keep a zero len and a nil name.
+func newEvent(wd int32, name string, events, cookie uint32) *Event {
+	ev := &Event{wd: wd, mask: events, cookie: cookie}
+	if name == "" {
+		return ev
+	}
+	ev.setName(name)
+	return ev
+}
+
+// paddedBytes converts a go string to a null-terminated c-string, padded with
+// null bytes to a total size of 'l'. It panics unless 'l' can hold every byte
+// of 's' plus at least one null byte.
+func paddedBytes(s string, l uint32) []byte {
+	required := uint32(len(s) + 1)
+	if required > l {
+		panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!")
+	}
+
+	// make() zero-fills the slice, so everything after the copied string is
+	// already the null terminator and padding.
+	padded := make([]byte, l)
+	copy(padded, s)
+	return padded
+}
+
+// setName sets the optional name for this event, null-terminated and padded so
+// that the entire event length ends up a multiple of inotifyEventBaseSize.
+func (e *Event) setName(name string) {
+	// Round up (including the null byte) to a multiple of inotifyEventBaseSize.
+	rawLen := len(name) + 1
+	paddedLen := (rawLen + inotifyEventBaseSize - 1) &^ (inotifyEventBaseSize - 1)
+	e.len = uint32(paddedLen)
+	// Make sure we haven't overflowed and wrapped around when rounding.
+	if rawLen > int(e.len) {
+		panic("Overflow when rounding inotify event size, the 'name' field was too big.")
+	}
+	e.name = paddedBytes(name, e.len)
+}
+
+// sizeOf returns inotifyEventBaseSize plus e.len, panicking if the sum wraps.
+func (e *Event) sizeOf() int {
+	if total := inotifyEventBaseSize + int(e.len); total >= inotifyEventBaseSize {
+		return total
+	}
+	panic("Overflowed event size")
+}
+
+// CopyTo serializes this event to dst. buf is used as a scratch buffer to
+// construct the output. We use a buffer allocated ahead of time for
+// performance. buf must be at least inotifyEventBaseSize bytes.
+func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) {
+	usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd))
+	usermem.ByteOrder.PutUint32(buf[4:], e.mask)
+	usermem.ByteOrder.PutUint32(buf[8:], e.cookie)
+	usermem.ByteOrder.PutUint32(buf[12:], e.len)
+
+	writeLen := 0
+	// Copy out only the fixed-size header, so that a larger buf is harmless.
+	n, err := dst.CopyOut(ctx, buf[:inotifyEventBaseSize])
+	if err != nil {
+		return 0, err
+	}
+	writeLen += n
+	dst = dst.DropFirst(n)
+
+	if e.len > 0 {
+		n, err = dst.CopyOut(ctx, e.name)
+		if err != nil {
+			return 0, err
+		}
+		writeLen += n
+	}
+
+	// Sanity check.
+	if writeLen != e.sizeOf() {
+		panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen))
+	}
+
+	return int64(writeLen), nil
+}
+
+// equals reports whether e and other match in wd, mask, cookie, len and name.
+func (e *Event) equals(other *Event) bool {
+	if e.wd != other.wd || e.mask != other.mask || e.cookie != other.cookie {
+		return false
+	}
+	return e.len == other.len && bytes.Equal(e.name, other.name)
+}
+
+// InotifyEventFromStatMask returns the inotify events (IN_ATTRIB, IN_MODIFY,
+// IN_ACCESS) generated by an operation that set the stats specified in mask.
+func InotifyEventFromStatMask(mask uint32) uint32 {
+	var ev uint32
+	if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 {
+		ev |= linux.IN_ATTRIB
+	}
+	if mask&linux.STATX_SIZE != 0 {
+		ev |= linux.IN_MODIFY
+	}
+
+	if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) {
+		// Both times indicates a utime(s) call.
+		ev |= linux.IN_ATTRIB
+	} else if mask&linux.STATX_ATIME != 0 {
+		ev |= linux.IN_ACCESS
+	} else if mask&linux.STATX_MTIME != 0 {
+		ev |= linux.IN_MODIFY
+	}
+	return ev
+}
+
+// InotifyRemoveChild sends the appropriate notifications to the watch sets of
+// the child being removed and its parent. Note that unlike most pairs of
+// parent/child notifications, the child is notified first in this case.
+func InotifyRemoveChild(self, parent *Watches, name string) {
+	if self != nil {
+		self.Notify("", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */)
+	}
+	if parent != nil {
+		parent.Notify(name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */)
+	}
+}
+
+// InotifyRename sends the appropriate notifications to the watch sets of the
+// file being renamed and its old/new parents. Any of the sets may be nil.
+func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) {
+	var dirEv uint32
+	if isDir {
+		dirEv = linux.IN_ISDIR
+	}
+	cookie := uniqueid.InotifyCookie(ctx)
+	if oldParent != nil {
+		oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */)
+	}
+	if newParent != nil {
+		newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */)
+	}
+	// Somewhat surprisingly, self move events do not have a cookie.
+	if renamed != nil {
+		renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */)
+	}
+}
diff --git a/pkg/sentry/vfs/lock.go b/pkg/sentry/vfs/lock.go
new file mode 100644
index 000000000..6c7583a81
--- /dev/null
+++ b/pkg/sentry/vfs/lock.go
@@ -0,0 +1,109 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package lock provides POSIX and BSD style file locking for VFS2 file
+// implementations.
+//
+// The actual implementations can be found in the lock package under
+// sentry/fs/lock.
+package vfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2)
+// and flock(2) respectively in Linux. It can be embedded into various file
+// implementations for VFS2 that support locking.
+//
+// Note that in Linux these two types of locks are _not_ cooperative, because
+// race and deadlock conditions make merging them prohibitive. We do the same
+// and keep them oblivious to each other.
+type FileLocks struct {
+	// bsd is a set of BSD-style advisory file-wide locks, see flock(2).
+	bsd fslock.Locks
+
+	// posix is a set of POSIX-style regional advisory locks, see fcntl(2).
+	posix fslock.Locks
+}
+
+// LockBSD tries to take a BSD-style lock of type t on the entire file for uid.
+func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	if !fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) {
+		return syserror.ErrWouldBlock
+	}
+	return nil
+}
+
+// UnlockBSD releases a BSD-style lock on the entire file. It cannot fail: the
+// call succeeds even if uid held no lock on the file in the first place.
+func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) {
+	// BSD locks always cover the whole file.
+	wholeFile := fslock.LockRange{0, fslock.LockEOF}
+	fl.bsd.UnlockRegion(uid, wholeFile)
+}
+
+// LockPOSIX tries to acquire a POSIX-style lock on a file region.
+func (fl *FileLocks) LockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	rng, err := computeRange(ctx, fd, start, length, whence)
+	switch {
+	case err != nil:
+		return err
+	case !fl.posix.LockRegion(uid, t, rng, block):
+		return syserror.ErrWouldBlock
+	}
+	return nil
+}
+
+// UnlockPOSIX releases a POSIX-style lock on a file region.
+//
+// Releasing always succeeds, even if uid held no lock on the requested region
+// in the first place; an error is returned only if computeRange fails.
+func (fl *FileLocks) UnlockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	rng, err := computeRange(ctx, fd, start, length, whence)
+	if err != nil {
+		return err
+	}
+	fl.posix.UnlockRegion(uid, rng)
+	return nil
+}
+
+// computeRange resolves (start, length, whence) into an absolute lock range.
+func computeRange(ctx context.Context, fd *FileDescription, start uint64, length uint64, whence int16) (fslock.LockRange, error) {
+	var base int64
+	switch whence {
+	case linux.SEEK_SET:
+		// start is already relative to the beginning of the file; base stays 0.
+	case linux.SEEK_CUR:
+		// Note that Linux does not hold any mutexes while retrieving the file
+		// offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk.
+		cur, err := fd.Seek(ctx, 0, linux.SEEK_CUR)
+		if err != nil {
+			return fslock.LockRange{}, err
+		}
+		base = cur
+	case linux.SEEK_END:
+		stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE})
+		if err != nil {
+			return fslock.LockRange{}, err
+		}
+		base = int64(stat.Size)
+	default:
+		return fslock.LockRange{}, syserror.EINVAL
+	}
+	return fslock.ComputeRange(int64(start), int64(length), base)
+}
diff --git a/pkg/sentry/vfs/memxattr/BUILD b/pkg/sentry/vfs/memxattr/BUILD
new file mode 100644
index 000000000..d8c4d27b9
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "memxattr",
+    srcs = ["xattr.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go
new file mode 100644
index 000000000..cc1e7d764
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/xattr.go
@@ -0,0 +1,102 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memxattr provides a default, in-memory extended attribute
+// implementation.
+package memxattr
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// SimpleExtendedAttributes implements extended attributes using a map of
+// names to values.
+//
+// +stateify savable
+type SimpleExtendedAttributes struct {
+	// mu protects xattrs, which stays nil until Setxattr first stores a value.
+	mu     sync.RWMutex `state:"nosave"`
+	xattrs map[string]string
+}
+
+// Getxattr returns the value at 'name', ENODATA if there is none, or ERANGE if
+// a non-zero opts.Size is smaller than the value.
+func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) {
+	x.mu.RLock()
+	val, found := x.xattrs[opts.Name]
+	x.mu.RUnlock()
+	switch {
+	case !found:
+		return "", syserror.ENODATA
+	case opts.Size != 0 && uint64(len(val)) > opts.Size:
+		// The buffer provided in getxattr(2) cannot hold the value.
+		return "", syserror.ERANGE
+	}
+	return val, nil
+}
+
+// Setxattr sets 'value' at 'name', honoring XATTR_CREATE (fail with EEXIST if
+// the name already exists) and XATTR_REPLACE (fail with ENODATA if it doesn't).
+func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
+	x.mu.Lock()
+	defer x.mu.Unlock()
+
+	// Reading from a nil map is fine and simply reports the name as absent,
+	// so the flag checks can run before the map is allocated.
+	_, exists := x.xattrs[opts.Name]
+	switch {
+	case exists && opts.Flags&linux.XATTR_CREATE != 0:
+		return syserror.EEXIST
+	case !exists && opts.Flags&linux.XATTR_REPLACE != 0:
+		return syserror.ENODATA
+	}
+	// Allocate the map lazily, on the first attribute that is stored.
+	if x.xattrs == nil {
+		x.xattrs = make(map[string]string)
+	}
+	x.xattrs[opts.Name] = opts.Value
+	return nil
+}
+
+// Listxattr returns all names in xattrs, or ERANGE if they exceed a non-zero size.
+func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
+	x.mu.RLock()
+	names := make([]string, 0, len(x.xattrs))
+	// needed is the buffer size listxattr(2) requires for the whole list.
+	needed := 0
+	for name := range x.xattrs {
+		// Each name is followed by one null terminator byte.
+		needed += len(name) + 1
+		names = append(names, name)
+	}
+	x.mu.RUnlock()
+	if size == 0 || uint64(needed) <= size {
+		return names, nil
+	}
+	return nil, syserror.ERANGE
+}
+
+// Removexattr removes the xattr at 'name', or returns ENODATA if none exists.
+func (x *SimpleExtendedAttributes) Removexattr(name string) error {
+	x.mu.Lock()
+	defer x.mu.Unlock()
+	if _, present := x.xattrs[name]; present {
+		delete(x.xattrs, name)
+		return nil
+	}
+	return syserror.ENODATA
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
new file mode 100644
index 000000000..32f901bd8
--- /dev/null
+++ b/pkg/sentry/vfs/mount.go
@@ -0,0 +1,903 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"bytes"
+	"fmt"
+	"math"
+	"sort"
+	"strings"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
+// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
+// (Mount.fs), which applies to path resolution in the context of a particular
+// Mount (Mount.key.parent).
+//
+// Mounts are reference-counted. Unless otherwise specified, all Mount methods
+// require that a reference is held.
+//
+// Mount and Filesystem are distinct types because it's possible for a single
+// Filesystem to be mounted at multiple locations and/or in multiple mount
+// namespaces.
+//
+// Mount is analogous to Linux's struct mount. (gVisor does not distinguish
+// between struct mount and struct vfsmount.)
+//
+// +stateify savable
+type Mount struct {
+	// vfs, fs, root are immutable. References are held on fs and root.
+	//
+	// Invariant: root belongs to fs.
+	vfs  *VirtualFilesystem
+	fs   *Filesystem
+	root *Dentry
+
+	// ID is the immutable mount ID, allocated from vfs.lastMountID by newMount.
+	ID uint64
+
+	// Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
+	// for MS_RDONLY which is tracked in "writers". Immutable.
+	Flags MountFlags
+
+	// key is protected by VirtualFilesystem.mountMu and
+	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
+	// key.parent and key.point if they are not nil.
+	//
+	// Invariant: key.parent != nil iff key.point != nil. key.point belongs to
+	// key.parent.fs.
+	key mountKey
+
+	// ns is the namespace in which this Mount was mounted. ns is protected by
+	// VirtualFilesystem.mountMu.
+	ns *MountNamespace
+
+	// The lower 63 bits of refs are a reference count. The MSB of refs is set
+	// if the Mount has been eagerly umounted, as by umount(2) without the
+	// MNT_DETACH flag. refs is accessed using atomic memory operations.
+	refs int64
+
+	// children is the set of all Mounts for which Mount.key.parent is this
+	// Mount. children is protected by VirtualFilesystem.mountMu.
+	children map[*Mount]struct{}
+
+	// umounted is true if VFS.umountRecursiveLocked() has been called on this
+	// Mount. VirtualFilesystem does not hold a reference on Mounts for which
+	// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
+	umounted bool
+
+	// The lower 63 bits of writers is the number of calls to
+	// Mount.CheckBeginWrite() that have not yet been paired with a call to
+	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
+	// writers is accessed using atomic memory operations.
+	writers int64
+}
+
+// newMount returns a Mount of fs, rooted at root, that belongs to mntns. The
+// Mount starts with a single reference and the next ID from vfs.lastMountID.
+// newMount itself takes no references on fs or root.
+func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
+	mnt := &Mount{vfs: vfs, fs: fs, root: root, ns: mntns, refs: 1}
+	mnt.ID = atomic.AddUint64(&vfs.lastMountID, 1)
+	mnt.Flags = opts.Flags
+
+	// The read-only state is not part of Flags; it is applied separately
+	// through setReadOnlyLocked.
+	if opts.ReadOnly {
+		mnt.setReadOnlyLocked(true)
+	}
+	return mnt
+}
+
+// Options returns a copy of the MountOptions currently applicable to mnt.
+func (mnt *Mount) Options() MountOptions {
+	mnt.vfs.mountMu.Lock()
+	defer mnt.vfs.mountMu.Unlock()
+	return MountOptions{
+		Flags:    mnt.Flags,
+		ReadOnly: mnt.readOnly(),
+	}
+}
+
+// A MountNamespace is a collection of Mounts. MountNamespaces are
+// reference-counted. Unless otherwise specified, all MountNamespace methods
+// require that a reference is held.
+//
+// MountNamespace is analogous to Linux's struct mnt_namespace.
+//
+// +stateify savable
+type MountNamespace struct {
+	// Owner is the user namespace that owns this mount namespace.
+	Owner *auth.UserNamespace
+
+	// root is the MountNamespace's root mount. root is immutable.
+	root *Mount
+
+	// refs is the reference count. refs is accessed using atomic memory
+	// operations.
+	refs int64
+
+	// mountpoints maps all Dentries which are mount points in this namespace
+	// to the number of Mounts for which they are mount points. mountpoints is
+	// protected by VirtualFilesystem.mountMu.
+	//
+	// mountpoints is used to determine if a Dentry can be moved or removed
+	// (which requires that the Dentry is not a mount point in the calling
+	// namespace).
+	//
+	// mountpoints is maintained even if there are no references held on the
+	// MountNamespace; this is required to ensure that
+	// VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate
+	// correctly on unreferenced MountNamespaces.
+	mountpoints map[*Dentry]uint32
+}
+
+// NewMountNamespace returns a new mount namespace whose root mount is a new
+// filesystem of type fsTypeName (ENODEV if that type is unknown). The caller
+// receives a reference on the returned MountNamespace.
+func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
+	rft := vfs.getFilesystemType(fsTypeName)
+	if rft == nil {
+		ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
+		return nil, syserror.ENODEV
+	}
+	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
+	if err != nil {
+		return nil, err
+	}
+	mntns := &MountNamespace{
+		Owner:       creds.UserNamespace,
+		refs:        1,
+		mountpoints: make(map[*Dentry]uint32),
+	}
+	mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{})
+	return mntns, nil
+}
+
+// NewDisconnectedMount returns a Mount representing fs with the given root
+// (which may be nil). The new Mount belongs to no MountNamespace and is not
+// connected to any other Mounts. References are taken on fs and root (if
+// non-nil). The returned error is currently always nil.
+func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) (*Mount, error) {
+	fs.IncRef()
+	if root != nil {
+		root.IncRef()
+	}
+	return newMount(vfs, fs, root, nil /* mntns */, opts), nil
+}
+
+// MountDisconnected creates a Filesystem configured by the given arguments,
+// then returns a Mount representing it. The new Mount is not associated with
+// any MountNamespace and is not connected to any other Mounts.
+func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) {
+	rft := vfs.getFilesystemType(fsTypeName)
+	// Unknown types and types barred from user mounts both yield ENODEV.
+	if rft == nil || (!opts.InternalMount && !rft.opts.AllowUserMount) {
+		return nil, syserror.ENODEV
+	}
+	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
+	if err != nil {
+		return nil, err
+	}
+	// NewDisconnectedMount takes its own references on fs and root, so the
+	// ones returned by GetFilesystem are dropped on return.
+	defer root.DecRef()
+	defer fs.DecRef()
+	return vfs.NewDisconnectedMount(fs, root, opts)
+}
+
+// ConnectMountAt connects mnt at the path represented by target; if that path
+// is already mounted over, mnt is connected on top of the mounts found there.
+// Returns ENOENT for a dead mount point. Preconditions: mnt is disconnected.
+func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error {
+	// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
+	// lock ordering.
+	vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
+	if err != nil {
+		return err
+	}
+	vfs.mountMu.Lock()
+	vd.dentry.mu.Lock()
+	for {
+		if vd.dentry.dead {
+			vd.dentry.mu.Unlock()
+			vfs.mountMu.Unlock()
+			vd.DecRef()
+			return syserror.ENOENT
+		}
+		// vd might have been mounted over between vfs.GetDentryAt() and
+		// vfs.mountMu.Lock().
+		if !vd.dentry.isMounted() {
+			break
+		}
+		nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry)
+		if nextmnt == nil {
+			break
+		}
+		// It's possible that nextmnt has been umounted but not disconnected,
+		// in which case vfs no longer holds a reference on it, and the last
+		// reference may be concurrently dropped even though we're holding
+		// vfs.mountMu.
+		if !nextmnt.tryIncMountedRef() {
+			break
+		}
+		// This can't fail since we're holding vfs.mountMu.
+		nextmnt.root.IncRef()
+		vd.dentry.mu.Unlock()
+		vd.DecRef()
+		vd = VirtualDentry{
+			mount:  nextmnt,
+			dentry: nextmnt.root,
+		}
+		vd.dentry.mu.Lock()
+	}
+	// TODO(gvisor.dev/issue/1035): Linux requires that either both the mount
+	// point and the mount root are directories, or neither are, and returns
+	// ENOTDIR if this is not the case.
+	mntns := vd.mount.ns
+	vfs.mounts.seq.BeginWrite()
+	vfs.connectLocked(mnt, vd, mntns)
+	vfs.mounts.seq.EndWrite()
+	vd.dentry.mu.Unlock()
+	vfs.mountMu.Unlock()
+	return nil
+}
+
+// MountAt creates a Filesystem configured by the given arguments and mounts it
+// at the path represented by target.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+	mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
+	if err != nil {
+		return err
+	}
+	// Drop the reference returned by MountDisconnected once the connection
+	// attempt has finished, whether or not it succeeded.
+	defer mnt.DecRef()
+	return vfs.ConnectMountAt(ctx, creds, mnt, target)
+}
+
+// UmountAt removes the Mount whose root is at the given path.
+func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
+	if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
+		return syserror.EINVAL
+	}
+
+	// MNT_FORCE is currently unimplemented except for the permission check.
+	// Force unmounting specifically requires CAP_SYS_ADMIN in the root user
+	// namespace, and not in the owner user namespace for the target mount. See
+	// fs/namespace.c:SYSCALL_DEFINE2(umount, ...)
+	if opts.Flags&linux.MNT_FORCE != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
+		return syserror.EPERM
+	}
+
+	vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{})
+	if err != nil {
+		return err
+	}
+	defer vd.DecRef()
+	if vd.dentry != vd.mount.root {
+		return syserror.EINVAL
+	}
+	vfs.mountMu.Lock()
+	if mntns := MountNamespaceFromContext(ctx); mntns != nil {
+		defer mntns.DecRef()
+		if mntns != vd.mount.ns {
+			vfs.mountMu.Unlock()
+			return syserror.EINVAL
+		}
+	}
+
+	// TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's
+	// root, which we don't implement yet (we'll just fail it since the caller
+	// holds a reference on it).
+
+	vfs.mounts.seq.BeginWrite()
+	if opts.Flags&linux.MNT_DETACH == 0 {
+		if len(vd.mount.children) != 0 {
+			vfs.mounts.seq.EndWrite()
+			vfs.mountMu.Unlock()
+			return syserror.EBUSY
+		}
+		// We are holding a reference on vd.mount.
+		expectedRefs := int64(1)
+		if !vd.mount.umounted {
+			expectedRefs = 2
+		}
+		if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB
+			vfs.mounts.seq.EndWrite()
+			vfs.mountMu.Unlock()
+			return syserror.EBUSY
+		}
+	}
+	vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{
+		eager:               opts.Flags&linux.MNT_DETACH == 0,
+		disconnectHierarchy: true,
+	}, nil, nil)
+	vfs.mounts.seq.EndWrite()
+	vfs.mountMu.Unlock()
+	for _, vd := range vdsToDecRef {
+		vd.DecRef()
+	}
+	for _, mnt := range mountsToDecRef {
+		mnt.DecRef()
+	}
+	return nil
+}
+
+// umountRecursiveOptions controls how VirtualFilesystem.umountRecursiveLocked
+// umounts a Mount and its descendants.
+type umountRecursiveOptions struct {
+	// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
+	// on umounted mounts fail. Analogous to Linux's UMOUNT_SYNC.
+	eager bool
+
+	// If disconnectHierarchy is true, Mounts that are umounted hierarchically
+	// should be disconnected from their parents. (Mounts whose parents are not
+	// umounted, which in most cases means the Mount passed to the initial call
+	// to umountRecursiveLocked, are unconditionally disconnected for
+	// consistency with Linux.)
+	//
+	// disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED.
+	disconnectHierarchy bool
+}
+
+// umountRecursiveLocked flags mnt and every Mount beneath it as umounted. No
+// mount or dentry references are released here; the VirtualDentries and Mounts
+// whose references must be dropped are appended to vdsToDecRef and
+// mountsToDecRef, which are returned. (This is necessary because filesystem
+// locks possibly taken by DentryImpl.DecRef() may precede vfs.mountMu in the
+// lock order, and Mount.DecRef() may lock vfs.mountMu.)
+//
+// umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree().
+//
+// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
+// writer critical section.
+func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) {
+	if !mnt.umounted {
+		mnt.umounted = true
+		mountsToDecRef = append(mountsToDecRef, mnt)
+		parent := mnt.parent()
+		if parent != nil && (opts.disconnectHierarchy || !parent.umounted) {
+			vdsToDecRef = append(vdsToDecRef, vfs.disconnectLocked(mnt))
+		}
+	}
+	if opts.eager {
+		// Set the MSB of mnt.refs so that tryIncMountedRef() fails; stop once
+		// the bit is observed or successfully installed.
+		for {
+			cur := atomic.LoadInt64(&mnt.refs)
+			if cur < 0 || atomic.CompareAndSwapInt64(&mnt.refs, cur, cur|math.MinInt64) {
+				break
+			}
+		}
+	}
+	for child := range mnt.children {
+		vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(child, opts, vdsToDecRef, mountsToDecRef)
+	}
+	return vdsToDecRef, mountsToDecRef
+}
+
+// connectLocked attaches mnt at vd, making vd its mount parent/point. The
+// references held by vd are consumed.
+//
+// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
+// writer critical section. d.mu must be locked. mnt.parent() == nil, i.e. mnt
+// must not already be connected.
+func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
+	if checkInvariants && mnt.parent() != nil {
+		panic("VFS.connectLocked called on connected mount")
+	}
+	mnt.IncRef() // dropped by callers of umountRecursiveLocked
+	mnt.storeKey(vd)
+	parent, point := vd.mount, vd.dentry
+	if parent.children == nil {
+		parent.children = make(map[*Mount]struct{})
+	}
+	parent.children[mnt] = struct{}{}
+	atomic.AddUint32(&point.mounts, 1)
+	mnt.ns = mntns
+	mntns.mountpoints[point]++
+	vfs.mounts.insertSeqed(mnt)
+	// Record mnt in the VFS-wide index of mounts keyed by mount point.
+	mountsAtPoint, ok := vfs.mountpoints[point]
+	if !ok {
+		mountsAtPoint = make(map[*Mount]struct{})
+		vfs.mountpoints[point] = mountsAtPoint
+	}
+	mountsAtPoint[mnt] = struct{}{}
+}
+
+// disconnectLocked makes mnt have no mount parent/point and returns its old
+// mount parent/point with a reference held.
+//
+// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
+// writer critical section. mnt.parent() != nil.
+func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
+	vd := mnt.loadKey()
+	if checkInvariants && vd.mount == nil {
+		panic("VFS.disconnectLocked called on disconnected mount")
+	}
+	// Undo connectLocked's bookkeeping. The references on vd that connectLocked
+	// consumed are handed back to the caller via the return value, not dropped.
+	mnt.storeKey(VirtualDentry{})
+	delete(vd.mount.children, mnt)
+	atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
+	mnt.ns.mountpoints[vd.dentry]--
+	if mnt.ns.mountpoints[vd.dentry] == 0 {
+		delete(mnt.ns.mountpoints, vd.dentry)
+	}
+	vfs.mounts.removeSeqed(mnt)
+	vfsmpmounts := vfs.mountpoints[vd.dentry]
+	delete(vfsmpmounts, mnt)
+	if len(vfsmpmounts) == 0 {
+		delete(vfs.mountpoints, vd.dentry)
+	}
+	return vd
+}
+
+// tryIncMountedRef attempts to take a reference on mnt, returning true on
+// success. If mnt's reference count has already dropped to zero, or mnt has
+// been eagerly umounted, it leaves the count untouched and returns false.
+//
+// Callers of tryIncMountedRef need not hold a reference on mnt.
+func (mnt *Mount) tryIncMountedRef() bool {
+	cur := atomic.LoadInt64(&mnt.refs)
+	for cur > 0 {
+		if atomic.CompareAndSwapInt64(&mnt.refs, cur, cur+1) {
+			return true
+		}
+		cur = atomic.LoadInt64(&mnt.refs)
+	}
+	// cur == 0 => no references remain; cur < 0 => MSB set => eagerly unmounted.
+	return false
+}
+
+// IncRef increments mnt's reference count.
+func (mnt *Mount) IncRef() {
+	// The result is deliberately not sanity-checked: in general, negative values
+	// for mnt.refs are valid because the MSB is the eager-unmount bit.
+	atomic.AddInt64(&mnt.refs, 1)
+}
+
+// DecRef drops a reference on mnt, releasing its resources if none remain.
+func (mnt *Mount) DecRef() {
+	if atomic.AddInt64(&mnt.refs, -1)&^math.MinInt64 != 0 { // mask out MSB
+		return
+	}
+	var vd VirtualDentry
+	if mnt.parent() != nil {
+		mnt.vfs.mountMu.Lock()
+		mnt.vfs.mounts.seq.BeginWrite()
+		vd = mnt.vfs.disconnectLocked(mnt)
+		mnt.vfs.mounts.seq.EndWrite()
+		mnt.vfs.mountMu.Unlock()
+	}
+	mnt.root.DecRef()
+	mnt.fs.DecRef()
+	if vd.Ok() {
+		vd.DecRef()
+	}
+}
+
+// IncRef takes an additional reference on mntns; one must already be held.
+func (mntns *MountNamespace) IncRef() {
+	if refs := atomic.AddInt64(&mntns.refs, 1); refs <= 1 {
+		panic("MountNamespace.IncRef() called without holding a reference")
+	}
+}
+
+// DecRef drops a reference on mntns, umounting its whole tree on the last one.
+func (mntns *MountNamespace) DecRef() {
+	vfs := mntns.root.fs.VirtualFilesystem()
+	switch refs := atomic.AddInt64(&mntns.refs, -1); {
+	case refs == 0:
+		vfs.mountMu.Lock()
+		vfs.mounts.seq.BeginWrite()
+		opts := umountRecursiveOptions{disconnectHierarchy: true}
+		vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &opts, nil, nil)
+		vfs.mounts.seq.EndWrite()
+		vfs.mountMu.Unlock()
+		for _, vd := range vdsToDecRef {
+			vd.DecRef()
+		}
+		for _, mnt := range mountsToDecRef {
+			mnt.DecRef()
+		}
+	case refs < 0:
+		panic("MountNamespace.DecRef() called without holding a reference")
+	}
+}
+
+// getMountAt returns the last (topmost) Mount in the stack mounted at (mnt,
+// d), with a reference taken on the returned Mount. If (mnt, d) is not a mount
+// point, getMountAt returns nil.
+//
+// getMountAt is analogous to Linux's fs/namei.c:follow_mount().
+//
+// Preconditions: References are held on mnt and d.
+func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount {
+	// The first mount is special-cased:
+	//
+	// - The caller is assumed to have checked d.isMounted() already. (This
+	// isn't a precondition because it doesn't matter for correctness.)
+	//
+	// - We return nil, instead of mnt, if there is no mount at (mnt, d).
+	//
+	// - We don't drop the caller's references on mnt and d.
+retryFirst:
+	next := vfs.mounts.Lookup(mnt, d)
+	if next == nil {
+		return nil
+	}
+	if !next.tryIncMountedRef() {
+		// Raced with umount; look the mount up again.
+		goto retryFirst
+	}
+	mnt = next
+	d = next.root
+	// We don't need to take Dentry refs anywhere in this function because
+	// Mounts hold references on Mount.root, which is immutable.
+	for d.isMounted() {
+		next := vfs.mounts.Lookup(mnt, d)
+		if next == nil {
+			break
+		}
+		if !next.tryIncMountedRef() {
+			// Raced with umount; retry the lookup at the same (mnt, d).
+			continue
+		}
+		mnt.DecRef() // drop the ref this function took on the mount below next
+		mnt = next
+		d = next.root
+	}
+	return mnt
+}
+
+// getMountpointAt returns the mount point for the stack of Mounts including
+// mnt, with a reference taken on the returned VirtualDentry. If no such mount
+// point exists (i.e. mnt is a root mount), it returns a zero VirtualDentry.
+//
+// Preconditions: References are held on mnt and vfsroot. vfsroot is not (mnt,
+// mnt.root).
+func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
+	// The first mount is special-cased:
+	//
+	// - The caller must have already checked mnt against vfsroot.
+	//
+	// - We return a zero VirtualDentry if there is no mount point for mnt.
+	//
+	// - We don't drop the caller's reference on mnt.
+retryFirst:
+	epoch := vfs.mounts.seq.BeginRead()
+	parent, point := mnt.parent(), mnt.point()
+	if !vfs.mounts.seq.ReadOk(epoch) {
+		goto retryFirst
+	}
+	if parent == nil {
+		return VirtualDentry{}
+	}
+	if !parent.tryIncMountedRef() {
+		// Raced with umount.
+		goto retryFirst
+	}
+	if !point.TryIncRef() {
+		// Since Mount holds a reference on Mount.key.point, this can only
+		// happen due to a racing change to Mount.key.
+		parent.DecRef()
+		goto retryFirst
+	}
+	if !vfs.mounts.seq.ReadOk(epoch) {
+		point.DecRef()
+		parent.DecRef()
+		goto retryFirst
+	}
+	mnt = parent
+	d := point
+	for {
+		if mnt == vfsroot.mount && d == vfsroot.dentry {
+			break
+		}
+		if d != mnt.root {
+			break
+		}
+	retryNotFirst:
+		epoch := vfs.mounts.seq.BeginRead()
+		parent, point := mnt.parent(), mnt.point()
+		if !vfs.mounts.seq.ReadOk(epoch) {
+			goto retryNotFirst
+		}
+		if parent == nil {
+			break
+		}
+		if !parent.tryIncMountedRef() {
+			// Raced with umount.
+			goto retryNotFirst
+		}
+		if !point.TryIncRef() {
+			// Since Mount holds a reference on Mount.key.point, this can
+			// only happen due to a racing change to Mount.key.
+			parent.DecRef()
+			goto retryNotFirst
+		}
+		if !vfs.mounts.seq.ReadOk(epoch) {
+			point.DecRef()
+			parent.DecRef()
+			goto retryNotFirst
+		}
+		d.DecRef()
+		mnt.DecRef()
+		mnt = parent
+		d = point
+	}
+	return VirtualDentry{mnt, d}
+}
+
+// CheckBeginWrite registers the start of a write operation on mnt by
+// incrementing its in-progress writer count. If mnt is mounted MS_RDONLY, it
+// leaves the count unchanged and returns EROFS.
+//
+// Each successful CheckBeginWrite must be paired with a call to EndWrite.
+func (mnt *Mount) CheckBeginWrite() error {
+	if atomic.AddInt64(&mnt.writers, 1) >= 0 {
+		return nil
+	}
+	// writers is negative, so mnt is read-only: undo the increment.
+	atomic.AddInt64(&mnt.writers, -1)
+	return syserror.EROFS
+}
+
+// EndWrite indicates that a write operation signaled by a previous successful
+// call to CheckBeginWrite has finished, decrementing mnt's writer count.
+func (mnt *Mount) EndWrite() {
+	atomic.AddInt64(&mnt.writers, -1)
+}
+
+// setReadOnlyLocked updates mnt's read-only bit. VFS.mountMu must be locked.
+func (mnt *Mount) setReadOnlyLocked(ro bool) error {
+	if mnt.readOnly() == ro {
+		return nil
+	}
+	if !ro {
+		// Clear the MSB but keep transient failed-CheckBeginWrite() increments.
+		atomic.AddInt64(&mnt.writers, math.MinInt64)
+		return nil
+	}
+	if !atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) {
+		return syserror.EBUSY
+	}
+	return nil
+}
+
+// readOnly reports whether mnt is read-only, i.e. the MSB of writers is set.
+func (mnt *Mount) readOnly() bool {
+	return atomic.LoadInt64(&mnt.writers) < 0
+}
+
+// Filesystem returns the Filesystem mounted by mnt. No reference is taken on
+// the returned Filesystem.
+func (mnt *Mount) Filesystem() *Filesystem {
+	return mnt.fs
+}
+
+// submountsLocked returns a slice holding mnt followed by every Mount that
+// descends from it.
+//
+// Precondition: mnt.vfs.mountMu must be held.
+func (mnt *Mount) submountsLocked() []*Mount {
+	all := []*Mount{mnt}
+	for child := range mnt.children {
+		all = append(all, child.submountsLocked()...)
+	}
+	return all
+}
+
+// Root returns the Dentry at the root of mnt. No reference is taken on the
+// returned Dentry.
+func (mnt *Mount) Root() *Dentry {
+	return mnt.root
+}
+
+// Root returns a VirtualDentry for the root of mntns, i.e. the root Dentry of
+// mntns' root Mount. The caller receives a reference on the returned
+// VirtualDentry.
+func (mntns *MountNamespace) Root() VirtualDentry {
+	rootMnt := mntns.root
+	vd := VirtualDentry{mount: rootMnt, dentry: rootMnt.root}
+	// Take the reference that is handed to the caller.
+	vd.IncRef()
+	return vd
+}
+
+// GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+	vfs.mountMu.Lock()
+	defer vfs.mountMu.Unlock()
+	rootMnt := taskRootDir.mount
+	mounts := rootMnt.submountsLocked()
+	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+	for _, mnt := range mounts {
+		// Get the path to this mount relative to task root.
+		mntRootVD := VirtualDentry{
+			mount:  mnt,
+			dentry: mnt.root,
+		}
+		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+		if err != nil {
+			// For some reason we didn't get a path. Log a warning
+			// and run with empty path.
+			ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+			path = ""
+		}
+		if path == "" {
+			// Either an error occurred, or path is not reachable from
+			// root. Skip only this mount; later mounts may still be visible.
+			continue
+		}
+
+		opts := "rw"
+		if mnt.readOnly() {
+			opts = "ro"
+		}
+		if mnt.Flags.NoATime {
+			opts += ",noatime"
+		}
+		if mnt.Flags.NoExec {
+			opts += ",noexec"
+		}
+
+		// Format:
+		// <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
+		//
+		// The "needs dump" and "fsck order" flags are always 0, which
+		// is allowed.
+		fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0)
+	}
+}
+
+// GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to
+// buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+	vfs.mountMu.Lock()
+	defer vfs.mountMu.Unlock()
+	rootMnt := taskRootDir.mount
+	mounts := rootMnt.submountsLocked()
+	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+	for _, mnt := range mounts {
+		// Get the path to this mount relative to task root.
+		mntRootVD := VirtualDentry{
+			mount:  mnt,
+			dentry: mnt.root,
+		}
+		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+		if err != nil {
+			// For some reason we didn't get a path. Log a warning
+			// and run with empty path.
+			ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+			path = ""
+		}
+		if path == "" {
+			// Either an error occurred, or path is not reachable from
+			// root. Skip only this mount; later mounts may still be visible.
+			continue
+		}
+		// Stat the mount root to get the major/minor device numbers.
+		pop := &PathOperation{
+			Root:  mntRootVD,
+			Start: mntRootVD,
+		}
+		statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{})
+		if err != nil {
+			// Well that's not good. Ignore this mount.
+			continue
+		}
+
+		// Format:
+		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
+		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
+
+		// (1) Mount ID.
+		fmt.Fprintf(buf, "%d ", mnt.ID)
+
+		// (2)  Parent ID (or this ID if there is no parent).
+		pID := mnt.ID
+		if p := mnt.parent(); p != nil {
+			pID = p.ID
+		}
+		fmt.Fprintf(buf, "%d ", pID)
+
+		// (3) Major:Minor device ID. We don't have a superblock, so we
+		// just use the root inode device number.
+		fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor)
+
+		// (4) Root: the pathname of the directory in the filesystem
+		// which forms the root of this mount.
+		//
+		// NOTE(b/78135857): This will always be "/" until we implement
+		// bind mounts.
+		fmt.Fprintf(buf, "/ ")
+
+		// (5) Mount point (relative to process root).
+		fmt.Fprintf(buf, "%s ", manglePath(path))
+
+		// (6) Mount options.
+		opts := "rw"
+		if mnt.readOnly() {
+			opts = "ro"
+		}
+		if mnt.Flags.NoATime {
+			opts += ",noatime"
+		}
+		if mnt.Flags.NoExec {
+			opts += ",noexec"
+		}
+		fmt.Fprintf(buf, "%s ", opts)
+
+		// (7) Optional fields: zero or more fields of the form "tag[:value]".
+		// (8) Separator: the end of the optional fields is marked by a single hyphen.
+		fmt.Fprintf(buf, "- ")
+
+		// (9) Filesystem type.
+		fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name())
+
+		// (10) Mount source: filesystem-specific information or "none".
+		fmt.Fprintf(buf, "none ")
+
+		// (11) Superblock options, and final newline.
+		fmt.Fprintf(buf, "%s\n", superBlockOpts(path, mnt))
+	}
+}
+
+// manglePath returns p with each ' ', '\t', '\n', and '\\' replaced by its
+// three-digit octal escape, so that p forms a single whitespace-free field.
+// See Linux fs/seq_file.c:mangle_path.
+func manglePath(p string) string {
+	return strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134").Replace(p)
+}
+
+// superBlockOpts returns the super block options string for the mount at the
+// given path.
+func superBlockOpts(mountPath string, mnt *Mount) string {
+	// gVisor doesn't (yet) have a concept of super block options, so we
+	// use the ro/rw bit from the mount flag.
+	opts := "rw"
+	if mnt.readOnly() {
+		opts = "ro"
+	}
+	if mnt.fs.FilesystemType().Name() != "cgroup" {
+		return opts
+	}
+
+	// NOTE(b/147673608): If the mount is a cgroup, we also need to include
+	// the cgroup name in the options. For now we just read that from the
+	// path.
+	//
+	// TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we
+	// should get this value from the cgroup itself, and not rely on the
+	// path.
+	//
+	// The cgroup name is taken to be the final "/"-separated path component.
+	return opts + "," + mountPath[strings.LastIndex(mountPath, "/")+1:]
+}
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
new file mode 100644
index 000000000..3335e4057
--- /dev/null
+++ b/pkg/sentry/vfs/mount_test.go
@@ -0,0 +1,458 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"fmt"
+	"runtime"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+func TestMountTableLookupEmpty(t *testing.T) {
+	var table mountTable
+	table.Init()
+
+	anyParent, anyPoint := &Mount{}, &Dentry{}
+	// Nothing has been inserted, so any key must miss.
+	if got := table.Lookup(anyParent, anyPoint); got != nil {
+		t.Errorf("empty mountTable lookup: got %p, wanted nil", got)
+	}
+}
+
+func TestMountTableInsertLookup(t *testing.T) {
+	var table mountTable
+	table.Init()
+
+	want := &Mount{}
+	want.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
+	table.Insert(want)
+
+	if got := table.Lookup(want.parent(), want.point()); got != want {
+		t.Errorf("mountTable positive lookup: got %p, wanted %p", got, want)
+	}
+
+	wrongParent := &Mount{}
+	if got := table.Lookup(wrongParent, want.point()); got != nil {
+		t.Errorf("mountTable lookup with wrong mount parent: got %p, wanted nil", got)
+	}
+	wrongPoint := &Dentry{}
+	if got := table.Lookup(want.parent(), wrongPoint); got != nil {
+		t.Errorf("mountTable lookup with wrong mount point: got %p, wanted nil", got)
+	}
+}
+
+// TODO(gvisor.dev/issue/1035): concurrent lookup/insertion/removal.
+
+// benchNumMounts lists the mount counts benchmarked; each must be a power of 2.
+var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8}
+
+// enableComparativeBenchmarks controls whether the BenchmarkMountMap* and
+// BenchmarkMountSyncMap* benchmarks run; they are skipped when it is false.
+// For all of the following:
+//
+// - BenchmarkMountTableFoo tests usage pattern "Foo" for mountTable.
+// - BenchmarkMountMapFoo tests usage pattern "Foo" for a
+// sync.RWMutex-protected map. (Mutator benchmarks do not use a RWMutex, since
+// mountTable also requires external synchronization between mutators.)
+// - BenchmarkMountSyncMapFoo tests usage pattern "Foo" for a sync.Map.
+//
+// ParallelLookup is by far the most common and performance-sensitive operation
+// for this application. NegativeLookup is also important, but less so (only
+// relevant with multiple mount namespaces and significant differences in
+// mounts between them). Insertion and removal are benchmarked for
+// completeness.
+const enableComparativeBenchmarks = false
+
+func newBenchMount() *Mount {
+	m := new(Mount)
+	m.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
+	return m
+}
+
+func BenchmarkMountTableParallelLookup(b *testing.B) {
+	for workers, limit := 1, runtime.GOMAXPROCS(0); workers >= 0 && workers <= limit; workers *= 2 {
+		for _, count := range benchNumMounts {
+			name := fmt.Sprintf("%dx%d", workers, count)
+			b.Run(name, func(b *testing.B) {
+				var table mountTable
+				table.Init()
+				keys := make([]VirtualDentry, 0, count)
+				for i := 0; i < count; i++ {
+					mnt := newBenchMount()
+					table.Insert(mnt)
+					keys = append(keys, mnt.loadKey())
+				}
+
+				var started sync.WaitGroup
+				start := make(chan struct{})
+				var finished sync.WaitGroup
+				for w := 0; w < workers; w++ {
+					started.Add(1)
+					finished.Add(1)
+					go func() {
+						defer finished.Done()
+						started.Done()
+						<-start
+						for i := 0; i < b.N; i++ {
+							key := keys[i&(count-1)]
+							got := table.Lookup(key.mount, key.dentry)
+							if got == nil {
+								b.Fatalf("lookup failed")
+							}
+							if parent := got.parent(); parent != key.mount {
+								b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, key.mount)
+							}
+							if point := got.point(); point != key.dentry {
+								b.Fatalf("lookup returned mount with point %p, wanted %p", point, key.dentry)
+							}
+						}
+					}()
+				}
+
+				started.Wait()
+				b.ResetTimer()
+				close(start)
+				finished.Wait()
+			})
+		}
+	}
+}
+
+func BenchmarkMountMapParallelLookup(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	for workers, limit := 1, runtime.GOMAXPROCS(0); workers >= 0 && workers <= limit; workers *= 2 {
+		for _, count := range benchNumMounts {
+			name := fmt.Sprintf("%dx%d", workers, count)
+			b.Run(name, func(b *testing.B) {
+				var lock sync.RWMutex
+				byKey := make(map[VirtualDentry]*Mount)
+				keys := make([]VirtualDentry, 0, count)
+				for i := 0; i < count; i++ {
+					mnt := newBenchMount()
+					key := mnt.loadKey()
+					byKey[key] = mnt
+					keys = append(keys, key)
+				}
+
+				var started sync.WaitGroup
+				start := make(chan struct{})
+				var finished sync.WaitGroup
+				for w := 0; w < workers; w++ {
+					started.Add(1)
+					finished.Add(1)
+					go func() {
+						defer finished.Done()
+						started.Done()
+						<-start
+						for i := 0; i < b.N; i++ {
+							key := keys[i&(count-1)]
+							lock.RLock()
+							got := byKey[key]
+							lock.RUnlock()
+							if got == nil {
+								b.Fatalf("lookup failed")
+							}
+							if parent := got.parent(); parent != key.mount {
+								b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, key.mount)
+							}
+							if point := got.point(); point != key.dentry {
+								b.Fatalf("lookup returned mount with point %p, wanted %p", point, key.dentry)
+							}
+						}
+					}()
+				}
+
+				started.Wait()
+				b.ResetTimer()
+				close(start)
+				finished.Wait()
+			})
+		}
+	}
+}
+
+func BenchmarkMountSyncMapParallelLookup(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	for workers, limit := 1, runtime.GOMAXPROCS(0); workers >= 0 && workers <= limit; workers *= 2 {
+		for _, count := range benchNumMounts {
+			name := fmt.Sprintf("%dx%d", workers, count)
+			b.Run(name, func(b *testing.B) {
+				var byKey sync.Map
+				keys := make([]VirtualDentry, 0, count)
+				for i := 0; i < count; i++ {
+					mnt := newBenchMount()
+					key := mnt.loadKey()
+					byKey.Store(key, mnt)
+					keys = append(keys, key)
+				}
+
+				var started sync.WaitGroup
+				start := make(chan struct{})
+				var finished sync.WaitGroup
+				for w := 0; w < workers; w++ {
+					started.Add(1)
+					finished.Add(1)
+					go func() {
+						defer finished.Done()
+						started.Done()
+						<-start
+						for i := 0; i < b.N; i++ {
+							key := keys[i&(count-1)]
+							val, found := byKey.Load(key)
+							if !found {
+								b.Fatalf("lookup failed")
+							}
+							got := val.(*Mount)
+							if parent := got.parent(); parent != key.mount {
+								b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, key.mount)
+							}
+							if point := got.point(); point != key.dentry {
+								b.Fatalf("lookup returned mount with point %p, wanted %p", point, key.dentry)
+							}
+						}
+					}()
+				}
+
+				started.Wait()
+				b.ResetTimer()
+				close(start)
+				finished.Wait()
+			})
+		}
+	}
+}
+
+func BenchmarkMountTableNegativeLookup(b *testing.B) {
+	for _, count := range benchNumMounts {
+		name := fmt.Sprintf("%d", count)
+		b.Run(name, func(b *testing.B) {
+			var table mountTable
+			table.Init()
+			for i := 0; i < count; i++ {
+				table.Insert(newBenchMount())
+			}
+			misses := make([]VirtualDentry, 0, count)
+			for i := 0; i < count; i++ {
+				misses = append(misses, VirtualDentry{
+					mount:  &Mount{},
+					dentry: &Dentry{},
+				})
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				key := misses[i&(count-1)]
+				got := table.Lookup(key.mount, key.dentry)
+				if got != nil {
+					b.Fatalf("lookup got %p, wanted nil", got)
+				}
+			}
+		})
+	}
+}
+
+func BenchmarkMountMapNegativeLookup(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	for _, count := range benchNumMounts {
+		name := fmt.Sprintf("%d", count)
+		b.Run(name, func(b *testing.B) {
+			var lock sync.RWMutex
+			byKey := make(map[VirtualDentry]*Mount)
+			for i := 0; i < count; i++ {
+				mnt := newBenchMount()
+				byKey[mnt.loadKey()] = mnt
+			}
+			misses := make([]VirtualDentry, 0, count)
+			for i := 0; i < count; i++ {
+				misses = append(misses, VirtualDentry{
+					mount:  &Mount{},
+					dentry: &Dentry{},
+				})
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				key := misses[i&(count-1)]
+				lock.RLock()
+				got := byKey[key]
+				lock.RUnlock()
+				if got != nil {
+					b.Fatalf("lookup got %p, wanted nil", got)
+				}
+			}
+		})
+	}
+}
+
+func BenchmarkMountSyncMapNegativeLookup(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	for _, count := range benchNumMounts {
+		name := fmt.Sprintf("%d", count)
+		b.Run(name, func(b *testing.B) {
+			var byKey sync.Map
+			for i := 0; i < count; i++ {
+				mnt := newBenchMount()
+				byKey.Store(mnt.loadKey(), mnt)
+			}
+			misses := make([]VirtualDentry, 0, count)
+			for i := 0; i < count; i++ {
+				misses = append(misses, VirtualDentry{
+					mount:  &Mount{},
+					dentry: &Dentry{},
+				})
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				key := misses[i&(count-1)]
+				got, _ := byKey.Load(key)
+				if got != nil {
+					b.Fatalf("lookup got %p, wanted nil", got)
+				}
+			}
+		})
+	}
+}
+
+func BenchmarkMountTableInsert(b *testing.B) {
+	// Build every Mount up front so that the timed region measures only
+	// insertion, not allocation.
+	pending := make([]*Mount, 0, b.N)
+	for i := 0; i < b.N; i++ {
+		pending = append(pending, newBenchMount())
+	}
+
+	var table mountTable
+	table.Init()
+	b.ResetTimer()
+	for _, mnt := range pending {
+		table.Insert(mnt)
+	}
+}
+
+func BenchmarkMountMapInsert(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	// Build every Mount up front so that the timed region measures only
+	// insertion, not allocation.
+	pending := make([]*Mount, 0, b.N)
+	for i := 0; i < b.N; i++ {
+		pending = append(pending, newBenchMount())
+	}
+
+	byKey := make(map[VirtualDentry]*Mount)
+	b.ResetTimer()
+	// Timed region: one map store per preallocated Mount.
+	for _, mnt := range pending {
+		byKey[mnt.loadKey()] = mnt
+	}
+}
+
+func BenchmarkMountSyncMapInsert(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	// Build every Mount up front so that the timed region measures only
+	// insertion, not allocation.
+	pending := make([]*Mount, 0, b.N)
+	for i := 0; i < b.N; i++ {
+		pending = append(pending, newBenchMount())
+	}
+
+	var byKey sync.Map
+	b.ResetTimer()
+	// Timed region: one sync.Map store per preallocated Mount.
+	for _, mnt := range pending {
+		byKey.Store(mnt.loadKey(), mnt)
+	}
+}
+
+func BenchmarkMountTableRemove(b *testing.B) {
+	inserted := make([]*Mount, 0, b.N)
+	for i := 0; i < b.N; i++ {
+		inserted = append(inserted, newBenchMount())
+	}
+	var table mountTable
+	table.Init()
+	for _, mnt := range inserted {
+		table.Insert(mnt)
+	}
+
+	b.ResetTimer()
+	for _, mnt := range inserted {
+		table.Remove(mnt)
+	}
+}
+
+func BenchmarkMountMapRemove(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	inserted := make([]*Mount, 0, b.N)
+	for i := 0; i < b.N; i++ {
+		inserted = append(inserted, newBenchMount())
+	}
+	byKey := make(map[VirtualDentry]*Mount)
+	// Untimed setup: populate the map with every Mount.
+	for _, mnt := range inserted {
+		byKey[mnt.loadKey()] = mnt
+	}
+
+	b.ResetTimer()
+	// Timed region: delete each Mount's key again.
+	for _, mnt := range inserted {
+		delete(byKey, mnt.loadKey())
+	}
+}
+
+func BenchmarkMountSyncMapRemove(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	inserted := make([]*Mount, 0, b.N)
+	for i := 0; i < b.N; i++ {
+		inserted = append(inserted, newBenchMount())
+	}
+	var byKey sync.Map
+	// Untimed setup: populate the sync.Map with every Mount.
+	for _, mnt := range inserted {
+		byKey.Store(mnt.loadKey(), mnt)
+	}
+
+	b.ResetTimer()
+	// Timed region: delete each Mount's key again.
+	for _, mnt := range inserted {
+		byKey.Delete(mnt.loadKey())
+	}
+}
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
new file mode 100644
index 000000000..70f850ca4
--- /dev/null
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -0,0 +1,364 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build go1.12
+// +build !go1.16
+
+// Check go:linkname function signatures when updating Go version.
+
+package vfs
+
+import (
+	"fmt"
+	"math/bits"
+	"reflect"
+	"sync/atomic"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/gohacks"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// mountKey represents the location at which a Mount is mounted. It is
+// structurally identical to VirtualDentry, but stores its fields as
+// unsafe.Pointer so that they can be accessed with sync/atomic, since
+// mutators synchronize with VFS path traversal using seqcounts.
+type mountKey struct {
+	parent unsafe.Pointer // *Mount
+	point  unsafe.Pointer // *Dentry
+}
+
+func (mnt *Mount) parent() *Mount {
+	return (*Mount)(atomic.LoadPointer(&mnt.key.parent)) // atomic load of the parent half of mnt.key
+}
+
+func (mnt *Mount) point() *Dentry {
+	return (*Dentry)(atomic.LoadPointer(&mnt.key.point)) // atomic load of the point half of mnt.key
+}
+
+func (mnt *Mount) loadKey() VirtualDentry {
+	return VirtualDentry{
+		mount:  mnt.parent(),
+		dentry: mnt.point(),
+	}
+}
+
+// storeKey sets mnt.key from vd. Preconditions: mnt.key.parent == nil. vd.Ok().
+func (mnt *Mount) storeKey(vd VirtualDentry) {
+	atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount))
+	atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry))
+}
+
+// mountTable maps (mount parent, mount point) pairs to mounts. It supports
+// efficient concurrent lookup, even in the presence of concurrent mutators
+// (provided mutation is sufficiently uncommon).
+//
+// mountTable.Init() must be called on new mountTables before use.
+//
+// +stateify savable
+type mountTable struct {
+	// mountTable is implemented as a seqcount-protected hash table that
+	// resolves collisions with linear probing, featuring Robin Hood insertion
+	// and backward shift deletion. These minimize probe length variance,
+	// significantly improving the performance of linear probing at high load
+	// factors. (mountTable doesn't use bucketing, which is the other major
+	// technique commonly used in high-performance hash tables; the efficiency
+	// of bucketing is largely due to SIMD lookup, and Go lacks both SIMD
+	// intrinsics and inline assembly, limiting the performance of this
+	// approach.)
+
+	seq  sync.SeqCount `state:"nosave"`
+	seed uint32        // for hashing keys
+
+	// size holds both length (number of elements) and capacity (number of
+	// slots): capacity is stored as its base-2 log (referred to as order) in
+	// the least significant mtSizeOrderBits bits of size, and length is stored
+	// in the remaining bits. Go defines bit shifts >= width of shifted unsigned
+	// operand as shifting to 0, which differs from x86's SHL, so the Go
+	// compiler inserts a bounds check for each bit shift unless we mask order
+	// anyway (cf. runtime.bucketShift()), and length isn't used by lookup;
+	// thus this bit packing gets us more bits for the length (vs. storing
+	// length and cap in separate uint32s) for ~free.
+	size uint64
+
+	slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init
+}
+
+type mountSlot struct {
+	// We don't store keys in slots; instead, we just check Mount.key.parent
+	// and Mount.key.point directly. Any practical use of lookup will need to
+	// touch Mounts anyway, and comparing hashes means that false positives
+	// are extremely rare, so this isn't an extra cache line touch overall.
+	value unsafe.Pointer // *Mount; nil if the slot is empty
+	hash  uintptr        // hash of value's key, as computed by memhash
+}
+
+const (
+	mtSizeOrderBits = 6 // log2 of pointer size in bits
+	mtSizeOrderMask = (1 << mtSizeOrderBits) - 1
+	mtSizeOrderOne  = 1 // adding this to mountTable.size increments order
+	mtSizeLenLSB    = mtSizeOrderBits
+	mtSizeLenOne    = 1 << mtSizeLenLSB        // adding this to mountTable.size increments length
+	mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB
+
+	mountSlotBytes = unsafe.Sizeof(mountSlot{})
+	mountKeyBytes  = unsafe.Sizeof(mountKey{})
+
+	// Tuning parameters.
+	//
+	// Essentially every mountTable will contain at least /proc, /sys, and
+	// /dev/shm, so there is ~no reason for mtInitCap to be < 4.
+	mtInitOrder  = 2
+	mtInitCap    = 1 << mtInitOrder
+	mtMaxLoadNum = 13 // maximum load factor is mtMaxLoadNum/mtMaxLoadDen
+	mtMaxLoadDen = 16
+)
+
+func init() {
+	// Go has no constexpr, so mtSizeOrderBits can't be derived from the
+	// pointer size at compile time; check the hard-coded value here instead.
+	if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) {
+		panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits))
+	}
+	if bits.OnesCount(uint(mountSlotBytes)) != 1 { // exactly one bit set <=> power of 2
+		panic(fmt.Sprintf("sizeof(mountSlot) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes))
+	}
+	if mtInitCap <= 1 {
+		panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap))
+	}
+	if mtMaxLoadNum >= mtMaxLoadDen { // the load factor must stay strictly below 1
+		panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen))
+	}
+}
+
+// Init must be called exactly once on each mountTable before use.
+func (mt *mountTable) Init() {
+	mt.seed = rand32()
+	mt.size = mtInitOrder // length 0, order mtInitOrder
+	mt.slots = newMountTableSlots(mtInitCap)
+}
+
+func newMountTableSlots(cap uintptr) unsafe.Pointer {
+	slice := make([]mountSlot, cap, cap) // zeroed, so every slot starts with a nil value
+	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
+	return unsafe.Pointer(hdr.Data) // address of the first slot in the backing array
+}
+
+// Lookup returns the Mount with the given parent, mounted at the given point.
+// If no such Mount exists, Lookup returns nil.
+//
+// Lookup may be called even if there are concurrent mutators of mt.
+func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
+	key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
+	hash := memhash(gohacks.Noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
+
+loop:
+	for {
+		epoch := mt.seq.BeginRead()
+		size := atomic.LoadUint64(&mt.size)
+		slots := atomic.LoadPointer(&mt.slots)
+		if !mt.seq.ReadOk(epoch) {
+			continue // a writer intervened; reload size and slots
+		}
+		tcap := uintptr(1) << (size & mtSizeOrderMask) // capacity = 1 << order
+		mask := tcap - 1
+		off := (hash & mask) * mountSlotBytes
+		offmask := mask * mountSlotBytes // wraps byte offsets within the table
+		for {
+			// Raw pointer arithmetic (vs. slice indexing) avoids bounds checking.
+			slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
+			slotValue := atomic.LoadPointer(&slot.value)
+			slotHash := atomic.LoadUintptr(&slot.hash)
+			if !mt.seq.ReadOk(epoch) {
+				// The element we're looking for might have been moved into a
+				// slot we've previously checked, so restart entirely.
+				continue loop
+			}
+			if slotValue == nil {
+				return nil // an empty slot terminates the probe sequence
+			}
+			if slotHash == hash {
+				mount := (*Mount)(slotValue)
+				var mountKey mountKey
+				mountKey.parent = atomic.LoadPointer(&mount.key.parent)
+				mountKey.point = atomic.LoadPointer(&mount.key.point)
+				if !mt.seq.ReadOk(epoch) {
+					continue loop
+				}
+				if key == mountKey {
+					return mount
+				}
+			}
+			off = (off + mountSlotBytes) & offmask // next slot, wrapping around
+		}
+	}
+}
+
+// Insert inserts the given mount into mt.
+//
+// Preconditions: mt must not already contain a Mount with the same mount point
+// and parent.
+func (mt *mountTable) Insert(mount *Mount) {
+	mt.seq.BeginWrite() // concurrent Lookups detect the write via mt.seq and retry
+	mt.insertSeqed(mount)
+	mt.seq.EndWrite()
+}
+
+// insertSeqed inserts the given mount into mt.
+//
+// Preconditions: mt.seq must be in a writer critical section. mt must not
+// already contain a Mount with the same mount point and parent.
+func (mt *mountTable) insertSeqed(mount *Mount) {
+	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
+
+	// We're under the maximum load factor if:
+	//
+	//          (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen
+	// (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap
+	tlen := mt.size >> mtSizeLenLSB // number of elements currently stored
+	order := mt.size & mtSizeOrderMask
+	tcap := uintptr(1) << order // number of slots
+	if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) {
+		// Update size atomically, then insert the new element in place.
+		atomic.AddUint64(&mt.size, mtSizeLenOne)
+		mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash)
+		return
+	}
+
+	// Otherwise, we have to expand. Double the number of slots in the new
+	// table.
+	newOrder := order + 1
+	if newOrder > mtSizeOrderMask {
+		panic("mount table size overflow")
+	}
+	newCap := uintptr(1) << newOrder
+	newSlots := newMountTableSlots(newCap)
+	// Rehash existing elements into the new table, reusing their stored hashes.
+	oldCur := mt.slots
+	// Go does not permit pointers to the end of allocated objects, so we
+	// must use a pointer to the last element of the old table. The
+	// following expression is equivalent to
+	// `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2
+	// arithmetic instructions instead of 3.
+	oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes))
+	for {
+		oldSlot := (*mountSlot)(oldCur)
+		if oldSlot.value != nil {
+			mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash)
+		}
+		if oldCur == oldLast {
+			break
+		}
+		oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes)
+	}
+	// Insert the new element into the new table.
+	mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash)
+	// Switch to the new table: increment length and order, then publish slots.
+	atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne)
+	atomic.StorePointer(&mt.slots, newSlots)
+}
+
+// mtInsertLocked inserts value into the table (slots, cap). Preconditions:
+// No concurrent mutators of the table. If the table is visible to readers,
+// mt.seq must be in a writer critical section. cap must be a power of 2.
+func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) {
+	mask := cap - 1
+	off := (hash & mask) * mountSlotBytes
+	offmask := mask * mountSlotBytes
+	disp := uintptr(0) // distance of the element being inserted from its first-probed slot
+	for {
+		slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
+		slotValue := slot.value
+		if slotValue == nil {
+			atomic.StorePointer(&slot.value, value)
+			atomic.StoreUintptr(&slot.hash, hash)
+			return
+		}
+		// If we've been displaced farther from our first-probed slot than the
+		// element stored in this one, swap elements and switch to inserting
+		// the replaced one. (This is Robin Hood insertion.)
+		slotHash := slot.hash
+		slotDisp := ((off / mountSlotBytes) - slotHash) & mask // displacement of the slot's current element
+		if disp > slotDisp {
+			atomic.StorePointer(&slot.value, value)
+			atomic.StoreUintptr(&slot.hash, hash)
+			value = slotValue
+			hash = slotHash
+			disp = slotDisp
+		}
+		off = (off + mountSlotBytes) & offmask
+		disp++
+	}
+}
+
+// Remove removes the given mount from mt.
+//
+// Preconditions: mt must contain mount.
+func (mt *mountTable) Remove(mount *Mount) {
+	mt.seq.BeginWrite() // concurrent Lookups detect the write via mt.seq and retry
+	mt.removeSeqed(mount)
+	mt.seq.EndWrite()
+}
+
+// removeSeqed removes the given mount from mt.
+//
+// Preconditions: mt.seq must be in a writer critical section. mt must contain
+// mount, whose key must still hash as it did on insertion.
+func (mt *mountTable) removeSeqed(mount *Mount) {
+	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
+	tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
+	mask := tcap - 1
+	slots := mt.slots
+	off := (hash & mask) * mountSlotBytes
+	offmask := mask * mountSlotBytes
+	for {
+		slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
+		slotValue := slot.value
+		if slotValue == unsafe.Pointer(mount) {
+			// Found the element to remove. Move all subsequent elements
+			// backward until we either find an empty slot, or an element that
+			// is already in its first-probed slot. (This is backward shift
+			// deletion.)
+			for {
+				nextOff := (off + mountSlotBytes) & offmask
+				nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff))
+				nextSlotValue := nextSlot.value
+				if nextSlotValue == nil {
+					break
+				}
+				nextSlotHash := nextSlot.hash
+				if (nextOff / mountSlotBytes) == (nextSlotHash & mask) {
+					break
+				}
+				atomic.StorePointer(&slot.value, nextSlotValue)
+				atomic.StoreUintptr(&slot.hash, nextSlotHash)
+				off = nextOff
+				slot = nextSlot
+			}
+			atomic.StorePointer(&slot.value, nil)
+			atomic.AddUint64(&mt.size, mtSizeLenNegOne) // length--; the order bits are unaffected
+			return
+		}
+		if checkInvariants && slotValue == nil { // without this check, a missing mount is probed for forever
+			panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount))
+		}
+		off = (off + mountSlotBytes) & offmask
+	}
+}
+
+//go:linkname memhash runtime.memhash
+func memhash(p unsafe.Pointer, seed, s uintptr) uintptr // no body: linked to runtime.memhash
+
+//go:linkname rand32 runtime.fastrand
+func rand32() uint32 // no body: linked to runtime.fastrand
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
new file mode 100644
index 000000000..f223aeda8
--- /dev/null
+++ b/pkg/sentry/vfs/options.go
@@ -0,0 +1,235 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+)
+
+// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and
+// FilesystemImpl.GetDentryAt().
+type GetDentryOptions struct {
+	// If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that
+	// the returned Dentry is a directory for which creds has search
+	// permission.
+	CheckSearchable bool
+}
+
+// MkdirOptions contains options to VirtualFilesystem.MkdirAt() and
+// FilesystemImpl.MkdirAt().
+type MkdirOptions struct {
+	// Mode is the file mode bits for the created directory.
+	Mode linux.FileMode
+
+	// If ForSyntheticMountpoint is true, FilesystemImpl.MkdirAt() may create
+	// the given directory in memory only (as opposed to persistent storage).
+	// The created directory should be able to support the creation of
+	// subdirectories with ForSyntheticMountpoint == true. It does not need to
+	// support the creation of subdirectories with ForSyntheticMountpoint ==
+	// false, or files of other types.
+	//
+	// FilesystemImpls are permitted to ignore the ForSyntheticMountpoint
+	// option.
+	//
+	// The ForSyntheticMountpoint option exists because, unlike mount(2), the
+	// OCI Runtime Specification permits the specification of mount points that
+	// do not exist, under the expectation that container runtimes will create
+	// them. (More accurately, the OCI Runtime Specification completely fails
+	// to document this feature, but it's implemented by runc.)
+	// ForSyntheticMountpoint allows such mount points to be created even when
+	// the underlying persistent filesystem is immutable.
+	ForSyntheticMountpoint bool
+}
+
+// MknodOptions contains options to VirtualFilesystem.MknodAt() and
+// FilesystemImpl.MknodAt().
+type MknodOptions struct {
+	// Mode is the file type and mode bits for the created file.
+	Mode linux.FileMode
+
+	// If Mode specifies a character or block device special file, DevMajor and
+	// DevMinor are the major and minor device numbers for the created device.
+	DevMajor uint32
+	DevMinor uint32
+
+	// Endpoint is the endpoint to bind to the created file, if a socket file is
+	// being created for bind(2) on a Unix domain socket.
+	Endpoint transport.BoundEndpoint
+}
+
+// MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC.
+// MS_RDONLY is not part of MountFlags because it's tracked in Mount.writers.
+type MountFlags struct {
+	// NoExec is equivalent to MS_NOEXEC.
+	NoExec bool
+
+	// NoATime is equivalent to MS_NOATIME and indicates that the
+	// filesystem should not update access time in-place.
+	NoATime bool
+}
+
+// MountOptions contains options to VirtualFilesystem.MountAt().
+type MountOptions struct {
+	// Flags contains flags as specified for mount(2), e.g. MS_NOEXEC.
+	Flags MountFlags
+
+	// ReadOnly is equivalent to MS_RDONLY.
+	ReadOnly bool
+
+	// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
+	GetFilesystemOptions GetFilesystemOptions
+
+	// If InternalMount is true, allow the use of filesystem types for which
+	// RegisterFilesystemTypeOptions.AllowUserMount == false.
+	InternalMount bool
+}
+
+// OpenOptions contains options to VirtualFilesystem.OpenAt() and
+// FilesystemImpl.OpenAt().
+type OpenOptions struct {
+	// Flags contains access mode and flags as specified for open(2).
+	//
+	// FilesystemImpls are responsible for implementing the following flags:
+	// O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC,
+	// O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and
+	// O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and
+	// O_NOFOLLOW. VFS users are responsible for handling O_CLOEXEC, since file
+	// descriptors are mostly outside the scope of VFS.
+	Flags uint32
+
+	// If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the
+	// created file.
+	Mode linux.FileMode
+
+	// FileExec is set when the file is being opened to be executed.
+	// VirtualFilesystem.OpenAt() checks that the caller has execute permissions
+	// on the file, that the file is a regular file, and that the mount doesn't
+	// have MS_NOEXEC set.
+	FileExec bool
+}
+
+// ReadOptions contains options to FileDescription.PRead(),
+// FileDescriptionImpl.PRead(), FileDescription.Read(), and
+// FileDescriptionImpl.Read().
+type ReadOptions struct {
+	// Flags contains flags as specified for preadv2(2).
+	Flags uint32
+}
+
+// RenameOptions contains options to VirtualFilesystem.RenameAt() and
+// FilesystemImpl.RenameAt().
+type RenameOptions struct {
+	// Flags contains flags as specified for renameat2(2).
+	Flags uint32
+
+	// If MustBeDir is true, the renamed file must be a directory.
+	MustBeDir bool
+}
+
+// SetStatOptions contains options to VirtualFilesystem.SetStatAt(),
+// FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and
+// FileDescriptionImpl.SetStat().
+type SetStatOptions struct {
+	// Stat is the metadata that should be set. Only fields indicated by
+	// Stat.Mask should be set.
+	//
+	// If Stat specifies that a timestamp should be set,
+	// FilesystemImpl.SetStatAt() and FileDescriptionImpl.SetStat() must
+	// special-case StatxTimestamp.Nsec == UTIME_NOW as described by
+	// utimensat(2); however, they do not need to check for StatxTimestamp.Nsec
+	// == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask
+	// instead).
+	Stat linux.Statx
+}
+
+// BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt()
+// and FilesystemImpl.BoundEndpointAt().
+type BoundEndpointOptions struct {
+	// Addr is the path of the file whose socket endpoint is being retrieved.
+	// It is generally irrelevant: most endpoints are stored at a dentry that
+	// was created through a bind syscall, so the path can be stored on creation.
+	// However, if the endpoint was created in FilesystemImpl.BoundEndpointAt(),
+	// then we may not know what the original bind address was.
+	//
+	// For example, if connect(2) is called with address "foo" which corresponds
+	// to a remote named socket in goferfs, we need to generate an endpoint
+	// wrapping that file. In this case, we can use Addr to set the endpoint
+	// address to "foo". Note that Addr is only a best-effort attempt--we still
+	// do not know the exact address that was used on the remote fs to bind the
+	// socket (it may have been "foo", "./foo", etc.).
+	Addr string
+}
+
+// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(),
+// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and
+// FileDescriptionImpl.Getxattr().
+type GetxattrOptions struct {
+	// Name is the name of the extended attribute to retrieve.
+	Name string
+
+	// Size is the maximum value size that the caller will tolerate. If the value
+	// is larger than Size, getxattr methods may return ERANGE, but they are also
+	// free to ignore the hint entirely (i.e. the value returned may be larger
+	// than Size). All size checking is done independently at the syscall layer.
+	Size uint64
+}
+
+// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
+// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
+// FileDescriptionImpl.Setxattr().
+type SetxattrOptions struct {
+	// Name is the name of the extended attribute being mutated.
+	Name string
+
+	// Value is the extended attribute's new value.
+	Value string
+
+	// Flags contains flags as specified for setxattr/lsetxattr/fsetxattr(2).
+	Flags uint32
+}
+
+// StatOptions contains options to VirtualFilesystem.StatAt(),
+// FilesystemImpl.StatAt(), FileDescription.Stat(), and
+// FileDescriptionImpl.Stat().
+type StatOptions struct {
+	// Mask is the set of fields in the returned Statx that the FilesystemImpl
+	// or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask.
+	//
+	// The FilesystemImpl or FileDescriptionImpl may return fields not
+	// requested in Mask, and may fail to return fields requested in Mask that
+	// are not supported by the underlying filesystem implementation, without
+	// returning an error.
+	Mask uint32
+
+	// Sync specifies the synchronization required, and is one of
+	// linux.AT_STATX_SYNC_AS_STAT (which is 0, and therefore the default),
+	// linux.AT_STATX_SYNC_FORCE_SYNC, or linux.AT_STATX_SYNC_DONT_SYNC.
+	Sync uint32
+}
+
+// UmountOptions contains options to VirtualFilesystem.UmountAt().
+type UmountOptions struct {
+	// Flags contains flags as specified for umount2(2).
+	Flags uint32
+}
+
+// WriteOptions contains options to FileDescription.PWrite(),
+// FileDescriptionImpl.PWrite(), FileDescription.Write(), and
+// FileDescriptionImpl.Write().
+type WriteOptions struct {
+	// Flags contains flags as specified for pwritev2(2).
+	Flags uint32
+}
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
new file mode 100644
index 000000000..cd78d66bc
--- /dev/null
+++ b/pkg/sentry/vfs/pathname.go
@@ -0,0 +1,195 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+var fspathBuilderPool = sync.Pool{
+	New: func() interface{} {
+		return &fspath.Builder{} // zero-value Builder, the same state putFSPathBuilder resets to
+	},
+}
+
+func getFSPathBuilder() *fspath.Builder {
+	return fspathBuilderPool.Get().(*fspath.Builder) // New and putFSPathBuilder only supply *fspath.Builder
+}
+
+func putFSPathBuilder(b *fspath.Builder) {
+	// No methods can be called on b after b.String(), so before pooling b,
+	// reset it to its zero value (as returned by fspathBuilderPool.New).
+	*b = fspath.Builder{}
+	fspathBuilderPool.Put(b)
+}
+
+// PathnameWithDeleted returns an absolute pathname to vd, consistent with
+// Linux's d_path(). In particular, if vd.Dentry() has been disowned,
+// PathnameWithDeleted appends " (deleted)" to the returned pathname.
+func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+	pb := getFSPathBuilder()
+	defer putFSPathBuilder(pb)
+	// The initial vd belongs to the caller. ownsVD is set once vd has been
+	// replaced by a mount point that must be released with DecRef.
+	ownsVD := false
+	defer func() {
+		if ownsVD {
+			vd.DecRef()
+		}
+	}()
+
+	start := vd.dentry
+walk:
+	for {
+		err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, pb)
+		switch err.(type) {
+		case nil:
+			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+				// genericfstree.PrependPath() reports this case as
+				// PrependPathAtVFSRootError (it checks vfsroot before
+				// mnt.root), but other implementations may return nil.
+				break walk
+			}
+			parentVD := vfs.getMountpointAt(vd.mount, vfsroot)
+			if !parentVD.Ok() {
+				break walk
+			}
+			if ownsVD {
+				vd.DecRef()
+			}
+			// Continue the walk from what getMountpointAt returned.
+			vd, ownsVD = parentVD, true
+		case PrependPathSyntheticError:
+			// Skip prepending "/" and appending " (deleted)".
+			return pb.String(), nil
+		case PrependPathAtVFSRootError, PrependPathAtNonMountRootError:
+			break walk
+		default:
+			return "", err
+		}
+	}
+	pb.PrependByte('/')
+	if start.IsDead() {
+		pb.AppendString(" (deleted)")
+	}
+	return pb.String(), nil
+}
+
+// PathnameReachable returns an absolute pathname to vd, consistent with
+// Linux's __d_path() (as used by seq_path_root()). If vfsroot.Ok() and vd is
+// not reachable from vfsroot, such that seq_path_root() would return SEQ_SKIP
+// (causing the entire containing entry to be skipped), PathnameReachable
+// returns ("", nil).
+func (vfs *VirtualFilesystem) PathnameReachable(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+	pb := getFSPathBuilder()
+	defer putFSPathBuilder(pb)
+	ownsVD := false
+	defer func() {
+		if ownsVD {
+			vd.DecRef()
+		}
+	}()
+walk:
+	for {
+		err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, pb)
+		switch err.(type) {
+		case nil:
+			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+				break walk
+			}
+			parentVD := vfs.getMountpointAt(vd.mount, vfsroot)
+			if !parentVD.Ok() {
+				return "", nil
+			}
+			if ownsVD {
+				vd.DecRef()
+			}
+			vd, ownsVD = parentVD, true
+		case PrependPathAtVFSRootError:
+			break walk
+		case PrependPathAtNonMountRootError, PrependPathSyntheticError:
+			// Reported the same way as a vd with no mount point to walk up to.
+			return "", nil
+		default:
+			return "", err
+		}
+	}
+	pb.PrependByte('/')
+	return pb.String(), nil
+}
+
+// PathnameForGetcwd returns an absolute pathname to vd, consistent with
+// Linux's sys_getcwd().
+func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+	if vd.dentry.IsDead() {
+		return "", syserror.ENOENT
+	}
+
+	pb := getFSPathBuilder()
+	defer putFSPathBuilder(pb)
+	ownsVD := false
+	defer func() {
+		if ownsVD {
+			vd.DecRef()
+		}
+	}()
+	notReachable := false
+walk:
+	for {
+		err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, pb)
+		switch err.(type) {
+		case nil:
+			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+				break walk
+			}
+			parentVD := vfs.getMountpointAt(vd.mount, vfsroot)
+			if !parentVD.Ok() {
+				notReachable = true
+				break walk
+			}
+			if ownsVD {
+				vd.DecRef()
+			}
+			vd, ownsVD = parentVD, true
+		case PrependPathAtVFSRootError:
+			break walk
+		case PrependPathAtNonMountRootError, PrependPathSyntheticError:
+			notReachable = true
+			break walk
+		default:
+			return "", err
+		}
+	}
+	pb.PrependByte('/')
+	if notReachable {
+		// The marker goes in front of the leading "/" prepended above.
+		pb.PrependString("(unreachable)")
+	}
+	return pb.String(), nil
+}
+
+// As of this writing, we do not have equivalents to:
+//
+// - d_absolute_path(), which returns EINVAL if (effectively) any call to
+// FilesystemImpl.PrependPath() would return PrependPathAtNonMountRootError.
+//
+// - dentry_path(), which does not walk up mounts (and only returns the path
+// relative to Filesystem root), but also appends "//deleted" for disowned
+// Dentries.
+//
+// These should be added as necessary.
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
new file mode 100644
index 000000000..9cb050597
--- /dev/null
+++ b/pkg/sentry/vfs/permissions.go
@@ -0,0 +1,280 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"math"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// AccessTypes is a set of Unix file access kinds, stored as a bitmask.
+type AccessTypes uint16
+
+// Individual AccessTypes bits, numbered like the x/w/r bits of a Unix mode.
+const (
+	MayExec  AccessTypes = 1 << iota // 1
+	MayWrite                         // 2
+	MayRead                          // 4
+)
+
+// OnlyRead returns true if a consists of read access and nothing else.
+func (a AccessTypes) OnlyRead() bool {
+	return a == MayRead
+}
+
+// MayRead reports whether the read bit is present in a.
+func (a AccessTypes) MayRead() bool {
+	return a&MayRead == MayRead
+}
+
+// MayWrite reports whether the write bit is present in a.
+func (a AccessTypes) MayWrite() bool {
+	return a&MayWrite == MayWrite
+}
+
+// MayExec reports whether the exec bit is present in a.
+func (a AccessTypes) MayExec() bool {
+	return a&MayExec == MayExec
+}
+
+// GenericCheckPermissions checks that creds may access, in every way named by
+// ats, a file with the given mode, owning kuid and owning kgid. It follows
+// fs/namei.c:generic_permission() and returns nil or EACCES.
+func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+	// Shift the permission bits so that the class applying to creds (owner,
+	// group, or other) lines up with the bits used by AccessTypes.
+	wanted, granted := uint16(ats), uint16(mode.Permissions())
+	switch {
+	case creds.EffectiveKUID == kuid:
+		granted >>= 6
+	case creds.InGroup(kgid):
+		granted >>= 3
+	}
+	if wanted&^granted == 0 {
+		// Every requested bit is present in the mode; access granted.
+		return nil
+	}
+
+	// Only capabilities can help from here on, and those require that the
+	// file's KUID and KGID are mapped in the caller's user namespace;
+	// compare kernel/capability.c:privileged_wrt_inode_uidgid().
+	ns := creds.UserNamespace
+	if !(kuid.In(ns).Ok() && kgid.In(ns).Ok()) {
+		return syserror.EACCES
+	}
+	isDir := mode.IsDir()
+	// CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary
+	// directories, and to read arbitrary non-directory files.
+	if ((isDir && !ats.MayWrite()) || ats.OnlyRead()) && creds.HasCapability(linux.CAP_DAC_READ_SEARCH) {
+		return nil
+	}
+	// CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write
+	// access to non-directory files, and execute access to non-directory
+	// files for which at least one execute bit is set.
+	if (isDir || !ats.MayExec() || mode.Permissions()&0111 != 0) && creds.HasCapability(linux.CAP_DAC_OVERRIDE) {
+		return nil
+	}
+	return syserror.EACCES
+}
+
+// MayLink checks whether creds may create a hard link to a file with the
+// given mode, kuid, and kgid. It returns nil if the link is permitted and
+// EPERM otherwise.
+//
+// This corresponds to Linux's fs/namei.c:may_linkat.
+func MayLink(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+	// Anyone able to act as the source's owner can hardlink all they like.
+	if CanActAsOwner(creds, kuid) {
+		return nil
+	}
+
+	// Everyone else is restricted to "safe" sources, as established by the
+	// checks below. Executable setgid files should not get pinned to the
+	// filesystem either, but S_IXGRP is not supported here, so there is no
+	// check for them.
+	switch {
+	case mode.FileType() != linux.S_IFREG:
+		// Only regular files can be hard linked.
+		return syserror.EPERM
+	case mode&linux.S_ISUID != 0:
+		// Setuid files should not get pinned to the filesystem.
+		return syserror.EPERM
+	case GenericCheckPermissions(creds, MayRead|MayWrite, mode, kuid, kgid) != nil:
+		// Hardlinking to unreadable or unwritable sources is dangerous.
+		return syserror.EPERM
+	}
+
+	// None of the restrictions applied.
+	return nil
+}
+
+// AccessTypesForOpenFlags returns the access types required to open a file
+// with the given OpenOptions.Flags. Note that this is NOT the same thing as
+// the set of accesses permitted for the opened file:
+//
+// - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it
+// mutates the file), but does not permit writing to the open file description
+// thereafter.
+//
+// - "Linux reserves the special, nonstandard access mode 3 (binary 11) in
+// flags to mean: check for read and write permission on the file and return a
+// file descriptor that can't be used for reading or writing." - open(2). Thus
+// AccessTypesForOpenFlags returns MayRead|MayWrite in this case.
+//
+// Use May{Read,Write}FileWithOpenFlags() for these checks instead.
+func AccessTypesForOpenFlags(opts *OpenOptions) AccessTypes {
+	ats := AccessTypes(0)
+	if opts.FileExec {
+		ats |= MayExec
+	}
+
+	switch opts.Flags & linux.O_ACCMODE {
+	case linux.O_RDONLY:
+		if opts.Flags&linux.O_TRUNC != 0 {
+			return ats | MayRead | MayWrite
+		}
+		return ats | MayRead
+	case linux.O_WRONLY:
+		return ats | MayWrite
+	default:
+		return ats | MayRead | MayWrite
+	}
+}
+
+// MayReadFileWithOpenFlags returns true if a file opened with the given open
+// flags should be readable, i.e. if the access mode selected by flags is
+// O_RDONLY or O_RDWR. Flag bits outside the access mode do not affect the
+// result.
+func MayReadFileWithOpenFlags(flags uint32) bool {
+	// Only the access mode bits are relevant; any access mode other than
+	// the two below does not permit reads.
+	accMode := flags & linux.O_ACCMODE
+	return accMode == linux.O_RDONLY || accMode == linux.O_RDWR
+}
+
+// MayWriteFileWithOpenFlags returns true if a file opened with the given open
+// flags should be writable, i.e. if the access mode selected by flags is
+// O_WRONLY or O_RDWR. Flag bits outside the access mode do not affect the
+// result.
+func MayWriteFileWithOpenFlags(flags uint32) bool {
+	// Only the access mode bits are relevant; any access mode other than
+	// the two below does not permit writes.
+	accMode := flags & linux.O_ACCMODE
+	return accMode == linux.O_WRONLY || accMode == linux.O_RDWR
+}
+
+// CheckSetStat checks that creds has permission to change the metadata of a
+// file with the given mode, owning kuid, and owning kgid in the way described
+// by stat, subject to the rules of Linux's fs/attr.c:setattr_prepare().
+func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+	mask := stat.Mask
+	if mask&linux.STATX_SIZE != 0 {
+		newSize := int64(stat.Size)
+		limit, err := CheckLimit(ctx, 0, newSize)
+		if err != nil {
+			return err
+		}
+		if limit < newSize {
+			return syserror.ErrExceedsFileSizeLimit
+		}
+	}
+	if mask&linux.STATX_MODE != 0 && !CanActAsOwner(creds, kuid) {
+		return syserror.EPERM
+	}
+	// TODO(b/30815691): "If the calling process is not privileged (Linux:
+	// does not have the CAP_FSETID capability), and the group of the file
+	// does not match the effective group ID of the process or one of its
+	// supplementary group IDs, the S_ISGID bit will be turned off, but
+	// this will not cause an error to be returned." - chmod(2)
+	if mask&linux.STATX_UID != 0 {
+		// Without CAP_CHOWN, the owner may only "set" the UID it already has.
+		unchanged := creds.EffectiveKUID == kuid && auth.KUID(stat.UID) == kuid
+		if !unchanged && !HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid) {
+			return syserror.EPERM
+		}
+	}
+	if mask&linux.STATX_GID != 0 {
+		// Without CAP_CHOWN, the owner may only pick a group it belongs to.
+		ownGroup := creds.EffectiveKUID == kuid && creds.InGroup(auth.KGID(stat.GID))
+		if !ownGroup && !HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid) {
+			return syserror.EPERM
+		}
+	}
+	if mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) == 0 || CanActAsOwner(creds, kuid) {
+		return nil
+	}
+	// Callers that cannot act as owner may only set timestamps to the
+	// current time (UTIME_NOW), and need write permission to do even that.
+	if (mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) ||
+		(mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW) ||
+		(mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) {
+		return syserror.EPERM
+	}
+	return GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid)
+}
+
+// CheckDeleteSticky checks whether creds may remove a file owned by childKUID
+// from a directory with mode parentMode, as far as the sticky bit is
+// concerned: if the sticky bit is not set on the directory, or if creds can
+// act as the owner of the file, it returns nil; otherwise it returns EPERM.
+// CheckDeleteSticky is modeled on Linux's check_sticky() (include/linux/fs.h).
+func CheckDeleteSticky(creds *auth.Credentials, parentMode linux.FileMode, childKUID auth.KUID) error {
+	// NOTE(review): Linux's __check_sticky() also appears to let the owner of
+	// the directory delete; that needs the parent's UID, absent here - confirm.
+	if parentMode&linux.ModeSticky == 0 || CanActAsOwner(creds, childKUID) {
+		return nil
+	}
+	return syserror.EPERM
+}
+
+// CanActAsOwner reports whether creds can act as the owner of a file whose
+// owning UID is kuid, consistent with Linux's
+// fs/inode.c:inode_owner_or_capable().
+func CanActAsOwner(creds *auth.Credentials, kuid auth.KUID) bool {
+	// Either creds is the owner, or it has CAP_FOWNER and kuid is mapped in
+	// its user namespace.
+	return creds.EffectiveKUID == kuid ||
+		(creds.HasCapability(linux.CAP_FOWNER) && creds.UserNamespace.MapFromKUID(kuid).Ok())
+}
+
+// HasCapabilityOnFile returns true if creds has capability cp with respect to
+// a file owned by kuid and kgid, i.e. holds cp and has both IDs mapped in its
+// user namespace; compare kernel/capability.c:capable_wrt_inode_uidgid().
+func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth.KUID, kgid auth.KGID) bool {
+	return creds.HasCapability(cp) && creds.UserNamespace.MapFromKUID(kuid).Ok() && creds.UserNamespace.MapFromKGID(kgid).Ok()
+}
+
+// CheckLimit enforces the file size rlimit for a write of size bytes at
+// offset. It returns an error if the write must not proceed at all; otherwise
+// it returns the maximum length that can be written without exceeding the limit.
+func CheckLimit(ctx context.Context, offset, size int64) (int64, error) {
+	cur := limits.FromContext(ctx).Get(limits.FileSize).Cur
+	if cur > math.MaxInt64 { // limit does not fit in an int64: the write is not restricted
+		return size, nil
+	}
+	limit := int64(cur)
+	if offset >= limit {
+		return 0, syserror.ErrExceedsFileSizeLimit
+	}
+	if room := limit - offset; room < size {
+		return room, nil
+	}
+	return size, nil
+}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
new file mode 100644
index 000000000..9d047ff88
--- /dev/null
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -0,0 +1,466 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// ResolvingPath represents the state of an in-progress path resolution, shared
+// between VFS and FilesystemImpl methods that take a path.
+//
+// From the perspective of FilesystemImpl methods, a ResolvingPath represents a
+// starting Dentry on the associated Filesystem (on which a reference is
+// already held), a stream of path components relative to that Dentry, and
+// elements of the invoking Context that are commonly required by
+// FilesystemImpl methods.
+//
+// ResolvingPath is loosely analogous to Linux's struct nameidata.
+type ResolvingPath struct {
+	vfs   *VirtualFilesystem // returned by VirtualFilesystem()
+	root  VirtualDentry      // refs borrowed from PathOperation
+	mount *Mount             // returned by Mount(); ref held iff rpflagsHaveMountRef is set
+	start *Dentry            // returned by Start(); ref held iff rpflagsHaveStartRef is set
+	pit   fspath.Iterator    // current path component; Done() once !pit.Ok()
+
+	flags         uint16 // bitmask of rpflags* constants
+	mustBeDir     bool   // final file must be a directory?
+	mustBeDirOrig bool   // value of mustBeDir restored by Restart()
+	symlinks      uint8  // number of symlinks traversed
+	symlinksOrig  uint8  // value of symlinks restored by Restart()
+	curPart       uint8  // index into parts
+	numOrigParts  uint8  // number of entries of origParts in use
+
+	creds *auth.Credentials // returned by Credentials()
+
+	// Data associated with resolve*Errors, stored in ResolvingPath so that
+	// those errors don't need to allocate.
+	nextMount        *Mount      // ref held if not nil
+	nextStart        *Dentry     // ref held if not nil
+	absSymlinkTarget fspath.Path // set by HandleSymlink, consumed by handleError
+
+	// ResolvingPath must track up to two relative paths: the "current"
+	// relative path, which is updated whenever a relative symlink is
+	// encountered, and the "original" relative path, which is updated from the
+	// current relative path by handleError() when resolution must change
+	// filesystems (due to reaching a mount boundary or absolute symlink) and
+	// overwrites the current relative path when Restart() is called.
+	parts     [1 + linux.MaxSymlinkTraversals]fspath.Iterator
+	origParts [1 + linux.MaxSymlinkTraversals]fspath.Iterator
+}
+
+const (
+	rpflagsHaveMountRef       = 1 << iota // set if rp holds a reference on mount
+	rpflagsHaveStartRef                   // set if rp holds a reference on start
+	rpflagsFollowFinalSymlink             // same as PathOperation.FollowFinalSymlink
+)
+
+func init() {
+	if maxParts := len(ResolvingPath{}.parts); maxParts > 255 { // curPart, a uint8, must be able to index all of parts
+		panic(fmt.Sprintf("uint8 is insufficient to accommodate len(ResolvingPath.parts) (%d)", maxParts))
+	}
+}
+
+// Error types that communicate state from the FilesystemImpl-caller,
+// VFS-callee side of path resolution (i.e. errors returned by
+// ResolvingPath.Resolve*()) to the VFS-caller, FilesystemImpl-callee side
+// (i.e. VFS methods => ResolvingPath.handleError()). These are empty structs
+// rather than error values because Go doesn't support non-primitive constants,
+// so error "constants" are really mutable vars, necessitating somewhat
+// expensive interface object comparisons.
+
+type resolveMountRootOrJumpError struct{} // returned by CheckRoot and HandleJump
+
+// Error implements the error interface.
+func (resolveMountRootOrJumpError) Error() string {
+	return "resolving mount root or jump"
+}
+
+type resolveMountPointError struct{} // returned by CheckMount
+
+// Error implements the error interface.
+func (resolveMountPointError) Error() string {
+	return "resolving mount point"
+}
+
+type resolveAbsSymlinkError struct{} // returned by HandleSymlink for absolute targets
+
+// Error implements the error interface.
+func (resolveAbsSymlinkError) Error() string {
+	return "resolving absolute symlink"
+}
+
+var resolvingPathPool = sync.Pool{ // recycles ResolvingPath objects between path operations
+	New: func() interface{} {
+		return new(ResolvingPath)
+	},
+}
+
+// getResolvingPath returns a ResolvingPath from resolvingPathPool, initialized to
+// resolve pop on behalf of creds. No reference flags are set on the result.
+func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) *ResolvingPath {
+	rp := resolvingPathPool.Get().(*ResolvingPath)
+	rp.vfs, rp.creds = vfs, creds
+	rp.root = pop.Root
+	rp.mount, rp.start = pop.Start.mount, pop.Start.dentry
+	rp.pit = pop.Path.Begin
+	rp.flags = 0
+	if pop.FollowFinalSymlink {
+		rp.flags |= rpflagsFollowFinalSymlink
+	}
+	rp.mustBeDir, rp.mustBeDirOrig = pop.Path.Dir, pop.Path.Dir
+	// Pooled objects keep their old field values, so symlinksOrig must be
+	// reset as well; otherwise Restart() before the first relpathCommit()
+	// would restore a stale symlink count from an earlier resolution.
+	rp.symlinks, rp.symlinksOrig = 0, 0
+	rp.curPart, rp.numOrigParts = 0, 1
+	rp.parts[0], rp.origParts[0] = pop.Path.Begin, pop.Path.Begin
+	return rp
+}
+
+// putResolvingPath drops every reference rp still holds and returns rp to resolvingPathPool.
+func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) {
+	rp.decRefStartAndMount()
+	rp.releaseErrorState()
+	rp.root = VirtualDentry{}
+	rp.mount, rp.start = nil, nil
+	resolvingPathPool.Put(rp)
+}
+
+func (rp *ResolvingPath) decRefStartAndMount() { // releases the refs on start/mount that rp.flags marks as held; flags are not cleared here
+	if rp.flags&rpflagsHaveStartRef != 0 {
+		rp.start.DecRef()
+	}
+	if rp.flags&rpflagsHaveMountRef != 0 {
+		rp.mount.DecRef()
+	}
+}
+
+func (rp *ResolvingPath) releaseErrorState() { // drops any refs still held via nextStart/nextMount and clears both fields
+	if rp.nextStart != nil {
+		rp.nextStart.DecRef()
+		rp.nextStart = nil
+	}
+	if rp.nextMount != nil {
+		rp.nextMount.DecRef()
+		rp.nextMount = nil
+	}
+}
+
+// VirtualFilesystem returns the VirtualFilesystem that rp was set up by.
+func (rp *ResolvingPath) VirtualFilesystem() *VirtualFilesystem {
+	return rp.vfs
+}
+
+// Credentials returns the credentials that were supplied when rp was set up.
+func (rp *ResolvingPath) Credentials() *auth.Credentials {
+	return rp.creds
+}
+
+// Mount returns the Mount on which path resolution is currently occurring. No
+// reference is taken on the returned Mount on behalf of the caller.
+func (rp *ResolvingPath) Mount() *Mount {
+	return rp.mount
+}
+
+// Start returns the starting Dentry represented by rp. No reference is taken
+// on the returned Dentry on behalf of the caller.
+func (rp *ResolvingPath) Start() *Dentry {
+	return rp.start
+}
+
+// Done returns true if there are no remaining path components in the stream
+// represented by rp.
+func (rp *ResolvingPath) Done() bool {
+	// There is no need to also check rp.curPart == 0: rp.Advance() only
+	// leaves rp.pit at a terminal iterator when rp.curPart is 0.
+	return !rp.pit.Ok()
+}
+
+// Final returns true if there is exactly one remaining path component in the
+// stream represented by rp.
+//
+// Preconditions: !rp.Done().
+func (rp *ResolvingPath) Final() bool {
+	return rp.curPart == 0 && !rp.pit.NextOk() // curPart != 0 means a saved segment in parts still follows
+}
+
+// Component returns the path component at the current position of the stream
+// represented by rp, without consuming it.
+//
+// Preconditions: !rp.Done().
+func (rp *ResolvingPath) Component() string {
+	// The precondition is only verified when invariant checking is
+	// enabled.
+	if checkInvariants && !rp.pit.Ok() {
+		panic("ResolvingPath.Component() called at end of relative path")
+	}
+	return rp.pit.String()
+}
+
+// Advance moves rp on to the next path component in its stream.
+//
+// Preconditions: !rp.Done().
+func (rp *ResolvingPath) Advance() {
+	if checkInvariants && !rp.pit.Ok() {
+		panic("ResolvingPath.Advance() called at end of relative path")
+	}
+	next := rp.pit.Next()
+	if !next.Ok() && rp.curPart != 0 {
+		// End of the current path segment: continue with the one saved in parts.
+		rp.curPart--
+		rp.pit = rp.parts[rp.curPart]
+		return
+	}
+	// Either there is a next component, or this is the end of the whole path.
+	rp.pit = next
+}
+
+// Restart rewinds rp's stream of path components to the state it was in on
+// entry to the current FilesystemImpl method, dropping any pending error state.
+func (rp *ResolvingPath) Restart() {
+	last := rp.numOrigParts - 1
+	rp.curPart = last
+	rp.pit = rp.origParts[last]
+	copy(rp.parts[:], rp.origParts[:rp.numOrigParts])
+	rp.mustBeDir, rp.symlinks = rp.mustBeDirOrig, rp.symlinksOrig
+	rp.releaseErrorState()
+}
+
+// relpathCommit records the current relative path as the state Restart() returns to.
+func (rp *ResolvingPath) relpathCommit() {
+	rp.mustBeDirOrig, rp.symlinksOrig = rp.mustBeDir, rp.symlinks
+	rp.numOrigParts = rp.curPart + 1
+	copy(rp.origParts[:rp.curPart], rp.parts[:])
+	rp.origParts[rp.curPart] = rp.pit
+}
+
+// CheckRoot is called before resolving the parent of the Dentry d. It returns:
+//
+// - (true, nil) if d is contextually a VFS root, such that path resolution
+// should treat d's parent as d itself;
+// - (unspecified, non-nil error) if d is the root of a non-root mount, such
+// that path resolution should switch to another Mount;
+// - (false, nil) otherwise: d's parent should be resolved normally.
+func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) {
+	if d == rp.root.dentry && rp.mount == rp.root.mount {
+		// At contextual VFS root (due to e.g. chroot(2)).
+		return true, nil
+	}
+	if d != rp.mount.root {
+		// Not at a mount root either: nothing special about d.
+		return false, nil
+	}
+	// At mount root ...
+	mountpoint := rp.vfs.getMountpointAt(rp.mount, rp.root)
+	if !mountpoint.Ok() {
+		// ... of root mount.
+		return true, nil
+	}
+	rp.nextMount, rp.nextStart = mountpoint.mount, mountpoint.dentry
+	return false, resolveMountRootOrJumpError{} // ... of non-root mount.
+}
+
+// CheckMount is called after resolving the parent or child of another Dentry
+// to d. If a Mount is found at d on rp's current Mount, such that path
+// resolution should switch to that Mount, CheckMount records it in rp and
+// returns a non-nil error. Otherwise, CheckMount returns nil.
+func (rp *ResolvingPath) CheckMount(d *Dentry) error {
+	if d.isMounted() {
+		// d may be a mount point; look for a Mount at d on rp.mount.
+		if mnt := rp.vfs.getMountAt(rp.mount, d); mnt != nil {
+			rp.nextMount = mnt
+			return resolveMountPointError{}
+		}
+	}
+	return nil
+}
+
+// ShouldFollowSymlink returns true if the current path component in rp,
+// supposing that it represents a symbolic link, should be followed.
+//
+// If path is terminated with '/', the '/' is considered the last element and
+// any symlink before that is followed:
+//   - For most non-creating walks, the last path component is handled by
+//     fs/namei.c:lookup_last(), which sets LOOKUP_FOLLOW if the first byte
+//     after the path component is non-NULL (which is only possible if it's '/')
+//     and the path component is of type LAST_NORM.
+//   - For open/openat/openat2 without O_CREAT, the last path component is
+//     handled by fs/namei.c:do_last(), which does the same, though without the
+//     LAST_NORM check.
+//
+// Preconditions: !rp.Done().
+func (rp *ResolvingPath) ShouldFollowSymlink() bool {
+	if rp.flags&rpflagsFollowFinalSymlink != 0 {
+		return true
+	}
+	// Non-final symlinks and paths terminated with '/' are always followed.
+	return !rp.Final() || rp.MustBeDir()
+}
+
+// HandleSymlink is called when the current path component is a symbolic link
+// to the given target. For a relative target, HandleSymlink replaces the
+// symlink's component in the stream with the target's components and returns
+// nil, and the calling Filesystem method should continue path traversal. In
+// all other cases it returns a non-nil error.
+//
+// Preconditions: !rp.Done().
+//
+// Postconditions: If HandleSymlink returns a nil error, then !rp.Done().
+func (rp *ResolvingPath) HandleSymlink(target string) error {
+	switch {
+	case rp.symlinks >= linux.MaxSymlinkTraversals:
+		return syserror.ELOOP
+	case target == "":
+		return syserror.ENOENT
+	}
+	rp.symlinks++
+	parsed := fspath.Parse(target)
+	if parsed.Absolute {
+		// Absolute targets are resolved from rp.root; handleError() does the switch.
+		rp.absSymlinkTarget = parsed
+		return resolveAbsSymlinkError{}
+	}
+	// Consume the path component that represented the symlink.
+	rp.Advance()
+	if checkInvariants && !parsed.HasComponents() {
+		panic(fmt.Sprintf("non-empty pathname %q parsed to relative path with no components", target))
+	}
+	// Prepend the symlink target to the relative path.
+	rp.relpathPrepend(parsed)
+	return nil
+}
+
+// relpathPrepend makes path the front of the remaining relative path.
+//
+// Preconditions: path.HasComponents().
+func (rp *ResolvingPath) relpathPrepend(path fspath.Path) {
+	rest := rp.pit
+	rp.pit = path.Begin
+	if rest.Ok() {
+		// Save what is left of the current segment; Advance() returns to it.
+		rp.parts[rp.curPart] = rest
+		rp.curPart++
+		return
+	}
+	// path replaced the final component, so it can set (never unset) mustBeDir.
+	if path.Dir {
+		rp.mustBeDir = true
+	}
+}
+
+// HandleJump is called when the current path component is a "magic" link to
+// the given VirtualDentry, like /proc/[pid]/fd/[fd]. It never returns nil: it
+// returns ELOOP if the symlink traversal limit has been reached; otherwise it
+// consumes the link's path component, takes a reference on target, and
+// returns an error that makes VFS restart path resolution at target.
+//
+// Preconditions: !rp.Done().
+func (rp *ResolvingPath) HandleJump(target VirtualDentry) error {
+	if rp.symlinks >= linux.MaxSymlinkTraversals {
+		return syserror.ELOOP
+	}
+	rp.symlinks++
+	// Consume the path component that represented the magic link.
+	rp.Advance()
+	// The Mount may well stay the same, but resolution still has to restart
+	// at the new Dentry, so a resolveMountRootOrJumpError is returned
+	// unconditionally.
+	target.IncRef()
+	rp.nextMount, rp.nextStart = target.mount, target.dentry
+	return resolveMountRootOrJumpError{}
+}
+
+func (rp *ResolvingPath) handleError(err error) bool { // returns true if err was handled and resolution should restart
+	switch err.(type) {
+	case resolveMountRootOrJumpError:
+		// Switch to the new Mount. We hold references on the Mount and Dentry.
+		rp.decRefStartAndMount()
+		rp.mount = rp.nextMount
+		rp.start = rp.nextStart
+		rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef
+		rp.nextMount = nil
+		rp.nextStart = nil
+		// Commit the previous FilesystemImpl's progress through the relative
+		// path. (Don't consume the path component that caused us to traverse
+		// through the mount root - i.e. the ".." - because we still need to
+		// resolve the mount point's parent in the new FilesystemImpl.)
+		rp.relpathCommit()
+		// Restart path resolution on the new Mount. Don't bother calling
+		// rp.releaseErrorState() since we already set nextMount and nextStart
+		// to nil above.
+		return true
+
+	case resolveMountPointError:
+		// Switch to the new Mount. We hold a reference on the Mount, but
+		// borrow the reference on the mount root from the Mount.
+		rp.decRefStartAndMount()
+		rp.mount = rp.nextMount
+		rp.start = rp.nextMount.root
+		rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef
+		rp.nextMount = nil
+		// Consume the path component that represented the mount point.
+		rp.Advance()
+		// Commit the previous FilesystemImpl's progress through the relative
+		// path.
+		rp.relpathCommit()
+		// Restart path resolution on the new Mount.
+		rp.releaseErrorState()
+		return true
+
+	case resolveAbsSymlinkError:
+		// Switch to the new Mount. References are borrowed from rp.root.
+		rp.decRefStartAndMount()
+		rp.mount = rp.root.mount
+		rp.start = rp.root.dentry
+		rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef
+		// Consume the path component that represented the symlink.
+		rp.Advance()
+		// Prepend the symlink target to the relative path.
+		rp.relpathPrepend(rp.absSymlinkTarget)
+		// Commit the previous FilesystemImpl's progress through the relative
+		// path, including the symlink target we just prepended.
+		rp.relpathCommit()
+		// Restart path resolution on the new Mount.
+		rp.releaseErrorState()
+		return true
+
+	default:
+		// Not an error we can handle.
+		return false
+	}
+}
+
+// canHandleError reports whether err is one of the resolve*Error types, i.e.
+// an error returned by rp.Resolve*() that rp.handleError() may attempt to
+// handle.
+func (rp *ResolvingPath) canHandleError(err error) bool {
+	switch err.(type) {
+	case resolveMountRootOrJumpError, resolveMountPointError, resolveAbsSymlinkError:
+		return true
+	}
+	return false
+}
+
+// MustBeDir returns true if the file traversed by rp must be a directory (e.g. the path ended in '/').
+func (rp *ResolvingPath) MustBeDir() bool {
+	return rp.mustBeDir
+}
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
new file mode 100644
index 000000000..522e27475
--- /dev/null
+++ b/pkg/sentry/vfs/vfs.go
@@ -0,0 +1,849 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package vfs implements a virtual filesystem layer.
+//
+// Lock order:
+//
+// EpollInstance.interestMu
+//   FileDescription.epollMu
+//     FilesystemImpl/FileDescriptionImpl locks
+//       VirtualFilesystem.mountMu
+//         Dentry.mu
+//           Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
+//         VirtualFilesystem.filesystemsMu
+//       EpollInstance.mu
+//         Inotify.mu
+//           Watches.mu
+//             Inotify.evMu
+// VirtualFilesystem.fsTypesMu
+//
+// Locking Dentry.mu in multiple Dentries requires holding
+// VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple
+// EpollInstances requires holding epollCycleMu.
+package vfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts.
+//
+// There is no analogue to the VirtualFilesystem type in Linux, as the
+// equivalent state in Linux is global.
+//
+// +stateify savable
+type VirtualFilesystem struct {
+	// mountMu serializes mount mutations.
+	//
+	// mountMu is analogous to Linux's namespace_sem.
+	mountMu sync.Mutex `state:"nosave"`
+
+	// mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
+	// are uniquely namespaced, including mount parent in the key correctly
+	// handles both bind mounts and mount namespaces; Linux does the same.)
+	// Synchronization between mutators and readers is provided by mounts.seq;
+	// synchronization between mutators is provided by mountMu.
+	//
+	// mounts is used to follow mount points during path traversal. We use a
+	// single table rather than per-Dentry tables to reduce size (and therefore
+	// cache footprint) for the vast majority of Dentries that are not mount
+	// points.
+	//
+	// mounts is analogous to Linux's mount_hashtable.
+	mounts mountTable
+
+	// mountpoints maps each mount point Dentry to the set of Mounts at that
+	// point in all namespaces. mountpoints is protected by mountMu.
+	//
+	// mountpoints is used to find mounts that must be unmounted due to
+	// removal of a mount point Dentry from another mount namespace. ("A file
+	// or directory that is a mount point in one namespace that is not a mount
+	// point in another namespace, may be renamed, unlinked, or removed
+	// (rmdir(2)) in the mount namespace in which it is not a mount point
+	// (subject to the usual permission checks)." - mount_namespaces(7))
+	//
+	// mountpoints is analogous to Linux's mountpoint_hashtable.
+	mountpoints map[*Dentry]map[*Mount]struct{}
+
+	// lastMountID is the last allocated mount ID. lastMountID is accessed
+	// using atomic memory operations.
+	lastMountID uint64
+
+	// anonMount is a Mount, not included in mounts or mountpoints,
+	// representing an anonFilesystem. anonMount is used to back
+	// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
+	// anonMount is immutable.
+	//
+	// anonMount is analogous to Linux's anon_inode_mnt.
+	anonMount *Mount
+
+	// devices contains all registered Devices. devices is protected by
+	// devicesMu.
+	devicesMu sync.RWMutex `state:"nosave"`
+	devices   map[devTuple]*registeredDevice
+
+	// anonBlockDevMinor contains all allocated anonymous block device minor
+	// numbers. anonBlockDevMinorNext is a lower bound for the smallest
+	// unallocated anonymous block device minor number. Both fields are
+	// protected by anonBlockDevMinorMu.
+	anonBlockDevMinorMu   sync.Mutex `state:"nosave"`
+	anonBlockDevMinorNext uint32
+	anonBlockDevMinor     map[uint32]struct{}
+
+	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
+	// fsTypesMu.
+	fsTypesMu sync.RWMutex `state:"nosave"`
+	fsTypes   map[string]*registeredFilesystemType
+
+	// filesystems is the set of all Filesystems. filesystems is protected by
+	// filesystemsMu.
+	filesystemsMu sync.Mutex `state:"nosave"`
+	filesystems   map[*Filesystem]struct{}
+}
+
+// Init prepares vfs for use. The resulting VirtualFilesystem has no mounts
+// or FilesystemTypes. Init panics if vfs has already been initialized.
+func (vfs *VirtualFilesystem) Init() error {
+	// A non-nil mountpoints map means an earlier Init call already ran.
+	if vfs.mountpoints != nil {
+		panic("VFS already initialized")
+	}
+	vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
+	vfs.filesystems = make(map[*Filesystem]struct{})
+	vfs.fsTypes = make(map[string]*registeredFilesystemType)
+	vfs.devices = make(map[devTuple]*registeredDevice)
+	vfs.anonBlockDevMinor = make(map[uint32]struct{})
+	vfs.anonBlockDevMinorNext = 1
+	vfs.mounts.Init()
+
+	// Build the anonFilesystem and the Mount stored in vfs.anonMount.
+	minor, err := vfs.GetAnonBlockDevMinor()
+	if err != nil {
+		// No device numbers have been allocated yet (the allocator state
+		// was initialized just above), so this should be impossible.
+		panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err))
+	}
+	anonfs := anonFilesystem{devMinor: minor}
+	anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs)
+	defer anonfs.vfsfs.DecRef()
+	mnt, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{})
+	if err != nil {
+		// The empty MountOptions passed above should never cause
+		// construction of this mount to fail.
+		panic(fmt.Sprintf("VirtualFilesystem.Init: anonfs mount failed: %v", err))
+	}
+	vfs.anonMount = mnt
+
+	return nil
+}
+
+// PathOperation specifies the path operated on by a VFS method.
+//
+// PathOperation is passed to VFS methods by pointer to reduce memory copying:
+// it's somewhat large and should never escape. (Options structs are passed by
+// pointer to VFS and FileDescription methods for the same reason.)
+type PathOperation struct {
+	// Root is the VFS root. References on Root are borrowed from the provider
+	// of the PathOperation.
+	//
+	// Invariants: Root.Ok().
+	Root VirtualDentry
+
+	// Start is the starting point for the path traversal. References on Start
+	// are borrowed from the provider of the PathOperation (i.e. the caller of
+	// the VFS method to which the PathOperation was passed).
+	//
+	// Invariants: Start.Ok(). If Path.Absolute, then Start == Root.
+	Start VirtualDentry
+
+	// Path is the pathname traversed by this operation.
+	Path fspath.Path
+
+	// If FollowFinalSymlink is true and the final path component is a
+	// symbolic link, the link should be followed. VirtualFilesystem.OpenAt
+	// sets this field to false on the passed PathOperation given O_NOFOLLOW.
+	FollowFinalSymlink bool
+}
+
+// AccessAt checks whether a user with creds has access to the file at the
+// given path, for the access types in ats. Path resolution may cross Mounts,
+// so the underlying FilesystemImpl call is retried for as long as
+// rp.handleError is able to handle the error it returns.
+func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats)
+		// Return on success (err is nil) or on an error that cannot be
+		// handled; in both cases err is exactly what the caller should see.
+		if err == nil || !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// GetDentryAt resolves the given path, at which a file must exist, and returns
+// a VirtualDentry representing it. A reference is taken on the returned
+// VirtualDentry.
+func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
+		switch {
+		case err == nil:
+			// Only the Mount reference is taken here; d is returned exactly
+			// as the FilesystemImpl provided it.
+			mnt := rp.mount
+			mnt.IncRef()
+			vfs.putResolvingPath(rp)
+			return VirtualDentry{mount: mnt, dentry: d}, nil
+		case !rp.handleError(err):
+			vfs.putResolvingPath(rp)
+			return VirtualDentry{}, err
+		}
+	}
+}
+
+// getParentDirAndName returns a VirtualDentry for the Dentry produced by
+// FilesystemImpl.GetParentDentryAt for pop, with a Mount reference taken,
+// together with rp.Component() as it stands once that call succeeds.
+//
+// Preconditions: pop.Path.Begin.Ok().
+func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp)
+		if err == nil {
+			mnt := rp.mount
+			mnt.IncRef()
+			name := rp.Component()
+			vfs.putResolvingPath(rp)
+			return VirtualDentry{mount: mnt, dentry: parent}, name, nil
+		}
+		// A handleable error must leave at least one path component unconsumed.
+		if checkInvariants && rp.canHandleError(err) && rp.Done() {
+			panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return VirtualDentry{}, "", err
+		}
+	}
+}
+
+// LinkAt creates a hard link at newpop representing the existing file at
+// oldpop. newpop.Path.Begin must be Ok and newpop.FollowFinalSymlink must be
+// false; otherwise an error is returned without creating anything.
+func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error {
+	oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{})
+	if err != nil {
+		return err
+	}
+
+	// Reject unusable destinations. oldVD holds a reference from GetDentryAt
+	// that must be dropped on every return from here on.
+	switch {
+	case !newpop.Path.Begin.Ok():
+		oldVD.DecRef()
+		if newpop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
+	case newpop.FollowFinalSymlink:
+		oldVD.DecRef()
+		ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, newpop)
+	for {
+		linkErr := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD)
+		if linkErr != nil {
+			if checkInvariants && rp.canHandleError(linkErr) && rp.Done() {
+				panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, linkErr))
+			}
+			if rp.handleError(linkErr) {
+				continue
+			}
+		}
+		// Either success or an unhandleable error: clean up and report it.
+		vfs.putResolvingPath(rp)
+		oldVD.DecRef()
+		return linkErr
+	}
+}
+
+// MkdirAt creates a directory at the given path. Note that opts.Mode is
+// masked in place, so the caller's MkdirOptions may be modified.
+func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
+	switch {
+	case !pop.Path.Begin.Ok():
+		if pop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
+	case pop.FollowFinalSymlink:
+		ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
+	}
+	// "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
+	// also honored." - mkdir(2)
+	opts.Mode &= 0777 | linux.S_ISVTX
+
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		// A handleable error must leave at least one path component unconsumed.
+		if checkInvariants && rp.canHandleError(err) && rp.Done() {
+			panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// MknodAt creates a file of the given mode at the given path. It returns an
+// error from the syserror package.
+//
+// pop.Path.Begin must be Ok and pop.FollowFinalSymlink must be false.
+func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
+	switch {
+	case !pop.Path.Begin.Ok():
+		if pop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
+	case pop.FollowFinalSymlink:
+		ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
+		if err != nil {
+			if checkInvariants && rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+			}
+			if rp.handleError(err) {
+				continue
+			}
+		}
+		// Success, or an error that handleError could not resolve.
+		vfs.putResolvingPath(rp)
+		return err
+	}
+}
+
+// OpenAt returns a FileDescription providing access to the file at the given
+// path. A reference is taken on the returned FileDescription.
+//
+// OpenAt normalizes opts.Flags and opts.Mode in place, and clears
+// pop.FollowFinalSymlink if O_NOFOLLOW is set, so the caller's OpenOptions
+// and PathOperation may be modified.
+func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
+	// Keep only the flags listed below. This drops unknown flags as well as
+	// O_CLOEXEC, which affects file descriptors and therefore must be
+	// handled outside of VFS.
+	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_LARGEFILE | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
+	// Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
+	if opts.Flags&linux.O_SYNC != 0 {
+		opts.Flags |= linux.O_DSYNC
+	}
+	// Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
+	// with O_DIRECTORY and a writable access mode (to ensure that it fails on
+	// filesystem implementations that do not support it), and without O_CREAT.
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		missingDir := opts.Flags&linux.O_DIRECTORY == 0
+		creating := opts.Flags&linux.O_CREAT != 0
+		readOnly := opts.Flags&linux.O_ACCMODE == linux.O_RDONLY
+		if missingDir || creating || readOnly {
+			return nil, syserror.EINVAL
+		}
+	}
+	// O_PATH causes most other flags to be ignored.
+	if opts.Flags&linux.O_PATH != 0 {
+		opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
+	}
+	// "On Linux, the following bits are also honored in mode: [S_ISUID,
+	// S_ISGID, S_ISVTX]" - open(2)
+	opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
+
+	if opts.Flags&linux.O_NOFOLLOW != 0 {
+		pop.FollowFinalSymlink = false
+	}
+	rp := vfs.getResolvingPath(creds, pop)
+	if wantDir := opts.Flags&linux.O_DIRECTORY != 0; wantDir {
+		rp.mustBeDir, rp.mustBeDirOrig = true, true
+	}
+	for {
+		fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
+		if err != nil {
+			if rp.handleError(err) {
+				continue
+			}
+			vfs.putResolvingPath(rp)
+			return nil, err
+		}
+		vfs.putResolvingPath(rp)
+
+		if opts.FileExec {
+			// A file on a Mount flagged NoExec can't be opened for execution.
+			if fd.Mount().Flags.NoExec {
+				fd.DecRef()
+				return nil, syserror.EACCES
+			}
+
+			// Only a regular file can be executed.
+			stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
+			if err != nil {
+				fd.DecRef()
+				return nil, err
+			}
+			if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG {
+				fd.DecRef()
+				return nil, syserror.EACCES
+			}
+		}
+
+		// Generate an IN_OPEN inotify event for the opened file.
+		fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent)
+		return fd, nil
+	}
+}
+
+// ReadlinkAt returns the target of the symbolic link at the given path.
+func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		linkTarget, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp)
+		switch {
+		case err == nil:
+			vfs.putResolvingPath(rp)
+			return linkTarget, nil
+		case !rp.handleError(err):
+			vfs.putResolvingPath(rp)
+			return "", err
+		}
+	}
+}
+
+// RenameAt renames the file at oldpop to newpop. Both paths must have
+// Path.Begin.Ok() and must not set FollowFinalSymlink, and the name resolved
+// for oldpop may not be "." or "..".
+func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error {
+	switch {
+	case !oldpop.Path.Begin.Ok():
+		if oldpop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
+	case oldpop.FollowFinalSymlink:
+		ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop)
+	if err != nil {
+		return err
+	}
+	// oldParentVD holds a reference that must be dropped on every return below.
+	if oldName == "." || oldName == ".." {
+		oldParentVD.DecRef()
+		return syserror.EBUSY
+	}
+
+	switch {
+	case !newpop.Path.Begin.Ok():
+		oldParentVD.DecRef()
+		if newpop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
+	case newpop.FollowFinalSymlink:
+		oldParentVD.DecRef()
+		ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, newpop)
+	// Work on a copy of opts so that the caller's RenameOptions is untouched.
+	renameOpts := *opts
+	if oldpop.Path.Dir {
+		renameOpts.MustBeDir = true
+	}
+	for {
+		renameErr := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts)
+		if renameErr != nil {
+			if checkInvariants && rp.canHandleError(renameErr) && rp.Done() {
+				panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, renameErr))
+			}
+			if rp.handleError(renameErr) {
+				continue
+			}
+		}
+		vfs.putResolvingPath(rp)
+		oldParentVD.DecRef()
+		return renameErr
+	}
+}
+
+// RmdirAt removes the directory at the given path. pop.Path.Begin must be Ok
+// and pop.FollowFinalSymlink must be false.
+func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
+	switch {
+	case !pop.Path.Begin.Ok():
+		if pop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
+	case pop.FollowFinalSymlink:
+		ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		err := rp.mount.fs.impl.RmdirAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		// A handleable error must leave at least one path component unconsumed.
+		if checkInvariants && rp.canHandleError(err) && rp.Done() {
+			panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// SetStatAt changes metadata for the file at the given path, as described
+// by opts.
+func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts)
+		// Retry only when the call failed with an error that handleError
+		// was able to handle; otherwise err (possibly nil) is the result.
+		if err != nil && rp.handleError(err) {
+			continue
+		}
+		vfs.putResolvingPath(rp)
+		return err
+	}
+}
+
+// StatAt returns metadata for the file at the given path.
+func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		statx, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
+		switch {
+		case err == nil:
+			vfs.putResolvingPath(rp)
+			return statx, nil
+		case !rp.handleError(err):
+			vfs.putResolvingPath(rp)
+			return linux.Statx{}, err
+		}
+	}
+}
+
+// StatFSAt returns metadata for the filesystem that contains the file at
+// the given path.
+func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		fsStat, err := rp.mount.fs.impl.StatFSAt(ctx, rp)
+		switch {
+		case err == nil:
+			vfs.putResolvingPath(rp)
+			return fsStat, nil
+		case !rp.handleError(err):
+			vfs.putResolvingPath(rp)
+			return linux.Statfs{}, err
+		}
+	}
+}
+
+// SymlinkAt creates a symbolic link at the given path with the given target.
+// pop.Path.Begin must be Ok and pop.FollowFinalSymlink must be false.
+func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
+	switch {
+	case !pop.Path.Begin.Ok():
+		if pop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
+	case pop.FollowFinalSymlink:
+		ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target)
+		if err != nil {
+			if checkInvariants && rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+			}
+			if rp.handleError(err) {
+				continue
+			}
+		}
+		// Reached on success, or on an error that handleError could not
+		// resolve; err is the result in both cases.
+		vfs.putResolvingPath(rp)
+		return err
+	}
+}
+
+// UnlinkAt deletes the non-directory file at the given path. pop.Path.Begin
+// must be Ok and pop.FollowFinalSymlink must be false.
+func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
+	switch {
+	case !pop.Path.Begin.Ok():
+		if pop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
+	case pop.FollowFinalSymlink:
+		ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		err := rp.mount.fs.impl.UnlinkAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		// A handleable error must leave at least one path component unconsumed.
+		if checkInvariants && rp.canHandleError(err) && rp.Done() {
+			panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// BoundEndpointAt gets the bound endpoint at the given path, if one exists.
+// pop.Path.Begin must be Ok.
+func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	if !pop.Path.Begin.Ok() {
+		if !pop.Path.Absolute {
+			return nil, syserror.ENOENT
+		}
+		return nil, syserror.ECONNREFUSED
+	}
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		ep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return ep, nil
+		}
+		// A handleable error must leave at least one path component unconsumed.
+		if checkInvariants && rp.canHandleError(err) && rp.Done() {
+			panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return nil, err
+		}
+	}
+}
+
+// ListxattrAt returns all extended attribute names for the file at the given
+// path. size is passed through to the FilesystemImpl unchanged. If the
+// FilesystemImpl returns ENOTSUP, ListxattrAt returns (nil, nil) instead.
+func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		attrNames, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size)
+		switch {
+		case err == nil:
+			vfs.putResolvingPath(rp)
+			return attrNames, nil
+		case err == syserror.ENOTSUP:
+			// Linux doesn't actually return ENOTSUP in this case; instead,
+			// fs/xattr.c:vfs_listxattr() falls back to allowing the security
+			// subsystem to return security extended attributes, which by
+			// default don't exist.
+			vfs.putResolvingPath(rp)
+			return nil, nil
+		case !rp.handleError(err):
+			vfs.putResolvingPath(rp)
+			return nil, err
+		}
+	}
+}
+
+// GetxattrAt returns the value associated with the extended attribute
+// selected by opts, for the file at the given path.
+func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		value, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
+		switch {
+		case err == nil:
+			vfs.putResolvingPath(rp)
+			return value, nil
+		case !rp.handleError(err):
+			vfs.putResolvingPath(rp)
+			return "", err
+		}
+	}
+}
+
+// SetxattrAt changes the value associated with the given extended attribute
+// for the file at the given path. The FilesystemImpl call is retried for as
+// long as rp.handleError is able to handle the error it returns.
+func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+		// Stop on success (err is nil) or on an error that handleError
+		// cannot resolve. In both cases err is the value to return, and rp
+		// is released first.
+		if err == nil || !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// RemovexattrAt removes the extended attribute with the given name from the
+// file at the given path.
+func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+		// Retry only when the call failed with an error that handleError
+		// was able to handle; otherwise err (possibly nil) is the result.
+		if err != nil && rp.handleError(err) {
+			continue
+		}
+		vfs.putResolvingPath(rp)
+		return err
+	}
+}
+
+// SyncAllFilesystems has the semantics of Linux's sync(2). Each Filesystem on
+// which TryIncRef succeeds is synced; the first error encountered is returned.
+func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
+	live := make(map[*Filesystem]struct{})
+	vfs.filesystemsMu.Lock()
+	for fs := range vfs.filesystems {
+		if fs.TryIncRef() {
+			live[fs] = struct{}{}
+		}
+	}
+	vfs.filesystemsMu.Unlock()
+	var firstErr error
+	for fs := range live {
+		if err := fs.impl.Sync(ctx); firstErr == nil && err != nil {
+			firstErr = err
+		}
+		fs.DecRef()
+	}
+	return firstErr
+}
+
+// A VirtualDentry represents a node in a VFS tree, by combining a Dentry
+// (which represents a node in a Filesystem's tree) and a Mount (which
+// represents the Filesystem's position in a VFS mount tree).
+//
+// VirtualDentry's semantics are similar to those of a Go interface object
+// representing a pointer: it is a copyable value type that represents
+// references to another entity. The zero value of VirtualDentry is an "empty
+// VirtualDentry", directly analogous to a nil interface object.
+// VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless
+// otherwise specified, all other VirtualDentry methods require
+// VirtualDentry.Ok() == true.
+//
+// Mounts and Dentries are reference-counted, requiring that users call
+// VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to
+// references on the Mount and Dentry referred to by a VirtualDentry as
+// references on the VirtualDentry itself. Unless otherwise specified, all
+// VirtualDentry methods require that a reference is held on the VirtualDentry.
+//
+// VirtualDentry is analogous to Linux's struct path.
+//
+// +stateify savable
+type VirtualDentry struct {
+	mount  *Mount
+	dentry *Dentry
+}
+
+// MakeVirtualDentry creates a VirtualDentry from mount and dentry as given.
+func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry {
+	var vd VirtualDentry
+	vd.mount = mount
+	vd.dentry = dentry
+	return vd
+}
+
+// Ok returns true if vd is not empty, i.e. if its Mount is non-nil. It does
+// not require that a reference is held.
+func (vd VirtualDentry) Ok() bool {
+	return vd.mount != nil
+}
+
+// IncRef increments the reference counts on the Mount and Dentry represented
+// by vd, in that order.
+func (vd VirtualDentry) IncRef() {
+	vd.mount.IncRef()
+	vd.dentry.IncRef()
+}
+
+// DecRef decrements the reference counts on the Dentry and Mount represented
+// by vd, in that order (the reverse of IncRef).
+func (vd VirtualDentry) DecRef() {
+	vd.dentry.DecRef()
+	vd.mount.DecRef()
+}
+
+// Mount returns the Mount associated with vd. No reference is taken on the
+// returned Mount.
+func (vd VirtualDentry) Mount() *Mount {
+	return vd.mount
+}
+
+// Dentry returns the Dentry associated with vd. No reference is taken on the
+// returned Dentry.
+func (vd VirtualDentry) Dentry() *Dentry {
+	return vd.dentry
+}