Merge branch 'master' into ip-forwarding

- Merges aleksej-paschenko's with HEAD - Adds vfs2 support for ip_forward
author: Ian Lewis <ianmlewis@gmail.com> 2020-08-17 21:44:31 -0400
committer: Ian Lewis <ianmlewis@gmail.com> 2020-08-17 21:44:31 -0400
commit: ac324f646ee3cb7955b0b45a7453aeb9671cbdf1 (patch)
tree: 0cbc5018e8807421d701d190dc20525726c7ca76 /pkg/sentry/fsimpl/kernfs
parent: 352ae1022ce19de28fc72e034cc469872ad79d06 (diff)
parent: 6d0c5803d557d453f15ac6f683697eeb46dab680 (diff)
8 files changed, 2779 insertions, 0 deletions
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
new file mode 100644
index 000000000..3835557fe
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -0,0 +1,75 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+# Generates fstree.go in package kernfs from the generic_fstree template,
+# substituting this package's Dentry for the template's Dentry type.
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "kernfs",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "Dentry",
+    },
+)
+
+# Generates slot_list.go in package kernfs from the generic list template,
+# using *slot as both the Element and the Linker type.
+go_template_instance(
+    name = "slot_list",
+    out = "slot_list.go",
+    package = "kernfs",
+    prefix = "slot",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*slot",
+        "Linker": "*slot",
+    },
+)
+
+# The kernfs library. fstree.go and slot_list.go are the outputs of the
+# :fstree and :slot_list template instances; the rest are hand-written.
+go_library(
+    name = "kernfs",
+    srcs = [
+        "dynamic_bytes_file.go",
+        "fd_impl_util.go",
+        "filesystem.go",
+        "fstree.go",
+        "inode_impl_util.go",
+        "kernfs.go",
+        "slot_list.go",
+        "symlink.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
+
+# Tests for the :kernfs library, built from kernfs_test.go.
+go_test(
+    name = "kernfs_test",
+    size = "small",
+    srcs = ["kernfs_test.go"],
+    deps = [
+        ":kernfs",
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/sentry/contexttest",
+        "//pkg/sentry/fsimpl/testutil",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "@com_github_google_go_cmp//cmp:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
new file mode 100644
index 000000000..12adf727a
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -0,0 +1,147 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// DynamicBytesFile implements kernfs.Inode and represents a read-only
+// file whose contents are backed by a vfs.DynamicBytesSource.
+//
+// Must be instantiated with NewDynamicBytesFile or initialized with Init
+// before first use.
+//
+// +stateify savable
+type DynamicBytesFile struct {
+	InodeAttrs
+	InodeNoopRefCount
+	InodeNotDirectory
+	InodeNotSymlink
+
+	// locks is handed to every DynamicBytesFD opened on this file.
+	locks vfs.FileLocks
+	// data is the content source, set by Init and passed to each opened FD.
+	data vfs.DynamicBytesSource
+}
+
+// Compile-time check that *DynamicBytesFile satisfies Inode.
+var _ Inode = (*DynamicBytesFile)(nil)
+
+// Init initializes a dynamic bytes file as a regular file with the given
+// permission bits, backed by data.
+//
+// perm must contain only bits covered by linux.PermissionsMask; Init panics
+// otherwise, reporting the bits that fall outside the mask.
+func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) {
+	if extra := perm &^ linux.PermissionsMask; extra != 0 {
+		// Show the offending bits, not the valid ones, so the message
+		// actually identifies what the caller got wrong.
+		panic(fmt.Sprintf("Only permission mask must be set: %x", extra))
+	}
+	f.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
+	f.data = data
+}
+
+// Open implements Inode.Open. It creates a DynamicBytesFD backed by this
+// file's data source and lock state and returns its vfs.FileDescription.
+func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := new(DynamicBytesFD)
+	err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags)
+	if err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// SetStat implements Inode.SetStat. By default DynamicBytesFile doesn't allow
+// inode attributes to be changed and always returns EPERM. Types embedding
+// DynamicBytesFile can override SetStat() and forward to f.InodeAttrs to
+// allow it.
+func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+// DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a
+// DynamicBytesFile.
+//
+// Must be initialized with Init before first use.
+//
+// +stateify savable
+type DynamicBytesFD struct {
+	vfs.FileDescriptionDefaultImpl
+	vfs.DynamicBytesFileDescriptionImpl
+	vfs.LockFD
+
+	// vfsfd is the vfs.FileDescription handed out to callers of Open.
+	vfsfd vfs.FileDescription
+	// inode is the kernfs inode of the dentry this FD was opened on; it is
+	// set by Init and consulted by Stat.
+	inode Inode
+}
+
+// Init initializes a DynamicBytesFD: it attaches the lock state, initializes
+// the underlying vfs.FileDescription for mount m and dentry d, records d's
+// inode and installs data as the content source.
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
+	fd.LockFD.Init(locks)
+	err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
+	if err != nil {
+		return err
+	}
+	dentry := d.Impl().(*Dentry)
+	fd.inode = dentry.inode
+	fd.SetDataSource(data)
+	return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek by forwarding to the embedded
+// DynamicBytesFileDescriptionImpl.
+func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read by forwarding to the embedded
+// DynamicBytesFileDescriptionImpl.
+func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead by forwarding to the embedded
+// DynamicBytesFileDescriptionImpl.
+func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write by forwarding to the embedded
+// DynamicBytesFileDescriptionImpl.
+func (fd *DynamicBytesFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.Write(ctx, src, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite by forwarding to the
+// embedded DynamicBytesFileDescriptionImpl.
+func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.PWrite(ctx, src, offset, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release. It is a no-op.
+func (fd *DynamicBytesFD) Release(context.Context) {}
+
+// Stat implements vfs.FileDescriptionImpl.Stat by asking the inode recorded
+// at Init time, passing the filesystem of the mount this FD was opened on.
+func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	vd := fd.vfsfd.VirtualDentry()
+	return fd.inode.Stat(ctx, vd.Mount().Filesystem(), opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat. It always fails with
+// EPERM.
+func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error {
+	// DynamicBytesFiles are immutable.
+	return syserror.EPERM
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX using the lock state
+// attached to this FD.
+func (fd *DynamicBytesFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	locks := fd.Locks()
+	return locks.LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX using the lock
+// state attached to this FD.
+func (fd *DynamicBytesFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	locks := fd.Locks()
+	return locks.UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
new file mode 100644
index 000000000..fcee6200a
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -0,0 +1,252 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"math"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory
+// inode that uses OrderedChildren to track child nodes. GenericDirectoryFD is
+// not compatible with dynamic directories.
+//
+// Note that GenericDirectoryFD holds a lock over OrderedChildren while calling
+// IterDirents callback. The IterDirents callback therefore cannot hash or
+// unhash children, or recursively call IterDirents on the same underlying
+// inode.
+//
+// Must be initialized with Init before first use.
+//
+// Lock ordering: mu => children.mu.
+type GenericDirectoryFD struct {
+	vfs.FileDescriptionDefaultImpl
+	vfs.DirectoryFileDescriptionDefaultImpl
+	vfs.LockFD
+
+	// vfsfd is the vfs.FileDescription backing this FD.
+	vfsfd vfs.FileDescription
+	// children is the set of tracked children walked by IterDirents.
+	children *OrderedChildren
+
+	// mu protects the fields below.
+	mu sync.Mutex
+
+	// off is the current directory offset. Protected by "mu".
+	off int64
+}
+
+// NewGenericDirectoryFD creates a new GenericDirectoryFD for dentry d on mount
+// m, fully initialized, and returns it. It fails if opts request write access.
+func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
+	dirFD := new(GenericDirectoryFD)
+	err := dirFD.Init(children, locks, opts)
+	if err != nil {
+		return nil, err
+	}
+	err = dirFD.vfsfd.Init(dirFD, opts.Flags, m, d, &vfs.FileDescriptionOptions{})
+	if err != nil {
+		return nil, err
+	}
+	return dirFD, nil
+}
+
+// Init initializes a GenericDirectoryFD. Use it when overriding
+// GenericDirectoryFD. Caller must call fd.VFSFileDescription().Init() with the
+// correct implementation.
+//
+// Init returns EISDIR if opts request write access.
+func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error {
+	// Directories can never be opened for writing.
+	if ats := vfs.AccessTypesForOpenFlags(opts); ats&vfs.MayWrite != 0 {
+		return syserror.EISDIR
+	}
+	fd.LockFD.Init(locks)
+	fd.children = children
+	return nil
+}
+
+// VFSFileDescription returns a pointer to the vfs.FileDescription embedded in
+// this object. The pointer aliases fd's own field; no copy is made.
+func (fd *GenericDirectoryFD) VFSFileDescription() *vfs.FileDescription {
+	return &fd.vfsfd
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap by forwarding
+// to the embedded FileDescriptionDefaultImpl.
+func (fd *GenericDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return fd.FileDescriptionDefaultImpl.ConfigureMMap(ctx, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read by forwarding to the embedded
+// DirectoryFileDescriptionDefaultImpl.
+func (fd *GenericDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.Read(ctx, dst, opts)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead by forwarding to the embedded
+// DirectoryFileDescriptionDefaultImpl.
+func (fd *GenericDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.PRead(ctx, dst, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write by forwarding to the embedded
+// DirectoryFileDescriptionDefaultImpl.
+func (fd *GenericDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.Write(ctx, src, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite by forwarding to the
+// embedded DirectoryFileDescriptionDefaultImpl.
+func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release. It is a no-op.
+func (fd *GenericDirectoryFD) Release(context.Context) {}
+
+// filesystem returns the vfs.Filesystem of the mount this FD was opened on.
+func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem {
+	vd := fd.vfsfd.VirtualDentry()
+	return vd.Mount().Filesystem()
+}
+
+// inode returns the kernfs Inode behind the dentry this FD was opened on.
+func (fd *GenericDirectoryFD) inode() Inode {
+	vd := fd.vfsfd.VirtualDentry()
+	dentry := vd.Dentry().Impl().(*Dentry)
+	return dentry.inode
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+//
+// Starting from the current offset, entries are emitted in this order: ".",
+// "..", the children tracked by fd.children, and finally whatever the inode's
+// own IterDirents produces. fd.mu is held for the whole call, and
+// fd.children.mu is held for reading from the first tracked child until the
+// function returns, so cb runs with those locks held.
+//
+// Iteration stops at the first error from Stat or cb; fd.off then still
+// points at the entry that was not delivered.
+func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	// stat.Mode is read below to derive each entry's type, so the file type
+	// must be requested along with the inode number.
+	opts := vfs.StatOptions{Mask: linux.STATX_INO | linux.STATX_TYPE}
+	// Handle ".".
+	if fd.off == 0 {
+		stat, err := fd.inode().Stat(ctx, fd.filesystem(), opts)
+		if err != nil {
+			return err
+		}
+		dirent := vfs.Dirent{
+			Name:    ".",
+			Type:    linux.DT_DIR,
+			Ino:     stat.Ino,
+			NextOff: 1,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return err
+		}
+		fd.off++
+	}
+
+	// Handle "..".
+	if fd.off == 1 {
+		vfsd := fd.vfsfd.VirtualDentry().Dentry()
+		parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode
+		stat, err := parentInode.Stat(ctx, fd.filesystem(), opts)
+		if err != nil {
+			return err
+		}
+		dirent := vfs.Dirent{
+			Name:    "..",
+			Type:    linux.FileMode(stat.Mode).DirentType(),
+			Ino:     stat.Ino,
+			NextOff: 2,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return err
+		}
+		fd.off++
+	}
+
+	// Handle static children.
+	fd.children.mu.RLock()
+	defer fd.children.mu.RUnlock()
+	// fd.off accounts for "." and "..", but fd.children do not track
+	// these.
+	childIdx := fd.off - 2
+	for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
+		inode := it.Dentry.Impl().(*Dentry).inode
+		stat, err := inode.Stat(ctx, fd.filesystem(), opts)
+		if err != nil {
+			return err
+		}
+		dirent := vfs.Dirent{
+			Name:    it.Name,
+			Type:    linux.FileMode(stat.Mode).DirentType(),
+			Ino:     stat.Ino,
+			NextOff: fd.off + 1,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return err
+		}
+		fd.off++
+	}
+
+	// Hand over to the inode for any further entries. relOffset is the
+	// offset relative to the end of ".", ".." and the tracked children.
+	var err error
+	relOffset := fd.off - int64(len(fd.children.set)) - 2
+	fd.off, err = fd.inode().IterDirents(ctx, cb, fd.off, relOffset)
+	return err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as given.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	case linux.SEEK_END:
+		// TODO(gvisor.dev/issue/1193): This can prevent new files from showing up
+		// if they are added after SEEK_END.
+		offset = math.MaxInt64
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat by asking the directory's
+// inode.
+func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	vfsfs := fd.filesystem()
+	return fd.inode().Stat(ctx, vfsfs, opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat by forwarding to the
+// directory's inode with the credentials taken from ctx.
+func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	return fd.inode().SetStat(ctx, fd.filesystem(), creds, opts)
+}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate by forwarding to the
+// embedded DirectoryFileDescriptionDefaultImpl.
+func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX using the lock state
+// attached to this FD.
+func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	locks := fd.Locks()
+	return locks.LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX using the lock
+// state attached to this FD.
+func (fd *GenericDirectoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	locks := fd.Locks()
+	return locks.UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
new file mode 100644
index 000000000..d7edb6342
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -0,0 +1,840 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+// This file implements vfs.FilesystemImpl for kernfs.
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// stepExistingLocked resolves rp.Component() in parent directory vfsd.
+//
+// stepExistingLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// It returns ENOTDIR if vfsd is not a directory, ENAMETOOLONG if the
+// component exceeds NAME_MAX, and otherwise propagates errors from the
+// permission check, lookup, mount check and symlink handling. On success rp
+// has been advanced past the resolved component.
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+//
+// Postcondition: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) {
+	d := vfsd.Impl().(*Dentry)
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	// Directory searchable?
+	if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+afterSymlink:
+	name := rp.Component()
+	// Revalidation must be skipped if name is "." or ".."; d or its parent
+	// respectively can't be expected to transition from invalidated back to
+	// valid, so detecting invalidation and retrying would loop forever. This
+	// is consistent with Linux: fs/namei.c:walk_component() => lookup_fast()
+	// calls d_revalidate(), but walk_component() => handle_dots() does not.
+	if name == "." {
+		rp.Advance()
+		return vfsd, nil
+	}
+	if name == ".." {
+		// At the resolution root, or at a dentry without a parent, ".."
+		// resolves to the dentry itself.
+		if isRoot, err := rp.CheckRoot(ctx, vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil {
+			rp.Advance()
+			return vfsd, nil
+		}
+		if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+			return nil, err
+		}
+		rp.Advance()
+		return &d.parent.vfsd, nil
+	}
+	if len(name) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
+	}
+	// revalidateChildLocked requires d.dirMu to be held; it may read and
+	// update d.children.
+	d.dirMu.Lock()
+	next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, d.children[name])
+	d.dirMu.Unlock()
+	if err != nil {
+		return nil, err
+	}
+	if err := rp.CheckMount(ctx, &next.vfsd); err != nil {
+		return nil, err
+	}
+	// Resolve any symlink at current path component.
+	if mayFollowSymlinks && rp.ShouldFollowSymlink() && next.isSymlink() {
+		targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount())
+		if err != nil {
+			return nil, err
+		}
+		if targetVD.Ok() {
+			// Getlink produced a concrete target: jump to it, then drop
+			// the reference on targetVD whether or not the jump worked.
+			err := rp.HandleJump(targetVD)
+			targetVD.DecRef(ctx)
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			if err := rp.HandleSymlink(targetPathname); err != nil {
+				return nil, err
+			}
+		}
+		// Resolve the component rp now exposes, still relative to d.
+		goto afterSymlink
+	}
+	rp.Advance()
+	return &next.vfsd, nil
+}
+
+// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
+// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
+// nil) to verify that the returned child (or lack thereof) is correct.
+//
+// If child is non-nil but its inode is no longer valid, it is removed from
+// parent.children and invalidated. If there is then no usable child, one is
+// obtained from parent.inode.Lookup and inserted into parent. Errors from
+// Lookup are returned unchanged.
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+// parent.dirMu must be locked. parent.isDir(). name is not "." or "..".
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) {
+	if child != nil {
+		// Cached dentry exists, revalidate.
+		if !child.inode.Valid(ctx) {
+			delete(parent.children, name)
+			vfsObj.InvalidateDentry(ctx, &child.vfsd)
+			// The DecRef is deferred rather than done here; the caller
+			// drains it via fs.processDeferredDecRefs*.
+			fs.deferDecRef(&child.vfsd) // Reference from Lookup.
+			child = nil
+		}
+	}
+	if child == nil {
+		// Dentry isn't cached; it either doesn't exist or failed
+		// revalidation. Attempt to resolve it via Lookup.
+		//
+		// FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return
+		// *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes
+		// that all dentries in the filesystem are (kernfs.)Dentry and performs
+		// vfs.DentryImpl casts accordingly.
+		childVFSD, err := parent.inode.Lookup(ctx, name)
+		if err != nil {
+			return nil, err
+		}
+		// Reference on childVFSD dropped by a corresponding Valid.
+		child = childVFSD.Impl().(*Dentry)
+		parent.insertChildLocked(name, child)
+	}
+	return child, nil
+}
+
+// walkExistingLocked resolves rp to an existing file.
+//
+// walkExistingLocked is loosely analogous to Linux's
+// fs/namei.c:path_lookupat().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
+	vfsd := rp.Start()
+	for !rp.Done() {
+		var err error
+		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
+		if err != nil {
+			return nil, nil, err
+		}
+	}
+	d := vfsd.Impl().(*Dentry)
+	if rp.MustBeDir() && !d.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	return vfsd, d.inode, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory. It does not check that the returned directory is
+// searchable by the provider of rp.
+//
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
+	vfsd := rp.Start()
+	for !rp.Final() {
+		var err error
+		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
+		if err != nil {
+			return nil, nil, err
+		}
+	}
+	d := vfsd.Impl().(*Dentry)
+	if !d.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	return vfsd, d.inode, nil
+}
+
+// checkCreateLocked checks that a file named rp.Component() may be created in
+// directory parentVFSD, then returns rp.Component().
+//
+// Checks run in this order: write and search permission on the parent, the
+// name not being "." or ".." (EEXIST), the name length (ENAMETOOLONG), no
+// existing child of that name (EEXIST), and the parent not being dead
+// (ENOENT).
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. parentInode
+// == parentVFSD.Impl().(*Dentry).inode. isDir(parentInode) == true.
+func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
+	if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return "", err
+	}
+	name := rp.Component()
+	switch {
+	case name == "." || name == "..":
+		return "", syserror.EEXIST
+	case len(name) > linux.NAME_MAX:
+		return "", syserror.ENAMETOOLONG
+	}
+	// FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu.
+	parent := parentVFSD.Impl().(*Dentry)
+	if _, exists := parent.children[name]; exists {
+		return "", syserror.EEXIST
+	}
+	if parentVFSD.IsDead() {
+		return "", syserror.ENOENT
+	}
+	return name, nil
+}
+
+// checkDeleteLocked checks that the file represented by vfsd may be deleted.
+// It returns EBUSY if vfsd has no parent, ENOENT if the parent is dead, and
+// otherwise the result of checking write and search permission on the parent.
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
+	dir := vfsd.Impl().(*Dentry).parent
+	switch {
+	case dir == nil:
+		return syserror.EBUSY
+	case dir.vfsd.IsDead():
+		return syserror.ENOENT
+	}
+	return dir.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec)
+}
+
+// Release implements vfs.FilesystemImpl.Release. It is a no-op.
+func (fs *Filesystem) Release(context.Context) {
+}
+
+// Sync implements vfs.FilesystemImpl.Sync. It has nothing to flush and always
+// returns nil.
+func (fs *Filesystem) Sync(ctx context.Context) error {
+	// All filesystem state is in-memory.
+	return nil
+}
+
+// AccessAt implements vfs.FilesystemImpl.AccessAt. It resolves rp to an
+// existing file and checks the access types ats against creds.
+func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	fs.mu.RLock()
+	// Deferred calls run in reverse order: unlock first, then drain the
+	// deferred DecRefs.
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+
+	_, inode, err := fs.walkExistingLocked(ctx, rp)
+	if err == nil {
+		err = inode.CheckPermissions(ctx, creds, ats)
+	}
+	return err
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. On success the
+// returned dentry carries a reference owned by the caller. When
+// opts.CheckSearchable is set, the target must be a directory on which
+// rp's credentials have search permission.
+func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	target, inode, err := fs.walkExistingLocked(ctx, rp)
+	if err != nil {
+		return nil, err
+	}
+
+	if opts.CheckSearchable {
+		if !target.Impl().(*Dentry).isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+			return nil, err
+		}
+	}
+	target.IncRef() // Ownership transferred to caller.
+	return target, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. On
+// success the returned parent-directory dentry carries a reference owned by
+// the caller.
+func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	dir, _, err := fs.walkParentDirLocked(ctx, rp)
+	if err != nil {
+		return nil, err
+	}
+	dir.IncRef() // Ownership transferred to caller.
+	return dir, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt. It creates a new name for the
+// inode behind vd in the parent directory of rp.
+//
+// It returns EEXIST if rp has no remaining component, EXDEV if vd lives on a
+// different mount than rp, and EPERM if vd is a directory; errors from the
+// walk, the create check, the mount write check and NewLink are propagated.
+func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	dirVFSD, dirInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked(ctx)
+	if err != nil {
+		return err
+	}
+	name, err := checkCreateLocked(ctx, rp, dirVFSD, dirInode)
+	if err != nil {
+		return err
+	}
+	mnt := rp.Mount()
+	if mnt != vd.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	target := vd.Dentry().Impl().(*Dentry)
+	if target.isDir() {
+		return syserror.EPERM
+	}
+
+	linkVFSD, err := dirInode.NewLink(ctx, name, target.inode)
+	if err != nil {
+		return err
+	}
+	dir := dirVFSD.Impl().(*Dentry)
+	dir.InsertChild(name, linkVFSD.Impl().(*Dentry))
+	return nil
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt. It asks the parent
+// directory's inode to create a new directory and inserts the result as a
+// child. It returns EEXIST if rp has no remaining component.
+func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	dirVFSD, dirInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked(ctx)
+	if err != nil {
+		return err
+	}
+	name, err := checkCreateLocked(ctx, rp, dirVFSD, dirInode)
+	if err != nil {
+		return err
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	newVFSD, err := dirInode.NewDir(ctx, name, opts)
+	if err != nil {
+		return err
+	}
+	dir := dirVFSD.Impl().(*Dentry)
+	dir.InsertChild(name, newVFSD.Impl().(*Dentry))
+	return nil
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt. It asks the parent
+// directory's inode to create a new node and inserts the result as a child.
+// It returns EEXIST if rp has no remaining component.
+func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	dirVFSD, dirInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked(ctx)
+	if err != nil {
+		return err
+	}
+	name, err := checkCreateLocked(ctx, rp, dirVFSD, dirInode)
+	if err != nil {
+		return err
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	nodeVFSD, err := dirInode.NewNode(ctx, name, opts)
+	if err != nil {
+		return err
+	}
+	dir := dirVFSD.Impl().(*Dentry)
+	dir.InsertChild(name, nodeVFSD.Impl().(*Dentry))
+	return nil
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+//
+// Without O_CREAT, rp is resolved to an existing file under the read lock and
+// opened after a permission check. With O_CREAT, the write lock is taken:
+// the parent directory is resolved, the final component is looked up without
+// following symlinks, and the file is either created through the parent
+// inode's NewFile or, if it already exists, opened (EEXIST with O_EXCL). A
+// trailing symlink is followed by restarting from the parent walk.
+func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// Filter out flags that are not supported by kernfs. O_DIRECTORY and
+	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
+	// appropriate bits in rp), but are returned by
+	// FileDescriptionImpl.StatusFlags().
+	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK
+	ats := vfs.AccessTypesForOpenFlags(&opts)
+
+	// Do not create new file.
+	if opts.Flags&linux.O_CREAT == 0 {
+		fs.mu.RLock()
+		defer fs.processDeferredDecRefs(ctx)
+		defer fs.mu.RUnlock()
+		vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+		if err != nil {
+			return nil, err
+		}
+		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+			return nil, err
+		}
+		return inode.Open(ctx, rp, vfsd, opts)
+	}
+
+	// May create new file.
+	mustCreate := opts.Flags&linux.O_EXCL != 0
+	vfsd := rp.Start()
+	inode := vfsd.Impl().(*Dentry).inode
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	// No path components left: the target is the starting dentry itself.
+	if rp.Done() {
+		if rp.MustBeDir() {
+			return nil, syserror.EISDIR
+		}
+		if mustCreate {
+			return nil, syserror.EEXIST
+		}
+		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+			return nil, err
+		}
+		return inode.Open(ctx, rp, vfsd, opts)
+	}
+afterTrailingSymlink:
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked(ctx)
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+	// Reject attempts to open directories with O_CREAT.
+	if rp.MustBeDir() {
+		return nil, syserror.EISDIR
+	}
+	pc := rp.Component()
+	if pc == "." || pc == ".." {
+		return nil, syserror.EISDIR
+	}
+	if len(pc) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
+	}
+	// Determine whether or not we need to create a file.
+	//
+	// NOTE(review): stepExistingLocked's postcondition asks the caller to
+	// run fs.processDeferredDecRefs*, but no such call follows this one
+	// unless the trailing-symlink path loops back to the walk above.
+	// Confirm the deferred DecRefs are drained elsewhere.
+	childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD, false /* mayFollowSymlinks */)
+	if err == syserror.ENOENT {
+		// Already checked for searchability above; now check for writability.
+		if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
+			return nil, err
+		}
+		if err := rp.Mount().CheckBeginWrite(); err != nil {
+			return nil, err
+		}
+		defer rp.Mount().EndWrite()
+		// Create and open the child.
+		childVFSD, err = parentInode.NewFile(ctx, pc, opts)
+		if err != nil {
+			return nil, err
+		}
+		child := childVFSD.Impl().(*Dentry)
+		parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+		return child.inode.Open(ctx, rp, childVFSD, opts)
+	}
+	if err != nil {
+		return nil, err
+	}
+	// Open existing file or follow symlink.
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	child := childVFSD.Impl().(*Dentry)
+	if rp.ShouldFollowSymlink() && child.isSymlink() {
+		targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount())
+		if err != nil {
+			return nil, err
+		}
+		if targetVD.Ok() {
+			// Drop the reference on targetVD whether or not the jump
+			// succeeded.
+			err := rp.HandleJump(targetVD)
+			targetVD.DecRef(ctx)
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			if err := rp.HandleSymlink(targetPathname); err != nil {
+				return nil, err
+			}
+		}
+		// rp.Final() may no longer be true since we now need to resolve the
+		// symlink target.
+		goto afterTrailingSymlink
+	}
+	if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+		return nil, err
+	}
+	return child.inode.Open(ctx, rp, &child.vfsd, opts)
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+//
+// The path is resolved with fs.mu held for reading; the link itself is read
+// after the lock has been dropped. EINVAL is returned if the resolved dentry
+// is not a symlink.
+func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	fs.mu.RLock()
+	vfsd, target, walkErr := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	switch {
+	case walkErr != nil:
+		return "", walkErr
+	case !vfsd.Impl().(*Dentry).isSymlink():
+		return "", syserror.EINVAL
+	}
+	return target.Readlink(ctx)
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+//
+// oldName names the source entry inside oldParentVD; rp resolves to the
+// destination. The whole operation runs with fs.mu held for writing. Only the
+// RENAME_NOREPLACE flag is accepted (anything else yields EINVAL), and a
+// source and destination on different mounts yield EXDEV.
+func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	// Only RENAME_NOREPLACE is supported.
+	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
+		return syserror.EINVAL
+	}
+	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
+
+	fs.mu.Lock()
+	// Deferred calls run in LIFO order, so this one executes after the
+	// fs.mu.Unlock deferred below. It must therefore be the variant that other
+	// methods call once fs.mu has been released, not the *Locked one.
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.Unlock()
+
+	// Resolve the destination directory first to verify that it's on this
+	// Mount.
+	dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+	if err != nil {
+		return err
+	}
+	dstDir := dstDirVFSD.Impl().(*Dentry)
+	mnt := rp.Mount()
+	if mnt != oldParentVD.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	// Look up (and revalidate) the source child under its parent's dirMu.
+	srcDirVFSD := oldParentVD.Dentry()
+	srcDir := srcDirVFSD.Impl().(*Dentry)
+	srcDir.dirMu.Lock()
+	src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDir.children[oldName])
+	srcDir.dirMu.Unlock()
+	if err != nil {
+		return err
+	}
+	srcVFSD := &src.vfsd
+
+	// Can we remove the src dentry?
+	if err := checkDeleteLocked(ctx, rp, srcVFSD); err != nil {
+		return err
+	}
+
+	// Can we create the dst dentry?
+	var dst *Dentry
+	pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode)
+	switch err {
+	case nil:
+		// Ok, continue with rename as replacement.
+	case syserror.EEXIST:
+		if noReplace {
+			// Won't overwrite existing node since RENAME_NOREPLACE was requested.
+			return syserror.EEXIST
+		}
+		dst = dstDir.children[pc]
+		if dst == nil {
+			panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD))
+		}
+	default:
+		return err
+	}
+	// dstVFSD stays nil when nothing is being replaced.
+	var dstVFSD *vfs.Dentry
+	if dst != nil {
+		dstVFSD = &dst.vfsd
+	}
+
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef(ctx)
+	virtfs := rp.VirtualFilesystem()
+
+	// We can't deadlock here due to lock ordering because we're protected from
+	// concurrent renames by fs.mu held for writing.
+	srcDir.dirMu.Lock()
+	defer srcDir.dirMu.Unlock()
+	if srcDir != dstDir {
+		dstDir.dirMu.Lock()
+		defer dstDir.dirMu.Unlock()
+	}
+
+	if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
+		return err
+	}
+	replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD)
+	if err != nil {
+		virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
+		return err
+	}
+	// Move src from srcDir's children to dstDir's. When the parent changes,
+	// the reference src held on its old parent is dropped (deferred) and one
+	// is taken on the new parent.
+	delete(srcDir.children, src.name)
+	if srcDir != dstDir {
+		fs.deferDecRef(srcDirVFSD)
+		dstDir.IncRef()
+	}
+	src.parent = dstDir
+	src.name = pc
+	if dstDir.children == nil {
+		dstDir.children = make(map[string]*Dentry)
+	}
+	dstDir.children[pc] = src
+	virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaced)
+	return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+//
+// Fails with ENOTDIR if the resolved dentry is not a directory and with
+// ENOTEMPTY if its inode reports children. The actual removal is delegated to
+// the parent directory inode's RmDir.
+func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	targetVFSD, targetInode, err := fs.walkExistingLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked(ctx)
+	if err != nil {
+		return err
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := checkDeleteLocked(ctx, rp, targetVFSD); err != nil {
+		return err
+	}
+	target := targetVFSD.Impl().(*Dentry)
+	switch {
+	case !target.isDir():
+		return syserror.ENOTDIR
+	case targetInode.HasChildren():
+		return syserror.ENOTEMPTY
+	}
+	vfsObj := rp.VirtualFilesystem()
+	parent := target.parent
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+
+	// Prepare/Abort/Commit bracket the inode-level removal so that the VFS
+	// layer can veto or roll back the deletion.
+	ns := vfs.MountNamespaceFromContext(ctx)
+	defer ns.DecRef(ctx)
+	if err := vfsObj.PrepareDeleteDentry(ns, targetVFSD); err != nil {
+		return err
+	}
+	if err := parent.inode.RmDir(ctx, rp.Component(), targetVFSD); err != nil {
+		vfsObj.AbortDeleteDentry(targetVFSD)
+		return err
+	}
+	vfsObj.CommitDeleteDentry(ctx, targetVFSD)
+	return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+//
+// The path is always resolved, so lookup errors are reported even when
+// opts.Stat.Mask is empty; an empty mask is otherwise a no-op.
+func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	fs.mu.RLock()
+	_, target, walkErr := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	switch {
+	case walkErr != nil:
+		return walkErr
+	case opts.Stat.Mask == 0:
+		return nil
+	}
+	return target.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+//
+// The path is resolved under fs.mu (read); the inode's Stat is invoked after
+// the lock has been released.
+func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	fs.mu.RLock()
+	_, target, walkErr := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if walkErr == nil {
+		return target.Stat(ctx, fs.VFSFilesystem(), opts)
+	}
+	return linux.Statx{}, walkErr
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+//
+// statfs is not implemented yet: once the path resolves, the call fails with
+// ENOSYS. Path resolution errors take precedence.
+func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	fs.mu.RLock()
+	_, _, walkErr := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if walkErr == nil {
+		// TODO(gvisor.dev/issue/1193): actually implement statfs.
+		walkErr = syserror.ENOSYS
+	}
+	return linux.Statfs{}, walkErr
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+//
+// Returns EEXIST up front if rp is already fully resolved. Otherwise the new
+// symlink is created by the parent directory's inode (NewSymlink) and then
+// inserted into the parent dentry's children.
+func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	dirVFSD, dirInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked(ctx)
+	if err != nil {
+		return err
+	}
+	name, err := checkCreateLocked(ctx, rp, dirVFSD, dirInode)
+	if err != nil {
+		return err
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	linkVFSD, err := dirInode.NewSymlink(ctx, name, target)
+	if err != nil {
+		return err
+	}
+	dirVFSD.Impl().(*Dentry).InsertChild(name, linkVFSD.Impl().(*Dentry))
+	return nil
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+//
+// Fails with EISDIR if the resolved dentry is a directory. The actual removal
+// is delegated to the parent directory inode's Unlink.
+func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	targetVFSD, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked(ctx)
+	if err != nil {
+		return err
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := checkDeleteLocked(ctx, rp, targetVFSD); err != nil {
+		return err
+	}
+	target := targetVFSD.Impl().(*Dentry)
+	if target.isDir() {
+		return syserror.EISDIR
+	}
+	vfsObj := rp.VirtualFilesystem()
+	parent := target.parent
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+	// Prepare/Abort/Commit bracket the inode-level removal so that the VFS
+	// layer can veto or roll back the deletion.
+	ns := vfs.MountNamespaceFromContext(ctx)
+	defer ns.DecRef(ctx)
+	if err := vfsObj.PrepareDeleteDentry(ns, targetVFSD); err != nil {
+		return err
+	}
+	if err := parent.inode.Unlink(ctx, rp.Component(), targetVFSD); err != nil {
+		vfsObj.AbortDeleteDentry(targetVFSD)
+		return err
+	}
+	vfsObj.CommitDeleteDentry(ctx, targetVFSD)
+	return nil
+}
+
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
+//
+// No endpoint is ever returned: after the path resolves and the caller passes
+// the write-permission check, the call fails with ECONNREFUSED.
+func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	fs.mu.RLock()
+	_, target, walkErr := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if walkErr != nil {
+		return nil, walkErr
+	}
+	if permErr := target.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); permErr != nil {
+		return nil, permErr
+	}
+	return nil, syserror.ECONNREFUSED
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+//
+// The path is still resolved first, so lookup errors take precedence over
+// ENOTSUP.
+func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+	fs.mu.RLock()
+	_, _, walkErr := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if walkErr == nil {
+		// kernfs currently does not support extended attributes.
+		walkErr = syserror.ENOTSUP
+	}
+	return nil, walkErr
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+//
+// The path is still resolved first, so lookup errors take precedence over
+// ENOTSUP.
+func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+	fs.mu.RLock()
+	_, _, walkErr := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if walkErr == nil {
+		// kernfs currently does not support extended attributes.
+		walkErr = syserror.ENOTSUP
+	}
+	return "", walkErr
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+//
+// The path is still resolved first, so lookup errors take precedence over
+// ENOTSUP.
+func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	fs.mu.RLock()
+	_, _, walkErr := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if walkErr == nil {
+		// kernfs currently does not support extended attributes.
+		walkErr = syserror.ENOTSUP
+	}
+	return walkErr
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+//
+// The path is still resolved first, so lookup errors take precedence over
+// ENOTSUP.
+func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	fs.mu.RLock()
+	_, _, walkErr := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if walkErr == nil {
+		// kernfs currently does not support extended attributes.
+		walkErr = syserror.ENOTSUP
+	}
+	return walkErr
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+//
+// It delegates to genericPrependPath while holding fs.mu for reading.
+func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	mnt, start := vd.Mount(), vd.Dentry().Impl().(*Dentry)
+	return genericPrependPath(vfsroot, mnt, start, b)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
new file mode 100644
index 000000000..c3efcf3ec
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -0,0 +1,613 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// InodeNoopRefCount partially implements the Inode interface, specifically the
+// inodeRefs sub interface. Every method is a no-op: no count is kept and
+// nothing happens when references are obtained or released. This is suitable
+// for simple file inodes that don't reference any resources.
+type InodeNoopRefCount struct{}
+
+// IncRef implements Inode.IncRef. It does nothing.
+func (InodeNoopRefCount) IncRef() {}
+
+// DecRef implements Inode.DecRef. It does nothing.
+func (InodeNoopRefCount) DecRef(ctx context.Context) {}
+
+// TryIncRef implements Inode.TryIncRef. It always reports success.
+func (InodeNoopRefCount) TryIncRef() bool { return true }
+
+// Destroy implements Inode.Destroy. It does nothing.
+func (InodeNoopRefCount) Destroy(ctx context.Context) {}
+
+// InodeDirectoryNoNewChildren partially implements the Inode interface. It is
+// meant for directory inodes that do not support creation of new children:
+// every creation method fails with EPERM.
+type InodeDirectoryNoNewChildren struct{}
+
+// NewFile implements Inode.NewFile. It always fails with EPERM.
+func (InodeDirectoryNoNewChildren) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewDir implements Inode.NewDir. It always fails with EPERM.
+func (InodeDirectoryNoNewChildren) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewLink implements Inode.NewLink. It always fails with EPERM.
+func (InodeDirectoryNoNewChildren) NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewSymlink implements Inode.NewSymlink. It always fails with EPERM.
+func (InodeDirectoryNoNewChildren) NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewNode implements Inode.NewNode. It always fails with EPERM.
+func (InodeDirectoryNoNewChildren) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// InodeNotDirectory partially implements the Inode interface, specifically the
+// inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not
+// represent directories can embed this. HasChildren reports false and Valid
+// reports true; every other method panics, since calling a directory
+// operation on a non-directory inode indicates a bug in the caller.
+type InodeNotDirectory struct{}
+
+// HasChildren implements Inode.HasChildren. It always reports false.
+func (InodeNotDirectory) HasChildren() bool { return false }
+
+// NewFile implements Inode.NewFile. It always panics.
+func (InodeNotDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+	panic("NewFile called on non-directory inode")
+}
+
+// NewDir implements Inode.NewDir. It always panics.
+func (InodeNotDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+	panic("NewDir called on non-directory inode")
+}
+
+// NewLink implements Inode.NewLink. It always panics.
+func (InodeNotDirectory) NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error) {
+	panic("NewLink called on non-directory inode")
+}
+
+// NewSymlink implements Inode.NewSymlink. It always panics.
+func (InodeNotDirectory) NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) {
+	panic("NewSymlink called on non-directory inode")
+}
+
+// NewNode implements Inode.NewNode. It always panics.
+func (InodeNotDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) {
+	panic("NewNode called on non-directory inode")
+}
+
+// Unlink implements Inode.Unlink. It always panics.
+func (InodeNotDirectory) Unlink(ctx context.Context, name string, child *vfs.Dentry) error {
+	panic("Unlink called on non-directory inode")
+}
+
+// RmDir implements Inode.RmDir. It always panics.
+func (InodeNotDirectory) RmDir(ctx context.Context, name string, child *vfs.Dentry) error {
+	panic("RmDir called on non-directory inode")
+}
+
+// Rename implements Inode.Rename. It always panics.
+func (InodeNotDirectory) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) {
+	panic("Rename called on non-directory inode")
+}
+
+// Lookup implements Inode.Lookup. It always panics.
+func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	panic("Lookup called on non-directory inode")
+}
+
+// IterDirents implements Inode.IterDirents. It always panics.
+func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+	panic("IterDirents called on non-directory inode")
+}
+
+// Valid implements Inode.Valid. It always reports true.
+func (InodeNotDirectory) Valid(ctx context.Context) bool { return true }
+
+// InodeNoDynamicLookup partially implements the Inode interface, specifically
+// the inodeDynamicLookup sub interface. Directory inodes that do not support
+// dynamic entries (i.e. entries that are not "hashed" into the
+// vfs.Dentry.children) can embed this: lookups find nothing, iteration yields
+// nothing, and cached entries are always considered valid.
+type InodeNoDynamicLookup struct{}
+
+// Lookup implements Inode.Lookup. It always fails with ENOENT.
+func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	return nil, syserror.ENOENT
+}
+
+// IterDirents implements Inode.IterDirents. It emits no entries and returns
+// offset unchanged.
+func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	return offset, nil
+}
+
+// Valid implements Inode.Valid. It always reports true.
+func (InodeNoDynamicLookup) Valid(ctx context.Context) bool { return true }
+
+// InodeNotSymlink partially implements the Inode interface, specifically the
+// inodeSymlink sub interface. Inodes that are not symlinks may embed this so
+// that symlink-related functions fail with EINVAL.
+type InodeNotSymlink struct{}
+
+// Readlink implements Inode.Readlink. It always fails with EINVAL.
+func (InodeNotSymlink) Readlink(ctx context.Context) (string, error) {
+	return "", syserror.EINVAL
+}
+
+// Getlink implements Inode.Getlink. It always fails with EINVAL.
+func (InodeNotSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	return vfs.VirtualDentry{}, "", syserror.EINVAL
+}
+
+// InodeAttrs partially implements the Inode interface, specifically the
+// inodeMetadata sub interface. InodeAttrs provides functionality related to
+// inode attributes.
+//
+// Must be initialized by Init prior to first use.
+type InodeAttrs struct {
+	// devMajor and devMinor are set by Init and read without atomics.
+	devMajor, devMinor uint32
+
+	// The remaining fields are read and written with sync/atomic operations
+	// by the methods of InodeAttrs.
+	ino   uint64
+	mode  uint32
+	uid   uint32
+	gid   uint32
+	nlink uint32
+}
+
+// Init initializes this InodeAttrs.
+func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) {
+	if mode.FileType() == 0 {
+		panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode))
+	}
+
+	nlink := uint32(1)
+	if mode.FileType() == linux.ModeDirectory {
+		nlink = 2
+	}
+	a.devMajor = devMajor
+	a.devMinor = devMinor
+	atomic.StoreUint64(&a.ino, ino)
+	atomic.StoreUint32(&a.mode, uint32(mode))
+	atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID))
+	atomic.StoreUint32(&a.gid, uint32(creds.EffectiveKGID))
+	atomic.StoreUint32(&a.nlink, nlink)
+}
+
+// DevMajor returns the device major number passed to Init.
+func (a *InodeAttrs) DevMajor() uint32 { return a.devMajor }
+
+// DevMinor returns the device minor number passed to Init.
+func (a *InodeAttrs) DevMinor() uint32 { return a.devMinor }
+
+// Ino returns the inode id (atomically loaded).
+func (a *InodeAttrs) Ino() uint64 { return atomic.LoadUint64(&a.ino) }
+
+// Mode implements Inode.Mode. The value is atomically loaded and includes
+// both the file type and the permission bits.
+func (a *InodeAttrs) Mode() linux.FileMode {
+	bits := atomic.LoadUint32(&a.mode)
+	return linux.FileMode(bits)
+}
+
+// Stat partially implements Inode.Stat. Note that this function doesn't provide
+// all the stat fields, and the embedder should consider extending the result
+// with filesystem-specific fields.
+func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK
+	stat.DevMajor = a.devMajor
+	stat.DevMinor = a.devMinor
+	stat.Ino = atomic.LoadUint64(&a.ino)
+	stat.Mode = uint16(a.Mode())
+	stat.UID = atomic.LoadUint32(&a.uid)
+	stat.GID = atomic.LoadUint32(&a.gid)
+	stat.Nlink = atomic.LoadUint32(&a.nlink)
+
+	// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
+
+	return stat, nil
+}
+
+// SetStat implements Inode.SetStat.
+//
+// Only STATX_MODE, STATX_UID and STATX_GID may be set; any other mask bit
+// yields EPERM. An empty mask is a no-op. Permission to perform the change is
+// checked with vfs.CheckSetStat against the current mode and owner.
+func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
+		return syserror.EPERM
+	}
+	if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
+		return err
+	}
+
+	stat := opts.Stat
+	if stat.Mask&linux.STATX_MODE != 0 {
+		// Retry until the swap succeeds so that a concurrent mode update is
+		// never overwritten with a value derived from a stale read.
+		for {
+			old := atomic.LoadUint32(&a.mode)
+			// Keep only the immutable file type from the old mode and take
+			// every other bit from the request. OR-ing the request into the
+			// whole old mode would let chmod add bits but never clear them.
+			fileType := old & uint32(linux.S_IFMT)
+			newMode := fileType | uint32(stat.Mode&^uint16(linux.S_IFMT))
+			if atomic.CompareAndSwapUint32(&a.mode, old, newMode) {
+				break
+			}
+		}
+	}
+
+	if stat.Mask&linux.STATX_UID != 0 {
+		atomic.StoreUint32(&a.uid, stat.UID)
+	}
+	if stat.Mask&linux.STATX_GID != 0 {
+		atomic.StoreUint32(&a.gid, stat.GID)
+	}
+
+	// Note that not all fields are modifiable. For example, the file type and
+	// inode numbers are immutable after node creation.
+
+	// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
+	// Also, STATX_SIZE will need some special handling, because read-only static
+	// files should return EIO for truncate operations.
+
+	return nil
+}
+
+// CheckPermissions implements Inode.CheckPermissions by applying
+// vfs.GenericCheckPermissions to the inode's current mode and owner.
+func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	mode := a.Mode()
+	kuid := auth.KUID(atomic.LoadUint32(&a.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&a.gid))
+	return vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid)
+}
+
+// IncLinks implements Inode.IncLinks.
+func (a *InodeAttrs) IncLinks(n uint32) {
+	if atomic.AddUint32(&a.nlink, n) <= n {
+		panic("InodeLink.IncLinks called with no existing links")
+	}
+}
+
+// DecLinks implements Inode.DecLinks.
+func (a *InodeAttrs) DecLinks() {
+	if nlink := atomic.AddUint32(&a.nlink, ^uint32(0)); nlink == ^uint32(0) {
+		// Negative overflow
+		panic("Inode.DecLinks called at 0 links")
+	}
+}
+
+// slot is one entry tracked by OrderedChildren: a child name and the dentry
+// it refers to. The embedded slotEntry links the slot into a slotList.
+type slot struct {
+	Name   string
+	Dentry *vfs.Dentry
+	slotEntry
+}
+
+// OrderedChildrenOptions contains initialization options for OrderedChildren.
+type OrderedChildrenOptions struct {
+	// Writable indicates whether the Inode methods implemented by
+	// OrderedChildren may modify the tracked children. This applies to
+	// Rename, Unlink and RmDir. If an OrderedChildren is not writable, these
+	// operations all fail with EPERM.
+	Writable bool
+}
+
+// OrderedChildren partially implements the Inode interface. OrderedChildren can
+// be embedded in directory inodes to keep track of the children in the
+// directory, and can then be used to implement a generic directory FD -- see
+// GenericDirectoryFD. OrderedChildren is not compatible with dynamic
+// directories.
+//
+// Must be initialized with Init before first use.
+type OrderedChildren struct {
+	refs.AtomicRefCount
+
+	// Can children be modified by user syscalls? If set to false, interface
+	// methods that would modify the children return EPERM. Immutable.
+	writable bool
+
+	// mu protects order and set. order keeps the slots in insertion order;
+	// set indexes the same slots by child name.
+	mu    sync.RWMutex
+	order slotList
+	set   map[string]*slot
+}
+
+// Init initializes an OrderedChildren: it records whether the children may
+// be modified and allocates the empty name index.
+func (o *OrderedChildren) Init(opts OrderedChildrenOptions) {
+	o.set = make(map[string]*slot)
+	o.writable = opts.Writable
+}
+
+// DecRef implements Inode.DecRef. Destroy is run as the destructor of the
+// embedded reference count.
+func (o *OrderedChildren) DecRef(ctx context.Context) {
+	o.AtomicRefCount.DecRefWithDestructor(ctx, o.Destroy)
+}
+
+// Destroy cleans up resources referenced by this OrderedChildren by dropping
+// all tracked slots.
+func (o *OrderedChildren) Destroy(context.Context) {
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	o.set = nil
+	o.order.Reset()
+}
+
+// Populate inserts children into this OrderedChildren, and d's dentry
+// cache. Populate returns the number of directories inserted, which the caller
+// may use to update the link count for the parent directory.
+//
+// Populate panics if inserting any child fails.
+//
+// Precondition: d must represent a directory inode. children must not contain
+// any conflicting entries already in o.
+func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 {
+	var dirs uint32
+	for name, child := range children {
+		if child.isDir() {
+			dirs++
+		}
+		if o.Insert(name, child.VFSDentry()) != nil {
+			panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d))
+		}
+		d.InsertChild(name, child)
+	}
+	return dirs
+}
+
+// HasChildren implements Inode.HasChildren. It reports whether at least one
+// child is currently tracked.
+func (o *OrderedChildren) HasChildren() bool {
+	o.mu.RLock()
+	defer o.mu.RUnlock()
+	return len(o.set) != 0
+}
+
+// Insert inserts child into o under name, failing with EEXIST if the name is
+// already taken. This ignores the writability of o, as this is not part of
+// the vfs.FilesystemImpl interface, and is a lower-level operation.
+func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error {
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	if _, dup := o.set[name]; dup {
+		return syserror.EEXIST
+	}
+	entry := &slot{Name: name, Dentry: child}
+	o.set[name] = entry
+	o.order.PushBack(entry)
+	return nil
+}
+
+// removeLocked drops the slot registered under name, if there is one.
+//
+// Precondition: caller must hold o.mu for writing.
+func (o *OrderedChildren) removeLocked(name string) {
+	s, ok := o.set[name]
+	if !ok {
+		return
+	}
+	delete(o.set, name)
+	o.order.Remove(s)
+}
+
+// Precondition: caller must hold o.mu for writing.
+func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry {
+	if s, ok := o.set[name]; ok {
+		// Existing slot with given name, simply replace the dentry.
+		var old *vfs.Dentry
+		old, s.Dentry = s.Dentry, new
+		return old
+	}
+
+	// No existing slot with given name, create and hash new slot.
+	s := &slot{
+		Name:   name,
+		Dentry: new,
+	}
+	o.order.PushBack(s)
+	o.set[name] = s
+	return nil
+}
+
+// checkExistingLocked verifies that name is tracked and refers to child. It
+// returns ENOENT if name is unknown and panics if name maps to a different
+// dentry.
+//
+// Precondition: caller must hold o.mu for reading or writing.
+func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error {
+	s, ok := o.set[name]
+	switch {
+	case !ok:
+		return syserror.ENOENT
+	case s.Dentry != child:
+		panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child))
+	}
+	return nil
+}
+
+// Unlink implements Inode.Unlink. It fails with EPERM if o is not writable
+// and with ENOENT if name is not tracked.
+func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error {
+	if !o.writable {
+		return syserror.EPERM
+	}
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	err := o.checkExistingLocked(name, child)
+	if err == nil {
+		// TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
+		o.removeLocked(name)
+	}
+	return err
+}
+
+// RmDir implements Inode.RmDir.
+//
+// Checking that child is a directory, that it's empty, and updating any link
+// counts are not this method's responsibility, which leaves exactly the work
+// Unlink does.
+func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error {
+	return o.Unlink(ctx, name, child)
+}
+
+// renameAcrossDifferentImplementationsError is returned by
+// OrderedChildren.Rename when the destination directory's inode is not an
+// *OrderedChildren.
+type renameAcrossDifferentImplementationsError struct{}
+
+// Error implements error.Error.
+func (renameAcrossDifferentImplementationsError) Error() string {
+	const msg = "rename across inodes with different implementations"
+	return msg
+}
+
+// Rename implements Inode.Rename. It moves the entry oldname (which must
+// refer to child) out of o and makes newname in dstDir's OrderedChildren
+// refer to child, returning whichever dentry newname referred to before.
+//
+// Fails with EPERM if either directory is not writable and with ENOENT if
+// oldname is not tracked by o.
+//
+// Precondition: Rename may only be called across two directory inodes with
+// identical implementations of Rename. Practically, this means filesystems that
+// implement Rename by embedding OrderedChildren for any directory
+// implementation must use OrderedChildren for all directory implementations
+// that will support Rename.
+//
+// Postcondition: reference on any replaced dentry transferred to caller.
+func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) {
+	// NOTE(review): this assertion only succeeds when the destination inode's
+	// dynamic type is exactly *OrderedChildren; an inode type that merely
+	// embeds OrderedChildren would take the error path. Confirm against the
+	// filesystems that rely on this method.
+	dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren)
+	if !ok {
+		return nil, renameAcrossDifferentImplementationsError{}
+	}
+	if !o.writable || !dst.writable {
+		return nil, syserror.EPERM
+	}
+	// Note: There's a potential deadlock below if concurrent calls to Rename
+	// refer to the same src and dst directories in reverse. We avoid any
+	// ordering issues because the caller is required to serialize concurrent
+	// calls to Rename in accordance with the interface declaration.
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	if dst != o {
+		dst.mu.Lock()
+		defer dst.mu.Unlock()
+	}
+	if err := o.checkExistingLocked(oldname, child); err != nil {
+		return nil, err
+	}
+
+	// TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
+	replaced := dst.replaceChildLocked(newname, child)
+	// The child now lives under newname in dst, so drop the source slot;
+	// otherwise o would keep listing (and counting) it under oldname. A rename
+	// onto its own name within the same directory must keep the slot that
+	// replaceChildLocked just reused.
+	if dst != o || oldname != newname {
+		o.removeLocked(oldname)
+	}
+	return replaced, nil
+}
+
+// nthLocked returns an iterator to the nth child tracked by this object. The
+// iterator is valid until the caller releases o.mu. Returns nil if the
+// requested index falls out of bounds (including negative indices).
+//
+// Precondition: Caller must hold o.mu for reading.
+func (o *OrderedChildren) nthLocked(i int64) *slot {
+	if i < 0 {
+		return nil
+	}
+	// Step forward i times, stopping early if the list runs out.
+	it := o.order.Front()
+	for ; it != nil && i > 0; i-- {
+		it = it.Next()
+	}
+	return it
+}
+
+// InodeSymlink partially implements Inode interface for symlinks. It embeds
+// InodeNotDirectory and refuses to be opened.
+type InodeSymlink struct {
+	InodeNotDirectory
+}
+
+// Open implements Inode.Open. It always fails with ELOOP.
+func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (fd *vfs.FileDescription, err error) {
+	return nil, syserror.ELOOP
+}
+
+// StaticDirectory is a standard implementation of a directory with static
+// contents. It is assembled from the kernfs mixins: no symlink behavior, no
+// creation of new children, generic attributes, no dynamic lookup, and an
+// OrderedChildren to track its entries.
+//
+// +stateify savable
+type StaticDirectory struct {
+	InodeNotSymlink
+	InodeDirectoryNoNewChildren
+	InodeAttrs
+	InodeNoDynamicLookup
+	OrderedChildren
+
+	// locks is handed to every directory FD created by Open.
+	locks vfs.FileLocks
+}
+
+// Compile-time check that *StaticDirectory satisfies Inode.
+var _ Inode = (*StaticDirectory)(nil)
+
+// NewStaticDir creates a new static directory and returns its dentry. The
+// directory is populated with children, and its link count is raised by the
+// number of child directories. The OrderedChildren is initialized with
+// default options, i.e. not writable.
+func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry {
+	dir := &StaticDirectory{}
+	dir.Init(creds, devMajor, devMinor, ino, perm)
+
+	d := &Dentry{}
+	d.Init(dir)
+
+	dir.OrderedChildren.Init(OrderedChildrenOptions{})
+	dir.IncLinks(dir.OrderedChildren.Populate(d, children))
+
+	return d
+}
+
+// Init initializes StaticDirectory. perm must contain permission bits only;
+// the directory file type is added here. Init panics if perm has any bit set
+// outside linux.PermissionsMask.
+func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+	if perm&^linux.PermissionsMask != 0 {
+		// Report the value as passed in: masking it with PermissionsMask
+		// would hide exactly the bits that triggered this panic.
+		panic(fmt.Sprintf("Only permission mask must be set: %x", perm))
+	}
+	s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
+}
+
+// Open implements kernfs.Inode. It returns a generic directory FD backed by
+// this directory's OrderedChildren.
+func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	dirFD, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts)
+	if err == nil {
+		return dirFD.VFSFileDescription(), nil
+	}
+	return nil, err
+}
+
+// SetStat implements Inode.SetStat. Attributes of a static directory cannot
+// be changed, so it always fails with EPERM.
+func (*StaticDirectory) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+// AlwaysValid partially implements kernfs.inodeDynamicLookup for inodes
+// whose cached entries are always considered valid.
+type AlwaysValid struct{}
+
+// Valid implements kernfs.inodeDynamicLookup. It always reports true.
+func (*AlwaysValid) Valid(ctx context.Context) bool { return true }
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
new file mode 100644
index 000000000..080118841
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -0,0 +1,456 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kernfs provides the tools to implement inode-based filesystems.
+// Kernfs has two main features:
+//
+// 1. The Inode interface, which maps VFS2's path-based filesystem operations to
+//    specific filesystem nodes. Kernfs uses the Inode interface to provide a
+//    blanket implementation for the vfs.FilesystemImpl. Kernfs also serves as
+//    the synchronization mechanism for all filesystem operations by holding a
+//    filesystem-wide lock across all operations.
+//
+// 2. Various utility types which provide generic implementations for various
+//    parts of the Inode and vfs.FileDescription interfaces. Client filesystems
+//    based on kernfs can embed the appropriate set of these to avoid having to
+//    reimplement common filesystem operations. See inode_impl_util.go and
+//    fd_impl_util.go.
+//
+// Reference Model:
+//
+// Kernfs dentries represent named pointers to inodes. Dentries and inodes have
+// independent lifetimes and reference counts. A child dentry unconditionally
+// holds a reference on its parent directory's dentry. A dentry also holds a
+// reference on the inode it points to. Multiple dentries can point to the same
+// inode (for example, in the case of hardlinks). File descriptors hold a
+// reference to the dentry they're opened on.
+//
+// Dentries are guaranteed to exist while holding Filesystem.mu for
+// reading. Dropping dentries requires holding Filesystem.mu for writing. To
+// queue dentries for destruction from a read critical section, see
+// Filesystem.deferDecRef.
+//
+// Lock ordering:
+//
+// kernfs.Filesystem.mu
+//   kernfs.Dentry.dirMu
+//     vfs.VirtualFilesystem.mountMu
+//       vfs.Dentry.mu
+//   kernfs.Filesystem.droppedDentriesMu
+//   (inode implementation locks, if any)
+package kernfs
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
+// filesystem. Concrete implementations are expected to embed this in their own
+// Filesystem type.
+type Filesystem struct {
+	vfsfs vfs.Filesystem
+
+	droppedDentriesMu sync.Mutex
+
+	// droppedDentries is a list of dentries waiting to be DecRef()ed. This is
+	// used to defer dentry destruction until mu can be acquired for
+	// writing. Protected by droppedDentriesMu.
+	droppedDentries []*vfs.Dentry
+
+	// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
+	// for reading guarantees continued existence of any resolved dentries, but
+	// the dentry tree may be modified.
+	//
+	// Kernfs dentries can only be DecRef()ed while holding mu for writing. For
+	// example:
+	//
+	//   fs.mu.Lock()
+	//   defer fs.mu.Unlock()
+	//   ...
+	//   dentry1.DecRef(ctx)
+	//   defer dentry2.DecRef(ctx) // Ok, will run before Unlock.
+	//
+	// If discarding dentries in a read context, use Filesystem.deferDecRef. For
+	// example:
+	//
+	//   fs.mu.RLock()
+	//   defer fs.processDeferredDecRefs(ctx) // Runs after RUnlock; takes mu for writing.
+	//   defer fs.mu.RUnlock()
+	//   ...
+	//   fs.deferDecRef(dentry)
+	mu sync.RWMutex
+
+	// nextInoMinusOne is used to allocate inode numbers on this
+	// filesystem. Must be accessed by atomic operations.
+	nextInoMinusOne uint64
+}
+
+// deferDecRef defers dropping a dentry ref until the next call to
+// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
+//
+// Precondition: d must not already be pending destruction.
+func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
+	fs.droppedDentriesMu.Lock()
+	fs.droppedDentries = append(fs.droppedDentries, d)
+	fs.droppedDentriesMu.Unlock()
+}
+
+// processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the
+// droppedDentries list. See comment on Filesystem.mu.
+func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) {
+	fs.mu.Lock()
+	fs.processDeferredDecRefsLocked(ctx)
+	fs.mu.Unlock()
+}
+
+// Precondition: fs.mu must be held for writing.
+func (fs *Filesystem) processDeferredDecRefsLocked(ctx context.Context) {
+	fs.droppedDentriesMu.Lock()
+	for _, d := range fs.droppedDentries {
+		d.DecRef(ctx)
+	}
+	fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse.
+	fs.droppedDentriesMu.Unlock()
+}
+
+// VFSFilesystem returns the generic vfs filesystem object.
+func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem {
+	return &fs.vfsfs
+}
+
+// NextIno allocates a new inode number on this filesystem.
+func (fs *Filesystem) NextIno() uint64 {
+	return atomic.AddUint64(&fs.nextInoMinusOne, 1)
+}
+
+// These consts are used in the Dentry.flags field.
+const (
+	// Dentry points to a directory inode.
+	dflagsIsDir = 1 << iota
+
+	// Dentry points to a symlink inode.
+	dflagsIsSymlink
+)
+
+// Dentry implements vfs.DentryImpl.
+//
+// A kernfs dentry is similar to a dentry in a traditional filesystem: it's a
+// named reference to an inode. A dentry generally lives as long as it's part of
+// a mounted filesystem tree. Kernfs doesn't cache dentries once all references
+// to them are removed. Dentries hold a single reference to the inode they point
+// to, and child dentries hold a reference on their parent.
+//
+// Must be initialized by Init prior to first use.
+type Dentry struct {
+	vfsd vfs.Dentry
+
+	refs.AtomicRefCount
+
+	// flags caches useful information about the dentry from the inode. See the
+	// dflags* consts above. Must be accessed by atomic ops.
+	flags uint32
+
+	parent *Dentry
+	name   string
+
+	// dirMu protects children and the names of child Dentries.
+	dirMu    sync.Mutex
+	children map[string]*Dentry
+
+	inode Inode
+}
+
+// Init initializes this dentry.
+//
+// Precondition: Caller must hold a reference on inode.
+//
+// Postcondition: Caller's reference on inode is transferred to the dentry.
+func (d *Dentry) Init(inode Inode) {
+	d.vfsd.Init(d)
+	d.inode = inode
+	ftype := inode.Mode().FileType()
+	if ftype == linux.ModeDirectory {
+		d.flags |= dflagsIsDir
+	}
+	if ftype == linux.ModeSymlink {
+		d.flags |= dflagsIsSymlink
+	}
+}
+
+// VFSDentry returns the generic vfs dentry for this kernfs dentry.
+func (d *Dentry) VFSDentry() *vfs.Dentry {
+	return &d.vfsd
+}
+
+// isDir checks whether the dentry points to a directory inode.
+func (d *Dentry) isDir() bool {
+	return atomic.LoadUint32(&d.flags)&dflagsIsDir != 0
+}
+
+// isSymlink checks whether the dentry points to a symlink inode.
+func (d *Dentry) isSymlink() bool {
+	return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *Dentry) DecRef(ctx context.Context) {
+	d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy)
+}
+
+// Precondition: Dentry must be removed from VFS' dentry cache.
+func (d *Dentry) destroy(ctx context.Context) {
+	d.inode.DecRef(ctx) // IncRef from Init.
+	d.inode = nil
+	if d.parent != nil {
+		d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild.
+	}
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// Although Linux technically supports inotify on pseudo filesystems (inotify
+// is implemented at the vfs layer), it is not particularly useful. It is left
+// unimplemented until someone actually needs it.
+func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *Dentry) Watches() *vfs.Watches {
+	return nil
+}
+
+// OnZeroWatches implements vfs.Dentry.OnZeroWatches.
+func (d *Dentry) OnZeroWatches(context.Context) {}
+
+// InsertChild inserts child into the vfs dentry cache with the given name under
+// this dentry. This does not update the directory inode, so calling this on
+// its own isn't sufficient to insert a child into a directory. InsertChild
+// updates the link count on d if required.
+//
+// Precondition: d must represent a directory inode.
+func (d *Dentry) InsertChild(name string, child *Dentry) {
+	d.dirMu.Lock()
+	d.insertChildLocked(name, child)
+	d.dirMu.Unlock()
+}
+
+// insertChildLocked is equivalent to InsertChild, with additional
+// preconditions.
+//
+// Precondition: d.dirMu must be locked.
+func (d *Dentry) insertChildLocked(name string, child *Dentry) {
+	if !d.isDir() {
+		panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
+	}
+	d.IncRef() // DecRef in child's Dentry.destroy.
+	child.parent = d
+	child.name = name
+	if d.children == nil {
+		d.children = make(map[string]*Dentry)
+	}
+	d.children[name] = child
+}
+
+// Inode returns the dentry's inode.
+func (d *Dentry) Inode() Inode {
+	return d.inode
+}
+
+// The Inode interface maps filesystem-level operations that operate on paths to
+// equivalent operations on specific filesystem nodes.
+//
+// The interface methods are grouped into logical categories as sub interfaces
+// below. Generally, an implementation for each sub interface can be provided by
+// embedding an appropriate type from inode_impl_util.go. The sub interfaces
+// are purely organizational. Methods declared directly in the main interface
+// have no generic implementations, and should be explicitly provided by the
+// client filesystem.
+//
+// Generally, implementations are not responsible for tasks that are common to
+// all filesystems. These include:
+//
+// - Checking that dentries passed to methods are of the appropriate file type.
+// - Checking permissions.
+// - Updating link and reference counts.
+//
+// Specific responsibilities of implementations are documented below.
+type Inode interface {
+	// Methods related to reference counting. A generic implementation is
+	// provided by InodeNoopRefCount. These methods are generally called by the
+	// equivalent Dentry methods.
+	inodeRefs
+
+	// Methods related to node metadata. A generic implementation is provided by
+	// InodeAttrs.
+	inodeMetadata
+
+	// Methods for inodes that represent symlinks. InodeNotSymlink provides a
+	// blanket implementation for all non-symlink inodes.
+	inodeSymlink
+
+	// Methods for inodes that represent directories. InodeNotDirectory provides
+	// a blanket implementation for all non-directory inodes.
+	inodeDirectory
+
+	// Methods for inodes that represent dynamic directories and their
+	// children. InodeNoDynamicLookup provides a blanket implementation for all
+	// non-dynamic-directory inodes.
+	inodeDynamicLookup
+
+	// Open creates a file description for the filesystem object represented by
+	// this inode. The returned file description should hold a reference on the
+	// inode for its lifetime.
+	//
+	// Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing
+	// the inode on which Open() is being called.
+	Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
+}
+
+type inodeRefs interface {
+	IncRef()
+	DecRef(ctx context.Context)
+	TryIncRef() bool
+	// Destroy is called when the inode reaches zero references. Destroy releases
+	// all resources (references) on objects referenced by the inode, including
+	// any child dentries.
+	Destroy(ctx context.Context)
+}
+
+type inodeMetadata interface {
+	// CheckPermissions checks that creds may access this inode for the
+	// requested access type, per the rules of
+	// fs/namei.c:generic_permission().
+	CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error
+
+	// Mode returns the (struct stat)::st_mode value for this inode. This is
+	// separated from Stat for performance.
+	Mode() linux.FileMode
+
+	// Stat returns the metadata for this inode. This corresponds to
+	// vfs.FilesystemImpl.StatAt.
+	Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)
+
+	// SetStat updates the metadata for this inode. This corresponds to
+	// vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking
+	// if the operation can be performed (see vfs.CheckSetStat() for common
+	// checks).
+	SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error
+}
+
+// Precondition: All methods in this interface may only be called on directory
+// inodes.
+type inodeDirectory interface {
+	// The New{File,Dir,Node,Symlink} methods below should return a new inode
+	// hashed into this inode.
+	//
+	// These inode constructors are inode-level operations rather than
+	// filesystem-level operations to allow client filesystems to mix different
+	// implementations based on the new node's location in the
+	// filesystem.
+
+	// HasChildren returns true if the directory inode has any children.
+	HasChildren() bool
+
+	// NewFile creates a new regular file inode.
+	NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error)
+
+	// NewDir creates a new directory inode.
+	NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error)
+
+	// NewLink creates a new hardlink to a specified inode in this
+	// directory. Implementations should create a new kernfs Dentry pointing to
+	// target, and update target's link count.
+	NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error)
+
+	// NewSymlink creates a new symbolic link inode.
+	NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error)
+
+	// NewNode creates a new filesystem node for a mknod syscall.
+	NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error)
+
+	// Unlink removes a child dentry from this directory inode.
+	Unlink(ctx context.Context, name string, child *vfs.Dentry) error
+
+	// RmDir removes an empty child directory from this directory
+	// inode. Implementations must update the parent directory's link count,
+	// if required. Implementations are not responsible for checking that child
+	// is a directory or that it is empty.
+	RmDir(ctx context.Context, name string, child *vfs.Dentry) error
+
+	// Rename is called on the source directory containing an inode being
+	// renamed. child should point to the resolved child in the source
+	// directory. If Rename replaces a dentry in the destination directory, it
+	// should return the replaced dentry or nil otherwise.
+	//
+	// Precondition: Caller must serialize concurrent calls to Rename.
+	Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error)
+}
+
+type inodeDynamicLookup interface {
+	// Lookup should return an appropriate dentry if name should resolve to a
+	// child of this dynamic directory inode. This gives the directory an
+	// opportunity on every lookup to resolve additional entries that aren't
+	// hashed into the directory. This is only called when the inode is a
+	// directory. If the inode is not a directory, or if the directory only
+	// contains a static set of children, the implementer can unconditionally
+	// return an appropriate error (ENOTDIR and ENOENT respectively).
+	//
+	// The child returned by Lookup will be hashed into the VFS dentry tree. Its
+	// lifetime can be controlled by the filesystem implementation with an
+	// appropriate implementation of Valid.
+	//
+	// Lookup returns the child with an extra reference and the caller owns this
+	// reference.
+	Lookup(ctx context.Context, name string) (*vfs.Dentry, error)
+
+	// Valid should return true if this inode is still valid, or false if it
+	// needs to be resolved again by a call to Lookup.
+	Valid(ctx context.Context) bool
+
+	// IterDirents is used to iterate over dynamically created entries. It invokes
+	// callback on each entry in the directory represented by the FileDescription.
+	// 'offset' is the offset for the entire IterDirents call, which may include
+	// results from the caller (e.g. "." and ".."). 'relOffset' is the offset
+	// inside the entries returned by this IterDirents invocation. In other words,
+	// 'offset' should be used to calculate each vfs.Dirent.NextOff as well as
+	// the return value, while 'relOffset' is the place to start iteration.
+	IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
+}
+
+type inodeSymlink interface {
+	// Readlink returns the target of a symbolic link. If an inode is not a
+	// symlink, the implementation should return EINVAL.
+	Readlink(ctx context.Context) (string, error)
+
+	// Getlink returns the target of a symbolic link, as used by path
+	// resolution:
+	//
+	// - If the inode is a "magic link" (a link whose target is most accurately
+	// represented as a VirtualDentry), Getlink returns (ok VirtualDentry, "",
+	// nil). A reference is taken on the returned VirtualDentry.
+	//
+	// - If the inode is an ordinary symlink, Getlink returns (zero-value
+	// VirtualDentry, symlink target, nil).
+	//
+	// - If the inode is not a symlink, Getlink returns (zero-value
+	// VirtualDentry, "", EINVAL).
+	Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
new file mode 100644
index 000000000..c5d5afedf
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -0,0 +1,330 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs_test
+
+import (
+	"bytes"
+	"fmt"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const defaultMode linux.FileMode = 01777
+const staticFileContent = "This is sample content for a static test file."
+
+// RootDentryFn is a generator function for creating the root dentry of a test
+// filesystem. See newTestSystem.
+type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry
+
+// newTestSystem sets up a minimal environment for running a test, including an
+// instance of a test filesystem. Tests can control the contents of the
+// filesystem by providing an appropriate rootFn, which should return a
+// pre-populated root dentry.
+func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System {
+	ctx := contexttest.Context(t)
+	creds := auth.CredentialsFromContext(ctx)
+	v := &vfs.VirtualFilesystem{}
+	if err := v.Init(ctx); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
+	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("Failed to create testfs root mount: %v", err)
+	}
+	return testutil.NewSystem(ctx, t, v, mns)
+}
+
+type fsType struct {
+	rootFn RootDentryFn
+}
+
+type filesystem struct {
+	kernfs.Filesystem
+}
+
+type file struct {
+	kernfs.DynamicBytesFile
+	content string
+}
+
+func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry {
+	f := &file{}
+	f.content = content
+	f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777)
+
+	d := &kernfs.Dentry{}
+	d.Init(f)
+	return d
+}
+
+func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%s", f.content)
+	return nil
+}
+
+type attrs struct {
+	kernfs.InodeAttrs
+}
+
+func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+type readonlyDir struct {
+	attrs
+	kernfs.InodeNotSymlink
+	kernfs.InodeNoDynamicLookup
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.OrderedChildren
+
+	locks vfs.FileLocks
+
+	dentry kernfs.Dentry
+}
+
+func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+	dir := &readonlyDir{}
+	dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
+	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	dir.dentry.Init(dir)
+
+	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
+
+	return &dir.dentry
+}
+
+func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+type dir struct {
+	attrs
+	kernfs.InodeNotSymlink
+	kernfs.InodeNoDynamicLookup
+	kernfs.OrderedChildren
+
+	locks vfs.FileLocks
+
+	fs     *filesystem
+	dentry kernfs.Dentry
+}
+
+func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+	dir := &dir{}
+	dir.fs = fs
+	dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
+	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
+	dir.dentry.Init(dir)
+
+	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
+
+	return &dir.dentry
+}
+
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+	creds := auth.CredentialsFromContext(ctx)
+	dir := d.fs.newDir(creds, opts.Mode, nil)
+	dirVFSD := dir.VFSDentry()
+	if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil {
+		dir.DecRef(ctx)
+		return nil, err
+	}
+	d.IncLinks(1)
+	return dirVFSD, nil
+}
+
+func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+	creds := auth.CredentialsFromContext(ctx)
+	f := d.fs.newFile(creds, "")
+	fVFSD := f.VFSDentry()
+	if err := d.OrderedChildren.Insert(name, fVFSD); err != nil {
+		f.DecRef(ctx)
+		return nil, err
+	}
+	return fVFSD, nil
+}
+
+func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+func (fsType) Name() string {
+	return "kernfs"
+}
+
+func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	fs := &filesystem{}
+	fs.VFSFilesystem().Init(vfsObj, &fst, fs)
+	root := fst.rootFn(creds, fs)
+	return fs.VFSFilesystem(), root.VFSDentry(), nil
+}
+
+// -------------------- Remainder of the file are test cases --------------------
+
+func TestBasic(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"file1": fs.newFile(creds, staticFileContent),
+		})
+	})
+	defer sys.Destroy()
+	sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef(sys.Ctx)
+}
+
+func TestMkdirGetDentry(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"dir1": fs.newDir(creds, 0755, nil),
+		})
+	})
+	defer sys.Destroy()
+
+	pop := sys.PathOpAtRoot("dir1/a new directory")
+	if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, pop, &vfs.MkdirOptions{Mode: 0755}); err != nil {
+		t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err)
+	}
+	sys.GetDentryOrDie(pop).DecRef(sys.Ctx)
+}
+
+func TestReadStaticFile(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"file1": fs.newFile(creds, staticFileContent),
+		})
+	})
+	defer sys.Destroy()
+
+	pop := sys.PathOpAtRoot("file1")
+	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+	}
+	defer fd.DecRef(sys.Ctx)
+
+	content, err := sys.ReadToEnd(fd)
+	if err != nil {
+		t.Fatalf("Read failed: %v", err)
+	}
+	if diff := cmp.Diff(staticFileContent, content); diff != "" {
+		t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff)
+	}
+}
+
+func TestCreateNewFileInStaticDir(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"dir1": fs.newDir(creds, 0755, nil),
+		})
+	})
+	defer sys.Destroy()
+
+	pop := sys.PathOpAtRoot("dir1/newfile")
+	opts := &vfs.OpenOptions{Flags: linux.O_CREAT | linux.O_EXCL, Mode: defaultMode}
+	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, opts)
+	if err != nil {
+		t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err)
+	}
+
+	// Close the file. The file should persist.
+	fd.DecRef(sys.Ctx)
+
+	fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
+	}
+	fd.DecRef(sys.Ctx)
+}
+
+func TestDirFDReadWrite(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, nil)
+	})
+	defer sys.Destroy()
+
+	pop := sys.PathOpAtRoot("/")
+	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+	}
+	defer fd.DecRef(sys.Ctx)
+
+	// Read/Write should fail for directory FDs.
+	if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR {
+		t.Fatalf("Read for directory FD failed with unexpected error: %v", err)
+	}
+	if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EBADF {
+		t.Fatalf("Write for directory FD failed with unexpected error: %v", err)
+	}
+}
+
+func TestDirFDIterDirents(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			// Fill root with nodes backed by various inode implementations.
+			"dir1": fs.newReadonlyDir(creds, 0755, nil),
+			"dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{
+				"dir3": fs.newDir(creds, 0755, nil),
+			}),
+			"file1": fs.newFile(creds, staticFileContent),
+		})
+	})
+	defer sys.Destroy()
+
+	pop := sys.PathOpAtRoot("/")
+	sys.AssertAllDirentTypes(sys.ListDirents(pop), map[string]testutil.DirentType{
+		"dir1":  linux.DT_DIR,
+		"dir2":  linux.DT_DIR,
+		"file1": linux.DT_REG,
+	})
+}
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
new file mode 100644
index 000000000..2ab3f53fd
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// StaticSymlink provides an Inode implementation for symlinks that point to
+// an immutable target.
+type StaticSymlink struct {
+	InodeAttrs
+	InodeNoopRefCount
+	InodeSymlink
+
+	target string
+}
+
+var _ Inode = (*StaticSymlink)(nil)
+
+// NewStaticSymlink creates a new symlink file pointing to 'target'.
+func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry {
+	inode := &StaticSymlink{}
+	inode.Init(creds, devMajor, devMinor, ino, target)
+
+	d := &Dentry{}
+	d.Init(inode)
+	return d
+}
+
+// Init initializes the instance.
+func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) {
+	s.target = target
+	s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777)
+}
+
+// Readlink implements Inode.
+func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
+	return s.target, nil
+}
+
+// Getlink implements Inode.Getlink.
+func (s *StaticSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	return vfs.VirtualDentry{}, s.target, nil
+}
+
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
author	Ian Lewis <ianmlewis@gmail.com>	2020-08-17 21:44:31 -0400
committer	Ian Lewis <ianmlewis@gmail.com>	2020-08-17 21:44:31 -0400
commit	ac324f646ee3cb7955b0b45a7453aeb9671cbdf1 (patch)
tree	0cbc5018e8807421d701d190dc20525726c7ca76 /pkg/sentry/fsimpl/kernfs
parent	352ae1022ce19de28fc72e034cc469872ad79d06 (diff)
parent	6d0c5803d557d453f15ac6f683697eeb46dab680 (diff)