Sentry virtual filesystem, v2

Major differences from the current ("v1") sentry VFS:

- Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems).

- Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces.

Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking:

BenchmarkVFS1TmpfsStat/1-6            3000000      497 ns/op
BenchmarkVFS1TmpfsStat/2-6            2000000      676 ns/op
BenchmarkVFS1TmpfsStat/3-6            2000000      904 ns/op
BenchmarkVFS1TmpfsStat/8-6            1000000     1944 ns/op
BenchmarkVFS1TmpfsStat/64-6            100000    14067 ns/op
BenchmarkVFS1TmpfsStat/100-6            50000    21700 ns/op
BenchmarkVFS2MemfsStat/1-6           10000000      197 ns/op
BenchmarkVFS2MemfsStat/2-6            5000000      233 ns/op
BenchmarkVFS2MemfsStat/3-6            5000000      268 ns/op
BenchmarkVFS2MemfsStat/8-6            3000000      477 ns/op
BenchmarkVFS2MemfsStat/64-6            500000     2592 ns/op
BenchmarkVFS2MemfsStat/100-6           300000     4045 ns/op
BenchmarkVFS1TmpfsMountStat/1-6       2000000      679 ns/op
BenchmarkVFS1TmpfsMountStat/2-6       2000000      912 ns/op
BenchmarkVFS1TmpfsMountStat/3-6       1000000     1113 ns/op
BenchmarkVFS1TmpfsMountStat/8-6       1000000     2118 ns/op
BenchmarkVFS1TmpfsMountStat/64-6       100000    14251 ns/op
BenchmarkVFS1TmpfsMountStat/100-6      100000    22397 ns/op
BenchmarkVFS2MemfsMountStat/1-6       5000000      317 ns/op
BenchmarkVFS2MemfsMountStat/2-6       5000000      361 ns/op
BenchmarkVFS2MemfsMountStat/3-6       5000000      387 ns/op
BenchmarkVFS2MemfsMountStat/8-6       3000000      582 ns/op
BenchmarkVFS2MemfsMountStat/64-6       500000     2699 ns/op
BenchmarkVFS2MemfsMountStat/100-6      300000     4133 ns/op

From this we can infer that, on this machine:

- Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1.

- Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x.

- The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces.

PiperOrigin-RevId: 258853946
author: Jamie Liu <jamieliu@google.com> 2019-07-18 15:09:14 -0700
committer: gVisor bot <gvisor-bot@google.com> 2019-07-18 15:10:29 -0700
commit: 163ab5e9bab4f14923433967656d20f169d0f904 (patch)
tree: 5e51b1573e48fe87fe0e277a32f13c78b0c2f058 /pkg/sentry/fsimpl/memfs
parent: 6f7e2bb388cb29a355dece8921671c0085f53ea9 (diff)
7 files changed, 1729 insertions, 0 deletions
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
new file mode 100644
index 000000000..d5d4f68df
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -0,0 +1,55 @@
+# Build rules for the memfs package and its stat benchmarks.
+load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+# Instantiates the //pkg/ilist:generic_list template as dentry_list.go, using
+# *Dentry as both the Element and the Linker type and "dentry" as the
+# identifier prefix.
+go_template_instance(
+    name = "dentry_list",
+    out = "dentry_list.go",
+    package = "memfs",
+    prefix = "dentry",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*Dentry",
+        "Linker": "*Dentry",
+    },
+)
+
+# The memfs library itself; dentry_list.go is the output of the template
+# instance above.
+go_library(
+    name = "memfs",
+    srcs = [
+        "dentry_list.go",
+        "directory.go",
+        "filesystem.go",
+        "memfs.go",
+        "regular_file.go",
+        "symlink.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/context",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
+
+# Benchmarks comparing stat() on VFS1 tmpfs (//pkg/sentry/fs/tmpfs) against
+# VFS2 memfs; this is why both the fs and vfs packages are dependencies.
+go_test(
+    name = "benchmark_test",
+    size = "small",
+    srcs = ["benchmark_test.go"],
+    deps = [
+        ":memfs",
+        "//pkg/abi/linux",
+        "//pkg/sentry/context",
+        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
new file mode 100644
index 000000000..a94b17db6
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go
@@ -0,0 +1,464 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package benchmark_test
+
+import (
+	"fmt"
+	"runtime"
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Differences from stat_benchmark:
+//
+// - Syscall interception, CopyInPath, copyOutStat, and overlayfs overheads are
+// not included.
+//
+// - *MountStat benchmarks use a tmpfs root mount and a tmpfs submount at /tmp.
+// Non-MountStat benchmarks use a tmpfs root mount and no submounts.
+// stat_benchmark uses a varying root mount, a tmpfs submount at /tmp, and a
+// subdirectory /tmp/<top_dir> (assuming TEST_TMPDIR == "/tmp"). Thus
+// stat_benchmark at depth 1 does a comparable amount of work to *MountStat
+// benchmarks at depth 2, and non-MountStat benchmarks at depth 3.
+//
+// depths lists the directory nesting depths at which every benchmark in this
+// file is run; each depth becomes one sub-benchmark named after the depth.
+var depths = []int{1, 2, 3, 8, 64, 100}
+
+const (
+	// mountPointName is the name of the directory created under the root on
+	// which the *MountStat benchmarks mount their submount.
+	mountPointName = "tmp"
+	// filename is the name of the regular file that each benchmark creates
+	// in the deepest directory and then repeatedly stats.
+	filename       = "gvisor_test_temp_0_1557494568"
+)
+
+// fileOpOn looks up path in mntns and invokes fn on the resulting Dirent,
+// dropping the reference obtained by the lookup once fn has returned.
+//
+// Absolute paths are resolved from root. Relative paths are resolved from wd,
+// and only when dirFD is linux.AT_FDCWD; any other dirFD yields EBADF because
+// this stripped-down copy has no FD table to consult. If resolve is true the
+// lookup uses FindInode, otherwise FindLink.
+//
+// This is copied from syscalls/linux/sys_file.go, with the dependency on
+// kernel.Task stripped out.
+func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error {
+	// base is the directory that a relative path is resolved against.
+	var base *fs.Dirent
+	switch {
+	case len(path) > 0 && path[0] == '/':
+		// Absolute path; base can stay nil.
+	case dirFD == linux.AT_FDCWD:
+		// Relative to the working directory.
+		base = wd
+	default:
+		// Relative to some other FD, which is not supported here.
+		return syserror.EBADF
+	}
+
+	// Look up the target.
+	var (
+		target *fs.Dirent
+		err    error
+	)
+	traversalsLeft := uint(linux.MaxSymlinkTraversals)
+	if !resolve {
+		target, err = mntns.FindLink(ctx, root, base, path, &traversalsLeft)
+	} else {
+		target, err = mntns.FindInode(ctx, root, base, path, &traversalsLeft)
+	}
+	if err != nil {
+		return err
+	}
+
+	// Run the callback, then release the lookup's reference.
+	result := fn(root, target)
+	target.DecRef()
+	return result
+}
+
+// BenchmarkVFS1TmpfsStat measures the cost of stat-ing a file on a VFS1 tmpfs
+// root mount, with the file nested under each of the directory depths listed
+// in depths. Only the lookup and attribute fetch are timed; filesystem setup
+// happens before the timer is reset.
+func BenchmarkVFS1TmpfsStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+
+			// Create VFS.
+			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
+			if !ok {
+				b.Fatalf("failed to find tmpfs filesystem type")
+			}
+			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			mntns, err := fs.NewMountNamespace(ctx, rootInode)
+			if err != nil {
+				b.Fatalf("failed to create mount namespace: %v", err)
+			}
+			defer mntns.DecRef()
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			root := mntns.Root()
+			defer root.DecRef()
+			d := root
+			d.IncRef()
+			// d is reassigned as we descend, so the release must read d when
+			// the deferred function runs. A plain "defer d.DecRef()" binds
+			// the receiver immediately, which would release root a second
+			// time and leak the reference on the deepest directory.
+			defer func() { d.DecRef() }()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				next, err := d.Walk(ctx, root, name)
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				// Trade the reference on the current directory for the one
+				// Walk returned on its child.
+				d.DecRef()
+				d = next
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			file.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			dirPath := false
+			// Collect setup garbage before timing starts.
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+						return syserror.ENOTDIR
+					}
+					uattr, err := d.Inode.UnstableAttr(ctx)
+					if err != nil {
+						return err
+					}
+					// Sanity check: the file was created with mode 0644, so
+					// the owner-execute bit must be clear.
+					if uattr.Perms.User.Execute {
+						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
+					}
+					return nil
+				})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkVFS2MemfsStat measures the cost of vfs StatAt on a file in a VFS2
+// memfs root mount, with the file nested under each of the directory depths
+// listed in depths. Filesystem setup happens before the timer is reset.
+func BenchmarkVFS2MemfsStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+			creds := auth.CredentialsFromContext(ctx)
+
+			// Create VFS.
+			vfsObj := vfs.New()
+			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			root := mntns.Root()
+			defer root.DecRef()
+			vd := root
+			vd.IncRef()
+			// vd is reassigned as we descend, so the release must read vd
+			// when the deferred function runs. A plain "defer vd.DecRef()"
+			// evaluates its receiver immediately, which would release the
+			// root a second time and leak the reference on the deepest
+			// directory.
+			defer func() { vd.DecRef() }()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				pop := vfs.PathOperation{
+					Root:     root,
+					Start:    vd,
+					Pathname: name,
+				}
+				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+					Mode: 0755,
+				}); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				// Trade the reference on the current directory for the one
+				// GetDentryAt returned on its child.
+				vd.DecRef()
+				vd = nextVD
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+				Root:               root,
+				Start:              vd,
+				Pathname:           filename,
+				FollowFinalSymlink: true,
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+				Mode:  0644,
+			})
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			defer fd.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			// Collect setup garbage before timing starts.
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+					Root:               root,
+					Start:              root,
+					Pathname:           filePath,
+					FollowFinalSymlink: true,
+				}, &vfs.StatOptions{})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+				// Sanity check: with the file-type bits masked off, the mode
+				// must be the 0644 the file was created with.
+				if stat.Mode&^linux.S_IFMT != 0644 {
+					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkVFS1TmpfsMountStat is BenchmarkVFS1TmpfsStat with a second tmpfs
+// mounted at /<mountPointName>; the nested directories and the stat'd file
+// are created beneath that mount point, so every stat crosses one mount
+// boundary.
+func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+
+			// Create VFS.
+			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
+			if !ok {
+				b.Fatalf("failed to find tmpfs filesystem type")
+			}
+			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			mntns, err := fs.NewMountNamespace(ctx, rootInode)
+			if err != nil {
+				b.Fatalf("failed to create mount namespace: %v", err)
+			}
+			defer mntns.DecRef()
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create and mount the submount.
+			root := mntns.Root()
+			defer root.DecRef()
+			if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil {
+				b.Fatalf("failed to create mount point: %v", err)
+			}
+			mountPoint, err := root.Walk(ctx, root, mountPointName)
+			if err != nil {
+				b.Fatalf("failed to walk to mount point: %v", err)
+			}
+			defer mountPoint.DecRef()
+			submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs submount: %v", err)
+			}
+			if err := mntns.Mount(ctx, mountPoint, submountInode); err != nil {
+				b.Fatalf("failed to mount tmpfs submount: %v", err)
+			}
+			filePathBuilder.WriteString(mountPointName)
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			d, err := root.Walk(ctx, root, mountPointName)
+			if err != nil {
+				b.Fatalf("failed to walk to mount root: %v", err)
+			}
+			// d is reassigned as we descend, so the release must read d when
+			// the deferred function runs. A plain "defer d.DecRef()" binds
+			// the receiver immediately, which would release the Dirent walked
+			// to above a second time and leak the reference on the deepest
+			// directory.
+			defer func() { d.DecRef() }()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				next, err := d.Walk(ctx, root, name)
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				// Trade the reference on the current directory for the one
+				// Walk returned on its child.
+				d.DecRef()
+				d = next
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			file.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			dirPath := false
+			// Collect setup garbage before timing starts.
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+						return syserror.ENOTDIR
+					}
+					uattr, err := d.Inode.UnstableAttr(ctx)
+					if err != nil {
+						return err
+					}
+					// Sanity check: the file was created with mode 0644, so
+					// the owner-execute bit must be clear.
+					if uattr.Perms.User.Execute {
+						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
+					}
+					return nil
+				})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkVFS2MemfsMountStat is BenchmarkVFS2MemfsStat with a second memfs
+// mounted at /<mountPointName>; the nested directories and the stat'd file
+// are created on that submount, so every stat crosses one mount boundary.
+func BenchmarkVFS2MemfsMountStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+			creds := auth.CredentialsFromContext(ctx)
+
+			// Create VFS.
+			vfsObj := vfs.New()
+			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create the mount point.
+			root := mntns.Root()
+			defer root.DecRef()
+			pop := vfs.PathOperation{
+				Root:     root,
+				Start:    root,
+				Pathname: mountPointName,
+			}
+			if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+				Mode: 0755,
+			}); err != nil {
+				b.Fatalf("failed to create mount point: %v", err)
+			}
+			// Save the mount point for later use.
+			mountPoint, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+			if err != nil {
+				b.Fatalf("failed to walk to mount point: %v", err)
+			}
+			defer mountPoint.DecRef()
+			// Create and mount the submount.
+			if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.NewFilesystemOptions{}); err != nil {
+				b.Fatalf("failed to mount tmpfs submount: %v", err)
+			}
+			filePathBuilder.WriteString(mountPointName)
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth. The same path
+			// operation is resolved again now that the submount exists.
+			vd, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+			if err != nil {
+				b.Fatalf("failed to walk to mount root: %v", err)
+			}
+			// vd is reassigned as we descend, so the release must read vd
+			// when the deferred function runs. A plain "defer vd.DecRef()"
+			// evaluates its receiver immediately, which would release the
+			// mount root a second time and leak the reference on the deepest
+			// directory.
+			defer func() { vd.DecRef() }()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				pop := vfs.PathOperation{
+					Root:     root,
+					Start:    vd,
+					Pathname: name,
+				}
+				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+					Mode: 0755,
+				}); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				// Trade the reference on the current directory for the one
+				// GetDentryAt returned on its child.
+				vd.DecRef()
+				vd = nextVD
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Verify that we didn't create any directories under the mount
+			// point (i.e. they were all created on the submount).
+			firstDirName := fmt.Sprintf("%d", depth)
+			if child := mountPoint.Dentry().Child(firstDirName); child != nil {
+				b.Fatalf("created directory %q under root mount, not submount", firstDirName)
+			}
+
+			// Create the file that will be stat'd.
+			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+				Root:               root,
+				Start:              vd,
+				Pathname:           filename,
+				FollowFinalSymlink: true,
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+				Mode:  0644,
+			})
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			fd.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			// Collect setup garbage before timing starts.
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+					Root:               root,
+					Start:              root,
+					Pathname:           filePath,
+					FollowFinalSymlink: true,
+				}, &vfs.StatOptions{})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+				// Sanity check: with the file-type bits masked off, the mode
+				// must be the 0644 the file was created with.
+				if stat.Mode&^linux.S_IFMT != 0644 {
+					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go
new file mode 100644
index 000000000..b0c3ea39a
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/directory.go
@@ -0,0 +1,178 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// directory is the Inode implementation for directories. The Inode is
+// embedded by value, and newDirectory hands out a pointer to it.
+type directory struct {
+	inode Inode
+
+	// childList is a list containing (1) child Dentries and (2) fake Dentries
+	// (with inode == nil) that represent the iteration position of
+	// directoryFDs. childList is used to support directoryFD.IterDirents()
+	// efficiently. childList is protected by Filesystem.mu.
+	childList dentryList
+}
+
+// newDirectory allocates a directory, initializes its Inode with the given
+// credentials and mode, and returns a pointer to that Inode. The new inode
+// starts with a link count of 2.
+func (fs *Filesystem) newDirectory(creds *auth.Credentials, mode uint16) *Inode {
+	d := new(directory)
+	ino := &d.inode
+	ino.init(d, fs, creds, mode)
+	// Two links: one from "." and one from the parent directory (or from
+	// ".." in the case of the root).
+	ino.nlink = 2
+	return ino
+}
+
+// isDir reports whether i's implementation is a *directory.
+func (i *Inode) isDir() bool {
+	switch i.impl.(type) {
+	case *directory:
+		return true
+	default:
+		return false
+	}
+}
+
+// directoryFD is the file description implementation for open directories.
+type directoryFD struct {
+	fileDescription
+	vfs.DirectoryFileDescriptionDefaultImpl
+
+	// Protected by Filesystem.mu.
+	//
+	// iter is a fake Dentry (inode == nil) that marks this FD's position in
+	// the directory's childList; it is nil until IterDirents or Seek first
+	// needs it. off is the offset of the next dirent to emit, where 0 is "."
+	// and 1 is "..".
+	iter *Dentry
+	off  int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+//
+// If this FD has an iterator Dentry, Release unlinks it from the directory's
+// childList under Filesystem.mu and then forgets it.
+func (fd *directoryFD) Release() {
+	if fd.iter == nil {
+		// No iterator was ever created, so there is nothing to unlink.
+		return
+	}
+	memFS := fd.filesystem()
+	parent := fd.inode().impl.(*directory)
+	memFS.mu.Lock()
+	parent.childList.Remove(fd.iter)
+	memFS.mu.Unlock()
+	fd.iter = nil
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+//
+// Entries are passed to cb starting at fd.off: "." at offset 0, ".." at
+// offset 1, then the directory's children in childList order. Iteration stops
+// as soon as cb.Handle returns false; fd.off is only advanced past entries
+// that cb accepted, so a rejected entry is offered again on the next call.
+// The whole call runs with Filesystem.mu held for writing, and always returns
+// a nil error.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	fs := fd.filesystem()
+	d := fd.vfsfd.VirtualDentry().Dentry()
+
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	// Synthesize "." for the directory itself.
+	if fd.off == 0 {
+		if !cb.Handle(vfs.Dirent{
+			Name: ".",
+			Type: linux.DT_DIR,
+			Ino:  d.Impl().(*Dentry).inode.ino,
+			Off:  0,
+		}) {
+			return nil
+		}
+		fd.off++
+	}
+	// Synthesize ".." from the parent Dentry, or from d itself when
+	// ParentOrSelf falls back to it.
+	if fd.off == 1 {
+		parentInode := d.ParentOrSelf().Impl().(*Dentry).inode
+		if !cb.Handle(vfs.Dirent{
+			Name: "..",
+			Type: parentInode.direntType(),
+			Ino:  parentInode.ino,
+			Off:  1,
+		}) {
+			return nil
+		}
+		fd.off++
+	}
+
+	dir := d.Impl().(*Dentry).inode.impl.(*directory)
+	var child *Dentry
+	if fd.iter == nil {
+		// Start iteration at the beginning of dir.
+		child = dir.childList.Front()
+		fd.iter = &Dentry{}
+	} else {
+		// Continue iteration from where we left off.
+		// NOTE(review): Next is read before Remove, presumably because
+		// Remove clears the iterator's list links - confirm against the
+		// generated dentryList.
+		child = fd.iter.Next()
+		dir.childList.Remove(fd.iter)
+	}
+	// From here on fd.iter is unlinked; every exit path below must link it
+	// back into childList.
+	for child != nil {
+		// Skip other directoryFD iterators.
+		if child.inode != nil {
+			if !cb.Handle(vfs.Dirent{
+				Name: child.vfsd.Name(),
+				Type: child.inode.direntType(),
+				Ino:  child.inode.ino,
+				Off:  fd.off,
+			}) {
+				// The callback rejected this child: park the iterator
+				// just before it so the next call resumes here.
+				dir.childList.InsertBefore(child, fd.iter)
+				return nil
+			}
+			fd.off++
+		}
+		child = child.Next()
+	}
+	// Every child was consumed; park the iterator at the end of the list.
+	dir.childList.PushBack(fd.iter)
+	return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+//
+// Only SEEK_SET with a non-negative offset is supported; anything else
+// returns EINVAL. On success the FD's offset becomes offset and its iterator
+// Dentry is repositioned so that the next IterDirents call resumes at the
+// (offset-2)'th child (offsets 0 and 1 denote "." and ".."), or at the end of
+// the directory if there are fewer children than that.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	if whence != linux.SEEK_SET {
+		// TODO: Linux also allows SEEK_CUR.
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Compensate for "." and "..": offsets below 2 leave the iterator in
+	// front of the first child.
+	var remChildren int64
+	if offset >= 2 {
+		remChildren = offset - 2
+	}
+
+	fs := fd.filesystem()
+	dir := fd.inode().impl.(*directory)
+
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	// fd.off is protected by fs.mu (IterDirents reads and updates it under
+	// the same lock), so it must only be written once the lock is held.
+	fd.off = offset
+
+	// Ensure that fd.iter exists and is not linked into dir.childList.
+	if fd.iter == nil {
+		fd.iter = &Dentry{}
+	} else {
+		dir.childList.Remove(fd.iter)
+	}
+	// Insert fd.iter before the remChildren'th child, or at the end of the
+	// list if remChildren >= number of children.
+	child := dir.childList.Front()
+	for child != nil {
+		// Skip other directoryFD iterators.
+		if child.inode != nil {
+			if remChildren == 0 {
+				dir.childList.InsertBefore(child, fd.iter)
+				return offset, nil
+			}
+			remChildren--
+		}
+		child = child.Next()
+	}
+	dir.childList.PushBack(fd.iter)
+	return offset, nil
+}
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
new file mode 100644
index 000000000..4d989eeaf
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -0,0 +1,542 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memfs
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// stepLocked resolves rp.Component() in parent directory vfsd.
+//
+// stepLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// It returns ENOTDIR if inode is not a directory, the permission error if the
+// caller lacks execute (search) permission on it, and ENOENT if the component
+// does not resolve to a Dentry. When the component resolves to a symlink that
+// rp says should be followed, the symlink's target is handed to rp and
+// resolution of the (new) current component is retried in the same directory.
+// On success rp is advanced past the component and the resolved Dentry and
+// its Inode are returned.
+//
+// Preconditions: Filesystem.mu must be locked. !rp.Done(). inode ==
+// vfsd.Impl().(*Dentry).inode.
+func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *Inode) (*vfs.Dentry, *Inode, error) {
+	if !inode.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		return nil, nil, err
+	}
+	// The label sits after the directory checks so that following a symlink
+	// does not repeat them.
+afterSymlink:
+	nextVFSD, err := rp.ResolveComponent(vfsd)
+	if err != nil {
+		return nil, nil, err
+	}
+	if nextVFSD == nil {
+		// Since the Dentry tree is the sole source of truth for memfs, if it's
+		// not in the Dentry tree, it doesn't exist.
+		return nil, nil, syserror.ENOENT
+	}
+	nextInode := nextVFSD.Impl().(*Dentry).inode
+	if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+		// TODO: symlink traversals update access time
+		if err := rp.HandleSymlink(symlink.target); err != nil {
+			return nil, nil, err
+		}
+		goto afterSymlink // don't check the current directory again
+	}
+	rp.Advance()
+	return nextVFSD, nextInode, nil
+}
+
+// walkExistingLocked resolves every remaining component of rp and returns the
+// Dentry and Inode of the file it names. If rp requires a directory and the
+// resolved file is not one, it returns ENOTDIR.
+//
+// walkExistingLocked is loosely analogous to Linux's
+// fs/namei.c:path_lookupat().
+//
+// Preconditions: Filesystem.mu must be locked.
+func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
+	cur := rp.Start()
+	curInode := cur.Impl().(*Dentry).inode
+	for !rp.Done() {
+		next, nextInode, err := stepLocked(rp, cur, curInode)
+		if err != nil {
+			return nil, nil, err
+		}
+		cur, curInode = next, nextInode
+	}
+	if rp.MustBeDir() && !curInode.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	return cur, curInode, nil
+}
+
+// walkParentDirLocked steps through rp until only its final component
+// remains, and returns the Dentry and Inode of the directory reached. It
+// returns ENOTDIR if that file is not a directory, but does not check that
+// the directory is searchable by the provider of rp.
+//
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
+//
+// Preconditions: Filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
+	cur := rp.Start()
+	curInode := cur.Impl().(*Dentry).inode
+	for !rp.Final() {
+		next, nextInode, err := stepLocked(rp, cur, curInode)
+		if err != nil {
+			return nil, nil, err
+		}
+		cur, curInode = next, nextInode
+	}
+	if !curInode.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	return cur, curInode, nil
+}
+
+// checkCreateLocked verifies that a new file named rp.Component() can be
+// created in directory parentVFSD, and returns that name on success.
+//
+// The checks are made in this order: write and execute permission on the
+// parent; the name is not "." or ".." (EEXIST); no child of that name already
+// exists (EEXIST); the parent has not been disowned (ENOENT).
+//
+// Preconditions: Filesystem.mu must be locked. parentInode ==
+// parentVFSD.Impl().(*Dentry).inode. parentInode.isDir() == true.
+func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *Inode) (string, error) {
+	if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+		return "", err
+	}
+	name := rp.Component()
+	switch name {
+	case ".", "..":
+		return "", syserror.EEXIST
+	}
+	existing, err := rp.ResolveChild(parentVFSD, name)
+	switch {
+	case err != nil:
+		return "", err
+	case existing != nil:
+		return "", syserror.EEXIST
+	case parentVFSD.IsDisowned():
+		return "", syserror.ENOENT
+	}
+	return name, nil
+}
+
+// checkDeleteLocked checks that the file represented by vfsd may be deleted.
+// It returns EBUSY if vfsd has no parent, ENOENT if the parent has been
+// disowned, and nil otherwise.
+func checkDeleteLocked(vfsd *vfs.Dentry) error {
+	parent := vfsd.Parent()
+	switch {
+	case parent == nil:
+		return syserror.EBUSY
+	case parent.IsDisowned():
+		return syserror.ENOENT
+	default:
+		return nil
+	}
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+//
+// It resolves rp to an existing file under the filesystem read lock. If
+// opts.CheckSearchable is set, the file must additionally be a directory
+// (ENOTDIR otherwise) on which the caller has execute permission. On success
+// a reference is taken on the file's inode before its Dentry is returned.
+func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	vfsd, inode, err := walkExistingLocked(rp)
+	if err != nil {
+		return nil, err
+	}
+	if opts.CheckSearchable {
+		if !inode.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+			return nil, err
+		}
+	}
+	// NOTE(review): the trailing comment suggests this stands in for taking
+	// a reference on the Dentry itself - confirm against Dentry.IncRef.
+	inode.incRef() // vfsd.IncRef(&fs.vfsfs)
+	return vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+//
+// It creates a new Dentry, named by the final component of rp, that refers to
+// the same inode as vd. Failures, in the order they are checked: EEXIST if rp
+// is already fully resolved; any error from walking to or validating the
+// parent directory; EXDEV if rp and vd are on different mounts; any error
+// from beginning a write on the mount; EPERM if vd is a directory.
+func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	if rp.Done() {
+		// There is no final component left to name the new link.
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := walkParentDirLocked(rp)
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	// The link and its target must be on the same mount.
+	if rp.Mount() != vd.Mount() {
+		return syserror.EXDEV
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	d := vd.Dentry().Impl().(*Dentry)
+	// Directories may not be hard-linked.
+	if d.inode.isDir() {
+		return syserror.EPERM
+	}
+	// Account for the new name, then publish it both in the VFS dentry tree
+	// and in the parent's childList (used by directoryFD iteration).
+	d.inode.incLinksLocked()
+	child := fs.newDentry(d.inode)
+	parentVFSD.InsertChild(&child.vfsd, pc)
+	parentInode.impl.(*directory).childList.PushBack(child)
+	return nil
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+//
+// It creates a new directory with mode opts.Mode, owned by rp's credentials,
+// named by the final component of rp. It returns EEXIST if rp is already
+// fully resolved, and otherwise any error from walking to or validating the
+// parent directory or from beginning a write on the mount.
+func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	if rp.Done() {
+		// There is no final component left to name the new directory.
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := walkParentDirLocked(rp)
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	// Publish the new directory both in the VFS dentry tree and in the
+	// parent's childList (used by directoryFD iteration).
+	child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
+	parentVFSD.InsertChild(&child.vfsd, pc)
+	parentInode.impl.(*directory).childList.PushBack(child)
+	parentInode.incLinksLocked() // from child's ".."
+	return nil
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+//
+// Node creation is not implemented yet: after performing the same parent
+// lookup, creation checks and mount write check that a real implementation
+// would need, it always fails with EPERM.
+func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	dirVFSD, dirInode, err := walkParentDirLocked(rp)
+	if err != nil {
+		return err
+	}
+	// The returned name is unused until mknod is actually implemented.
+	if _, err := checkCreateLocked(rp, dirVFSD, dirInode); err != nil {
+		return err
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	// TODO: actually implement mknod
+	return syserror.EPERM
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+//
+// Without O_CREAT, the path must resolve to an existing file, which is opened
+// while holding fs.mu for reading. With O_CREAT, fs.mu is held for writing:
+// the path is walked to the parent of the final component, and the final
+// component is either created as a regular file, opened if it already exists
+// (unless O_EXCL is set, which yields EEXIST), or followed if it is a symlink
+// that rp says should be followed.
+func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// Filter out flags that are not supported by memfs. O_DIRECTORY and
+	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
+	// appropriate bits in rp), but are returned by
+	// FileDescriptionImpl.StatusFlags().
+	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW
+
+	// Fast path: nothing can be created, so a read lock is sufficient.
+	if opts.Flags&linux.O_CREAT == 0 {
+		fs.mu.RLock()
+		defer fs.mu.RUnlock()
+		vfsd, inode, err := walkExistingLocked(rp)
+		if err != nil {
+			return nil, err
+		}
+		return inode.open(rp, vfsd, opts.Flags, false)
+	}
+
+	// O_CREAT path. O_EXCL means an existing final component is an error.
+	mustCreate := opts.Flags&linux.O_EXCL != 0
+	vfsd := rp.Start()
+	inode := vfsd.Impl().(*Dentry).inode
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	if rp.Done() {
+		// There are no path components left, so the starting Dentry itself is
+		// the file being opened.
+		// FIXME: ???
+		if rp.MustBeDir() {
+			return nil, syserror.EISDIR
+		}
+		if mustCreate {
+			return nil, syserror.EEXIST
+		}
+		return inode.open(rp, vfsd, opts.Flags, false)
+	}
+	// Re-entered via goto after a trailing symlink has been handed to rp, at
+	// which point there are new components to walk.
+afterTrailingSymlink:
+	// Walk to the parent directory of the last path component.
+	for !rp.Final() {
+		var err error
+		vfsd, inode, err = stepLocked(rp, vfsd, inode)
+		if err != nil {
+			return nil, err
+		}
+	}
+	if !inode.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	// Check for search permission in the parent directory.
+	if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		return nil, err
+	}
+	// Reject attempts to open directories with O_CREAT.
+	if rp.MustBeDir() {
+		return nil, syserror.EISDIR
+	}
+	pc := rp.Component()
+	if pc == "." || pc == ".." {
+		return nil, syserror.EISDIR
+	}
+	// Determine whether or not we need to create a file.
+	childVFSD, err := rp.ResolveChild(vfsd, pc)
+	if err != nil {
+		return nil, err
+	}
+	if childVFSD == nil {
+		// Already checked for searchability above; now check for writability.
+		if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+			return nil, err
+		}
+		if err := rp.Mount().CheckBeginWrite(); err != nil {
+			return nil, err
+		}
+		// This branch always returns, so the deferred EndWrite runs exactly
+		// once even though the enclosing code can loop via goto.
+		defer rp.Mount().EndWrite()
+		// Create and open the child.
+		childInode := fs.newRegularFile(rp.Credentials(), opts.Mode)
+		child := fs.newDentry(childInode)
+		vfsd.InsertChild(&child.vfsd, pc)
+		inode.impl.(*directory).childList.PushBack(child)
+		// afterCreate is true: open() skips its permission check for the
+		// file that was just created.
+		return childInode.open(rp, &child.vfsd, opts.Flags, true)
+	}
+	// Open existing file or follow symlink.
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	childInode := childVFSD.Impl().(*Dentry).inode
+	if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+		// TODO: symlink traversals update access time
+		if err := rp.HandleSymlink(symlink.target); err != nil {
+			return nil, err
+		}
+		// rp.Final() may no longer be true since we now need to resolve the
+		// symlink target.
+		goto afterTrailingSymlink
+	}
+	return childInode.open(rp, childVFSD, opts.Flags, false)
+}
+
+// open returns a new FileDescription for the file represented by i, opened
+// through vfsd on rp's mount with the given open flags.
+//
+// Unless afterCreate is true, the access types implied by flags are first
+// checked against i's permissions. Regular files opened writably hold a
+// write reference on the mount until regularFileFD.Release(); directories
+// cannot be opened writably (EISDIR); symlinks cannot be opened at all
+// (ELOOP).
+func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
+	access := vfs.AccessTypesForOpenFlags(flags)
+	if !afterCreate {
+		if err := i.checkPermissions(rp.Credentials(), access, i.isDir()); err != nil {
+			return nil, err
+		}
+	}
+
+	switch file := i.impl.(type) {
+	case *symlink:
+		// Can't open symlinks without O_PATH (which is unimplemented).
+		return nil, syserror.ELOOP
+
+	case *directory:
+		// Can't open directories writably.
+		if access&vfs.MayWrite != 0 {
+			return nil, syserror.EISDIR
+		}
+		dirFD := new(directoryFD)
+		dirFD.vfsfd.Init(dirFD, rp.Mount(), vfsd)
+		dirFD.flags = flags
+		return &dirFD.vfsfd, nil
+
+	case *regularFile:
+		regFD := new(regularFileFD)
+		regFD.flags = flags
+		regFD.readable = vfs.MayReadFileWithOpenFlags(flags)
+		regFD.writable = vfs.MayWriteFileWithOpenFlags(flags)
+		if regFD.writable {
+			// The matching Mount.EndWrite() is called by
+			// regularFileFD.Release().
+			if err := rp.Mount().CheckBeginWrite(); err != nil {
+				return nil, err
+			}
+		}
+		regFD.vfsfd.Init(regFD, rp.Mount(), vfsd)
+		if flags&linux.O_TRUNC != 0 {
+			// Drop the contents but keep the backing array.
+			file.mu.Lock()
+			file.data = file.data[:0]
+			atomic.StoreInt64(&file.dataLen, 0)
+			file.mu.Unlock()
+		}
+		return &regFD.vfsfd, nil
+
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+//
+// It returns the target of the symlink that rp resolves to, or EINVAL if rp
+// resolves to something that is not a symlink. fs.mu is held for reading
+// only during path resolution.
+func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	fs.mu.RLock()
+	_, inode, err := walkExistingLocked(rp)
+	fs.mu.RUnlock()
+	if err != nil {
+		return "", err
+	}
+	if link, isLink := inode.impl.(*symlink); isLink {
+		return link.target, nil
+	}
+	return "", syserror.EINVAL
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+//
+// Renaming is not implemented yet: a fully-resolved rp yields ENOENT, and
+// otherwise, once the destination has passed the same validation as the
+// create operations, RenameAt always fails with EPERM.
+func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+	if rp.Done() {
+		// FIXME
+		return syserror.ENOENT
+	}
+
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	dirVFSD, dirInode, err := walkParentDirLocked(rp)
+	if err != nil {
+		return err
+	}
+	if _, err = checkCreateLocked(rp, dirVFSD, dirInode); err != nil {
+		return err
+	}
+	if err = rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+
+	// TODO: actually implement RenameAt
+	return syserror.EPERM
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+//
+// It removes the directory that rp resolves to. It returns ENOTDIR if the
+// target is not a directory and ENOTEMPTY if it still has children.
+//
+// NOTE(review): MkdirAt pushes the child onto the parent's childList and
+// increments the parent's link count for the child's ".." entry; nothing
+// visible here undoes either on removal. Confirm whether that is handled
+// elsewhere.
+func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	target, inode, err := walkExistingLocked(rp)
+	if err != nil {
+		return err
+	}
+	if err = rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+
+	if err = checkDeleteLocked(target); err != nil {
+		return err
+	}
+	switch {
+	case !inode.isDir():
+		return syserror.ENOTDIR
+	case target.HasChildren():
+		return syserror.ENOTEMPTY
+	}
+
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	if err = vfsObj.DeleteDentry(mntns, target); err != nil {
+		return err
+	}
+	// Directories never reach nlink == 0, so the reference held for being
+	// reachable in the tree is dropped explicitly here.
+	inode.decRef()
+	return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+//
+// Changing metadata is not implemented yet: after rp has been resolved, an
+// empty mask succeeds as a no-op and any other request fails with EPERM.
+func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	fs.mu.RLock()
+	_, _, err := walkExistingLocked(rp)
+	fs.mu.RUnlock()
+
+	switch {
+	case err != nil:
+		return err
+	case opts.Stat.Mask == 0:
+		return nil
+	default:
+		// TODO: implement Inode.setStat
+		return syserror.EPERM
+	}
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+//
+// It resolves rp while holding fs.mu for reading and returns the metadata
+// of the resulting Inode as filled in by Inode.statTo. On a resolution
+// error the returned Statx is the zero value.
+func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+
+	fs.mu.RLock()
+	_, inode, err := walkExistingLocked(rp)
+	fs.mu.RUnlock()
+
+	if err == nil {
+		inode.statTo(&stat)
+	}
+	return stat, err
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+//
+// statfs is not implemented yet: rp is resolved, so resolution errors are
+// reported first, and the call otherwise fails with ENOSYS.
+func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	fs.mu.RLock()
+	_, _, walkErr := walkExistingLocked(rp)
+	fs.mu.RUnlock()
+
+	if walkErr != nil {
+		return linux.Statfs{}, walkErr
+	}
+	// TODO: actually implement statfs
+	return linux.Statfs{}, syserror.ENOSYS
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+//
+// It creates a symlink pointing at target, named by the final component of
+// rp, in that component's parent directory. If rp is already fully resolved
+// it returns EEXIST.
+func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	dirVFSD, dirInode, err := walkParentDirLocked(rp)
+	if err != nil {
+		return err
+	}
+	name, err := checkCreateLocked(rp, dirVFSD, dirInode)
+	if err != nil {
+		return err
+	}
+	if err = rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+
+	link := fs.newSymlink(rp.Credentials(), target)
+	child := fs.newDentry(link)
+	dirVFSD.InsertChild(&child.vfsd, name)
+	dirInode.impl.(*directory).childList.PushBack(child)
+	return nil
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+//
+// It removes the non-directory that rp resolves to and drops one link from
+// its Inode. It returns EISDIR if the target is a directory.
+//
+// NOTE(review): the create paths push each child onto the parent's
+// childList; nothing visible here removes it again. Confirm whether that is
+// handled elsewhere.
+func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	target, inode, err := walkExistingLocked(rp)
+	if err != nil {
+		return err
+	}
+	if err = rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+
+	if err = checkDeleteLocked(target); err != nil {
+		return err
+	}
+	if inode.isDir() {
+		return syserror.EISDIR
+	}
+
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	if err = vfsObj.DeleteDentry(mntns, target); err != nil {
+		return err
+	}
+	// Dropping the last link also drops the tree's reference on the Inode.
+	inode.decLinksLocked()
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
new file mode 100644
index 000000000..f381e1a88
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -0,0 +1,299 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memfs provides a filesystem implementation that behaves like tmpfs:
+// the Dentry tree is the sole source of truth for the state of the filesystem.
+//
+// memfs is intended primarily to demonstrate filesystem implementation
+// patterns. Real use cases for an in-memory filesystem should use tmpfs
+// instead.
+//
+// Lock order:
+//
+// Filesystem.mu
+//   regularFileFD.offMu
+//     regularFile.mu
+//   Inode.mu
+package memfs
+
+import (
+	"fmt"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FilesystemType implements vfs.FilesystemType.
+//
+// It carries no state; its NewFilesystem method creates a new, independent
+// memfs Filesystem on each call.
+type FilesystemType struct{}
+
+// Filesystem implements vfs.FilesystemImpl.
+type Filesystem struct {
+	// vfsfs is the embedded VFS-level filesystem object; it is initialized
+	// with this Filesystem as its implementation in NewFilesystem.
+	vfsfs vfs.Filesystem
+
+	// mu serializes changes to the Dentry tree.
+	mu sync.RWMutex
+
+	// nextInoMinusOne is incremented by Inode.init to allocate inode numbers,
+	// so the first Inode created gets ino 1.
+	nextInoMinusOne uint64 // accessed using atomic memory operations
+}
+
+// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
+func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	var fs Filesystem
+	fs.vfsfs.Init(&fs)
+	root := fs.newDentry(fs.newDirectory(creds, 01777))
+	return &fs.vfsfs, &root.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+//
+// It is a no-op.
+func (fs *Filesystem) Release() {
+}
+
+// Sync implements vfs.FilesystemImpl.Sync.
+//
+// It always succeeds without doing any work.
+func (fs *Filesystem) Sync(ctx context.Context) error {
+	// All filesystem state is in-memory.
+	return nil
+}
+
+// Dentry implements vfs.DentryImpl.
+//
+// A Dentry is one name for an Inode within a directory. Its reference
+// counting methods (IncRef, TryIncRef, DecRef) forward to the Inode.
+type Dentry struct {
+	// vfsd is the embedded VFS-level dentry; it is initialized with this
+	// Dentry as its implementation in Filesystem.newDentry.
+	vfsd vfs.Dentry
+
+	// inode is the inode represented by this Dentry. Multiple Dentries may
+	// share a single non-directory Inode (with hard links). inode is
+	// immutable.
+	inode *Inode
+
+	// memfs doesn't count references on Dentries; because the Dentry tree is
+	// the sole source of truth, it is by definition always consistent with the
+	// state of the filesystem. However, it does count references on Inodes,
+	// because Inode resources are released when all references are dropped.
+	// (memfs doesn't really have resources to release, but we implement
+	// reference counting because tmpfs regular files will.)
+
+	// dentryEntry (ugh) links Dentries into their parent directory.childList.
+	dentryEntry
+}
+
+// newDentry returns a new Dentry representing inode, with its embedded
+// vfs.Dentry initialized to point back at it. The Dentry is not linked into
+// any directory; callers do that themselves.
+func (fs *Filesystem) newDentry(inode *Inode) *Dentry {
+	dentry := new(Dentry)
+	dentry.inode = inode
+	dentry.vfsd.Init(dentry)
+	return dentry
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+//
+// The reference is taken on d's Inode; Inode.incRef panics if no reference
+// was held beforehand.
+func (d *Dentry) IncRef(vfsfs *vfs.Filesystem) {
+	d.inode.incRef()
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+//
+// It reports whether a reference could be taken on d's Inode, which fails
+// only when the Inode's reference count has already reached zero.
+func (d *Dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
+	return d.inode.tryIncRef()
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+//
+// The reference is dropped from d's Inode.
+func (d *Dentry) DecRef(vfsfs *vfs.Filesystem) {
+	d.inode.decRef()
+}
+
+// Inode represents a filesystem object.
+//
+// An Inode is embedded in its type-specific implementation (regularFile and
+// symlink each hold one as their inode field) and points back at that
+// implementation through impl.
+type Inode struct {
+	// refs is a reference count. refs is accessed using atomic memory
+	// operations.
+	//
+	// A reference is held on all Inodes that are reachable in the filesystem
+	// tree. For non-directories (which may have multiple hard links), this
+	// means that a reference is dropped when nlink reaches 0. For directories,
+	// nlink never reaches 0 due to the "." entry; instead,
+	// Filesystem.RmdirAt() drops the reference.
+	refs int64
+
+	// Inode metadata; protected by mu and accessed using atomic memory
+	// operations unless otherwise specified.
+	mu    sync.RWMutex
+	mode  uint32 // excluding file type bits, which are based on impl
+	nlink uint32 // protected by Filesystem.mu instead of Inode.mu
+	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid   uint32 // auth.KGID, but ...
+	ino   uint64 // immutable
+
+	// impl is the type-specific implementation; the code in this package
+	// type-switches on it to tell file types apart.
+	impl interface{} // immutable
+}
+
+// init sets up a freshly allocated Inode: one reference, the given mode,
+// ownership taken from creds' effective IDs, and the next inode number from
+// fs. nlink is deliberately left for the caller to set.
+func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials, mode uint16) {
+	i.refs = 1
+	i.mode = uint32(mode)
+	i.uid, i.gid = uint32(creds.EffectiveKUID), uint32(creds.EffectiveKGID)
+	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
+	i.impl = impl
+	// i.nlink initialized by caller
+}
+
+// incLinksLocked adds one link to i. It panics if i had no links before the
+// call.
+//
+// Preconditions: Filesystem.mu must be locked for writing.
+func (i *Inode) incLinksLocked() {
+	links := atomic.AddUint32(&i.nlink, 1)
+	if links <= 1 {
+		panic("memfs.Inode.incLinksLocked() called with no existing links")
+	}
+}
+
+// decLinksLocked removes one link from i. When the last link goes away, the
+// reference held for being reachable in the tree is dropped as well. It
+// panics if i had no links before the call.
+//
+// Preconditions: Filesystem.mu must be locked for writing.
+func (i *Inode) decLinksLocked() {
+	// Adding ^uint32(0) is the atomic way to subtract one.
+	switch remaining := atomic.AddUint32(&i.nlink, ^uint32(0)); remaining {
+	case 0:
+		i.decRef()
+	case ^uint32(0): // negative overflow
+		panic("memfs.Inode.decLinksLocked() called with no existing links")
+	}
+}
+
+// incRef takes an additional reference on i. The caller must already hold
+// one; otherwise incRef panics.
+func (i *Inode) incRef() {
+	refs := atomic.AddInt64(&i.refs, 1)
+	if refs <= 1 {
+		panic("memfs.Inode.incRef() called without holding a reference")
+	}
+}
+
+// tryIncRef takes a reference on i unless its reference count has already
+// dropped to zero, and reports whether it succeeded. It retries until its
+// compare-and-swap wins or the count is observed to be zero.
+func (i *Inode) tryIncRef() bool {
+	for cur := atomic.LoadInt64(&i.refs); cur != 0; cur = atomic.LoadInt64(&i.refs) {
+		if atomic.CompareAndSwapInt64(&i.refs, cur, cur+1) {
+			return true
+		}
+	}
+	return false
+}
+
+// decRef drops a reference on i. When the last reference is dropped and i
+// is a regular file, its data is discarded. It panics if the count goes
+// negative.
+func (i *Inode) decRef() {
+	remaining := atomic.AddInt64(&i.refs, -1)
+	switch {
+	case remaining < 0:
+		panic("memfs.Inode.decRef() called without holding a reference")
+	case remaining > 0:
+		return
+	}
+
+	// This is unnecessary; it's mostly to simulate what tmpfs would do.
+	regfile, ok := i.impl.(*regularFile)
+	if !ok {
+		return
+	}
+	regfile.mu.Lock()
+	regfile.data = nil
+	atomic.StoreInt64(&regfile.dataLen, 0)
+	regfile.mu.Unlock()
+}
+
+// checkPermissions checks whether creds may access i with access types ats,
+// by passing i's current mode, owner and group (each loaded atomically) to
+// vfs.GenericCheckPermissions. isDir is forwarded unchanged.
+func (i *Inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
+	mode := uint16(atomic.LoadUint32(&i.mode))
+	kuid := auth.KUID(atomic.LoadUint32(&i.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&i.gid))
+	return vfs.GenericCheckPermissions(creds, ats, isDir, mode, kuid, kgid)
+}
+
+// statTo fills *stat with i's metadata. Link count, ownership and mode are
+// read atomically without taking locks. Regular files and symlinks also
+// report a size and a 512-byte block count; directories do not.
+//
+// Go won't inline this function, and returning linux.Statx (which is quite
+// big) means spending a lot of time in runtime.duffcopy(), so instead it's an
+// output parameter.
+func (i *Inode) statTo(stat *linux.Statx) {
+	// Fields common to every file type.
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+	stat.Blksize = 1 // usermem.PageSize in tmpfs
+	stat.Nlink = atomic.LoadUint32(&i.nlink)
+	stat.UID = atomic.LoadUint32(&i.uid)
+	stat.GID = atomic.LoadUint32(&i.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
+	stat.Ino = i.ino
+	// TODO: device number
+
+	// Type-specific fields.
+	var (
+		typeBits uint16
+		hasSize  bool
+		size     uint64
+	)
+	switch file := i.impl.(type) {
+	case *regularFile:
+		typeBits = linux.S_IFREG
+		hasSize, size = true, uint64(atomic.LoadInt64(&file.dataLen))
+	case *directory:
+		typeBits = linux.S_IFDIR
+	case *symlink:
+		typeBits = linux.S_IFLNK
+		hasSize, size = true, uint64(len(file.target))
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+	stat.Mode |= typeBits
+	if hasSize {
+		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
+		stat.Size = size
+		// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached
+		// in a uint64 accessed using atomic memory operations to avoid taking
+		// locks).
+		stat.Blocks = allocatedBlocksForSize(size)
+	}
+}
+
+// allocatedBlocksForSize returns the number of 512B blocks needed to
+// accommodate the given size in bytes, as appropriate for struct
+// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
+// size is independent of the "preferred block size for I/O", struct
+// stat::st_blksize and struct statx::stx_blksize.)
+//
+// The result is size/512 rounded up. It is computed without adding to size
+// first, so it stays correct for sizes within 511 bytes of the uint64
+// maximum, where size+511 would wrap around.
+func allocatedBlocksForSize(size uint64) uint64 {
+	blocks := size / 512
+	if size%512 != 0 {
+		blocks++
+	}
+	return blocks
+}
+
+func (i *Inode) direntType() uint8 {
+	switch i.impl.(type) {
+	case *regularFile:
+		return linux.DT_REG
+	case *directory:
+		return linux.DT_DIR
+	case *symlink:
+		return linux.DT_LNK
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+}
+
+// fileDescription is embedded by memfs implementations of
+// vfs.FileDescriptionImpl.
+//
+// It provides the state and methods that do not depend on the file type:
+// access to the underlying Filesystem and Inode, status flags, and stat.
+type fileDescription struct {
+	// vfsfd is the embedded VFS-level file description; Inode.open
+	// initializes it and returns a pointer to it.
+	vfsfd vfs.FileDescription
+
+	flags uint32 // status flags; immutable
+}
+
+// filesystem returns the memfs Filesystem that fd was opened on, found via
+// the mount of fd's VirtualDentry.
+func (fd *fileDescription) filesystem() *Filesystem {
+	mnt := fd.vfsfd.VirtualDentry().Mount()
+	impl := mnt.Filesystem().Impl()
+	return impl.(*Filesystem)
+}
+
+// inode returns the Inode of the Dentry that fd was opened through.
+func (fd *fileDescription) inode() *Inode {
+	vfsd := fd.vfsfd.VirtualDentry().Dentry()
+	dentry := vfsd.Impl().(*Dentry)
+	return dentry.inode
+}
+
+// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
+//
+// It returns the flags recorded when fd was opened; the error is always nil.
+func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
+	return fd.flags, nil
+}
+
+// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
+//
+// The requested flags are ignored and fd.flags is left unchanged; the call
+// always succeeds.
+func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
+	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
+	// no-op.
+	return nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+	fd.inode().statTo(&stat)
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+//
+// Changing metadata is not implemented yet: an empty mask succeeds as a
+// no-op, and any other request fails with EPERM.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask != 0 {
+		// TODO: implement Inode.setStat
+		return syserror.EPERM
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go
new file mode 100644
index 000000000..4a3603cc8
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/regular_file.go
@@ -0,0 +1,155 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memfs
+
+import (
+	"io"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// regularFile is the Inode implementation for regular files. The file's
+// contents are held in a single in-memory byte slice.
+type regularFile struct {
+	// inode is the embedded Inode; its impl field points back at this
+	// regularFile.
+	inode Inode
+
+	// mu protects data.
+	mu   sync.RWMutex
+	data []byte
+	// dataLen is len(data), but accessed using atomic memory operations to
+	// avoid locking in Inode.statTo().
+	dataLen int64
+}
+
+// newRegularFile returns the Inode of a new, empty regular file owned by
+// creds with the given mode. The Inode starts with a single link, on the
+// assumption that the caller links it into a directory.
+func (fs *Filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *Inode {
+	file := new(regularFile)
+	inode := &file.inode
+	inode.init(file, fs, creds, mode)
+	inode.nlink = 1 // from parent directory
+	return inode
+}
+
+// regularFileFD is the vfs.FileDescriptionImpl for open regular files.
+type regularFileFD struct {
+	fileDescription
+	vfs.FileDescriptionDefaultImpl
+
+	// These are immutable.
+	// They are derived from the open flags in Inode.open. A writable FD holds
+	// a write reference on its mount until Release.
+	readable bool
+	writable bool
+
+	// off is the file offset. off is accessed using atomic memory operations.
+	// offMu serializes operations that may mutate off.
+	off   int64
+	offMu sync.Mutex
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+//
+// For a writable FD it ends the mount write that Inode.open began; for a
+// read-only FD there is nothing to undo.
+func (fd *regularFileFD) Release() {
+	if !fd.writable {
+		return
+	}
+	fd.vfsfd.VirtualDentry().Mount().EndWrite()
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+//
+// It copies file data starting at offset into dst and returns the number of
+// bytes copied. It returns EINVAL if fd was not opened for reading or if
+// offset is negative, and io.EOF if offset is at or beyond the end of the
+// file. The file offset of fd is not used or modified.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if !fd.readable {
+		return 0, syserror.EINVAL
+	}
+	// Reject negative offsets up front, as PWrite does; slicing the data with
+	// a negative index below would otherwise panic.
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	f := fd.inode().impl.(*regularFile)
+	f.mu.RLock()
+	defer f.mu.RUnlock()
+	if offset >= int64(len(f.data)) {
+		return 0, io.EOF
+	}
+	n, err := dst.CopyOut(ctx, f.data[offset:])
+	return int64(n), err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+//
+// It reads at fd's current offset via PRead and advances the offset by the
+// number of bytes read, all while holding offMu.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+//
+// It copies src into the file starting at offset, zero-extending the file
+// first if the write ends beyond its current length, and returns the number
+// of bytes copied. It returns EINVAL if fd was not opened for writing or if
+// offset is negative, and EFBIG if offset plus the length of src overflows.
+// An empty src is a successful no-op.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if !fd.writable || offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	length := src.NumBytes()
+	if length == 0 {
+		return 0, nil
+	}
+
+	file := fd.inode().impl.(*regularFile)
+	file.mu.Lock()
+	defer file.mu.Unlock()
+
+	end := offset + length
+	if end < offset {
+		// Overflow.
+		return 0, syserror.EFBIG
+	}
+	if grow := end - file.dataLen; grow > 0 {
+		file.data = append(file.data, make([]byte, grow)...)
+		atomic.StoreInt64(&file.dataLen, end)
+	}
+	copied, err := src.CopyIn(ctx, file.data[offset:end])
+	return int64(copied), err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+//
+// It writes at fd's current offset via PWrite and advances the offset by
+// the number of bytes written, all while holding offMu.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+
+	n, err := fd.PWrite(ctx, src, fd.off, opts)
+	fd.off += n
+	return n, err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+	switch whence {
+	case linux.SEEK_SET:
+		// use offset as specified
+	case linux.SEEK_CUR:
+		offset += fd.off
+	case linux.SEEK_END:
+		offset += atomic.LoadInt64(&fd.inode().impl.(*regularFile).dataLen)
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+//
+// It always succeeds without doing any work.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/memfs/symlink.go
new file mode 100644
index 000000000..e002d1727
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/symlink.go
@@ -0,0 +1,36 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memfs
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// symlink is the Inode implementation for symbolic links.
+type symlink struct {
+	// inode is the embedded Inode; its impl field points back at this
+	// symlink.
+	inode  Inode
+	target string // immutable
+}
+
+// newSymlink returns the Inode of a new symlink to target, owned by creds
+// and created with mode 0777. The Inode starts with a single link, on the
+// assumption that the caller links it into a directory.
+func (fs *Filesystem) newSymlink(creds *auth.Credentials, target string) *Inode {
+	link := new(symlink)
+	link.target = target
+	inode := &link.inode
+	inode.init(link, fs, creds, 0777)
+	inode.nlink = 1 // from parent directory
+	return inode
+}
+
+// O_PATH is unimplemented, so there's no way to get a FileDescription
+// representing a symlink yet.
author	Jamie Liu <jamieliu@google.com>	2019-07-18 15:09:14 -0700
committer	gVisor bot <gvisor-bot@google.com>	2019-07-18 15:10:29 -0700
commit	163ab5e9bab4f14923433967656d20f169d0f904 (patch)
tree	5e51b1573e48fe87fe0e277a32f13c78b0c2f058 /pkg/sentry/fsimpl/memfs
parent	6f7e2bb388cb29a355dece8921671c0085f53ea9 (diff)