summary refs log tree commit diff homepage
path: root/pkg/sentry/fsimpl/memfs
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/fsimpl/memfs')
-rw-r--r-- pkg/sentry/fsimpl/memfs/BUILD 55
-rw-r--r-- pkg/sentry/fsimpl/memfs/benchmark_test.go 464
-rw-r--r-- pkg/sentry/fsimpl/memfs/directory.go 178
-rw-r--r-- pkg/sentry/fsimpl/memfs/filesystem.go 542
-rw-r--r-- pkg/sentry/fsimpl/memfs/memfs.go 299
-rw-r--r-- pkg/sentry/fsimpl/memfs/regular_file.go 155
-rw-r--r-- pkg/sentry/fsimpl/memfs/symlink.go 36
7 files changed, 1729 insertions, 0 deletions
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
new file mode 100644
index 000000000..d5d4f68df
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -0,0 +1,55 @@
+load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+# Instantiates the generic list template from //pkg/ilist with *Dentry as both
+# the element and the linker type. The result is written to dentry_list.go in
+# package memfs, with generated identifiers prefixed by "dentry".
+go_template_instance(
+ name = "dentry_list",
+ out = "dentry_list.go",
+ package = "memfs",
+ prefix = "dentry",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*Dentry",
+ "Linker": "*Dentry",
+ },
+)
+
+# The memfs library: the generated dentry_list.go (output of the dentry_list
+# template instance) plus the hand-written sources of the package.
+go_library(
+ name = "memfs",
+ srcs = [
+ "dentry_list.go",
+ "directory.go",
+ "filesystem.go",
+ "memfs.go",
+ "regular_file.go",
+ "symlink.go",
+ ],
+ importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs",
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/sentry/context",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/usermem",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ ],
+)
+
+# Benchmarks for memfs. The test depends on both :memfs and the
+# //pkg/sentry/fs tmpfs implementation, which benchmark_test.go exercises side
+# by side.
+go_test(
+ name = "benchmark_test",
+ size = "small",
+ srcs = ["benchmark_test.go"],
+ deps = [
+ ":memfs",
+ "//pkg/abi/linux",
+ "//pkg/sentry/context",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/tmpfs",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
new file mode 100644
index 000000000..a94b17db6
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go
@@ -0,0 +1,464 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package benchmark_test
+
+import (
+ "fmt"
+ "runtime"
+ "strings"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Differences from stat_benchmark:
+//
+// - Syscall interception, CopyInPath, copyOutStat, and overlayfs overheads are
+// not included.
+//
+// - *MountStat benchmarks use a tmpfs root mount and a tmpfs submount at /tmp.
+// Non-MountStat benchmarks use a tmpfs root mount and no submounts.
+// stat_benchmark uses a varying root mount, a tmpfs submount at /tmp, and a
+// subdirectory /tmp/<top_dir> (assuming TEST_TMPDIR == "/tmp"). Thus
+// stat_benchmark at depth 1 does a comparable amount of work to *MountStat
+// benchmarks at depth 2, and non-MountStat benchmarks at depth 3.
+//
+// depths lists the directory nesting depths at which each benchmark in this
+// file is run (one sub-benchmark per entry).
+var depths = []int{1, 2, 3, 8, 64, 100}
+
+const (
+ // mountPointName is the name of the directory under the root at which the
+ // *MountStat benchmarks mount their submount.
+ mountPointName = "tmp"
+ // filename is the name of the file that every benchmark creates at the
+ // bottom of the directory chain and then stats.
+ filename = "gvisor_test_temp_0_1557494568"
+)
+
+// fileOpOn looks up path in mntns and invokes fn on the resulting Dirent,
+// returning fn's error. Absolute paths are resolved from root; relative paths
+// are resolved from wd, which requires dirFD == linux.AT_FDCWD (any other
+// dirFD yields EBADF). When resolve is true the lookup goes through
+// FindInode, otherwise through FindLink. The reference on the Dirent obtained
+// by the lookup is dropped after fn returns.
+//
+// This is copied from syscalls/linux/sys_file.go, with the dependency on
+// kernel.Task stripped out.
+func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error {
+	// rel is the directory that relative lookups start from; it stays nil
+	// for absolute paths.
+	var rel *fs.Dirent
+	switch {
+	case len(path) > 0 && path[0] == '/':
+		// Absolute path; nothing to do.
+	case dirFD == linux.AT_FDCWD:
+		// Relative to the working directory.
+		rel = wd
+	default:
+		// Would need to extract the given FD, which is unsupported here.
+		return syserror.EBADF
+	}
+
+	// Look up the node.
+	var (
+		target *fs.Dirent
+		err    error
+	)
+	remainingTraversals := uint(linux.MaxSymlinkTraversals)
+	if !resolve {
+		target, err = mntns.FindLink(ctx, root, rel, path, &remainingTraversals)
+	} else {
+		target, err = mntns.FindInode(ctx, root, rel, path, &remainingTraversals)
+	}
+	if err != nil {
+		return err
+	}
+
+	fnErr := fn(root, target)
+	target.DecRef()
+	return fnErr
+}
+
+// BenchmarkVFS1TmpfsStat measures path resolution plus an UnstableAttr lookup
+// on a file nested `depth` directories below the root of a VFS1 tmpfs mount
+// with no submounts. One sub-benchmark is run per entry in depths.
+func BenchmarkVFS1TmpfsStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+
+			// Create VFS.
+			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
+			if !ok {
+				b.Fatalf("failed to find tmpfs filesystem type")
+			}
+			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			mntns, err := fs.NewMountNamespace(ctx, rootInode)
+			if err != nil {
+				b.Fatalf("failed to create mount namespace: %v", err)
+			}
+			defer mntns.DecRef()
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			root := mntns.Root()
+			defer root.DecRef()
+			d := root
+			d.IncRef()
+			// d is reassigned while descending. A plain `defer d.DecRef()`
+			// would bind the receiver now (root), releasing root twice and
+			// leaking the deepest directory; the closure releases whatever d
+			// refers to when the benchmark function returns.
+			defer func() { d.DecRef() }()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				next, err := d.Walk(ctx, root, name)
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				d.DecRef()
+				d = next
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			file.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			dirPath := false
+			// Collect setup garbage before timing starts.
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+						return syserror.ENOTDIR
+					}
+					uattr, err := d.Inode.UnstableAttr(ctx)
+					if err != nil {
+						return err
+					}
+					// Sanity check: the file was created with mode 0644, so
+					// the owner-execute bit must be clear.
+					if uattr.Perms.User.Execute {
+						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
+					}
+					return nil
+				})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkVFS2MemfsStat measures vfs.StatAt on a file nested `depth`
+// directories below the root of a VFS2 memfs mount with no submounts. One
+// sub-benchmark is run per entry in depths.
+func BenchmarkVFS2MemfsStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+			creds := auth.CredentialsFromContext(ctx)
+
+			// Create VFS.
+			vfsObj := vfs.New()
+			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+			if err != nil {
+				b.Fatalf("failed to create memfs root mount: %v", err)
+			}
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			root := mntns.Root()
+			defer root.DecRef()
+			vd := root
+			vd.IncRef()
+			// vd is reassigned while descending. A plain `defer vd.DecRef()`
+			// would bind the receiver now (root), releasing root twice and
+			// leaking the deepest directory; the closure releases whatever
+			// vd refers to when the benchmark function returns.
+			defer func() { vd.DecRef() }()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				pop := vfs.PathOperation{
+					Root:     root,
+					Start:    vd,
+					Pathname: name,
+				}
+				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+					Mode: 0755,
+				}); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				vd.DecRef()
+				vd = nextVD
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+				Root:               root,
+				Start:              vd,
+				Pathname:           filename,
+				FollowFinalSymlink: true,
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+				Mode:  0644,
+			})
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			defer fd.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			// Collect setup garbage before timing starts.
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+					Root:               root,
+					Start:              root,
+					Pathname:           filePath,
+					FollowFinalSymlink: true,
+				}, &vfs.StatOptions{})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+				// Sanity check: with the file-type bits masked off, the mode
+				// must be the 0644 the file was created with.
+				if stat.Mode&^linux.S_IFMT != 0644 {
+					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkVFS1TmpfsMountStat measures path resolution plus an UnstableAttr
+// lookup on a file nested `depth` directories below a tmpfs submount that is
+// mounted at /<mountPointName> on a VFS1 tmpfs root. One sub-benchmark is run
+// per entry in depths.
+func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+
+			// Create VFS.
+			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
+			if !ok {
+				b.Fatalf("failed to find tmpfs filesystem type")
+			}
+			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			mntns, err := fs.NewMountNamespace(ctx, rootInode)
+			if err != nil {
+				b.Fatalf("failed to create mount namespace: %v", err)
+			}
+			defer mntns.DecRef()
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create and mount the submount.
+			root := mntns.Root()
+			defer root.DecRef()
+			if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil {
+				b.Fatalf("failed to create mount point: %v", err)
+			}
+			mountPoint, err := root.Walk(ctx, root, mountPointName)
+			if err != nil {
+				b.Fatalf("failed to walk to mount point: %v", err)
+			}
+			defer mountPoint.DecRef()
+			submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs submount: %v", err)
+			}
+			if err := mntns.Mount(ctx, mountPoint, submountInode); err != nil {
+				b.Fatalf("failed to mount tmpfs submount: %v", err)
+			}
+			filePathBuilder.WriteString(mountPointName)
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			d, err := root.Walk(ctx, root, mountPointName)
+			if err != nil {
+				b.Fatalf("failed to walk to mount root: %v", err)
+			}
+			// d is reassigned while descending. A plain `defer d.DecRef()`
+			// would bind the receiver now (the mount root), releasing it
+			// twice and leaking the deepest directory; the closure releases
+			// whatever d refers to when the benchmark function returns.
+			defer func() { d.DecRef() }()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				next, err := d.Walk(ctx, root, name)
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				d.DecRef()
+				d = next
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			file.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			dirPath := false
+			// Collect setup garbage before timing starts.
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+						return syserror.ENOTDIR
+					}
+					uattr, err := d.Inode.UnstableAttr(ctx)
+					if err != nil {
+						return err
+					}
+					// Sanity check: the file was created with mode 0644, so
+					// the owner-execute bit must be clear.
+					if uattr.Perms.User.Execute {
+						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
+					}
+					return nil
+				})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkVFS2MemfsMountStat measures vfs.StatAt on a file nested `depth`
+// directories below a memfs submount that is mounted at /<mountPointName> on
+// a VFS2 memfs root. One sub-benchmark is run per entry in depths.
+func BenchmarkVFS2MemfsMountStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+			creds := auth.CredentialsFromContext(ctx)
+
+			// Create VFS.
+			vfsObj := vfs.New()
+			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+			if err != nil {
+				b.Fatalf("failed to create memfs root mount: %v", err)
+			}
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create the mount point.
+			root := mntns.Root()
+			defer root.DecRef()
+			pop := vfs.PathOperation{
+				Root:     root,
+				Start:    root,
+				Pathname: mountPointName,
+			}
+			if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+				Mode: 0755,
+			}); err != nil {
+				b.Fatalf("failed to create mount point: %v", err)
+			}
+			// Save the mount point for later use.
+			mountPoint, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+			if err != nil {
+				b.Fatalf("failed to walk to mount point: %v", err)
+			}
+			defer mountPoint.DecRef()
+			// Create and mount the submount.
+			if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.NewFilesystemOptions{}); err != nil {
+				b.Fatalf("failed to mount memfs submount: %v", err)
+			}
+			filePathBuilder.WriteString(mountPointName)
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			vd, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+			if err != nil {
+				b.Fatalf("failed to walk to mount root: %v", err)
+			}
+			// vd is reassigned while descending. A plain `defer vd.DecRef()`
+			// would bind the receiver now (the mount root), releasing it
+			// twice and leaking the deepest directory; the closure releases
+			// whatever vd refers to when the benchmark function returns.
+			defer func() { vd.DecRef() }()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				pop := vfs.PathOperation{
+					Root:     root,
+					Start:    vd,
+					Pathname: name,
+				}
+				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+					Mode: 0755,
+				}); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				vd.DecRef()
+				vd = nextVD
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Verify that we didn't create any directories under the mount
+			// point (i.e. they were all created on the submount).
+			firstDirName := fmt.Sprintf("%d", depth)
+			if child := mountPoint.Dentry().Child(firstDirName); child != nil {
+				b.Fatalf("created directory %q under root mount, not submount", firstDirName)
+			}
+
+			// Create the file that will be stat'd.
+			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+				Root:               root,
+				Start:              vd,
+				Pathname:           filename,
+				FollowFinalSymlink: true,
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+				Mode:  0644,
+			})
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			fd.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			// Collect setup garbage before timing starts.
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+					Root:               root,
+					Start:              root,
+					Pathname:           filePath,
+					FollowFinalSymlink: true,
+				}, &vfs.StatOptions{})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+				// Sanity check: with the file-type bits masked off, the mode
+				// must be the 0644 the file was created with.
+				if stat.Mode&^linux.S_IFMT != 0644 {
+					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go
new file mode 100644
index 000000000..b0c3ea39a
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/directory.go
@@ -0,0 +1,178 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// directory holds the directory-specific state of an Inode. newDirectory
+// passes the directory to Inode.init, and isDir identifies directories by
+// checking that Inode.impl is a *directory.
+type directory struct {
+ // inode is the Inode embedded in (and initialized together with) this
+ // directory; newDirectory returns a pointer to it.
+ inode Inode
+
+ // childList is a list containing (1) child Dentries and (2) fake Dentries
+ // (with inode == nil) that represent the iteration position of
+ // directoryFDs. childList is used to support directoryFD.IterDirents()
+ // efficiently. childList is protected by Filesystem.mu.
+ childList dentryList
+}
+
+// newDirectory allocates a directory, initializes its embedded Inode with the
+// given credentials and mode, and returns a pointer to that Inode.
+func (fs *Filesystem) newDirectory(creds *auth.Credentials, mode uint16) *Inode {
+	d := new(directory)
+	d.inode.init(d, fs, creds, mode)
+	// Two links: one from "." and one from the parent directory (or from
+	// ".." for the root).
+	d.inode.nlink = 2
+	return &d.inode
+}
+
+func (i *Inode) isDir() bool {
+ _, ok := i.impl.(*directory)
+ return ok
+}
+
+// directoryFD is the file description type created by Inode.open for
+// directories. It carries the iteration state used by IterDirents and Seek.
+type directoryFD struct {
+ fileDescription
+ vfs.DirectoryFileDescriptionDefaultImpl
+
+ // Protected by Filesystem.mu.
+ //
+ // iter is this FD's position marker in the directory's childList (a fake
+ // Dentry with inode == nil); it is nil until IterDirents or Seek first
+ // creates it. off is the offset reported for the next entry: 0 is ".",
+ // 1 is "..", and children start at 2.
+ iter *Dentry
+ off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+//
+// If this FD ever created an iteration marker, Release unlinks it from the
+// directory's childList (under Filesystem.mu) and forgets it.
+func (fd *directoryFD) Release() {
+	if fd.iter == nil {
+		// No marker was ever inserted; nothing to clean up.
+		return
+	}
+	fs := fd.filesystem()
+	dir := fd.inode().impl.(*directory)
+	fs.mu.Lock()
+	dir.childList.Remove(fd.iter)
+	fs.mu.Unlock()
+	fd.iter = nil
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+//
+// Entries are reported to cb in the order ".", "..", then the children in
+// childList order, starting from the FD's current position. Iteration stops
+// as soon as cb.Handle returns false; the rejected entry is not consumed and
+// is offered again on the next call. IterDirents always returns nil.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ fs := fd.filesystem()
+ d := fd.vfsfd.VirtualDentry().Dentry()
+
+ // fd.off, fd.iter and the childList are all protected by fs.mu.
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+
+ // Offset 0 is the synthetic "." entry.
+ if fd.off == 0 {
+ if !cb.Handle(vfs.Dirent{
+ Name: ".",
+ Type: linux.DT_DIR,
+ Ino: d.Impl().(*Dentry).inode.ino,
+ Off: 0,
+ }) {
+ return nil
+ }
+ fd.off++
+ }
+ // Offset 1 is the synthetic ".." entry, described by the inode that
+ // ParentOrSelf returns.
+ if fd.off == 1 {
+ parentInode := d.ParentOrSelf().Impl().(*Dentry).inode
+ if !cb.Handle(vfs.Dirent{
+ Name: "..",
+ Type: parentInode.direntType(),
+ Ino: parentInode.ino,
+ Off: 1,
+ }) {
+ return nil
+ }
+ fd.off++
+ }
+
+ dir := d.Impl().(*Dentry).inode.impl.(*directory)
+ var child *Dentry
+ if fd.iter == nil {
+ // Start iteration at the beginning of dir.
+ child = dir.childList.Front()
+ fd.iter = &Dentry{}
+ } else {
+ // Continue iteration from where we left off.
+ child = fd.iter.Next()
+ dir.childList.Remove(fd.iter)
+ }
+ // From here on fd.iter is not linked into childList; every exit path below
+ // re-inserts it.
+ for child != nil {
+ // Skip other directoryFD iterators.
+ if child.inode != nil {
+ if !cb.Handle(vfs.Dirent{
+ Name: child.vfsd.Name(),
+ Type: child.inode.direntType(),
+ Ino: child.inode.ino,
+ Off: fd.off,
+ }) {
+ // The callback rejected child: park the marker just before it
+ // so the next call resumes with this same entry.
+ dir.childList.InsertBefore(child, fd.iter)
+ return nil
+ }
+ fd.off++
+ }
+ child = child.Next()
+ }
+ // Every child was reported: park the marker at the end of the list so that
+ // only children appended later are seen by a subsequent call.
+ dir.childList.PushBack(fd.iter)
+ return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+//
+// Only SEEK_SET with a non-negative offset is supported; any other whence or
+// a negative offset returns EINVAL. On success the FD's offset is set to
+// offset and its iteration marker is repositioned in the directory's
+// childList so that the next IterDirents call resumes at the matching child
+// (offsets 0 and 1 are "." and "..", so child k sits at offset k+2).
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	if whence != linux.SEEK_SET {
+		// TODO: Linux also allows SEEK_CUR.
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Compensate for "." and "..": remChildren is the number of real
+	// children to skip before the marker.
+	var remChildren int64
+	if offset > 2 {
+		remChildren = offset - 2
+	}
+
+	fs := fd.filesystem()
+	dir := fd.inode().impl.(*directory)
+
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	// fd.off is protected by fs.mu (IterDirents reads and updates it with
+	// the lock held), so it must only be written once the lock is taken.
+	fd.off = offset
+
+	// Ensure that fd.iter exists and is not linked into dir.childList.
+	if fd.iter == nil {
+		fd.iter = &Dentry{}
+	} else {
+		dir.childList.Remove(fd.iter)
+	}
+	// Insert fd.iter before the remChildren'th child, or at the end of the
+	// list if remChildren >= number of children.
+	for child := dir.childList.Front(); child != nil; child = child.Next() {
+		// Skip other directoryFD iterators.
+		if child.inode == nil {
+			continue
+		}
+		if remChildren == 0 {
+			dir.childList.InsertBefore(child, fd.iter)
+			return offset, nil
+		}
+		remChildren--
+	}
+	dir.childList.PushBack(fd.iter)
+	return offset, nil
+}
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
new file mode 100644
index 000000000..4d989eeaf
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -0,0 +1,542 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memfs
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// stepLocked resolves rp.Component() in parent directory vfsd.
+//
+// stepLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// On success it advances rp past the resolved component and returns the
+// child's vfs.Dentry together with its Inode. It returns ENOTDIR if inode is
+// not a directory, the error from the search-permission check on inode, any
+// error from rp.ResolveComponent or rp.HandleSymlink, and ENOENT if the
+// component does not exist.
+//
+// Preconditions: Filesystem.mu must be locked. !rp.Done(). inode ==
+// vfsd.Impl().(*Dentry).inode.
+func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *Inode) (*vfs.Dentry, *Inode, error) {
+ if !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ // Traversing a directory requires search (execute) permission on it.
+ if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+ return nil, nil, err
+ }
+afterSymlink:
+ nextVFSD, err := rp.ResolveComponent(vfsd)
+ if err != nil {
+ return nil, nil, err
+ }
+ if nextVFSD == nil {
+ // Since the Dentry tree is the sole source of truth for memfs, if it's
+ // not in the Dentry tree, it doesn't exist.
+ return nil, nil, syserror.ENOENT
+ }
+ nextInode := nextVFSD.Impl().(*Dentry).inode
+ // Note: the local name symlink shadows the symlink type inside this if
+ // statement.
+ if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+ // TODO: symlink traversals update access time
+ if err := rp.HandleSymlink(symlink.target); err != nil {
+ return nil, nil, err
+ }
+ goto afterSymlink // don't check the current directory again
+ }
+ rp.Advance()
+ return nextVFSD, nextInode, nil
+}
+
+// walkExistingLocked resolves rp to an existing file.
+//
+// walkExistingLocked is loosely analogous to Linux's
+// fs/namei.c:path_lookupat().
+//
+// Preconditions: Filesystem.mu must be locked.
+func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
+ vfsd := rp.Start()
+ inode := vfsd.Impl().(*Dentry).inode
+ for !rp.Done() {
+ var err error
+ vfsd, inode, err = stepLocked(rp, vfsd, inode)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+ if rp.MustBeDir() && !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ return vfsd, inode, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory. It does not check that the returned directory is
+// searchable by the provider of rp.
+//
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
+//
+// Preconditions: Filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
+ vfsd := rp.Start()
+ inode := vfsd.Impl().(*Dentry).inode
+ for !rp.Final() {
+ var err error
+ vfsd, inode, err = stepLocked(rp, vfsd, inode)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+ if !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ return vfsd, inode, nil
+}
+
+// checkCreateLocked verifies that a file named rp.Component() may be created
+// in directory parentVFSD and, if so, returns that name.
+//
+// It fails with the permission error if the caller lacks write+search access
+// to the parent, with EEXIST if the name is "." or ".." or already resolves
+// to a child, with any error from rp.ResolveChild, and with ENOENT if the
+// parent has been disowned.
+//
+// Preconditions: Filesystem.mu must be locked. parentInode ==
+// parentVFSD.Impl().(*Dentry).inode. parentInode.isDir() == true.
+func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *Inode) (string, error) {
+	if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+		return "", err
+	}
+	name := rp.Component()
+	switch name {
+	case ".", "..":
+		return "", syserror.EEXIST
+	}
+	existing, err := rp.ResolveChild(parentVFSD, name)
+	switch {
+	case err != nil:
+		return "", err
+	case existing != nil:
+		return "", syserror.EEXIST
+	case parentVFSD.IsDisowned():
+		return "", syserror.ENOENT
+	}
+	return name, nil
+}
+
+// checkDeleteLocked checks that the file represented by vfsd may be deleted.
+// It returns EBUSY if vfsd has no parent, ENOENT if its parent has been
+// disowned, and nil otherwise.
+func checkDeleteLocked(vfsd *vfs.Dentry) error {
+	switch parent := vfsd.Parent(); {
+	case parent == nil:
+		return syserror.EBUSY
+	case parent.IsDisowned():
+		return syserror.ENOENT
+	default:
+		return nil
+	}
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+//
+// It resolves rp to an existing file under a read lock on Filesystem.mu. When
+// opts.CheckSearchable is set, the file must additionally be a directory
+// (ENOTDIR otherwise) on which the caller has search permission. On success a
+// reference is taken on the file's inode before its Dentry is returned.
+func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+
+	target, targetInode, err := walkExistingLocked(rp)
+	if err != nil {
+		return nil, err
+	}
+	if opts.CheckSearchable {
+		if !targetInode.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if permErr := targetInode.checkPermissions(rp.Credentials(), vfs.MayExec, true); permErr != nil {
+			return nil, permErr
+		}
+	}
+	targetInode.incRef() // vfsd.IncRef(&fs.vfsfs)
+	return target, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+//
+// It creates a new name, given by the final component of rp, for the inode
+// behind vd. It fails with EEXIST if rp has no components left or the name
+// cannot be created, EXDEV if rp and vd are on different mounts, and EPERM if
+// vd is a directory.
+func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	dirVFSD, dirInode, err := walkParentDirLocked(rp)
+	if err != nil {
+		return err
+	}
+	name, err := checkCreateLocked(rp, dirVFSD, dirInode)
+	if err != nil {
+		return err
+	}
+	mnt := rp.Mount()
+	if mnt != vd.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	target := vd.Dentry().Impl().(*Dentry).inode
+	if target.isDir() {
+		return syserror.EPERM
+	}
+	// Account for the new name, then publish a fresh Dentry for the same
+	// inode in both the VFS tree and the directory's iteration list.
+	target.incLinksLocked()
+	link := fs.newDentry(target)
+	dirVFSD.InsertChild(&link.vfsd, name)
+	dirInode.impl.(*directory).childList.PushBack(link)
+	return nil
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+//
+// It creates a new directory named by the final component of rp, with mode
+// opts.Mode and the credentials of rp. It fails with EEXIST if rp has no
+// components left, and otherwise with whatever the parent walk, the creation
+// check or the mount's write check reports.
+func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	dirVFSD, dirInode, err := walkParentDirLocked(rp)
+	if err != nil {
+		return err
+	}
+	name, err := checkCreateLocked(rp, dirVFSD, dirInode)
+	if err != nil {
+		return err
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	newInode := fs.newDirectory(rp.Credentials(), opts.Mode)
+	newDentry := fs.newDentry(newInode)
+	dirVFSD.InsertChild(&newDentry.vfsd, name)
+	dirInode.impl.(*directory).childList.PushBack(newDentry)
+	// The new directory's ".." entry is a link to the parent.
+	dirInode.incLinksLocked()
+	return nil
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+//
+// Node creation itself is not implemented yet: after the same validation
+// that the other creation operations perform (parent walk, creation check,
+// mount write check) this always returns EPERM.
+func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	dirVFSD, dirInode, err := walkParentDirLocked(rp)
+	if err != nil {
+		return err
+	}
+	if _, err := checkCreateLocked(rp, dirVFSD, dirInode); err != nil {
+		return err
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	// TODO: actually implement mknod
+	return syserror.EPERM
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+//
+// Without O_CREAT the path is resolved to an existing file under a read lock
+// and opened. With O_CREAT the walk is performed under the write lock: the
+// parent directory of the final component is located, and the final component
+// is either created as a regular file (if absent), rejected with EEXIST (if
+// present and O_EXCL is set), followed (if it is a symlink that rp says should
+// be followed), or opened as an existing file.
+func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ // Filter out flags that are not supported by memfs. O_DIRECTORY and
+ // O_NOFOLLOW have no effect here (they're handled by VFS by setting
+ // appropriate bits in rp), but are returned by
+ // FileDescriptionImpl.StatusFlags().
+ opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW
+
+ // Fast path: nothing can be created, so a read lock is sufficient.
+ if opts.Flags&linux.O_CREAT == 0 {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ vfsd, inode, err := walkExistingLocked(rp)
+ if err != nil {
+ return nil, err
+ }
+ return inode.open(rp, vfsd, opts.Flags, false)
+ }
+
+ // O_EXCL together with O_CREAT means the file must not already exist.
+ mustCreate := opts.Flags&linux.O_EXCL != 0
+ vfsd := rp.Start()
+ inode := vfsd.Impl().(*Dentry).inode
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ // No path components left: the start dentry itself is the target, so there
+ // is nothing to create.
+ if rp.Done() {
+ // FIXME: ???
+ if rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ return inode.open(rp, vfsd, opts.Flags, false)
+ }
+afterTrailingSymlink:
+ // Walk to the parent directory of the last path component.
+ for !rp.Final() {
+ var err error
+ vfsd, inode, err = stepLocked(rp, vfsd, inode)
+ if err != nil {
+ return nil, err
+ }
+ }
+ if !inode.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ // Check for search permission in the parent directory.
+ if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+ return nil, err
+ }
+ // Reject attempts to open directories with O_CREAT.
+ if rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ pc := rp.Component()
+ if pc == "." || pc == ".." {
+ return nil, syserror.EISDIR
+ }
+ // Determine whether or not we need to create a file.
+ childVFSD, err := rp.ResolveChild(vfsd, pc)
+ if err != nil {
+ return nil, err
+ }
+ if childVFSD == nil {
+ // Already checked for searchability above; now check for writability.
+ if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+ return nil, err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return nil, err
+ }
+ defer rp.Mount().EndWrite()
+ // Create and open the child.
+ childInode := fs.newRegularFile(rp.Credentials(), opts.Mode)
+ child := fs.newDentry(childInode)
+ vfsd.InsertChild(&child.vfsd, pc)
+ inode.impl.(*directory).childList.PushBack(child)
+ // afterCreate == true: open skips its permission check for the file
+ // that was just created.
+ return childInode.open(rp, &child.vfsd, opts.Flags, true)
+ }
+ // Open existing file or follow symlink.
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ childInode := childVFSD.Impl().(*Dentry).inode
+ // Note: the local name symlink shadows the symlink type inside this if
+ // statement.
+ if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+ // TODO: symlink traversals update access time
+ if err := rp.HandleSymlink(symlink.target); err != nil {
+ return nil, err
+ }
+ // rp.Final() may no longer be true since we now need to resolve the
+ // symlink target.
+ goto afterTrailingSymlink
+ }
+ return childInode.open(rp, childVFSD, opts.Flags, false)
+}
+
+// open creates a FileDescription for inode i, reached through vfsd on
+// rp.Mount(), using the given open flags.
+//
+// Unless afterCreate is true, the caller's credentials are first checked
+// against the access types implied by flags. Regular files yield a
+// regularFileFD (truncated first if O_TRUNC is set), directories yield a
+// directoryFD (EISDIR if write access was requested), and symlinks cannot be
+// opened (ELOOP). Any other inode implementation causes a panic.
+func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
+ ats := vfs.AccessTypesForOpenFlags(flags)
+ if !afterCreate {
+ if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil {
+ return nil, err
+ }
+ }
+ switch impl := i.impl.(type) {
+ case *regularFile:
+ var fd regularFileFD
+ fd.flags = flags
+ fd.readable = vfs.MayReadFileWithOpenFlags(flags)
+ fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
+ if fd.writable {
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return nil, err
+ }
+ // Mount.EndWrite() is called by regularFileFD.Release().
+ }
+ fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+ if flags&linux.O_TRUNC != 0 {
+ // Drop the file's contents: the data slice is cut to length zero and
+ // dataLen is reset with an atomic store, both under impl.mu.
+ impl.mu.Lock()
+ impl.data = impl.data[:0]
+ atomic.StoreInt64(&impl.dataLen, 0)
+ impl.mu.Unlock()
+ }
+ return &fd.vfsfd, nil
+ case *directory:
+ // Can't open directories writably.
+ if ats&vfs.MayWrite != 0 {
+ return nil, syserror.EISDIR
+ }
+ var fd directoryFD
+ fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+ fd.flags = flags
+ return &fd.vfsfd, nil
+ case *symlink:
+ // Can't open symlinks without O_PATH (which is unimplemented).
+ return nil, syserror.ELOOP
+ default:
+ panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+ }
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+//
+// It resolves rp to an existing file under a read lock and returns the
+// file's symlink target, or EINVAL if the file is not a symlink.
+func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	fs.mu.RLock()
+	_, inode, err := walkExistingLocked(rp)
+	fs.mu.RUnlock()
+	if err != nil {
+		return "", err
+	}
+	if link, isLink := inode.impl.(*symlink); isLink {
+		return link.target, nil
+	}
+	return "", syserror.EINVAL
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+//
+// Renaming is not implemented yet. RenameAt resolves the parent directory of
+// rp, runs checkCreateLocked and Mount.CheckBeginWrite, and returns the first
+// error any of those produce; if they all succeed it fails with EPERM.
+func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+ if rp.Done() {
+ // FIXME
+ // NOTE(review): presumably rp.Done() here means there is no final path
+ // component left to name the destination — confirm ENOENT is right.
+ return syserror.ENOENT
+ }
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ parentVFSD, parentInode, err := walkParentDirLocked(rp)
+ if err != nil {
+ return err
+ }
+ // Only the error is used; the first result of checkCreateLocked is
+ // discarded.
+ _, err = checkCreateLocked(rp, parentVFSD, parentInode)
+ if err != nil {
+ return err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+ // TODO: actually implement RenameAt
+ return syserror.EPERM
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+//
+// It resolves rp to an existing dentry while holding the filesystem lock for
+// writing. It fails with ENOTDIR if the inode is not a directory and with
+// ENOTEMPTY if the dentry still has children. On success the dentry is
+// deleted from the VFS and one reference on the inode is dropped.
+func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ vfsd, inode, err := walkExistingLocked(rp)
+ if err != nil {
+ return err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+ if err := checkDeleteLocked(vfsd); err != nil {
+ return err
+ }
+ if !inode.isDir() {
+ return syserror.ENOTDIR
+ }
+ if vfsd.HasChildren() {
+ return syserror.ENOTEMPTY
+ }
+ if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+ return err
+ }
+ // NOTE(review): nothing here removes the dentry from the parent
+ // directory's childList (SymlinkAt appends to it on creation) — confirm
+ // that this is handled elsewhere.
+ // Drop the reference held on behalf of the filesystem tree (see the
+ // comment on Inode.refs).
+ inode.decRef()
+ return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+//
+// The path must resolve to an existing inode. An empty stat mask is a
+// successful no-op; any actual attribute change is rejected with EPERM
+// because Inode.setStat does not exist yet.
+func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	fs.mu.RLock()
+	_, _, walkErr := walkExistingLocked(rp)
+	fs.mu.RUnlock()
+	switch {
+	case walkErr != nil:
+		return walkErr
+	case opts.Stat.Mask == 0:
+		return nil
+	default:
+		// TODO: implement Inode.setStat
+		return syserror.EPERM
+	}
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+//
+// It resolves rp under the filesystem read lock and fills a linux.Statx from
+// the resulting inode. On a resolution error a zero Statx is returned
+// together with the error.
+func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	fs.mu.RLock()
+	_, target, err := walkExistingLocked(rp)
+	fs.mu.RUnlock()
+
+	var out linux.Statx
+	if err != nil {
+		return out, err
+	}
+	target.statTo(&out)
+	return out, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+//
+// statfs is not implemented: once rp has been resolved successfully the call
+// fails with ENOSYS. A resolution error takes precedence.
+func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	fs.mu.RLock()
+	_, _, err := walkExistingLocked(rp)
+	fs.mu.RUnlock()
+	if err == nil {
+		// TODO: actually implement statfs
+		err = syserror.ENOSYS
+	}
+	return linux.Statfs{}, err
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+//
+// It creates a new symlink inode pointing at target, owned by
+// rp.Credentials(), and links it into the parent directory under the name
+// returned by checkCreateLocked. The filesystem lock is held for writing
+// throughout.
+func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+ // NOTE(review): presumably rp.Done() means the path has no final
+ // component left to create, i.e. it names something that already exists
+ // — confirm.
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ parentVFSD, parentInode, err := walkParentDirLocked(rp)
+ if err != nil {
+ return err
+ }
+ pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+ if err != nil {
+ return err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+ child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
+ // The new dentry is recorded twice: in the VFS dentry tree and in the
+ // parent directory's own child list.
+ parentVFSD.InsertChild(&child.vfsd, pc)
+ parentInode.impl.(*directory).childList.PushBack(child)
+ return nil
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+//
+// It resolves rp to an existing dentry while holding the filesystem lock for
+// writing and fails with EISDIR if the inode is a directory. On success the
+// dentry is deleted from the VFS and the inode's link count is decremented.
+func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ vfsd, inode, err := walkExistingLocked(rp)
+ if err != nil {
+ return err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+ if err := checkDeleteLocked(vfsd); err != nil {
+ return err
+ }
+ if inode.isDir() {
+ return syserror.EISDIR
+ }
+ if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+ return err
+ }
+ // NOTE(review): nothing here removes the dentry from the parent
+ // directory's childList (SymlinkAt appends to it on creation) — confirm
+ // that this is handled elsewhere.
+ // decLinksLocked drops the inode's tree reference when nlink reaches 0.
+ inode.decLinksLocked()
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
new file mode 100644
index 000000000..f381e1a88
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -0,0 +1,299 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memfs provides a filesystem implementation that behaves like tmpfs:
+// the Dentry tree is the sole source of truth for the state of the filesystem.
+//
+// memfs is intended primarily to demonstrate filesystem implementation
+// patterns. Real use cases for an in-memory filesystem should use tmpfs
+// instead.
+//
+// Lock order:
+//
+// Filesystem.mu
+// regularFileFD.offMu
+// regularFile.mu
+// Inode.mu
+package memfs
+
+import (
+ "fmt"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FilesystemType implements vfs.FilesystemType. It carries no state; its
+// NewFilesystem method builds a fresh Filesystem on every call.
+type FilesystemType struct{}
+
+// Filesystem implements vfs.FilesystemImpl.
+type Filesystem struct {
+ // vfsfs is the vfs.Filesystem that this Filesystem is the implementation
+ // of; NewFilesystem initializes it with a pointer back to this struct.
+ vfsfs vfs.Filesystem
+
+ // mu serializes changes to the Dentry tree.
+ mu sync.RWMutex
+
+ // nextInoMinusOne is incremented by Inode.init, which uses the incremented
+ // value as the new inode's number; a zero-valued Filesystem therefore
+ // assigns inode numbers starting at 1.
+ nextInoMinusOne uint64 // accessed using atomic memory operations
+}
+
+// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
+func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ var fs Filesystem
+ fs.vfsfs.Init(&fs)
+ root := fs.newDentry(fs.newDirectory(creds, 01777))
+ return &fs.vfsfs, &root.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release. It is a no-op: the
+// Filesystem struct holds nothing that needs explicit cleanup.
+func (fs *Filesystem) Release() {
+}
+
+// Sync implements vfs.FilesystemImpl.Sync. It always succeeds without doing
+// any work.
+func (fs *Filesystem) Sync(ctx context.Context) error {
+ // All filesystem state is in-memory.
+ return nil
+}
+
+// Dentry implements vfs.DentryImpl.
+type Dentry struct {
+ // vfsd is the VFS-level dentry; newDentry initializes it with this Dentry
+ // as its implementation.
+ vfsd vfs.Dentry
+
+ // inode is the inode represented by this Dentry. Multiple Dentries may
+ // share a single non-directory Inode (with hard links). inode is
+ // immutable.
+ inode *Inode
+
+ // memfs doesn't count references on Dentries; because the Dentry tree is
+ // the sole source of truth, it is by definition always consistent with the
+ // state of the filesystem. However, it does count references on Inodes,
+ // because Inode resources are released when all references are dropped.
+ // (memfs doesn't really have resources to release, but we implement
+ // reference counting because tmpfs regular files will.)
+
+ // dentryEntry (ugh) links Dentries into their parent directory.childList.
+ dentryEntry
+}
+
+// newDentry allocates a Dentry that represents inode and initializes its
+// embedded vfs.Dentry to point back at it.
+func (fs *Filesystem) newDentry(inode *Inode) *Dentry {
+	dentry := new(Dentry)
+	dentry.inode = inode
+	dentry.vfsd.Init(dentry)
+	return dentry
+}
+
+// IncRef implements vfs.DentryImpl.IncRef. Dentries are not reference-counted
+// themselves, so the reference is taken on the underlying Inode.
+func (d *Dentry) IncRef(vfsfs *vfs.Filesystem) {
+ d.inode.incRef()
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef. It reports whether a
+// reference could be taken on the underlying Inode, which fails once the
+// Inode's reference count has reached zero.
+func (d *Dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
+ return d.inode.tryIncRef()
+}
+
+// DecRef implements vfs.DentryImpl.DecRef. The reference is dropped on the
+// underlying Inode.
+func (d *Dentry) DecRef(vfsfs *vfs.Filesystem) {
+ d.inode.decRef()
+}
+
+// Inode represents a filesystem object. An Inode is embedded in the struct
+// that implements its file type (e.g. regularFile, symlink), and impl points
+// back at that struct.
+type Inode struct {
+ // refs is a reference count. refs is accessed using atomic memory
+ // operations.
+ //
+ // A reference is held on all Inodes that are reachable in the filesystem
+ // tree. For non-directories (which may have multiple hard links), this
+ // means that a reference is dropped when nlink reaches 0. For directories,
+ // nlink never reaches 0 due to the "." entry; instead,
+ // Filesystem.RmdirAt() drops the reference.
+ refs int64
+
+ // Inode metadata; protected by mu and accessed using atomic memory
+ // operations unless otherwise specified.
+ mu sync.RWMutex
+ mode uint32 // excluding file type bits, which are based on impl
+ nlink uint32 // protected by Filesystem.mu instead of Inode.mu
+ uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+ gid uint32 // auth.KGID, but stored as raw uint32 for sync/atomic
+ ino uint64 // immutable
+
+ // impl is the file-type-specific implementation; the type switches in
+ // this package expect *regularFile, *directory or *symlink.
+ impl interface{} // immutable
+}
+
+// init fills in a freshly allocated Inode: it starts with a single
+// reference, takes its owner from creds' effective IDs, and draws the next
+// inode number from fs. nlink is deliberately left for the caller to set.
+func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials, mode uint16) {
+	i.impl = impl
+	i.refs = 1
+	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
+	i.mode = uint32(mode)
+	i.uid, i.gid = uint32(creds.EffectiveKUID), uint32(creds.EffectiveKGID)
+	// i.nlink initialized by caller
+}
+
+// incLinksLocked adds one hard link to the inode. It panics if the inode had
+// no links before the call.
+//
+// Preconditions: Filesystem.mu must be locked for writing.
+func (i *Inode) incLinksLocked() {
+	if links := atomic.AddUint32(&i.nlink, 1); links > 1 {
+		return
+	}
+	panic("memfs.Inode.incLinksLocked() called with no existing links")
+}
+
+// decLinksLocked removes one hard link from the inode. When the last link
+// goes away the inode's tree reference is dropped; it panics if there was no
+// link to remove.
+//
+// Preconditions: Filesystem.mu must be locked for writing.
+func (i *Inode) decLinksLocked() {
+	// Adding ^uint32(0) is the atomic way to subtract one from a uint32.
+	switch atomic.AddUint32(&i.nlink, ^uint32(0)) {
+	case 0:
+		i.decRef()
+	case ^uint32(0): // negative overflow
+		panic("memfs.Inode.decLinksLocked() called with no existing links")
+	}
+}
+
+// incRef takes an additional reference on the inode. The caller must already
+// hold one; incRef panics if the count was not positive beforehand.
+func (i *Inode) incRef() {
+	if held := atomic.AddInt64(&i.refs, 1); held > 1 {
+		return
+	}
+	panic("memfs.Inode.incRef() called without holding a reference")
+}
+
+// tryIncRef takes a reference on the inode unless its reference count is
+// zero, and reports whether a reference was taken. The compare-and-swap is
+// retried until it succeeds or the count is observed to be zero.
+func (i *Inode) tryIncRef() bool {
+	for cur := atomic.LoadInt64(&i.refs); cur != 0; cur = atomic.LoadInt64(&i.refs) {
+		if atomic.CompareAndSwapInt64(&i.refs, cur, cur+1) {
+			return true
+		}
+	}
+	return false
+}
+
+// decRef drops one reference on the inode. When the last reference goes away
+// and the inode is a regular file, its data is released. decRef panics if
+// the count becomes negative.
+func (i *Inode) decRef() {
+	remaining := atomic.AddInt64(&i.refs, -1)
+	switch {
+	case remaining < 0:
+		panic("memfs.Inode.decRef() called without holding a reference")
+	case remaining > 0:
+		return
+	}
+	// This is unnecessary; it's mostly to simulate what tmpfs would do.
+	file, isRegular := i.impl.(*regularFile)
+	if !isRegular {
+		return
+	}
+	file.mu.Lock()
+	file.data = nil
+	atomic.StoreInt64(&file.dataLen, 0)
+	file.mu.Unlock()
+}
+
+// checkPermissions reports whether creds may access the inode in the ways
+// described by ats. It snapshots the inode's mode, owner and group with
+// atomic loads and defers the decision to vfs.GenericCheckPermissions.
+func (i *Inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
+	mode := uint16(atomic.LoadUint32(&i.mode))
+	kuid := auth.KUID(atomic.LoadUint32(&i.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&i.gid))
+	return vfs.GenericCheckPermissions(creds, ats, isDir, mode, kuid, kgid)
+}
+
+// statTo writes the inode's attributes into stat.
+//
+// stat is an output parameter rather than a return value because Go won't
+// inline this function, and returning the (quite big) linux.Statx by value
+// would mean spending a lot of time in runtime.duffcopy().
+//
+// Size and block count are reported for regular files and symlinks only.
+// statTo panics if the inode has an unknown implementation type.
+func (i *Inode) statTo(stat *linux.Statx) {
+	// Attributes common to every file type.
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+	stat.Blksize = 1 // usermem.PageSize in tmpfs
+	stat.Nlink = atomic.LoadUint32(&i.nlink)
+	stat.UID = atomic.LoadUint32(&i.uid)
+	stat.GID = atomic.LoadUint32(&i.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
+	stat.Ino = i.ino
+	// TODO: device number
+
+	// Work out the type-specific parts first, then apply them in one place.
+	var (
+		typeBits uint16
+		size     uint64
+		hasSize  bool
+	)
+	switch impl := i.impl.(type) {
+	case *regularFile:
+		// In tmpfs, the block count will be FileRangeSet.Span() / 512 (but
+		// also cached in a uint64 accessed using atomic memory operations to
+		// avoid taking locks).
+		typeBits, size, hasSize = linux.S_IFREG, uint64(atomic.LoadInt64(&impl.dataLen)), true
+	case *directory:
+		typeBits = linux.S_IFDIR
+	case *symlink:
+		typeBits, size, hasSize = linux.S_IFLNK, uint64(len(impl.target)), true
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+
+	stat.Mode |= typeBits
+	if hasSize {
+		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
+		stat.Size = size
+		stat.Blocks = allocatedBlocksForSize(size)
+	}
+}
+
+// allocatedBlocksForSize returns the number of 512B blocks needed to
+// accommodate the given size in bytes, as appropriate for struct
+// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
+// size is independent of the "preferred block size for I/O", struct
+// stat::st_blksize and struct statx::stx_blksize.)
+func allocatedBlocksForSize(size uint64) uint64 {
+ return (size + 511) / 512
+}
+
+// direntType returns the linux.DT_* constant that matches the inode's
+// implementation type. It panics on an unknown implementation type.
+func (i *Inode) direntType() uint8 {
+	var dt uint8
+	switch i.impl.(type) {
+	case *regularFile:
+		dt = linux.DT_REG
+	case *directory:
+		dt = linux.DT_DIR
+	case *symlink:
+		dt = linux.DT_LNK
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+	return dt
+}
+
+// fileDescription is embedded by memfs implementations of
+// vfs.FileDescriptionImpl. It holds the state and methods those
+// implementations have in common.
+type fileDescription struct {
+ // vfsfd is the VFS-level file description that the embedding type
+ // implements.
+ vfsfd vfs.FileDescription
+
+ flags uint32 // status flags; immutable
+}
+
+// filesystem returns the memfs Filesystem behind the mount that this file
+// description was opened on.
+func (fd *fileDescription) filesystem() *Filesystem {
+	mount := fd.vfsfd.VirtualDentry().Mount()
+	impl := mount.Filesystem().Impl()
+	return impl.(*Filesystem)
+}
+
+// inode returns the Inode of the dentry that this file description refers
+// to.
+func (fd *fileDescription) inode() *Inode {
+	vfsDentry := fd.vfsfd.VirtualDentry().Dentry()
+	dentry := vfsDentry.Impl().(*Dentry)
+	return dentry.inode
+}
+
+// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. It returns the
+// flags recorded when the file description was opened and never fails.
+func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
+ return fd.flags, nil
+}
+
+// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. The
+// requested flags are ignored and the call always succeeds.
+func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
+ // None of the flags settable by fcntl(F_SETFL) are supported, so this is a
+ // no-op.
+ return nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ var stat linux.Statx
+ fd.inode().statTo(&stat)
+ return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat. An empty stat mask is
+// a successful no-op; any actual attribute change is rejected with EPERM
+// because Inode.setStat does not exist yet.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask != 0 {
+		// TODO: implement Inode.setStat
+		return syserror.EPERM
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go
new file mode 100644
index 000000000..4a3603cc8
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/regular_file.go
@@ -0,0 +1,155 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memfs
+
+import (
+ "io"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// regularFile is the implementation of a regular file: an Inode plus the
+// file contents held in a byte slice.
+type regularFile struct {
+ inode Inode
+
+ // mu protects data.
+ mu sync.RWMutex
+ data []byte
+ // dataLen is len(data), but accessed using atomic memory operations to
+ // avoid locking in Inode.statTo().
+ dataLen int64
+}
+
+// newRegularFile creates an empty regular file with the given owner
+// credentials and permission bits and returns its Inode. The inode starts
+// with a link count of 1, accounting for its entry in the parent directory.
+func (fs *Filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *Inode {
+	file := new(regularFile)
+	inode := &file.inode
+	inode.init(file, fs, creds, mode)
+	inode.nlink = 1 // from parent directory
+	return inode
+}
+
+// regularFileFD is the vfs.FileDescriptionImpl for an open regular file.
+type regularFileFD struct {
+ fileDescription
+ vfs.FileDescriptionDefaultImpl
+
+ // These are immutable.
+ readable bool
+ writable bool
+
+ // off is the file offset. offMu serializes operations that may mutate off.
+ // NOTE(review): off was documented as accessed using atomic memory
+ // operations, but Read, Write and Seek access it directly while holding
+ // offMu — confirm which is intended.
+ off int64
+ offMu sync.Mutex
+}
+
+// Release implements vfs.FileDescriptionImpl.Release. For a writable file
+// description it ends the mount write that was begun when the file was
+// opened; read-only descriptions have nothing to release.
+func (fd *regularFileFD) Release() {
+	if !fd.writable {
+		return
+	}
+	mount := fd.vfsfd.VirtualDentry().Mount()
+	mount.EndWrite()
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+//
+// It copies file data starting at offset into dst and returns the number of
+// bytes copied together with any error from the copy. It fails with EINVAL
+// if the file description was not opened for reading or if offset is
+// negative, and returns io.EOF if offset is at or beyond the end of the
+// file. The file offset of the description is not used or modified.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if !fd.readable {
+		return 0, syserror.EINVAL
+	}
+	// Reject negative offsets up front, as PWrite does; slicing the file
+	// data with a negative index below would panic.
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	f := fd.inode().impl.(*regularFile)
+	f.mu.RLock()
+	if offset >= int64(len(f.data)) {
+		f.mu.RUnlock()
+		return 0, io.EOF
+	}
+	n, err := dst.CopyOut(ctx, f.data[offset:])
+	f.mu.RUnlock()
+	return int64(n), err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read. It reads at the current file
+// offset and advances the offset by the number of bytes read, holding offMu
+// for the whole operation.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.offMu.Lock()
+	start := fd.off
+	nread, err := fd.PRead(ctx, dst, start, opts)
+	fd.off = start + nread
+	fd.offMu.Unlock()
+	return nread, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+//
+// It copies src into the file at offset, first extending the file with zero
+// bytes if the write ends beyond the current end of file. It fails with
+// EINVAL if the file description is not writable or offset is negative, and
+// with EFBIG if offset plus the source length overflows. A zero-length
+// source is a successful no-op.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if !fd.writable || offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	count := src.NumBytes()
+	if count == 0 {
+		return 0, nil
+	}
+	file := fd.inode().impl.(*regularFile)
+	file.mu.Lock()
+	end := offset + count
+	if end < offset {
+		// Overflow.
+		file.mu.Unlock()
+		return 0, syserror.EFBIG
+	}
+	// Grow the file with zeroes up to the end of the write if needed.
+	if grow := end - file.dataLen; grow > 0 {
+		file.data = append(file.data, make([]byte, grow)...)
+		atomic.StoreInt64(&file.dataLen, end)
+	}
+	written, err := src.CopyIn(ctx, file.data[offset:end])
+	file.mu.Unlock()
+	return int64(written), err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write. It writes at the current
+// file offset and advances the offset by the number of bytes written,
+// holding offMu for the whole operation.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.offMu.Lock()
+	start := fd.off
+	nwritten, err := fd.PWrite(ctx, src, start, opts)
+	fd.off = start + nwritten
+	fd.offMu.Unlock()
+	return nwritten, err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.offMu.Lock()
+ defer fd.offMu.Unlock()
+ switch whence {
+ case linux.SEEK_SET:
+ // use offset as specified
+ case linux.SEEK_CUR:
+ offset += fd.off
+ case linux.SEEK_END:
+ offset += atomic.LoadInt64(&fd.inode().impl.(*regularFile).dataLen)
+ default:
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ fd.off = offset
+ return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync. It always succeeds without
+// doing any work.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/memfs/symlink.go
new file mode 100644
index 000000000..e002d1727
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/symlink.go
@@ -0,0 +1,36 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memfs
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// symlink is the implementation of a symbolic link: an Inode plus the path
+// the link points to.
+type symlink struct {
+ inode Inode
+ target string // immutable
+}
+
+// newSymlink creates a symlink pointing at target, owned by creds and with
+// mode 0777, and returns its Inode. The inode starts with a link count of 1,
+// accounting for its entry in the parent directory.
+func (fs *Filesystem) newSymlink(creds *auth.Credentials, target string) *Inode {
+	link := new(symlink)
+	link.target = target
+	inode := &link.inode
+	inode.init(link, fs, creds, 0777)
+	inode.nlink = 1 // from parent directory
+	return inode
+}
+
+// O_PATH is unimplemented, so there's no way to get a FileDescription
+// representing a symlink yet.