summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fsimpl/tmpfs
diff options
context:
space:
mode:
authorNicolas Lacasse <nlacasse@google.com>2020-01-06 12:51:35 -0800
committergVisor bot <gvisor-bot@google.com>2020-01-06 12:52:55 -0800
commit51f3ab85e024fcd74c49d273ce5202a207577d31 (patch)
tree8d4e6ba070203203060a1d185b4321b89f2e8b05 /pkg/sentry/fsimpl/tmpfs
parent354a15a234c1270bcb9b902503f61835b2ccd2d0 (diff)
Convert memfs into proto-tmpfs.
- Renamed memfs to tmpfs. - Copied fileRangeSet bits from fs/fsutil/ to fsimpl/tmpfs/ - Changed tmpfs to be backed by filemem instead of byte slice. - regularFileReadWriter uses a sync.Pool, similar to gofer client. PiperOrigin-RevId: 288356380
Diffstat (limited to 'pkg/sentry/fsimpl/tmpfs')
-rw-r--r--pkg/sentry/fsimpl/tmpfs/BUILD92
-rw-r--r--pkg/sentry/fsimpl/tmpfs/benchmark_test.go487
-rw-r--r--pkg/sentry/fsimpl/tmpfs/directory.go187
-rw-r--r--pkg/sentry/fsimpl/tmpfs/filesystem.go698
-rw-r--r--pkg/sentry/fsimpl/tmpfs/named_pipe.go60
-rw-r--r--pkg/sentry/fsimpl/tmpfs/pipe_test.go235
-rw-r--r--pkg/sentry/fsimpl/tmpfs/regular_file.go357
-rw-r--r--pkg/sentry/fsimpl/tmpfs/regular_file_test.go224
-rw-r--r--pkg/sentry/fsimpl/tmpfs/symlink.go36
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs.go299
10 files changed, 2675 insertions, 0 deletions
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
new file mode 100644
index 000000000..a5b285987
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -0,0 +1,92 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+# Generates dentry_list.go in package tmpfs by instantiating the generic list
+# template from //pkg/ilist with *dentry as both the element and linker type.
+go_template_instance(
+ name = "dentry_list",
+ out = "dentry_list.go",
+ package = "tmpfs",
+ prefix = "dentry",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*dentry",
+ "Linker": "*dentry",
+ },
+)
+
+# The tmpfs filesystem implementation library. srcs includes dentry_list.go,
+# which is the output of the dentry_list template instance.
+go_library(
+ name = "tmpfs",
+ srcs = [
+ "dentry_list.go",
+ "directory.go",
+ "filesystem.go",
+ "named_pipe.go",
+ "regular_file.go",
+ "symlink.go",
+ "tmpfs.go",
+ ],
+ importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs",
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/amutex",
+ "//pkg/fspath",
+ "//pkg/log",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/context",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/pipe",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/pgalloc",
+ "//pkg/sentry/platform",
+ "//pkg/sentry/safemem",
+ "//pkg/sentry/usage",
+ "//pkg/sentry/usermem",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ ],
+)
+
+# Stat benchmarks. benchmark_test.go declares its own package (benchmark_test),
+# so it depends on :tmpfs instead of embedding it. It also depends on
+# //pkg/sentry/fs/tmpfs, which the VFS1 benchmarks use.
+go_test(
+ name = "benchmark_test",
+ size = "small",
+ srcs = ["benchmark_test.go"],
+ deps = [
+ ":tmpfs",
+ "//pkg/abi/linux",
+ "//pkg/fspath",
+ "//pkg/refs",
+ "//pkg/sentry/context",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/tmpfs",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ ],
+)
+
+# Unit tests for named pipes and regular files. The :tmpfs library is embedded
+# rather than listed in deps.
+go_test(
+ name = "tmpfs_test",
+ size = "small",
+ srcs = [
+ "pipe_test.go",
+ "regular_file_test.go",
+ ],
+ embed = [":tmpfs"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/fspath",
+ "//pkg/sentry/context",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/contexttest",
+ "//pkg/sentry/usermem",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
new file mode 100644
index 000000000..d88c83499
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -0,0 +1,487 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package benchmark_test
+
+import (
+ "fmt"
+ "runtime"
+ "strings"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Differences from stat_benchmark:
+//
+// - Syscall interception, CopyInPath, copyOutStat, and overlayfs overheads are
+// not included.
+//
+// - *MountStat benchmarks use a tmpfs root mount and a tmpfs submount at /tmp.
+// Non-MountStat benchmarks use a tmpfs root mount and no submounts.
+// stat_benchmark uses a varying root mount, a tmpfs submount at /tmp, and a
+// subdirectory /tmp/<top_dir> (assuming TEST_TMPDIR == "/tmp"). Thus
+// stat_benchmark at depth 1 does a comparable amount of work to *MountStat
+// benchmarks at depth 2, and non-MountStat benchmarks at depth 3.
+//
+// depths lists the directory nesting depths at which every benchmark in this
+// file is run, one sub-benchmark per entry.
+var depths = []int{1, 2, 3, 8, 64, 100}
+
+const (
+ // mountPointName is the directory under the root mount on which the
+ // *MountStat benchmarks mount their tmpfs submount.
+ mountPointName = "tmp"
+ // filename is the name of the file that each benchmark creates and stats.
+ filename = "gvisor_test_temp_0_1557494568"
+)
+
+// fileOpOn looks up path in mntns and invokes fn on the resulting Dirent.
+//
+// It is copied from syscalls/linux/sys_file.go, with the dependency on
+// kernel.Task stripped out. Absolute paths are looked up without a relative
+// directory. Relative paths are looked up from wd, and only when dirFD is
+// linux.AT_FDCWD; any other dirFD yields EBADF. If resolve is true the lookup
+// uses FindInode, otherwise FindLink. The reference on the Dirent returned by
+// the lookup is dropped after fn returns, and fn's error is returned.
+func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error {
+	// rel is the directory that relative lookups start from; it stays nil for
+	// absolute paths.
+	var rel *fs.Dirent
+	switch {
+	case len(path) > 0 && path[0] == '/':
+		// Absolute path; no relative directory is needed.
+	case dirFD == linux.AT_FDCWD:
+		// Relative to the working directory.
+		rel = wd
+	default:
+		// Extracting an arbitrary FD is not supported here.
+		return syserror.EBADF
+	}
+
+	// Look up the node.
+	var (
+		target *fs.Dirent
+		err    error
+	)
+	remaining := uint(linux.MaxSymlinkTraversals)
+	if !resolve {
+		target, err = mntns.FindLink(ctx, root, rel, path, &remaining)
+	} else {
+		target, err = mntns.FindInode(ctx, root, rel, path, &remaining)
+	}
+	if err != nil {
+		return err
+	}
+
+	result := fn(root, target)
+	target.DecRef()
+	return result
+}
+
+// BenchmarkVFS1TmpfsStat measures the cost of stat-ing a file through the VFS1
+// tmpfs implementation. For each entry in depths it builds a chain of that
+// many nested directories under a tmpfs root mount and creates a regular file
+// in the innermost one. It then repeatedly resolves the file's absolute path
+// and reads its unstable attributes.
+func BenchmarkVFS1TmpfsStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+
+			// Create VFS.
+			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
+			if !ok {
+				b.Fatalf("failed to find tmpfs filesystem type")
+			}
+			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			mntns, err := fs.NewMountNamespace(ctx, rootInode)
+			if err != nil {
+				b.Fatalf("failed to create mount namespace: %v", err)
+			}
+			defer mntns.DecRef()
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			root := mntns.Root()
+			defer root.DecRef()
+			d := root
+			d.IncRef()
+			// d is reassigned while descending, so the deferred call must read
+			// d when it runs. A plain `defer d.DecRef()` would bind the
+			// receiver to root here, dropping root's reference one time too
+			// many and leaking the reference on the innermost directory.
+			defer func() { d.DecRef() }()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				next, err := d.Walk(ctx, root, name)
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				d.DecRef()
+				d = next
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			file.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			dirPath := false
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+						return syserror.ENOTDIR
+					}
+					uattr, err := d.Inode.UnstableAttr(ctx)
+					if err != nil {
+						return err
+					}
+					// Sanity check: the file was created with mode 0644, so
+					// the owner-execute bit must not be set.
+					if uattr.Perms.User.Execute {
+						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
+					}
+					return nil
+				})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
+		})
+	}
+}
+
+// BenchmarkVFS2MemfsStat measures vfsObj.StatAt on a file nested below the
+// root of a VFS2 tmpfs mount. For each entry in depths it creates that many
+// nested directories, creates a regular file in the innermost one, and then
+// repeatedly stats the file by absolute path.
+//
+// NOTE(review): the name still says "Memfs" although the benchmark registers
+// and mounts tmpfs.FilesystemType; presumably a leftover from the memfs ->
+// tmpfs rename.
+func BenchmarkVFS2MemfsStat(b *testing.B) {
+ for _, depth := range depths {
+ b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+ ctx := contexttest.Context(b)
+ creds := auth.CredentialsFromContext(ctx)
+
+ // Create VFS.
+ vfsObj := vfs.New()
+ vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ if err != nil {
+ b.Fatalf("failed to create tmpfs root mount: %v", err)
+ }
+ defer mntns.DecRef(vfsObj)
+
+ var filePathBuilder strings.Builder
+ filePathBuilder.WriteByte('/')
+
+ // Create nested directories with given depth.
+ root := mntns.Root()
+ defer root.DecRef()
+ // vd tracks the current directory while descending. It takes its own
+ // reference, which is dropped each time vd is replaced below.
+ vd := root
+ vd.IncRef()
+ for i := depth; i > 0; i-- {
+ name := fmt.Sprintf("%d", i)
+ pop := vfs.PathOperation{
+ Root: root,
+ Start: vd,
+ Path: fspath.Parse(name),
+ }
+ if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+ Mode: 0755,
+ }); err != nil {
+ b.Fatalf("failed to create directory %q: %v", name, err)
+ }
+ nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+ if err != nil {
+ b.Fatalf("failed to walk to directory %q: %v", name, err)
+ }
+ vd.DecRef()
+ vd = nextVD
+ filePathBuilder.WriteString(name)
+ filePathBuilder.WriteByte('/')
+ }
+
+ // Create the file that will be stat'd.
+ fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: vd,
+ Path: fspath.Parse(filename),
+ FollowFinalSymlink: true,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+ Mode: 0644,
+ })
+ // The innermost directory is no longer needed whether or not the open
+ // succeeded, so drop it before checking err.
+ vd.DecRef()
+ vd = vfs.VirtualDentry{}
+ if err != nil {
+ b.Fatalf("failed to create file %q: %v", filename, err)
+ }
+ defer fd.DecRef()
+ filePathBuilder.WriteString(filename)
+ filePath := filePathBuilder.String()
+
+ runtime.GC()
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(filePath),
+ FollowFinalSymlink: true,
+ }, &vfs.StatOptions{})
+ if err != nil {
+ b.Fatalf("stat(%q) failed: %v", filePath, err)
+ }
+ // Sanity check.
+ if stat.Mode&^linux.S_IFMT != 0644 {
+ b.Fatalf("got wrong permissions (%0o)", stat.Mode)
+ }
+ }
+ // Don't include deferred cleanup in benchmark time.
+ b.StopTimer()
+ })
+ }
+}
+
+// BenchmarkVFS1TmpfsMountStat is the VFS1 stat benchmark with a mount
+// crossing. It mounts a second tmpfs at /tmp under a tmpfs root and, for each
+// entry in depths, builds that many nested directories below the mount point
+// and creates a regular file in the innermost one. It then repeatedly
+// resolves the file's absolute path and reads its unstable attributes.
+func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+
+			// Create VFS.
+			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
+			if !ok {
+				b.Fatalf("failed to find tmpfs filesystem type")
+			}
+			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			mntns, err := fs.NewMountNamespace(ctx, rootInode)
+			if err != nil {
+				b.Fatalf("failed to create mount namespace: %v", err)
+			}
+			defer mntns.DecRef()
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create and mount the submount.
+			root := mntns.Root()
+			defer root.DecRef()
+			if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil {
+				b.Fatalf("failed to create mount point: %v", err)
+			}
+			mountPoint, err := root.Walk(ctx, root, mountPointName)
+			if err != nil {
+				b.Fatalf("failed to walk to mount point: %v", err)
+			}
+			defer mountPoint.DecRef()
+			submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs submount: %v", err)
+			}
+			if err := mntns.Mount(ctx, mountPoint, submountInode); err != nil {
+				b.Fatalf("failed to mount tmpfs submount: %v", err)
+			}
+			filePathBuilder.WriteString(mountPointName)
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			d, err := root.Walk(ctx, root, mountPointName)
+			if err != nil {
+				b.Fatalf("failed to walk to mount root: %v", err)
+			}
+			// d is reassigned while descending, so the deferred call must read
+			// d when it runs. A plain `defer d.DecRef()` would bind the
+			// receiver to the Dirent walked to above, dropping its reference
+			// twice and leaking the reference on the innermost directory.
+			defer func() { d.DecRef() }()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				next, err := d.Walk(ctx, root, name)
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				d.DecRef()
+				d = next
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			file.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			dirPath := false
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+						return syserror.ENOTDIR
+					}
+					uattr, err := d.Inode.UnstableAttr(ctx)
+					if err != nil {
+						return err
+					}
+					// Sanity check: the file was created with mode 0644, so
+					// the owner-execute bit must not be set.
+					if uattr.Perms.User.Execute {
+						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
+					}
+					return nil
+				})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
+		})
+	}
+}
+
+// BenchmarkVFS2MemfsMountStat is the VFS2 stat benchmark with a mount
+// crossing. It mounts a second tmpfs at /tmp under a tmpfs root and, for each
+// entry in depths, creates that many nested directories below the mount point
+// and a regular file in the innermost one. It then repeatedly stats the file
+// by absolute path.
+//
+// NOTE(review): the name still says "Memfs" although the benchmark registers
+// and mounts tmpfs.FilesystemType; presumably a leftover from the memfs ->
+// tmpfs rename.
+func BenchmarkVFS2MemfsMountStat(b *testing.B) {
+ for _, depth := range depths {
+ b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+ ctx := contexttest.Context(b)
+ creds := auth.CredentialsFromContext(ctx)
+
+ // Create VFS.
+ vfsObj := vfs.New()
+ vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ if err != nil {
+ b.Fatalf("failed to create tmpfs root mount: %v", err)
+ }
+ defer mntns.DecRef(vfsObj)
+
+ var filePathBuilder strings.Builder
+ filePathBuilder.WriteByte('/')
+
+ // Create the mount point.
+ root := mntns.Root()
+ defer root.DecRef()
+ // pop names the mount point relative to root. It is reused below for
+ // the mkdir, both dentry lookups and the mount itself.
+ pop := vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(mountPointName),
+ }
+ if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+ Mode: 0755,
+ }); err != nil {
+ b.Fatalf("failed to create mount point: %v", err)
+ }
+ // Save the mount point for later use.
+ mountPoint, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+ if err != nil {
+ b.Fatalf("failed to walk to mount point: %v", err)
+ }
+ defer mountPoint.DecRef()
+ // Create and mount the submount.
+ if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
+ b.Fatalf("failed to mount tmpfs submount: %v", err)
+ }
+ filePathBuilder.WriteString(mountPointName)
+ filePathBuilder.WriteByte('/')
+
+ // Create nested directories with given depth.
+ vd, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+ if err != nil {
+ b.Fatalf("failed to walk to mount root: %v", err)
+ }
+ for i := depth; i > 0; i-- {
+ name := fmt.Sprintf("%d", i)
+ // This pop shadows the mount-point pop for the loop body only.
+ pop := vfs.PathOperation{
+ Root: root,
+ Start: vd,
+ Path: fspath.Parse(name),
+ }
+ if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+ Mode: 0755,
+ }); err != nil {
+ b.Fatalf("failed to create directory %q: %v", name, err)
+ }
+ nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+ if err != nil {
+ b.Fatalf("failed to walk to directory %q: %v", name, err)
+ }
+ vd.DecRef()
+ vd = nextVD
+ filePathBuilder.WriteString(name)
+ filePathBuilder.WriteByte('/')
+ }
+
+ // Verify that we didn't create any directories under the mount
+ // point (i.e. they were all created on the submount).
+ firstDirName := fmt.Sprintf("%d", depth)
+ if child := mountPoint.Dentry().Child(firstDirName); child != nil {
+ b.Fatalf("created directory %q under root mount, not submount", firstDirName)
+ }
+
+ // Create the file that will be stat'd.
+ fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: vd,
+ Path: fspath.Parse(filename),
+ FollowFinalSymlink: true,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+ Mode: 0644,
+ })
+ // The innermost directory is no longer needed whether or not the open
+ // succeeded, so drop it before checking err.
+ vd.DecRef()
+ if err != nil {
+ b.Fatalf("failed to create file %q: %v", filename, err)
+ }
+ fd.DecRef()
+ filePathBuilder.WriteString(filename)
+ filePath := filePathBuilder.String()
+
+ runtime.GC()
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(filePath),
+ FollowFinalSymlink: true,
+ }, &vfs.StatOptions{})
+ if err != nil {
+ b.Fatalf("stat(%q) failed: %v", filePath, err)
+ }
+ // Sanity check.
+ if stat.Mode&^linux.S_IFMT != 0644 {
+ b.Fatalf("got wrong permissions (%0o)", stat.Mode)
+ }
+ }
+ // Don't include deferred cleanup in benchmark time.
+ b.StopTimer()
+ })
+ }
+}
+
+// init sets the refs package's leak mode to refs.NoLeakChecking for every
+// benchmark in this file.
+func init() {
+ // Turn off reference leak checking for a fair comparison between vfs1 and
+ // vfs2.
+ refs.SetLeakMode(refs.NoLeakChecking)
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
new file mode 100644
index 000000000..887ca2619
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -0,0 +1,187 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// directory is the inode implementation for tmpfs directories. It embeds its
+// inode by value; newDirectory hands out a pointer to that field.
+type directory struct {
+ inode inode
+
+ // childList is a list containing (1) child Dentries and (2) fake Dentries
+ // (with inode == nil) that represent the iteration position of
+ // directoryFDs. childList is used to support directoryFD.IterDirents()
+ // efficiently. childList is protected by filesystem.mu.
+ childList dentryList
+}
+
+// newDirectory allocates a directory, initializes its inode for fs with the
+// given credentials and mode, and returns that inode. The link count starts at
+// 2: one for "." and one for the parent directory's entry (or ".." in the case
+// of the root).
+func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode {
+	d := new(directory)
+	ino := &d.inode
+	ino.init(d, fs, creds, mode)
+	ino.nlink = 2 // from "." and parent directory or ".." for root
+	return ino
+}
+
+func (i *inode) isDir() bool {
+ _, ok := i.impl.(*directory)
+ return ok
+}
+
+// directoryFD is the file description implementation for open tmpfs
+// directories.
+type directoryFD struct {
+ fileDescription
+ vfs.DirectoryFileDescriptionDefaultImpl
+
+ // Protected by filesystem.mu.
+ //
+ // iter is a placeholder dentry (with a nil inode) marking this FD's
+ // position in the directory's childList. It is nil until IterDirents or
+ // Seek first needs it.
+ iter *dentry
+ // off is the offset of the next dirent to return: 0 is ".", 1 is "..", and
+ // children follow from 2.
+ off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+//
+// If this FD ever created an iteration placeholder, Release removes it from
+// the directory's childList under filesystem.mu and clears fd.iter.
+func (fd *directoryFD) Release() {
+	if fd.iter == nil {
+		// No placeholder was ever created; nothing to clean up.
+		return
+	}
+	fs := fd.filesystem()
+	dir := fd.inode().impl.(*directory)
+	fs.mu.Lock()
+	dir.childList.Remove(fd.iter)
+	fs.mu.Unlock()
+	fd.iter = nil
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+//
+// Entries are passed to cb in order: "." at offset 0, ".." at offset 1, then
+// the directory's children in childList order. Iteration stops as soon as
+// cb.Handle returns false; the refused entry is not consumed, so the next call
+// offers it again. IterDirents always returns nil. filesystem.mu is held for
+// writing throughout.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ fs := fd.filesystem()
+ vfsd := fd.vfsfd.VirtualDentry().Dentry()
+
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+
+ if fd.off == 0 {
+ if !cb.Handle(vfs.Dirent{
+ Name: ".",
+ Type: linux.DT_DIR,
+ Ino: vfsd.Impl().(*dentry).inode.ino,
+ NextOff: 1,
+ }) {
+ return nil
+ }
+ fd.off++
+ }
+ if fd.off == 1 {
+ parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
+ if !cb.Handle(vfs.Dirent{
+ Name: "..",
+ Type: parentInode.direntType(),
+ Ino: parentInode.ino,
+ NextOff: 2,
+ }) {
+ return nil
+ }
+ fd.off++
+ }
+
+ dir := vfsd.Impl().(*dentry).inode.impl.(*directory)
+ var child *dentry
+ if fd.iter == nil {
+ // Start iteration at the beginning of dir.
+ child = dir.childList.Front()
+ fd.iter = &dentry{}
+ } else {
+ // Continue iteration from where we left off.
+ child = fd.iter.Next()
+ dir.childList.Remove(fd.iter)
+ }
+ // At this point fd.iter is not linked into childList; every exit path below
+ // links it back in at the new position.
+ for child != nil {
+ // Skip other directoryFD iterators.
+ if child.inode != nil {
+ if !cb.Handle(vfs.Dirent{
+ Name: child.vfsd.Name(),
+ Type: child.inode.direntType(),
+ Ino: child.inode.ino,
+ NextOff: fd.off + 1,
+ }) {
+ // The callback refused this child: park the placeholder just
+ // before it so the next call resumes here.
+ dir.childList.InsertBefore(child, fd.iter)
+ return nil
+ }
+ fd.off++
+ }
+ child = child.Next()
+ }
+ // All children were consumed; park the placeholder at the end of the list.
+ dir.childList.PushBack(fd.iter)
+ return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+//
+// Only SEEK_SET and SEEK_CUR are supported; any other whence, or a negative
+// resulting offset, yields EINVAL. Offsets 0 and 1 refer to "." and "..";
+// offset n >= 2 refers to child number n-2, counting real children in
+// childList order. On success the FD's iteration placeholder is relinked at
+// the matching position and the new offset is returned.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fs := fd.filesystem()
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+
+ switch whence {
+ case linux.SEEK_SET:
+ // Use offset as given.
+ case linux.SEEK_CUR:
+ offset += fd.off
+ default:
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ // If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't
+ // seek even if doing so might reposition the iterator due to concurrent
+ // mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek().
+ if fd.off == offset {
+ return offset, nil
+ }
+
+ fd.off = offset
+ // Compensate for "." and "..".
+ remChildren := int64(0)
+ if offset >= 2 {
+ remChildren = offset - 2
+ }
+
+ dir := fd.inode().impl.(*directory)
+
+ // Ensure that fd.iter exists and is not linked into dir.childList.
+ if fd.iter == nil {
+ fd.iter = &dentry{}
+ } else {
+ dir.childList.Remove(fd.iter)
+ }
+ // Insert fd.iter before the remChildren'th child, or at the end of the
+ // list if remChildren >= number of children.
+ child := dir.childList.Front()
+ for child != nil {
+ // Skip other directoryFD iterators.
+ if child.inode != nil {
+ if remChildren == 0 {
+ dir.childList.InsertBefore(child, fd.iter)
+ return offset, nil
+ }
+ remChildren--
+ }
+ child = child.Next()
+ }
+ dir.childList.PushBack(fd.iter)
+ return offset, nil
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
new file mode 100644
index 000000000..26979729e
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -0,0 +1,698 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+//
+// It does nothing and always returns nil.
+func (fs *filesystem) Sync(ctx context.Context) error {
+ // All filesystem state is in-memory.
+ return nil
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// It returns ENOTDIR if d is not a directory, a permission error if the
+// credentials in rp lack execute (search) permission on d, and ENOENT if the
+// component has no dentry. If the component is a symlink that rp says should
+// be followed, the symlink target is handed to rp and resolution is retried
+// from d. On success rp is advanced past the component and the child dentry
+// is returned.
+//
+// stepLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+ if !d.inode.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+ return nil, err
+ }
+ // Re-entry point after a symlink has been handed to rp: d is unchanged, so
+ // the directory and permission checks above are not repeated.
+afterSymlink:
+ nextVFSD, err := rp.ResolveComponent(&d.vfsd)
+ if err != nil {
+ return nil, err
+ }
+ if nextVFSD == nil {
+ // Since the Dentry tree is the sole source of truth for tmpfs, if it's
+ // not in the Dentry tree, it doesn't exist.
+ return nil, syserror.ENOENT
+ }
+ next := nextVFSD.Impl().(*dentry)
+ if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+ // TODO: symlink traversals update access time
+ if err := rp.HandleSymlink(symlink.target); err != nil {
+ return nil, err
+ }
+ goto afterSymlink // don't check the current directory again
+ }
+ rp.Advance()
+ return next, nil
+}
+
+// walkParentDirLocked walks every component of rp except the last one,
+// beginning at d (which is usually rp.Start().Impl().(*dentry)), and returns
+// the directory it ends up in. ENOTDIR is returned if that dentry is not a
+// directory. It does not check that the returned directory is searchable by
+// the provider of rp.
+//
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
+//
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+	cur := d
+	for {
+		if rp.Final() {
+			break
+		}
+		child, err := stepLocked(rp, cur)
+		if err != nil {
+			return nil, err
+		}
+		cur = child
+	}
+	if cur.inode.isDir() {
+		return cur, nil
+	}
+	return nil, syserror.ENOTDIR
+}
+
+// resolveLocked walks all of rp, starting at rp.Start(), and returns the
+// existing file it names. ENOTDIR is returned if rp requires a directory and
+// the result is not one.
+//
+// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
+//
+// Preconditions: filesystem.mu must be locked.
+func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
+	cur := rp.Start().Impl().(*dentry)
+	for !rp.Done() {
+		child, err := stepLocked(rp, cur)
+		if err != nil {
+			return nil, err
+		}
+		cur = child
+	}
+	if !rp.MustBeDir() || cur.inode.isDir() {
+		return cur, nil
+	}
+	return nil, syserror.ENOTDIR
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create to do so.
+//
+// dir states whether the file being created is a directory; when it is false
+// and rp requires a directory, ENOENT is returned. create is called with the
+// parent directory and the final path component while filesystem.mu is held
+// for writing and between the mount's CheckBeginWrite and EndWrite. Its error
+// is returned unchanged.
+//
+// doCreateAt is loosely analogous to a conjunction of Linux's
+// fs/namei.c:filename_create() and done_path_create().
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ if err != nil {
+ return err
+ }
+ // Creating an entry requires both write and search permission on the
+ // parent directory.
+ if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+ return err
+ }
+ name := rp.Component()
+ if name == "." || name == ".." {
+ return syserror.EEXIST
+ }
+ // Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(),
+ // because if the child exists we want to return EEXIST immediately instead
+ // of attempting symlink/mount traversal.
+ if parent.vfsd.Child(name) != nil {
+ return syserror.EEXIST
+ }
+ if !dir && rp.MustBeDir() {
+ return syserror.ENOENT
+ }
+ // In tmpfs, the only way to cause a dentry to be disowned is by removing
+ // it from the filesystem, so this check is equivalent to checking if
+ // parent has been removed.
+ if parent.vfsd.IsDisowned() {
+ return syserror.ENOENT
+ }
+ mnt := rp.Mount()
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ return create(parent, name)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+//
+// It resolves rp under a read lock on filesystem.mu. If
+// opts.CheckSearchable is set, the result must be a directory on which the
+// credentials in rp have execute permission. A reference is taken on the
+// returned dentry.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+
+	target, err := resolveLocked(rp)
+	switch {
+	case err != nil:
+		return nil, err
+	case !opts.CheckSearchable:
+		// No searchability requirement; fall through to taking the reference.
+	case !target.inode.isDir():
+		return nil, syserror.ENOTDIR
+	default:
+		if err := target.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil {
+			return nil, err
+		}
+	}
+	target.IncRef()
+	return &target.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+//
+// It walks all but the final component of rp under a read lock on
+// filesystem.mu and returns the resulting directory with a reference taken.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+
+	start := rp.Start().Impl().(*dentry)
+	parent, err := walkParentDirLocked(rp, start)
+	if err != nil {
+		return nil, err
+	}
+	parent.IncRef()
+	return &parent.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+//
+// The link is refused with EXDEV if vd is on a different mount than rp, EPERM
+// if vd is a directory, ENOENT if its inode has no links left, and EMLINK if
+// its link count is already maxLinks. Otherwise the inode's link count is
+// incremented and a new dentry for the same inode is inserted under the
+// parent.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	link := func(parent *dentry, name string) error {
+		if rp.Mount() != vd.Mount() {
+			return syserror.EXDEV
+		}
+		target := vd.Dentry().Impl().(*dentry)
+		if target.inode.isDir() {
+			return syserror.EPERM
+		}
+		switch target.inode.nlink {
+		case 0:
+			return syserror.ENOENT
+		case maxLinks:
+			return syserror.EMLINK
+		}
+		target.inode.incLinksLocked()
+		child := fs.newDentry(target.inode)
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	}
+	return fs.doCreateAt(rp, false /* dir */, link)
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+//
+// It fails with EMLINK if the parent's link count is already maxLinks.
+// Otherwise the parent gains a link (for the child's "..") and a new directory
+// with mode opts.Mode is inserted under it.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	mkdir := func(parent *dentry, name string) error {
+		if parent.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+		parent.inode.incLinksLocked() // from child's ".."
+		newDir := fs.newDirectory(rp.Credentials(), opts.Mode)
+		child := fs.newDentry(newDir)
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	}
+	return fs.doCreateAt(rp, true /* dir */, mkdir)
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+ return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+ switch opts.Mode.FileType() {
+ case 0, linux.S_IFREG:
+ child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+ parent.vfsd.InsertChild(&child.vfsd, name)
+ parent.inode.impl.(*directory).childList.PushBack(child)
+ return nil
+ case linux.S_IFIFO:
+ child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
+ parent.vfsd.InsertChild(&child.vfsd, name)
+ parent.inode.impl.(*directory).childList.PushBack(child)
+ return nil
+ case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK:
+ // Not yet supported.
+ return syserror.EPERM
+ default:
+ return syserror.EINVAL
+ }
+ })
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+//
+// O_TMPFILE is not yet supported and yields EOPNOTSUPP. Without O_CREAT the
+// path is resolved under a read lock and the result is opened. With O_CREAT
+// the write lock is taken: a missing final component is created as a regular
+// file with mode opts.Mode and opened, while an existing one is opened unless
+// O_EXCL is set, in which case EEXIST is returned. Paths that must name a
+// directory, or whose final component is "." or "..", yield EISDIR when
+// O_CREAT is set.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ if opts.Flags&linux.O_TMPFILE != 0 {
+ // Not yet supported.
+ return nil, syserror.EOPNOTSUPP
+ }
+
+ // Handle O_CREAT and !O_CREAT separately, since in the latter case we
+ // don't need fs.mu for writing.
+ if opts.Flags&linux.O_CREAT == 0 {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ d, err := resolveLocked(rp)
+ if err != nil {
+ return nil, err
+ }
+ return d.open(ctx, rp, opts.Flags, false /* afterCreate */)
+ }
+
+ mustCreate := opts.Flags&linux.O_EXCL != 0
+ start := rp.Start().Impl().(*dentry)
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ // An already-exhausted path refers to the starting dentry itself.
+ if rp.Done() {
+ // Reject attempts to open directories with O_CREAT.
+ if rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ return start.open(ctx, rp, opts.Flags, false /* afterCreate */)
+ }
+ // Re-entry point when the final component was a symlink that rp followed:
+ // the walk restarts from the symlink's parent directory.
+afterTrailingSymlink:
+ parent, err := walkParentDirLocked(rp, start)
+ if err != nil {
+ return nil, err
+ }
+ // Check for search permission in the parent directory.
+ if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+ return nil, err
+ }
+ // Reject attempts to open directories with O_CREAT.
+ if rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ name := rp.Component()
+ if name == "." || name == ".." {
+ return nil, syserror.EISDIR
+ }
+ // Determine whether or not we need to create a file.
+ child, err := stepLocked(rp, parent)
+ if err == syserror.ENOENT {
+ // Already checked for searchability above; now check for writability.
+ if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+ return nil, err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return nil, err
+ }
+ defer rp.Mount().EndWrite()
+ // Create and open the child.
+ child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+ parent.vfsd.InsertChild(&child.vfsd, name)
+ parent.inode.impl.(*directory).childList.PushBack(child)
+ return child.open(ctx, rp, opts.Flags, true)
+ }
+ if err != nil {
+ return nil, err
+ }
+ // Do we need to resolve a trailing symlink?
+ if !rp.Done() {
+ start = parent
+ goto afterTrailingSymlink
+ }
+ // Open existing file.
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ return child.open(ctx, rp, opts.Flags, false)
+}
+
+// open returns a new FileDescription for the file represented by d, opened
+// with the given flags. Unless afterCreate is true, the credentials carried by
+// rp are first checked against the access types implied by flags.
+//
+// Regular files, directories, and named pipes can be opened; symlinks yield
+// ELOOP, and directories cannot be opened for writing (EISDIR).
+func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
+	accessTypes := vfs.AccessTypesForOpenFlags(flags)
+	if !afterCreate {
+		err := d.inode.checkPermissions(rp.Credentials(), accessTypes, d.inode.isDir())
+		if err != nil {
+			return nil, err
+		}
+	}
+	mount := rp.Mount()
+	switch file := d.inode.impl.(type) {
+	case *namedPipe:
+		return newNamedPipeFD(ctx, file, rp, &d.vfsd, flags)
+	case *symlink:
+		// Can't open symlinks without O_PATH (which is unimplemented).
+		return nil, syserror.ELOOP
+	case *directory:
+		// Can't open directories writably.
+		if accessTypes&vfs.MayWrite != 0 {
+			return nil, syserror.EISDIR
+		}
+		dirFD := new(directoryFD)
+		dirFD.vfsfd.Init(dirFD, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{})
+		return &dirFD.vfsfd, nil
+	case *regularFile:
+		regFD := &regularFileFD{
+			readable: vfs.MayReadFileWithOpenFlags(flags),
+			writable: vfs.MayWriteFileWithOpenFlags(flags),
+		}
+		if regFD.writable {
+			// The matching mount.EndWrite() is called by
+			// regularFileFD.Release().
+			if err := mount.CheckBeginWrite(); err != nil {
+				return nil, err
+			}
+		}
+		regFD.vfsfd.Init(regFD, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{})
+		if flags&linux.O_TRUNC != 0 {
+			// Drop all file data and reset the size under the file lock.
+			file.mu.Lock()
+			file.data.Truncate(0, file.memFile)
+			atomic.StoreUint64(&file.size, 0)
+			file.mu.Unlock()
+		}
+		return &regFD.vfsfd, nil
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
+	}
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+//
+// It returns the target of the symlink that rp resolves to, or EINVAL if the
+// resolved file is not a symlink.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	resolved, err := resolveLocked(rp)
+	if err != nil {
+		return "", err
+	}
+	if link, isSymlink := resolved.inode.impl.(*symlink); isSymlink {
+		return link.target, nil
+	}
+	return "", syserror.EINVAL
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+//
+// It renames the child named oldName of oldParentVD to the final path
+// component of rp, replacing any existing file at the destination. Any
+// non-zero opts.Flags is rejected with EINVAL, and a rename whose source and
+// destination are on different mounts fails with EXDEV. fs.mu is held for
+// writing for the duration of the call.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+ if opts.Flags != 0 {
+ // TODO(b/145974740): Support renameat2 flags.
+ return syserror.EINVAL
+ }
+
+ // Resolve newParent first to verify that it's on this Mount.
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ if err != nil {
+ return err
+ }
+ newName := rp.Component()
+ if newName == "." || newName == ".." {
+ return syserror.EBUSY
+ }
+ mnt := rp.Mount()
+ if mnt != oldParentVD.Mount() {
+ return syserror.EXDEV
+ }
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+
+ // Removing the entry from oldParent requires write and search permission
+ // on it.
+ oldParent := oldParentVD.Dentry().Impl().(*dentry)
+ if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+ return err
+ }
+ // Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(),
+ // because if the existing child is a symlink or mount point then we want
+ // to rename over it rather than follow it.
+ renamedVFSD := oldParent.vfsd.Child(oldName)
+ if renamedVFSD == nil {
+ return syserror.ENOENT
+ }
+ renamed := renamedVFSD.Impl().(*dentry)
+ if renamed.inode.isDir() {
+ // A directory cannot be moved into itself or one of its descendants.
+ if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) {
+ return syserror.EINVAL
+ }
+ if oldParent != newParent {
+ // Writability is needed to change renamed's "..".
+ if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil {
+ return err
+ }
+ }
+ } else {
+ // The source is not a directory, so neither path may require one.
+ if opts.MustBeDir || rp.MustBeDir() {
+ return syserror.ENOTDIR
+ }
+ }
+
+ // Adding the entry to newParent requires write and search permission on
+ // it.
+ if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+ return err
+ }
+ // replaced stays nil when nothing exists at the destination.
+ replacedVFSD := newParent.vfsd.Child(newName)
+ var replaced *dentry
+ if replacedVFSD != nil {
+ replaced = replacedVFSD.Impl().(*dentry)
+ if replaced.inode.isDir() {
+ // Only a directory may replace a directory, and only an empty one
+ // may be replaced.
+ if !renamed.inode.isDir() {
+ return syserror.EISDIR
+ }
+ if replaced.vfsd.HasChildren() {
+ return syserror.ENOTEMPTY
+ }
+ } else {
+ if rp.MustBeDir() {
+ return syserror.ENOTDIR
+ }
+ if renamed.inode.isDir() {
+ return syserror.ENOTDIR
+ }
+ }
+ } else {
+ // A directory moved into newParent adds a link to it (see the
+ // incLinksLocked call below), so newParent must have room for one.
+ if renamed.inode.isDir() && newParent.inode.nlink == maxLinks {
+ return syserror.EMLINK
+ }
+ }
+ if newParent.vfsd.IsDisowned() {
+ return syserror.ENOENT
+ }
+
+ // Linux places this check before some of those above; we do it here for
+ // simplicity, under the assumption that applications are not intentionally
+ // doing noop renames expecting them to succeed where non-noop renames
+ // would fail.
+ if renamedVFSD == replacedVFSD {
+ return nil
+ }
+ vfsObj := rp.VirtualFilesystem()
+ oldParentDir := oldParent.inode.impl.(*directory)
+ newParentDir := newParent.inode.impl.(*directory)
+ if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil {
+ return err
+ }
+ // Unlink the replaced file, if any, from the destination directory.
+ if replaced != nil {
+ newParentDir.childList.Remove(replaced)
+ if replaced.inode.isDir() {
+ newParent.inode.decLinksLocked() // from replaced's ".."
+ }
+ replaced.inode.decLinksLocked()
+ }
+ // Move renamed between the two directories' child lists.
+ oldParentDir.childList.Remove(renamed)
+ newParentDir.childList.PushBack(renamed)
+ if renamed.inode.isDir() {
+ // Link counts follow renamed from oldParent to newParent.
+ oldParent.inode.decLinksLocked()
+ newParent.inode.incLinksLocked()
+ }
+ // TODO: update timestamps and parent directory sizes
+ vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
+ return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+//
+// It removes the empty directory named by the final component of rp from its
+// parent. A final component of "." yields EINVAL and ".." yields ENOTEMPTY;
+// a missing child yields ENOENT, a non-directory ENOTDIR, and a directory
+// that still has children ENOTEMPTY.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	err = parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */)
+	if err != nil {
+		return err
+	}
+	name := rp.Component()
+	switch name {
+	case ".":
+		return syserror.EINVAL
+	case "..":
+		return syserror.ENOTEMPTY
+	}
+	victimVFSD := parentDir.vfsd.Child(name)
+	if victimVFSD == nil {
+		return syserror.ENOENT
+	}
+	victim := victimVFSD.Impl().(*dentry)
+	switch {
+	case !victim.inode.isDir():
+		return syserror.ENOTDIR
+	case victimVFSD.HasChildren():
+		return syserror.ENOTEMPTY
+	}
+	mount := rp.Mount()
+	if err := mount.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mount.EndWrite()
+	vfsObj := rp.VirtualFilesystem()
+	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), victimVFSD); err != nil {
+		return err
+	}
+	// Detach the directory and drop the links it accounted for.
+	parentDir.inode.impl.(*directory).childList.Remove(victim)
+	parentDir.inode.decLinksLocked() // from child's ".."
+	victim.inode.decLinksLocked()
+	vfsObj.CommitDeleteDentry(victimVFSD)
+	return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+//
+// Changing attributes is not implemented yet: after resolving rp, an empty
+// mask succeeds as a no-op and any other request fails with EPERM.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	if _, err := resolveLocked(rp); err != nil {
+		return err
+	}
+	if opts.Stat.Mask != 0 {
+		// TODO: implement inode.setStat
+		return syserror.EPERM
+	}
+	return nil
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ d, err := resolveLocked(rp)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ var stat linux.Statx
+ d.inode.statTo(&stat)
+ return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+//
+// statfs is not implemented yet: path resolution errors are reported as-is,
+// and a successfully resolved path yields ENOSYS.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	if _, err := resolveLocked(rp); err != nil {
+		return linux.Statfs{}, err
+	}
+	// TODO: actually implement statfs
+	return linux.Statfs{}, syserror.ENOSYS
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+//
+// It creates a symlink pointing at target in the parent directory that
+// doCreateAt selects for rp.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	createSymlink := func(parent *dentry, name string) error {
+		link := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
+		parent.vfsd.InsertChild(&link.vfsd, name)
+		siblings := &parent.inode.impl.(*directory).childList
+		siblings.PushBack(link)
+		return nil
+	}
+	return fs.doCreateAt(rp, false /* dir */, createSymlink)
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+//
+// It removes the non-directory child named by the final component of rp from
+// its parent directory. Errors produced by the checks in this function:
+//   - EISDIR if the final component is "." or "..", or names a directory.
+//   - ENOENT if the parent has no child with that name.
+//   - ENOTDIR if the path is required to name a directory (rp.MustBeDir())
+//     even though the child is not one.
+//
+// Errors from path resolution, permission checks, the mount's write check,
+// and PrepareDeleteDentry are returned unchanged.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	// Removing an entry requires write and search permission on the parent.
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EISDIR
+	}
+	childVFSD := parent.vfsd.Child(name)
+	if childVFSD == nil {
+		return syserror.ENOENT
+	}
+	child := childVFSD.Impl().(*dentry)
+	if child.inode.isDir() {
+		return syserror.EISDIR
+	}
+	// child is known not to be a directory here, so a path that is required
+	// to name a directory cannot refer to it. (This check was previously
+	// inverted, which made every ordinary unlink fail with ENOTDIR.)
+	if rp.MustBeDir() {
+		return syserror.ENOTDIR
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	vfsObj := rp.VirtualFilesystem()
+	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+		return err
+	}
+	// Detach the child from its parent and drop the parent's link to it.
+	parent.inode.impl.(*directory).childList.Remove(child)
+	child.inode.decLinksLocked()
+	vfsObj.CommitDeleteDentry(childVFSD)
+	return nil
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+//
+// Extended attributes are not supported yet: path resolution errors are
+// reported as-is, and a successfully resolved path yields ENOTSUP.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	if _, err := resolveLocked(rp); err != nil {
+		return nil, err
+	}
+	// TODO(b/127675828): support extended attributes
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+//
+// Extended attributes are not supported yet: path resolution errors are
+// reported as-is, and a successfully resolved path yields ENOTSUP.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	if _, err := resolveLocked(rp); err != nil {
+		return "", err
+	}
+	// TODO(b/127675828): support extended attributes
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+//
+// Extended attributes are not supported yet: path resolution errors are
+// reported as-is, and a successfully resolved path yields ENOTSUP.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	if _, err := resolveLocked(rp); err != nil {
+		return err
+	}
+	// TODO(b/127675828): support extended attributes
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+//
+// Extended attributes are not supported yet: path resolution errors are
+// reported as-is, and a successfully resolved path yields ENOTSUP.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	if _, err := resolveLocked(rp); err != nil {
+		return err
+	}
+	// TODO(b/127675828): support extended attributes
+	return syserror.ENOTSUP
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+//
+// It delegates to vfs.GenericPrependPath while holding fs.mu for reading.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
new file mode 100644
index 000000000..40bde54de
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -0,0 +1,60 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// namedPipe is the inode implementation for named pipes (FIFOs) created via
+// MknodAt with S_IFIFO.
+type namedPipe struct {
+ // inode is the tmpfs inode embedded in this named pipe; newNamedPipe
+ // returns a pointer to it.
+ inode inode
+
+ // pipe is the VFS pipe from which file descriptions for this named pipe
+ // are created.
+ pipe *pipe.VFSPipe
+}
+
+// newNamedPipe creates a named pipe inode with a single link and returns it.
+//
+// Preconditions:
+// * fs.mu must be locked.
+// * rp.Mount().CheckBeginWrite() has been called successfully.
+func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
+	np := new(namedPipe)
+	np.pipe = pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)
+	np.inode.init(np, fs, creds, mode)
+	np.inode.nlink = 1 // Only the parent has a link.
+	return &np.inode
+}
+
+// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented
+// entirely via struct embedding.
+type namedPipeFD struct {
+ // fileDescription supplies the vfsfd that newNamedPipeFD initializes and
+ // returns.
+ fileDescription
+
+ // VFSPipeFD is the pipe endpoint obtained from the named pipe's VFSPipe
+ // when the FD is opened.
+ *pipe.VFSPipeFD
+}
+
+func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+ var err error
+ var fd namedPipeFD
+ fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, rp, vfsd, &fd.vfsfd, flags)
+ if err != nil {
+ return nil, err
+ }
+ mnt := rp.Mount()
+ fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
+ return &fd.vfsfd, nil
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
new file mode 100644
index 000000000..70b42a6ec
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -0,0 +1,235 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+ "bytes"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// fileName is the name of the pipe that setup creates in the root directory
+// and that every test in this file opens.
+const fileName = "mypipe"
+
+// TestSeparateFDs opens the pipe through two separate file descriptions, one
+// read-only and one write-only, and checks that data written to one can be
+// read from the other.
+func TestSeparateFDs(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	// Open the read side. This is done concurrently because opening one end
+	// of the pipe blocks until the other end is opened.
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}
+	// openResult carries both results of the read-side open back to the test
+	// goroutine so that a failure can be detected and reported.
+	type openResult struct {
+		fd  *vfs.FileDescription
+		err error
+	}
+	// The channel is buffered so that the goroutine can always deliver its
+	// result and exit, even if the test has already failed.
+	rfdchan := make(chan openResult, 1)
+	go func() {
+		openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY}
+		rfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+		rfdchan <- openResult{fd: rfd, err: err}
+	}()
+
+	// Open the write side.
+	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY}
+	wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
+	}
+	defer wfd.DecRef()
+
+	// The channel is never closed, so the open's own error (not the
+	// receive's "ok" value) is what signals failure.
+	res := <-rfdchan
+	if res.err != nil {
+		t.Fatalf("failed to open pipe for reading %q: %v", fileName, res.err)
+	}
+	rfd := res.fd
+	defer rfd.DecRef()
+
+	const msg = "vamos azul"
+	checkEmpty(ctx, t, rfd)
+	checkWrite(ctx, t, wfd, msg)
+	checkRead(ctx, t, rfd, msg)
+}
+
+// TestNonblockingRead opens the read end with O_NONBLOCK before any writer
+// exists, then opens the write end and checks that data flows through.
+func TestNonblockingRead(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	pipePath := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}
+
+	// Open the read side as nonblocking.
+	readOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK}
+	rfd, err := vfsObj.OpenAt(ctx, creds, &pipePath, &readOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for reading %q: %v", fileName, err)
+	}
+	defer rfd.DecRef()
+
+	// Open the write side.
+	writeOpts := vfs.OpenOptions{Flags: linux.O_WRONLY}
+	wfd, err := vfsObj.OpenAt(ctx, creds, &pipePath, &writeOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
+	}
+	defer wfd.DecRef()
+
+	const payload = "geh blau"
+	checkEmpty(ctx, t, rfd)
+	checkWrite(ctx, t, wfd, payload)
+	checkRead(ctx, t, rfd, payload)
+}
+
+// TestNonblockingWriteError checks that opening the write end with O_NONBLOCK
+// while no reader is open fails with ENXIO.
+func TestNonblockingWriteError(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	pipePath := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}
+
+	// Open the write side as nonblocking, which should return ENXIO.
+	writeOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK}
+	if _, err := vfsObj.OpenAt(ctx, creds, &pipePath, &writeOpts); err != syserror.ENXIO {
+		t.Fatalf("expected ENXIO, but got error: %v", err)
+	}
+}
+
+// TestSingleFD opens the pipe once with O_RDWR and checks that data written
+// through that file description can be read back from it.
+func TestSingleFD(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	pipePath := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}
+
+	// Open the pipe as readable and writable.
+	rdwrOpts := vfs.OpenOptions{Flags: linux.O_RDWR}
+	pipeFD, err := vfsObj.OpenAt(ctx, creds, &pipePath, &rdwrOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
+	}
+	defer pipeFD.DecRef()
+
+	const payload = "forza blu"
+	checkEmpty(ctx, t, pipeFD)
+	checkWrite(ctx, t, pipeFD, payload)
+	checkRead(ctx, t, pipeFD, payload)
+}
+
+// setup creates a VFS with a pipe in the root directory at path fileName. The
+// returned VirtualDentry must be DecRef()'d by the caller. It calls t.Fatal
+// upon failure.
+func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry) {
+	ctx := contexttest.Context(t)
+	creds := auth.CredentialsFromContext(ctx)
+
+	// Create a VFS whose root mount is a fresh tmpfs.
+	vfsObj := vfs.New()
+	registerOpts := vfs.RegisterFilesystemTypeOptions{AllowUserMount: true}
+	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &registerOpts)
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("failed to create tmpfs root mount: %v", err)
+	}
+	root := mntns.Root()
+
+	// Create the pipe.
+	createPath := vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(fileName),
+	}
+	createOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644}
+	if err := vfsObj.MknodAt(ctx, creds, &createPath, &createOpts); err != nil {
+		t.Fatalf("failed to create file %q: %v", fileName, err)
+	}
+
+	// Sanity check: the pipe exists and has the correct mode.
+	statPath := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}
+	stat, err := vfsObj.StatAt(ctx, creds, &statPath, &vfs.StatOptions{})
+	if err != nil {
+		t.Fatalf("stat(%q) failed: %v", fileName, err)
+	}
+	if perms := stat.Mode &^ linux.S_IFMT; perms != 0644 {
+		t.Errorf("got wrong permissions (%0o)", stat.Mode)
+	}
+	if fileType := stat.Mode & linux.S_IFMT; fileType != linux.ModeNamedPipe {
+		t.Errorf("got wrong file type (%0o)", stat.Mode)
+	}
+
+	return ctx, creds, vfsObj, root
+}
+
+// checkEmpty calls t.Fatal unless a one-byte read from fd transfers no data
+// and fails with ErrWouldBlock, i.e. unless the pipe is empty.
+func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
+	buf := make([]byte, 1)
+	n, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+	if err != syserror.ErrWouldBlock {
+		t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err)
+	}
+	if n != 0 {
+		t.Fatalf("expected to read 0 bytes, but got %d", n)
+	}
+}
+
+// checkWrite calls t.Fatal unless all of msg is written to fd without error.
+func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
+	payload := []byte(msg)
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(payload), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("error writing to pipe %q: %v", fileName, err)
+	}
+	if want := int64(len(payload)); n != want {
+		t.Fatalf("expected to write %d bytes, but wrote %d", len(payload), n)
+	}
+}
+
+// checkRead calls t.Fatal unless reading len(msg) bytes from fd succeeds and
+// yields exactly msg.
+func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
+	buf := make([]byte, len(msg))
+	n, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+	if err != nil {
+		t.Fatalf("error reading from pipe %q: %v", fileName, err)
+	}
+	if want := int64(len(msg)); n != want {
+		t.Fatalf("expected to read %d bytes, but got %d", len(msg), n)
+	}
+	if want := []byte(msg); !bytes.Equal(buf, want) {
+		t.Fatalf("expected to read %q from pipe, but got %q", msg, string(buf))
+	}
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
new file mode 100644
index 000000000..f51e247a7
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -0,0 +1,357 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+ "io"
+ "math"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// regularFile is the inode implementation for regular files. File contents
+// live in pages allocated from memFile and are tracked by data.
+type regularFile struct {
+ // inode is the tmpfs inode embedded in this regular file; newRegularFile
+ // returns a pointer to it.
+ inode inode
+
+ // memFile is a platform.File used to allocate pages to this regularFile.
+ memFile *pgalloc.MemoryFile
+
+ // mu protects the fields below.
+ mu sync.RWMutex
+
+ // data maps offsets into the file to offsets into memFile that store
+ // the file's data.
+ data fsutil.FileRangeSet
+
+ // size is the size of data, but accessed using atomic memory
+ // operations to avoid locking in inode.stat().
+ size uint64
+
+ // seals represents file seals on this inode.
+ seals uint32
+}
+
+// newRegularFile creates an empty regular file inode backed by fs.memFile. The
+// returned inode starts with a single link, from its parent directory.
+func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
+	rf := new(regularFile)
+	rf.memFile = fs.memFile
+	rf.inode.init(rf, fs, creds, mode)
+	rf.inode.nlink = 1 // from parent directory
+	return &rf.inode
+}
+
+// regularFileFD is the file description for an open regular file. It records
+// the access mode chosen at open time and the current file offset.
+type regularFileFD struct {
+ fileDescription
+
+ // These are immutable.
+ readable bool
+ writable bool
+
+ // off is the file offset. off is accessed using atomic memory operations.
+ // offMu serializes operations that may mutate off.
+ //
+ // NOTE(review): the Read, Write, and Seek methods in this file read and
+ // write off directly while holding offMu rather than through sync/atomic;
+ // confirm which of the two is the intended contract.
+ off int64
+ offMu sync.Mutex
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+//
+// A writable FD ends the write on its mount that was begun when the file was
+// opened; a read-only FD has nothing to release.
+func (fd *regularFileFD) Release() {
+	if !fd.writable {
+		return
+	}
+	mount := fd.vfsfd.VirtualDentry().Mount()
+	mount.EndWrite()
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+//
+// It returns EINVAL if the FD is not readable or offset is negative, and
+// reads nothing when dst is empty. Otherwise it copies file data starting at
+// offset into dst and returns the number of bytes copied.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	switch {
+	case !fd.readable, offset < 0:
+		return 0, syserror.EINVAL
+	case dst.NumBytes() == 0:
+		return 0, nil
+	}
+	// Borrow a pooled reader positioned at offset and return it once the
+	// copy has finished.
+	reader := getRegularFileReadWriter(fd.inode().impl.(*regularFile), offset)
+	copied, err := dst.CopyOutFrom(ctx, reader)
+	putRegularFileReadWriter(reader)
+	return int64(copied), err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+//
+// It reads at the FD's current offset and advances the offset by the number
+// of bytes read, all while holding offMu.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.offMu.Lock()
+	start := fd.off
+	nread, err := fd.PRead(ctx, dst, start, opts)
+	fd.off = start + nread
+	fd.offMu.Unlock()
+	return nread, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+//
+// It returns EINVAL if the FD is not writable or offset is negative, writes
+// nothing when src is empty, and returns EFBIG if offset plus the length of
+// src overflows. Otherwise it copies src into the file starting at offset and
+// returns the number of bytes copied.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if !fd.writable || offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	length := src.NumBytes()
+	if length == 0 {
+		return 0, nil
+	}
+	file := fd.inode().impl.(*regularFile)
+	if last := offset + length; last < offset {
+		// Overflow.
+		return 0, syserror.EFBIG
+	}
+	// Borrow a pooled writer positioned at offset and return it once the
+	// copy has finished.
+	writer := getRegularFileReadWriter(file, offset)
+	copied, err := src.CopyInTo(ctx, writer)
+	putRegularFileReadWriter(writer)
+	return copied, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+//
+// It writes at the FD's current offset and advances the offset by the number
+// of bytes written, all while holding offMu.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.offMu.Lock()
+	start := fd.off
+	nwritten, err := fd.PWrite(ctx, src, start, opts)
+	fd.off = start + nwritten
+	fd.offMu.Unlock()
+	return nwritten, err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.offMu.Lock()
+ defer fd.offMu.Unlock()
+ switch whence {
+ case linux.SEEK_SET:
+ // use offset as specified
+ case linux.SEEK_CUR:
+ offset += fd.off
+ case linux.SEEK_END:
+ offset += int64(atomic.LoadUint64(&fd.inode().impl.(*regularFile).size))
+ default:
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ fd.off = offset
+ return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+//
+// It does nothing and always returns nil.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+ return nil
+}
+
+// regularFileReadWriter implements safemem.Reader and safemem.Writer.
+//
+// Instances are obtained with getRegularFileReadWriter and handed back with
+// putRegularFileReadWriter.
+type regularFileReadWriter struct {
+ // file is the regular file being read or written.
+ file *regularFile
+
+ // Offset into the file to read/write at. Note that this may be
+ // different from the FD offset if PRead/PWrite is used.
+ off uint64
+}
+
+// regularFileReadWriterPool recycles regularFileReadWriter values so that a
+// new one need not be allocated for every read or write.
+var regularFileReadWriterPool = sync.Pool{
+ New: func() interface{} {
+ return &regularFileReadWriter{}
+ },
+}
+
+// getRegularFileReadWriter takes a regularFileReadWriter from the pool and
+// points it at the given file and offset. The result should be handed back
+// with putRegularFileReadWriter once the caller is done with it.
+func getRegularFileReadWriter(file *regularFile, offset int64) *regularFileReadWriter {
+	pooled := regularFileReadWriterPool.Get().(*regularFileReadWriter)
+	pooled.file, pooled.off = file, uint64(offset)
+	return pooled
+}
+
+// putRegularFileReadWriter returns rw to the pool. rw must not be used by the
+// caller afterwards.
+func putRegularFileReadWriter(rw *regularFileReadWriter) {
+ // Clear the file pointer so the pooled object does not keep referencing
+ // the file.
+ rw.file = nil
+ regularFileReadWriterPool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+//
+// It copies file data starting at rw.off into dsts, stopping at the end of
+// the file or when dsts is full, and advances rw.off by the number of bytes
+// copied. Ranges of the file with no backing memory read as zeroes. It
+// returns io.EOF if rw.off is at or beyond the end of the file. The file's
+// mu is held for reading throughout.
+func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+ rw.file.mu.RLock()
+
+ // Compute the range to read (limited by file size and overflow-checked).
+ if rw.off >= rw.file.size {
+ rw.file.mu.RUnlock()
+ return 0, io.EOF
+ }
+ end := rw.file.size
+ // rend > rw.off rejects a wrapped-around sum; rend < end keeps the read
+ // within the file.
+ if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
+ end = rend
+ }
+
+ // done counts the bytes copied so far; it is returned even on error.
+ var done uint64
+ seg, gap := rw.file.data.Find(uint64(rw.off))
+ for rw.off < end {
+ mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
+ switch {
+ case seg.Ok():
+ // Get internal mappings.
+ ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
+ if err != nil {
+ rw.file.mu.RUnlock()
+ return done, err
+ }
+
+ // Copy from internal mappings.
+ n, err := safemem.CopySeq(dsts, ims)
+ done += n
+ rw.off += uint64(n)
+ dsts = dsts.DropFirst64(n)
+ if err != nil {
+ rw.file.mu.RUnlock()
+ return done, err
+ }
+
+ // Continue.
+ seg, gap = seg.NextNonEmpty()
+
+ case gap.Ok():
+ // Tmpfs holes are zero-filled.
+ gapmr := gap.Range().Intersect(mr)
+ dst := dsts.TakeFirst64(gapmr.Length())
+ n, err := safemem.ZeroSeq(dst)
+ done += n
+ rw.off += uint64(n)
+ dsts = dsts.DropFirst64(n)
+ if err != nil {
+ rw.file.mu.RUnlock()
+ return done, err
+ }
+
+ // Continue.
+ seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+ }
+ }
+ rw.file.mu.RUnlock()
+ return done, nil
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+//
+// It copies srcs into the file starting at rw.off, allocating memory from
+// memFile for ranges that have none, and advances rw.off by the number of
+// bytes copied. If the write ends beyond the previous file size, the size is
+// updated. F_SEAL_WRITE causes EPERM; F_SEAL_GROW truncates a growing write
+// to the start of the page containing EOF, or causes EPERM if nothing could
+// be written. The file's mu is held for writing throughout.
+func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+ rw.file.mu.Lock()
+
+ // Compute the range to write (overflow-checked).
+ end := rw.off + srcs.NumBytes()
+ if end <= rw.off {
+ end = math.MaxInt64
+ }
+
+ // Check if seals prevent either file growth or all writes.
+ switch {
+ case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
+ rw.file.mu.Unlock()
+ return 0, syserror.EPERM
+ case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
+ // When growth is sealed, Linux effectively allows writes which would
+ // normally grow the file to partially succeed up to the current EOF,
+ // rounded down to the page boundary before the EOF.
+ //
+ // This happens because writes (and thus the growth check) for tmpfs
+ // files proceed page-by-page on Linux, and the final write to the page
+ // containing EOF fails, resulting in a partial write up to the start of
+ // that page.
+ //
+ // To emulate this behaviour, artificially truncate the write to the
+ // start of the page containing the current EOF.
+ //
+ // See Linux, mm/filemap.c:generic_perform_write() and
+ // mm/shmem.c:shmem_write_begin().
+ if pgstart := uint64(usermem.Addr(rw.file.size).RoundDown()); end > pgstart {
+ end = pgstart
+ }
+ if end <= rw.off {
+ // Truncation would result in no data being written.
+ rw.file.mu.Unlock()
+ return 0, syserror.EPERM
+ }
+ }
+
+ // Page-aligned mr for when we need to allocate memory. RoundUp can't
+ // overflow since end is an int64.
+ pgstartaddr := usermem.Addr(rw.off).RoundDown()
+ pgendaddr, _ := usermem.Addr(end).RoundUp()
+ pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}
+
+ // done counts the bytes copied so far; retErr records the error, if any,
+ // that ended the loop early.
+ var (
+ done uint64
+ retErr error
+ )
+ seg, gap := rw.file.data.Find(uint64(rw.off))
+ for rw.off < end {
+ mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
+ switch {
+ case seg.Ok():
+ // Get internal mappings.
+ ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write)
+ if err != nil {
+ retErr = err
+ goto exitLoop
+ }
+
+ // Copy to internal mappings.
+ n, err := safemem.CopySeq(ims, srcs)
+ done += n
+ rw.off += uint64(n)
+ srcs = srcs.DropFirst64(n)
+ if err != nil {
+ retErr = err
+ goto exitLoop
+ }
+
+ // Continue.
+ seg, gap = seg.NextNonEmpty()
+
+ case gap.Ok():
+ // Allocate memory for the write.
+ gapMR := gap.Range().Intersect(pgMR)
+ fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs)
+ if err != nil {
+ retErr = err
+ goto exitLoop
+ }
+
+ // Write to that memory as usual.
+ // No data is copied in this iteration: the new segment is picked up
+ // by the seg.Ok() case on the next pass through the loop.
+ seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
+ }
+ }
+exitLoop:
+ // If the write ends beyond the file's previous size, it causes the
+ // file to grow.
+ if rw.off > rw.file.size {
+ atomic.StoreUint64(&rw.file.size, rw.off)
+ }
+
+ rw.file.mu.Unlock()
+ return done, retErr
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
new file mode 100644
index 000000000..3731c5b6f
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -0,0 +1,224 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// newFileFD creates a new file named filename in a new tmpfs mount, and
+// returns a FileDescription for it opened with O_RDWR.
+//
+// If the returned err is nil, the returned cleanup function should be called
+// when the FD is no longer needed; it calls DecRef on the mount namespace's
+// root and on the mount namespace itself. If err is non-nil, that cleanup has
+// already been performed (where applicable) and both the FD and the cleanup
+// function are nil.
+func newFileFD(ctx context.Context, filename string) (*vfs.FileDescription, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+
+	vfsObj := vfs.New()
+	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
+	}
+	root := mntns.Root()
+
+	// cleanup is shared by the failure path below and by the caller on
+	// success, so the two release sequences cannot drift apart.
+	cleanup := func() {
+		root.DecRef()
+		mntns.DecRef(vfsObj)
+	}
+
+	// Create the file that will be written and read.
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(filename),
+		FollowFinalSymlink: true,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+		Mode:  0644,
+	})
+	if err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err)
+	}
+
+	return fd, cleanup, nil
+}
+
+// TestSimpleWriteRead tests that we can write some data to a file and read
+// it back, and that the FD offset is updated by Write, Seek and Read.
+func TestSimpleWriteRead(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, "simpleReadWrite")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Write.
+	data := []byte("foobarbaz")
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+	}
+	if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want {
+		t.Errorf("fd.Write left offset at %d, want %d", got, want)
+	}
+
+	// Seek back to beginning.
+	if _, err := fd.Seek(ctx, 0, linux.SEEK_SET); err != nil {
+		t.Fatalf("fd.Seek failed: %v", err)
+	}
+	if got, want := fd.Impl().(*regularFileFD).off, int64(0); got != want {
+		t.Errorf("fd.Seek(0) left offset at %d, want %d", got, want)
+	}
+
+	// Read. io.EOF is tolerated here because the read extends to the end of
+	// the file.
+	buf := make([]byte, len(data))
+	n, err = fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+	if err != nil && err != io.EOF {
+		t.Fatalf("fd.Read failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Read got short read length %d, want %d", n, len(data))
+	}
+	if got, want := string(buf), string(data); got != want {
+		t.Errorf("Read got %q want %q", got, want)
+	}
+	// This check follows the Read, so the failure message must name Read
+	// rather than Write.
+	if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want {
+		t.Errorf("fd.Read left offset at %d, want %d", got, want)
+	}
+}
+
+// TestPWrite fills a file with known contents, then overwrites a short
+// buffer at a series of offsets (including at and past the end of the file)
+// and checks after each PWrite that the whole file matches the expected
+// contents.
+func TestPWrite(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, "PWrite")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Fill file with 1k 'a's.
+	data := bytes.Repeat([]byte{'a'}, 1000)
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+	}
+
+	// Write "gVisor is awesome" at various offsets.
+	buf := []byte("gVisor is awesome")
+	offsets := []int{0, 1, 2, 10, 20, 50, 100, len(data) - 100, len(data) - 1, len(data), len(data) + 1}
+	for _, offset := range offsets {
+		name := fmt.Sprintf("PWrite offset=%d", offset)
+		t.Run(name, func(t *testing.T) {
+			n, err := fd.PWrite(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.WriteOptions{})
+			if err != nil {
+				t.Errorf("fd.PWrite got err %v want nil", err)
+			}
+			if n != int64(len(buf)) {
+				t.Errorf("fd.PWrite got %d bytes want %d", n, len(buf))
+			}
+
+			// Update data to reflect expected file contents. Writes that
+			// extend past the current end grow the expectation with zero
+			// bytes first, since subtests run in order and share data.
+			if len(data) < offset+len(buf) {
+				data = append(data, make([]byte, (offset+len(buf))-len(data))...)
+			}
+			copy(data[offset:], buf)
+
+			// Read the whole file and compare with data.
+			readBuf := make([]byte, len(data))
+			n, err = fd.PRead(ctx, usermem.BytesIOSequence(readBuf), 0, vfs.ReadOptions{})
+			if err != nil {
+				t.Fatalf("fd.PRead failed: %v", err)
+			}
+			if n != int64(len(data)) {
+				t.Errorf("fd.PRead got short read length %d, want %d", n, len(data))
+			}
+			if got, want := string(readBuf), string(data); got != want {
+				t.Errorf("PRead got %q want %q", got, want)
+			}
+		})
+	}
+}
+
+// TestPRead writes a known pattern to a file and then issues PReads of
+// various sizes at various offsets (including at and past the end of the
+// file), checking the returned error, byte count and contents of each.
+func TestPRead(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, "PRead")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Fill the file with 100 repetitions of a short phrase.
+	data := bytes.Repeat([]byte("gVisor is awsome"), 100)
+	written, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if written != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", written, len(data))
+	}
+
+	// Read various sizes from various offsets.
+	var (
+		sizes   = []int{0, 1, 2, 10, 20, 50, 100, 1000}
+		offsets = []int{0, 1, 2, 10, 20, 50, 100, 1000, len(data) - 100, len(data) - 1, len(data), len(data) + 1}
+	)
+
+	for _, size := range sizes {
+		for _, offset := range offsets {
+			t.Run(fmt.Sprintf("PRead offset=%d size=%d", offset, size), func(t *testing.T) {
+				// Work out what the read should produce: everything from
+				// offset onward, capped at size; or io.EOF when a non-empty
+				// read starts at or beyond the end of the file.
+				var wantRead []byte
+				var wantErr error
+				switch {
+				case offset < len(data):
+					wantRead = data[offset:]
+				case size > 0:
+					wantErr = io.EOF
+				}
+				if offset+size < len(data) {
+					wantRead = wantRead[:size]
+				}
+
+				buf := make([]byte, size)
+				n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.ReadOptions{})
+				if err != wantErr {
+					t.Errorf("fd.PRead got err %v want %v", err, wantErr)
+				}
+				if n != int64(len(wantRead)) {
+					t.Errorf("fd.PRead got %d bytes want %d", n, len(wantRead))
+				}
+				if got := string(buf[:n]); got != string(wantRead) {
+					t.Errorf("fd.PRead got %q want %q", got, string(wantRead))
+				}
+			})
+		}
+	}
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
new file mode 100644
index 000000000..5246aca84
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -0,0 +1,36 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// symlink is the inode implementation for symbolic links: an inode together
+// with the path the link points to.
+type symlink struct {
+ // inode is the inode for this symlink; newSymlink returns a pointer to
+ // this field.
+ inode inode
+ target string // immutable
+}
+
+// newSymlink creates a symlink pointing at target and returns its inode. The
+// inode is initialized with mode 0777 and creds' effective IDs, and starts
+// with a link count of 1.
+func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
+	sl := new(symlink)
+	sl.target = target
+
+	ino := &sl.inode
+	ino.init(sl, fs, creds, 0777)
+	ino.nlink = 1 // from parent directory
+	return ino
+}
+
+// O_PATH is unimplemented, so there's no way to get a FileDescription
+// representing a symlink yet.
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
new file mode 100644
index 000000000..7be6faa5b
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -0,0 +1,299 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package tmpfs provides a filesystem implementation that behaves like tmpfs:
+// the Dentry tree is the sole source of truth for the state of the filesystem.
+//
+// Lock order:
+//
+// filesystem.mu
+// regularFileFD.offMu
+// regularFile.mu
+// inode.mu
+package tmpfs
+
+import (
+ "fmt"
+ "math"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FilesystemType implements vfs.FilesystemType.
+//
+// It carries no state of its own; GetFilesystem constructs a new filesystem
+// value on every call.
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+ // vfsfs is the vfs.Filesystem for this filesystem; GetFilesystem
+ // initializes it with this filesystem as its implementation.
+ vfsfs vfs.Filesystem
+
+ // memFile is used to allocate pages for regular files.
+ memFile *pgalloc.MemoryFile
+
+ // mu serializes changes to the Dentry tree.
+ mu sync.RWMutex
+
+ // nextInoMinusOne is atomically incremented by inode.init, which uses
+ // the incremented value as the new inode's number.
+ nextInoMinusOne uint64 // accessed using atomic memory operations
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+//
+// It creates a new filesystem that uses the MemoryFile obtained from ctx,
+// together with a root directory of mode 01777 owned according to creds, and
+// returns the filesystem and the root's dentry. It panics if ctx does not
+// provide a MemoryFileProvider. The returned error is always nil.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	provider := pgalloc.MemoryFileProviderFromContext(ctx)
+	if provider == nil {
+		panic("MemoryFileProviderFromContext returned nil")
+	}
+
+	fs := &filesystem{memFile: provider.MemoryFile()}
+	fs.vfsfs.Init(vfsObj, fs)
+
+	rootInode := fs.newDirectory(creds, 01777)
+	rootDentry := fs.newDentry(rootInode)
+	return &fs.vfsfs, &rootDentry.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+//
+// It is currently a no-op.
+func (fs *filesystem) Release() {
+}
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+ // vfsd is the vfs.Dentry for this dentry; newDentry initializes it with
+ // this dentry as its implementation.
+ vfsd vfs.Dentry
+
+ // inode is the inode represented by this dentry. Multiple Dentries may
+ // share a single non-directory inode (with hard links). inode is
+ // immutable.
+ inode *inode
+
+ // tmpfs doesn't count references on dentries; because the dentry tree is
+ // the sole source of truth, it is by definition always consistent with the
+ // state of the filesystem. However, it does count references on inodes,
+ // because inode resources are released when all references are dropped
+ // (see inode.decRef, which drops a regular file's data at that point).
+ // IncRef, TryIncRef and DecRef on a dentry therefore forward to the inode.
+
+ // dentryEntry (ugh) links dentries into their parent directory.childList.
+ dentryEntry
+}
+
+// newDentry returns a new dentry representing inode, with its vfs.Dentry
+// initialized to use the new dentry as its implementation. It does not take
+// a reference on inode and does not link the dentry into any directory.
+func (fs *filesystem) newDentry(inode *inode) *dentry {
+	d := new(dentry)
+	d.inode = inode
+	d.vfsd.Init(d)
+	return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+//
+// The reference is taken on d's inode, not on the dentry itself.
+func (d *dentry) IncRef() {
+ d.inode.incRef()
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+//
+// It reports whether a reference could be taken on d's inode; this fails
+// once the inode's reference count has reached zero.
+func (d *dentry) TryIncRef() bool {
+ return d.inode.tryIncRef()
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+//
+// The reference is dropped from d's inode, not from the dentry itself.
+func (d *dentry) DecRef() {
+ d.inode.decRef()
+}
+
+// inode represents a filesystem object. The kind of object (regular file,
+// directory, symlink, named pipe) is determined by the dynamic type of impl.
+type inode struct {
+ // refs is a reference count. refs is accessed using atomic memory
+ // operations.
+ //
+ // A reference is held on all inodes that are reachable in the filesystem
+ // tree. For non-directories (which may have multiple hard links), this
+ // means that a reference is dropped when nlink reaches 0. For directories,
+ // nlink never reaches 0 due to the "." entry; instead,
+ // filesystem.RmdirAt() drops the reference.
+ refs int64
+
+ // Inode metadata; protected by mu and accessed using atomic memory
+ // operations unless otherwise specified.
+ mu sync.RWMutex
+ mode uint32 // excluding file type bits, which are based on impl
+ nlink uint32 // protected by filesystem.mu instead of inode.mu
+ uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+ gid uint32 // auth.KGID, but stored as raw uint32 for sync/atomic
+ ino uint64 // immutable
+
+ // impl is the type-specific part of the inode (e.g. *regularFile,
+ // *directory, *symlink or *namedPipe); statTo switches on its type.
+ impl interface{} // immutable
+}
+
+// maxLinks is the largest value inode.nlink may hold; incLinksLocked panics
+// rather than increment a link count that has already reached it.
+const maxLinks = math.MaxUint32
+
+// init initializes a newly created inode: it sets the reference count to 1,
+// records mode and the effective UID/GID from creds, assigns the next inode
+// number from fs, and stores impl as the type-specific implementation.
+//
+// init does not set i.nlink; the caller must do so.
+func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
+	// Identity.
+	i.impl = impl
+	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
+
+	// Ownership and permissions.
+	i.mode = uint32(mode)
+	i.uid = uint32(creds.EffectiveKUID)
+	i.gid = uint32(creds.EffectiveKGID)
+
+	// The creator holds the initial reference.
+	i.refs = 1
+}
+
+// incLinksLocked increments i's link count.
+//
+// It panics if either precondition on i.nlink is violated. The panic
+// messages name the tmpfs package, consistent with the other inode methods.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+// i.nlink < maxLinks.
+func (i *inode) incLinksLocked() {
+	switch i.nlink {
+	case 0:
+		panic("tmpfs.inode.incLinksLocked() called with no existing links")
+	case maxLinks:
+		panic("tmpfs.inode.incLinksLocked() called with maximum link count")
+	}
+	// The store is atomic so that readers that do not hold filesystem.mu
+	// (e.g. statTo) can load nlink atomically.
+	atomic.AddUint32(&i.nlink, 1)
+}
+
+// decLinksLocked decrements i's link count.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+func (i *inode) decLinksLocked() {
+ if i.nlink == 0 {
+ panic("tmpfs.inode.decLinksLocked() called with no existing links")
+ }
+ atomic.AddUint32(&i.nlink, ^uint32(0))
+}
+
+// incRef takes an additional reference on i. The caller must already hold a
+// reference; incRef panics if the count was not positive beforehand.
+func (i *inode) incRef() {
+	updated := atomic.AddInt64(&i.refs, 1)
+	if updated <= 1 {
+		panic("tmpfs.inode.incRef() called without holding a reference")
+	}
+}
+
+// tryIncRef attempts to take a reference on i without requiring the caller
+// to already hold one. It returns false if the reference count is zero, and
+// true once the count has been successfully incremented.
+func (i *inode) tryIncRef() bool {
+	// Reload and retry whenever another goroutine changes the count between
+	// the load and the compare-and-swap.
+	for cur := atomic.LoadInt64(&i.refs); cur != 0; cur = atomic.LoadInt64(&i.refs) {
+		if atomic.CompareAndSwapInt64(&i.refs, cur, cur+1) {
+			return true
+		}
+	}
+	return false
+}
+
+// decRef drops a reference on i. When the last reference is dropped and i is
+// a regular file, the file's data is dropped from its memFile and its size
+// is reset to zero. decRef panics if the count goes negative.
+func (i *inode) decRef() {
+	remaining := atomic.AddInt64(&i.refs, -1)
+	switch {
+	case remaining < 0:
+		panic("tmpfs.inode.decRef() called without holding a reference")
+	case remaining > 0:
+		return
+	}
+
+	// Last reference dropped. Only regular files have anything to tear down
+	// here.
+	regFile, ok := i.impl.(*regularFile)
+	if !ok {
+		return
+	}
+	regFile.mu.Lock()
+	regFile.data.DropAll(regFile.memFile)
+	atomic.StoreUint64(&regFile.size, 0)
+	regFile.mu.Unlock()
+}
+
+// checkPermissions reports whether creds may access i in the ways described
+// by ats, by passing i's current mode, UID and GID (each loaded atomically)
+// to vfs.GenericCheckPermissions.
+func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
+	mode := uint16(atomic.LoadUint32(&i.mode))
+	kuid := auth.KUID(atomic.LoadUint32(&i.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&i.gid))
+	return vfs.GenericCheckPermissions(creds, ats, isDir, mode, kuid, kgid)
+}
+
+// statTo fills stat with i's metadata: link count, ownership, mode
+// (including the file type bits implied by i.impl) and inode number. For
+// regular files and symlinks it also reports size and block count. It panics
+// if i.impl is of an unknown type.
+//
+// stat is an output parameter rather than a return value because Go won't
+// inline this function, and returning linux.Statx (which is quite big) would
+// mean spending a lot of time in runtime.duffcopy().
+func (i *inode) statTo(stat *linux.Statx) {
+	// Fields common to every inode type.
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+	stat.Blksize = 1 // usermem.PageSize in tmpfs
+	stat.Nlink = atomic.LoadUint32(&i.nlink)
+	stat.UID = atomic.LoadUint32(&i.uid)
+	stat.GID = atomic.LoadUint32(&i.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
+	stat.Ino = i.ino
+	// TODO: device number
+
+	// Type-specific fields.
+	switch impl := i.impl.(type) {
+	case *regularFile:
+		size := atomic.LoadUint64(&impl.size)
+		stat.Mode |= linux.S_IFREG
+		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
+		stat.Size = size
+		// The block count is derived from the file size rather than from the
+		// amount of memory actually allocated to the file.
+		stat.Blocks = allocatedBlocksForSize(size)
+	case *directory:
+		stat.Mode |= linux.S_IFDIR
+	case *symlink:
+		size := uint64(len(impl.target))
+		stat.Mode |= linux.S_IFLNK
+		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
+		stat.Size = size
+		stat.Blocks = allocatedBlocksForSize(size)
+	case *namedPipe:
+		stat.Mode |= linux.S_IFIFO
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+}
+
+// allocatedBlocksForSize returns the number of 512B blocks needed to
+// accommodate the given size in bytes, as appropriate for struct
+// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
+// size is independent of the "preferred block size for I/O", struct
+// stat::st_blksize and struct statx::stx_blksize.)
+func allocatedBlocksForSize(size uint64) uint64 {
+ return (size + 511) / 512
+}
+
+// direntType returns the linux.DT_* directory entry type corresponding to
+// i's implementation type. It covers the same inode types as statTo, so
+// named pipes report DT_FIFO (matching the S_IFIFO file type that statTo
+// reports for them) instead of triggering the panic reserved for unknown
+// types.
+func (i *inode) direntType() uint8 {
+	switch i.impl.(type) {
+	case *regularFile:
+		return linux.DT_REG
+	case *directory:
+		return linux.DT_DIR
+	case *symlink:
+		return linux.DT_LNK
+	case *namedPipe:
+		return linux.DT_FIFO
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+}
+
+// fileDescription is embedded by tmpfs implementations of
+// vfs.FileDescriptionImpl. It provides the filesystem and inode accessors
+// and the Stat and SetStat methods.
+type fileDescription struct {
+ // vfsfd is the vfs.FileDescription for this file description; the
+ // accessors below reach the mount and dentry through it.
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+}
+
+// filesystem returns the tmpfs filesystem that fd's mount belongs to. The
+// type assertion panics if the mount's filesystem implementation is not a
+// *filesystem.
+func (fd *fileDescription) filesystem() *filesystem {
+	mount := fd.vfsfd.Mount()
+	impl := mount.Filesystem().Impl()
+	return impl.(*filesystem)
+}
+
+// inode returns the inode represented by fd's dentry. The type assertion
+// panics if the dentry implementation is not a *dentry.
+func (fd *fileDescription) inode() *inode {
+	d := fd.vfsfd.Dentry().Impl().(*dentry)
+	return d.inode
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+//
+// It reports the metadata of fd's inode as filled in by inode.statTo; opts
+// is not consulted and the returned error is always nil.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	ino := fd.inode()
+
+	var result linux.Statx
+	ino.statTo(&result)
+	return result, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+//
+// Changing metadata is not implemented yet: a request that sets no fields
+// (an empty mask) trivially succeeds, and any other request fails with
+// EPERM.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask != 0 {
+		// TODO: implement inode.setStat
+		return syserror.EPERM
+	}
+	return nil
+}