path: root/pkg/sentry/fsimpl
Diffstat (limited to 'pkg/sentry/fsimpl')
-rw-r--r--  pkg/sentry/fsimpl/cgroupfs/BUILD  2
-rw-r--r--  pkg/sentry/fsimpl/cgroupfs/base.go  9
-rw-r--r--  pkg/sentry/fsimpl/cgroupfs/cgroupfs.go  181
-rw-r--r--  pkg/sentry/fsimpl/cgroupfs/memory.go  30
-rw-r--r--  pkg/sentry/fsimpl/devpts/BUILD  1
-rw-r--r--  pkg/sentry/fsimpl/devpts/devpts.go  7
-rw-r--r--  pkg/sentry/fsimpl/devpts/line_discipline.go  10
-rw-r--r--  pkg/sentry/fsimpl/devpts/queue.go  6
-rw-r--r--  pkg/sentry/fsimpl/eventfd/BUILD  2
-rw-r--r--  pkg/sentry/fsimpl/eventfd/eventfd.go  10
-rw-r--r--  pkg/sentry/fsimpl/ext/BUILD  0
-rw-r--r--  pkg/sentry/fsimpl/fuse/BUILD  2
-rw-r--r--  pkg/sentry/fsimpl/fuse/dev.go  15
-rw-r--r--  pkg/sentry/fsimpl/fuse/dev_test.go  4
-rw-r--r--  pkg/sentry/fsimpl/fuse/directory.go  12
-rw-r--r--  pkg/sentry/fsimpl/fuse/fusefs.go  3
-rw-r--r--  pkg/sentry/fsimpl/fuse/read_write.go  3
-rw-r--r--  pkg/sentry/fsimpl/fuse/regular_file.go  7
-rw-r--r--  pkg/sentry/fsimpl/gofer/BUILD  4
-rw-r--r--  pkg/sentry/fsimpl/gofer/directory.go  101
-rw-r--r--  pkg/sentry/fsimpl/gofer/filesystem.go  519
-rw-r--r--  pkg/sentry/fsimpl/gofer/gofer.go  722
-rw-r--r--  pkg/sentry/fsimpl/gofer/gofer_test.go  1
-rw-r--r--  pkg/sentry/fsimpl/gofer/handle.go  121
-rw-r--r--  pkg/sentry/fsimpl/gofer/host_named_pipe.go  3
-rw-r--r--  pkg/sentry/fsimpl/gofer/p9file.go  12
-rw-r--r--  pkg/sentry/fsimpl/gofer/regular_file.go  26
-rw-r--r--  pkg/sentry/fsimpl/gofer/revalidate.go  50
-rw-r--r--  pkg/sentry/fsimpl/gofer/save_restore.go  147
-rw-r--r--  pkg/sentry/fsimpl/gofer/socket.go  45
-rw-r--r--  pkg/sentry/fsimpl/gofer/special_file.go  154
-rw-r--r--  pkg/sentry/fsimpl/gofer/symlink.go  8
-rw-r--r--  pkg/sentry/fsimpl/gofer/time.go  5
-rw-r--r--  pkg/sentry/fsimpl/host/BUILD  1
-rw-r--r--  pkg/sentry/fsimpl/host/host.go  5
-rw-r--r--  pkg/sentry/fsimpl/host/tty.go  7
-rw-r--r--  pkg/sentry/fsimpl/host/util.go  2
-rw-r--r--  pkg/sentry/fsimpl/kernfs/BUILD  3
-rw-r--r--  pkg/sentry/fsimpl/kernfs/fd_impl_util.go  3
-rw-r--r--  pkg/sentry/fsimpl/kernfs/filesystem.go  19
-rw-r--r--  pkg/sentry/fsimpl/kernfs/inode_impl_util.go  38
-rw-r--r--  pkg/sentry/fsimpl/kernfs/kernfs.go  67
-rw-r--r--  pkg/sentry/fsimpl/kernfs/kernfs_test.go  61
-rw-r--r--  pkg/sentry/fsimpl/overlay/BUILD  1
-rw-r--r--  pkg/sentry/fsimpl/overlay/copy_up.go  12
-rw-r--r--  pkg/sentry/fsimpl/overlay/filesystem.go  53
-rw-r--r--  pkg/sentry/fsimpl/proc/BUILD  1
-rw-r--r--  pkg/sentry/fsimpl/proc/subtasks.go  17
-rw-r--r--  pkg/sentry/fsimpl/proc/task_fds.go  16
-rw-r--r--  pkg/sentry/fsimpl/proc/task_files.go  64
-rw-r--r--  pkg/sentry/fsimpl/proc/tasks.go  8
-rw-r--r--  pkg/sentry/fsimpl/proc/tasks_files.go  5
-rw-r--r--  pkg/sentry/fsimpl/proc/tasks_sys.go  9
-rw-r--r--  pkg/sentry/fsimpl/signalfd/BUILD  2
-rw-r--r--  pkg/sentry/fsimpl/signalfd/signalfd.go  4
-rw-r--r--  pkg/sentry/fsimpl/sys/BUILD  1
-rw-r--r--  pkg/sentry/fsimpl/sys/sys.go  14
-rw-r--r--  pkg/sentry/fsimpl/sys/sys_test.go  14
-rw-r--r--  pkg/sentry/fsimpl/timerfd/BUILD  1
-rw-r--r--  pkg/sentry/fsimpl/timerfd/timerfd.go  3
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/BUILD  2
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/filesystem.go  31
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/pipe_test.go  3
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/regular_file_test.go  6
-rw-r--r--  pkg/sentry/fsimpl/tmpfs/tmpfs.go  3
-rw-r--r--  pkg/sentry/fsimpl/verity/BUILD  15
-rw-r--r--  pkg/sentry/fsimpl/verity/filesystem.go  69
-rw-r--r--  pkg/sentry/fsimpl/verity/verity.go  284
68 files changed, 2337 insertions, 729 deletions
diff --git a/pkg/sentry/fsimpl/cgroupfs/BUILD b/pkg/sentry/fsimpl/cgroupfs/BUILD
index 4c9c5b344..e5fdcc776 100644
--- a/pkg/sentry/fsimpl/cgroupfs/BUILD
+++ b/pkg/sentry/fsimpl/cgroupfs/BUILD
@@ -32,6 +32,7 @@ go_library(
"//pkg/context",
"//pkg/coverage",
"//pkg/errors/linuxerr",
+ "//pkg/fspath",
"//pkg/log",
"//pkg/refs",
"//pkg/refsvfs2",
@@ -43,7 +44,6 @@ go_library(
"//pkg/sentry/usage",
"//pkg/sentry/vfs",
"//pkg/sync",
- "//pkg/syserror",
"//pkg/usermem",
],
)
diff --git a/pkg/sentry/fsimpl/cgroupfs/base.go b/pkg/sentry/fsimpl/cgroupfs/base.go
index 4290ffe0d..71bb0a9c8 100644
--- a/pkg/sentry/fsimpl/cgroupfs/base.go
+++ b/pkg/sentry/fsimpl/cgroupfs/base.go
@@ -88,7 +88,6 @@ type controller interface {
// +stateify savable
type cgroupInode struct {
dir
- fs *filesystem
// ts is the list of tasks in this cgroup. The kernel is responsible for
// removing tasks from this list before they're destroyed, so any tasks on
@@ -102,9 +101,10 @@ var _ kernel.CgroupImpl = (*cgroupInode)(nil)
func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
c := &cgroupInode{
- fs: fs,
- ts: make(map[*kernel.Task]struct{}),
+ dir: dir{fs: fs},
+ ts: make(map[*kernel.Task]struct{}),
}
+ c.dir.cgi = c
contents := make(map[string]kernfs.Inode)
contents["cgroup.procs"] = fs.newControllerFile(ctx, creds, &cgroupProcsData{c})
@@ -115,8 +115,7 @@ func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credential
}
c.dir.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555))
- c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- c.dir.InitRefs()
+ c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
c.dir.IncLinks(c.dir.OrderedChildren.Populate(contents))
atomic.AddUint64(&fs.numCgroups, 1)
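Editor's note: the base.go hunk above ties the embedded dir back to its containing cgroupInode (c.dir.cgi = c) and stores the filesystem pointer on dir itself, so the directory operations added later in this change can reach cgroup state without a separate fs field on cgroupInode. Below is a minimal, self-contained Go sketch of that embed-plus-back-pointer pattern; the names are illustrative only and not gvisor code.

package main

import "fmt"

type fsState struct{ numCgroups uint64 }

type dirNode struct {
	fs  *fsState    // immutable after construction
	cgi *cgroupNode // back-pointer to the cgroup inode that embeds this dirNode
}

type cgroupNode struct {
	dirNode
	tasks map[int]struct{} // tasks currently in this cgroup
}

func newCgroupNode(fs *fsState) *cgroupNode {
	c := &cgroupNode{
		dirNode: dirNode{fs: fs},
		tasks:   make(map[int]struct{}),
	}
	c.dirNode.cgi = c // complete the cycle, mirroring c.dir.cgi = c above
	fs.numCgroups++
	return c
}

func main() {
	fs := &fsState{}
	c := newCgroupNode(fs)
	fmt.Println(c.cgi == c, fs.numCgroups) // true 1
}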
diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
index 22c8b7fda..edc3b50b9 100644
--- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
+++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
@@ -32,7 +32,8 @@
// controllers associated with them.
//
// Since cgroupfs doesn't allow hardlinks, there is a unique mapping between
-// cgroupfs dentries and inodes.
+// cgroupfs dentries and inodes. Thus, cgroupfs inodes don't need to be ref
+// counted and exist until they're unlinked once or the FS is destroyed.
//
// # Synchronization
//
@@ -48,10 +49,11 @@
// Lock order:
//
// kernel.CgroupRegistry.mu
-// cgroupfs.filesystem.mu
-// kernel.TaskSet.mu
-// kernel.Task.mu
-// cgroupfs.filesystem.tasksMu.
+// kernfs.filesystem.mu
+// kernel.TaskSet.mu
+// kernel.Task.mu
+// cgroupfs.filesystem.tasksMu.
+// cgroupfs.dir.OrderedChildren.mu
package cgroupfs
import (
@@ -63,6 +65,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
+ "gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -108,6 +111,7 @@ type FilesystemType struct{}
// +stateify savable
type InternalData struct {
DefaultControlValues map[string]int64
+ InitialCgroupPath string
}
// filesystem implements vfs.FilesystemImpl and kernel.cgroupFS.
@@ -134,6 +138,11 @@ type filesystem struct {
numCgroups uint64 // Protected by atomic ops.
root *kernfs.Dentry
+ // effectiveRoot is the initial cgroup new tasks are created in. Unless
+ // overwritten by internal mount options, root == effectiveRoot. If
+ // effectiveRoot != root, an extra reference is held on effectiveRoot for
+ // the lifetime of the filesystem.
+ effectiveRoot *kernfs.Dentry
// tasksMu serializes task membership changes across all cgroups within a
// filesystem.
@@ -229,6 +238,9 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
fs := vfsfs.Impl().(*filesystem)
ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID)
fs.root.IncRef()
+ if fs.effectiveRoot != fs.root {
+ fs.effectiveRoot.IncRef()
+ }
return vfsfs, fs.root.VFSDentry(), nil
}
@@ -245,8 +257,8 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
var defaults map[string]int64
if opts.InternalData != nil {
- ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults)
defaults = opts.InternalData.(*InternalData).DefaultControlValues
+ ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults)
}
for _, ty := range wantControllers {
@@ -286,6 +298,14 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
var rootD kernfs.Dentry
rootD.InitRoot(&fs.Filesystem, root)
fs.root = &rootD
+ fs.effectiveRoot = fs.root
+
+ if err := fs.prepareInitialCgroup(ctx, vfsObj, opts); err != nil {
+ ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: failed to prepare initial cgroup: %v", err)
+ rootD.DecRef(ctx)
+ fs.VFSFilesystem().DecRef(ctx)
+ return nil, nil, err
+ }
// Register controllers. The registry may be modified concurrently, so if we
// get an error, we raced with someone else who registered the same
@@ -303,10 +323,47 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
return fs.VFSFilesystem(), rootD.VFSDentry(), nil
}
+// prepareInitialCgroup creates the initial cgroup according to opts. An initial
+// cgroup is optional, and if not specified, this function is a no-op.
+func (fs *filesystem) prepareInitialCgroup(ctx context.Context, vfsObj *vfs.VirtualFilesystem, opts vfs.GetFilesystemOptions) error {
+ if opts.InternalData == nil {
+ return nil
+ }
+ initPathStr := opts.InternalData.(*InternalData).InitialCgroupPath
+ if initPathStr == "" {
+ return nil
+ }
+ ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path: %v", initPathStr)
+ initPath := fspath.Parse(initPathStr)
+ if !initPath.Absolute || !initPath.HasComponents() {
+ ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path invalid: %+v", initPath)
+ return linuxerr.EINVAL
+ }
+
+ // Have initial cgroup target, create the tree.
+ cgDir := fs.root.Inode().(*cgroupInode)
+ for pit := initPath.Begin; pit.Ok(); pit = pit.Next() {
+ cgDirI, err := cgDir.NewDir(ctx, pit.String(), vfs.MkdirOptions{})
+ if err != nil {
+ return err
+ }
+ cgDir = cgDirI.(*cgroupInode)
+ }
+
+ // Walk to target dentry.
+ initDentry, err := fs.root.WalkDentryTree(ctx, vfsObj, initPath)
+ if err != nil {
+ ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup dentry not found: %v", err)
+ return linuxerr.ENOENT
+ }
+ fs.effectiveRoot = initDentry // Reference from WalkDentryTree transferred here.
+ return nil
+}
+
func (fs *filesystem) rootCgroup() kernel.Cgroup {
return kernel.Cgroup{
- Dentry: fs.root,
- CgroupImpl: fs.root.Inode().(kernel.CgroupImpl),
+ Dentry: fs.effectiveRoot,
+ CgroupImpl: fs.effectiveRoot.Inode().(kernel.CgroupImpl),
}
}
@@ -320,6 +377,10 @@ func (fs *filesystem) Release(ctx context.Context) {
r.Unregister(fs.hierarchyID)
}
+ if fs.root != fs.effectiveRoot {
+ fs.effectiveRoot.DecRef(ctx)
+ }
+
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
fs.Filesystem.Release(ctx)
}
@@ -346,15 +407,18 @@ func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error
//
// +stateify savable
type dir struct {
- dirRefs
+ kernfs.InodeNoopRefCount
kernfs.InodeAlwaysValid
kernfs.InodeAttrs
kernfs.InodeNotSymlink
- kernfs.InodeDirectoryNoNewChildren // TODO(b/183137098): Implement mkdir.
+ kernfs.InodeDirectoryNoNewChildren
kernfs.OrderedChildren
implStatFS
locks vfs.FileLocks
+
+ fs *filesystem // Immutable.
+ cgi *cgroupInode // Immutable.
}
// Keep implements kernfs.Inode.Keep.
@@ -378,9 +442,100 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry
return fd.VFSFileDescription(), nil
}
-// DecRef implements kernfs.Inode.DecRef.
-func (d *dir) DecRef(ctx context.Context) {
- d.dirRefs.DecRef(func() { d.Destroy(ctx) })
+// NewDir implements kernfs.Inode.NewDir.
+func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
+ // "Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable."
+ // -- Linux, kernel/cgroup.c:cgroup_mkdir().
+ if strings.Contains(name, "\n") {
+ return nil, linuxerr.EINVAL
+ }
+ return d.OrderedChildren.Inserter(name, func() kernfs.Inode {
+ d.IncLinks(1)
+ return d.fs.newCgroupInode(ctx, auth.CredentialsFromContext(ctx))
+ })
+}
+
+// Rename implements kernfs.Inode.Rename. Cgroupfs only allows renaming of
+// cgroup directories, and the rename may only change the name within the same
+// parent. See linux, kernel/cgroup.c:cgroup_rename().
+func (d *dir) Rename(ctx context.Context, oldname, newname string, child, dst kernfs.Inode) error {
+ if _, ok := child.(*cgroupInode); !ok {
+ // Not a cgroup directory. Control files are backed by different types.
+ return linuxerr.ENOTDIR
+ }
+
+ dstCGInode, ok := dst.(*cgroupInode)
+ if !ok {
+ // Not a cgroup inode, so definitely can't be *this* inode.
+ return linuxerr.EIO
+ }
+ // Note: We're intentionally comparing addresses, since two different dirs
+ // could plausibly be identical in memory, but would occupy different
+ // locations in memory.
+ if d != &dstCGInode.dir {
+ // Destination dir is a different cgroup inode. Cross directory renames
+ // aren't allowed.
+ return linuxerr.EIO
+ }
+
+ // Rename moves oldname to newname within d. Proceed.
+ return d.OrderedChildren.Rename(ctx, oldname, newname, child, dst)
+}
+
+// Unlink implements kernfs.Inode.Unlink. Cgroupfs disallows unlink, as the only
+// files in the filesystem are control files, which can't be deleted.
+func (d *dir) Unlink(ctx context.Context, name string, child kernfs.Inode) error {
+ return linuxerr.EPERM
+}
+
+// hasChildrenLocked returns whether the cgroup dir contains any objects that
+// prevent it from being deleted.
+func (d *dir) hasChildrenLocked() bool {
+ // Subdirs take a link on the parent, so checks if there are any direct
+ // children cgroups. Exclude the dir's self link and the link from ".".
+ if d.InodeAttrs.Links()-2 > 0 {
+ return true
+ }
+ return len(d.cgi.ts) > 0
+}
+
+// HasChildren implements kernfs.Inode.HasChildren.
+//
+// The empty check for a cgroupfs directory is unlike a regular directory since
+// a cgroupfs directory will always have control files. A cgroupfs directory can
+// be deleted if cgroup contains no tasks and has no sub-cgroups.
+func (d *dir) HasChildren() bool {
+ d.fs.tasksMu.RLock()
+ defer d.fs.tasksMu.RUnlock()
+ return d.hasChildrenLocked()
+}
+
+// RmDir implements kernfs.Inode.RmDir.
+func (d *dir) RmDir(ctx context.Context, name string, child kernfs.Inode) error {
+ // Unlike a normal directory, we need to recheck if d is empty again, since
+ // vfs/kernfs can't stop tasks from entering or leaving the cgroup.
+ d.fs.tasksMu.RLock()
+ defer d.fs.tasksMu.RUnlock()
+
+ cgi, ok := child.(*cgroupInode)
+ if !ok {
+ return linuxerr.ENOTDIR
+ }
+ if cgi.dir.hasChildrenLocked() {
+ return linuxerr.ENOTEMPTY
+ }
+
+ // Disallow deletion of the effective root cgroup.
+ if cgi == d.fs.effectiveRoot.Inode().(*cgroupInode) {
+ ctx.Warningf("Cannot delete initial cgroup for new tasks %q", d.fs.effectiveRoot.FSLocalPath())
+ return linuxerr.EBUSY
+ }
+
+ err := d.OrderedChildren.RmDir(ctx, name, child)
+ if err == nil {
+ d.InodeAttrs.DecLinks()
+ }
+ return err
}
// controllerFile represents a generic control file that appears within a cgroup
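Editor's note: the cgroupfs.go changes above add InternalData.InitialCgroupPath; on mount, prepareInitialCgroup() creates that directory tree and points effectiveRoot at it, so new tasks start there instead of the hierarchy root. The following is a hedged sketch of how an internal mount might pass this (together with DefaultControlValues) through vfs.GetFilesystemOptions.InternalData; the surrounding plumbing (vfsObj, creds, ctx) and the exact Data string are assumptions, not part of this diff.

package cgroupfsexample // hypothetical harness, not part of the change

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/cgroupfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// mountCgroupfs mounts a cgroupfs hierarchy whose initial cgroup is /app.
func mountCgroupfs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*vfs.Mount, error) {
	opts := vfs.MountOptions{
		GetFilesystemOptions: vfs.GetFilesystemOptions{
			Data: "memory", // requested controllers (assumed option syntax)
			InternalData: &cgroupfs.InternalData{
				DefaultControlValues: map[string]int64{
					"memory.limit_in_bytes": 2 << 30, // 2 GiB
				},
				// prepareInitialCgroup() will create this path on mount and
				// make it the effectiveRoot for new tasks.
				InitialCgroupPath: "/app",
			},
		},
	}
	return vfsObj.MountDisconnected(ctx, creds, "none" /* source */, cgroupfs.Name, &opts)
}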
diff --git a/pkg/sentry/fsimpl/cgroupfs/memory.go b/pkg/sentry/fsimpl/cgroupfs/memory.go
index 485c98376..d880c9bc4 100644
--- a/pkg/sentry/fsimpl/cgroupfs/memory.go
+++ b/pkg/sentry/fsimpl/cgroupfs/memory.go
@@ -31,22 +31,34 @@ import (
type memoryController struct {
controllerCommon
- limitBytes int64
+ limitBytes int64
+ softLimitBytes int64
+ moveChargeAtImmigrate int64
}
var _ controller = (*memoryController)(nil)
func newMemoryController(fs *filesystem, defaults map[string]int64) *memoryController {
c := &memoryController{
- // Linux sets this to (PAGE_COUNTER_MAX * PAGE_SIZE) by default, which
- // is ~ 2**63 on a 64-bit system. So essentially, inifinity. The exact
- // value isn't very important.
- limitBytes: math.MaxInt64,
+ // Linux sets these limits to (PAGE_COUNTER_MAX * PAGE_SIZE) by default,
+ // which is ~ 2**63 on a 64-bit system. So essentially, infinity. The
+ // exact value isn't very important.
+
+ limitBytes: math.MaxInt64,
+ softLimitBytes: math.MaxInt64,
}
- if val, ok := defaults["memory.limit_in_bytes"]; ok {
- c.limitBytes = val
- delete(defaults, "memory.limit_in_bytes")
+
+ consumeDefault := func(name string, valPtr *int64) {
+ if val, ok := defaults[name]; ok {
+ *valPtr = val
+ delete(defaults, name)
+ }
}
+
+ consumeDefault("memory.limit_in_bytes", &c.limitBytes)
+ consumeDefault("memory.soft_limit_in_bytes", &c.softLimitBytes)
+ consumeDefault("memory.move_charge_at_immigrate", &c.moveChargeAtImmigrate)
+
c.controllerCommon.init(controllerMemory, fs)
return c
}
@@ -55,6 +67,8 @@ func newMemoryController(fs *filesystem, defaults map[string]int64) *memoryContr
func (c *memoryController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) {
contents["memory.usage_in_bytes"] = c.fs.newControllerFile(ctx, creds, &memoryUsageInBytesData{})
contents["memory.limit_in_bytes"] = c.fs.newStaticControllerFile(ctx, creds, linux.FileMode(0644), fmt.Sprintf("%d\n", c.limitBytes))
+ contents["memory.soft_limit_in_bytes"] = c.fs.newStaticControllerFile(ctx, creds, linux.FileMode(0644), fmt.Sprintf("%d\n", c.softLimitBytes))
+ contents["memory.move_charge_at_immigrate"] = c.fs.newStaticControllerFile(ctx, creds, linux.FileMode(0644), fmt.Sprintf("%d\n", c.moveChargeAtImmigrate))
}
// +stateify savable
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
index f981ff296..e0b879339 100644
--- a/pkg/sentry/fsimpl/devpts/BUILD
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -45,7 +45,6 @@ go_library(
"//pkg/sentry/unimpl",
"//pkg/sentry/vfs",
"//pkg/sync",
- "//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index 7a488e9fd..e711debcb 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -29,7 +29,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
)
// Name is the filesystem name.
@@ -180,7 +179,7 @@ func (i *rootInode) allocateTerminal(ctx context.Context, creds *auth.Credential
i.mu.Lock()
defer i.mu.Unlock()
if i.nextIdx == math.MaxUint32 {
- return nil, syserror.ENOMEM
+ return nil, linuxerr.ENOMEM
}
idx := i.nextIdx
i.nextIdx++
@@ -241,7 +240,7 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, erro
// Not a static entry.
idx, err := strconv.ParseUint(name, 10, 32)
if err != nil {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
i.mu.Lock()
defer i.mu.Unlock()
@@ -250,7 +249,7 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, erro
return ri, nil
}
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
// IterDirents implements kernfs.Inode.IterDirents.
diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go
index 9cb21e83b..609623f9f 100644
--- a/pkg/sentry/fsimpl/devpts/line_discipline.go
+++ b/pkg/sentry/fsimpl/devpts/line_discipline.go
@@ -20,10 +20,10 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -203,7 +203,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque
} else if notifyEcho {
l.masterWaiter.Notify(waiter.ReadableEvents)
}
- return 0, syserror.ErrWouldBlock
+ return 0, linuxerr.ErrWouldBlock
}
func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) {
@@ -220,7 +220,7 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ
l.replicaWaiter.Notify(waiter.ReadableEvents)
return n, nil
}
- return 0, syserror.ErrWouldBlock
+ return 0, linuxerr.ErrWouldBlock
}
func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
@@ -242,7 +242,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ
}
return n, nil
}
- return 0, syserror.ErrWouldBlock
+ return 0, linuxerr.ErrWouldBlock
}
func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) {
@@ -257,7 +257,7 @@ func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSeq
l.masterWaiter.Notify(waiter.ReadableEvents)
return n, nil
}
- return 0, syserror.ErrWouldBlock
+ return 0, linuxerr.ErrWouldBlock
}
// transformer is a helper interface to make it easier to stateify queue.
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go
index ff1d89955..85aeefa43 100644
--- a/pkg/sentry/fsimpl/devpts/queue.go
+++ b/pkg/sentry/fsimpl/devpts/queue.go
@@ -17,12 +17,12 @@ package devpts
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/marshal/primitive"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -110,7 +110,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl
defer q.mu.Unlock()
if !q.readable {
- return 0, false, false, syserror.ErrWouldBlock
+ return 0, false, false, linuxerr.ErrWouldBlock
}
if dst.NumBytes() > canonMaxBytes {
@@ -156,7 +156,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip
room := waitBufMaxBytes - q.waitBufLen
// If out of room, return EAGAIN.
if room == 0 && copyLen > 0 {
- return 0, syserror.ErrWouldBlock
+ return 0, linuxerr.ErrWouldBlock
}
// Cap the size of the wait buffer.
if copyLen > room {
diff --git a/pkg/sentry/fsimpl/eventfd/BUILD b/pkg/sentry/fsimpl/eventfd/BUILD
index c09fdc7f9..1cb049a29 100644
--- a/pkg/sentry/fsimpl/eventfd/BUILD
+++ b/pkg/sentry/fsimpl/eventfd/BUILD
@@ -9,11 +9,11 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/errors/linuxerr",
"//pkg/fdnotifier",
"//pkg/hostarch",
"//pkg/log",
"//pkg/sentry/vfs",
- "//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
"@org_golang_x_sys//unix:go_default_library",
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go
index 4f79cfcb7..af5ba5131 100644
--- a/pkg/sentry/fsimpl/eventfd/eventfd.go
+++ b/pkg/sentry/fsimpl/eventfd/eventfd.go
@@ -22,11 +22,11 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/fdnotifier"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -149,7 +149,7 @@ func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem
var buf [8]byte
if _, err := unix.Read(efd.hostfd, buf[:]); err != nil {
if err == unix.EWOULDBLOCK {
- return syserror.ErrWouldBlock
+ return linuxerr.ErrWouldBlock
}
return err
}
@@ -167,7 +167,7 @@ func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequenc
// We can't complete the read if the value is currently zero.
if efd.val == 0 {
efd.mu.Unlock()
- return syserror.ErrWouldBlock
+ return linuxerr.ErrWouldBlock
}
// Update the value based on the mode the event is operating in.
@@ -200,7 +200,7 @@ func (efd *EventFileDescription) hostWriteLocked(val uint64) error {
hostarch.ByteOrder.PutUint64(buf[:], val)
_, err := unix.Write(efd.hostfd, buf[:])
if err == unix.EWOULDBLOCK {
- return syserror.ErrWouldBlock
+ return linuxerr.ErrWouldBlock
}
return err
}
@@ -232,7 +232,7 @@ func (efd *EventFileDescription) Signal(val uint64) error {
// uint64 minus 1.
if val > math.MaxUint64-1-efd.val {
efd.mu.Unlock()
- return syserror.ErrWouldBlock
+ return linuxerr.ErrWouldBlock
}
efd.val += val
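Editor's note: the eventfd.go hunks above only swap syserror for linuxerr, but the logic they touch is the eventfd counter protocol: reads block while the counter is zero, a write that would push the counter past math.MaxUint64-1 blocks, and semaphore mode changes how much a read drains. Below is a small self-contained model of those rules in plain Go; it is not the gvisor implementation.

package main

import (
	"errors"
	"fmt"
	"math"
)

var errWouldBlock = errors.New("operation would block")

type eventCounter struct {
	val       uint64
	semaphore bool // EFD_SEMAPHORE-like mode
}

// read returns ErrWouldBlock-style errors while the counter is zero; a normal
// read drains the whole counter, a semaphore-mode read decrements by one.
func (c *eventCounter) read() (uint64, error) {
	if c.val == 0 {
		return 0, errWouldBlock
	}
	if c.semaphore {
		c.val--
		return 1, nil
	}
	v := c.val
	c.val = 0
	return v, nil
}

// write blocks (returns errWouldBlock) if adding v would exceed MaxUint64-1.
func (c *eventCounter) write(v uint64) error {
	if v > math.MaxUint64-1-c.val {
		return errWouldBlock
	}
	c.val += v
	return nil
}

func main() {
	c := &eventCounter{semaphore: true}
	_ = c.write(3)
	for {
		n, err := c.read()
		if err != nil {
			fmt.Println("blocked:", err)
			break
		}
		fmt.Println("read", n)
	}
}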
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/BUILD
diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD
index 871df5984..05c4fbeb2 100644
--- a/pkg/sentry/fsimpl/fuse/BUILD
+++ b/pkg/sentry/fsimpl/fuse/BUILD
@@ -59,7 +59,6 @@ go_library(
"//pkg/sentry/kernel/auth",
"//pkg/sentry/vfs",
"//pkg/sync",
- "//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
"@org_golang_x_sys//unix:go_default_library",
@@ -84,7 +83,6 @@ go_test(
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/vfs",
- "//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
"@org_golang_x_sys//unix:go_default_library",
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
index dab1e779d..0f855ac59 100644
--- a/pkg/sentry/fsimpl/fuse/dev.go
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -23,7 +23,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -38,7 +37,7 @@ type fuseDevice struct{}
// Open implements vfs.Device.Open.
func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
if !kernel.FUSEEnabled {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
var fd DeviceFD
@@ -126,7 +125,7 @@ func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset in
return 0, linuxerr.EPERM
}
- return 0, syserror.ENOSYS
+ return 0, linuxerr.ENOSYS
}
// Read implements vfs.FileDescriptionImpl.Read.
@@ -192,7 +191,7 @@ func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts
}
if req == nil {
- return 0, syserror.ErrWouldBlock
+ return 0, linuxerr.ErrWouldBlock
}
// We already checked the size: dst must be able to fit the whole request.
@@ -205,7 +204,7 @@ func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts
return 0, err
}
if n != len(req.data) {
- return 0, syserror.EIO
+ return 0, linuxerr.EIO
}
if req.hdr.Opcode == linux.FUSE_WRITE {
@@ -214,7 +213,7 @@ func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts
return 0, err
}
if written != len(req.payload) {
- return 0, syserror.EIO
+ return 0, linuxerr.EIO
}
n += int(written)
}
@@ -238,7 +237,7 @@ func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset i
return 0, linuxerr.EPERM
}
- return 0, syserror.ENOSYS
+ return 0, linuxerr.ENOSYS
}
// Write implements vfs.FileDescriptionImpl.Write.
@@ -395,7 +394,7 @@ func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64
return 0, linuxerr.EPERM
}
- return 0, syserror.ENOSYS
+ return 0, linuxerr.ENOSYS
}
// sendResponse sends a response to the waiting task (if any).
diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go
index 04250d796..8951b5ba8 100644
--- a/pkg/sentry/fsimpl/fuse/dev_test.go
+++ b/pkg/sentry/fsimpl/fuse/dev_test.go
@@ -20,11 +20,11 @@ import (
"testing"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -186,7 +186,7 @@ func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem.
// "would block".
n, err = dev.Read(serverTask, inIOseq, vfs.ReadOptions{})
total += n
- if err != syserror.ErrWouldBlock {
+ if err != linuxerr.ErrWouldBlock {
break
}
diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go
index fcc5d9a2a..9611edd5a 100644
--- a/pkg/sentry/fsimpl/fuse/directory.go
+++ b/pkg/sentry/fsimpl/fuse/directory.go
@@ -19,10 +19,10 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -32,27 +32,27 @@ type directoryFD struct {
// Allocate implements directoryFD.Allocate.
func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
- return syserror.EISDIR
+ return linuxerr.EISDIR
}
// PRead implements vfs.FileDescriptionImpl.PRead.
func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
- return 0, syserror.EISDIR
+ return 0, linuxerr.EISDIR
}
// Read implements vfs.FileDescriptionImpl.Read.
func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
- return 0, syserror.EISDIR
+ return 0, linuxerr.EISDIR
}
// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
- return 0, syserror.EISDIR
+ return 0, linuxerr.EISDIR
}
// Write implements vfs.FileDescriptionImpl.Write.
func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
- return 0, syserror.EISDIR
+ return 0, linuxerr.EISDIR
}
// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
index 172cbd88f..af16098d2 100644
--- a/pkg/sentry/fsimpl/fuse/fusefs.go
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -30,7 +30,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -612,7 +611,7 @@ func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMo
return nil, err
}
if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) {
- return nil, syserror.EIO
+ return nil, linuxerr.EIO
}
child := i.fs.newInode(ctx, out.NodeID, out.Attr)
return child, nil
diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go
index 35d0ab6f4..fe119aa43 100644
--- a/pkg/sentry/fsimpl/fuse/read_write.go
+++ b/pkg/sentry/fsimpl/fuse/read_write.go
@@ -25,7 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/syserror"
)
// ReadInPages sends FUSE_READ requests for the size after round it up to
@@ -221,7 +220,7 @@ func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64,
// Write more than requested? EIO.
if out.Size > toWrite {
- return 0, syserror.EIO
+ return 0, linuxerr.EIO
}
written += out.Size
diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go
index 6c4de3507..38cde8208 100644
--- a/pkg/sentry/fsimpl/fuse/regular_file.go
+++ b/pkg/sentry/fsimpl/fuse/regular_file.go
@@ -24,7 +24,6 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -108,7 +107,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
return 0, err
}
if int64(cp) != toCopy {
- return 0, syserror.EIO
+ return 0, linuxerr.EIO
}
copied += toCopy
}
@@ -205,7 +204,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
return 0, offset, err
}
if int64(cp) != srclen {
- return 0, offset, syserror.EIO
+ return 0, offset, linuxerr.EIO
}
n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data)
@@ -216,7 +215,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
if n == 0 {
// We have checked srclen != 0 previously.
// If err == nil, then it's a short write and we return EIO.
- return 0, offset, syserror.EIO
+ return 0, offset, linuxerr.EIO
}
written = int64(n)
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index 752060044..509dd0e1a 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -54,7 +54,10 @@ go_library(
"//pkg/fdnotifier",
"//pkg/fspath",
"//pkg/hostarch",
+ "//pkg/lisafs",
"//pkg/log",
+ "//pkg/marshal",
+ "//pkg/marshal/primitive",
"//pkg/metric",
"//pkg/p9",
"//pkg/refs",
@@ -79,7 +82,6 @@ go_library(
"//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/syserr",
- "//pkg/syserror",
"//pkg/unet",
"//pkg/usermem",
"//pkg/waiter",
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 5c48a9fee..d99a6112c 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -222,47 +222,88 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
off := uint64(0)
const count = 64 * 1024 // for consistency with the vfs1 client
d.handleMu.RLock()
- if d.readFile.isNil() {
+ if !d.isReadFileOk() {
// This should not be possible because a readable handle should
// have been opened when the calling directoryFD was opened.
d.handleMu.RUnlock()
panic("gofer.dentry.getDirents called without a readable handle")
}
+ // shouldSeek0 indicates whether the server should SEEK to 0 before reading
+ // directory entries.
+ shouldSeek0 := true
for {
- p9ds, err := d.readFile.readdir(ctx, off, count)
- if err != nil {
- d.handleMu.RUnlock()
- return nil, err
- }
- if len(p9ds) == 0 {
- d.handleMu.RUnlock()
- break
- }
- for _, p9d := range p9ds {
- if p9d.Name == "." || p9d.Name == ".." {
- continue
+ if d.fs.opts.lisaEnabled {
+ countLisa := int32(count)
+ if shouldSeek0 {
+ // See lisafs.Getdents64Req.Count.
+ countLisa = -countLisa
+ shouldSeek0 = false
+ }
+ lisafsDs, err := d.readFDLisa.Getdents64(ctx, countLisa)
+ if err != nil {
+ d.handleMu.RUnlock()
+ return nil, err
+ }
+ if len(lisafsDs) == 0 {
+ d.handleMu.RUnlock()
+ break
+ }
+ for i := range lisafsDs {
+ name := string(lisafsDs[i].Name)
+ if name == "." || name == ".." {
+ continue
+ }
+ dirent := vfs.Dirent{
+ Name: name,
+ Ino: d.fs.inoFromKey(inoKey{
+ ino: uint64(lisafsDs[i].Ino),
+ devMinor: uint32(lisafsDs[i].DevMinor),
+ devMajor: uint32(lisafsDs[i].DevMajor),
+ }),
+ NextOff: int64(len(dirents) + 1),
+ Type: uint8(lisafsDs[i].Type),
+ }
+ dirents = append(dirents, dirent)
+ if realChildren != nil {
+ realChildren[name] = struct{}{}
+ }
}
- dirent := vfs.Dirent{
- Name: p9d.Name,
- Ino: d.fs.inoFromQIDPath(p9d.QID.Path),
- NextOff: int64(len(dirents) + 1),
+ } else {
+ p9ds, err := d.readFile.readdir(ctx, off, count)
+ if err != nil {
+ d.handleMu.RUnlock()
+ return nil, err
}
- // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
- // DMSOCKET.
- switch p9d.Type {
- case p9.TypeSymlink:
- dirent.Type = linux.DT_LNK
- case p9.TypeDir:
- dirent.Type = linux.DT_DIR
- default:
- dirent.Type = linux.DT_REG
+ if len(p9ds) == 0 {
+ d.handleMu.RUnlock()
+ break
}
- dirents = append(dirents, dirent)
- if realChildren != nil {
- realChildren[p9d.Name] = struct{}{}
+ for _, p9d := range p9ds {
+ if p9d.Name == "." || p9d.Name == ".." {
+ continue
+ }
+ dirent := vfs.Dirent{
+ Name: p9d.Name,
+ Ino: d.fs.inoFromQIDPath(p9d.QID.Path),
+ NextOff: int64(len(dirents) + 1),
+ }
+ // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
+ // DMSOCKET.
+ switch p9d.Type {
+ case p9.TypeSymlink:
+ dirent.Type = linux.DT_LNK
+ case p9.TypeDir:
+ dirent.Type = linux.DT_DIR
+ default:
+ dirent.Type = linux.DT_REG
+ }
+ dirents = append(dirents, dirent)
+ if realChildren != nil {
+ realChildren[p9d.Name] = struct{}{}
+ }
}
+ off = p9ds[len(p9ds)-1].Offset
}
- off = p9ds[len(p9ds)-1].Offset
}
}
// Emit entries for synthetic children.
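Editor's note: in the getDirents() changes above, the lisafs path enumerates the directory with Getdents64 and a signed count: per the referenced lisafs.Getdents64Req.Count convention, a negative count asks the server to seek the directory FD back to offset 0 before reading, so only the first call of a full listing negates the count. Below is a self-contained sketch of that calling convention; the fake server is purely illustrative, while the real call is d.readFDLisa.Getdents64(ctx, count).

package main

import "fmt"

// fakeServer models a directory FD on the gofer side: it keeps a cursor and
// rewinds when it sees a negative count.
type fakeServer struct {
	entries []string
	pos     int
}

func (s *fakeServer) getdents64(count int32) []string {
	if count < 0 {
		s.pos = 0 // SEEK to offset 0 before reading
		count = -count
	}
	end := s.pos + int(count)
	if end > len(s.entries) {
		end = len(s.entries)
	}
	out := s.entries[s.pos:end]
	s.pos = end
	return out
}

func main() {
	srv := &fakeServer{entries: []string{".", "..", "a", "b", "c"}}
	const batch = 2
	shouldSeek0 := true
	var names []string
	for {
		count := int32(batch)
		if shouldSeek0 {
			count = -count // first read of the enumeration rewinds the FD
			shouldSeek0 = false
		}
		ds := srv.getdents64(count)
		if len(ds) == 0 {
			break
		}
		for _, name := range ds {
			if name == "." || name == ".." {
				continue
			}
			names = append(names, name)
		}
	}
	fmt.Println(names) // [a b c]
}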
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 05b776c2e..f7b3446d3 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -21,10 +21,12 @@ import (
"sync"
"sync/atomic"
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/lisafs"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
"gvisor.dev/gvisor/pkg/sentry/fsmetric"
@@ -33,7 +35,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
)
// Sync implements vfs.FilesystemImpl.Sync.
@@ -54,9 +55,47 @@ func (fs *filesystem) Sync(ctx context.Context) error {
// regardless.
var retErr error
+ if fs.opts.lisaEnabled {
+ // Try accumulating all FDIDs to fsync and fsync them via one RPC as
+ // opposed to making an RPC per FDID. Passing a non-nil accFsyncFDIDs to
+ // dentry.syncCachedFile() and specialFileFD.sync() will cause them to not
+ // make an RPC, instead accumulate syncable FDIDs in the passed slice.
+ accFsyncFDIDs := make([]lisafs.FDID, 0, len(ds)+len(sffds))
+
+ // Sync syncable dentries.
+ for _, d := range ds {
+ if err := d.syncCachedFile(ctx, true /* forFilesystemSync */, &accFsyncFDIDs); err != nil {
+ ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err)
+ if retErr == nil {
+ retErr = err
+ }
+ }
+ }
+
+ // Sync special files, which may be writable but do not use dentry shared
+ // handles (so they won't be synced by the above).
+ for _, sffd := range sffds {
+ if err := sffd.sync(ctx, true /* forFilesystemSync */, &accFsyncFDIDs); err != nil {
+ ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err)
+ if retErr == nil {
+ retErr = err
+ }
+ }
+ }
+
+ if err := fs.clientLisa.SyncFDs(ctx, accFsyncFDIDs); err != nil {
+ ctx.Infof("gofer.filesystem.Sync: fs.fsyncMultipleFDLisa failed: %v", err)
+ if retErr == nil {
+ retErr = err
+ }
+ }
+
+ return retErr
+ }
+
// Sync syncable dentries.
for _, d := range ds {
- if err := d.syncCachedFile(ctx, true /* forFilesystemSync */); err != nil {
+ if err := d.syncCachedFile(ctx, true /* forFilesystemSync */, nil /* accFsyncFDIDsLisa */); err != nil {
ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err)
if retErr == nil {
retErr = err
@@ -67,7 +106,7 @@ func (fs *filesystem) Sync(ctx context.Context) error {
// Sync special files, which may be writable but do not use dentry shared
// handles (so they won't be synced by the above).
for _, sffd := range sffds {
- if err := sffd.sync(ctx, true /* forFilesystemSync */); err != nil {
+ if err := sffd.sync(ctx, true /* forFilesystemSync */, nil /* accFsyncFDIDsLisa */); err != nil {
ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err)
if retErr == nil {
retErr = err
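Editor's note: the Sync() hunk above batches fsync work when lisafs is enabled. Passing a non-nil accFsyncFDIDs slice makes dentry.syncCachedFile() and specialFileFD.sync() record their FDIDs instead of issuing one RPC each, and a single clientLisa.SyncFDs() call then flushes the whole batch. Below is a small sketch of that accumulate-then-flush pattern using hypothetical names, not gofer code.

package main

import "fmt"

type fdid uint32

// syncOne syncs a single FD, or just records it when acc is non-nil so the
// caller can issue one batched RPC later.
func syncOne(id fdid, acc *[]fdid, syncRPC func(fdid) error) error {
	if acc != nil {
		*acc = append(*acc, id)
		return nil
	}
	return syncRPC(id)
}

func main() {
	rpcs := 0
	syncRPC := func(fdid) error { rpcs++; return nil }
	syncManyRPC := func([]fdid) error { rpcs++; return nil }

	ids := []fdid{1, 2, 3, 4}

	// Without batching: one RPC per FD.
	for _, id := range ids {
		_ = syncOne(id, nil, syncRPC)
	}

	// With batching: accumulate, then one RPC for the whole filesystem sync.
	acc := make([]fdid, 0, len(ids))
	for _, id := range ids {
		_ = syncOne(id, &acc, syncRPC)
	}
	_ = syncManyRPC(acc)

	fmt.Println(rpcs) // 5: four unbatched RPCs plus one batched RPC
}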
@@ -198,7 +237,13 @@ afterSymlink:
rp.Advance()
return d.parent, followedSymlink, nil
}
- child, err := fs.getChildLocked(ctx, d, name, ds)
+ var child *dentry
+ var err error
+ if fs.opts.lisaEnabled {
+ child, err = fs.getChildAndWalkPathLocked(ctx, d, rp, ds)
+ } else {
+ child, err = fs.getChildLocked(ctx, d, name, ds)
+ }
if err != nil {
return nil, false, err
}
@@ -220,6 +265,99 @@ afterSymlink:
return child, followedSymlink, nil
}
+// Preconditions:
+// * fs.opts.lisaEnabled.
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
+// * parent.isDir().
+// * parent and the dentry at name have been revalidated.
+func (fs *filesystem) getChildAndWalkPathLocked(ctx context.Context, parent *dentry, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+ // Note that pit is a copy of the iterator that does not affect rp.
+ pit := rp.Pit()
+ first := pit.String()
+ if len(first) > maxFilenameLen {
+ return nil, linuxerr.ENAMETOOLONG
+ }
+ if child, ok := parent.children[first]; ok || parent.isSynthetic() {
+ if child == nil {
+ return nil, linuxerr.ENOENT
+ }
+ return child, nil
+ }
+
+ // Walk as much of the path as possible in 1 RPC.
+ names := []string{first}
+ for pit = pit.Next(); pit.Ok(); pit = pit.Next() {
+ name := pit.String()
+ if name == "." {
+ continue
+ }
+ if name == ".." {
+ break
+ }
+ names = append(names, name)
+ }
+ status, inodes, err := parent.controlFDLisa.WalkMultiple(ctx, names)
+ if err != nil {
+ return nil, err
+ }
+ if len(inodes) == 0 {
+ parent.cacheNegativeLookupLocked(first)
+ return nil, linuxerr.ENOENT
+ }
+
+ // Add the walked inodes into the dentry tree.
+ curParent := parent
+ curParentDirMuLock := func() {
+ if curParent != parent {
+ curParent.dirMu.Lock()
+ }
+ }
+ curParentDirMuUnlock := func() {
+ if curParent != parent {
+ curParent.dirMu.Unlock() // +checklocksforce: locked via curParentDirMuLock().
+ }
+ }
+ var ret *dentry
+ var dentryCreationErr error
+ for i := range inodes {
+ if dentryCreationErr != nil {
+ fs.clientLisa.CloseFDBatched(ctx, inodes[i].ControlFD)
+ continue
+ }
+
+ child, err := fs.newDentryLisa(ctx, &inodes[i])
+ if err != nil {
+ fs.clientLisa.CloseFDBatched(ctx, inodes[i].ControlFD)
+ dentryCreationErr = err
+ continue
+ }
+ curParentDirMuLock()
+ curParent.cacheNewChildLocked(child, names[i])
+ curParentDirMuUnlock()
+ // For now, child has 0 references, so our caller should call
+ // child.checkCachingLocked(). curParent gained a ref so we should also
+ // call curParent.checkCachingLocked() so it can be removed from the cache
+ // if needed. We only do that for the first iteration because all
+ // subsequent parents would have already been added to ds.
+ if i == 0 {
+ *ds = appendDentry(*ds, curParent)
+ }
+ *ds = appendDentry(*ds, child)
+ curParent = child
+ if i == 0 {
+ ret = child
+ }
+ }
+
+ if status == lisafs.WalkComponentDoesNotExist && curParent.isDir() {
+ curParentDirMuLock()
+ curParent.cacheNegativeLookupLocked(names[len(inodes)])
+ curParentDirMuUnlock()
+ }
+ return ret, dentryCreationErr
+}
+
// getChildLocked returns a dentry representing the child of parent with the
// given name. Returns ENOENT if the child doesn't exist.
//
@@ -228,32 +366,47 @@ afterSymlink:
// * parent.dirMu must be locked.
// * parent.isDir().
// * name is not "." or "..".
-// * dentry at name has been revalidated
+// * parent and the dentry at name have been revalidated.
func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
if len(name) > maxFilenameLen {
return nil, linuxerr.ENAMETOOLONG
}
if child, ok := parent.children[name]; ok || parent.isSynthetic() {
if child == nil {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
return child, nil
}
- qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
- if err != nil {
- if linuxerr.Equals(linuxerr.ENOENT, err) {
- parent.cacheNegativeLookupLocked(name)
+ var child *dentry
+ if fs.opts.lisaEnabled {
+ childInode, err := parent.controlFDLisa.Walk(ctx, name)
+ if err != nil {
+ if linuxerr.Equals(linuxerr.ENOENT, err) {
+ parent.cacheNegativeLookupLocked(name)
+ }
+ return nil, err
+ }
+ // Create a new dentry representing the file.
+ child, err = fs.newDentryLisa(ctx, childInode)
+ if err != nil {
+ fs.clientLisa.CloseFDBatched(ctx, childInode.ControlFD)
+ return nil, err
+ }
+ } else {
+ qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
+ if err != nil {
+ if linuxerr.Equals(linuxerr.ENOENT, err) {
+ parent.cacheNegativeLookupLocked(name)
+ }
+ return nil, err
+ }
+ // Create a new dentry representing the file.
+ child, err = fs.newDentry(ctx, file, qid, attrMask, &attr)
+ if err != nil {
+ file.close(ctx)
+ return nil, err
}
- return nil, err
- }
-
- // Create a new dentry representing the file.
- child, err := fs.newDentry(ctx, file, qid, attrMask, &attr)
- if err != nil {
- file.close(ctx)
- delete(parent.children, name)
- return nil, err
}
parent.cacheNewChildLocked(child, name)
appendNewChildDentry(ds, parent, child)
@@ -329,7 +482,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
// Preconditions:
// * !rp.Done().
// * For the final path component in rp, !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) error, createInSyntheticDir func(parent *dentry, name string) error) error {
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) (*lisafs.Inode, error), createInSyntheticDir func(parent *dentry, name string) error, updateChild func(child *dentry)) error {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -349,7 +502,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
return linuxerr.EEXIST
}
if parent.isDeleted() {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, name, &ds); err != nil {
return err
@@ -395,7 +548,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
return err
}
if !dir && rp.MustBeDir() {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
if parent.isSynthetic() {
if createInSyntheticDir == nil {
@@ -416,9 +569,26 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
// No cached dentry exists; however, in InteropModeShared there might still be
// an existing file at name. Just attempt the file creation RPC anyways. If a
// file does exist, the RPC will fail with EEXIST like we would have.
- if err := createInRemoteDir(parent, name, &ds); err != nil {
+ lisaInode, err := createInRemoteDir(parent, name, &ds)
+ if err != nil {
return err
}
+ // lisafs may aggressively cache newly created inodes. This has helped reduce
+ // Walk RPCs in practice.
+ if lisaInode != nil {
+ child, err := fs.newDentryLisa(ctx, lisaInode)
+ if err != nil {
+ fs.clientLisa.CloseFDBatched(ctx, lisaInode.ControlFD)
+ return err
+ }
+ parent.cacheNewChildLocked(child, name)
+ appendNewChildDentry(&ds, parent, child)
+
+ // lisafs may update dentry properties upon successful creation.
+ if updateChild != nil {
+ updateChild(child)
+ }
+ }
if fs.opts.interop != InteropModeShared {
if child, ok := parent.children[name]; ok && child == nil {
// Delete the now-stale negative dentry.
@@ -463,7 +633,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
}
} else {
if name == "." || name == ".." {
- return syserror.EISDIR
+ return linuxerr.EISDIR
}
}
@@ -486,7 +656,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
child, ok = parent.children[name]
if ok && child == nil {
// Hit a negative cached entry, child doesn't exist.
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
} else {
child, _, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
@@ -552,7 +722,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
// child must be a non-directory file.
if child != nil && child.isDir() {
vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above.
- return syserror.EISDIR
+ return linuxerr.EISDIR
}
if rp.MustBeDir() {
if child != nil {
@@ -563,10 +733,14 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
}
if parent.isSynthetic() {
if child == nil {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
} else if child == nil || !child.isSynthetic() {
- err = parent.file.unlinkAt(ctx, name, flags)
+ if fs.opts.lisaEnabled {
+ err = parent.controlFDLisa.UnlinkAt(ctx, name, flags)
+ } else {
+ err = parent.file.unlinkAt(ctx, name, flags)
+ }
if err != nil {
if child != nil {
vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above.
@@ -659,40 +833,43 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
// LinkAt implements vfs.FilesystemImpl.LinkAt.
func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
- return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, _ **[]*dentry) error {
+ err := fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, ds **[]*dentry) (*lisafs.Inode, error) {
if rp.Mount() != vd.Mount() {
- return linuxerr.EXDEV
+ return nil, linuxerr.EXDEV
}
d := vd.Dentry().Impl().(*dentry)
if d.isDir() {
- return linuxerr.EPERM
+ return nil, linuxerr.EPERM
}
gid := auth.KGID(atomic.LoadUint32(&d.gid))
uid := auth.KUID(atomic.LoadUint32(&d.uid))
mode := linux.FileMode(atomic.LoadUint32(&d.mode))
if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil {
- return err
+ return nil, err
}
if d.nlink == 0 {
- return syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
if d.nlink == math.MaxUint32 {
- return linuxerr.EMLINK
+ return nil, linuxerr.EMLINK
}
- if err := parent.file.link(ctx, d.file, childName); err != nil {
- return err
+ if fs.opts.lisaEnabled {
+ return parent.controlFDLisa.LinkAt(ctx, d.controlFDLisa.ID(), childName)
}
+ return nil, parent.file.link(ctx, d.file, childName)
+ }, nil, nil)
+ if err == nil {
// Success!
- atomic.AddUint32(&d.nlink, 1)
- return nil
- }, nil)
+ vd.Dentry().Impl().(*dentry).incLinks()
+ }
+ return err
}
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
creds := rp.Credentials()
- return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, ds **[]*dentry) error {
+ return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*lisafs.Inode, error) {
// If the parent is a setgid directory, use the parent's GID
// rather than the caller's and enable setgid.
kgid := creds.EffectiveKGID
@@ -701,9 +878,18 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
kgid = auth.KGID(atomic.LoadUint32(&parent.gid))
mode |= linux.S_ISGID
}
- if _, err := parent.file.mkdir(ctx, name, p9.FileMode(mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid)); err != nil {
+ var (
+ childDirInode *lisafs.Inode
+ err error
+ )
+ if fs.opts.lisaEnabled {
+ childDirInode, err = parent.controlFDLisa.MkdirAt(ctx, name, mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(kgid))
+ } else {
+ _, err = parent.file.mkdir(ctx, name, p9.FileMode(mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid))
+ }
+ if err != nil {
if !opts.ForSyntheticMountpoint || linuxerr.Equals(linuxerr.EEXIST, err) {
- return err
+ return nil, err
}
ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err)
parent.createSyntheticChildLocked(&createSyntheticOpts{
@@ -717,7 +903,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
if fs.opts.interop != InteropModeShared {
parent.incLinks()
}
- return nil
+ return childDirInode, nil
}, func(parent *dentry, name string) error {
if !opts.ForSyntheticMountpoint {
// Can't create non-synthetic files in synthetic directories.
@@ -731,16 +917,26 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
})
parent.incLinks()
return nil
- })
+ }, nil)
}
// MknodAt implements vfs.FilesystemImpl.MknodAt.
func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
- return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*lisafs.Inode, error) {
creds := rp.Credentials()
- _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
- if !linuxerr.Equals(linuxerr.EPERM, err) {
- return err
+ var (
+ childInode *lisafs.Inode
+ err error
+ )
+ if fs.opts.lisaEnabled {
+ childInode, err = parent.controlFDLisa.MknodAt(ctx, name, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID), opts.DevMinor, opts.DevMajor)
+ } else {
+ _, err = parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+ }
+ if err == nil {
+ return childInode, nil
+ } else if !linuxerr.Equals(linuxerr.EPERM, err) {
+ return nil, err
}
// EPERM means that gofer does not allow creating a socket or pipe. Fallback
@@ -751,10 +947,10 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
switch {
case err == nil:
// Step succeeded, another file exists.
- return linuxerr.EEXIST
+ return nil, linuxerr.EEXIST
case !linuxerr.Equals(linuxerr.ENOENT, err):
// Unexpected error.
- return err
+ return nil, err
}
switch opts.Mode.FileType() {
@@ -767,7 +963,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
endpoint: opts.Endpoint,
})
*ds = appendDentry(*ds, parent)
- return nil
+ return nil, nil
case linux.S_IFIFO:
parent.createSyntheticChildLocked(&createSyntheticOpts{
name: name,
@@ -777,11 +973,11 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize),
})
*ds = appendDentry(*ds, parent)
- return nil
+ return nil, nil
}
// Retain error from gofer if synthetic file cannot be created internally.
- return linuxerr.EPERM
- }, nil)
+ return nil, linuxerr.EPERM
+ }, nil, nil)
}
// OpenAt implements vfs.FilesystemImpl.OpenAt.
@@ -811,7 +1007,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if rp.Done() {
// Reject attempts to open mount root directory with O_CREAT.
if mayCreate && rp.MustBeDir() {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
if mustCreate {
return nil, linuxerr.EEXIST
@@ -841,7 +1037,7 @@ afterTrailingSymlink:
}
// Reject attempts to open directories with O_CREAT.
if mayCreate && rp.MustBeDir() {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, rp.Component(), &ds); err != nil {
return nil, err
@@ -922,11 +1118,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
case linux.S_IFDIR:
// Can't open directories with O_CREAT.
if opts.Flags&linux.O_CREAT != 0 {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
// Can't open directories writably.
if ats&vfs.MayWrite != 0 {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
if opts.Flags&linux.O_DIRECT != 0 {
return nil, linuxerr.EINVAL
@@ -987,6 +1183,23 @@ func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptio
if opts.Flags&linux.O_DIRECT != 0 {
return nil, linuxerr.EINVAL
}
+ if d.fs.opts.lisaEnabled {
+ // Note that special value of linux.SockType = 0 is interpreted by lisafs
+ // as "do not care about the socket type". Analogous to p9.AnonymousSocket.
+ sockFD, err := d.controlFDLisa.Connect(ctx, 0 /* sockType */)
+ if err != nil {
+ return nil, err
+ }
+ fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), sockFD, &host.NewFDOptions{
+ HaveFlags: true,
+ Flags: opts.Flags,
+ })
+ if err != nil {
+ unix.Close(sockFD)
+ return nil, err
+ }
+ return fd, nil
+ }
fdObj, err := d.file.connect(ctx, p9.AnonymousSocket)
if err != nil {
return nil, err
@@ -999,6 +1212,7 @@ func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptio
fdObj.Close()
return nil, err
}
+ // Ownership has been transferred to fd.
fdObj.Release()
return fd, nil
}
@@ -1018,7 +1232,13 @@ func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs.
// since closed its end.
isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0
retry:
- h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0)
+ var h handle
+ var err error
+ if d.fs.opts.lisaEnabled {
+ h, err = openHandleLisa(ctx, d.controlFDLisa, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0)
+ } else {
+ h, err = openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0)
+ }
if err != nil {
if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && linuxerr.Equals(linuxerr.ENXIO, err) {
// An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails
@@ -1054,7 +1274,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
return nil, err
}
if d.isDeleted() {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
mnt := rp.Mount()
if err := mnt.CheckBeginWrite(); err != nil {
@@ -1062,18 +1282,8 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
}
defer mnt.EndWrite()
- // 9P2000.L's lcreate takes a fid representing the parent directory, and
- // converts it into an open fid representing the created file, so we need
- // to duplicate the directory fid first.
- _, dirfile, err := d.file.walk(ctx, nil)
- if err != nil {
- return nil, err
- }
creds := rp.Credentials()
name := rp.Component()
- // We only want the access mode for creating the file.
- createFlags := p9.OpenFlags(opts.Flags) & p9.OpenFlagsModeMask
-
// If the parent is a setgid directory, use the parent's GID rather
// than the caller's.
kgid := creds.EffectiveKGID
@@ -1081,51 +1291,87 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
kgid = auth.KGID(atomic.LoadUint32(&d.gid))
}
- fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, p9.FileMode(opts.Mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid))
- if err != nil {
- dirfile.close(ctx)
- return nil, err
- }
- // Then we need to walk to the file we just created to get a non-open fid
- // representing it, and to get its metadata. This must use d.file since, as
- // explained above, dirfile was invalidated by dirfile.Create().
- _, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name)
- if err != nil {
- openFile.close(ctx)
- if fdobj != nil {
- fdobj.Close()
+ var child *dentry
+ var openP9File p9file
+ openLisaFD := lisafs.InvalidFDID
+ openHostFD := int32(-1)
+ if d.fs.opts.lisaEnabled {
+ ino, openFD, hostFD, err := d.controlFDLisa.OpenCreateAt(ctx, name, opts.Flags&linux.O_ACCMODE, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(kgid))
+ if err != nil {
+ return nil, err
+ }
+ openHostFD = int32(hostFD)
+ openLisaFD = openFD
+
+ child, err = d.fs.newDentryLisa(ctx, &ino)
+ if err != nil {
+ d.fs.clientLisa.CloseFDBatched(ctx, ino.ControlFD)
+ d.fs.clientLisa.CloseFDBatched(ctx, openFD)
+ if hostFD >= 0 {
+ unix.Close(hostFD)
+ }
+ return nil, err
+ }
+ } else {
+ // 9P2000.L's lcreate takes a fid representing the parent directory, and
+ // converts it into an open fid representing the created file, so we need
+ // to duplicate the directory fid first.
+ _, dirfile, err := d.file.walk(ctx, nil)
+ if err != nil {
+ return nil, err
+ }
+ // We only want the access mode for creating the file.
+ createFlags := p9.OpenFlags(opts.Flags) & p9.OpenFlagsModeMask
+
+ fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, p9.FileMode(opts.Mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid))
+ if err != nil {
+ dirfile.close(ctx)
+ return nil, err
+ }
+ // Then we need to walk to the file we just created to get a non-open fid
+ // representing it, and to get its metadata. This must use d.file since, as
+ // explained above, dirfile was invalidated by dirfile.Create().
+ _, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name)
+ if err != nil {
+ openFile.close(ctx)
+ if fdobj != nil {
+ fdobj.Close()
+ }
+ return nil, err
+ }
+
+ // Construct the new dentry.
+ child, err = d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr)
+ if err != nil {
+ nonOpenFile.close(ctx)
+ openFile.close(ctx)
+ if fdobj != nil {
+ fdobj.Close()
+ }
+ return nil, err
}
- return nil, err
- }
- // Construct the new dentry.
- child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr)
- if err != nil {
- nonOpenFile.close(ctx)
- openFile.close(ctx)
if fdobj != nil {
- fdobj.Close()
+ openHostFD = int32(fdobj.Release())
}
- return nil, err
+ openP9File = openFile
}
// Incorporate the fid that was opened by lcreate.
useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
if useRegularFileFD {
- openFD := int32(-1)
- if fdobj != nil {
- openFD = int32(fdobj.Release())
- }
child.handleMu.Lock()
if vfs.MayReadFileWithOpenFlags(opts.Flags) {
- child.readFile = openFile
- if fdobj != nil {
- child.readFD = openFD
- child.mmapFD = openFD
+ child.readFile = openP9File
+ child.readFDLisa = d.fs.clientLisa.NewFD(openLisaFD)
+ if openHostFD != -1 {
+ child.readFD = openHostFD
+ child.mmapFD = openHostFD
}
}
if vfs.MayWriteFileWithOpenFlags(opts.Flags) {
- child.writeFile = openFile
- child.writeFD = openFD
+ child.writeFile = openP9File
+ child.writeFDLisa = d.fs.clientLisa.NewFD(openLisaFD)
+ child.writeFD = openHostFD
}
child.handleMu.Unlock()
}
@@ -1147,11 +1393,9 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
childVFSFD = &fd.vfsfd
} else {
h := handle{
- file: openFile,
- fd: -1,
- }
- if fdobj != nil {
- h.fd = int32(fdobj.Release())
+ file: openP9File,
+ fdLisa: d.fs.clientLisa.NewFD(openLisaFD),
+ fd: openHostFD,
}
fd, err := newSpecialFileFD(h, mnt, child, opts.Flags)
if err != nil {
@@ -1268,7 +1512,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
defer newParent.dirMu.Unlock()
}
if newParent.isDeleted() {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
replaced, err := fs.getChildLocked(ctx, newParent, newName, &ds)
if err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) {
@@ -1282,7 +1526,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
replacedVFSD = &replaced.vfsd
if replaced.isDir() {
if !renamed.isDir() {
- return syserror.EISDIR
+ return linuxerr.EISDIR
}
if genericIsAncestorDentry(replaced, renamed) {
return linuxerr.ENOTEMPTY
@@ -1305,7 +1549,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// Update the remote filesystem.
if !renamed.isSynthetic() {
- if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
+ if fs.opts.lisaEnabled {
+ err = renamed.controlFDLisa.RenameTo(ctx, newParent.controlFDLisa.ID(), newName)
+ } else {
+ err = renamed.file.rename(ctx, newParent.file, newName)
+ }
+ if err != nil {
vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
return err
}
@@ -1316,7 +1565,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if replaced.isDir() {
flags = linux.AT_REMOVEDIR
}
- if err := newParent.file.unlinkAt(ctx, newName, flags); err != nil {
+ if fs.opts.lisaEnabled {
+ err = newParent.controlFDLisa.UnlinkAt(ctx, newName, flags)
+ } else {
+ err = newParent.file.unlinkAt(ctx, newName, flags)
+ }
+ if err != nil {
vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
return err
}
@@ -1432,6 +1686,28 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
for d.isSynthetic() {
d = d.parent
}
+ if fs.opts.lisaEnabled {
+ var statFS lisafs.StatFS
+ if err := d.controlFDLisa.StatFSTo(ctx, &statFS); err != nil {
+ return linux.Statfs{}, err
+ }
+ if statFS.NameLength > maxFilenameLen {
+ statFS.NameLength = maxFilenameLen
+ }
+ return linux.Statfs{
+ // This is primarily for distinguishing a gofer file system in
+ // tests. Testing is important, so instead of defining
+ // something completely random, use a standard value.
+ Type: linux.V9FS_MAGIC,
+ BlockSize: statFS.BlockSize,
+ Blocks: statFS.Blocks,
+ BlocksFree: statFS.BlocksFree,
+ BlocksAvailable: statFS.BlocksAvailable,
+ Files: statFS.Files,
+ FilesFree: statFS.FilesFree,
+ NameLength: statFS.NameLength,
+ }, nil
+ }
fsstat, err := d.file.statFS(ctx)
if err != nil {
return linux.Statfs{}, err
@@ -1457,11 +1733,21 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
- return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, _ **[]*dentry) error {
+ return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*lisafs.Inode, error) {
creds := rp.Credentials()
+ if fs.opts.lisaEnabled {
+ return parent.controlFDLisa.SymlinkAt(ctx, name, target, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID))
+ }
_, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
- return err
- }, nil)
+ return nil, err
+ }, nil, func(child *dentry) {
+ if fs.opts.interop != InteropModeShared {
+ // lisafs caches the symlink target on creation. In practice, this
+ // helps avoid a lot of ReadLink RPCs.
+ child.haveTarget = true
+ child.target = target
+ }
+ })
}
// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
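
(Aside: the SymlinkAt hunk above is the clearest view of the new doCreateAt calling convention: the create callback now returns a *lisafs.Inode (nil on the 9P path), the nil argument covers synthetic directories, and a final hook runs on the newly created child dentry. The stand-alone Go sketch below only illustrates that shape; doCreate, inode, and dentry here are simplified stand-ins, not the real gofer definitions.)

package main

import "fmt"

type inode struct{ controlFD uint64 } // stand-in for lisafs.Inode
type dentry struct{ name string }     // stand-in for gofer.dentry

// doCreate mimics the doCreateAt contract implied by the diff: createRemote
// may return an inode to instantiate the child from (nil on the 9P path), and
// updateChild, if non-nil, runs once the child dentry exists.
func doCreate(parentName, name string,
	createRemote func(parent, name string) (*inode, error),
	updateChild func(child *dentry)) (*dentry, error) {

	ino, err := createRemote(parentName, name)
	if err != nil {
		return nil, err
	}
	child := &dentry{name: name}
	if ino != nil {
		// lisafs path: child metadata would be populated from ino here.
		_ = ino.controlFD
	}
	if updateChild != nil {
		updateChild(child)
	}
	return child, nil
}

func main() {
	child, err := doCreate("/", "link",
		func(parent, name string) (*inode, error) { return &inode{controlFD: 42}, nil },
		func(c *dentry) { fmt.Println("caching symlink target on", c.name) })
	fmt.Println(child, err)
}
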
@@ -1506,7 +1792,7 @@ func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
if err != nil {
return nil, err
}
- return d.listXattr(ctx, rp.Credentials(), size)
+ return d.listXattr(ctx, size)
}
// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
@@ -1613,6 +1899,9 @@ func (fs *filesystem) MountOptions() string {
if fs.opts.overlayfsStaleRead {
optsKV = append(optsKV, mopt{moptOverlayfsStaleRead, nil})
}
+ if fs.opts.lisaEnabled {
+ optsKV = append(optsKV, mopt{moptLisafs, nil})
+ }
opts := make([]string, 0, len(optsKV))
for _, opt := range optsKV {
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 25d2e39d6..b98825e26 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -48,6 +48,7 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/lisafs"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/p9"
refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
@@ -62,7 +63,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/unet"
)
@@ -84,6 +84,7 @@ const (
moptForcePageCache = "force_page_cache"
moptLimitHostFDTranslation = "limit_host_fd_translation"
moptOverlayfsStaleRead = "overlayfs_stale_read"
+ moptLisafs = "lisafs"
)
// Valid values for the "cache" mount option.
@@ -119,6 +120,10 @@ type filesystem struct {
// client is the client used by this filesystem. client is immutable.
client *p9.Client `state:"nosave"`
+ // clientLisa is the client used for communicating with the server when
+ // lisafs is enabled. clientLisa is immutable.
+ clientLisa *lisafs.Client `state:"nosave"`
+
// clock is a realtime clock used to set timestamps in file operations.
clock ktime.Clock
@@ -162,6 +167,12 @@ type filesystem struct {
inoMu sync.Mutex `state:"nosave"`
inoByQIDPath map[uint64]uint64 `state:"nosave"`
+ // inoByKey is the same as inoByQIDPath but only used by lisafs. It helps
+ // identify inodes based on the device ID and host inode number provided
+ // by the gofer process. It is not preserved across checkpoint/restore for
+ // the same reason as above. inoByKey is protected by inoMu.
+ inoByKey map[inoKey]uint64 `state:"nosave"`
+
// lastIno is the last inode number assigned to a file. lastIno is accessed
// using atomic memory operations.
lastIno uint64
@@ -215,6 +226,10 @@ type filesystemOptions struct {
// way that application FDs representing "special files" such as sockets
// do. Note that this disables client caching and mmap for regular files.
regularFilesUseSpecialFileFD bool
+
+ // lisaEnabled indicates whether the client will use the lisafs protocol to
+ // communicate with the server instead of 9P.
+ lisaEnabled bool
}
// InteropMode controls the client's interaction with other remote filesystem
@@ -428,6 +443,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
delete(mopts, moptOverlayfsStaleRead)
fsopts.overlayfsStaleRead = true
}
+ if lisafs, ok := mopts[moptLisafs]; ok {
+ delete(mopts, moptLisafs)
+ fsopts.lisaEnabled, err = strconv.ParseBool(lisafs)
+ if err != nil {
+ ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid lisafs option: %s", lisafs)
+ return nil, nil, linuxerr.EINVAL
+ }
+ }
// fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying
// "cache=none".
@@ -459,44 +482,83 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
syncableDentries: make(map[*dentry]struct{}),
specialFileFDs: make(map[*specialFileFD]struct{}),
inoByQIDPath: make(map[uint64]uint64),
+ inoByKey: make(map[inoKey]uint64),
}
fs.vfsfs.Init(vfsObj, &fstype, fs)
+ if err := fs.initClientAndRoot(ctx); err != nil {
+ fs.vfsfs.DecRef(ctx)
+ return nil, nil, err
+ }
+
+ return &fs.vfsfs, &fs.root.vfsd, nil
+}
+
+func (fs *filesystem) initClientAndRoot(ctx context.Context) error {
+ var err error
+ if fs.opts.lisaEnabled {
+ var rootInode *lisafs.Inode
+ rootInode, err = fs.initClientLisa(ctx)
+ if err != nil {
+ return err
+ }
+ fs.root, err = fs.newDentryLisa(ctx, rootInode)
+ if err != nil {
+ fs.clientLisa.CloseFDBatched(ctx, rootInode.ControlFD)
+ }
+ } else {
+ fs.root, err = fs.initClient(ctx)
+ }
+
+ // Set the root's reference count to 2. One reference is returned to the
+ // caller, and the other is held by fs to prevent the root from being "cached"
+ // and subsequently evicted.
+ if err == nil {
+ fs.root.refs = 2
+ }
+ return err
+}
+
+func (fs *filesystem) initClientLisa(ctx context.Context) (*lisafs.Inode, error) {
+ sock, err := unet.NewSocket(fs.opts.fd)
+ if err != nil {
+ return nil, err
+ }
+
+ var rootInode *lisafs.Inode
+ ctx.UninterruptibleSleepStart(false)
+ fs.clientLisa, rootInode, err = lisafs.NewClient(sock, fs.opts.aname)
+ ctx.UninterruptibleSleepFinish(false)
+ return rootInode, err
+}
+
+func (fs *filesystem) initClient(ctx context.Context) (*dentry, error) {
// Connect to the server.
if err := fs.dial(ctx); err != nil {
- return nil, nil, err
+ return nil, err
}
// Perform attach to obtain the filesystem root.
ctx.UninterruptibleSleepStart(false)
- attached, err := fs.client.Attach(fsopts.aname)
+ attached, err := fs.client.Attach(fs.opts.aname)
ctx.UninterruptibleSleepFinish(false)
if err != nil {
- fs.vfsfs.DecRef(ctx)
- return nil, nil, err
+ return nil, err
}
attachFile := p9file{attached}
qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
if err != nil {
attachFile.close(ctx)
- fs.vfsfs.DecRef(ctx)
- return nil, nil, err
+ return nil, err
}
// Construct the root dentry.
root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr)
if err != nil {
attachFile.close(ctx)
- fs.vfsfs.DecRef(ctx)
- return nil, nil, err
+ return nil, err
}
- // Set the root's reference count to 2. One reference is returned to the
- // caller, and the other is held by fs to prevent the root from being "cached"
- // and subsequently evicted.
- root.refs = 2
- fs.root = root
-
- return &fs.vfsfs, &root.vfsd, nil
+ return root, nil
}
func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) {
@@ -614,7 +676,11 @@ func (fs *filesystem) Release(ctx context.Context) {
if !fs.iopts.LeakConnection {
// Close the connection to the server. This implicitly clunks all fids.
- fs.client.Close()
+ if fs.opts.lisaEnabled {
+ fs.clientLisa.Close()
+ } else {
+ fs.client.Close()
+ }
}
fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
@@ -645,6 +711,23 @@ func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) {
}
}
+// inoKey is the key used to identify the inode backed by this dentry.
+//
+// +stateify savable
+type inoKey struct {
+ ino uint64
+ devMinor uint32
+ devMajor uint32
+}
+
+func inoKeyFromStat(stat *linux.Statx) inoKey {
+ return inoKey{
+ ino: stat.Ino,
+ devMinor: stat.DevMinor,
+ devMajor: stat.DevMajor,
+ }
+}
+
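
(Aside: inoKey keys the per-filesystem inode table on the (ino, devMinor, devMajor) triple reported by the server, and inoFromKey further down hands out a stable sentry inode number per key. A minimal sketch of that deduplication with simplified stand-in types; the real code holds inoMu and assigns numbers atomically via nextIno.)

package main

import "fmt"

type inoKey struct {
	ino      uint64
	devMinor uint32
	devMajor uint32
}

type fsInodes struct {
	lastIno  uint64
	inoByKey map[inoKey]uint64
}

// inoFor returns a stable sentry inode number for the given key, assigning a
// fresh one the first time the key is seen (mirrors filesystem.inoFromKey,
// minus the locking).
func (f *fsInodes) inoFor(key inoKey) uint64 {
	if ino, ok := f.inoByKey[key]; ok {
		return ino
	}
	f.lastIno++
	f.inoByKey[key] = f.lastIno
	return f.lastIno
}

func main() {
	fs := &fsInodes{inoByKey: make(map[inoKey]uint64)}
	a := inoKey{ino: 7, devMajor: 8, devMinor: 1}
	fmt.Println(fs.inoFor(a), fs.inoFor(a)) // same key -> same sentry ino
}
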
// dentry implements vfs.DentryImpl.
//
// +stateify savable
@@ -675,6 +758,9 @@ type dentry struct {
// qidPath is the p9.QID.Path for this file. qidPath is immutable.
qidPath uint64
+ // inoKey is used to identify this dentry's inode.
+ inoKey inoKey
+
// file is the unopened p9.File that backs this dentry. file is immutable.
//
// If file.isNil(), this dentry represents a synthetic file, i.e. a file
@@ -682,6 +768,14 @@ type dentry struct {
// only files that can be synthetic are sockets, pipes, and directories.
file p9file `state:"nosave"`
+ // controlFDLisa is used by lisafs to perform path based operations on this
+ // dentry.
+ //
+ // If !controlFDLisa.Ok(), this dentry represents a synthetic file, i.e. a
+ // file that does not exist on the remote filesystem. As of this writing, the
+ // only files that can be synthetic are sockets, pipes, and directories.
+ controlFDLisa lisafs.ClientFD `state:"nosave"`
+
// If deleted is non-zero, the file represented by this dentry has been
// deleted. deleted is accessed using atomic memory operations.
deleted uint32
@@ -792,12 +886,14 @@ type dentry struct {
// always either -1 or equal to readFD; if !writeFile.isNil() (the file has
// been opened for writing), it is additionally either -1 or equal to
// writeFD.
- handleMu sync.RWMutex `state:"nosave"`
- readFile p9file `state:"nosave"`
- writeFile p9file `state:"nosave"`
- readFD int32 `state:"nosave"`
- writeFD int32 `state:"nosave"`
- mmapFD int32 `state:"nosave"`
+ handleMu sync.RWMutex `state:"nosave"`
+ readFile p9file `state:"nosave"`
+ writeFile p9file `state:"nosave"`
+ readFDLisa lisafs.ClientFD `state:"nosave"`
+ writeFDLisa lisafs.ClientFD `state:"nosave"`
+ readFD int32 `state:"nosave"`
+ writeFD int32 `state:"nosave"`
+ mmapFD int32 `state:"nosave"`
dataMu sync.RWMutex `state:"nosave"`
@@ -865,11 +961,11 @@ func dentryAttrMask() p9.AttrMask {
func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) {
if !mask.Mode {
ctx.Warningf("can't create gofer.dentry without file type")
- return nil, syserror.EIO
+ return nil, linuxerr.EIO
}
if attr.Mode.FileType() == p9.ModeRegular && !mask.Size {
ctx.Warningf("can't create regular file gofer.dentry without file size")
- return nil, syserror.EIO
+ return nil, linuxerr.EIO
}
d := &dentry{
@@ -921,6 +1017,79 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
return d, nil
}
+func (fs *filesystem) newDentryLisa(ctx context.Context, ino *lisafs.Inode) (*dentry, error) {
+ if ino.Stat.Mask&linux.STATX_TYPE == 0 {
+ ctx.Warningf("can't create gofer.dentry without file type")
+ return nil, linuxerr.EIO
+ }
+ if ino.Stat.Mode&linux.FileTypeMask == linux.ModeRegular && ino.Stat.Mask&linux.STATX_SIZE == 0 {
+ ctx.Warningf("can't create regular file gofer.dentry without file size")
+ return nil, linuxerr.EIO
+ }
+
+ inoKey := inoKeyFromStat(&ino.Stat)
+ d := &dentry{
+ fs: fs,
+ inoKey: inoKey,
+ ino: fs.inoFromKey(inoKey),
+ mode: uint32(ino.Stat.Mode),
+ uid: uint32(fs.opts.dfltuid),
+ gid: uint32(fs.opts.dfltgid),
+ blockSize: hostarch.PageSize,
+ readFD: -1,
+ writeFD: -1,
+ mmapFD: -1,
+ controlFDLisa: fs.clientLisa.NewFD(ino.ControlFD),
+ }
+
+ d.pf.dentry = d
+ if ino.Stat.Mask&linux.STATX_UID != 0 {
+ d.uid = dentryUIDFromLisaUID(lisafs.UID(ino.Stat.UID))
+ }
+ if ino.Stat.Mask&linux.STATX_GID != 0 {
+ d.gid = dentryGIDFromLisaGID(lisafs.GID(ino.Stat.GID))
+ }
+ if ino.Stat.Mask&linux.STATX_SIZE != 0 {
+ d.size = ino.Stat.Size
+ }
+ if ino.Stat.Blksize != 0 {
+ d.blockSize = ino.Stat.Blksize
+ }
+ if ino.Stat.Mask&linux.STATX_ATIME != 0 {
+ d.atime = dentryTimestampFromLisa(ino.Stat.Atime)
+ }
+ if ino.Stat.Mask&linux.STATX_MTIME != 0 {
+ d.mtime = dentryTimestampFromLisa(ino.Stat.Mtime)
+ }
+ if ino.Stat.Mask&linux.STATX_CTIME != 0 {
+ d.ctime = dentryTimestampFromLisa(ino.Stat.Ctime)
+ }
+ if ino.Stat.Mask&linux.STATX_BTIME != 0 {
+ d.btime = dentryTimestampFromLisa(ino.Stat.Btime)
+ }
+ if ino.Stat.Mask&linux.STATX_NLINK != 0 {
+ d.nlink = ino.Stat.Nlink
+ }
+ d.vfsd.Init(d)
+ refsvfs2.Register(d)
+ fs.syncMu.Lock()
+ fs.syncableDentries[d] = struct{}{}
+ fs.syncMu.Unlock()
+ return d, nil
+}
+
+func (fs *filesystem) inoFromKey(key inoKey) uint64 {
+ fs.inoMu.Lock()
+ defer fs.inoMu.Unlock()
+
+ if ino, ok := fs.inoByKey[key]; ok {
+ return ino
+ }
+ ino := fs.nextIno()
+ fs.inoByKey[key] = ino
+ return ino
+}
+
func (fs *filesystem) inoFromQIDPath(qidPath uint64) uint64 {
fs.inoMu.Lock()
defer fs.inoMu.Unlock()
@@ -937,7 +1106,7 @@ func (fs *filesystem) nextIno() uint64 {
}
func (d *dentry) isSynthetic() bool {
- return d.file.isNil()
+ return !d.isControlFileOk()
}
func (d *dentry) cachedMetadataAuthoritative() bool {
@@ -987,6 +1156,50 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
}
}
+// updateFromLisaStatLocked is called to update d's metadata after an update
+// from the remote filesystem.
+// Precondition: d.metadataMu must be locked.
+// +checklocks:d.metadataMu
+func (d *dentry) updateFromLisaStatLocked(stat *linux.Statx) {
+ if stat.Mask&linux.STATX_TYPE != 0 {
+ if got, want := stat.Mode&linux.FileTypeMask, d.fileType(); uint32(got) != want {
+ panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got))
+ }
+ }
+ if stat.Mask&linux.STATX_MODE != 0 {
+ atomic.StoreUint32(&d.mode, uint32(stat.Mode))
+ }
+ if stat.Mask&linux.STATX_UID != 0 {
+ atomic.StoreUint32(&d.uid, dentryUIDFromLisaUID(lisafs.UID(stat.UID)))
+ }
+ if stat.Mask&linux.STATX_GID != 0 {
+ atomic.StoreUint32(&d.gid, dentryGIDFromLisaGID(lisafs.GID(stat.GID)))
+ }
+ if stat.Blksize != 0 {
+ atomic.StoreUint32(&d.blockSize, stat.Blksize)
+ }
+ // Don't override newer client-defined timestamps with old server-defined
+ // ones.
+ if stat.Mask&linux.STATX_ATIME != 0 && atomic.LoadUint32(&d.atimeDirty) == 0 {
+ atomic.StoreInt64(&d.atime, dentryTimestampFromLisa(stat.Atime))
+ }
+ if stat.Mask&linux.STATX_MTIME != 0 && atomic.LoadUint32(&d.mtimeDirty) == 0 {
+ atomic.StoreInt64(&d.mtime, dentryTimestampFromLisa(stat.Mtime))
+ }
+ if stat.Mask&linux.STATX_CTIME != 0 {
+ atomic.StoreInt64(&d.ctime, dentryTimestampFromLisa(stat.Ctime))
+ }
+ if stat.Mask&linux.STATX_BTIME != 0 {
+ atomic.StoreInt64(&d.btime, dentryTimestampFromLisa(stat.Btime))
+ }
+ if stat.Mask&linux.STATX_NLINK != 0 {
+ atomic.StoreUint32(&d.nlink, stat.Nlink)
+ }
+ if stat.Mask&linux.STATX_SIZE != 0 {
+ d.updateSizeLocked(stat.Size)
+ }
+}
+
// Preconditions: !d.isSynthetic().
// Preconditions: d.metadataMu is locked.
// +checklocks:d.metadataMu
@@ -996,7 +1209,10 @@ func (d *dentry) refreshSizeLocked(ctx context.Context) error {
if d.writeFD < 0 {
d.handleMu.RUnlock()
// Ask the gofer if we don't have a host FD.
- return d.updateFromGetattrLocked(ctx)
+ if d.fs.opts.lisaEnabled {
+ return d.updateFromStatLisaLocked(ctx, nil)
+ }
+ return d.updateFromGetattrLocked(ctx, p9file{})
}
var stat unix.Statx_t
@@ -1015,33 +1231,77 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error {
// updating stale attributes in d.updateFromP9AttrsLocked().
d.metadataMu.Lock()
defer d.metadataMu.Unlock()
- return d.updateFromGetattrLocked(ctx)
+ if d.fs.opts.lisaEnabled {
+ return d.updateFromStatLisaLocked(ctx, nil)
+ }
+ return d.updateFromGetattrLocked(ctx, p9file{})
}
// Preconditions:
// * !d.isSynthetic().
// * d.metadataMu is locked.
// +checklocks:d.metadataMu
-func (d *dentry) updateFromGetattrLocked(ctx context.Context) error {
- // Use d.readFile or d.writeFile, which represent 9P FIDs that have been
- // opened, in preference to d.file, which represents a 9P fid that has not.
- // This may be significantly more efficient in some implementations. Prefer
- // d.writeFile over d.readFile since some filesystem implementations may
- // update a writable handle's metadata after writes to that handle, without
- // making metadata updates immediately visible to read-only handles
- // representing the same file.
- d.handleMu.RLock()
- handleMuRLocked := true
- var file p9file
- switch {
- case !d.writeFile.isNil():
- file = d.writeFile
- case !d.readFile.isNil():
- file = d.readFile
- default:
- file = d.file
- d.handleMu.RUnlock()
- handleMuRLocked = false
+func (d *dentry) updateFromStatLisaLocked(ctx context.Context, fdLisa *lisafs.ClientFD) error {
+ handleMuRLocked := false
+ if fdLisa == nil {
+ // Use open FDs in preference to the control FD. This may be significantly
+ // more efficient in some implementations. Prefer a writable FD over a
+ // readable one since some filesystem implementations may update a writable
+ // FD's metadata after writes, without making metadata updates immediately
+ // visible to read-only FDs representing the same file.
+ d.handleMu.RLock()
+ switch {
+ case d.writeFDLisa.Ok():
+ fdLisa = &d.writeFDLisa
+ handleMuRLocked = true
+ case d.readFDLisa.Ok():
+ fdLisa = &d.readFDLisa
+ handleMuRLocked = true
+ default:
+ fdLisa = &d.controlFDLisa
+ d.handleMu.RUnlock()
+ }
+ }
+
+ var stat linux.Statx
+ err := fdLisa.StatTo(ctx, &stat)
+ if handleMuRLocked {
+ // handleMu must be released before updateFromLisaStatLocked().
+ d.handleMu.RUnlock() // +checklocksforce: complex case.
+ }
+ if err != nil {
+ return err
+ }
+ d.updateFromLisaStatLocked(&stat)
+ return nil
+}
+
+// Preconditions:
+// * !d.isSynthetic().
+// * d.metadataMu is locked.
+// +checklocks:d.metadataMu
+func (d *dentry) updateFromGetattrLocked(ctx context.Context, file p9file) error {
+ handleMuRLocked := false
+ if file.isNil() {
+ // Use d.readFile or d.writeFile, which represent 9P FIDs that have
+ // been opened, in preference to d.file, which represents a 9P fid that
+ // has not. This may be significantly more efficient in some
+ // implementations. Prefer d.writeFile over d.readFile since some
+ // filesystem implementations may update a writable handle's metadata
+ // after writes to that handle, without making metadata updates
+ // immediately visible to read-only handles representing the same file.
+ d.handleMu.RLock()
+ switch {
+ case !d.writeFile.isNil():
+ file = d.writeFile
+ handleMuRLocked = true
+ case !d.readFile.isNil():
+ file = d.readFile
+ handleMuRLocked = true
+ default:
+ file = d.file
+ d.handleMu.RUnlock()
+ }
}
_, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask())
@@ -1112,7 +1372,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
case linux.S_IFREG:
// ok
case linux.S_IFDIR:
- return syserror.EISDIR
+ return linuxerr.EISDIR
default:
return linuxerr.EINVAL
}
@@ -1159,6 +1419,13 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
}
}
+ // failureMask indicates which attributes could not be set on the remote
+ // filesystem. p9 returns an error if any of the attributes could not be set,
+ // but that leads to inconsistency: the server may have set some attributes
+ // successfully, yet a later failure prevents the successful ones from being
+ // updated in the dentry cache.
+ var failureMask uint32
+ var failureErr error
if !d.isSynthetic() {
if stat.Mask != 0 {
if stat.Mask&linux.STATX_SIZE != 0 {
@@ -1168,35 +1435,50 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
// the remote file has been truncated).
d.dataMu.Lock()
}
- if err := d.file.setAttr(ctx, p9.SetAttrMask{
- Permissions: stat.Mask&linux.STATX_MODE != 0,
- UID: stat.Mask&linux.STATX_UID != 0,
- GID: stat.Mask&linux.STATX_GID != 0,
- Size: stat.Mask&linux.STATX_SIZE != 0,
- ATime: stat.Mask&linux.STATX_ATIME != 0,
- MTime: stat.Mask&linux.STATX_MTIME != 0,
- ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW,
- MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW,
- }, p9.SetAttr{
- Permissions: p9.FileMode(stat.Mode),
- UID: p9.UID(stat.UID),
- GID: p9.GID(stat.GID),
- Size: stat.Size,
- ATimeSeconds: uint64(stat.Atime.Sec),
- ATimeNanoSeconds: uint64(stat.Atime.Nsec),
- MTimeSeconds: uint64(stat.Mtime.Sec),
- MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
- }); err != nil {
- if stat.Mask&linux.STATX_SIZE != 0 {
- d.dataMu.Unlock() // +checklocksforce: locked conditionally above
+ if d.fs.opts.lisaEnabled {
+ var err error
+ failureMask, failureErr, err = d.controlFDLisa.SetStat(ctx, stat)
+ if err != nil {
+ if stat.Mask&linux.STATX_SIZE != 0 {
+ d.dataMu.Unlock() // +checklocksforce: locked conditionally above
+ }
+ return err
+ }
+ } else {
+ if err := d.file.setAttr(ctx, p9.SetAttrMask{
+ Permissions: stat.Mask&linux.STATX_MODE != 0,
+ UID: stat.Mask&linux.STATX_UID != 0,
+ GID: stat.Mask&linux.STATX_GID != 0,
+ Size: stat.Mask&linux.STATX_SIZE != 0,
+ ATime: stat.Mask&linux.STATX_ATIME != 0,
+ MTime: stat.Mask&linux.STATX_MTIME != 0,
+ ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW,
+ MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW,
+ }, p9.SetAttr{
+ Permissions: p9.FileMode(stat.Mode),
+ UID: p9.UID(stat.UID),
+ GID: p9.GID(stat.GID),
+ Size: stat.Size,
+ ATimeSeconds: uint64(stat.Atime.Sec),
+ ATimeNanoSeconds: uint64(stat.Atime.Nsec),
+ MTimeSeconds: uint64(stat.Mtime.Sec),
+ MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
+ }); err != nil {
+ if stat.Mask&linux.STATX_SIZE != 0 {
+ d.dataMu.Unlock() // +checklocksforce: locked conditionally above
+ }
+ return err
}
- return err
}
if stat.Mask&linux.STATX_SIZE != 0 {
- // d.size should be kept up to date, and privatized
- // copy-on-write mappings of truncated pages need to be
- // invalidated, even if InteropModeShared is in effect.
- d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above
+ if failureMask&linux.STATX_SIZE == 0 {
+ // d.size should be kept up to date, and privatized
+ // copy-on-write mappings of truncated pages need to be
+ // invalidated, even if InteropModeShared is in effect.
+ d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above
+ } else {
+ d.dataMu.Unlock() // +checklocksforce: locked conditionally above
+ }
}
}
if d.fs.opts.interop == InteropModeShared {
@@ -1207,13 +1489,13 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
return nil
}
}
- if stat.Mask&linux.STATX_MODE != 0 {
+ if stat.Mask&linux.STATX_MODE != 0 && failureMask&linux.STATX_MODE == 0 {
atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode))
}
- if stat.Mask&linux.STATX_UID != 0 {
+ if stat.Mask&linux.STATX_UID != 0 && failureMask&linux.STATX_UID == 0 {
atomic.StoreUint32(&d.uid, stat.UID)
}
- if stat.Mask&linux.STATX_GID != 0 {
+ if stat.Mask&linux.STATX_GID != 0 && failureMask&linux.STATX_GID == 0 {
atomic.StoreUint32(&d.gid, stat.GID)
}
// Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because
@@ -1221,15 +1503,19 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
// stat.Mtime to client-local timestamps above, and if
// !d.cachedMetadataAuthoritative() then we returned after calling
// d.file.setAttr(). For the same reason, now must have been initialized.
- if stat.Mask&linux.STATX_ATIME != 0 {
+ if stat.Mask&linux.STATX_ATIME != 0 && failureMask&linux.STATX_ATIME == 0 {
atomic.StoreInt64(&d.atime, stat.Atime.ToNsec())
atomic.StoreUint32(&d.atimeDirty, 0)
}
- if stat.Mask&linux.STATX_MTIME != 0 {
+ if stat.Mask&linux.STATX_MTIME != 0 && failureMask&linux.STATX_MTIME == 0 {
atomic.StoreInt64(&d.mtime, stat.Mtime.ToNsec())
atomic.StoreUint32(&d.mtimeDirty, 0)
}
atomic.StoreInt64(&d.ctime, now)
+ if failureMask != 0 {
+ // Setting some attribute failed on the remote filesystem.
+ return failureErr
+ }
return nil
}
@@ -1309,7 +1595,10 @@ func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats
// (b/148380782). Allow all other extended attributes to be passed through
// to the remote filesystem. This is inconsistent with Linux's 9p client,
// but consistent with other filesystems (e.g. FUSE).
- if strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) || strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) {
+ //
+ // NOTE(b/202533394): Also disallow "trusted" namespace for now. This is
+ // consistent with the VFS1 gofer client.
+ if strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) || strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) || strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) {
return linuxerr.EOPNOTSUPP
}
mode := linux.FileMode(atomic.LoadUint32(&d.mode))
@@ -1345,6 +1634,20 @@ func dentryGIDFromP9GID(gid p9.GID) uint32 {
return uint32(gid)
}
+func dentryUIDFromLisaUID(uid lisafs.UID) uint32 {
+ if !uid.Ok() {
+ return uint32(auth.OverflowUID)
+ }
+ return uint32(uid)
+}
+
+func dentryGIDFromLisaGID(gid lisafs.GID) uint32 {
+ if !gid.Ok() {
+ return uint32(auth.OverflowGID)
+ }
+ return uint32(gid)
+}
+
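
(Aside: the two helpers above fall back to the overflow IDs when the server does not report an owner. A toy illustration, using 65534 as a stand-in for auth.OverflowUID:)

package main

import "fmt"

const overflowUID = 65534 // conventional "nobody"; stand-in for auth.OverflowUID

// uidOrOverflow mirrors dentryUIDFromLisaUID: an unset UID maps to the
// overflow UID so the sentry always has a presentable owner.
func uidOrOverflow(uid uint32, ok bool) uint32 {
	if !ok {
		return overflowUID
	}
	return uid
}

func main() {
	fmt.Println(uidOrOverflow(1000, true)) // 1000
	fmt.Println(uidOrOverflow(0, false))   // 65534
}
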
// IncRef implements vfs.DentryImpl.IncRef.
func (d *dentry) IncRef() {
// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
@@ -1653,15 +1956,24 @@ func (d *dentry) destroyLocked(ctx context.Context) {
d.dirty.RemoveAll()
}
d.dataMu.Unlock()
- // Clunk open fids and close open host FDs.
- if !d.readFile.isNil() {
- _ = d.readFile.close(ctx)
- }
- if !d.writeFile.isNil() && d.readFile != d.writeFile {
- _ = d.writeFile.close(ctx)
+ if d.fs.opts.lisaEnabled {
+ if d.readFDLisa.Ok() && d.readFDLisa.ID() != d.writeFDLisa.ID() {
+ d.readFDLisa.CloseBatched(ctx)
+ }
+ if d.writeFDLisa.Ok() {
+ d.writeFDLisa.CloseBatched(ctx)
+ }
+ } else {
+ // Clunk open fids and close open host FDs.
+ if !d.readFile.isNil() {
+ _ = d.readFile.close(ctx)
+ }
+ if !d.writeFile.isNil() && d.readFile != d.writeFile {
+ _ = d.writeFile.close(ctx)
+ }
+ d.readFile = p9file{}
+ d.writeFile = p9file{}
}
- d.readFile = p9file{}
- d.writeFile = p9file{}
if d.readFD >= 0 {
_ = unix.Close(int(d.readFD))
}
@@ -1673,7 +1985,7 @@ func (d *dentry) destroyLocked(ctx context.Context) {
d.mmapFD = -1
d.handleMu.Unlock()
- if !d.file.isNil() {
+ if d.isControlFileOk() {
// Note that it's possible that d.atimeDirty or d.mtimeDirty are true,
// i.e. client and server timestamps may differ (because e.g. a client
// write was serviced by the page cache, and only written back to the
@@ -1682,10 +1994,16 @@ func (d *dentry) destroyLocked(ctx context.Context) {
// instantiated for the same file would remain coherent. Unfortunately,
// this turns out to be too expensive in many cases, so for now we
// don't do this.
- if err := d.file.close(ctx); err != nil {
- log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err)
+
+ // Close the control FD.
+ if d.fs.opts.lisaEnabled {
+ d.controlFDLisa.CloseBatched(ctx)
+ } else {
+ if err := d.file.close(ctx); err != nil {
+ log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err)
+ }
+ d.file = p9file{}
}
- d.file = p9file{}
// Remove d from the set of syncable dentries.
d.fs.syncMu.Lock()
@@ -1711,10 +2029,29 @@ func (d *dentry) setDeleted() {
atomic.StoreUint32(&d.deleted, 1)
}
-func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
- if d.file.isNil() {
+func (d *dentry) isControlFileOk() bool {
+ if d.fs.opts.lisaEnabled {
+ return d.controlFDLisa.Ok()
+ }
+ return !d.file.isNil()
+}
+
+func (d *dentry) isReadFileOk() bool {
+ if d.fs.opts.lisaEnabled {
+ return d.readFDLisa.Ok()
+ }
+ return !d.readFile.isNil()
+}
+
+func (d *dentry) listXattr(ctx context.Context, size uint64) ([]string, error) {
+ if !d.isControlFileOk() {
return nil, nil
}
+
+ if d.fs.opts.lisaEnabled {
+ return d.controlFDLisa.ListXattr(ctx, size)
+ }
+
xattrMap, err := d.file.listXattr(ctx, size)
if err != nil {
return nil, err
@@ -1727,32 +2064,41 @@ func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size ui
}
func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
- if d.file.isNil() {
+ if !d.isControlFileOk() {
return "", linuxerr.ENODATA
}
if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
return "", err
}
+ if d.fs.opts.lisaEnabled {
+ return d.controlFDLisa.GetXattr(ctx, opts.Name, opts.Size)
+ }
return d.file.getXattr(ctx, opts.Name, opts.Size)
}
func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
- if d.file.isNil() {
+ if !d.isControlFileOk() {
return linuxerr.EPERM
}
if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
return err
}
+ if d.fs.opts.lisaEnabled {
+ return d.controlFDLisa.SetXattr(ctx, opts.Name, opts.Value, opts.Flags)
+ }
return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
}
func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error {
- if d.file.isNil() {
+ if !d.isControlFileOk() {
return linuxerr.EPERM
}
if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
return err
}
+ if d.fs.opts.lisaEnabled {
+ return d.controlFDLisa.RemoveXattr(ctx, name)
+ }
return d.file.removeXattr(ctx, name)
}
@@ -1764,19 +2110,30 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
// O_TRUNC).
if !trunc {
d.handleMu.RLock()
- if (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) {
+ var canReuseCurHandle bool
+ if d.fs.opts.lisaEnabled {
+ canReuseCurHandle = (!read || d.readFDLisa.Ok()) && (!write || d.writeFDLisa.Ok())
+ } else {
+ canReuseCurHandle = (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil())
+ }
+ d.handleMu.RUnlock()
+ if canReuseCurHandle {
// Current handles are sufficient.
- d.handleMu.RUnlock()
return nil
}
- d.handleMu.RUnlock()
}
var fdsToCloseArr [2]int32
fdsToClose := fdsToCloseArr[:0]
invalidateTranslations := false
d.handleMu.Lock()
- if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc {
+ var needNewHandle bool
+ if d.fs.opts.lisaEnabled {
+ needNewHandle = (read && !d.readFDLisa.Ok()) || (write && !d.writeFDLisa.Ok()) || trunc
+ } else {
+ needNewHandle = (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc
+ }
+ if needNewHandle {
// Get a new handle. If this file has been opened for both reading and
// writing, try to get a single handle that is usable for both:
//
@@ -1785,9 +2142,21 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
//
// - NOTE(b/141991141): Some filesystems may not ensure coherence
// between multiple handles for the same file.
- openReadable := !d.readFile.isNil() || read
- openWritable := !d.writeFile.isNil() || write
- h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc)
+ var (
+ openReadable bool
+ openWritable bool
+ h handle
+ err error
+ )
+ if d.fs.opts.lisaEnabled {
+ openReadable = d.readFDLisa.Ok() || read
+ openWritable = d.writeFDLisa.Ok() || write
+ h, err = openHandleLisa(ctx, d.controlFDLisa, openReadable, openWritable, trunc)
+ } else {
+ openReadable = !d.readFile.isNil() || read
+ openWritable = !d.writeFile.isNil() || write
+ h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc)
+ }
if linuxerr.Equals(linuxerr.EACCES, err) && (openReadable != read || openWritable != write) {
// It may not be possible to use a single handle for both
// reading and writing, since permissions on the file may have
@@ -1797,7 +2166,11 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d)
openReadable = read
openWritable = write
- h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc)
+ if d.fs.opts.lisaEnabled {
+ h, err = openHandleLisa(ctx, d.controlFDLisa, openReadable, openWritable, trunc)
+ } else {
+ h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc)
+ }
}
if err != nil {
d.handleMu.Unlock()
@@ -1859,9 +2232,16 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
// previously opened for reading (without an FD), then existing
// translations of the file may use the internal page cache;
// invalidate those mappings.
- if d.writeFile.isNil() {
- invalidateTranslations = !d.readFile.isNil()
- atomic.StoreInt32(&d.mmapFD, h.fd)
+ if d.fs.opts.lisaEnabled {
+ if !d.writeFDLisa.Ok() {
+ invalidateTranslations = d.readFDLisa.Ok()
+ atomic.StoreInt32(&d.mmapFD, h.fd)
+ }
+ } else {
+ if d.writeFile.isNil() {
+ invalidateTranslations = !d.readFile.isNil()
+ atomic.StoreInt32(&d.mmapFD, h.fd)
+ }
}
} else if openWritable && d.writeFD < 0 {
atomic.StoreInt32(&d.writeFD, h.fd)
@@ -1888,24 +2268,45 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
atomic.StoreInt32(&d.mmapFD, -1)
}
- // Switch to new fids.
- var oldReadFile p9file
- if openReadable {
- oldReadFile = d.readFile
- d.readFile = h.file
- }
- var oldWriteFile p9file
- if openWritable {
- oldWriteFile = d.writeFile
- d.writeFile = h.file
- }
- // NOTE(b/141991141): Clunk old fids before making new fids visible (by
- // unlocking d.handleMu).
- if !oldReadFile.isNil() {
- oldReadFile.close(ctx)
- }
- if !oldWriteFile.isNil() && oldReadFile != oldWriteFile {
- oldWriteFile.close(ctx)
+ // Switch to new fids/FDs.
+ if d.fs.opts.lisaEnabled {
+ oldReadFD := lisafs.InvalidFDID
+ if openReadable {
+ oldReadFD = d.readFDLisa.ID()
+ d.readFDLisa = h.fdLisa
+ }
+ oldWriteFD := lisafs.InvalidFDID
+ if openWritable {
+ oldWriteFD = d.writeFDLisa.ID()
+ d.writeFDLisa = h.fdLisa
+ }
+ // NOTE(b/141991141): Close old FDs before making new fids visible (by
+ // unlocking d.handleMu).
+ if oldReadFD.Ok() {
+ d.fs.clientLisa.CloseFDBatched(ctx, oldReadFD)
+ }
+ if oldWriteFD.Ok() && oldReadFD != oldWriteFD {
+ d.fs.clientLisa.CloseFDBatched(ctx, oldWriteFD)
+ }
+ } else {
+ var oldReadFile p9file
+ if openReadable {
+ oldReadFile = d.readFile
+ d.readFile = h.file
+ }
+ var oldWriteFile p9file
+ if openWritable {
+ oldWriteFile = d.writeFile
+ d.writeFile = h.file
+ }
+ // NOTE(b/141991141): Clunk old fids before making new fids visible (by
+ // unlocking d.handleMu).
+ if !oldReadFile.isNil() {
+ oldReadFile.close(ctx)
+ }
+ if !oldWriteFile.isNil() && oldReadFile != oldWriteFile {
+ oldWriteFile.close(ctx)
+ }
}
}
d.handleMu.Unlock()
@@ -1929,27 +2330,29 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
// Preconditions: d.handleMu must be locked.
func (d *dentry) readHandleLocked() handle {
return handle{
- file: d.readFile,
- fd: d.readFD,
+ fdLisa: d.readFDLisa,
+ file: d.readFile,
+ fd: d.readFD,
}
}
// Preconditions: d.handleMu must be locked.
func (d *dentry) writeHandleLocked() handle {
return handle{
- file: d.writeFile,
- fd: d.writeFD,
+ fdLisa: d.writeFDLisa,
+ file: d.writeFile,
+ fd: d.writeFD,
}
}
func (d *dentry) syncRemoteFile(ctx context.Context) error {
d.handleMu.RLock()
defer d.handleMu.RUnlock()
- return d.syncRemoteFileLocked(ctx)
+ return d.syncRemoteFileLocked(ctx, nil /* accFsyncFDIDsLisa */)
}
// Preconditions: d.handleMu must be locked.
-func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
+func (d *dentry) syncRemoteFileLocked(ctx context.Context, accFsyncFDIDsLisa *[]lisafs.FDID) error {
// If we have a host FD, fsyncing it is likely to be faster than an fsync
// RPC. Prefer syncing write handles over read handles, since some remote
// filesystem implementations may not sync changes made through write
@@ -1960,7 +2363,13 @@ func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
ctx.UninterruptibleSleepFinish(false)
return err
}
- if !d.writeFile.isNil() {
+ if d.fs.opts.lisaEnabled && d.writeFDLisa.Ok() {
+ if accFsyncFDIDsLisa != nil {
+ *accFsyncFDIDsLisa = append(*accFsyncFDIDsLisa, d.writeFDLisa.ID())
+ return nil
+ }
+ return d.writeFDLisa.Sync(ctx)
+ } else if !d.fs.opts.lisaEnabled && !d.writeFile.isNil() {
return d.writeFile.fsync(ctx)
}
if d.readFD >= 0 {
@@ -1969,13 +2378,19 @@ func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
ctx.UninterruptibleSleepFinish(false)
return err
}
- if !d.readFile.isNil() {
+ if d.fs.opts.lisaEnabled && d.readFDLisa.Ok() {
+ if accFsyncFDIDsLisa != nil {
+ *accFsyncFDIDsLisa = append(*accFsyncFDIDsLisa, d.readFDLisa.ID())
+ return nil
+ }
+ return d.readFDLisa.Sync(ctx)
+ } else if !d.fs.opts.lisaEnabled && !d.readFile.isNil() {
return d.readFile.fsync(ctx)
}
return nil
}
-func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error {
+func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool, accFsyncFDIDsLisa *[]lisafs.FDID) error {
d.handleMu.RLock()
defer d.handleMu.RUnlock()
h := d.writeHandleLocked()
@@ -1988,7 +2403,7 @@ func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) err
return err
}
}
- if err := d.syncRemoteFileLocked(ctx); err != nil {
+ if err := d.syncRemoteFileLocked(ctx, accFsyncFDIDsLisa); err != nil {
if !forFilesystemSync {
return err
}
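
(Aside: the new accFsyncFDIDsLisa parameter lets a filesystem-wide sync collect FDIDs and flush them in one batched RPC instead of issuing one Fsync per dentry. A rough sketch of that accumulate-then-flush pattern; syncFDs below is a hypothetical stand-in for whatever batched-sync call the lisafs client exposes, not a real API.)

package main

import "fmt"

type FDID uint32 // stand-in for lisafs.FDID

// syncOne mirrors syncRemoteFileLocked's contract: if acc is non-nil, record
// the FD for a later batched sync instead of syncing immediately.
func syncOne(fd FDID, acc *[]FDID) error {
	if acc != nil {
		*acc = append(*acc, fd)
		return nil
	}
	return syncFDs([]FDID{fd}) // immediate, per-file sync
}

// syncFDs is a hypothetical stand-in for a batched fsync RPC.
func syncFDs(fds []FDID) error {
	fmt.Println("fsync RPC for FDs:", fds)
	return nil
}

func main() {
	// Filesystem-wide sync: accumulate, then flush once.
	var acc []FDID
	for _, fd := range []FDID{3, 5, 9} {
		_ = syncOne(fd, &acc)
	}
	_ = syncFDs(acc) // one RPC instead of three
}
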
@@ -2045,10 +2460,33 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
d := fd.dentry()
const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME)
if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
- // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if
- // available?
- if err := d.updateFromGetattr(ctx); err != nil {
- return linux.Statx{}, err
+ if d.fs.opts.lisaEnabled {
+ // Use specialFileFD.handle.fdLisa for the Stat if available, for the
+ // same reason that we try to use open FDs in updateFromStatLisaLocked().
+ var fdLisa *lisafs.ClientFD
+ if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok {
+ fdLisa = &sffd.handle.fdLisa
+ }
+ d.metadataMu.Lock()
+ err := d.updateFromStatLisaLocked(ctx, fdLisa)
+ d.metadataMu.Unlock()
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ } else {
+ // Use specialFileFD.handle.file for the getattr if available, for the
+ // same reason that we try to use open file handles in
+ // dentry.updateFromGetattrLocked().
+ var file p9file
+ if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok {
+ file = sffd.handle.file
+ }
+ d.metadataMu.Lock()
+ err := d.updateFromGetattrLocked(ctx, file)
+ d.metadataMu.Unlock()
+ if err != nil {
+ return linux.Statx{}, err
+ }
}
}
var stat linux.Statx
@@ -2069,7 +2507,7 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
- return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size)
+ return fd.dentry().listXattr(ctx, size)
}
// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
index 806392d50..d5cc73f33 100644
--- a/pkg/sentry/fsimpl/gofer/gofer_test.go
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -33,6 +33,7 @@ func TestDestroyIdempotent(t *testing.T) {
},
syncableDentries: make(map[*dentry]struct{}),
inoByQIDPath: make(map[uint64]uint64),
+ inoByKey: make(map[inoKey]uint64),
}
attr := &p9.Attr{
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
index 5c57f6fea..394aecd62 100644
--- a/pkg/sentry/fsimpl/gofer/handle.go
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -17,18 +17,23 @@ package gofer
import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/lisafs"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/hostfd"
+ "gvisor.dev/gvisor/pkg/sync"
)
// handle represents a remote "open file descriptor", consisting of an opened
// fid (p9.File) and optionally a host file descriptor.
//
+// If lisafs is being used, fdLisa points to an open file on the server.
+//
// These are explicitly not savable.
type handle struct {
- file p9file
- fd int32 // -1 if unavailable
+ fdLisa lisafs.ClientFD
+ file p9file
+ fd int32 // -1 if unavailable
}
// Preconditions: read || write.
@@ -64,13 +69,47 @@ func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (hand
}, nil
}
+// Preconditions: read || write.
+func openHandleLisa(ctx context.Context, fdLisa lisafs.ClientFD, read, write, trunc bool) (handle, error) {
+ var flags uint32
+ switch {
+ case read && write:
+ flags = unix.O_RDWR
+ case read:
+ flags = unix.O_RDONLY
+ case write:
+ flags = unix.O_WRONLY
+ default:
+ panic("tried to open unreadable and unwritable handle")
+ }
+ if trunc {
+ flags |= unix.O_TRUNC
+ }
+ openFD, hostFD, err := fdLisa.OpenAt(ctx, flags)
+ if err != nil {
+ return handle{fd: -1}, err
+ }
+ h := handle{
+ fdLisa: fdLisa.Client().NewFD(openFD),
+ fd: int32(hostFD),
+ }
+ return h, nil
+}
+
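
(Aside: openHandleLisa maps the (read, write, trunc) request onto host-style open flags before asking the server for an open FD. The flag mapping alone, extracted into a runnable sketch; the real function panics on an unreadable and unwritable request instead of returning an error.)

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// openFlags mirrors the switch in openHandleLisa: the access mode comes from
// the read/write pair, and O_TRUNC is ORed in on request.
func openFlags(read, write, trunc bool) (uint32, error) {
	var flags uint32
	switch {
	case read && write:
		flags = unix.O_RDWR
	case read:
		flags = unix.O_RDONLY
	case write:
		flags = unix.O_WRONLY
	default:
		return 0, fmt.Errorf("neither readable nor writable")
	}
	if trunc {
		flags |= unix.O_TRUNC
	}
	return flags, nil
}

func main() {
	f, _ := openFlags(true, true, true)
	fmt.Printf("flags: %#o\n", f) // O_RDWR|O_TRUNC
}
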
func (h *handle) isOpen() bool {
+ if h.fdLisa.Client() != nil {
+ return h.fdLisa.Ok()
+ }
return !h.file.isNil()
}
func (h *handle) close(ctx context.Context) {
- h.file.close(ctx)
- h.file = p9file{}
+ if h.fdLisa.Client() != nil {
+ h.fdLisa.CloseBatched(ctx)
+ } else {
+ h.file.close(ctx)
+ h.file = p9file{}
+ }
if h.fd >= 0 {
unix.Close(int(h.fd))
h.fd = -1
@@ -88,19 +127,27 @@ func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offs
return n, err
}
if dsts.NumBlocks() == 1 && !dsts.Head().NeedSafecopy() {
- n, err := h.file.readAt(ctx, dsts.Head().ToSlice(), offset)
- return uint64(n), err
+ if h.fdLisa.Client() != nil {
+ return h.fdLisa.Read(ctx, dsts.Head().ToSlice(), offset)
+ }
+ return h.file.readAt(ctx, dsts.Head().ToSlice(), offset)
}
// Buffer the read since p9.File.ReadAt() takes []byte.
buf := make([]byte, dsts.NumBytes())
- n, err := h.file.readAt(ctx, buf, offset)
+ var n uint64
+ var err error
+ if h.fdLisa.Client() != nil {
+ n, err = h.fdLisa.Read(ctx, buf, offset)
+ } else {
+ n, err = h.file.readAt(ctx, buf, offset)
+ }
if n == 0 {
return 0, err
}
if cp, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:n]))); cperr != nil {
return cp, cperr
}
- return uint64(n), err
+ return n, err
}
func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
@@ -114,8 +161,10 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o
return n, err
}
if srcs.NumBlocks() == 1 && !srcs.Head().NeedSafecopy() {
- n, err := h.file.writeAt(ctx, srcs.Head().ToSlice(), offset)
- return uint64(n), err
+ if h.fdLisa.Client() != nil {
+ return h.fdLisa.Write(ctx, srcs.Head().ToSlice(), offset)
+ }
+ return h.file.writeAt(ctx, srcs.Head().ToSlice(), offset)
}
// Buffer the write since p9.File.WriteAt() takes []byte.
buf := make([]byte, srcs.NumBytes())
@@ -123,10 +172,56 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o
if cp == 0 {
return 0, cperr
}
- n, err := h.file.writeAt(ctx, buf[:cp], offset)
+ var n uint64
+ var err error
+ if h.fdLisa.Client() != nil {
+ n, err = h.fdLisa.Write(ctx, buf[:cp], offset)
+ } else {
+ n, err = h.file.writeAt(ctx, buf[:cp], offset)
+ }
// err takes precedence over cperr.
if err != nil {
- return uint64(n), err
+ return n, err
}
- return uint64(n), cperr
+ return n, cperr
+}
+
+type handleReadWriter struct {
+ ctx context.Context
+ h *handle
+ off uint64
+}
+
+var handleReadWriterPool = sync.Pool{
+ New: func() interface{} {
+ return &handleReadWriter{}
+ },
+}
+
+func getHandleReadWriter(ctx context.Context, h *handle, offset int64) *handleReadWriter {
+ rw := handleReadWriterPool.Get().(*handleReadWriter)
+ rw.ctx = ctx
+ rw.h = h
+ rw.off = uint64(offset)
+ return rw
+}
+
+func putHandleReadWriter(rw *handleReadWriter) {
+ rw.ctx = nil
+ rw.h = nil
+ handleReadWriterPool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *handleReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+ n, err := rw.h.readToBlocksAt(rw.ctx, dsts, rw.off)
+ rw.off += n
+ return n, err
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *handleReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+ n, err := rw.h.writeFromBlocksAt(rw.ctx, srcs, rw.off)
+ rw.off += n
+ return n, err
}
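
(Aside: handleReadWriter adapts a handle to safemem.Reader/Writer and is pooled to avoid a per-call allocation. A trimmed-down sketch of the get/initialize/use/clear/put lifecycle with a stand-in struct; the real type carries a context and a *handle rather than a byte slice.)

package main

import (
	"fmt"
	"sync"
)

// readWriter is a simplified stand-in for the pooled handleReadWriter above.
type readWriter struct {
	off uint64
	buf []byte
}

var rwPool = sync.Pool{New: func() interface{} { return &readWriter{} }}

// getRW fetches a pooled object and initializes it for one operation.
func getRW(buf []byte, off uint64) *readWriter {
	rw := rwPool.Get().(*readWriter)
	rw.buf, rw.off = buf, off
	return rw
}

// putRW clears references before returning the object to the pool so pooled
// objects do not pin memory, mirroring putHandleReadWriter.
func putRW(rw *readWriter) {
	rw.buf = nil
	rwPool.Put(rw)
}

func main() {
	rw := getRW([]byte("payload"), 0)
	fmt.Println(len(rw.buf), rw.off)
	putRW(rw)
}
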
diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
index 398288ee3..505916a57 100644
--- a/pkg/sentry/fsimpl/gofer/host_named_pipe.go
+++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
@@ -22,7 +22,6 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
- "gvisor.dev/gvisor/pkg/syserror"
)
// Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create
@@ -109,6 +108,6 @@ func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error {
return nil
case <-cancel:
ctx.SleepFinish(false)
- return syserror.ErrInterrupted
+ return linuxerr.ErrInterrupted
}
}
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
index b0a429d42..0d97b60fd 100644
--- a/pkg/sentry/fsimpl/gofer/p9file.go
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -16,9 +16,9 @@ package gofer
import (
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/p9"
- "gvisor.dev/gvisor/pkg/syserror"
)
// p9file is a wrapper around p9.File that provides methods that are
@@ -59,7 +59,7 @@ func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file
if newfile != nil {
p9file{newfile}.close(ctx)
}
- return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO
+ return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, linuxerr.EIO
}
return qids[0], p9file{newfile}, attrMask, attr, nil
}
@@ -141,18 +141,18 @@ func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, u
return fdobj, qid, iounit, err
}
-func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (int, error) {
+func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (uint64, error) {
ctx.UninterruptibleSleepStart(false)
n, err := f.file.ReadAt(p, offset)
ctx.UninterruptibleSleepFinish(false)
- return n, err
+ return uint64(n), err
}
-func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) {
+func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (uint64, error) {
ctx.UninterruptibleSleepStart(false)
n, err := f.file.WriteAt(p, offset)
ctx.UninterruptibleSleepFinish(false)
- return n, err
+ return uint64(n), err
}
func (f p9file) fsync(ctx context.Context) error {
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 947dbe05f..874f9873d 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -98,6 +98,12 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
}
d.handleMu.RLock()
defer d.handleMu.RUnlock()
+ if d.fs.opts.lisaEnabled {
+ if !d.writeFDLisa.Ok() {
+ return nil
+ }
+ return d.writeFDLisa.Flush(ctx)
+ }
if d.writeFile.isNil() {
return nil
}
@@ -110,6 +116,9 @@ func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint
return d.doAllocate(ctx, offset, length, func() error {
d.handleMu.RLock()
defer d.handleMu.RUnlock()
+ if d.fs.opts.lisaEnabled {
+ return d.writeFDLisa.Allocate(ctx, mode, offset, length)
+ }
return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
})
}
@@ -282,8 +291,19 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
// changes to the host.
if newMode := vfs.ClearSUIDAndSGID(oldMode); newMode != oldMode {
atomic.StoreUint32(&d.mode, newMode)
- if err := d.file.setAttr(ctx, p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(newMode)}); err != nil {
- return 0, offset, err
+ if d.fs.opts.lisaEnabled {
+ stat := linux.Statx{Mask: linux.STATX_MODE, Mode: uint16(newMode)}
+ failureMask, failureErr, err := d.controlFDLisa.SetStat(ctx, &stat)
+ if err != nil {
+ return 0, offset, err
+ }
+ if failureMask != 0 {
+ return 0, offset, failureErr
+ }
+ } else {
+ if err := d.file.setAttr(ctx, p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(newMode)}); err != nil {
+ return 0, offset, err
+ }
}
}
}
@@ -677,7 +697,7 @@ func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int6
// Sync implements vfs.FileDescriptionImpl.Sync.
func (fd *regularFileFD) Sync(ctx context.Context) error {
- return fd.dentry().syncCachedFile(ctx, false /* lowSyncExpectations */)
+ return fd.dentry().syncCachedFile(ctx, false /* forFilesystemSync */, nil /* accFsyncFDIDsLisa */)
}
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
diff --git a/pkg/sentry/fsimpl/gofer/revalidate.go b/pkg/sentry/fsimpl/gofer/revalidate.go
index 226790a11..5d4009832 100644
--- a/pkg/sentry/fsimpl/gofer/revalidate.go
+++ b/pkg/sentry/fsimpl/gofer/revalidate.go
@@ -15,7 +15,9 @@
package gofer
import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -234,28 +236,54 @@ func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualF
}
// Lock metadata on all dentries *before* getting attributes for them.
state.lockAllMetadata()
- stats, err := state.start.file.multiGetAttr(ctx, state.names)
- if err != nil {
- return err
+
+ var (
+ stats []p9.FullStat
+ statsLisa []linux.Statx
+ numStats int
+ )
+ if fs.opts.lisaEnabled {
+ var err error
+ statsLisa, err = state.start.controlFDLisa.WalkStat(ctx, state.names)
+ if err != nil {
+ return err
+ }
+ numStats = len(statsLisa)
+ } else {
+ var err error
+ stats, err = state.start.file.multiGetAttr(ctx, state.names)
+ if err != nil {
+ return err
+ }
+ numStats = len(stats)
}
i := -1
for d := state.popFront(); d != nil; d = state.popFront() {
i++
- found := i < len(stats)
+ found := i < numStats
if i == 0 && len(state.names[0]) == 0 {
if found && !d.isSynthetic() {
// First dentry is where the search is starting, just update attributes
// since it cannot be replaced.
- d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: acquired by lockAllMetadata.
+ if fs.opts.lisaEnabled {
+ d.updateFromLisaStatLocked(&statsLisa[i]) // +checklocksforce: acquired by lockAllMetadata.
+ } else {
+ d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: acquired by lockAllMetadata.
+ }
}
d.metadataMu.Unlock() // +checklocksforce: see above.
continue
}
- // Note that synthetic dentries will always fails the comparison check
- // below.
- if !found || d.qidPath != stats[i].QID.Path {
+ // Note that synthetic dentries will always fail this comparison check.
+ var shouldInvalidate bool
+ if fs.opts.lisaEnabled {
+ shouldInvalidate = !found || d.inoKey != inoKeyFromStat(&statsLisa[i])
+ } else {
+ shouldInvalidate = !found || d.qidPath != stats[i].QID.Path
+ }
+ if shouldInvalidate {
d.metadataMu.Unlock() // +checklocksforce: see above.
if !found && d.isSynthetic() {
// We have a synthetic file, and no remote file has arisen to replace
@@ -298,7 +326,11 @@ func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualF
}
// The file at this path hasn't changed. Just update cached metadata.
- d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: see above.
+ if fs.opts.lisaEnabled {
+ d.updateFromLisaStatLocked(&statsLisa[i]) // +checklocksforce: see above.
+ } else {
+ d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: see above.
+ }
d.metadataMu.Unlock()
}
diff --git a/pkg/sentry/fsimpl/gofer/save_restore.go b/pkg/sentry/fsimpl/gofer/save_restore.go
index e67422a2f..475322527 100644
--- a/pkg/sentry/fsimpl/gofer/save_restore.go
+++ b/pkg/sentry/fsimpl/gofer/save_restore.go
@@ -24,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/fdnotifier"
"gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/lisafs"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/refsvfs2"
"gvisor.dev/gvisor/pkg/safemem"
@@ -112,10 +113,19 @@ func (d *dentry) prepareSaveRecursive(ctx context.Context) error {
return err
}
}
- if !d.readFile.isNil() || !d.writeFile.isNil() {
- d.fs.savedDentryRW[d] = savedDentryRW{
- read: !d.readFile.isNil(),
- write: !d.writeFile.isNil(),
+ if d.fs.opts.lisaEnabled {
+ if d.readFDLisa.Ok() || d.writeFDLisa.Ok() {
+ d.fs.savedDentryRW[d] = savedDentryRW{
+ read: d.readFDLisa.Ok(),
+ write: d.writeFDLisa.Ok(),
+ }
+ }
+ } else {
+ if !d.readFile.isNil() || !d.writeFile.isNil() {
+ d.fs.savedDentryRW[d] = savedDentryRW{
+ read: !d.readFile.isNil(),
+ write: !d.writeFile.isNil(),
+ }
}
}
d.dirMu.Lock()
@@ -158,6 +168,10 @@ func (d *dentryPlatformFile) afterLoad() {
// afterLoad is invoked by stateify.
func (fd *specialFileFD) afterLoad() {
fd.handle.fd = -1
+ if fd.hostFileMapper.IsInited() {
+ // Ensure that we don't call fd.hostFileMapper.Init() again.
+ fd.hostFileMapperInitOnce.Do(func() {})
+ }
}
// CompleteRestore implements
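
The specialFileFD.afterLoad hunk above uses a small sync.Once idiom: calling Do with a no-op function marks the Once as fired, so the later fd.hostFileMapperInitOnce.Do(fd.hostFileMapper.Init) in ConfigureMMap (see special_file.go below) will not re-run Init on a mapper that stateify already restored. A self-contained sketch of the idiom:

	package main

	import "sync"

	func main() {
		var once sync.Once
		// Pre-fire the Once with a no-op; it is now permanently "done".
		once.Do(func() {})
		// Subsequent Do calls never execute their argument.
		once.Do(func() { panic("unreachable: the Once already fired") })
	}
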
@@ -173,25 +187,37 @@ func (fs *filesystem) CompleteRestore(ctx context.Context, opts vfs.CompleteRest
return fmt.Errorf("no server FD available for filesystem with unique ID %q", fs.iopts.UniqueID)
}
fs.opts.fd = fd
- if err := fs.dial(ctx); err != nil {
- return err
- }
fs.inoByQIDPath = make(map[uint64]uint64)
+ fs.inoByKey = make(map[inoKey]uint64)
- // Restore the filesystem root.
- ctx.UninterruptibleSleepStart(false)
- attached, err := fs.client.Attach(fs.opts.aname)
- ctx.UninterruptibleSleepFinish(false)
- if err != nil {
- return err
- }
- attachFile := p9file{attached}
- qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
- if err != nil {
- return err
- }
- if err := fs.root.restoreFile(ctx, attachFile, qid, attrMask, &attr, &opts); err != nil {
- return err
+ if fs.opts.lisaEnabled {
+ rootInode, err := fs.initClientLisa(ctx)
+ if err != nil {
+ return err
+ }
+ if err := fs.root.restoreFileLisa(ctx, rootInode, &opts); err != nil {
+ return err
+ }
+ } else {
+ if err := fs.dial(ctx); err != nil {
+ return err
+ }
+
+ // Restore the filesystem root.
+ ctx.UninterruptibleSleepStart(false)
+ attached, err := fs.client.Attach(fs.opts.aname)
+ ctx.UninterruptibleSleepFinish(false)
+ if err != nil {
+ return err
+ }
+ attachFile := p9file{attached}
+ qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
+ if err != nil {
+ return err
+ }
+ if err := fs.root.restoreFile(ctx, attachFile, qid, attrMask, &attr, &opts); err != nil {
+ return err
+ }
}
// Restore remaining dentries.
@@ -279,6 +305,55 @@ func (d *dentry) restoreFile(ctx context.Context, file p9file, qid p9.QID, attrM
return nil
}
+func (d *dentry) restoreFileLisa(ctx context.Context, inode *lisafs.Inode, opts *vfs.CompleteRestoreOptions) error {
+ d.controlFDLisa = d.fs.clientLisa.NewFD(inode.ControlFD)
+
+ // Gofers do not preserve inoKey across checkpoint/restore, so:
+ //
+ // - We must assume that the remote filesystem did not change in a way that
+ // would invalidate dentries, since we can't revalidate dentries by
+ // checking inoKey.
+ //
+ // - We need to associate the new inoKey with the existing d.ino.
+ d.inoKey = inoKeyFromStat(&inode.Stat)
+ d.fs.inoMu.Lock()
+ d.fs.inoByKey[d.inoKey] = d.ino
+ d.fs.inoMu.Unlock()
+
+ // Check metadata stability before updating metadata.
+ d.metadataMu.Lock()
+ defer d.metadataMu.Unlock()
+ if d.isRegularFile() {
+ if opts.ValidateFileSizes {
+			if inode.Stat.Mask&linux.STATX_SIZE == 0 {
+ return fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: file size not available", genericDebugPathname(d))
+ }
+ if d.size != inode.Stat.Size {
+ return fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(d), d.size, inode.Stat.Size)
+ }
+ }
+ if opts.ValidateFileModificationTimestamps {
+			if inode.Stat.Mask&linux.STATX_MTIME == 0 {
+ return fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime not available", genericDebugPathname(d))
+ }
+ if want := dentryTimestampFromLisa(inode.Stat.Mtime); d.mtime != want {
+ return fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(d), linux.NsecToStatxTimestamp(d.mtime), linux.NsecToStatxTimestamp(want))
+ }
+ }
+ }
+ if !d.cachedMetadataAuthoritative() {
+ d.updateFromLisaStatLocked(&inode.Stat)
+ }
+
+ if rw, ok := d.fs.savedDentryRW[d]; ok {
+ if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
+
// Preconditions: d is not synthetic.
func (d *dentry) restoreDescendantsRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error {
for _, child := range d.children {
@@ -301,19 +376,35 @@ func (d *dentry) restoreDescendantsRecursive(ctx context.Context, opts *vfs.Comp
// only be detected by checking filesystem.syncableDentries). d.parent has been
// restored.
func (d *dentry) restoreRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error {
- qid, file, attrMask, attr, err := d.parent.file.walkGetAttrOne(ctx, d.name)
- if err != nil {
- return err
- }
- if err := d.restoreFile(ctx, file, qid, attrMask, &attr, opts); err != nil {
- return err
+ if d.fs.opts.lisaEnabled {
+ inode, err := d.parent.controlFDLisa.Walk(ctx, d.name)
+ if err != nil {
+ return err
+ }
+ if err := d.restoreFileLisa(ctx, inode, opts); err != nil {
+ return err
+ }
+ } else {
+ qid, file, attrMask, attr, err := d.parent.file.walkGetAttrOne(ctx, d.name)
+ if err != nil {
+ return err
+ }
+ if err := d.restoreFile(ctx, file, qid, attrMask, &attr, opts); err != nil {
+ return err
+ }
}
return d.restoreDescendantsRecursive(ctx, opts)
}
func (fd *specialFileFD) completeRestore(ctx context.Context) error {
d := fd.dentry()
- h, err := openHandle(ctx, d.file, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */)
+ var h handle
+ var err error
+ if d.fs.opts.lisaEnabled {
+ h, err = openHandleLisa(ctx, d.controlFDLisa, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */)
+ } else {
+ h, err = openHandle(ctx, d.file, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */)
+ }
if err != nil {
return err
}
diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go
index fe15f8583..86ab70453 100644
--- a/pkg/sentry/fsimpl/gofer/socket.go
+++ b/pkg/sentry/fsimpl/gofer/socket.go
@@ -59,11 +59,6 @@ func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) {
// BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect.
func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error {
- cf, ok := sockTypeToP9(ce.Type())
- if !ok {
- return syserr.ErrConnectionRefused
- }
-
// No lock ordering required as only the ConnectingEndpoint has a mutex.
ce.Lock()
@@ -77,7 +72,7 @@ func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.Connec
return syserr.ErrInvalidEndpointState
}
- c, err := e.newConnectedEndpoint(ctx, cf, ce.WaiterQueue())
+ c, err := e.newConnectedEndpoint(ctx, ce.Type(), ce.WaiterQueue())
if err != nil {
ce.Unlock()
return err
@@ -95,7 +90,7 @@ func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.Connec
// UnidirectionalConnect implements
// transport.BoundEndpoint.UnidirectionalConnect.
func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.ConnectedEndpoint, *syserr.Error) {
- c, err := e.newConnectedEndpoint(ctx, p9.DgramSocket, &waiter.Queue{})
+ c, err := e.newConnectedEndpoint(ctx, linux.SOCK_DGRAM, &waiter.Queue{})
if err != nil {
return nil, err
}
@@ -111,25 +106,39 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect
return c, nil
}
-func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) {
- hostFile, err := e.dentry.file.connect(ctx, flags)
- if err != nil {
+func (e *endpoint) newConnectedEndpoint(ctx context.Context, sockType linux.SockType, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) {
+ if e.dentry.fs.opts.lisaEnabled {
+ hostSockFD, err := e.dentry.controlFDLisa.Connect(ctx, sockType)
+ if err != nil {
+ return nil, syserr.ErrConnectionRefused
+ }
+
+ c, serr := host.NewSCMEndpoint(ctx, hostSockFD, queue, e.path)
+ if serr != nil {
+ unix.Close(hostSockFD)
+ log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v sockType %d: %v", e.dentry.file, sockType, serr)
+ return nil, serr
+ }
+ return c, nil
+ }
+
+ flags, ok := sockTypeToP9(sockType)
+ if !ok {
return nil, syserr.ErrConnectionRefused
}
- // Dup the fd so that the new endpoint can manage its lifetime.
- hostFD, err := unix.Dup(hostFile.FD())
+ hostFile, err := e.dentry.file.connect(ctx, flags)
if err != nil {
- log.Warningf("Could not dup host socket fd %d: %v", hostFile.FD(), err)
- return nil, syserr.FromError(err)
+ return nil, syserr.ErrConnectionRefused
}
- // After duplicating, we no longer need hostFile.
- hostFile.Close()
- c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path)
+ c, serr := host.NewSCMEndpoint(ctx, hostFile.FD(), queue, e.path)
if serr != nil {
- log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.dentry.file, flags, serr)
+ hostFile.Close()
+ log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v sockType %d: %v", e.dentry.file, sockType, serr)
return nil, serr
}
+ // Ownership has been transferred to c.
+ hostFile.Release()
return c, nil
}
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index 4b59c1c3c..c568bbfd2 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -22,13 +22,16 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/fdnotifier"
+ "gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/lisafs"
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/fsmetric"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -76,6 +79,16 @@ type specialFileFD struct {
bufMu sync.Mutex `state:"nosave"`
haveBuf uint32
buf []byte
+
+ // If handle.fd >= 0, hostFileMapper caches mappings of handle.fd, and
+ // hostFileMapperInitOnce is used to initialize it on first use.
+ hostFileMapperInitOnce sync.Once `state:"nosave"`
+ hostFileMapper fsutil.HostFileMapper
+
+ // If handle.fd >= 0, fileRefs counts references on memmap.File offsets.
+ // fileRefs is protected by fileRefsMu.
+ fileRefsMu sync.Mutex `state:"nosave"`
+ fileRefs fsutil.FrameRefSet
}
func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) {
@@ -137,6 +150,9 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error {
if !fd.vfsfd.IsWritable() {
return nil
}
+ if fs := fd.filesystem(); fs.opts.lisaEnabled {
+ return fd.handle.fdLisa.Flush(ctx)
+ }
return fd.handle.file.flush(ctx)
}
@@ -172,6 +188,9 @@ func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint
if fd.isRegularFile {
d := fd.dentry()
return d.doAllocate(ctx, offset, length, func() error {
+ if d.fs.opts.lisaEnabled {
+ return fd.handle.fdLisa.Allocate(ctx, mode, offset, length)
+ }
return fd.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
})
}
@@ -230,23 +249,13 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
}
}
- // Going through dst.CopyOutFrom() would hold MM locks around file
- // operations of unknown duration. For regularFileFD, doing so is necessary
- // to support mmap due to lock ordering; MM locks precede dentry.dataMu.
- // That doesn't hold here since specialFileFD doesn't client-cache data.
- // Just buffer the read instead.
- buf := make([]byte, dst.NumBytes())
- n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+ rw := getHandleReadWriter(ctx, &fd.handle, offset)
+ n, err := dst.CopyOutFrom(ctx, rw)
+ putHandleReadWriter(rw)
if linuxerr.Equals(linuxerr.EAGAIN, err) {
- err = syserror.ErrWouldBlock
- }
- if n == 0 {
- return bufN, err
+ err = linuxerr.ErrWouldBlock
}
- if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil {
- return bufN + int64(cp), cperr
- }
- return bufN + int64(n), err
+ return bufN + n, err
}
// Read implements vfs.FileDescriptionImpl.Read.
@@ -317,20 +326,15 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
}
}
- // Do a buffered write. See rationale in PRead.
- buf := make([]byte, src.NumBytes())
- copied, copyErr := src.CopyIn(ctx, buf)
- if copied == 0 && copyErr != nil {
- // Only return the error if we didn't get any data.
- return 0, offset, copyErr
- }
- n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:copied])), uint64(offset))
+ rw := getHandleReadWriter(ctx, &fd.handle, offset)
+ n, err := src.CopyInTo(ctx, rw)
+ putHandleReadWriter(rw)
if linuxerr.Equals(linuxerr.EAGAIN, err) {
- err = syserror.ErrWouldBlock
+ err = linuxerr.ErrWouldBlock
}
// Update offset if the offset is valid.
if offset >= 0 {
- offset += int64(n)
+ offset += n
}
// Update file size for regular files.
if fd.isRegularFile {
@@ -341,10 +345,7 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
atomic.StoreUint64(&d.size, uint64(offset))
}
}
- if err != nil {
- return int64(n), offset, err
- }
- return int64(n), offset, copyErr
+ return int64(n), offset, err
}
// Write implements vfs.FileDescriptionImpl.Write.
@@ -377,10 +378,10 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (
// Sync implements vfs.FileDescriptionImpl.Sync.
func (fd *specialFileFD) Sync(ctx context.Context) error {
- return fd.sync(ctx, false /* forFilesystemSync */)
+ return fd.sync(ctx, false /* forFilesystemSync */, nil /* accFsyncFDIDsLisa */)
}
-func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error {
+func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool, accFsyncFDIDsLisa *[]lisafs.FDID) error {
// Locks to ensure it didn't race with fd.Release().
fd.releaseMu.RLock()
defer fd.releaseMu.RUnlock()
@@ -397,6 +398,13 @@ func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error
ctx.UninterruptibleSleepFinish(false)
return err
}
+ if fs := fd.filesystem(); fs.opts.lisaEnabled {
+ if accFsyncFDIDsLisa != nil {
+ *accFsyncFDIDsLisa = append(*accFsyncFDIDsLisa, fd.handle.fdLisa.ID())
+ return nil
+ }
+ return fd.handle.fdLisa.Sync(ctx)
+ }
return fd.handle.file.fsync(ctx)
}()
if err != nil {
@@ -412,3 +420,85 @@ func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error
}
return nil
}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *specialFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ if fd.handle.fd < 0 || fd.filesystem().opts.forcePageCache {
+ return linuxerr.ENODEV
+ }
+ // After this point, fd may be used as a memmap.Mappable and memmap.File.
+ fd.hostFileMapperInitOnce.Do(fd.hostFileMapper.Init)
+ return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts)
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (fd *specialFileFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
+ fd.hostFileMapper.IncRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())})
+ return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (fd *specialFileFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
+ fd.hostFileMapper.DecRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())})
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (fd *specialFileFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
+ return fd.AddMapping(ctx, ms, dstAR, offset, writable)
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (fd *specialFileFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
+ mr := optional
+ if fd.filesystem().opts.limitHostFDTranslation {
+ mr = maxFillRange(required, optional)
+ }
+ return []memmap.Translation{
+ {
+ Source: mr,
+ File: fd,
+ Offset: mr.Start,
+ Perms: hostarch.AnyAccess,
+ },
+ }, nil
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (fd *specialFileFD) InvalidateUnsavable(ctx context.Context) error {
+ return nil
+}
+
+// IncRef implements memmap.File.IncRef.
+func (fd *specialFileFD) IncRef(fr memmap.FileRange) {
+ fd.fileRefsMu.Lock()
+ defer fd.fileRefsMu.Unlock()
+ fd.fileRefs.IncRefAndAccount(fr)
+}
+
+// DecRef implements memmap.File.DecRef.
+func (fd *specialFileFD) DecRef(fr memmap.FileRange) {
+ fd.fileRefsMu.Lock()
+ defer fd.fileRefsMu.Unlock()
+ fd.fileRefs.DecRefAndAccount(fr)
+}
+
+// MapInternal implements memmap.File.MapInternal.
+func (fd *specialFileFD) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
+ fd.requireHostFD()
+ return fd.hostFileMapper.MapInternal(fr, int(fd.handle.fd), at.Write)
+}
+
+// FD implements memmap.File.FD.
+func (fd *specialFileFD) FD() int {
+ fd.requireHostFD()
+ return int(fd.handle.fd)
+}
+
+func (fd *specialFileFD) requireHostFD() {
+ if fd.handle.fd < 0 {
+ // This is possible if fd was successfully mmapped before saving, then
+ // was restored without a host FD. This is unrecoverable: without a
+ // host FD, we can't mmap this file post-restore.
+ panic("gofer.specialFileFD can no longer be memory-mapped without a host FD")
+ }
+}
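
PRead and pwrite above now stream through getHandleReadWriter/putHandleReadWriter instead of allocating a scratch buffer per call. Those helpers live in gofer/handle.go (see the diffstat) and are not shown in this diff; the sketch below is only a guess at the usual sync.Pool-backed shape of such a pair, with assumed field names:

	// Hypothetical sketch; the real implementation is in gofer/handle.go.
	var handleReadWriterPool = sync.Pool{
		New: func() interface{} { return &handleReadWriter{} },
	}

	func getHandleReadWriter(ctx context.Context, h *handle, offset int64) *handleReadWriter {
		rw := handleReadWriterPool.Get().(*handleReadWriter)
		rw.ctx = ctx
		rw.h = h
		rw.off = uint64(offset)
		return rw
	}

	func putHandleReadWriter(rw *handleReadWriter) {
		rw.ctx = nil
		rw.h = nil
		handleReadWriterPool.Put(rw)
	}
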
diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go
index dbd834c67..27d9be5c4 100644
--- a/pkg/sentry/fsimpl/gofer/symlink.go
+++ b/pkg/sentry/fsimpl/gofer/symlink.go
@@ -35,7 +35,13 @@ func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
return target, nil
}
}
- target, err := d.file.readlink(ctx)
+ var target string
+ var err error
+ if d.fs.opts.lisaEnabled {
+ target, err = d.controlFDLisa.ReadLinkAt(ctx)
+ } else {
+ target, err = d.file.readlink(ctx)
+ }
if d.fs.opts.interop != InteropModeShared {
if err == nil {
d.haveTarget = true
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index 9cbe805b9..07940b225 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -17,6 +17,7 @@ package gofer
import (
"sync/atomic"
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
@@ -24,6 +25,10 @@ func dentryTimestampFromP9(s, ns uint64) int64 {
return int64(s*1e9 + ns)
}
+func dentryTimestampFromLisa(t linux.StatxTimestamp) int64 {
+ return t.Sec*1e9 + int64(t.Nsec)
+}
+
// Preconditions: d.cachedMetadataAuthoritative() == true.
func (d *dentry) touchAtime(mnt *vfs.Mount) {
if mnt.Flags.NoATime || mnt.ReadOnly() {
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index 476545d00..180a35583 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -70,7 +70,6 @@ go_library(
"//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/syserr",
- "//pkg/syserror",
"//pkg/tcpip",
"//pkg/unet",
"//pkg/usermem",
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 89aa7b3d9..984c6e8ee 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -37,7 +37,6 @@ import (
unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -712,7 +711,7 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts
if total != 0 {
err = nil
} else {
- err = syserror.ErrWouldBlock
+ err = linuxerr.ErrWouldBlock
}
}
return total, err
@@ -766,7 +765,7 @@ func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opt
if !i.seekable {
n, err := f.writeToHostFD(ctx, src, -1, opts.Flags)
if isBlockError(err) {
- err = syserror.ErrWouldBlock
+ err = linuxerr.ErrWouldBlock
}
return n, err
}
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index 7f6ce4ee5..04ac73255 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -24,7 +24,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/unimpl"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -346,7 +345,7 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal)
// If the signal is SIGTTIN, then we are attempting to read
// from the TTY. Don't send the signal and return EIO.
if sig == linux.SIGTTIN {
- return syserror.EIO
+ return linuxerr.EIO
}
// Otherwise, we are writing or changing terminal state. This is allowed.
@@ -355,7 +354,7 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal)
// If the process group is an orphan, return EIO.
if pg.IsOrphan() {
- return syserror.EIO
+ return linuxerr.EIO
}
// Otherwise, send the signal to the process group and return ERESTARTSYS.
@@ -368,5 +367,5 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal)
//
// Linux ignores the result of kill_pgrp().
_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
- return syserror.ERESTARTSYS
+ return linuxerr.ERESTARTSYS
}
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
index 95d7ebe2e..9850f3f41 100644
--- a/pkg/sentry/fsimpl/host/util.go
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -42,7 +42,7 @@ func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp {
}
// isBlockError checks if an error is EAGAIN or EWOULDBLOCK.
-// If so, they can be transformed into syserror.ErrWouldBlock.
+// If so, they can be transformed into linuxerr.ErrWouldBlock.
func isBlockError(err error) bool {
return linuxerr.Equals(linuxerr.EAGAIN, err) || linuxerr.Equals(linuxerr.EWOULDBLOCK, err)
}
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index d53937db6..4b577ea43 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -119,7 +119,6 @@ go_library(
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/vfs",
"//pkg/sync",
- "//pkg/syserror",
"//pkg/usermem",
],
)
@@ -137,6 +136,7 @@ go_test(
"//pkg/abi/linux",
"//pkg/context",
"//pkg/errors/linuxerr",
+ "//pkg/fspath",
"//pkg/log",
"//pkg/refs",
"//pkg/refsvfs2",
@@ -144,7 +144,6 @@ go_test(
"//pkg/sentry/fsimpl/testutil",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/vfs",
- "//pkg/syserror",
"//pkg/usermem",
"@com_github_google_go_cmp//cmp:go_default_library",
],
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 8b008dc10..7db1473c4 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -24,7 +24,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -99,7 +98,7 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *Dentry, children *OrderedChildren, l
func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error {
if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
// Can't open directories for writing.
- return syserror.EISDIR
+ return linuxerr.EISDIR
}
fd.LockFD.Init(locks)
fd.seekEnd = fdOpts.SeekEnd
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index a97473f7d..363ebc466 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -26,7 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
)
// stepExistingLocked resolves rp.Component() in parent directory vfsd.
@@ -224,7 +223,7 @@ func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string
return linuxerr.EEXIST
}
if parent.VFSDentry().IsDead() {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite); err != nil {
return err
@@ -241,7 +240,7 @@ func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) er
return linuxerr.EBUSY
}
if parent.vfsd.IsDead() {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
return err
@@ -362,7 +361,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return err
}
if rp.MustBeDir() {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
if rp.Mount() != vd.Mount() {
return linuxerr.EXDEV
@@ -443,7 +442,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return err
}
if rp.MustBeDir() {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
if err := rp.Mount().CheckBeginWrite(); err != nil {
return err
@@ -509,7 +508,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
defer unlock()
if rp.Done() {
if rp.MustBeDir() {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
if mustCreate {
return nil, linuxerr.EEXIST
@@ -536,11 +535,11 @@ afterTrailingSymlink:
}
// Reject attempts to open directories with O_CREAT.
if rp.MustBeDir() {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
pc := rp.Component()
if pc == "." || pc == ".." {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
if len(pc) > linux.NAME_MAX {
return nil, linuxerr.ENAMETOOLONG
@@ -861,7 +860,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
return err
}
if rp.MustBeDir() {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
if err := rp.Mount().CheckBeginWrite(); err != nil {
return err
@@ -895,7 +894,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return err
}
if d.isDir() {
- return syserror.EISDIR
+ return linuxerr.EISDIR
}
virtfs := rp.VirtualFilesystem()
parentDentry := d.parent
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index a42fc79b4..b96dc9ef7 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -26,7 +26,6 @@ import (
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
)
// InodeNoopRefCount partially implements the Inode interface, specifically the
@@ -234,6 +233,11 @@ func (a *InodeAttrs) Mode() linux.FileMode {
return linux.FileMode(atomic.LoadUint32(&a.mode))
}
+// Links returns the link count.
+func (a *InodeAttrs) Links() uint32 {
+ return atomic.LoadUint32(&a.nlink)
+}
+
// TouchAtime updates a.atime to the current time.
func (a *InodeAttrs) TouchAtime(ctx context.Context, mnt *vfs.Mount) {
if mnt.Flags.NoATime || mnt.ReadOnly() {
@@ -289,7 +293,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
return linuxerr.EPERM
}
if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() {
- return syserror.EISDIR
+ return linuxerr.EISDIR
}
if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
return err
@@ -475,7 +479,7 @@ func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error
s, ok := o.set[name]
if !ok {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
s.inode.IncRef() // This ref is passed to the dentry upon creation via Init.
@@ -502,6 +506,30 @@ func (o *OrderedChildren) Insert(name string, child Inode) error {
return o.insert(name, child, false)
}
+// Inserter is like Insert, but obtains the child to insert by calling
+// makeChild. makeChild is only called if the insert will succeed. This allows
+// the caller to atomically check and insert a child without having to
+// clean up the child on failure.
+func (o *OrderedChildren) Inserter(name string, makeChild func() Inode) (Inode, error) {
+ o.mu.Lock()
+ defer o.mu.Unlock()
+ if _, ok := o.set[name]; ok {
+ return nil, linuxerr.EEXIST
+ }
+
+ // Note: We must not fail after we call makeChild().
+
+ child := makeChild()
+ s := &slot{
+ name: name,
+ inode: child,
+ static: false,
+ }
+ o.order.PushBack(s)
+ o.set[name] = s
+ return child, nil
+}
+
// insert inserts child into o.
//
// Precondition: Caller must be holding a ref on child if static is true.
@@ -559,7 +587,7 @@ func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, n
func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error {
s, ok := o.set[name]
if !ok {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
if s.inode != child {
panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child))
@@ -746,5 +774,5 @@ type InodeNoStatFS struct{}
// StatFS implements Inode.StatFS.
func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
- return linux.Statfs{}, syserror.ENOSYS
+ return linux.Statfs{}, linuxerr.ENOSYS
}
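
A caller-side sketch of the OrderedChildren.Inserter helper added above, assuming a directory inode that embeds OrderedChildren; the child constructor newChildInode is hypothetical:

	// makeChild runs only if no child named "data" exists, so the caller has
	// nothing to clean up when Inserter fails with EEXIST.
	child, err := dir.Inserter("data", func() kernfs.Inode {
		return newChildInode(ctx, creds)
	})
	if err != nil {
		return err // linuxerr.EEXIST if "data" was already present.
	}
	_ = child
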
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 0e2867d49..544698694 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -61,6 +61,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/refsvfs2"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -542,6 +543,63 @@ func (d *Dentry) FSLocalPath() string {
return b.String()
}
+// WalkDentryTree traverses p in the dentry tree for this filesystem. Note that
+// this only traverses the dentry tree and is not a general path traversal. No
+// symlinks or dynamic children are resolved, and no permission checks are
+// performed. The caller is responsible for ensuring the returned Dentry exists
+// for an appropriate lifetime.
+//
+// p is interpreted starting at d, and may be absolute or relative (absolute
+// and relative paths refer to the same target here, since p is always resolved
+// relative to d). p may contain "." and "..", but will not allow traversal
+// above d (similar to ".." at the root dentry).
+//
+// This is useful for filesystem internals, where the filesystem may not be
+// mounted yet. For a mounted filesystem, use GetDentryAt.
+func (d *Dentry) WalkDentryTree(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (*Dentry, error) {
+ d.fs.mu.RLock()
+ defer d.fs.processDeferredDecRefs(ctx)
+ defer d.fs.mu.RUnlock()
+
+ target := d
+
+ for pit := p.Begin; pit.Ok(); pit = pit.Next() {
+ pc := pit.String()
+
+ switch {
+ case target == nil:
+ return nil, linuxerr.ENOENT
+ case pc == ".":
+ // No-op, consume component and continue.
+ case pc == "..":
+ if target == d {
+ // Don't let .. traverse above the start point of the walk.
+ continue
+ }
+ target = target.parent
+ // Parent doesn't need revalidation since we revalidated it on the
+ // way to the child, and we're still holding fs.mu.
+ default:
+ var err error
+
+ d.dirMu.Lock()
+ target, err = d.fs.revalidateChildLocked(ctx, vfsObj, target, pc, target.children[pc])
+ d.dirMu.Unlock()
+
+ if err != nil {
+ return nil, err
+ }
+ }
+ }
+
+ if target == nil {
+ return nil, linuxerr.ENOENT
+ }
+
+ target.IncRef()
+ return target, nil
+}
+
// The Inode interface maps filesystem-level operations that operate on paths to
// equivalent operations on specific filesystem nodes.
//
@@ -667,12 +725,15 @@ type inodeDirectory interface {
// RmDir removes an empty child directory from this directory
// inode. Implementations must update the parent directory's link count,
// if required. Implementations are not responsible for checking that child
- // is a directory, checking for an empty directory.
+ // is a directory, or checking for an empty directory.
RmDir(ctx context.Context, name string, child Inode) error
// Rename is called on the source directory containing an inode being
- // renamed. child should point to the resolved child in the source
- // directory.
+ // renamed. child points to the resolved child in the source directory.
+ // dstDir is guaranteed to be a directory inode.
+ //
+ // On a successful call to Rename, the caller updates the dentry tree to
+ // reflect the name change.
//
// Precondition: Caller must serialize concurrent calls to Rename.
Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error
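
The new WalkDentryTree is exercised by TestDirWalkDentryTree in the test diff below; a minimal caller-side sketch (root, ctx, and vfsObj stand in for whatever the caller already holds):

	// Walks only the kernfs dentry tree: no symlink resolution, no permission
	// checks, and ".." never escapes the starting dentry.
	d, err := root.WalkDentryTree(ctx, vfsObj, fspath.Parse("dir2/dir3"))
	if err != nil {
		return err
	}
	defer d.DecRef(ctx)
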
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 609887943..a2aba9321 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
+ "gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
@@ -346,3 +347,63 @@ func TestDirFDIterDirents(t *testing.T) {
"file1": linux.DT_REG,
})
}
+
+func TestDirWalkDentryTree(t *testing.T) {
+ sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+ return fs.newDir(ctx, creds, 0755, map[string]kernfs.Inode{
+ "dir1": fs.newDir(ctx, creds, 0755, nil),
+ "dir2": fs.newDir(ctx, creds, 0755, map[string]kernfs.Inode{
+ "file1": fs.newFile(ctx, creds, staticFileContent),
+ "dir3": fs.newDir(ctx, creds, 0755, nil),
+ }),
+ })
+ })
+ defer sys.Destroy()
+
+ testWalk := func(from *kernfs.Dentry, getDentryPath, walkPath string, expectedErr error) {
+ var d *kernfs.Dentry
+ if getDentryPath != "" {
+ pop := sys.PathOpAtRoot(getDentryPath)
+ vd := sys.GetDentryOrDie(pop)
+ defer vd.DecRef(sys.Ctx)
+ d = vd.Dentry().Impl().(*kernfs.Dentry)
+ }
+
+ match, err := from.WalkDentryTree(sys.Ctx, sys.VFS, fspath.Parse(walkPath))
+ if err == nil {
+ defer match.DecRef(sys.Ctx)
+ }
+
+ if err != expectedErr {
+ t.Fatalf("WalkDentryTree from %q to %q (with expected error: %v) unexpected error, want: %v, got: %v", from.FSLocalPath(), walkPath, expectedErr, expectedErr, err)
+ }
+ if expectedErr != nil {
+ return
+ }
+
+ if d != match {
+ t.Fatalf("WalkDentryTree from %q to %q (with expected error: %v) found unexpected dentry; want: %v, got: %v", from.FSLocalPath(), walkPath, expectedErr, d, match)
+ }
+ }
+
+ rootD := sys.Root.Dentry().Impl().(*kernfs.Dentry)
+
+ testWalk(rootD, "dir1", "/dir1", nil)
+ testWalk(rootD, "", "/dir-non-existent", linuxerr.ENOENT)
+ testWalk(rootD, "", "/dir1/child-non-existent", linuxerr.ENOENT)
+ testWalk(rootD, "", "/dir2/inner-non-existent/dir3", linuxerr.ENOENT)
+
+ testWalk(rootD, "dir2/dir3", "/dir2/../dir2/dir3", nil)
+ testWalk(rootD, "dir2/dir3", "/dir2/././dir3", nil)
+ testWalk(rootD, "dir2/dir3", "/dir2/././dir3/.././dir3", nil)
+
+ pop := sys.PathOpAtRoot("dir2")
+ dir2VD := sys.GetDentryOrDie(pop)
+ defer dir2VD.DecRef(sys.Ctx)
+ dir2D := dir2VD.Dentry().Impl().(*kernfs.Dentry)
+
+ testWalk(dir2D, "dir2/dir3", "/dir3", nil)
+ testWalk(dir2D, "dir2/dir3", "/../../../dir3", nil)
+ testWalk(dir2D, "dir2/file1", "/file1", nil)
+ testWalk(dir2D, "dir2/file1", "file1", nil)
+}
diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD
index ed730e215..d16dfef9b 100644
--- a/pkg/sentry/fsimpl/overlay/BUILD
+++ b/pkg/sentry/fsimpl/overlay/BUILD
@@ -42,7 +42,6 @@ go_library(
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/vfs",
"//pkg/sync",
- "//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
index 1f85a1f0d..520487066 100644
--- a/pkg/sentry/fsimpl/overlay/copy_up.go
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -26,7 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
)
func (d *dentry) isCopiedUp() bool {
@@ -37,6 +36,10 @@ func (d *dentry) isCopiedUp() bool {
//
// Preconditions: filesystem.renameMu must be locked.
func (d *dentry) copyUpLocked(ctx context.Context) error {
+ return d.copyUpMaybeSyntheticMountpointLocked(ctx, false /* forSyntheticMountpoint */)
+}
+
+func (d *dentry) copyUpMaybeSyntheticMountpointLocked(ctx context.Context, forSyntheticMountpoint bool) error {
// Fast path.
if d.isCopiedUp() {
return nil
@@ -60,7 +63,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
// d is a filesystem root with no upper layer.
return linuxerr.EROFS
}
- if err := d.parent.copyUpLocked(ctx); err != nil {
+ if err := d.parent.copyUpMaybeSyntheticMountpointLocked(ctx, forSyntheticMountpoint); err != nil {
return err
}
@@ -72,7 +75,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
}
if d.vfsd.IsDead() {
// Raced with deletion of d.
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
// Obtain settable timestamps from the lower layer.
@@ -169,7 +172,8 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
case linux.S_IFDIR:
if err := vfsObj.MkdirAt(ctx, d.fs.creds, &newpop, &vfs.MkdirOptions{
- Mode: linux.FileMode(d.mode &^ linux.S_IFMT),
+ Mode: linux.FileMode(d.mode &^ linux.S_IFMT),
+ ForSyntheticMountpoint: forSyntheticMountpoint,
}); err != nil {
return err
}
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index 5e89928c5..3b3dcf836 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -28,7 +28,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
)
// _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs
@@ -314,7 +313,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
}
if !topLookupLayer.existsInOverlay() {
child.destroyLocked(ctx)
- return nil, topLookupLayer, syserror.ENOENT
+ return nil, topLookupLayer, linuxerr.ENOENT
}
// Device and inode numbers were copied from the topmost layer above. Remap
@@ -463,13 +462,21 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
return d, nil
}
+type createType int
+
+const (
+ createNonDirectory createType = iota
+ createDirectory
+ createSyntheticMountpoint
+)
+
// doCreateAt checks that creating a file at rp is permitted, then invokes
// create to do so.
//
// Preconditions:
// * !rp.Done().
// * For the final path component in rp, !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error {
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, ct createType, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -483,7 +490,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
return linuxerr.EEXIST
}
if parent.vfsd.IsDead() {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
@@ -505,8 +512,8 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
return linuxerr.EEXIST
}
- if !dir && rp.MustBeDir() {
- return syserror.ENOENT
+ if ct == createNonDirectory && rp.MustBeDir() {
+ return linuxerr.ENOENT
}
mnt := rp.Mount()
@@ -519,7 +526,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
}
// Ensure that the parent directory is copied-up so that we can create the
// new file in the upper layer.
- if err := parent.copyUpLocked(ctx); err != nil {
+ if err := parent.copyUpMaybeSyntheticMountpointLocked(ctx, ct == createSyntheticMountpoint); err != nil {
return err
}
@@ -530,7 +537,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
parent.dirents = nil
ev := linux.IN_CREATE
- if dir {
+ if ct != createNonDirectory {
ev |= linux.IN_ISDIR
}
parent.watches.Notify(ctx, name, uint32(ev), 0 /* cookie */, vfs.InodeEvent, false /* unlinked */)
@@ -619,7 +626,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
// LinkAt implements vfs.FilesystemImpl.LinkAt.
func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
- return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+ return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
if rp.Mount() != vd.Mount() {
return linuxerr.EXDEV
}
@@ -672,7 +679,11 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
- return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+ ct := createDirectory
+ if opts.ForSyntheticMountpoint {
+ ct = createSyntheticMountpoint
+ }
+ return fs.doCreateAt(ctx, rp, ct, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
vfsObj := fs.vfsfs.VirtualFilesystem()
pop := vfs.PathOperation{
Root: parent.upperVD,
@@ -723,7 +734,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
// MknodAt implements vfs.FilesystemImpl.MknodAt.
func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
- return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+ return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
// Disallow attempts to create whiteouts.
if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 {
return linuxerr.EPERM
@@ -780,7 +791,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
start := rp.Start().Impl().(*dentry)
if rp.Done() {
if mayCreate && rp.MustBeDir() {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
if mustCreate {
return nil, linuxerr.EEXIST
@@ -807,7 +818,7 @@ afterTrailingSymlink:
}
// Reject attempts to open directories with O_CREAT.
if mayCreate && rp.MustBeDir() {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
// Determine whether or not we need to create a file.
parent.dirMu.Lock()
@@ -865,11 +876,11 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *
if ftype == linux.S_IFDIR {
// Can't open directories with O_CREAT.
if opts.Flags&linux.O_CREAT != 0 {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
// Can't open directories writably.
if ats.MayWrite() {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
if opts.Flags&linux.O_DIRECT != 0 {
return nil, linuxerr.EINVAL
@@ -919,7 +930,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
return nil, err
}
if parent.vfsd.IsDead() {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
mnt := rp.Mount()
if err := mnt.CheckBeginWrite(); err != nil {
@@ -1086,7 +1097,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
defer newParent.dirMu.Unlock()
}
if newParent.vfsd.IsDead() {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
var (
replaced *dentry
@@ -1105,7 +1116,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
replacedVFSD = &replaced.vfsd
if replaced.isDir() {
if !renamed.isDir() {
- return syserror.EISDIR
+ return linuxerr.EISDIR
}
if genericIsAncestorDentry(replaced, renamed) {
return linuxerr.ENOTEMPTY
@@ -1477,7 +1488,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
- return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+ return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
vfsObj := fs.vfsfs.VirtualFilesystem()
pop := vfs.PathOperation{
Root: parent.upperVD,
@@ -1533,7 +1544,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
defer rp.Mount().EndWrite()
name := rp.Component()
if name == "." || name == ".." {
- return syserror.EISDIR
+ return linuxerr.EISDIR
}
if rp.MustBeDir() {
return linuxerr.ENOTDIR
@@ -1557,7 +1568,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return err
}
if child.isDir() {
- return syserror.EISDIR
+ return linuxerr.EISDIR
}
if err := parent.mayDelete(rp.Credentials(), child); err != nil {
return err
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 1d3d2d95f..95cfbdc42 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -102,7 +102,6 @@ go_library(
"//pkg/sentry/usage",
"//pkg/sentry/vfs",
"//pkg/sync",
- "//pkg/syserror",
"//pkg/tcpip/header",
"//pkg/tcpip/network/ipv4",
"//pkg/usermem",
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index d99f90b36..e04ae6660 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -25,7 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
)
// subtasksInode represents the inode for /proc/[pid]/task/ directory.
@@ -71,15 +70,15 @@ func (fs *filesystem) newSubtasks(ctx context.Context, task *kernel.Task, pidns
func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
tid, err := strconv.ParseUint(name, 10, 32)
if err != nil {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
subTask := i.pidns.TaskWithID(kernel.ThreadID(tid))
if subTask == nil {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
if subTask.ThreadGroup() != i.task.ThreadGroup() {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
return i.fs.newTaskInode(ctx, subTask, i.pidns, false, i.cgroupControllers)
}
@@ -88,7 +87,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode,
func (i *subtasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
tasks := i.task.ThreadGroup().MemberIDs(i.pidns)
if len(tasks) == 0 {
- return offset, syserror.ENOENT
+ return offset, linuxerr.ENOENT
}
if relOffset >= int64(len(tasks)) {
return offset, nil
@@ -124,7 +123,7 @@ type subtasksFD struct {
func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
if fd.task.ExitState() >= kernel.TaskExitZombie {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
return fd.GenericDirectoryFD.IterDirents(ctx, cb)
}
@@ -132,7 +131,7 @@ func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallbac
// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
if fd.task.ExitState() >= kernel.TaskExitZombie {
- return 0, syserror.ENOENT
+ return 0, linuxerr.ENOENT
}
return fd.GenericDirectoryFD.Seek(ctx, offset, whence)
}
@@ -140,7 +139,7 @@ func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int
// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
if fd.task.ExitState() >= kernel.TaskExitZombie {
- return linux.Statx{}, syserror.ENOENT
+ return linux.Statx{}, linuxerr.ENOENT
}
return fd.GenericDirectoryFD.Stat(ctx, opts)
}
@@ -148,7 +147,7 @@ func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Sta
// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
if fd.task.ExitState() >= kernel.TaskExitZombie {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
return fd.GenericDirectoryFD.SetStat(ctx, opts)
}
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index dfc0a924e..5c6412fc0 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -22,11 +22,11 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
)
func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) {
@@ -142,11 +142,11 @@ func (i *fdDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.Ite
func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
fdInt, err := strconv.ParseInt(name, 10, 32)
if err != nil {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
fd := int32(fdInt)
if !taskFDExists(ctx, i.fs, i.task, fd) {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
return i.fs.newFDSymlink(ctx, i.task, fd, i.fs.NextIno()), nil
}
@@ -218,7 +218,7 @@ func (fs *filesystem) newFDSymlink(ctx context.Context, task *kernel.Task, fd in
func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
file, _ := getTaskFD(s.task, s.fd)
if file == nil {
- return "", syserror.ENOENT
+ return "", linuxerr.ENOENT
}
defer s.fs.SafeDecRefFD(ctx, file)
root := vfs.RootFromContext(ctx)
@@ -231,7 +231,7 @@ func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error)
func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
file, _ := getTaskFD(s.task, s.fd)
if file == nil {
- return vfs.VirtualDentry{}, "", syserror.ENOENT
+ return vfs.VirtualDentry{}, "", linuxerr.ENOENT
}
defer s.fs.SafeDecRefFD(ctx, file)
vd := file.VirtualDentry()
@@ -278,11 +278,11 @@ func (fs *filesystem) newFDInfoDirInode(ctx context.Context, task *kernel.Task)
func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
fdInt, err := strconv.ParseInt(name, 10, 32)
if err != nil {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
fd := int32(fdInt)
if !taskFDExists(ctx, i.fs, i.task, fd) {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
data := &fdInfoData{
fs: i.fs,
@@ -330,7 +330,7 @@ var _ dynamicInode = (*fdInfoData)(nil)
func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
file, descriptorFlags := getTaskFD(d.task, d.fd)
if file == nil {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
defer d.fs.SafeDecRefFD(ctx, file)
// TODO(b/121266871): Include pos, locks, and other data. For now we only
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 0ce3ed797..d3f9cf489 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -33,7 +33,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -41,7 +40,7 @@ import (
// Linux 3.18, the limit is five lines." - user_namespaces(7)
const maxIDMapLines = 5
-// mm gets the kernel task's MemoryManager. No additional reference is taken on
+// getMM gets the kernel task's MemoryManager. No additional reference is taken on
// mm here. This is safe because MemoryManager.destroy is required to leave the
// MemoryManager in a state where it's still usable as a DynamicBytesSource.
func getMM(task *kernel.Task) *mm.MemoryManager {
@@ -491,7 +490,7 @@ func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64
return int64(n), nil
}
if readErr != nil {
- return 0, syserror.EIO
+ return 0, linuxerr.EIO
}
return 0, nil
}
@@ -609,12 +608,10 @@ func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime())))
var vss, rss uint64
- s.task.WithMuLocked(func(t *kernel.Task) {
- if mm := t.MemoryManager(); mm != nil {
- vss = mm.VirtualMemorySize()
- rss = mm.ResidentSetSize()
- }
- })
+ if mm := getMM(s.task); mm != nil {
+ vss = mm.VirtualMemorySize()
+ rss = mm.ResidentSetSize()
+ }
fmt.Fprintf(buf, "%d %d ", vss, rss/hostarch.PageSize)
// rsslim.
@@ -650,13 +647,10 @@ var _ dynamicInode = (*statmData)(nil)
// Generate implements vfs.DynamicBytesSource.Generate.
func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
var vss, rss uint64
- s.task.WithMuLocked(func(t *kernel.Task) {
- if mm := t.MemoryManager(); mm != nil {
- vss = mm.VirtualMemorySize()
- rss = mm.ResidentSetSize()
- }
- })
-
+ if mm := getMM(s.task); mm != nil {
+ vss = mm.VirtualMemorySize()
+ rss = mm.ResidentSetSize()
+ }
fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/hostarch.PageSize, rss/hostarch.PageSize)
return nil
}
@@ -780,12 +774,12 @@ func (s *statusFD) Generate(ctx context.Context, buf *bytes.Buffer) error {
if fdTable := t.FDTable(); fdTable != nil {
fds = fdTable.CurrentMaxFDs()
}
- if mm := t.MemoryManager(); mm != nil {
- vss = mm.VirtualMemorySize()
- rss = mm.ResidentSetSize()
- data = mm.VirtualDataSize()
- }
})
+ if mm := getMM(s.task); mm != nil {
+ vss = mm.VirtualMemorySize()
+ rss = mm.ResidentSetSize()
+ data = mm.VirtualDataSize()
+ }
// Filesystem user/group IDs aren't implemented; effective UID/GID are used
// instead.
fmt.Fprintf(buf, "Uid:\t%d\t%d\t%d\t%d\n", ruid, euid, suid, euid)
@@ -946,25 +940,17 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent
return vfs.VirtualDentry{}, "", err
}
- var err error
- var exec fsbridge.File
- s.task.WithMuLocked(func(t *kernel.Task) {
- mm := t.MemoryManager()
- if mm == nil {
- err = linuxerr.EACCES
- return
- }
+ mm := getMM(s.task)
+ if mm == nil {
+ return vfs.VirtualDentry{}, "", linuxerr.EACCES
+ }
- // The MemoryManager may be destroyed, in which case
- // MemoryManager.destroy will simply set the executable to nil
- // (with locks held).
- exec = mm.Executable()
- if exec == nil {
- err = linuxerr.ESRCH
- }
- })
- if err != nil {
- return vfs.VirtualDentry{}, "", err
+ // The MemoryManager may be destroyed, in which case
+ // MemoryManager.destroy will simply set the executable to nil
+ // (with locks held).
+ exec := mm.Executable()
+ if exec == nil {
+ return vfs.VirtualDentry{}, "", linuxerr.ESRCH
}
defer exec.DecRef(ctx)
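
The task_files.go hunks above replace the per-call task.WithMuLocked closures with the shared getMM helper whose doc comment is renamed in this change. The helper's body is not part of the diff; a minimal sketch of what it presumably looks like, assuming it simply reads the MemoryManager under the task mutex (types come from the file's existing imports):

// Sketch only: getMM's body is not shown in this diff. Assuming it reads the
// MemoryManager under the task mutex and returns it without taking an extra
// reference, which is safe per the doc comment above.
func getMM(task *kernel.Task) *mm.MemoryManager {
	var memMgr *mm.MemoryManager
	task.WithMuLocked(func(t *kernel.Task) {
		memMgr = t.MemoryManager()
	})
	return memMgr
}
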
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index cf905fae4..7b0be9c14 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -21,11 +21,11 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
)
const (
@@ -116,12 +116,12 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, err
case threadSelfName:
return i.newThreadSelfSymlink(ctx, root), nil
}
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
task := i.pidns.TaskWithID(kernel.ThreadID(tid))
if task == nil {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
return i.fs.newTaskInode(ctx, task, i.pidns, true, i.fakeCgroupControllers)
@@ -268,6 +268,6 @@ func cpuInfoData(k *kernel.Kernel) string {
return buf.String()
}
-func shmData(v uint64) dynamicInode {
+func ipcData(v uint64) dynamicInode {
return newStaticFile(strconv.FormatUint(v, 10))
}
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 03bed22a3..4d3a2f7e6 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -29,7 +29,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
)
// +stateify savable
@@ -58,7 +57,7 @@ func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error
}
tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup())
if tgid == 0 {
- return "", syserror.ENOENT
+ return "", linuxerr.ENOENT
}
return strconv.FormatUint(uint64(tgid), 10), nil
}
@@ -100,7 +99,7 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string,
tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup())
tid := s.pidns.IDOfTask(t)
if tid == 0 || tgid == 0 {
- return "", syserror.ENOENT
+ return "", linuxerr.ENOENT
}
return fmt.Sprintf("%d/task/%d", tgid, tid), nil
}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 99f64a9d8..82e2857b3 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -47,9 +47,12 @@ func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k *
"kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
"hostname": fs.newInode(ctx, root, 0444, &hostnameData{}),
"sem": fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\t%d\t%d\t%d\n", linux.SEMMSL, linux.SEMMNS, linux.SEMOPM, linux.SEMMNI))),
- "shmall": fs.newInode(ctx, root, 0444, shmData(linux.SHMALL)),
- "shmmax": fs.newInode(ctx, root, 0444, shmData(linux.SHMMAX)),
- "shmmni": fs.newInode(ctx, root, 0444, shmData(linux.SHMMNI)),
+ "shmall": fs.newInode(ctx, root, 0444, ipcData(linux.SHMALL)),
+ "shmmax": fs.newInode(ctx, root, 0444, ipcData(linux.SHMMAX)),
+ "shmmni": fs.newInode(ctx, root, 0444, ipcData(linux.SHMMNI)),
+ "msgmni": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMNI)),
+ "msgmax": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMAX)),
+ "msgmnb": fs.newInode(ctx, root, 0444, ipcData(linux.MSGMNB)),
"yama": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
"ptrace_scope": fs.newYAMAPtraceScopeFile(ctx, k, root),
}),
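
The shmData to ipcData rename above reflects that the helper now also backs the new msg* entries. Each of these files is a static string: ipcData(v) wraps newStaticFile(strconv.FormatUint(v, 10)) (see tasks.go above), so a read returns the constant in decimal. A tiny self-contained illustration; the MSGMAX value here is illustrative, the real constant lives in pkg/abi/linux:

package main

import (
	"fmt"
	"strconv"
)

// Illustrative value only; gVisor reads linux.MSGMAX from pkg/abi/linux.
const MSGMAX = 8192

func main() {
	// ipcData(v) generates the file contents as the decimal form of v, so
	// reading /proc/sys/kernel/msgmax yields this string.
	fmt.Println(strconv.FormatUint(MSGMAX, 10)) // "8192"
}
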
diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD
index adb610213..403c6f254 100644
--- a/pkg/sentry/fsimpl/signalfd/BUILD
+++ b/pkg/sentry/fsimpl/signalfd/BUILD
@@ -9,10 +9,10 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/context",
+ "//pkg/errors/linuxerr",
"//pkg/sentry/kernel",
"//pkg/sentry/vfs",
"//pkg/sync",
- "//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go
index a7f5928b7..bdb03ef96 100644
--- a/pkg/sentry/fsimpl/signalfd/signalfd.go
+++ b/pkg/sentry/fsimpl/signalfd/signalfd.go
@@ -18,10 +18,10 @@ package signalfd
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -91,7 +91,7 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen
info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0)
if err != nil {
// There must be no signal available.
- return 0, syserror.ErrWouldBlock
+ return 0, linuxerr.ErrWouldBlock
}
// Copy out the signal info using the specified format.
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index 1af0a5cbc..ab21f028e 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -36,7 +36,6 @@ go_library(
"//pkg/sentry/kernel/auth",
"//pkg/sentry/memmap",
"//pkg/sentry/vfs",
- "//pkg/syserror",
"//pkg/usermem",
],
)
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index f322d2747..7fcb2d26b 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -84,6 +84,18 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
fs.MaxCachedDentries = maxCachedDentries
fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
+ k := kernel.KernelFromContext(ctx)
+ fsDirChildren := make(map[string]kernfs.Inode)
+ // Create an empty directory to serve as the mount point for cgroupfs when
+ // cgroups are available. This emulates Linux behaviour, see
+ // kernel/cgroup.c:cgroup_init(). Note that in Linux, userspace (typically
+ // the init process) is ultimately responsible for actually mounting
+ // cgroupfs, but the kernel creates the mountpoint. For the sentry, the
+ // launcher mounts cgroupfs.
+ if k.CgroupRegistry() != nil {
+ fsDirChildren["cgroup"] = fs.newDir(ctx, creds, defaultSysDirMode, nil)
+ }
+
root := fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
"block": fs.newDir(ctx, creds, defaultSysDirMode, nil),
"bus": fs.newDir(ctx, creds, defaultSysDirMode, nil),
@@ -97,7 +109,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}),
}),
"firmware": fs.newDir(ctx, creds, defaultSysDirMode, nil),
- "fs": fs.newDir(ctx, creds, defaultSysDirMode, nil),
+ "fs": fs.newDir(ctx, creds, defaultSysDirMode, fsDirChildren),
"kernel": kernelDir(ctx, fs, creds),
"module": fs.newDir(ctx, creds, defaultSysDirMode, nil),
"power": fs.newDir(ctx, creds, defaultSysDirMode, nil),
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 0a0d914cc..0c46a3a13 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -87,3 +87,17 @@ func TestSysRootContainsExpectedEntries(t *testing.T) {
"power": linux.DT_DIR,
})
}
+
+func TestCgroupMountpointExists(t *testing.T) {
+ // Note: The mountpoint is only created if cgroups are available. This is
+ // the VFS2 implementation of sysfs and the test runs with VFS2 enabled, so
+ // we expect to see the mount point unconditionally.
+ s := newTestSystem(t)
+ defer s.Destroy()
+ pop := s.PathOpAtRoot("/fs")
+ s.AssertAllDirentTypes(s.ListDirents(pop), map[string]testutil.DirentType{
+ "cgroup": linux.DT_DIR,
+ })
+ pop = s.PathOpAtRoot("/fs/cgroup")
+ s.AssertAllDirentTypes(s.ListDirents(pop), map[string]testutil.DirentType{ /*empty*/ })
+}
diff --git a/pkg/sentry/fsimpl/timerfd/BUILD b/pkg/sentry/fsimpl/timerfd/BUILD
index e6980a314..2b83d7d9a 100644
--- a/pkg/sentry/fsimpl/timerfd/BUILD
+++ b/pkg/sentry/fsimpl/timerfd/BUILD
@@ -12,7 +12,6 @@ go_library(
"//pkg/hostarch",
"//pkg/sentry/kernel/time",
"//pkg/sentry/vfs",
- "//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go
index 655a1c76a..68b785791 100644
--- a/pkg/sentry/fsimpl/timerfd/timerfd.go
+++ b/pkg/sentry/fsimpl/timerfd/timerfd.go
@@ -23,7 +23,6 @@ import (
"gvisor.dev/gvisor/pkg/hostarch"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -82,7 +81,7 @@ func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequenc
}
return sizeofUint64, nil
}
- return 0, syserror.ErrWouldBlock
+ return 0, linuxerr.ErrWouldBlock
}
// Clock returns the timer fd's Clock.
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index dc8b9bfeb..94486bb63 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -82,7 +82,6 @@ go_library(
"//pkg/sentry/vfs",
"//pkg/sentry/vfs/memxattr",
"//pkg/sync",
- "//pkg/syserror",
"//pkg/usermem",
],
)
@@ -125,7 +124,6 @@ go_test(
"//pkg/sentry/fs/lock",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/vfs",
- "//pkg/syserror",
"//pkg/usermem",
],
)
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 8b04df038..e067f136e 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -26,7 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
)
// Sync implements vfs.FilesystemImpl.Sync.
@@ -75,7 +74,7 @@ afterSymlink:
}
child, ok := dir.childMap[name]
if !ok {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
return nil, err
@@ -171,12 +170,12 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
return linuxerr.EEXIST
}
if !dir && rp.MustBeDir() {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
// tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only
// be dead if it was deleted.
if parentDir.dentry.vfsd.IsDead() {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
mnt := rp.Mount()
if err := mnt.CheckBeginWrite(); err != nil {
@@ -258,7 +257,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return err
}
if i.nlink == 0 {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
if i.nlink == maxLinks {
return linuxerr.EMLINK
@@ -345,7 +344,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
if rp.Done() {
// Reject attempts to open mount root directory with O_CREAT.
if rp.MustBeDir() {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
if mustCreate {
return nil, linuxerr.EEXIST
@@ -366,11 +365,11 @@ afterTrailingSymlink:
}
// Reject attempts to open directories with O_CREAT.
if rp.MustBeDir() {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
name := rp.Component()
if name == "." || name == ".." {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
if len(name) > linux.NAME_MAX {
return nil, linuxerr.ENAMETOOLONG
@@ -457,7 +456,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
case *directory:
// Can't open directories writably.
if ats&vfs.MayWrite != 0 {
- return nil, syserror.EISDIR
+ return nil, linuxerr.EISDIR
}
var fd directoryFD
fd.LockFD.Init(&d.inode.locks)
@@ -532,7 +531,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
renamed, ok := oldParentDir.childMap[oldName]
if !ok {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil {
return err
@@ -567,7 +566,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
replacedDir, ok := replaced.inode.impl.(*directory)
if ok {
if !renamed.inode.isDir() {
- return syserror.EISDIR
+ return linuxerr.EISDIR
}
if len(replacedDir.childMap) != 0 {
return linuxerr.ENOTEMPTY
@@ -588,7 +587,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can
// only be dead if it was deleted.
if newParentDir.dentry.vfsd.IsDead() {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
// Linux places this check before some of those above; we do it here for
@@ -654,7 +653,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
}
child, ok := parentDir.childMap[name]
if !ok {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
return err
@@ -754,17 +753,17 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
}
name := rp.Component()
if name == "." || name == ".." {
- return syserror.EISDIR
+ return linuxerr.EISDIR
}
child, ok := parentDir.childMap[name]
if !ok {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
return err
}
if child.inode.isDir() {
- return syserror.EISDIR
+ return linuxerr.EISDIR
}
if rp.MustBeDir() {
return linuxerr.ENOTDIR
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index 418c7994e..99afd9817 100644
--- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -25,7 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -202,7 +201,7 @@ func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
readData := make([]byte, 1)
dst := usermem.BytesIOSequence(readData)
bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
- if err != syserror.ErrWouldBlock {
+ if err != linuxerr.ErrWouldBlock {
t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err)
}
if bytesRead != 0 {
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index 4393cc13b..cb7711b39 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -21,10 +21,10 @@ import (
"testing"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -146,7 +146,7 @@ func TestLocks(t *testing.T) {
if err := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, nil); err != nil {
t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
}
- if got, want := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want {
+ if got, want := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.WriteLock, nil), linuxerr.ErrWouldBlock; got != want {
t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want)
}
if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil {
@@ -165,7 +165,7 @@ func TestLocks(t *testing.T) {
if err := fd.Impl().LockPOSIX(ctx, uid1, 0 /* ownerPID */, lock.WriteLock, lock.LockRange{Start: 0, End: 1}, nil); err != nil {
t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
}
- if got, want := fd.Impl().LockPOSIX(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, lock.LockRange{Start: 0, End: 1}, nil), syserror.ErrWouldBlock; got != want {
+ if got, want := fd.Impl().LockPOSIX(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, lock.LockRange{Start: 0, End: 1}, nil), linuxerr.ErrWouldBlock; got != want {
t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want)
}
if err := fd.Impl().UnlockPOSIX(ctx, uid1, lock.LockRange{Start: 0, End: 1}); err != nil {
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index f2250c025..feafb06e4 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -44,7 +44,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
)
// Name is the default filesystem name.
@@ -556,7 +555,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.
needsCtimeBump = true
}
case *directory:
- return syserror.EISDIR
+ return linuxerr.EISDIR
default:
return linuxerr.EINVAL
}
diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD
index 1d855234c..c12abdf33 100644
--- a/pkg/sentry/fsimpl/verity/BUILD
+++ b/pkg/sentry/fsimpl/verity/BUILD
@@ -1,10 +1,24 @@
load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
licenses(["notice"])
+go_template_instance(
+ name = "dentry_list",
+ out = "dentry_list.go",
+ package = "verity",
+ prefix = "dentry",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*dentry",
+ "Linker": "*dentry",
+ },
+)
+
go_library(
name = "verity",
srcs = [
+ "dentry_list.go",
"filesystem.go",
"save_restore.go",
"verity.go",
@@ -28,7 +42,6 @@ go_library(
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/vfs",
"//pkg/sync",
- "//pkg/syserror",
"//pkg/usermem",
],
)
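
The go_template_instance above generates dentry_list.go, an intrusive doubly linked list specialized to *dentry (with dentryEntry as the embedded linker). The verity.go changes later in this diff use it as the LRU of cached dentries. A short sketch of how the generated type is driven, based only on the calls that appear below (PushFront, Back, Remove):

// Sketch, not part of the patch: how the generated dentryList is used as an
// LRU. d is a *dentry; dentryEntry is embedded in dentry, so the list links
// through the dentry itself with no separate node allocation.
func lruSketch(cached *dentryList, d *dentry) {
	cached.PushFront(d) // newly cached dentries go to the front
	if victim := cached.Back(); victim != nil { // least recently used
		cached.Remove(victim) // evict it
	}
}
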
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index 930016a3e..52d47994d 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -32,7 +32,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -67,40 +66,23 @@ func putDentrySlice(ds *[]*dentry) {
dentrySlicePool.Put(ds)
}
-// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls
-// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for
+// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls
+// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for
// writing.
//
// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
// but dentry slices are allocated lazily, and it's much easier to say "defer
-// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() {
-// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this.
+// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this.
// +checklocksrelease:fs.renameMu
-func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) {
fs.renameMu.RUnlock()
if *ds == nil {
return
}
- if len(**ds) != 0 {
- fs.renameMu.Lock()
- for _, d := range **ds {
- d.checkDropLocked(ctx)
- }
- fs.renameMu.Unlock()
- }
- putDentrySlice(*ds)
-}
-
-// +checklocksrelease:fs.renameMu
-func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
- if *ds == nil {
- fs.renameMu.Unlock()
- return
- }
for _, d := range **ds {
- d.checkDropLocked(ctx)
+ d.checkCachingLocked(ctx, false /* renameMuWriteLocked */)
}
- fs.renameMu.Unlock()
putDentrySlice(*ds)
}
@@ -166,7 +148,7 @@ afterSymlink:
// verifyChildLocked verifies the hash of child against the already verified
// hash of the parent to ensure the child is expected. verifyChild triggers a
// sentry panic if unexpected modifications to the file system are detected. In
-// ErrorOnViolation mode it returns a syserror instead.
+// ErrorOnViolation mode it returns a linuxerr instead.
//
// Preconditions:
// * fs.renameMu must be locked.
@@ -547,7 +529,7 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
if parent.verityEnabled() {
if _, ok := parent.childrenNames[name]; !ok {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
}
@@ -595,23 +577,6 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
}
}
- // Clear the Merkle tree file if they are to be generated at runtime.
- // TODO(b/182315468): Optimize the Merkle tree generate process to
- // allow only updating certain files/directories.
- if fs.allowRuntimeEnable {
- childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
- Root: childMerkleVD,
- Start: childMerkleVD,
- }, &vfs.OpenOptions{
- Flags: linux.O_RDWR | linux.O_TRUNC,
- Mode: 0644,
- })
- if err != nil {
- return nil, err
- }
- childMerkleFD.DecRef(ctx)
- }
-
// The dentry needs to be cleaned up if any error occurs. IncRef will be
// called if a verity child dentry is successfully created.
defer childMerkleVD.DecRef(ctx)
@@ -718,7 +683,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
}
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return err
@@ -730,7 +695,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return nil, err
@@ -751,7 +716,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
start := rp.Start().Impl().(*dentry)
d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
if err != nil {
@@ -788,7 +753,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
start := rp.Start().Impl().(*dentry)
if rp.Done() {
@@ -970,7 +935,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return "", err
@@ -1000,7 +965,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return linux.Statx{}, err
@@ -1046,7 +1011,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
return nil, err
}
@@ -1057,7 +1022,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return nil, err
@@ -1073,7 +1038,7 @@ func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
var ds *[]*dentry
fs.renameMu.RLock()
- defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+ defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
d, err := fs.resolveLocked(ctx, rp, &ds)
if err != nil {
return "", err
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index c5fa9855b..d2526263c 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -23,10 +23,12 @@
// Lock order:
//
// filesystem.renameMu
-// dentry.dirMu
-// fileDescription.mu
-// filesystem.verityMu
-// dentry.hashMu
+// dentry.cachingMu
+// filesystem.cacheMu
+// dentry.dirMu
+// fileDescription.mu
+// filesystem.verityMu
+// dentry.hashMu
//
// Locking dentry.dirMu in multiple dentries requires that parent dentries are
// locked before child dentries, and that filesystem.renameMu is locked to
@@ -60,7 +62,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -97,6 +98,9 @@ const (
// sizeOfStringInt32 is the size for a 32 bit integer stored as string in
// extended attributes. The maximum value of a 32 bit integer has 10 digits.
sizeOfStringInt32 = 10
+
+ // defaultMaxCachedDentries is the default limit of dentry cache.
+ defaultMaxCachedDentries = uint64(1000)
)
var (
@@ -107,9 +111,10 @@ var (
// Mount option names for verityfs.
const (
- moptLowerPath = "lower_path"
- moptRootHash = "root_hash"
- moptRootName = "root_name"
+ moptLowerPath = "lower_path"
+ moptRootHash = "root_hash"
+ moptRootName = "root_name"
+ moptDentryCacheLimit = "dentry_cache_limit"
)
// HashAlgorithm is a type specifying the algorithm used to hash the file
@@ -189,6 +194,17 @@ type filesystem struct {
// dentries.
renameMu sync.RWMutex `state:"nosave"`
+ // cachedDentries contains all dentries with 0 references. (Due to race
+ // conditions, it may also contain dentries with non-zero references.)
+ // cachedDentriesLen is the number of dentries in cachedDentries. These
+ // fields are protected by cacheMu.
+ cacheMu sync.Mutex `state:"nosave"`
+ cachedDentries dentryList
+ cachedDentriesLen uint64
+
+ // maxCachedDentries is the maximum size of filesystem.cachedDentries.
+ maxCachedDentries uint64
+
// verityMu synchronizes enabling verity files, protects files or
// directories from being enabled by different threads simultaneously.
// It also ensures that verity does not access files that are being
@@ -199,6 +215,10 @@ type filesystem struct {
// is for the whole file system to ensure that no more than one file is
// enabled at the same time.
verityMu sync.RWMutex `state:"nosave"`
+
+ // released is nonzero once filesystem.Release has been called. It is accessed
+ // with atomic memory operations.
+ released int32
}
// InternalFilesystemOptions may be passed as
@@ -239,7 +259,7 @@ func (FilesystemType) Release(ctx context.Context) {}
// mode, it returns EIO; otherwise it panics.
func (fs *filesystem) alertIntegrityViolation(msg string) error {
if fs.action == ErrorOnViolation {
- return syserror.EIO
+ return linuxerr.EIO
}
panic(msg)
}
@@ -267,6 +287,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
delete(mopts, moptRootName)
rootName = root
}
+ maxCachedDentries := defaultMaxCachedDentries
+ if str, ok := mopts[moptDentryCacheLimit]; ok {
+ delete(mopts, moptDentryCacheLimit)
+ maxCD, err := strconv.ParseUint(str, 10, 64)
+ if err != nil {
+ ctx.Warningf("verity.FilesystemType.GetFilesystem: invalid dentry cache limit: %s=%s", moptDentryCacheLimit, str)
+ return nil, nil, linuxerr.EINVAL
+ }
+ maxCachedDentries = maxCD
+ }
// Check for unparsed options.
if len(mopts) != 0 {
@@ -340,12 +370,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
action: iopts.Action,
opts: opts.Data,
allowRuntimeEnable: iopts.AllowRuntimeEnable,
+ maxCachedDentries: maxCachedDentries,
}
fs.vfsfs.Init(vfsObj, &fstype, fs)
// Construct the root dentry.
d := fs.newDentry()
- d.refs = 1
+ // Set the root's reference count to 2. One reference is returned to
+ // the caller, and the other is held by fs to prevent the root from
+ // being "cached" and subsequently evicted.
+ d.refs = 2
lowerVD := vfs.MakeVirtualDentry(lowerMount, lowerMount.Root())
lowerVD.IncRef()
d.lowerVD = lowerVD
@@ -520,7 +554,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
+ atomic.StoreInt32(&fs.released, 1)
fs.lowerMount.DecRef(ctx)
+
+ fs.renameMu.Lock()
+ fs.evictAllCachedDentriesLocked(ctx)
+ fs.renameMu.Unlock()
+
+ // An extra reference was held by the filesystem on the root to prevent
+ // it from being cached/evicted.
+ fs.rootDentry.DecRef(ctx)
}
// MountOptions implements vfs.FilesystemImpl.MountOptions.
@@ -534,6 +577,11 @@ func (fs *filesystem) MountOptions() string {
type dentry struct {
vfsd vfs.Dentry
+ // refs is the reference count. Each dentry holds a reference on its
+ // parent, even if disowned. When refs reaches 0, the dentry may be
+ // added to the cache or destroyed. If refs == -1, the dentry has
+ // already been destroyed. refs is accessed using atomic memory
+ // operations.
refs int64
// fs is the owning filesystem. fs is immutable.
@@ -588,13 +636,23 @@ type dentry struct {
// is protected by hashMu.
hashMu sync.RWMutex `state:"nosave"`
hash []byte
+
+ // cachingMu is used to synchronize concurrent dentry caching attempts on
+ // this dentry.
+ cachingMu sync.Mutex `state:"nosave"`
+
+ // If cached is true, dentryEntry links dentry into
+ // filesystem.cachedDentries. cached and dentryEntry are protected by
+ // cachingMu.
+ cached bool
+ dentryEntry
}
// newDentry creates a new dentry representing the given verity file. The
-// dentry initially has no references; it is the caller's responsibility to set
-// the dentry's reference count and/or call dentry.destroy() as appropriate.
-// The dentry is initially invalid in that it contains no underlying dentry;
-// the caller is responsible for setting them.
+// dentry initially has no references, but is not cached; it is the caller's
+// responsibility to set the dentry's reference count and/or call
+// dentry.destroy() as appropriate. The dentry is initially invalid in that it
+// contains no underlying dentry; the caller is responsible for setting them.
func (fs *filesystem) newDentry() *dentry {
d := &dentry{
fs: fs,
@@ -630,42 +688,23 @@ func (d *dentry) TryIncRef() bool {
// DecRef implements vfs.DentryImpl.DecRef.
func (d *dentry) DecRef(ctx context.Context) {
- r := atomic.AddInt64(&d.refs, -1)
- if d.LogRefs() {
- refsvfs2.LogDecRef(d, r)
- }
- if r == 0 {
- d.fs.renameMu.Lock()
- d.checkDropLocked(ctx)
- d.fs.renameMu.Unlock()
- } else if r < 0 {
- panic("verity.dentry.DecRef() called without holding a reference")
+ if d.decRefNoCaching() == 0 {
+ d.checkCachingLocked(ctx, false /* renameMuWriteLocked */)
}
}
-func (d *dentry) decRefLocked(ctx context.Context) {
+// decRefNoCaching decrements d's reference count without calling
+// d.checkCachingLocked, even if d's reference count reaches 0; callers are
+// responsible for ensuring that d.checkCachingLocked will be called later.
+func (d *dentry) decRefNoCaching() int64 {
r := atomic.AddInt64(&d.refs, -1)
if d.LogRefs() {
refsvfs2.LogDecRef(d, r)
}
- if r == 0 {
- d.checkDropLocked(ctx)
- } else if r < 0 {
- panic("verity.dentry.decRefLocked() called without holding a reference")
+ if r < 0 {
+ panic("verity.dentry.decRefNoCaching() called without holding a reference")
}
-}
-
-// checkDropLocked should be called after d's reference count becomes 0 or it
-// becomes deleted.
-func (d *dentry) checkDropLocked(ctx context.Context) {
- // Dentries with a positive reference count must be retained. Dentries
- // with a negative reference count have already been destroyed.
- if atomic.LoadInt64(&d.refs) != 0 {
- return
- }
- // Refs is still zero; destroy it.
- d.destroyLocked(ctx)
- return
+ return r
}
// destroyLocked destroys the dentry.
@@ -684,6 +723,12 @@ func (d *dentry) destroyLocked(ctx context.Context) {
panic("verity.dentry.destroyLocked() called with references on the dentry")
}
+ // Drop the reference held by d on its parent without recursively
+ // locking d.fs.renameMu.
+ if d.parent != nil && d.parent.decRefNoCaching() == 0 {
+ d.parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */)
+ }
+
if d.lowerVD.Ok() {
d.lowerVD.DecRef(ctx)
}
@@ -696,7 +741,6 @@ func (d *dentry) destroyLocked(ctx context.Context) {
delete(d.parent.children, d.name)
}
d.parent.dirMu.Unlock()
- d.parent.decRefLocked(ctx)
}
refsvfs2.Unregister(d)
}
@@ -735,6 +779,140 @@ func (d *dentry) OnZeroWatches(context.Context) {
//TODO(b/159261227): Implement OnZeroWatches.
}
+// checkCachingLocked should be called after d's reference count becomes 0 or
+// it becomes disowned.
+//
+// For performance, checkCachingLocked can also be called after d's reference
+// count becomes non-zero, so that d can be removed from the LRU cache. This
+// may help in reducing the size of the cache and hence reduce evictions. Note
+// that this is not necessary for correctness.
+//
+// It may be called on a destroyed dentry. For example,
+// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times
+// for the same dentry when the dentry is visited more than once in the same
+// operation. One of the calls may destroy the dentry, so subsequent calls will
+// do nothing.
+//
+// Preconditions: d.fs.renameMu must be locked for writing if
+// renameMuWriteLocked is true; it may be temporarily unlocked.
+func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) {
+ d.cachingMu.Lock()
+ refs := atomic.LoadInt64(&d.refs)
+ if refs == -1 {
+ // Dentry has already been destroyed.
+ d.cachingMu.Unlock()
+ return
+ }
+ if refs > 0 {
+ // fs.cachedDentries is permitted to contain dentries with non-zero refs,
+ // which are skipped by fs.evictCachedDentryLocked() upon reaching the end
+ // of the LRU. But it is still beneficial to remove d from the cache as we
+ // are already holding d.cachingMu. Keeping a cleaner cache also reduces
+ // the number of evictions (which is expensive as it acquires fs.renameMu).
+ d.removeFromCacheLocked()
+ d.cachingMu.Unlock()
+ return
+ }
+
+ if atomic.LoadInt32(&d.fs.released) != 0 {
+ d.cachingMu.Unlock()
+ if !renameMuWriteLocked {
+ // Need to lock d.fs.renameMu to access d.parent. Lock it for writing as
+ // needed by d.destroyLocked() later.
+ d.fs.renameMu.Lock()
+ defer d.fs.renameMu.Unlock()
+ }
+ if d.parent != nil {
+ d.parent.dirMu.Lock()
+ delete(d.parent.children, d.name)
+ d.parent.dirMu.Unlock()
+ }
+ d.destroyLocked(ctx) // +checklocksforce: see above.
+ return
+ }
+
+ d.fs.cacheMu.Lock()
+ // If d is already cached, just move it to the front of the LRU.
+ if d.cached {
+ d.fs.cachedDentries.Remove(d)
+ d.fs.cachedDentries.PushFront(d)
+ d.fs.cacheMu.Unlock()
+ d.cachingMu.Unlock()
+ return
+ }
+ // Cache the dentry, then evict the least recently used cached dentry if
+ // the cache becomes over-full.
+ d.fs.cachedDentries.PushFront(d)
+ d.fs.cachedDentriesLen++
+ d.cached = true
+ shouldEvict := d.fs.cachedDentriesLen > d.fs.maxCachedDentries
+ d.fs.cacheMu.Unlock()
+ d.cachingMu.Unlock()
+
+ if shouldEvict {
+ if !renameMuWriteLocked {
+ // Need to lock d.fs.renameMu for writing as needed by
+ // d.evictCachedDentryLocked().
+ d.fs.renameMu.Lock()
+ defer d.fs.renameMu.Unlock()
+ }
+ d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above.
+ }
+}
+
+// Preconditions: d.cachingMu must be locked.
+func (d *dentry) removeFromCacheLocked() {
+ if d.cached {
+ d.fs.cacheMu.Lock()
+ d.fs.cachedDentries.Remove(d)
+ d.fs.cachedDentriesLen--
+ d.fs.cacheMu.Unlock()
+ d.cached = false
+ }
+}
+
+// Precondition: fs.renameMu must be locked for writing; it may be temporarily
+// unlocked.
+// +checklocks:fs.renameMu
+func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) {
+ for fs.cachedDentriesLen != 0 {
+ fs.evictCachedDentryLocked(ctx)
+ }
+}
+
+// Preconditions:
+// * fs.renameMu must be locked for writing; it may be temporarily unlocked.
+// +checklocks:fs.renameMu
+func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) {
+ fs.cacheMu.Lock()
+ victim := fs.cachedDentries.Back()
+ fs.cacheMu.Unlock()
+ if victim == nil {
+ // fs.cachedDentries may have become empty between when it was
+ // checked and when we locked fs.cacheMu.
+ return
+ }
+
+ victim.cachingMu.Lock()
+ victim.removeFromCacheLocked()
+ // victim.refs may have become non-zero from an earlier path resolution
+ // since it was inserted into fs.cachedDentries.
+ if atomic.LoadInt64(&victim.refs) != 0 {
+ victim.cachingMu.Unlock()
+ return
+ }
+ if victim.parent != nil {
+ victim.parent.dirMu.Lock()
+ // Note that victim can't be a mount point (in any mount
+ // namespace), since VFS holds references on mount points.
+ fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd)
+ delete(victim.parent.children, victim.name)
+ victim.parent.dirMu.Unlock()
+ }
+ victim.cachingMu.Unlock()
+ victim.destroyLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs.
+}
+
func (d *dentry) isSymlink() bool {
return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
}
@@ -1091,6 +1269,21 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) {
return 0, fd.d.fs.alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds")
}
+ // Populate children names here. We cannot rely on the children
+ // dentries to populate parent dentry's children names, because the
+ // parent dentry may be destroyed before users enable verity if its ref
+ // count drops to zero.
+ if fd.d.isDir() {
+ if err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+ if dirent.Name != "." && dirent.Name != ".." {
+ fd.d.childrenNames[dirent.Name] = struct{}{}
+ }
+ return nil
+ })); err != nil {
+ return 0, err
+ }
+ }
+
hash, dataSize, err := fd.generateMerkleLocked(ctx)
if err != nil {
return 0, err
@@ -1118,9 +1311,6 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) {
}); err != nil {
return 0, err
}
-
- // Add the current child's name to parent's childrenNames.
- fd.d.parent.childrenNames[fd.d.name] = struct{}{}
}
// Record the size of the data being hashed for fd.
@@ -1215,7 +1405,7 @@ func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
case linux.FS_IOC_GETFLAGS:
return fd.verityFlags(ctx, args[2].Pointer())
default:
- return 0, syserror.ENOSYS
+ return 0, linuxerr.ENOSYS
}
}
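
The dentry_cache_limit mount option added above is parsed with strconv.ParseUint and defaults to defaultMaxCachedDentries (1000) when absent; a malformed value makes GetFilesystem return EINVAL. A self-contained sketch of equivalent parsing over a "key=value,key=value" data string (the helper name and data format here are assumptions for illustration, not gVisor API):

package main

import (
	"fmt"
	"strconv"
	"strings"
)

const defaultMaxCachedDentries = uint64(1000)

// parseDentryCacheLimit is a hypothetical helper mirroring the option
// handling in verity.GetFilesystem: option absent -> default, malformed
// value -> error (the filesystem returns EINVAL in that case).
func parseDentryCacheLimit(data string) (uint64, error) {
	for _, opt := range strings.Split(data, ",") {
		k, v, ok := strings.Cut(opt, "=")
		if !ok || k != "dentry_cache_limit" {
			continue
		}
		n, err := strconv.ParseUint(v, 10, 64)
		if err != nil {
			return 0, fmt.Errorf("invalid dentry cache limit: %q", v)
		}
		return n, nil
	}
	return defaultMaxCachedDentries, nil
}

func main() {
	n, _ := parseDentryCacheLimit("lower_path=/lower,dentry_cache_limit=250")
	fmt.Println(n) // 250
}
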