diff options
Diffstat (limited to 'pkg/sentry/fsimpl')
57 files changed, 884 insertions, 324 deletions
diff --git a/pkg/sentry/fsimpl/cgroupfs/BUILD b/pkg/sentry/fsimpl/cgroupfs/BUILD index 4c9c5b344..e5fdcc776 100644 --- a/pkg/sentry/fsimpl/cgroupfs/BUILD +++ b/pkg/sentry/fsimpl/cgroupfs/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/context", "//pkg/coverage", "//pkg/errors/linuxerr", + "//pkg/fspath", "//pkg/log", "//pkg/refs", "//pkg/refsvfs2", @@ -43,7 +44,6 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/cgroupfs/base.go b/pkg/sentry/fsimpl/cgroupfs/base.go index 4290ffe0d..71bb0a9c8 100644 --- a/pkg/sentry/fsimpl/cgroupfs/base.go +++ b/pkg/sentry/fsimpl/cgroupfs/base.go @@ -88,7 +88,6 @@ type controller interface { // +stateify savable type cgroupInode struct { dir - fs *filesystem // ts is the list of tasks in this cgroup. The kernel is responsible for // removing tasks from this list before they're destroyed, so any tasks on @@ -102,9 +101,10 @@ var _ kernel.CgroupImpl = (*cgroupInode)(nil) func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode { c := &cgroupInode{ - fs: fs, - ts: make(map[*kernel.Task]struct{}), + dir: dir{fs: fs}, + ts: make(map[*kernel.Task]struct{}), } + c.dir.cgi = c contents := make(map[string]kernfs.Inode) contents["cgroup.procs"] = fs.newControllerFile(ctx, creds, &cgroupProcsData{c}) @@ -115,8 +115,7 @@ func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credential } c.dir.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555)) - c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - c.dir.InitRefs() + c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) c.dir.IncLinks(c.dir.OrderedChildren.Populate(contents)) atomic.AddUint64(&fs.numCgroups, 1) diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go index 22c8b7fda..edc3b50b9 100644 --- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go +++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go @@ -32,7 +32,8 @@ // controllers associated with them. // // Since cgroupfs doesn't allow hardlinks, there is a unique mapping between -// cgroupfs dentries and inodes. +// cgroupfs dentries and inodes. Thus, cgroupfs inodes don't need to be ref +// counted and exist until they're unlinked once or the FS is destroyed. // // # Synchronization // @@ -48,10 +49,11 @@ // Lock order: // // kernel.CgroupRegistry.mu -// cgroupfs.filesystem.mu -// kernel.TaskSet.mu -// kernel.Task.mu -// cgroupfs.filesystem.tasksMu. +// kernfs.filesystem.mu +// kernel.TaskSet.mu +// kernel.Task.mu +// cgroupfs.filesystem.tasksMu. +// cgroupfs.dir.OrderedChildren.mu package cgroupfs import ( @@ -63,6 +65,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -108,6 +111,7 @@ type FilesystemType struct{} // +stateify savable type InternalData struct { DefaultControlValues map[string]int64 + InitialCgroupPath string } // filesystem implements vfs.FilesystemImpl and kernel.cgroupFS. @@ -134,6 +138,11 @@ type filesystem struct { numCgroups uint64 // Protected by atomic ops. root *kernfs.Dentry + // effectiveRoot is the initial cgroup new tasks are created in. Unless + // overwritten by internal mount options, root == effectiveRoot. If + // effectiveRoot != root, an extra reference is held on effectiveRoot for + // the lifetime of the filesystem. + effectiveRoot *kernfs.Dentry // tasksMu serializes task membership changes across all cgroups within a // filesystem. @@ -229,6 +238,9 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs := vfsfs.Impl().(*filesystem) ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID) fs.root.IncRef() + if fs.effectiveRoot != fs.root { + fs.effectiveRoot.IncRef() + } return vfsfs, fs.root.VFSDentry(), nil } @@ -245,8 +257,8 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt var defaults map[string]int64 if opts.InternalData != nil { - ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults) defaults = opts.InternalData.(*InternalData).DefaultControlValues + ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults) } for _, ty := range wantControllers { @@ -286,6 +298,14 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt var rootD kernfs.Dentry rootD.InitRoot(&fs.Filesystem, root) fs.root = &rootD + fs.effectiveRoot = fs.root + + if err := fs.prepareInitialCgroup(ctx, vfsObj, opts); err != nil { + ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: failed to prepare initial cgroup: %v", err) + rootD.DecRef(ctx) + fs.VFSFilesystem().DecRef(ctx) + return nil, nil, err + } // Register controllers. The registry may be modified concurrently, so if we // get an error, we raced with someone else who registered the same @@ -303,10 +323,47 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return fs.VFSFilesystem(), rootD.VFSDentry(), nil } +// prepareInitialCgroup creates the initial cgroup according to opts. An initial +// cgroup is optional, and if not specified, this function is a no-op. +func (fs *filesystem) prepareInitialCgroup(ctx context.Context, vfsObj *vfs.VirtualFilesystem, opts vfs.GetFilesystemOptions) error { + if opts.InternalData == nil { + return nil + } + initPathStr := opts.InternalData.(*InternalData).InitialCgroupPath + if initPathStr == "" { + return nil + } + ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path: %v", initPathStr) + initPath := fspath.Parse(initPathStr) + if !initPath.Absolute || !initPath.HasComponents() { + ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path invalid: %+v", initPath) + return linuxerr.EINVAL + } + + // Have initial cgroup target, create the tree. + cgDir := fs.root.Inode().(*cgroupInode) + for pit := initPath.Begin; pit.Ok(); pit = pit.Next() { + cgDirI, err := cgDir.NewDir(ctx, pit.String(), vfs.MkdirOptions{}) + if err != nil { + return err + } + cgDir = cgDirI.(*cgroupInode) + } + + // Walk to target dentry. + initDentry, err := fs.root.WalkDentryTree(ctx, vfsObj, initPath) + if err != nil { + ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup dentry not found: %v", err) + return linuxerr.ENOENT + } + fs.effectiveRoot = initDentry // Reference from WalkDentryTree transferred here. + return nil +} + func (fs *filesystem) rootCgroup() kernel.Cgroup { return kernel.Cgroup{ - Dentry: fs.root, - CgroupImpl: fs.root.Inode().(kernel.CgroupImpl), + Dentry: fs.effectiveRoot, + CgroupImpl: fs.effectiveRoot.Inode().(kernel.CgroupImpl), } } @@ -320,6 +377,10 @@ func (fs *filesystem) Release(ctx context.Context) { r.Unregister(fs.hierarchyID) } + if fs.root != fs.effectiveRoot { + fs.effectiveRoot.DecRef(ctx) + } + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } @@ -346,15 +407,18 @@ func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error // // +stateify savable type dir struct { - dirRefs + kernfs.InodeNoopRefCount kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren // TODO(b/183137098): Implement mkdir. + kernfs.InodeDirectoryNoNewChildren kernfs.OrderedChildren implStatFS locks vfs.FileLocks + + fs *filesystem // Immutable. + cgi *cgroupInode // Immutable. } // Keep implements kernfs.Inode.Keep. @@ -378,9 +442,100 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry return fd.VFSFileDescription(), nil } -// DecRef implements kernfs.Inode.DecRef. -func (d *dir) DecRef(ctx context.Context) { - d.dirRefs.DecRef(func() { d.Destroy(ctx) }) +// NewDir implements kernfs.Inode.NewDir. +func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { + // "Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable." + // -- Linux, kernel/cgroup.c:cgroup_mkdir(). + if strings.Contains(name, "\n") { + return nil, linuxerr.EINVAL + } + return d.OrderedChildren.Inserter(name, func() kernfs.Inode { + d.IncLinks(1) + return d.fs.newCgroupInode(ctx, auth.CredentialsFromContext(ctx)) + }) +} + +// Rename implements kernfs.Inode.Rename. Cgroupfs only allows renaming of +// cgroup directories, and the rename may only change the name within the same +// parent. See linux, kernel/cgroup.c:cgroup_rename(). +func (d *dir) Rename(ctx context.Context, oldname, newname string, child, dst kernfs.Inode) error { + if _, ok := child.(*cgroupInode); !ok { + // Not a cgroup directory. Control files are backed by different types. + return linuxerr.ENOTDIR + } + + dstCGInode, ok := dst.(*cgroupInode) + if !ok { + // Not a cgroup inode, so definitely can't be *this* inode. + return linuxerr.EIO + } + // Note: We're intentionally comparing addresses, since two different dirs + // could plausibly be identical in memory, but would occupy different + // locations in memory. + if d != &dstCGInode.dir { + // Destination dir is a different cgroup inode. Cross directory renames + // aren't allowed. + return linuxerr.EIO + } + + // Rename moves oldname to newname within d. Proceed. + return d.OrderedChildren.Rename(ctx, oldname, newname, child, dst) +} + +// Unlink implements kernfs.Inode.Unlink. Cgroupfs disallows unlink, as the only +// files in the filesystem are control files, which can't be deleted. +func (d *dir) Unlink(ctx context.Context, name string, child kernfs.Inode) error { + return linuxerr.EPERM +} + +// hasChildrenLocked returns whether the cgroup dir contains any objects that +// prevent it from being deleted. +func (d *dir) hasChildrenLocked() bool { + // Subdirs take a link on the parent, so checks if there are any direct + // children cgroups. Exclude the dir's self link and the link from ".". + if d.InodeAttrs.Links()-2 > 0 { + return true + } + return len(d.cgi.ts) > 0 +} + +// HasChildren implements kernfs.Inode.HasChildren. +// +// The empty check for a cgroupfs directory is unlike a regular directory since +// a cgroupfs directory will always have control files. A cgroupfs directory can +// be deleted if cgroup contains no tasks and has no sub-cgroups. +func (d *dir) HasChildren() bool { + d.fs.tasksMu.RLock() + defer d.fs.tasksMu.RUnlock() + return d.hasChildrenLocked() +} + +// RmDir implements kernfs.Inode.RmDir. +func (d *dir) RmDir(ctx context.Context, name string, child kernfs.Inode) error { + // Unlike a normal directory, we need to recheck if d is empty again, since + // vfs/kernfs can't stop tasks from entering or leaving the cgroup. + d.fs.tasksMu.RLock() + defer d.fs.tasksMu.RUnlock() + + cgi, ok := child.(*cgroupInode) + if !ok { + return linuxerr.ENOTDIR + } + if cgi.dir.hasChildrenLocked() { + return linuxerr.ENOTEMPTY + } + + // Disallow deletion of the effective root cgroup. + if cgi == d.fs.effectiveRoot.Inode().(*cgroupInode) { + ctx.Warningf("Cannot delete initial cgroup for new tasks %q", d.fs.effectiveRoot.FSLocalPath()) + return linuxerr.EBUSY + } + + err := d.OrderedChildren.RmDir(ctx, name, child) + if err == nil { + d.InodeAttrs.DecLinks() + } + return err } // controllerFile represents a generic control file that appears within a cgroup diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD index f981ff296..e0b879339 100644 --- a/pkg/sentry/fsimpl/devpts/BUILD +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -45,7 +45,6 @@ go_library( "//pkg/sentry/unimpl", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 7a488e9fd..e711debcb 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -29,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Name is the filesystem name. @@ -180,7 +179,7 @@ func (i *rootInode) allocateTerminal(ctx context.Context, creds *auth.Credential i.mu.Lock() defer i.mu.Unlock() if i.nextIdx == math.MaxUint32 { - return nil, syserror.ENOMEM + return nil, linuxerr.ENOMEM } idx := i.nextIdx i.nextIdx++ @@ -241,7 +240,7 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, erro // Not a static entry. idx, err := strconv.ParseUint(name, 10, 32) if err != nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } i.mu.Lock() defer i.mu.Unlock() @@ -250,7 +249,7 @@ func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, erro return ri, nil } - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } // IterDirents implements kernfs.Inode.IterDirents. diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go index 9cb21e83b..609623f9f 100644 --- a/pkg/sentry/fsimpl/devpts/line_discipline.go +++ b/pkg/sentry/fsimpl/devpts/line_discipline.go @@ -20,10 +20,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -203,7 +203,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque } else if notifyEcho { l.masterWaiter.Notify(waiter.ReadableEvents) } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { @@ -220,7 +220,7 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ l.replicaWaiter.Notify(waiter.ReadableEvents) return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { @@ -242,7 +242,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ } return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { @@ -257,7 +257,7 @@ func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSeq l.masterWaiter.Notify(waiter.ReadableEvents) return n, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // transformer is a helper interface to make it easier to stateify queue. diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go index ff1d89955..85aeefa43 100644 --- a/pkg/sentry/fsimpl/devpts/queue.go +++ b/pkg/sentry/fsimpl/devpts/queue.go @@ -17,12 +17,12 @@ package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -110,7 +110,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl defer q.mu.Unlock() if !q.readable { - return 0, false, false, syserror.ErrWouldBlock + return 0, false, false, linuxerr.ErrWouldBlock } if dst.NumBytes() > canonMaxBytes { @@ -156,7 +156,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip room := waitBufMaxBytes - q.waitBufLen // If out of room, return EAGAIN. if room == 0 && copyLen > 0 { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Cap the size of the wait buffer. if copyLen > room { diff --git a/pkg/sentry/fsimpl/eventfd/BUILD b/pkg/sentry/fsimpl/eventfd/BUILD index c09fdc7f9..1cb049a29 100644 --- a/pkg/sentry/fsimpl/eventfd/BUILD +++ b/pkg/sentry/fsimpl/eventfd/BUILD @@ -9,11 +9,11 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/fdnotifier", "//pkg/hostarch", "//pkg/log", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go index 4f79cfcb7..af5ba5131 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -22,11 +22,11 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -149,7 +149,7 @@ func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem var buf [8]byte if _, err := unix.Read(efd.hostfd, buf[:]); err != nil { if err == unix.EWOULDBLOCK { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } return err } @@ -167,7 +167,7 @@ func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequenc // We can't complete the read if the value is currently zero. if efd.val == 0 { efd.mu.Unlock() - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } // Update the value based on the mode the event is operating in. @@ -200,7 +200,7 @@ func (efd *EventFileDescription) hostWriteLocked(val uint64) error { hostarch.ByteOrder.PutUint64(buf[:], val) _, err := unix.Write(efd.hostfd, buf[:]) if err == unix.EWOULDBLOCK { - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } return err } @@ -232,7 +232,7 @@ func (efd *EventFileDescription) Signal(val uint64) error { // uint64 minus 1. if val > math.MaxUint64-1-efd.val { efd.mu.Unlock() - return syserror.ErrWouldBlock + return linuxerr.ErrWouldBlock } efd.val += val diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/pkg/sentry/fsimpl/ext/BUILD diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 871df5984..05c4fbeb2 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -59,7 +59,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", @@ -84,7 +83,6 @@ go_test( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index dab1e779d..0f855ac59 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -38,7 +37,7 @@ type fuseDevice struct{} // Open implements vfs.Device.Open. func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { if !kernel.FUSEEnabled { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } var fd DeviceFD @@ -126,7 +125,7 @@ func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset in return 0, linuxerr.EPERM } - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // Read implements vfs.FileDescriptionImpl.Read. @@ -192,7 +191,7 @@ func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts } if req == nil { - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // We already checked the size: dst must be able to fit the whole request. @@ -205,7 +204,7 @@ func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts return 0, err } if n != len(req.data) { - return 0, syserror.EIO + return 0, linuxerr.EIO } if req.hdr.Opcode == linux.FUSE_WRITE { @@ -214,7 +213,7 @@ func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts return 0, err } if written != len(req.payload) { - return 0, syserror.EIO + return 0, linuxerr.EIO } n += int(written) } @@ -238,7 +237,7 @@ func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset i return 0, linuxerr.EPERM } - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // Write implements vfs.FileDescriptionImpl.Write. @@ -395,7 +394,7 @@ func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64 return 0, linuxerr.EPERM } - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } // sendResponse sends a response to the waiting task (if any). diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go index 04250d796..8951b5ba8 100644 --- a/pkg/sentry/fsimpl/fuse/dev_test.go +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -20,11 +20,11 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -186,7 +186,7 @@ func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem. // "would block". n, err = dev.Read(serverTask, inIOseq, vfs.ReadOptions{}) total += n - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { break } diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go index fcc5d9a2a..9611edd5a 100644 --- a/pkg/sentry/fsimpl/fuse/directory.go +++ b/pkg/sentry/fsimpl/fuse/directory.go @@ -19,10 +19,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -32,27 +32,27 @@ type directoryFD struct { // Allocate implements directoryFD.Allocate. func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { - return syserror.EISDIR + return linuxerr.EISDIR } // PRead implements vfs.FileDescriptionImpl.PRead. func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // Read implements vfs.FileDescriptionImpl.Read. func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // Write implements vfs.FileDescriptionImpl.Write. func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.EISDIR + return 0, linuxerr.EISDIR } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 172cbd88f..af16098d2 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -30,7 +30,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -612,7 +611,7 @@ func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMo return nil, err } if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) { - return nil, syserror.EIO + return nil, linuxerr.EIO } child := i.fs.newInode(ctx, out.NodeID, out.Attr) return child, nil diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go index 35d0ab6f4..fe119aa43 100644 --- a/pkg/sentry/fsimpl/fuse/read_write.go +++ b/pkg/sentry/fsimpl/fuse/read_write.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" ) // ReadInPages sends FUSE_READ requests for the size after round it up to @@ -221,7 +220,7 @@ func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, // Write more than requested? EIO. if out.Size > toWrite { - return 0, syserror.EIO + return 0, linuxerr.EIO } written += out.Size diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go index 6c4de3507..38cde8208 100644 --- a/pkg/sentry/fsimpl/fuse/regular_file.go +++ b/pkg/sentry/fsimpl/fuse/regular_file.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -108,7 +107,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs return 0, err } if int64(cp) != toCopy { - return 0, syserror.EIO + return 0, linuxerr.EIO } copied += toCopy } @@ -205,7 +204,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off return 0, offset, err } if int64(cp) != srclen { - return 0, offset, syserror.EIO + return 0, offset, linuxerr.EIO } n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data) @@ -216,7 +215,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off if n == 0 { // We have checked srclen != 0 previously. // If err == nil, then it's a short write and we return EIO. - return 0, offset, syserror.EIO + return 0, offset, linuxerr.EIO } written = int64(n) diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 752060044..4244f2cf5 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -79,7 +79,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/unet", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 05b776c2e..00228c469 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -33,7 +33,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Sync implements vfs.FilesystemImpl.Sync. @@ -235,7 +234,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s } if child, ok := parent.children[name]; ok || parent.isSynthetic() { if child == nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return child, nil } @@ -349,7 +348,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return linuxerr.EEXIST } if parent.isDeleted() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, name, &ds); err != nil { return err @@ -395,7 +394,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return err } if !dir && rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } if parent.isSynthetic() { if createInSyntheticDir == nil { @@ -463,7 +462,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } } else { if name == "." || name == ".." { - return syserror.EISDIR + return linuxerr.EISDIR } } @@ -486,7 +485,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b child, ok = parent.children[name] if ok && child == nil { // Hit a negative cached entry, child doesn't exist. - return syserror.ENOENT + return linuxerr.ENOENT } } else { child, _, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) @@ -552,7 +551,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // child must be a non-directory file. if child != nil && child.isDir() { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. - return syserror.EISDIR + return linuxerr.EISDIR } if rp.MustBeDir() { if child != nil { @@ -563,7 +562,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } if parent.isSynthetic() { if child == nil { - return syserror.ENOENT + return linuxerr.ENOENT } } else if child == nil || !child.isSynthetic() { err = parent.file.unlinkAt(ctx, name, flags) @@ -674,7 +673,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return err } if d.nlink == 0 { - return syserror.ENOENT + return linuxerr.ENOENT } if d.nlink == math.MaxUint32 { return linuxerr.EMLINK @@ -811,7 +810,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if rp.Done() { // Reject attempts to open mount root directory with O_CREAT. if mayCreate && rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if mustCreate { return nil, linuxerr.EEXIST @@ -841,7 +840,7 @@ afterTrailingSymlink: } // Reject attempts to open directories with O_CREAT. if mayCreate && rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, rp.Component(), &ds); err != nil { return nil, err @@ -922,11 +921,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open case linux.S_IFDIR: // Can't open directories with O_CREAT. if opts.Flags&linux.O_CREAT != 0 { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } // Can't open directories writably. if ats&vfs.MayWrite != 0 { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL @@ -1054,7 +1053,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving return nil, err } if d.isDeleted() { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { @@ -1268,7 +1267,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer newParent.dirMu.Unlock() } if newParent.isDeleted() { - return syserror.ENOENT + return linuxerr.ENOENT } replaced, err := fs.getChildLocked(ctx, newParent, newName, &ds) if err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) { @@ -1282,7 +1281,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa replacedVFSD = &replaced.vfsd if replaced.isDir() { if !renamed.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if genericIsAncestorDentry(replaced, renamed) { return linuxerr.ENOTEMPTY diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 25d2e39d6..bd6b30397 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -62,7 +62,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/unet" ) @@ -865,11 +864,11 @@ func dentryAttrMask() p9.AttrMask { func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) { if !mask.Mode { ctx.Warningf("can't create gofer.dentry without file type") - return nil, syserror.EIO + return nil, linuxerr.EIO } if attr.Mode.FileType() == p9.ModeRegular && !mask.Size { ctx.Warningf("can't create regular file gofer.dentry without file size") - return nil, syserror.EIO + return nil, linuxerr.EIO } d := &dentry{ @@ -1112,7 +1111,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs case linux.S_IFREG: // ok case linux.S_IFDIR: - return syserror.EISDIR + return linuxerr.EISDIR default: return linuxerr.EINVAL } diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go index 5c57f6fea..02540a754 100644 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/hostfd" + "gvisor.dev/gvisor/pkg/sync" ) // handle represents a remote "open file descriptor", consisting of an opened @@ -130,3 +131,43 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o } return uint64(n), cperr } + +type handleReadWriter struct { + ctx context.Context + h *handle + off uint64 +} + +var handleReadWriterPool = sync.Pool{ + New: func() interface{} { + return &handleReadWriter{} + }, +} + +func getHandleReadWriter(ctx context.Context, h *handle, offset int64) *handleReadWriter { + rw := handleReadWriterPool.Get().(*handleReadWriter) + rw.ctx = ctx + rw.h = h + rw.off = uint64(offset) + return rw +} + +func putHandleReadWriter(rw *handleReadWriter) { + rw.ctx = nil + rw.h = nil + handleReadWriterPool.Put(rw) +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *handleReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + n, err := rw.h.readToBlocksAt(rw.ctx, dsts, rw.off) + rw.off += n + return n, err +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (rw *handleReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + n, err := rw.h.writeFromBlocksAt(rw.ctx, srcs, rw.off) + rw.off += n + return n, err +} diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go index 398288ee3..505916a57 100644 --- a/pkg/sentry/fsimpl/gofer/host_named_pipe.go +++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go @@ -22,7 +22,6 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" - "gvisor.dev/gvisor/pkg/syserror" ) // Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create @@ -109,6 +108,6 @@ func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error { return nil case <-cancel: ctx.SleepFinish(false) - return syserror.ErrInterrupted + return linuxerr.ErrInterrupted } } diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go index b0a429d42..5a3ddfc9d 100644 --- a/pkg/sentry/fsimpl/gofer/p9file.go +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -16,9 +16,9 @@ package gofer import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/syserror" ) // p9file is a wrapper around p9.File that provides methods that are @@ -59,7 +59,7 @@ func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file if newfile != nil { p9file{newfile}.close(ctx) } - return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO + return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, linuxerr.EIO } return qids[0], p9file{newfile}, attrMask, attr, nil } diff --git a/pkg/sentry/fsimpl/gofer/save_restore.go b/pkg/sentry/fsimpl/gofer/save_restore.go index e67422a2f..8dcbc61ed 100644 --- a/pkg/sentry/fsimpl/gofer/save_restore.go +++ b/pkg/sentry/fsimpl/gofer/save_restore.go @@ -158,6 +158,10 @@ func (d *dentryPlatformFile) afterLoad() { // afterLoad is invoked by stateify. func (fd *specialFileFD) afterLoad() { fd.handle.fd = -1 + if fd.hostFileMapper.IsInited() { + // Ensure that we don't call fd.hostFileMapper.Init() again. + fd.hostFileMapperInitOnce.Do(func() {}) + } } // CompleteRestore implements diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 4b59c1c3c..a8d47b65b 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -22,13 +22,15 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fsmetric" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -76,6 +78,16 @@ type specialFileFD struct { bufMu sync.Mutex `state:"nosave"` haveBuf uint32 buf []byte + + // If handle.fd >= 0, hostFileMapper caches mappings of handle.fd, and + // hostFileMapperInitOnce is used to initialize it on first use. + hostFileMapperInitOnce sync.Once `state:"nosave"` + hostFileMapper fsutil.HostFileMapper + + // If handle.fd >= 0, fileRefs counts references on memmap.File offsets. + // fileRefs is protected by fileRefsMu. + fileRefsMu sync.Mutex `state:"nosave"` + fileRefs fsutil.FrameRefSet } func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) { @@ -230,23 +242,13 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs } } - // Going through dst.CopyOutFrom() would hold MM locks around file - // operations of unknown duration. For regularFileFD, doing so is necessary - // to support mmap due to lock ordering; MM locks precede dentry.dataMu. - // That doesn't hold here since specialFileFD doesn't client-cache data. - // Just buffer the read instead. - buf := make([]byte, dst.NumBytes()) - n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + rw := getHandleReadWriter(ctx, &fd.handle, offset) + n, err := dst.CopyOutFrom(ctx, rw) + putHandleReadWriter(rw) if linuxerr.Equals(linuxerr.EAGAIN, err) { - err = syserror.ErrWouldBlock - } - if n == 0 { - return bufN, err + err = linuxerr.ErrWouldBlock } - if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil { - return bufN + int64(cp), cperr - } - return bufN + int64(n), err + return bufN + n, err } // Read implements vfs.FileDescriptionImpl.Read. @@ -317,20 +319,15 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off } } - // Do a buffered write. See rationale in PRead. - buf := make([]byte, src.NumBytes()) - copied, copyErr := src.CopyIn(ctx, buf) - if copied == 0 && copyErr != nil { - // Only return the error if we didn't get any data. - return 0, offset, copyErr - } - n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:copied])), uint64(offset)) + rw := getHandleReadWriter(ctx, &fd.handle, offset) + n, err := src.CopyInTo(ctx, rw) + putHandleReadWriter(rw) if linuxerr.Equals(linuxerr.EAGAIN, err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } // Update offset if the offset is valid. if offset >= 0 { - offset += int64(n) + offset += n } // Update file size for regular files. if fd.isRegularFile { @@ -341,10 +338,7 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off atomic.StoreUint64(&d.size, uint64(offset)) } } - if err != nil { - return int64(n), offset, err - } - return int64(n), offset, copyErr + return int64(n), offset, err } // Write implements vfs.FileDescriptionImpl.Write. @@ -412,3 +406,85 @@ func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error } return nil } + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *specialFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + if fd.handle.fd < 0 || fd.filesystem().opts.forcePageCache { + return linuxerr.ENODEV + } + // After this point, fd may be used as a memmap.Mappable and memmap.File. + fd.hostFileMapperInitOnce.Do(fd.hostFileMapper.Init) + return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (fd *specialFileFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { + fd.hostFileMapper.IncRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())}) + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (fd *specialFileFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { + fd.hostFileMapper.DecRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())}) +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (fd *specialFileFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { + return fd.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +func (fd *specialFileFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { + mr := optional + if fd.filesystem().opts.limitHostFDTranslation { + mr = maxFillRange(required, optional) + } + return []memmap.Translation{ + { + Source: mr, + File: fd, + Offset: mr.Start, + Perms: hostarch.AnyAccess, + }, + }, nil +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (fd *specialFileFD) InvalidateUnsavable(ctx context.Context) error { + return nil +} + +// IncRef implements memmap.File.IncRef. +func (fd *specialFileFD) IncRef(fr memmap.FileRange) { + fd.fileRefsMu.Lock() + defer fd.fileRefsMu.Unlock() + fd.fileRefs.IncRefAndAccount(fr) +} + +// DecRef implements memmap.File.DecRef. +func (fd *specialFileFD) DecRef(fr memmap.FileRange) { + fd.fileRefsMu.Lock() + defer fd.fileRefsMu.Unlock() + fd.fileRefs.DecRefAndAccount(fr) +} + +// MapInternal implements memmap.File.MapInternal. +func (fd *specialFileFD) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { + fd.requireHostFD() + return fd.hostFileMapper.MapInternal(fr, int(fd.handle.fd), at.Write) +} + +// FD implements memmap.File.FD. +func (fd *specialFileFD) FD() int { + fd.requireHostFD() + return int(fd.handle.fd) +} + +func (fd *specialFileFD) requireHostFD() { + if fd.handle.fd < 0 { + // This is possible if fd was successfully mmapped before saving, then + // was restored without a host FD. This is unrecoverable: without a + // host FD, we can't mmap this file post-restore. + panic("gofer.specialFileFD can no longer be memory-mapped without a host FD") + } +} diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 476545d00..180a35583 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -70,7 +70,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", - "//pkg/syserror", "//pkg/tcpip", "//pkg/unet", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 89aa7b3d9..984c6e8ee 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -37,7 +37,6 @@ import ( unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -712,7 +711,7 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts if total != 0 { err = nil } else { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } } return total, err @@ -766,7 +765,7 @@ func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opt if !i.seekable { n, err := f.writeToHostFD(ctx, src, -1, opts.Flags) if isBlockError(err) { - err = syserror.ErrWouldBlock + err = linuxerr.ErrWouldBlock } return n, err } diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 7f6ce4ee5..04ac73255 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -346,7 +345,7 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) // If the signal is SIGTTIN, then we are attempting to read // from the TTY. Don't send the signal and return EIO. if sig == linux.SIGTTIN { - return syserror.EIO + return linuxerr.EIO } // Otherwise, we are writing or changing terminal state. This is allowed. @@ -355,7 +354,7 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) // If the process group is an orphan, return EIO. if pg.IsOrphan() { - return syserror.EIO + return linuxerr.EIO } // Otherwise, send the signal to the process group and return ERESTARTSYS. @@ -368,5 +367,5 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) // // Linux ignores the result of kill_pgrp(). _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) - return syserror.ERESTARTSYS + return linuxerr.ERESTARTSYS } diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go index 95d7ebe2e..9850f3f41 100644 --- a/pkg/sentry/fsimpl/host/util.go +++ b/pkg/sentry/fsimpl/host/util.go @@ -42,7 +42,7 @@ func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp { } // isBlockError checks if an error is EAGAIN or EWOULDBLOCK. -// If so, they can be transformed into syserror.ErrWouldBlock. +// If so, they can be transformed into linuxerr.ErrWouldBlock. func isBlockError(err error) bool { return linuxerr.Equals(linuxerr.EAGAIN, err) || linuxerr.Equals(linuxerr.EWOULDBLOCK, err) } diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index d53937db6..4b577ea43 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -119,7 +119,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) @@ -137,6 +136,7 @@ go_test( "//pkg/abi/linux", "//pkg/context", "//pkg/errors/linuxerr", + "//pkg/fspath", "//pkg/log", "//pkg/refs", "//pkg/refsvfs2", @@ -144,7 +144,6 @@ go_test( "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", "@com_github_google_go_cmp//cmp:go_default_library", ], diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 8b008dc10..7db1473c4 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -99,7 +98,7 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *Dentry, children *OrderedChildren, l func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error { if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. - return syserror.EISDIR + return linuxerr.EISDIR } fd.LockFD.Init(locks) fd.seekEnd = fdOpts.SeekEnd diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index a97473f7d..363ebc466 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // stepExistingLocked resolves rp.Component() in parent directory vfsd. @@ -224,7 +223,7 @@ func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string return linuxerr.EEXIST } if parent.VFSDentry().IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite); err != nil { return err @@ -241,7 +240,7 @@ func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) er return linuxerr.EBUSY } if parent.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err @@ -362,7 +361,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return err } if rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } if rp.Mount() != vd.Mount() { return linuxerr.EXDEV @@ -443,7 +442,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } if rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := rp.Mount().CheckBeginWrite(); err != nil { return err @@ -509,7 +508,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf defer unlock() if rp.Done() { if rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if mustCreate { return nil, linuxerr.EEXIST @@ -536,11 +535,11 @@ afterTrailingSymlink: } // Reject attempts to open directories with O_CREAT. if rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } pc := rp.Component() if pc == "." || pc == ".." { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if len(pc) > linux.NAME_MAX { return nil, linuxerr.ENAMETOOLONG @@ -861,7 +860,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return err } if rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := rp.Mount().CheckBeginWrite(); err != nil { return err @@ -895,7 +894,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } if d.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } virtfs := rp.VirtualFilesystem() parentDentry := d.parent diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index a42fc79b4..b96dc9ef7 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -26,7 +26,6 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // InodeNoopRefCount partially implements the Inode interface, specifically the @@ -234,6 +233,11 @@ func (a *InodeAttrs) Mode() linux.FileMode { return linux.FileMode(atomic.LoadUint32(&a.mode)) } +// Links returns the link count. +func (a *InodeAttrs) Links() uint32 { + return atomic.LoadUint32(&a.nlink) +} + // TouchAtime updates a.atime to the current time. func (a *InodeAttrs) TouchAtime(ctx context.Context, mnt *vfs.Mount) { if mnt.Flags.NoATime || mnt.ReadOnly() { @@ -289,7 +293,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut return linuxerr.EPERM } if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { return err @@ -475,7 +479,7 @@ func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error s, ok := o.set[name] if !ok { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } s.inode.IncRef() // This ref is passed to the dentry upon creation via Init. @@ -502,6 +506,30 @@ func (o *OrderedChildren) Insert(name string, child Inode) error { return o.insert(name, child, false) } +// Inserter is like Insert, but obtains the child to insert by calling +// makeChild. makeChild is only called if the insert will succeed. This allows +// the caller to atomically check and insert a child without having to +// clean up the child on failure. +func (o *OrderedChildren) Inserter(name string, makeChild func() Inode) (Inode, error) { + o.mu.Lock() + defer o.mu.Unlock() + if _, ok := o.set[name]; ok { + return nil, linuxerr.EEXIST + } + + // Note: We must not fail after we call makeChild(). + + child := makeChild() + s := &slot{ + name: name, + inode: child, + static: false, + } + o.order.PushBack(s) + o.set[name] = s + return child, nil +} + // insert inserts child into o. // // Precondition: Caller must be holding a ref on child if static is true. @@ -559,7 +587,7 @@ func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, n func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error { s, ok := o.set[name] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } if s.inode != child { panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child)) @@ -746,5 +774,5 @@ type InodeNoStatFS struct{} // StatFS implements Inode.StatFS. func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { - return linux.Statfs{}, syserror.ENOSYS + return linux.Statfs{}, linuxerr.ENOSYS } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 0e2867d49..544698694 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -61,6 +61,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -542,6 +543,63 @@ func (d *Dentry) FSLocalPath() string { return b.String() } +// WalkDentryTree traverses p in the dentry tree for this filesystem. Note that +// this only traverses the dentry tree and is not a general path traversal. No +// symlinks and dynamic children are resolved, and no permission checks are +// performed. The caller is responsible for ensuring the returned Dentry exists +// for an appropriate lifetime. +// +// p is interpreted starting at d, and may be absolute or relative (absolute vs +// relative paths both refer to the same target here, since p is absolute from +// d). p may contain "." and "..", but will not allow traversal above d (similar +// to ".." at the root dentry). +// +// This is useful for filesystem internals, where the filesystem may not be +// mounted yet. For a mounted filesystem, use GetDentryAt. +func (d *Dentry) WalkDentryTree(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (*Dentry, error) { + d.fs.mu.RLock() + defer d.fs.processDeferredDecRefs(ctx) + defer d.fs.mu.RUnlock() + + target := d + + for pit := p.Begin; pit.Ok(); pit = pit.Next() { + pc := pit.String() + + switch { + case target == nil: + return nil, linuxerr.ENOENT + case pc == ".": + // No-op, consume component and continue. + case pc == "..": + if target == d { + // Don't let .. traverse above the start point of the walk. + continue + } + target = target.parent + // Parent doesn't need revalidation since we revalidated it on the + // way to the child, and we're still holding fs.mu. + default: + var err error + + d.dirMu.Lock() + target, err = d.fs.revalidateChildLocked(ctx, vfsObj, target, pc, target.children[pc]) + d.dirMu.Unlock() + + if err != nil { + return nil, err + } + } + } + + if target == nil { + return nil, linuxerr.ENOENT + } + + target.IncRef() + return target, nil +} + // The Inode interface maps filesystem-level operations that operate on paths to // equivalent operations on specific filesystem nodes. // @@ -667,12 +725,15 @@ type inodeDirectory interface { // RmDir removes an empty child directory from this directory // inode. Implementations must update the parent directory's link count, // if required. Implementations are not responsible for checking that child - // is a directory, checking for an empty directory. + // is a directory, or checking for an empty directory. RmDir(ctx context.Context, name string, child Inode) error // Rename is called on the source directory containing an inode being - // renamed. child should point to the resolved child in the source - // directory. + // renamed. child points to the resolved child in the source directory. + // dstDir is guaranteed to be a directory inode. + // + // On a successful call to Rename, the caller updates the dentry tree to + // reflect the name change. // // Precondition: Caller must serialize concurrent calls to Rename. Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 609887943..a2aba9321 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" @@ -346,3 +347,63 @@ func TestDirFDIterDirents(t *testing.T) { "file1": linux.DT_REG, }) } + +func TestDirWalkDentryTree(t *testing.T) { + sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newDir(ctx, creds, 0755, map[string]kernfs.Inode{ + "dir1": fs.newDir(ctx, creds, 0755, nil), + "dir2": fs.newDir(ctx, creds, 0755, map[string]kernfs.Inode{ + "file1": fs.newFile(ctx, creds, staticFileContent), + "dir3": fs.newDir(ctx, creds, 0755, nil), + }), + }) + }) + defer sys.Destroy() + + testWalk := func(from *kernfs.Dentry, getDentryPath, walkPath string, expectedErr error) { + var d *kernfs.Dentry + if getDentryPath != "" { + pop := sys.PathOpAtRoot(getDentryPath) + vd := sys.GetDentryOrDie(pop) + defer vd.DecRef(sys.Ctx) + d = vd.Dentry().Impl().(*kernfs.Dentry) + } + + match, err := from.WalkDentryTree(sys.Ctx, sys.VFS, fspath.Parse(walkPath)) + if err == nil { + defer match.DecRef(sys.Ctx) + } + + if err != expectedErr { + t.Fatalf("WalkDentryTree from %q to %q (with expected error: %v) unexpected error, want: %v, got: %v", from.FSLocalPath(), walkPath, expectedErr, expectedErr, err) + } + if expectedErr != nil { + return + } + + if d != match { + t.Fatalf("WalkDentryTree from %q to %q (with expected error: %v) found unexpected dentry; want: %v, got: %v", from.FSLocalPath(), walkPath, expectedErr, d, match) + } + } + + rootD := sys.Root.Dentry().Impl().(*kernfs.Dentry) + + testWalk(rootD, "dir1", "/dir1", nil) + testWalk(rootD, "", "/dir-non-existent", linuxerr.ENOENT) + testWalk(rootD, "", "/dir1/child-non-existent", linuxerr.ENOENT) + testWalk(rootD, "", "/dir2/inner-non-existent/dir3", linuxerr.ENOENT) + + testWalk(rootD, "dir2/dir3", "/dir2/../dir2/dir3", nil) + testWalk(rootD, "dir2/dir3", "/dir2/././dir3", nil) + testWalk(rootD, "dir2/dir3", "/dir2/././dir3/.././dir3", nil) + + pop := sys.PathOpAtRoot("dir2") + dir2VD := sys.GetDentryOrDie(pop) + defer dir2VD.DecRef(sys.Ctx) + dir2D := dir2VD.Dentry().Impl().(*kernfs.Dentry) + + testWalk(dir2D, "dir2/dir3", "/dir3", nil) + testWalk(dir2D, "dir2/dir3", "/../../../dir3", nil) + testWalk(dir2D, "dir2/file1", "/file1", nil) + testWalk(dir2D, "dir2/file1", "file1", nil) +} diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD index ed730e215..d16dfef9b 100644 --- a/pkg/sentry/fsimpl/overlay/BUILD +++ b/pkg/sentry/fsimpl/overlay/BUILD @@ -42,7 +42,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go index 1f85a1f0d..618092ef1 100644 --- a/pkg/sentry/fsimpl/overlay/copy_up.go +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) func (d *dentry) isCopiedUp() bool { @@ -72,7 +71,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { } if d.vfsd.IsDead() { // Raced with deletion of d. - return syserror.ENOENT + return linuxerr.ENOENT } // Obtain settable timestamps from the lower layer. diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index 5e89928c5..c04c80590 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs @@ -314,7 +313,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str } if !topLookupLayer.existsInOverlay() { child.destroyLocked(ctx) - return nil, topLookupLayer, syserror.ENOENT + return nil, topLookupLayer, linuxerr.ENOENT } // Device and inode numbers were copied from the topmost layer above. Remap @@ -483,7 +482,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return linuxerr.EEXIST } if parent.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { @@ -506,7 +505,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } if !dir && rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } mnt := rp.Mount() @@ -780,7 +779,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf start := rp.Start().Impl().(*dentry) if rp.Done() { if mayCreate && rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if mustCreate { return nil, linuxerr.EEXIST @@ -807,7 +806,7 @@ afterTrailingSymlink: } // Reject attempts to open directories with O_CREAT. if mayCreate && rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } // Determine whether or not we need to create a file. parent.dirMu.Lock() @@ -865,11 +864,11 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts * if ftype == linux.S_IFDIR { // Can't open directories with O_CREAT. if opts.Flags&linux.O_CREAT != 0 { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } // Can't open directories writably. if ats.MayWrite() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL @@ -919,7 +918,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving return nil, err } if parent.vfsd.IsDead() { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { @@ -1086,7 +1085,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer newParent.dirMu.Unlock() } if newParent.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } var ( replaced *dentry @@ -1105,7 +1104,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa replacedVFSD = &replaced.vfsd if replaced.isDir() { if !renamed.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if genericIsAncestorDentry(replaced, renamed) { return linuxerr.ENOTEMPTY @@ -1533,7 +1532,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error defer rp.Mount().EndWrite() name := rp.Component() if name == "." || name == ".." { - return syserror.EISDIR + return linuxerr.EISDIR } if rp.MustBeDir() { return linuxerr.ENOTDIR @@ -1557,7 +1556,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } if child.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if err := parent.mayDelete(rp.Credentials(), child); err != nil { return err diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 1d3d2d95f..95cfbdc42 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -102,7 +102,6 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/tcpip/header", "//pkg/tcpip/network/ipv4", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index d99f90b36..e04ae6660 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // subtasksInode represents the inode for /proc/[pid]/task/ directory. @@ -71,15 +70,15 @@ func (fs *filesystem) newSubtasks(ctx context.Context, task *kernel.Task, pidns func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { tid, err := strconv.ParseUint(name, 10, 32) if err != nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } subTask := i.pidns.TaskWithID(kernel.ThreadID(tid)) if subTask == nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } if subTask.ThreadGroup() != i.task.ThreadGroup() { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return i.fs.newTaskInode(ctx, subTask, i.pidns, false, i.cgroupControllers) } @@ -88,7 +87,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, func (i *subtasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { tasks := i.task.ThreadGroup().MemberIDs(i.pidns) if len(tasks) == 0 { - return offset, syserror.ENOENT + return offset, linuxerr.ENOENT } if relOffset >= int64(len(tasks)) { return offset, nil @@ -124,7 +123,7 @@ type subtasksFD struct { func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { if fd.task.ExitState() >= kernel.TaskExitZombie { - return syserror.ENOENT + return linuxerr.ENOENT } return fd.GenericDirectoryFD.IterDirents(ctx, cb) } @@ -132,7 +131,7 @@ func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallbac // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { if fd.task.ExitState() >= kernel.TaskExitZombie { - return 0, syserror.ENOENT + return 0, linuxerr.ENOENT } return fd.GenericDirectoryFD.Seek(ctx, offset, whence) } @@ -140,7 +139,7 @@ func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { if fd.task.ExitState() >= kernel.TaskExitZombie { - return linux.Statx{}, syserror.ENOENT + return linux.Statx{}, linuxerr.ENOENT } return fd.GenericDirectoryFD.Stat(ctx, opts) } @@ -148,7 +147,7 @@ func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Sta // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { if fd.task.ExitState() >= kernel.TaskExitZombie { - return syserror.ENOENT + return linuxerr.ENOENT } return fd.GenericDirectoryFD.SetStat(ctx, opts) } diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index dfc0a924e..5c6412fc0 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -22,11 +22,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) { @@ -142,11 +142,11 @@ func (i *fdDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.Ite func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } fd := int32(fdInt) if !taskFDExists(ctx, i.fs, i.task, fd) { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return i.fs.newFDSymlink(ctx, i.task, fd, i.fs.NextIno()), nil } @@ -218,7 +218,7 @@ func (fs *filesystem) newFDSymlink(ctx context.Context, task *kernel.Task, fd in func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } defer s.fs.SafeDecRefFD(ctx, file) root := vfs.RootFromContext(ctx) @@ -231,7 +231,7 @@ func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { - return vfs.VirtualDentry{}, "", syserror.ENOENT + return vfs.VirtualDentry{}, "", linuxerr.ENOENT } defer s.fs.SafeDecRefFD(ctx, file) vd := file.VirtualDentry() @@ -278,11 +278,11 @@ func (fs *filesystem) newFDInfoDirInode(ctx context.Context, task *kernel.Task) func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } fd := int32(fdInt) if !taskFDExists(ctx, i.fs, i.task, fd) { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } data := &fdInfoData{ fs: i.fs, @@ -330,7 +330,7 @@ var _ dynamicInode = (*fdInfoData)(nil) func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { file, descriptorFlags := getTaskFD(d.task, d.fd) if file == nil { - return syserror.ENOENT + return linuxerr.ENOENT } defer d.fs.SafeDecRefFD(ctx, file) // TODO(b/121266871): Include pos, locks, and other data. For now we only diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 0ce3ed797..34b0c4f63 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -33,7 +33,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -491,7 +490,7 @@ func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64 return int64(n), nil } if readErr != nil { - return 0, syserror.EIO + return 0, linuxerr.EIO } return 0, nil } diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index cf905fae4..26d44744b 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -21,11 +21,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) const ( @@ -116,12 +116,12 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, err case threadSelfName: return i.newThreadSelfSymlink(ctx, root), nil } - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } task := i.pidns.TaskWithID(kernel.ThreadID(tid)) if task == nil { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } return i.fs.newTaskInode(ctx, task, i.pidns, true, i.fakeCgroupControllers) diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 03bed22a3..4d3a2f7e6 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -29,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // +stateify savable @@ -58,7 +57,7 @@ func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error } tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) if tgid == 0 { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } return strconv.FormatUint(uint64(tgid), 10), nil } @@ -100,7 +99,7 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) tid := s.pidns.IDOfTask(t) if tid == 0 || tgid == 0 { - return "", syserror.ENOENT + return "", linuxerr.ENOENT } return fmt.Sprintf("%d/task/%d", tgid, tid), nil } diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD index adb610213..403c6f254 100644 --- a/pkg/sentry/fsimpl/signalfd/BUILD +++ b/pkg/sentry/fsimpl/signalfd/BUILD @@ -9,10 +9,10 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/sentry/kernel", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go index a7f5928b7..bdb03ef96 100644 --- a/pkg/sentry/fsimpl/signalfd/signalfd.go +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -18,10 +18,10 @@ package signalfd import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -91,7 +91,7 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0) if err != nil { // There must be no signal available. - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Copy out the signal info using the specified format. diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index 1af0a5cbc..ab21f028e 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -36,7 +36,6 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/timerfd/BUILD b/pkg/sentry/fsimpl/timerfd/BUILD index e6980a314..2b83d7d9a 100644 --- a/pkg/sentry/fsimpl/timerfd/BUILD +++ b/pkg/sentry/fsimpl/timerfd/BUILD @@ -12,7 +12,6 @@ go_library( "//pkg/hostarch", "//pkg/sentry/kernel/time", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go index 655a1c76a..68b785791 100644 --- a/pkg/sentry/fsimpl/timerfd/timerfd.go +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/hostarch" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -82,7 +81,7 @@ func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequenc } return sizeofUint64, nil } - return 0, syserror.ErrWouldBlock + return 0, linuxerr.ErrWouldBlock } // Clock returns the timer fd's Clock. diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index dc8b9bfeb..94486bb63 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -82,7 +82,6 @@ go_library( "//pkg/sentry/vfs", "//pkg/sentry/vfs/memxattr", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) @@ -125,7 +124,6 @@ go_test( "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 8b04df038..e067f136e 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -26,7 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" ) // Sync implements vfs.FilesystemImpl.Sync. @@ -75,7 +74,7 @@ afterSymlink: } child, ok := dir.childMap[name] if !ok { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err @@ -171,12 +170,12 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return linuxerr.EEXIST } if !dir && rp.MustBeDir() { - return syserror.ENOENT + return linuxerr.ENOENT } // tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only // be dead if it was deleted. if parentDir.dentry.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { @@ -258,7 +257,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return err } if i.nlink == 0 { - return syserror.ENOENT + return linuxerr.ENOENT } if i.nlink == maxLinks { return linuxerr.EMLINK @@ -345,7 +344,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if rp.Done() { // Reject attempts to open mount root directory with O_CREAT. if rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if mustCreate { return nil, linuxerr.EEXIST @@ -366,11 +365,11 @@ afterTrailingSymlink: } // Reject attempts to open directories with O_CREAT. if rp.MustBeDir() { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } name := rp.Component() if name == "." || name == ".." { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } if len(name) > linux.NAME_MAX { return nil, linuxerr.ENAMETOOLONG @@ -457,7 +456,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open case *directory: // Can't open directories writably. if ats&vfs.MayWrite != 0 { - return nil, syserror.EISDIR + return nil, linuxerr.EISDIR } var fd directoryFD fd.LockFD.Init(&d.inode.locks) @@ -532,7 +531,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } renamed, ok := oldParentDir.childMap[oldName] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil { return err @@ -567,7 +566,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa replacedDir, ok := replaced.inode.impl.(*directory) if ok { if !renamed.inode.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if len(replacedDir.childMap) != 0 { return linuxerr.ENOTEMPTY @@ -588,7 +587,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can // only be dead if it was deleted. if newParentDir.dentry.vfsd.IsDead() { - return syserror.ENOENT + return linuxerr.ENOENT } // Linux places this check before some of those above; we do it here for @@ -654,7 +653,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error } child, ok := parentDir.childMap[name] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { return err @@ -754,17 +753,17 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } name := rp.Component() if name == "." || name == ".." { - return syserror.EISDIR + return linuxerr.EISDIR } child, ok := parentDir.childMap[name] if !ok { - return syserror.ENOENT + return linuxerr.ENOENT } if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { return err } if child.inode.isDir() { - return syserror.EISDIR + return linuxerr.EISDIR } if rp.MustBeDir() { return linuxerr.ENOTDIR diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index 418c7994e..99afd9817 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -202,7 +201,7 @@ func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) { readData := make([]byte, 1) dst := usermem.BytesIOSequence(readData) bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) - if err != syserror.ErrWouldBlock { + if err != linuxerr.ErrWouldBlock { t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err) } if bytesRead != 0 { diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 4393cc13b..cb7711b39 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -21,10 +21,10 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -146,7 +146,7 @@ func TestLocks(t *testing.T) { if err := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if got, want := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockBSD(ctx, uid2, 0 /* ownerPID */, lock.WriteLock, nil), linuxerr.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want) } if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil { @@ -165,7 +165,7 @@ func TestLocks(t *testing.T) { if err := fd.Impl().LockPOSIX(ctx, uid1, 0 /* ownerPID */, lock.WriteLock, lock.LockRange{Start: 0, End: 1}, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if got, want := fd.Impl().LockPOSIX(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, lock.LockRange{Start: 0, End: 1}, nil), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockPOSIX(ctx, uid2, 0 /* ownerPID */, lock.ReadLock, lock.LockRange{Start: 0, End: 1}, nil), linuxerr.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want) } if err := fd.Impl().UnlockPOSIX(ctx, uid1, lock.LockRange{Start: 0, End: 1}); err != nil { diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index f2250c025..feafb06e4 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -44,7 +44,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // Name is the default filesystem name. @@ -556,7 +555,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs. needsCtimeBump = true } case *directory: - return syserror.EISDIR + return linuxerr.EISDIR default: return linuxerr.EINVAL } diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD index 1d855234c..c12abdf33 100644 --- a/pkg/sentry/fsimpl/verity/BUILD +++ b/pkg/sentry/fsimpl/verity/BUILD @@ -1,10 +1,24 @@ load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "dentry_list", + out = "dentry_list.go", + package = "verity", + prefix = "dentry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*dentry", + "Linker": "*dentry", + }, +) + go_library( name = "verity", srcs = [ + "dentry_list.go", "filesystem.go", "save_restore.go", "verity.go", @@ -28,7 +42,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/sync", - "//pkg/syserror", "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index 930016a3e..52d47994d 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -32,7 +32,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -67,40 +66,23 @@ func putDentrySlice(ds *[]*dentry) { dentrySlicePool.Put(ds) } -// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls -// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for +// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls +// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for // writing. // // ds is a pointer-to-pointer since defer evaluates its arguments immediately, // but dentry slices are allocated lazily, and it's much easier to say "defer -// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { -// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. +// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { +// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. // +checklocksrelease:fs.renameMu -func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { +func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { fs.renameMu.RUnlock() if *ds == nil { return } - if len(**ds) != 0 { - fs.renameMu.Lock() - for _, d := range **ds { - d.checkDropLocked(ctx) - } - fs.renameMu.Unlock() - } - putDentrySlice(*ds) -} - -// +checklocksrelease:fs.renameMu -func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { - if *ds == nil { - fs.renameMu.Unlock() - return - } for _, d := range **ds { - d.checkDropLocked(ctx) + d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) } - fs.renameMu.Unlock() putDentrySlice(*ds) } @@ -166,7 +148,7 @@ afterSymlink: // verifyChildLocked verifies the hash of child against the already verified // hash of the parent to ensure the child is expected. verifyChild triggers a // sentry panic if unexpected modifications to the file system are detected. In -// ErrorOnViolation mode it returns a syserror instead. +// ErrorOnViolation mode it returns a linuxerr instead. // // Preconditions: // * fs.renameMu must be locked. @@ -547,7 +529,7 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, if parent.verityEnabled() { if _, ok := parent.childrenNames[name]; !ok { - return nil, syserror.ENOENT + return nil, linuxerr.ENOENT } } @@ -595,23 +577,6 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, } } - // Clear the Merkle tree file if they are to be generated at runtime. - // TODO(b/182315468): Optimize the Merkle tree generate process to - // allow only updating certain files/directories. - if fs.allowRuntimeEnable { - childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{ - Root: childMerkleVD, - Start: childMerkleVD, - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR | linux.O_TRUNC, - Mode: 0644, - }) - if err != nil { - return nil, err - } - childMerkleFD.DecRef(ctx) - } - // The dentry needs to be cleaned up if any error occurs. IncRef will be // called if a verity child dentry is successfully created. defer childMerkleVD.DecRef(ctx) @@ -718,7 +683,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds } var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -730,7 +695,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -751,7 +716,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -788,7 +753,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if rp.Done() { @@ -970,7 +935,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -1000,7 +965,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statx{}, err @@ -1046,7 +1011,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil { return nil, err } @@ -1057,7 +1022,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -1073,7 +1038,7 @@ func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, si func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index c5fa9855b..d2526263c 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -23,10 +23,12 @@ // Lock order: // // filesystem.renameMu -// dentry.dirMu -// fileDescription.mu -// filesystem.verityMu -// dentry.hashMu +// dentry.cachingMu +// filesystem.cacheMu +// dentry.dirMu +// fileDescription.mu +// filesystem.verityMu +// dentry.hashMu // // Locking dentry.dirMu in multiple dentries requires that parent dentries are // locked before child dentries, and that filesystem.renameMu is locked to @@ -60,7 +62,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -97,6 +98,9 @@ const ( // sizeOfStringInt32 is the size for a 32 bit integer stored as string in // extended attributes. The maximum value of a 32 bit integer has 10 digits. sizeOfStringInt32 = 10 + + // defaultMaxCachedDentries is the default limit of dentry cache. + defaultMaxCachedDentries = uint64(1000) ) var ( @@ -107,9 +111,10 @@ var ( // Mount option names for verityfs. const ( - moptLowerPath = "lower_path" - moptRootHash = "root_hash" - moptRootName = "root_name" + moptLowerPath = "lower_path" + moptRootHash = "root_hash" + moptRootName = "root_name" + moptDentryCacheLimit = "dentry_cache_limit" ) // HashAlgorithm is a type specifying the algorithm used to hash the file @@ -189,6 +194,17 @@ type filesystem struct { // dentries. renameMu sync.RWMutex `state:"nosave"` + // cachedDentries contains all dentries with 0 references. (Due to race + // conditions, it may also contain dentries with non-zero references.) + // cachedDentriesLen is the number of dentries in cachedDentries. These + // fields are protected by cacheMu. + cacheMu sync.Mutex `state:"nosave"` + cachedDentries dentryList + cachedDentriesLen uint64 + + // maxCachedDentries is the maximum size of filesystem.cachedDentries. + maxCachedDentries uint64 + // verityMu synchronizes enabling verity files, protects files or // directories from being enabled by different threads simultaneously. // It also ensures that verity does not access files that are being @@ -199,6 +215,10 @@ type filesystem struct { // is for the whole file system to ensure that no more than one file is // enabled the same time. verityMu sync.RWMutex `state:"nosave"` + + // released is nonzero once filesystem.Release has been called. It is accessed + // with atomic memory operations. + released int32 } // InternalFilesystemOptions may be passed as @@ -239,7 +259,7 @@ func (FilesystemType) Release(ctx context.Context) {} // mode, it returns EIO, otherwise it panic. func (fs *filesystem) alertIntegrityViolation(msg string) error { if fs.action == ErrorOnViolation { - return syserror.EIO + return linuxerr.EIO } panic(msg) } @@ -267,6 +287,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt delete(mopts, moptRootName) rootName = root } + maxCachedDentries := defaultMaxCachedDentries + if str, ok := mopts[moptDentryCacheLimit]; ok { + delete(mopts, moptDentryCacheLimit) + maxCD, err := strconv.ParseUint(str, 10, 64) + if err != nil { + ctx.Warningf("verity.FilesystemType.GetFilesystem: invalid dentry cache limit: %s=%s", moptDentryCacheLimit, str) + return nil, nil, linuxerr.EINVAL + } + maxCachedDentries = maxCD + } // Check for unparsed options. if len(mopts) != 0 { @@ -340,12 +370,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt action: iopts.Action, opts: opts.Data, allowRuntimeEnable: iopts.AllowRuntimeEnable, + maxCachedDentries: maxCachedDentries, } fs.vfsfs.Init(vfsObj, &fstype, fs) // Construct the root dentry. d := fs.newDentry() - d.refs = 1 + // Set the root's reference count to 2. One reference is returned to + // the caller, and the other is held by fs to prevent the root from + // being "cached" and subsequently evicted. + d.refs = 2 lowerVD := vfs.MakeVirtualDentry(lowerMount, lowerMount.Root()) lowerVD.IncRef() d.lowerVD = lowerVD @@ -520,7 +554,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { + atomic.StoreInt32(&fs.released, 1) fs.lowerMount.DecRef(ctx) + + fs.renameMu.Lock() + fs.evictAllCachedDentriesLocked(ctx) + fs.renameMu.Unlock() + + // An extra reference was held by the filesystem on the root to prevent + // it from being cached/evicted. + fs.rootDentry.DecRef(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. @@ -534,6 +577,11 @@ func (fs *filesystem) MountOptions() string { type dentry struct { vfsd vfs.Dentry + // refs is the reference count. Each dentry holds a reference on its + // parent, even if disowned. When refs reaches 0, the dentry may be + // added to the cache or destroyed. If refs == -1, the dentry has + // already been destroyed. refs is accessed using atomic memory + // operations. refs int64 // fs is the owning filesystem. fs is immutable. @@ -588,13 +636,23 @@ type dentry struct { // is protected by hashMu. hashMu sync.RWMutex `state:"nosave"` hash []byte + + // cachingMu is used to synchronize concurrent dentry caching attempts on + // this dentry. + cachingMu sync.Mutex `state:"nosave"` + + // If cached is true, dentryEntry links dentry into + // filesystem.cachedDentries. cached and dentryEntry are protected by + // cachingMu. + cached bool + dentryEntry } // newDentry creates a new dentry representing the given verity file. The -// dentry initially has no references; it is the caller's responsibility to set -// the dentry's reference count and/or call dentry.destroy() as appropriate. -// The dentry is initially invalid in that it contains no underlying dentry; -// the caller is responsible for setting them. +// dentry initially has no references, but is not cached; it is the caller's +// responsibility to set the dentry's reference count and/or call +// dentry.destroy() as appropriate. The dentry is initially invalid in that it +// contains no underlying dentry; the caller is responsible for setting them. func (fs *filesystem) newDentry() *dentry { d := &dentry{ fs: fs, @@ -630,42 +688,23 @@ func (d *dentry) TryIncRef() bool { // DecRef implements vfs.DentryImpl.DecRef. func (d *dentry) DecRef(ctx context.Context) { - r := atomic.AddInt64(&d.refs, -1) - if d.LogRefs() { - refsvfs2.LogDecRef(d, r) - } - if r == 0 { - d.fs.renameMu.Lock() - d.checkDropLocked(ctx) - d.fs.renameMu.Unlock() - } else if r < 0 { - panic("verity.dentry.DecRef() called without holding a reference") + if d.decRefNoCaching() == 0 { + d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) } } -func (d *dentry) decRefLocked(ctx context.Context) { +// decRefNoCaching decrements d's reference count without calling +// d.checkCachingLocked, even if d's reference count reaches 0; callers are +// responsible for ensuring that d.checkCachingLocked will be called later. +func (d *dentry) decRefNoCaching() int64 { r := atomic.AddInt64(&d.refs, -1) if d.LogRefs() { refsvfs2.LogDecRef(d, r) } - if r == 0 { - d.checkDropLocked(ctx) - } else if r < 0 { - panic("verity.dentry.decRefLocked() called without holding a reference") + if r < 0 { + panic("verity.dentry.decRefNoCaching() called without holding a reference") } -} - -// checkDropLocked should be called after d's reference count becomes 0 or it -// becomes deleted. -func (d *dentry) checkDropLocked(ctx context.Context) { - // Dentries with a positive reference count must be retained. Dentries - // with a negative reference count have already been destroyed. - if atomic.LoadInt64(&d.refs) != 0 { - return - } - // Refs is still zero; destroy it. - d.destroyLocked(ctx) - return + return r } // destroyLocked destroys the dentry. @@ -684,6 +723,12 @@ func (d *dentry) destroyLocked(ctx context.Context) { panic("verity.dentry.destroyLocked() called with references on the dentry") } + // Drop the reference held by d on its parent without recursively + // locking d.fs.renameMu. + if d.parent != nil && d.parent.decRefNoCaching() == 0 { + d.parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */) + } + if d.lowerVD.Ok() { d.lowerVD.DecRef(ctx) } @@ -696,7 +741,6 @@ func (d *dentry) destroyLocked(ctx context.Context) { delete(d.parent.children, d.name) } d.parent.dirMu.Unlock() - d.parent.decRefLocked(ctx) } refsvfs2.Unregister(d) } @@ -735,6 +779,140 @@ func (d *dentry) OnZeroWatches(context.Context) { //TODO(b/159261227): Implement OnZeroWatches. } +// checkCachingLocked should be called after d's reference count becomes 0 or +// it becomes disowned. +// +// For performance, checkCachingLocked can also be called after d's reference +// count becomes non-zero, so that d can be removed from the LRU cache. This +// may help in reducing the size of the cache and hence reduce evictions. Note +// that this is not necessary for correctness. +// +// It may be called on a destroyed dentry. For example, +// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times +// for the same dentry when the dentry is visited more than once in the same +// operation. One of the calls may destroy the dentry, so subsequent calls will +// do nothing. +// +// Preconditions: d.fs.renameMu must be locked for writing if +// renameMuWriteLocked is true; it may be temporarily unlocked. +func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) { + d.cachingMu.Lock() + refs := atomic.LoadInt64(&d.refs) + if refs == -1 { + // Dentry has already been destroyed. + d.cachingMu.Unlock() + return + } + if refs > 0 { + // fs.cachedDentries is permitted to contain dentries with non-zero refs, + // which are skipped by fs.evictCachedDentryLocked() upon reaching the end + // of the LRU. But it is still beneficial to remove d from the cache as we + // are already holding d.cachingMu. Keeping a cleaner cache also reduces + // the number of evictions (which is expensive as it acquires fs.renameMu). + d.removeFromCacheLocked() + d.cachingMu.Unlock() + return + } + + if atomic.LoadInt32(&d.fs.released) != 0 { + d.cachingMu.Unlock() + if !renameMuWriteLocked { + // Need to lock d.fs.renameMu to access d.parent. Lock it for writing as + // needed by d.destroyLocked() later. + d.fs.renameMu.Lock() + defer d.fs.renameMu.Unlock() + } + if d.parent != nil { + d.parent.dirMu.Lock() + delete(d.parent.children, d.name) + d.parent.dirMu.Unlock() + } + d.destroyLocked(ctx) // +checklocksforce: see above. + return + } + + d.fs.cacheMu.Lock() + // If d is already cached, just move it to the front of the LRU. + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentries.PushFront(d) + d.fs.cacheMu.Unlock() + d.cachingMu.Unlock() + return + } + // Cache the dentry, then evict the least recently used cached dentry if + // the cache becomes over-full. + d.fs.cachedDentries.PushFront(d) + d.fs.cachedDentriesLen++ + d.cached = true + shouldEvict := d.fs.cachedDentriesLen > d.fs.maxCachedDentries + d.fs.cacheMu.Unlock() + d.cachingMu.Unlock() + + if shouldEvict { + if !renameMuWriteLocked { + // Need to lock d.fs.renameMu for writing as needed by + // d.evictCachedDentryLocked(). + d.fs.renameMu.Lock() + defer d.fs.renameMu.Unlock() + } + d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. + } +} + +// Preconditions: d.cachingMu must be locked. +func (d *dentry) removeFromCacheLocked() { + if d.cached { + d.fs.cacheMu.Lock() + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentriesLen-- + d.fs.cacheMu.Unlock() + d.cached = false + } +} + +// Precondition: fs.renameMu must be locked for writing; it may be temporarily +// unlocked. +// +checklocks:fs.renameMu +func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { + for fs.cachedDentriesLen != 0 { + fs.evictCachedDentryLocked(ctx) + } +} + +// Preconditions: +// * fs.renameMu must be locked for writing; it may be temporarily unlocked. +// +checklocks:fs.renameMu +func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { + fs.cacheMu.Lock() + victim := fs.cachedDentries.Back() + fs.cacheMu.Unlock() + if victim == nil { + // fs.cachedDentries may have become empty between when it was + // checked and when we locked fs.cacheMu. + return + } + + victim.cachingMu.Lock() + victim.removeFromCacheLocked() + // victim.refs may have become non-zero from an earlier path resolution + // since it was inserted into fs.cachedDentries. + if atomic.LoadInt64(&victim.refs) != 0 { + victim.cachingMu.Unlock() + return + } + if victim.parent != nil { + victim.parent.dirMu.Lock() + // Note that victim can't be a mount point (in any mount + // namespace), since VFS holds references on mount points. + fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd) + delete(victim.parent.children, victim.name) + victim.parent.dirMu.Unlock() + } + victim.cachingMu.Unlock() + victim.destroyLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs. +} + func (d *dentry) isSymlink() bool { return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK } @@ -1091,6 +1269,21 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { return 0, fd.d.fs.alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds") } + // Populate children names here. We cannot rely on the children + // dentries to populate parent dentry's children names, because the + // parent dentry may be destroyed before users enable verity if its ref + // count drops to zero. + if fd.d.isDir() { + if err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + if dirent.Name != "." && dirent.Name != ".." { + fd.d.childrenNames[dirent.Name] = struct{}{} + } + return nil + })); err != nil { + return 0, err + } + } + hash, dataSize, err := fd.generateMerkleLocked(ctx) if err != nil { return 0, err @@ -1118,9 +1311,6 @@ func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) { }); err != nil { return 0, err } - - // Add the current child's name to parent's childrenNames. - fd.d.parent.childrenNames[fd.d.name] = struct{}{} } // Record the size of the data being hashed for fd. @@ -1215,7 +1405,7 @@ func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch. case linux.FS_IOC_GETFLAGS: return fd.verityFlags(ctx, args[2].Pointer()) default: - return 0, syserror.ENOSYS + return 0, linuxerr.ENOSYS } } |