diff options
Diffstat (limited to 'pkg/sentry/fsimpl/kernfs')
-rw-r--r-- | pkg/sentry/fsimpl/kernfs/BUILD | 67 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 11 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 61 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/kernfs/filesystem.go | 428 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 327 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/kernfs/kernfs.go | 196 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/kernfs/kernfs_test.go | 117 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/kernfs/symlink.go | 14 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/kernfs/synthetic_directory.go | 112 |
9 files changed, 871 insertions, 462 deletions
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 3835557fe..858cc24ce 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -26,9 +26,65 @@ go_template_instance( }, ) +go_template_instance( + name = "dentry_refs", + out = "dentry_refs.go", + package = "kernfs", + prefix = "Dentry", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "Dentry", + }, +) + +go_template_instance( + name = "static_directory_refs", + out = "static_directory_refs.go", + package = "kernfs", + prefix = "StaticDirectory", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "StaticDirectory", + }, +) + +go_template_instance( + name = "dir_refs", + out = "dir_refs.go", + package = "kernfs_test", + prefix = "dir", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "dir", + }, +) + +go_template_instance( + name = "readonly_dir_refs", + out = "readonly_dir_refs.go", + package = "kernfs_test", + prefix = "readonlyDir", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "readonlyDir", + }, +) + +go_template_instance( + name = "synthetic_directory_refs", + out = "synthetic_directory_refs.go", + package = "kernfs", + prefix = "syntheticDirectory", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "syntheticDirectory", + }, +) + go_library( name = "kernfs", srcs = [ + "dentry_refs.go", "dynamic_bytes_file.go", "fd_impl_util.go", "filesystem.go", @@ -36,7 +92,10 @@ go_library( "inode_impl_util.go", "kernfs.go", "slot_list.go", + "static_directory_refs.go", "symlink.go", + "synthetic_directory.go", + "synthetic_directory_refs.go", ], visibility = ["//pkg/sentry:internal"], deps = [ @@ -59,11 +118,17 @@ go_library( go_test( name = "kernfs_test", size = "small", - srcs = ["kernfs_test.go"], + srcs = [ + "dir_refs.go", + "kernfs_test.go", + "readonly_dir_refs.go", + ], deps = [ ":kernfs", "//pkg/abi/linux", "//pkg/context", + "//pkg/log", + "//pkg/refs", "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 12adf727a..b929118b1 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -35,6 +35,7 @@ import ( // +stateify savable type DynamicBytesFile struct { InodeAttrs + InodeNoStatFS InodeNoopRefCount InodeNotDirectory InodeNotSymlink @@ -55,9 +56,9 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint } // Open implements Inode.Open. -func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} - if err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags); err != nil { + if err := fd.Init(rp.Mount(), d, f.data, &f.locks, opts.Flags); err != nil { return nil, err } return &fd.vfsfd, nil @@ -86,12 +87,12 @@ type DynamicBytesFD struct { } // Init initializes a DynamicBytesFD. -func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error { +func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error { fd.LockFD.Init(locks) - if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return err } - fd.inode = d.Impl().(*Dentry).inode + fd.inode = d.inode fd.SetDataSource(data) return nil } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index fcee6200a..abf1905d6 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -15,7 +15,7 @@ package kernfs import ( - "math" + "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -28,9 +28,29 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// SeekEndConfig describes the SEEK_END behaviour for FDs. +// +// +stateify savable +type SeekEndConfig int + +// Constants related to SEEK_END behaviour for FDs. +const ( + // Consider the end of the file to be after the final static entry. This is + // the default option. + SeekEndStaticEntries = iota + // Consider the end of the file to be at offset 0. + SeekEndZero +) + +// GenericDirectoryFDOptions contains configuration for a GenericDirectoryFD. +// +// +stateify savable +type GenericDirectoryFDOptions struct { + SeekEnd SeekEndConfig +} + // GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory -// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not -// compatible with dynamic directories. +// inode that uses OrderChildren to track child nodes. // // Note that GenericDirectoryFD holds a lock over OrderedChildren while calling // IterDirents callback. The IterDirents callback therefore cannot hash or @@ -40,16 +60,21 @@ import ( // Must be initialize with Init before first use. // // Lock ordering: mu => children.mu. +// +// +stateify savable type GenericDirectoryFD struct { vfs.FileDescriptionDefaultImpl vfs.DirectoryFileDescriptionDefaultImpl vfs.LockFD + // Immutable. + seekEnd SeekEndConfig + vfsfd vfs.FileDescription children *OrderedChildren // mu protects the fields below. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // off is the current directory offset. Protected by "mu". off int64 @@ -57,12 +82,12 @@ type GenericDirectoryFD struct { // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its // dentry. -func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { +func NewGenericDirectoryFD(m *vfs.Mount, d *Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) { fd := &GenericDirectoryFD{} - if err := fd.Init(children, locks, opts); err != nil { + if err := fd.Init(children, locks, opts, fdOpts); err != nil { return nil, err } - if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return fd, nil @@ -71,12 +96,13 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre // Init initializes a GenericDirectoryFD. Use it when overriding // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the // correct implementation. -func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error { +func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error { if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. return syserror.EISDIR } fd.LockFD.Init(locks) + fd.seekEnd = fdOpts.SeekEnd fd.children = children return nil } @@ -175,13 +201,12 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // these. childIdx := fd.off - 2 for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { - inode := it.Dentry.Impl().(*Dentry).inode - stat, err := inode.Stat(ctx, fd.filesystem(), opts) + stat, err := it.inode.Stat(ctx, fd.filesystem(), opts) if err != nil { return err } dirent := vfs.Dirent{ - Name: it.Name, + Name: it.name, Type: linux.FileMode(stat.Mode).DirentType(), Ino: stat.Ino, NextOff: fd.off + 1, @@ -209,9 +234,17 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int case linux.SEEK_CUR: offset += fd.off case linux.SEEK_END: - // TODO(gvisor.dev/issue/1193): This can prevent new files from showing up - // if they are added after SEEK_END. - offset = math.MaxInt64 + switch fd.seekEnd { + case SeekEndStaticEntries: + fd.children.mu.RLock() + offset += int64(len(fd.children.set)) + offset += 2 // '.' and '..' aren't tracked in children. + fd.children.mu.RUnlock() + case SeekEndZero: + // No-op: offset += 0. + default: + panic(fmt.Sprintf("Invalid GenericDirectoryFD.seekEnd = %v", fd.seekEnd)) + } default: return 0, syserror.EINVAL } diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 3e5192edd..6426a55f6 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -32,11 +32,12 @@ import ( // // stepExistingLocked is loosely analogous to fs/namei.c:walk_component(). // -// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). +// Preconditions: +// * Filesystem.mu must be locked for at least reading. +// * !rp.Done(). // // Postcondition: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) { - d := vfsd.Impl().(*Dentry) +func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, mayFollowSymlinks bool) (*Dentry, error) { if !d.isDir() { return nil, syserror.ENOTDIR } @@ -53,20 +54,20 @@ afterSymlink: // calls d_revalidate(), but walk_component() => handle_dots() does not. if name == "." { rp.Advance() - return vfsd, nil + return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(ctx, vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, d.VFSDentry()); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() - return vfsd, nil + return d, nil } - if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, d.parent.VFSDentry()); err != nil { return nil, err } rp.Advance() - return &d.parent.vfsd, nil + return d.parent, nil } if len(name) > linux.NAME_MAX { return nil, syserror.ENAMETOOLONG @@ -77,7 +78,7 @@ afterSymlink: if err != nil { return nil, err } - if err := rp.CheckMount(ctx, &next.vfsd); err != nil { + if err := rp.CheckMount(ctx, next.VFSDentry()); err != nil { return nil, err } // Resolve any symlink at current path component. @@ -88,7 +89,7 @@ afterSymlink: } if targetVD.Ok() { err := rp.HandleJump(targetVD) - targetVD.DecRef(ctx) + fs.deferDecRefVD(ctx, targetVD) if err != nil { return nil, err } @@ -100,15 +101,18 @@ afterSymlink: goto afterSymlink } rp.Advance() - return &next.vfsd, nil + return next, nil } // revalidateChildLocked must be called after a call to parent.vfsd.Child(name) // or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be // nil) to verify that the returned child (or lack thereof) is correct. // -// Preconditions: Filesystem.mu must be locked for at least reading. -// parent.dirMu must be locked. parent.isDir(). name is not "." or "..". +// Preconditions: +// * Filesystem.mu must be locked for at least reading. +// * parent.dirMu must be locked. +// * parent.isDir(). +// * name is not "." or "..". // // Postconditions: Caller must call fs.processDeferredDecRefs*. func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) { @@ -116,26 +120,33 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // Cached dentry exists, revalidate. if !child.inode.Valid(ctx) { delete(parent.children, name) - vfsObj.InvalidateDentry(ctx, &child.vfsd) - fs.deferDecRef(&child.vfsd) // Reference from Lookup. + if child.inode.Keep() { + // Drop the ref owned by kernfs. + fs.deferDecRef(child) + } + vfsObj.InvalidateDentry(ctx, child.VFSDentry()) child = nil } } if child == nil { - // Dentry isn't cached; it either doesn't exist or failed - // revalidation. Attempt to resolve it via Lookup. - // - // FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return - // *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes - // that all dentries in the filesystem are (kernfs.)Dentry and performs - // vfs.DentryImpl casts accordingly. - childVFSD, err := parent.inode.Lookup(ctx, name) + // Dentry isn't cached; it either doesn't exist or failed revalidation. + // Attempt to resolve it via Lookup. + childInode, err := parent.inode.Lookup(ctx, name) if err != nil { return nil, err } - // Reference on childVFSD dropped by a corresponding Valid. - child = childVFSD.Impl().(*Dentry) - parent.insertChildLocked(name, child) + var newChild Dentry + newChild.Init(fs, childInode) // childInode's ref is transferred to newChild. + parent.insertChildLocked(name, &newChild) + child = &newChild + + // Drop the ref on newChild. This will cause the dentry to get pruned + // from the dentry tree by the end of current filesystem operation + // (before returning to the VFS layer) if another ref is not picked on + // this dentry. + if !childInode.Keep() { + fs.deferDecRef(&newChild) + } } return child, nil } @@ -148,20 +159,19 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // Preconditions: Filesystem.mu must be locked for at least reading. // // Postconditions: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { - vfsd := rp.Start() +func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) { + d := rp.Start().Impl().(*Dentry) for !rp.Done() { var err error - vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */) + d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */) if err != nil { - return nil, nil, err + return nil, err } } - d := vfsd.Impl().(*Dentry) if rp.MustBeDir() && !d.isDir() { - return nil, nil, syserror.ENOTDIR + return nil, syserror.ENOTDIR } - return vfsd, d.inode, nil + return d, nil } // walkParentDirLocked resolves all but the last path component of rp to an @@ -171,32 +181,34 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP // walkParentDirLocked is loosely analogous to Linux's // fs/namei.c:path_parentat(). // -// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). +// Preconditions: +// * Filesystem.mu must be locked for at least reading. +// * !rp.Done(). // // Postconditions: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { - vfsd := rp.Start() +func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) { + d := rp.Start().Impl().(*Dentry) for !rp.Final() { var err error - vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */) + d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */) if err != nil { - return nil, nil, err + return nil, err } } - d := vfsd.Impl().(*Dentry) if !d.isDir() { - return nil, nil, syserror.ENOTDIR + return nil, syserror.ENOTDIR } - return vfsd, d.inode, nil + return d, nil } // checkCreateLocked checks that a file named rp.Component() may be created in -// directory parentVFSD, then returns rp.Component(). +// directory parent, then returns rp.Component(). // -// Preconditions: Filesystem.mu must be locked for at least reading. parentInode -// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true. -func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) { - if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { +// Preconditions: +// * Filesystem.mu must be locked for at least reading. +// * isDir(parentInode) == true. +func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *Dentry) (string, error) { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return "", err } pc := rp.Component() @@ -206,11 +218,10 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v if len(pc) > linux.NAME_MAX { return "", syserror.ENAMETOOLONG } - // FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu. - if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok { + if _, ok := parent.children[pc]; ok { return "", syserror.EEXIST } - if parentVFSD.IsDead() { + if parent.VFSDentry().IsDead() { return "", syserror.ENOENT } return pc, nil @@ -219,8 +230,8 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v // checkDeleteLocked checks that the file represented by vfsd may be deleted. // // Preconditions: Filesystem.mu must be locked for at least reading. -func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { - parent := vfsd.Impl().(*Dentry).parent +func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error { + parent := d.parent if parent == nil { return syserror.EBUSY } @@ -249,11 +260,11 @@ func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() - _, inode, err := fs.walkExistingLocked(ctx, rp) + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } - return inode.CheckPermissions(ctx, creds, ats) + return d.inode.CheckPermissions(ctx, creds, ats) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. @@ -261,20 +272,20 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() - vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } if opts.CheckSearchable { - d := vfsd.Impl().(*Dentry) if !d.isDir() { return nil, syserror.ENOTDIR } - if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } + vfsd := d.VFSDentry() vfsd.IncRef() // Ownership transferred to caller. return vfsd, nil } @@ -284,12 +295,12 @@ func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() - vfsd, _, err := fs.walkParentDirLocked(ctx, rp) + d, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return nil, err } - vfsd.IncRef() // Ownership transferred to caller. - return vfsd, nil + d.IncRef() // Ownership transferred to caller. + return d.VFSDentry(), nil } // LinkAt implements vfs.FilesystemImpl.LinkAt. @@ -298,13 +309,16 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -321,11 +335,13 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EPERM } - childVFSD, err := parentInode.NewLink(ctx, pc, d.inode) + childI, err := parent.inode.NewLink(ctx, pc, d.inode) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + var child Dentry + child.Init(fs, childI) + parent.insertChildLocked(pc, &child) return nil } @@ -335,13 +351,16 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -349,11 +368,16 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - childVFSD, err := parentInode.NewDir(ctx, pc, opts) + childI, err := parent.inode.NewDir(ctx, pc, opts) if err != nil { - return err + if !opts.ForSyntheticMountpoint || err == syserror.EEXIST { + return err + } + childI = newSyntheticDirectory(rp.Credentials(), opts.Mode) } - parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + var child Dentry + child.Init(fs, childI) + parent.insertChildLocked(pc, &child) return nil } @@ -363,13 +387,16 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -377,11 +404,13 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - newVFSD, err := parentInode.NewNode(ctx, pc, opts) + newI, err := parent.inode.NewNode(ctx, pc, opts) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry)) + var newD Dentry + newD.Init(fs, newI) + parent.insertChildLocked(pc, &newD) return nil } @@ -397,28 +426,28 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // Do not create new file. if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() - vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + defer fs.processDeferredDecRefs(ctx) + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) return nil, err } - if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) return nil, err } - inode.IncRef() - defer inode.DecRef(ctx) + // Open may block so we need to unlock fs.mu. IncRef d to prevent + // its destruction while fs.mu is unlocked. + d.IncRef() fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) - return inode.Open(ctx, rp, vfsd, opts) + fd, err := d.inode.Open(ctx, rp, d, opts) + d.DecRef(ctx) + return fd, err } // May create new file. mustCreate := opts.Flags&linux.O_EXCL != 0 - vfsd := rp.Start() - inode := vfsd.Impl().(*Dentry).inode + d := rp.Start().Impl().(*Dentry) fs.mu.Lock() unlocked := false unlock := func() { @@ -427,6 +456,10 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf unlocked = true } } + // Process all to-be-decref'd dentries at the end at once. + // Since we defer unlock() AFTER this, fs.mu is guaranteed to be unlocked + // when this is executed. + defer fs.processDeferredDecRefs(ctx) defer unlock() if rp.Done() { if rp.MustBeDir() { @@ -435,22 +468,24 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if mustCreate { return nil, syserror.EEXIST } - if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - inode.IncRef() - defer inode.DecRef(ctx) + // Open may block so we need to unlock fs.mu. IncRef d to prevent + // its destruction while fs.mu is unlocked. + d.IncRef() unlock() - return inode.Open(ctx, rp, vfsd, opts) + fd, err := d.inode.Open(ctx, rp, d, opts) + d.DecRef(ctx) + return fd, err } afterTrailingSymlink: - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return nil, err } // Check for search permission in the parent directory. - if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. @@ -465,10 +500,10 @@ afterTrailingSymlink: return nil, syserror.ENAMETOOLONG } // Determine whether or not we need to create a file. - childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD, false /* mayFollowSymlinks */) + child, err := fs.stepExistingLocked(ctx, rp, parent, false /* mayFollowSymlinks */) if err == syserror.ENOENT { // Already checked for searchability above; now check for writability. - if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -476,16 +511,23 @@ afterTrailingSymlink: } defer rp.Mount().EndWrite() // Create and open the child. - childVFSD, err = parentInode.NewFile(ctx, pc, opts) + childI, err := parent.inode.NewFile(ctx, pc, opts) if err != nil { return nil, err } - child := childVFSD.Impl().(*Dentry) - parentVFSD.Impl().(*Dentry).InsertChild(pc, child) - child.inode.IncRef() - defer child.inode.DecRef(ctx) + var child Dentry + child.Init(fs, childI) + // FIXME(gvisor.dev/issue/1193): Race between checking existence with + // fs.stepExistingLocked and parent.insertChild. If possible, we should hold + // dirMu from one to the other. + parent.insertChild(pc, &child) + // Open may block so we need to unlock fs.mu. IncRef child to prevent + // its destruction while fs.mu is unlocked. + child.IncRef() unlock() - return child.inode.Open(ctx, rp, childVFSD, opts) + fd, err := child.inode.Open(ctx, rp, &child, opts) + child.DecRef(ctx) + return fd, err } if err != nil { return nil, err @@ -494,7 +536,6 @@ afterTrailingSymlink: if mustCreate { return nil, syserror.EEXIST } - child := childVFSD.Impl().(*Dentry) if rp.ShouldFollowSymlink() && child.isSymlink() { targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount()) if err != nil { @@ -502,7 +543,7 @@ afterTrailingSymlink: } if targetVD.Ok() { err := rp.HandleJump(targetVD) - targetVD.DecRef(ctx) + fs.deferDecRefVD(ctx, targetVD) if err != nil { return nil, err } @@ -518,25 +559,28 @@ afterTrailingSymlink: if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - child.inode.IncRef() - defer child.inode.DecRef(ctx) + // Open may block so we need to unlock fs.mu. IncRef child to prevent + // its destruction while fs.mu is unlocked. + child.IncRef() unlock() - return child.inode.Open(ctx, rp, &child.vfsd, opts) + fd, err := child.inode.Open(ctx, rp, child, opts) + child.DecRef(ctx) + return fd, err } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { fs.mu.RLock() - d, inode, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return "", err } - if !d.Impl().(*Dentry).isSymlink() { + if !d.isSymlink() { return "", syserror.EINVAL } - return inode.Readlink(ctx) + return d.inode.Readlink(ctx, rp.Mount()) } // RenameAt implements vfs.FilesystemImpl.RenameAt. @@ -548,16 +592,15 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 fs.mu.Lock() - defer fs.processDeferredDecRefsLocked(ctx) + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() // Resolve the destination directory first to verify that it's on this // Mount. - dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp) + dstDir, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - dstDir := dstDirVFSD.Impl().(*Dentry) mnt := rp.Mount() if mnt != oldParentVD.Mount() { return syserror.EXDEV @@ -575,16 +618,15 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if err != nil { return err } - srcVFSD := &src.vfsd // Can we remove the src dentry? - if err := checkDeleteLocked(ctx, rp, srcVFSD); err != nil { + if err := checkDeleteLocked(ctx, rp, src); err != nil { return err } // Can we create the dst dentry? var dst *Dentry - pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode) + pc, err := checkCreateLocked(ctx, rp, dstDir) switch err { case nil: // Ok, continue with rename as replacement. @@ -595,14 +637,14 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } dst = dstDir.children[pc] if dst == nil { - panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD)) + panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDir)) } default: return err } var dstVFSD *vfs.Dentry if dst != nil { - dstVFSD = &dst.vfsd + dstVFSD = dst.VFSDentry() } mntns := vfs.MountNamespaceFromContext(ctx) @@ -618,35 +660,44 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer dstDir.dirMu.Unlock() } + srcVFSD := src.VFSDentry() if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { return err } - replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD) + err = srcDir.inode.Rename(ctx, src.name, pc, src.inode, dstDir.inode) if err != nil { virtfs.AbortRenameDentry(srcVFSD, dstVFSD) return err } delete(srcDir.children, src.name) if srcDir != dstDir { - fs.deferDecRef(srcDirVFSD) - dstDir.IncRef() + fs.deferDecRef(srcDir) // child (src) drops ref on old parent. + dstDir.IncRef() // child (src) takes a ref on the new parent. } src.parent = dstDir src.name = pc if dstDir.children == nil { dstDir.children = make(map[string]*Dentry) } + replaced := dstDir.children[pc] dstDir.children[pc] = src - virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaced) + var replaceVFSD *vfs.Dentry + if replaced != nil { + // deferDecRef so that fs.mu and dstDir.mu are unlocked by then. + fs.deferDecRef(replaced) + replaceVFSD = replaced.VFSDentry() + } + virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) return nil } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - vfsd, inode, err := fs.walkExistingLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) + + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } @@ -654,14 +705,13 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } defer rp.Mount().EndWrite() - if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { + if err := checkDeleteLocked(ctx, rp, d); err != nil { return err } - d := vfsd.Impl().(*Dentry) if !d.isDir() { return syserror.ENOTDIR } - if inode.HasChildren() { + if d.inode.HasChildren() { return syserror.ENOTEMPTY } virtfs := rp.VirtualFilesystem() @@ -671,13 +721,18 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) + vfsd := d.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } - if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil { + + if err := parentDentry.inode.RmDir(ctx, d.name, d.inode); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } + delete(parentDentry.children, d.name) + // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then. + fs.deferDecRef(d) virtfs.CommitDeleteDentry(ctx, vfsd) return nil } @@ -685,41 +740,40 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() - _, inode, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } if opts.Stat.Mask == 0 { return nil } - return inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts) + return d.inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts) } // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { fs.mu.RLock() - _, inode, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return linux.Statx{}, err } - return inode.Stat(ctx, fs.VFSFilesystem(), opts) + return d.inode.Stat(ctx, fs.VFSFilesystem(), opts) } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return linux.Statfs{}, err } - // TODO(gvisor.dev/issue/1193): actually implement statfs. - return linux.Statfs{}, syserror.ENOSYS + return d.inode.StatFS(ctx, fs.VFSFilesystem()) } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. @@ -728,13 +782,16 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return syserror.EEXIST } fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) + parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } - pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + + pc, err := checkCreateLocked(ctx, rp, parent) if err != nil { return err } @@ -742,20 +799,23 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return err } defer rp.Mount().EndWrite() - childVFSD, err := parentInode.NewSymlink(ctx, pc, target) + childI, err := parent.inode.NewSymlink(ctx, pc, target) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) + var child Dentry + child.Init(fs, childI) + parent.insertChildLocked(pc, &child) return nil } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() - vfsd, _, err := fs.walkExistingLocked(ctx, rp) - fs.processDeferredDecRefsLocked(ctx) + + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } @@ -763,10 +823,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } defer rp.Mount().EndWrite() - if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { + if err := checkDeleteLocked(ctx, rp, d); err != nil { return err } - d := vfsd.Impl().(*Dentry) if d.isDir() { return syserror.EISDIR } @@ -776,38 +835,42 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) + vfsd := d.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } - if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil { + if err := parentDentry.inode.Unlink(ctx, d.name, d.inode); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } + delete(parentDentry.children, d.name) + // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then. + fs.deferDecRef(d) virtfs.CommitDeleteDentry(ctx, vfsd) return nil } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() - _, inode, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } - if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } return nil, syserror.ECONNREFUSED } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } @@ -815,12 +878,12 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si return nil, syserror.ENOTSUP } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return "", err } @@ -828,12 +891,12 @@ func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return "", syserror.ENOTSUP } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } @@ -841,12 +904,12 @@ func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return syserror.ENOTSUP } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) - fs.mu.RUnlock() - fs.processDeferredDecRefs(ctx) + defer fs.processDeferredDecRefs(ctx) + defer fs.mu.RUnlock() + _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } @@ -860,3 +923,16 @@ func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe defer fs.mu.RUnlock() return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b) } + +func (fs *Filesystem) deferDecRefVD(ctx context.Context, vd vfs.VirtualDentry) { + if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs { + // The following is equivalent to vd.DecRef(ctx). This is needed + // because if d belongs to this filesystem, we can not DecRef it right + // away as we may be holding fs.mu. d.DecRef may acquire fs.mu. So we + // defer the DecRef to when locks are dropped. + vd.Mount().DecRef(ctx) + fs.deferDecRef(d) + } else { + vd.DecRef(ctx) + } +} diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index fe8a1e710..122b10591 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -32,7 +31,10 @@ import ( // count for inodes, performing no extra actions when references are obtained or // released. This is suitable for simple file inodes that don't reference any // resources. +// +// +stateify savable type InodeNoopRefCount struct { + InodeTemporary } // IncRef implements Inode.IncRef. @@ -51,30 +53,32 @@ func (InodeNoopRefCount) TryIncRef() bool { // InodeDirectoryNoNewChildren partially implements the Inode interface. // InodeDirectoryNoNewChildren represents a directory inode which does not // support creation of new children. +// +// +stateify savable type InodeDirectoryNoNewChildren struct{} // NewFile implements Inode.NewFile. -func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) { return nil, syserror.EPERM } // NewDir implements Inode.NewDir. -func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) { return nil, syserror.EPERM } // NewLink implements Inode.NewLink. -func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (Inode, error) { return nil, syserror.EPERM } // NewSymlink implements Inode.NewSymlink. -func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (Inode, error) { return nil, syserror.EPERM } // NewNode implements Inode.NewNode. -func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) { return nil, syserror.EPERM } @@ -82,7 +86,10 @@ func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOpt // inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not // represent directories can embed this to provide no-op implementations for // directory-related functions. +// +// +stateify savable type InodeNotDirectory struct { + InodeAlwaysValid } // HasChildren implements Inode.HasChildren. @@ -91,47 +98,47 @@ func (InodeNotDirectory) HasChildren() bool { } // NewFile implements Inode.NewFile. -func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) { panic("NewFile called on non-directory inode") } // NewDir implements Inode.NewDir. -func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) { panic("NewDir called on non-directory inode") } // NewLink implements Inode.NewLinkink. -func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewLink(context.Context, string, Inode) (Inode, error) { panic("NewLink called on non-directory inode") } // NewSymlink implements Inode.NewSymlink. -func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewSymlink(context.Context, string, string) (Inode, error) { panic("NewSymlink called on non-directory inode") } // NewNode implements Inode.NewNode. -func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) { panic("NewNode called on non-directory inode") } // Unlink implements Inode.Unlink. -func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error { +func (InodeNotDirectory) Unlink(context.Context, string, Inode) error { panic("Unlink called on non-directory inode") } // RmDir implements Inode.RmDir. -func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error { +func (InodeNotDirectory) RmDir(context.Context, string, Inode) error { panic("RmDir called on non-directory inode") } // Rename implements Inode.Rename. -func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) { +func (InodeNotDirectory) Rename(context.Context, string, string, Inode, Inode) error { panic("Rename called on non-directory inode") } // Lookup implements Inode.Lookup. -func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +func (InodeNotDirectory) Lookup(ctx context.Context, name string) (Inode, error) { panic("Lookup called on non-directory inode") } @@ -140,40 +147,15 @@ func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDiren panic("IterDirents called on non-directory inode") } -// Valid implements Inode.Valid. -func (InodeNotDirectory) Valid(context.Context) bool { - return true -} - -// InodeNoDynamicLookup partially implements the Inode interface, specifically -// the inodeDynamicLookup sub interface. Directory inodes that do not support -// dymanic entries (i.e. entries that are not "hashed" into the -// vfs.Dentry.children) can embed this to provide no-op implementations for -// functions related to dynamic entries. -type InodeNoDynamicLookup struct{} - -// Lookup implements Inode.Lookup. -func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { - return nil, syserror.ENOENT -} - -// IterDirents implements Inode.IterDirents. -func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { - return offset, nil -} - -// Valid implements Inode.Valid. -func (InodeNoDynamicLookup) Valid(ctx context.Context) bool { - return true -} - // InodeNotSymlink partially implements the Inode interface, specifically the // inodeSymlink sub interface. All inodes that are not symlinks may embed this // to return the appropriate errors from symlink-related functions. +// +// +stateify savable type InodeNotSymlink struct{} // Readlink implements Inode.Readlink. -func (InodeNotSymlink) Readlink(context.Context) (string, error) { +func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) { return "", syserror.EINVAL } @@ -187,6 +169,8 @@ func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, // inode attributes. // // Must be initialized by Init prior to first use. +// +// +stateify savable type InodeAttrs struct { devMajor uint32 devMinor uint32 @@ -257,12 +241,29 @@ func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (li // SetStat implements Inode.SetStat. func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + return a.SetInodeStat(ctx, fs, creds, opts) +} + +// SetInodeStat sets the corresponding attributes from opts to InodeAttrs. +// This function can be used by other kernfs-based filesystem implementation to +// sets the unexported attributes into InodeAttrs. +func (a *InodeAttrs) SetInodeStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask == 0 { return nil } - if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 { + + // Note that not all fields are modifiable. For example, the file type and + // inode numbers are immutable after node creation. Setting the size is often + // allowed by kernfs files but does not do anything. If some other behavior is + // needed, the embedder should consider extending SetStat. + // + // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. + if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_SIZE) != 0 { return syserror.EPERM } + if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() { + return syserror.EISDIR + } if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { return err } @@ -285,13 +286,6 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut atomic.StoreUint32(&a.gid, stat.GID) } - // Note that not all fields are modifiable. For example, the file type and - // inode numbers are immutable after node creation. - - // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. - // Also, STATX_SIZE will need some special handling, because read-only static - // files should return EIO for truncate operations. - return nil } @@ -321,13 +315,17 @@ func (a *InodeAttrs) DecLinks() { } } +// +stateify savable type slot struct { - Name string - Dentry *vfs.Dentry + name string + inode Inode + static bool slotEntry } // OrderedChildrenOptions contains initialization options for OrderedChildren. +// +// +stateify savable type OrderedChildrenOptions struct { // Writable indicates whether vfs.FilesystemImpl methods implemented by // OrderedChildren may modify the tracked children. This applies to @@ -337,20 +335,28 @@ type OrderedChildrenOptions struct { } // OrderedChildren partially implements the Inode interface. OrderedChildren can -// be embedded in directory inodes to keep track of the children in the +// be embedded in directory inodes to keep track of children in the // directory, and can then be used to implement a generic directory FD -- see -// GenericDirectoryFD. OrderedChildren is not compatible with dynamic -// directories. +// GenericDirectoryFD. +// +// OrderedChildren can represent a node in an Inode tree. The children inodes +// might be directories themselves using OrderedChildren; hence extending the +// tree. The parent inode (OrderedChildren user) holds a ref on all its static +// children. This lets the static inodes outlive their associated dentry. +// While the dentry might have to be regenerated via a Lookup() call, we can +// keep reusing the same static inode. These static children inodes are finally +// DecRef'd when this directory inode is being destroyed. This makes +// OrderedChildren suitable for static directory entries as well. // // Must be initialize with Init before first use. +// +// +stateify savable type OrderedChildren struct { - refs.AtomicRefCount - // Can children be modified by user syscalls? It set to false, interface // methods that would modify the children return EPERM. Immutable. writable bool - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` order slotList set map[string]*slot } @@ -361,36 +367,66 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { o.set = make(map[string]*slot) } -// DecRef implements Inode.DecRef. -func (o *OrderedChildren) DecRef(ctx context.Context) { - o.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { - o.mu.Lock() - defer o.mu.Unlock() - o.order.Reset() - o.set = nil - }) +// Destroy clears the children stored in o. It should be called by structs +// embedding OrderedChildren upon destruction, i.e. when their reference count +// reaches zero. +func (o *OrderedChildren) Destroy(ctx context.Context) { + o.mu.Lock() + defer o.mu.Unlock() + // Drop the ref that o owns on the static inodes it holds. + for _, s := range o.set { + if s.static { + s.inode.DecRef(ctx) + } + } + o.order.Reset() + o.set = nil } -// Populate inserts children into this OrderedChildren, and d's dentry -// cache. Populate returns the number of directories inserted, which the caller +// Populate inserts static children into this OrderedChildren. +// Populate returns the number of directories inserted, which the caller // may use to update the link count for the parent directory. // -// Precondition: d must represent a directory inode. children must not contain -// any conflicting entries already in o. -func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 { +// Precondition: +// * d must represent a directory inode. +// * children must not contain any conflicting entries already in o. +// * Caller must hold a reference on all inodes passed. +// +// Postcondition: Caller's references on inodes are transferred to o. +func (o *OrderedChildren) Populate(children map[string]Inode) uint32 { var links uint32 for name, child := range children { - if child.isDir() { + if child.Mode().IsDir() { links++ } - if err := o.Insert(name, child.VFSDentry()); err != nil { - panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d)) + if err := o.insert(name, child, true); err != nil { + panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v)", name, child)) } - d.InsertChild(name, child) } return links } +// Lookup implements Inode.Lookup. +func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error) { + o.mu.RLock() + defer o.mu.RUnlock() + + s, ok := o.set[name] + if !ok { + return nil, syserror.ENOENT + } + + s.inode.IncRef() // This ref is passed to the dentry upon creation via Init. + return s.inode, nil +} + +// IterDirents implements Inode.IterDirents. +func (o *OrderedChildren) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { + // All entries from OrderedChildren have already been handled in + // GenericDirectoryFD.IterDirents. + return offset, nil +} + // HasChildren implements Inode.HasChildren. func (o *OrderedChildren) HasChildren() bool { o.mu.RLock() @@ -398,17 +434,27 @@ func (o *OrderedChildren) HasChildren() bool { return len(o.set) > 0 } -// Insert inserts child into o. This ignores the writability of o, as this is -// not part of the vfs.FilesystemImpl interface, and is a lower-level operation. -func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error { +// Insert inserts a dynamic child into o. This ignores the writability of o, as +// this is not part of the vfs.FilesystemImpl interface, and is a lower-level operation. +func (o *OrderedChildren) Insert(name string, child Inode) error { + return o.insert(name, child, false) +} + +// insert inserts child into o. +// +// Precondition: Caller must be holding a ref on child if static is true. +// +// Postcondition: Caller's ref on child is transferred to o if static is true. +func (o *OrderedChildren) insert(name string, child Inode, static bool) error { o.mu.Lock() defer o.mu.Unlock() if _, ok := o.set[name]; ok { return syserror.EEXIST } s := &slot{ - Name: name, - Dentry: child, + name: name, + inode: child, + static: static, } o.order.PushBack(s) o.set[name] = s @@ -418,44 +464,49 @@ func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error { // Precondition: caller must hold o.mu for writing. func (o *OrderedChildren) removeLocked(name string) { if s, ok := o.set[name]; ok { + if s.static { + panic(fmt.Sprintf("removeLocked called on a static inode: %v", s.inode)) + } delete(o.set, name) o.order.Remove(s) } } // Precondition: caller must hold o.mu for writing. -func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry { +func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, newI Inode) { if s, ok := o.set[name]; ok { + if s.static { + panic(fmt.Sprintf("replacing a static inode: %v", s.inode)) + } + // Existing slot with given name, simply replace the dentry. - var old *vfs.Dentry - old, s.Dentry = s.Dentry, new - return old + s.inode = newI } // No existing slot with given name, create and hash new slot. s := &slot{ - Name: name, - Dentry: new, + name: name, + inode: newI, + static: false, } o.order.PushBack(s) o.set[name] = s - return nil } // Precondition: caller must hold o.mu for reading or writing. -func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error { +func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error { s, ok := o.set[name] if !ok { return syserror.ENOENT } - if s.Dentry != child { - panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child)) + if s.inode != child { + panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child)) } return nil } // Unlink implements Inode.Unlink. -func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error { +func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) error { if !o.writable { return syserror.EPERM } @@ -470,13 +521,14 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.De return nil } -// Rmdir implements Inode.Rmdir. -func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error { +// RmDir implements Inode.RmDir. +func (o *OrderedChildren) RmDir(ctx context.Context, name string, child Inode) error { // We're not responsible for checking that child is a directory, that it's // empty, or updating any link counts; so this is the same as unlink. return o.Unlink(ctx, name, child) } +// +stateify savable type renameAcrossDifferentImplementationsError struct{} func (renameAcrossDifferentImplementationsError) Error() string { @@ -492,13 +544,13 @@ func (renameAcrossDifferentImplementationsError) Error() string { // that will support Rename. // // Postcondition: reference on any replaced dentry transferred to caller. -func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) { - dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren) +func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error { + dst, ok := dstDir.(interface{}).(*OrderedChildren) if !ok { - return nil, renameAcrossDifferentImplementationsError{} + return renameAcrossDifferentImplementationsError{} } if !o.writable || !dst.writable { - return nil, syserror.EPERM + return syserror.EPERM } // Note: There's a potential deadlock below if concurrent calls to Rename // refer to the same src and dst directories in reverse. We avoid any @@ -511,12 +563,12 @@ func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, c defer dst.mu.Unlock() } if err := o.checkExistingLocked(oldname, child); err != nil { - return nil, err + return err } // TODO(gvisor.dev/issue/3027): Check sticky bit before removing. - replaced := dst.replaceChildLocked(newname, child) - return replaced, nil + dst.replaceChildLocked(ctx, newname, child) + return nil } // nthLocked returns an iterator to the nth child tracked by this object. The @@ -535,12 +587,14 @@ func (o *OrderedChildren) nthLocked(i int64) *slot { } // InodeSymlink partially implements Inode interface for symlinks. +// +// +stateify savable type InodeSymlink struct { InodeNotDirectory } // Open implements Inode.Open. -func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ELOOP } @@ -549,43 +603,46 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D // // +stateify savable type StaticDirectory struct { - InodeNotSymlink - InodeDirectoryNoNewChildren + InodeAlwaysValid InodeAttrs - InodeNoDynamicLookup + InodeDirectoryNoNewChildren + InodeNoStatFS + InodeNotSymlink + InodeTemporary OrderedChildren + StaticDirectoryRefs - locks vfs.FileLocks + locks vfs.FileLocks + fdOpts GenericDirectoryFDOptions } var _ Inode = (*StaticDirectory)(nil) // NewStaticDir creates a new static directory and returns its dentry. -func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { +func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]Inode, fdOpts GenericDirectoryFDOptions) Inode { inode := &StaticDirectory{} - inode.Init(creds, devMajor, devMinor, ino, perm) - - dentry := &Dentry{} - dentry.Init(inode) + inode.Init(creds, devMajor, devMinor, ino, perm, fdOpts) + inode.EnableLeakCheck() inode.OrderedChildren.Init(OrderedChildrenOptions{}) - links := inode.OrderedChildren.Populate(dentry, children) + links := inode.OrderedChildren.Populate(children) inode.IncLinks(links) - return dentry + return inode } // Init initializes StaticDirectory. -func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { +func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, fdOpts GenericDirectoryFDOptions) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } + s.fdOpts = fdOpts s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm) } -// Open implements kernfs.Inode. -func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts) +// Open implements Inode.Open. +func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := NewGenericDirectoryFD(rp.Mount(), d, &s.OrderedChildren, &s.locks, &opts, s.fdOpts) if err != nil { return nil, err } @@ -597,10 +654,38 @@ func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credenti return syserror.EPERM } -// AlwaysValid partially implements kernfs.inodeDynamicLookup. -type AlwaysValid struct{} +// DecRef implements Inode.DecRef. +func (s *StaticDirectory) DecRef(ctx context.Context) { + s.StaticDirectoryRefs.DecRef(func() { s.Destroy(ctx) }) +} + +// InodeAlwaysValid partially implements Inode. +// +// +stateify savable +type InodeAlwaysValid struct{} -// Valid implements kernfs.inodeDynamicLookup. -func (*AlwaysValid) Valid(context.Context) bool { +// Valid implements Inode.Valid. +func (*InodeAlwaysValid) Valid(context.Context) bool { return true } + +// InodeTemporary partially implements Inode. +// +// +stateify savable +type InodeTemporary struct{} + +// Keep implements Inode.Keep. +func (*InodeTemporary) Keep() bool { + return false +} + +// InodeNoStatFS partially implements the Inode interface, where the client +// filesystem doesn't support statfs(2). +// +// +stateify savable +type InodeNoStatFS struct{} + +// StatFS implements Inode.StatFS. +func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return linux.Statfs{}, syserror.ENOSYS +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 51dbc050c..606081e68 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -29,12 +29,16 @@ // // Reference Model: // -// Kernfs dentries represents named pointers to inodes. Dentries and inode have +// Kernfs dentries represents named pointers to inodes. Kernfs is solely +// reponsible for maintaining and modifying its dentry tree; inode +// implementations can not access the tree. Dentries and inodes have // independent lifetimes and reference counts. A child dentry unconditionally // holds a reference on its parent directory's dentry. A dentry also holds a -// reference on the inode it points to. Multiple dentries can point to the same -// inode (for example, in the case of hardlinks). File descriptors hold a -// reference to the dentry they're opened on. +// reference on the inode it points to (although that might not be the only +// reference on the inode). Due to this inodes can outlive the dentries that +// point to them. Multiple dentries can point to the same inode (for example, +// in the case of hardlinks). File descriptors hold a reference to the dentry +// they're opened on. // // Dentries are guaranteed to exist while holding Filesystem.mu for // reading. Dropping dentries require holding Filesystem.mu for writing. To @@ -47,8 +51,8 @@ // kernfs.Dentry.dirMu // vfs.VirtualFilesystem.mountMu // vfs.Dentry.mu -// kernfs.Filesystem.droppedDentriesMu // (inode implementation locks, if any) +// kernfs.Filesystem.droppedDentriesMu package kernfs import ( @@ -57,7 +61,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -66,15 +69,17 @@ import ( // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory // filesystem. Concrete implementations are expected to embed this in their own // Filesystem type. +// +// +stateify savable type Filesystem struct { vfsfs vfs.Filesystem - droppedDentriesMu sync.Mutex + droppedDentriesMu sync.Mutex `state:"nosave"` // droppedDentries is a list of dentries waiting to be DecRef()ed. This is // used to defer dentry destruction until mu can be acquired for // writing. Protected by droppedDentriesMu. - droppedDentries []*vfs.Dentry + droppedDentries []*Dentry // mu synchronizes the lifetime of Dentries on this filesystem. Holding it // for reading guarantees continued existence of any resolved dentries, but @@ -93,11 +98,11 @@ type Filesystem struct { // example: // // fs.mu.RLock() - // fs.mu.processDeferredDecRefs() + // defer fs.processDeferredDecRefs() // defer fs.mu.RUnlock() // ... // fs.deferDecRef(dentry) - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` // nextInoMinusOne is used to to allocate inode numbers on this // filesystem. Must be accessed by atomic operations. @@ -106,9 +111,8 @@ type Filesystem struct { // deferDecRef defers dropping a dentry ref until the next call to // processDeferredDecRefs{,Locked}. See comment on Filesystem.mu. -// -// Precondition: d must not already be pending destruction. -func (fs *Filesystem) deferDecRef(d *vfs.Dentry) { +// This may be called while Filesystem.mu or Dentry.dirMu is locked. +func (fs *Filesystem) deferDecRef(d *Dentry) { fs.droppedDentriesMu.Lock() fs.droppedDentries = append(fs.droppedDentries, d) fs.droppedDentriesMu.Unlock() @@ -116,17 +120,14 @@ func (fs *Filesystem) deferDecRef(d *vfs.Dentry) { // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the // droppedDentries list. See comment on Filesystem.mu. +// +// Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked. func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) { - fs.mu.Lock() - fs.processDeferredDecRefsLocked(ctx) - fs.mu.Unlock() -} - -// Precondition: fs.mu must be held for writing. -func (fs *Filesystem) processDeferredDecRefsLocked(ctx context.Context) { fs.droppedDentriesMu.Lock() for _, d := range fs.droppedDentries { - d.DecRef(ctx) + // Defer the DecRef call so that we are not holding droppedDentriesMu + // when DecRef is called. + defer d.DecRef(ctx) } fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse. fs.droppedDentriesMu.Unlock() @@ -155,15 +156,19 @@ const ( // // A kernfs dentry is similar to a dentry in a traditional filesystem: it's a // named reference to an inode. A dentry generally lives as long as it's part of -// a mounted filesystem tree. Kernfs doesn't cache dentries once all references -// to them are removed. Dentries hold a single reference to the inode they point +// a mounted filesystem tree. Kernfs drops dentries once all references to them +// are dropped. Dentries hold a single reference to the inode they point // to, and child dentries hold a reference on their parent. // // Must be initialized by Init prior to first use. +// +// +stateify savable type Dentry struct { vfsd vfs.Dentry + DentryRefs - refs.AtomicRefCount + // fs is the owning filesystem. fs is immutable. + fs *Filesystem // flags caches useful information about the dentry from the inode. See the // dflags* consts above. Must be accessed by atomic ops. @@ -173,7 +178,11 @@ type Dentry struct { name string // dirMu protects children and the names of child Dentries. - dirMu sync.Mutex + // + // Note that holding fs.mu for writing is not sufficient; + // revalidateChildLocked(), which is a very hot path, may modify children with + // fs.mu acquired for reading only. + dirMu sync.Mutex `state:"nosave"` children map[string]*Dentry inode Inode @@ -184,8 +193,9 @@ type Dentry struct { // Precondition: Caller must hold a reference on inode. // // Postcondition: Caller's reference on inode is transferred to the dentry. -func (d *Dentry) Init(inode Inode) { +func (d *Dentry) Init(fs *Filesystem, inode Inode) { d.vfsd.Init(d) + d.fs = fs d.inode = inode ftype := inode.Mode().FileType() if ftype == linux.ModeDirectory { @@ -194,6 +204,7 @@ func (d *Dentry) Init(inode Inode) { if ftype == linux.ModeSymlink { d.flags |= dflagsIsSymlink } + d.EnableLeakCheck() } // VFSDentry returns the generic vfs dentry for this kernfs dentry. @@ -213,15 +224,27 @@ func (d *Dentry) isSymlink() bool { // DecRef implements vfs.DentryImpl.DecRef. func (d *Dentry) DecRef(ctx context.Context) { - d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy) -} - -// Precondition: Dentry must be removed from VFS' dentry cache. -func (d *Dentry) destroy(ctx context.Context) { - d.inode.DecRef(ctx) // IncRef from Init. - d.inode = nil - if d.parent != nil { - d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild. + decRefParent := false + d.fs.mu.Lock() + d.DentryRefs.DecRef(func() { + d.inode.DecRef(ctx) // IncRef from Init. + d.inode = nil + if d.parent != nil { + // We will DecRef d.parent once all locks are dropped. + decRefParent = true + d.parent.dirMu.Lock() + // Remove d from parent.children. It might already have been + // removed due to invalidation. + if _, ok := d.parent.children[d.name]; ok { + delete(d.parent.children, d.name) + d.fs.VFSFilesystem().VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry()) + } + d.parent.dirMu.Unlock() + } + }) + d.fs.mu.Unlock() + if decRefParent { + d.parent.DecRef(ctx) // IncRef from Dentry.insertChild. } } @@ -240,25 +263,26 @@ func (d *Dentry) Watches() *vfs.Watches { // OnZeroWatches implements vfs.Dentry.OnZeroWatches. func (d *Dentry) OnZeroWatches(context.Context) {} -// InsertChild inserts child into the vfs dentry cache with the given name under -// this dentry. This does not update the directory inode, so calling this on -// its own isn't sufficient to insert a child into a directory. InsertChild -// updates the link count on d if required. +// insertChild inserts child into the vfs dentry cache with the given name under +// this dentry. This does not update the directory inode, so calling this on its +// own isn't sufficient to insert a child into a directory. // // Precondition: d must represent a directory inode. -func (d *Dentry) InsertChild(name string, child *Dentry) { +func (d *Dentry) insertChild(name string, child *Dentry) { d.dirMu.Lock() d.insertChildLocked(name, child) d.dirMu.Unlock() } -// insertChildLocked is equivalent to InsertChild, with additional +// insertChildLocked is equivalent to insertChild, with additional // preconditions. // -// Precondition: d.dirMu must be locked. +// Preconditions: +// * d must represent a directory inode. +// * d.dirMu must be locked. func (d *Dentry) insertChildLocked(name string, child *Dentry) { if !d.isDir() { - panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d)) + panic(fmt.Sprintf("insertChildLocked called on non-directory Dentry: %+v.", d)) } d.IncRef() // DecRef in child's Dentry.destroy. child.parent = d @@ -289,7 +313,6 @@ func (d *Dentry) Inode() Inode { // // - Checking that dentries passed to methods are of the appropriate file type. // - Checking permissions. -// - Updating link and reference counts. // // Specific responsibilities of implementations are documented below. type Inode interface { @@ -299,7 +322,8 @@ type Inode interface { inodeRefs // Methods related to node metadata. A generic implementation is provided by - // InodeAttrs. + // InodeAttrs. Note that a concrete filesystem using kernfs is responsible for + // managing link counts. inodeMetadata // Method for inodes that represent symlink. InodeNotSymlink provides a @@ -310,18 +334,26 @@ type Inode interface { // a blanket implementation for all non-directory inodes. inodeDirectory - // Method for inodes that represent dynamic directories and their - // children. InodeNoDynamicLookup provides a blanket implementation for all - // non-dynamic-directory inodes. - inodeDynamicLookup - // Open creates a file description for the filesystem object represented by // this inode. The returned file description should hold a reference on the - // inode for its lifetime. + // dentry for its lifetime. // // Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing // the inode on which Open() is being called. - Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) + Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) + + // StatFS returns filesystem statistics for the client filesystem. This + // corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem + // doesn't support statfs(2), this should return ENOSYS. + StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) + + // Keep indicates whether the dentry created after Inode.Lookup should be + // kept in the kernfs dentry tree. + Keep() bool + + // Valid should return true if this inode is still valid, or needs to + // be resolved again by a call to Lookup. + Valid(ctx context.Context) bool } type inodeRefs interface { @@ -354,8 +386,8 @@ type inodeMetadata interface { // Precondition: All methods in this interface may only be called on directory // inodes. type inodeDirectory interface { - // The New{File,Dir,Node,Symlink} methods below should return a new inode - // hashed into this inode. + // The New{File,Dir,Node,Link,Symlink} methods below should return a new inode + // that will be hashed into the dentry tree. // // These inode constructors are inode-level operations rather than // filesystem-level operations to allow client filesystems to mix different @@ -366,63 +398,57 @@ type inodeDirectory interface { HasChildren() bool // NewFile creates a new regular file inode. - NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) + NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) // NewDir creates a new directory inode. - NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) + NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) // NewLink creates a new hardlink to a specified inode in this // directory. Implementations should create a new kernfs Dentry pointing to // target, and update target's link count. - NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error) + NewLink(ctx context.Context, name string, target Inode) (Inode, error) // NewSymlink creates a new symbolic link inode. - NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) + NewSymlink(ctx context.Context, name, target string) (Inode, error) // NewNode creates a new filesystem node for a mknod syscall. - NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) + NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) // Unlink removes a child dentry from this directory inode. - Unlink(ctx context.Context, name string, child *vfs.Dentry) error + Unlink(ctx context.Context, name string, child Inode) error // RmDir removes an empty child directory from this directory // inode. Implementations must update the parent directory's link count, // if required. Implementations are not responsible for checking that child // is a directory, checking for an empty directory. - RmDir(ctx context.Context, name string, child *vfs.Dentry) error + RmDir(ctx context.Context, name string, child Inode) error // Rename is called on the source directory containing an inode being // renamed. child should point to the resolved child in the source - // directory. If Rename replaces a dentry in the destination directory, it - // should return the replaced dentry or nil otherwise. + // directory. // // Precondition: Caller must serialize concurrent calls to Rename. - Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error) -} + Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error -type inodeDynamicLookup interface { - // Lookup should return an appropriate dentry if name should resolve to a - // child of this dynamic directory inode. This gives the directory an - // opportunity on every lookup to resolve additional entries that aren't - // hashed into the directory. This is only called when the inode is a - // directory. If the inode is not a directory, or if the directory only - // contains a static set of children, the implementer can unconditionally - // return an appropriate error (ENOTDIR and ENOENT respectively). + // Lookup should return an appropriate inode if name should resolve to a + // child of this directory inode. This gives the directory an opportunity + // on every lookup to resolve additional entries. This is only called when + // the inode is a directory. // - // The child returned by Lookup will be hashed into the VFS dentry tree. Its - // lifetime can be controlled by the filesystem implementation with an - // appropriate implementation of Valid. + // The child returned by Lookup will be hashed into the VFS dentry tree, + // atleast for the duration of the current FS operation. // - // Lookup returns the child with an extra reference and the caller owns this - // reference. - Lookup(ctx context.Context, name string) (*vfs.Dentry, error) - - // Valid should return true if this inode is still valid, or needs to - // be resolved again by a call to Lookup. - Valid(ctx context.Context) bool + // Lookup must return the child with an extra reference whose ownership is + // transferred to the dentry that is created to point to that inode. If + // Inode.Keep returns false, that new dentry will be dropped at the end of + // the current filesystem operation (before returning back to the VFS + // layer) if no other ref is picked on that dentry. If Inode.Keep returns + // true, then the dentry will be cached into the dentry tree until it is + // Unlink'd or RmDir'd. + Lookup(ctx context.Context, name string) (Inode, error) // IterDirents is used to iterate over dynamically created entries. It invokes - // cb on each entry in the directory represented by the FileDescription. + // cb on each entry in the directory represented by the Inode. // 'offset' is the offset for the entire IterDirents call, which may include // results from the caller (e.g. "." and ".."). 'relOffset' is the offset // inside the entries returned by this IterDirents invocation. In other words, @@ -434,7 +460,7 @@ type inodeDynamicLookup interface { type inodeSymlink interface { // Readlink returns the target of a symbolic link. If an inode is not a // symlink, the implementation should return EINVAL. - Readlink(ctx context.Context) (string, error) + Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) // Getlink returns the target of a symbolic link, as used by path // resolution: diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index c5d5afedf..82fa19c03 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -36,7 +36,7 @@ const staticFileContent = "This is sample content for a static test file." // RootDentryFn is a generator function for creating the root dentry of a test // filesystem. See newTestSystem. -type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry +type RootDentryFn func(*auth.Credentials, *filesystem) kernfs.Inode // newTestSystem sets up a minimal environment for running a test, including an // instance of a test filesystem. Tests can control the contents of the @@ -52,7 +52,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System { v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{}) + mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.MountOptions{}) if err != nil { t.Fatalf("Failed to create testfs root mount: %v", err) } @@ -72,14 +72,11 @@ type file struct { content string } -func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry { +func (fs *filesystem) newFile(creds *auth.Credentials, content string) kernfs.Inode { f := &file{} f.content = content f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777) - - d := &kernfs.Dentry{} - d.Init(f) - return d + return f } func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error { @@ -96,100 +93,110 @@ func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.S } type readonlyDir struct { + readonlyDirRefs attrs - kernfs.InodeNotSymlink - kernfs.InodeNoDynamicLookup + kernfs.InodeAlwaysValid kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNoStatFS + kernfs.InodeNotSymlink + kernfs.InodeTemporary kernfs.OrderedChildren locks vfs.FileLocks - - dentry kernfs.Dentry } -func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { +func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { dir := &readonlyDir{} dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - dir.dentry.Init(dir) - - dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) - - return &dir.dentry + dir.EnableLeakCheck() + dir.IncLinks(dir.OrderedChildren.Populate(contents)) + return dir } -func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts) +func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } +func (d *readonlyDir) DecRef(ctx context.Context) { + d.readonlyDirRefs.DecRef(func() { d.Destroy(ctx) }) +} + type dir struct { + dirRefs attrs + kernfs.InodeAlwaysValid kernfs.InodeNotSymlink - kernfs.InodeNoDynamicLookup + kernfs.InodeNoStatFS + kernfs.InodeTemporary kernfs.OrderedChildren locks vfs.FileLocks - fs *filesystem - dentry kernfs.Dentry + fs *filesystem } -func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { +func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { dir := &dir{} dir.fs = fs dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) - dir.dentry.Init(dir) + dir.EnableLeakCheck() - dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) - - return &dir.dentry + dir.IncLinks(dir.OrderedChildren.Populate(contents)) + return dir } -func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts) +func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndStaticEntries, + }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } -func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) { +func (d *dir) DecRef(ctx context.Context) { + d.dirRefs.DecRef(func() { d.Destroy(ctx) }) +} + +func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { creds := auth.CredentialsFromContext(ctx) dir := d.fs.newDir(creds, opts.Mode, nil) - dirVFSD := dir.VFSDentry() - if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil { + if err := d.OrderedChildren.Insert(name, dir); err != nil { dir.DecRef(ctx) return nil, err } d.IncLinks(1) - return dirVFSD, nil + return dir, nil } -func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) { +func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) { creds := auth.CredentialsFromContext(ctx) f := d.fs.newFile(creds, "") - fVFSD := f.VFSDentry() - if err := d.OrderedChildren.Insert(name, fVFSD); err != nil { + if err := d.OrderedChildren.Insert(name, f); err != nil { f.DecRef(ctx) return nil, err } - return fVFSD, nil + return f, nil } -func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) { +func (*dir) NewLink(context.Context, string, kernfs.Inode) (kernfs.Inode, error) { return nil, syserror.EPERM } -func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (*dir) NewSymlink(context.Context, string, string) (kernfs.Inode, error) { return nil, syserror.EPERM } -func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (kernfs.Inode, error) { return nil, syserror.EPERM } @@ -197,18 +204,22 @@ func (fsType) Name() string { return "kernfs" } +func (fsType) Release(ctx context.Context) {} + func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fs := &filesystem{} fs.VFSFilesystem().Init(vfsObj, &fst, fs) root := fst.rootFn(creds, fs) - return fs.VFSFilesystem(), root.VFSDentry(), nil + var d kernfs.Dentry + d.Init(&fs.Filesystem, root) + return fs.VFSFilesystem(), d.VFSDentry(), nil } // -------------------- Remainder of the file are test cases -------------------- func TestBasic(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ "file1": fs.newFile(creds, staticFileContent), }) }) @@ -217,8 +228,8 @@ func TestBasic(t *testing.T) { } func TestMkdirGetDentry(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ "dir1": fs.newDir(creds, 0755, nil), }) }) @@ -232,8 +243,8 @@ func TestMkdirGetDentry(t *testing.T) { } func TestReadStaticFile(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ "file1": fs.newFile(creds, staticFileContent), }) }) @@ -258,8 +269,8 @@ func TestReadStaticFile(t *testing.T) { } func TestCreateNewFileInStaticDir(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ "dir1": fs.newDir(creds, 0755, nil), }) }) @@ -285,7 +296,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) { } func TestDirFDReadWrite(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { return fs.newReadonlyDir(creds, 0755, nil) }) defer sys.Destroy() @@ -309,11 +320,11 @@ func TestDirFDReadWrite(t *testing.T) { } func TestDirFDIterDirents(t *testing.T) { - sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { - return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) kernfs.Inode { + return fs.newReadonlyDir(creds, 0755, map[string]kernfs.Inode{ // Fill root with nodes backed by various inode implementations. "dir1": fs.newReadonlyDir(creds, 0755, nil), - "dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{ + "dir2": fs.newDir(creds, 0755, map[string]kernfs.Inode{ "dir3": fs.newDir(creds, 0755, nil), }), "file1": fs.newFile(creds, staticFileContent), diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 2ab3f53fd..934cc6c9e 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -24,10 +24,13 @@ import ( // StaticSymlink provides an Inode implementation for symlinks that point to // a immutable target. +// +// +stateify savable type StaticSymlink struct { InodeAttrs InodeNoopRefCount InodeSymlink + InodeNoStatFS target string } @@ -35,13 +38,10 @@ type StaticSymlink struct { var _ Inode = (*StaticSymlink)(nil) // NewStaticSymlink creates a new symlink file pointing to 'target'. -func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry { +func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) Inode { inode := &StaticSymlink{} inode.Init(creds, devMajor, devMinor, ino, target) - - d := &Dentry{} - d.Init(inode) - return d + return inode } // Init initializes the instance. @@ -50,8 +50,8 @@ func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777) } -// Readlink implements Inode. -func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { +// Readlink implements Inode.Readlink. +func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) { return s.target, nil } diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go new file mode 100644 index 000000000..d0ed17b18 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go @@ -0,0 +1,112 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// syntheticDirectory implements kernfs.Inode for a directory created by +// MkdirAt(ForSyntheticMountpoint=true). +// +// +stateify savable +type syntheticDirectory struct { + InodeAlwaysValid + InodeAttrs + InodeNoStatFS + InodeNotSymlink + OrderedChildren + syntheticDirectoryRefs + + locks vfs.FileLocks +} + +var _ Inode = (*syntheticDirectory)(nil) + +func newSyntheticDirectory(creds *auth.Credentials, perm linux.FileMode) Inode { + inode := &syntheticDirectory{} + inode.Init(creds, 0 /* devMajor */, 0 /* devMinor */, 0 /* ino */, perm) + return inode +} + +func (dir *syntheticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("perm contains non-permission bits: %#o", perm)) + } + dir.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.S_IFDIR|perm) + dir.OrderedChildren.Init(OrderedChildrenOptions{ + Writable: true, + }) +} + +// Open implements Inode.Open. +func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := NewGenericDirectoryFD(rp.Mount(), d, &dir.OrderedChildren, &dir.locks, &opts, GenericDirectoryFDOptions{}) + if err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// NewFile implements Inode.NewFile. +func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) { + return nil, syserror.EPERM +} + +// NewDir implements Inode.NewDir. +func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) { + if !opts.ForSyntheticMountpoint { + return nil, syserror.EPERM + } + subdirI := newSyntheticDirectory(auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask) + if err := dir.OrderedChildren.Insert(name, subdirI); err != nil { + subdirI.DecRef(ctx) + return nil, err + } + return subdirI, nil +} + +// NewLink implements Inode.NewLink. +func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (Inode, error) { + return nil, syserror.EPERM +} + +// NewSymlink implements Inode.NewSymlink. +func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (Inode, error) { + return nil, syserror.EPERM +} + +// NewNode implements Inode.NewNode. +func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) { + return nil, syserror.EPERM +} + +// DecRef implements Inode.DecRef. +func (dir *syntheticDirectory) DecRef(ctx context.Context) { + dir.syntheticDirectoryRefs.DecRef(func() { dir.Destroy(ctx) }) +} + +// Keep implements Inode.Keep. This is redundant because inodes will never be +// created via Lookup and inodes are always valid. Makes sense to return true +// because these inodes are not temporary and should only be removed on RmDir. +func (dir *syntheticDirectory) Keep() bool { + return true +} |