summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorgVisor bot <gvisor-bot@google.com>2021-08-12 00:26:22 +0000
committergVisor bot <gvisor-bot@google.com>2021-08-12 00:26:22 +0000
commit3486648e75a831840219939f41ac89ef64dea0f3 (patch)
treea7afe5244eed1fdfc9d3ccb550bf526c45e35a83
parentddd6507953d90452314553fedce6bbf7f372c635 (diff)
parenta50596874a4971167f97a05181363e91292a2885 (diff)
Merge release-20210726.0-50-ga50596874 (automated)
-rw-r--r--pkg/sentry/fsimpl/cgroupfs/base.go9
-rw-r--r--pkg/sentry/fsimpl/cgroupfs/cgroupfs.go181
-rw-r--r--pkg/sentry/fsimpl/cgroupfs/cgroupfs_state_autogen.go25
-rw-r--r--pkg/sentry/fsimpl/kernfs/inode_impl_util.go38
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs.go67
-rw-r--r--pkg/sentry/kernel/cgroup.go1
6 files changed, 287 insertions, 34 deletions
diff --git a/pkg/sentry/fsimpl/cgroupfs/base.go b/pkg/sentry/fsimpl/cgroupfs/base.go
index 4290ffe0d..71bb0a9c8 100644
--- a/pkg/sentry/fsimpl/cgroupfs/base.go
+++ b/pkg/sentry/fsimpl/cgroupfs/base.go
@@ -88,7 +88,6 @@ type controller interface {
// +stateify savable
type cgroupInode struct {
dir
- fs *filesystem
// ts is the list of tasks in this cgroup. The kernel is responsible for
// removing tasks from this list before they're destroyed, so any tasks on
@@ -102,9 +101,10 @@ var _ kernel.CgroupImpl = (*cgroupInode)(nil)
func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
c := &cgroupInode{
- fs: fs,
- ts: make(map[*kernel.Task]struct{}),
+ dir: dir{fs: fs},
+ ts: make(map[*kernel.Task]struct{}),
}
+ c.dir.cgi = c
contents := make(map[string]kernfs.Inode)
contents["cgroup.procs"] = fs.newControllerFile(ctx, creds, &cgroupProcsData{c})
@@ -115,8 +115,7 @@ func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credential
}
c.dir.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555))
- c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- c.dir.InitRefs()
+ c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
c.dir.IncLinks(c.dir.OrderedChildren.Populate(contents))
atomic.AddUint64(&fs.numCgroups, 1)
diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
index 22c8b7fda..edc3b50b9 100644
--- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
+++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
@@ -32,7 +32,8 @@
// controllers associated with them.
//
// Since cgroupfs doesn't allow hardlinks, there is a unique mapping between
-// cgroupfs dentries and inodes.
+// cgroupfs dentries and inodes. Thus, cgroupfs inodes don't need to be ref
+// counted and exist until they're unlinked once or the FS is destroyed.
//
// # Synchronization
//
@@ -48,10 +49,11 @@
// Lock order:
//
// kernel.CgroupRegistry.mu
-// cgroupfs.filesystem.mu
-// kernel.TaskSet.mu
-// kernel.Task.mu
-// cgroupfs.filesystem.tasksMu.
+// kernfs.filesystem.mu
+// kernel.TaskSet.mu
+// kernel.Task.mu
+// cgroupfs.filesystem.tasksMu.
+// cgroupfs.dir.OrderedChildren.mu
package cgroupfs
import (
@@ -63,6 +65,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
+ "gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -108,6 +111,7 @@ type FilesystemType struct{}
// +stateify savable
type InternalData struct {
DefaultControlValues map[string]int64
+ InitialCgroupPath string
}
// filesystem implements vfs.FilesystemImpl and kernel.cgroupFS.
@@ -134,6 +138,11 @@ type filesystem struct {
numCgroups uint64 // Protected by atomic ops.
root *kernfs.Dentry
+ // effectiveRoot is the initial cgroup new tasks are created in. Unless
+ // overwritten by internal mount options, root == effectiveRoot. If
+ // effectiveRoot != root, an extra reference is held on effectiveRoot for
+ // the lifetime of the filesystem.
+ effectiveRoot *kernfs.Dentry
// tasksMu serializes task membership changes across all cgroups within a
// filesystem.
@@ -229,6 +238,9 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
fs := vfsfs.Impl().(*filesystem)
ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID)
fs.root.IncRef()
+ if fs.effectiveRoot != fs.root {
+ fs.effectiveRoot.IncRef()
+ }
return vfsfs, fs.root.VFSDentry(), nil
}
@@ -245,8 +257,8 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
var defaults map[string]int64
if opts.InternalData != nil {
- ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults)
defaults = opts.InternalData.(*InternalData).DefaultControlValues
+ ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults)
}
for _, ty := range wantControllers {
@@ -286,6 +298,14 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
var rootD kernfs.Dentry
rootD.InitRoot(&fs.Filesystem, root)
fs.root = &rootD
+ fs.effectiveRoot = fs.root
+
+ if err := fs.prepareInitialCgroup(ctx, vfsObj, opts); err != nil {
+ ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: failed to prepare initial cgroup: %v", err)
+ rootD.DecRef(ctx)
+ fs.VFSFilesystem().DecRef(ctx)
+ return nil, nil, err
+ }
// Register controllers. The registry may be modified concurrently, so if we
// get an error, we raced with someone else who registered the same
@@ -303,10 +323,47 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
return fs.VFSFilesystem(), rootD.VFSDentry(), nil
}
+// prepareInitialCgroup creates the initial cgroup according to opts. An initial
+// cgroup is optional, and if not specified, this function is a no-op.
+func (fs *filesystem) prepareInitialCgroup(ctx context.Context, vfsObj *vfs.VirtualFilesystem, opts vfs.GetFilesystemOptions) error {
+ if opts.InternalData == nil {
+ return nil
+ }
+ initPathStr := opts.InternalData.(*InternalData).InitialCgroupPath
+ if initPathStr == "" {
+ return nil
+ }
+ ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path: %v", initPathStr)
+ initPath := fspath.Parse(initPathStr)
+ if !initPath.Absolute || !initPath.HasComponents() {
+ ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path invalid: %+v", initPath)
+ return linuxerr.EINVAL
+ }
+
+ // Have initial cgroup target, create the tree.
+ cgDir := fs.root.Inode().(*cgroupInode)
+ for pit := initPath.Begin; pit.Ok(); pit = pit.Next() {
+ cgDirI, err := cgDir.NewDir(ctx, pit.String(), vfs.MkdirOptions{})
+ if err != nil {
+ return err
+ }
+ cgDir = cgDirI.(*cgroupInode)
+ }
+
+ // Walk to target dentry.
+ initDentry, err := fs.root.WalkDentryTree(ctx, vfsObj, initPath)
+ if err != nil {
+ ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup dentry not found: %v", err)
+ return linuxerr.ENOENT
+ }
+ fs.effectiveRoot = initDentry // Reference from WalkDentryTree transferred here.
+ return nil
+}
+
func (fs *filesystem) rootCgroup() kernel.Cgroup {
return kernel.Cgroup{
- Dentry: fs.root,
- CgroupImpl: fs.root.Inode().(kernel.CgroupImpl),
+ Dentry: fs.effectiveRoot,
+ CgroupImpl: fs.effectiveRoot.Inode().(kernel.CgroupImpl),
}
}
@@ -320,6 +377,10 @@ func (fs *filesystem) Release(ctx context.Context) {
r.Unregister(fs.hierarchyID)
}
+ if fs.root != fs.effectiveRoot {
+ fs.effectiveRoot.DecRef(ctx)
+ }
+
fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
fs.Filesystem.Release(ctx)
}
@@ -346,15 +407,18 @@ func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error
//
// +stateify savable
type dir struct {
- dirRefs
+ kernfs.InodeNoopRefCount
kernfs.InodeAlwaysValid
kernfs.InodeAttrs
kernfs.InodeNotSymlink
- kernfs.InodeDirectoryNoNewChildren // TODO(b/183137098): Implement mkdir.
+ kernfs.InodeDirectoryNoNewChildren
kernfs.OrderedChildren
implStatFS
locks vfs.FileLocks
+
+ fs *filesystem // Immutable.
+ cgi *cgroupInode // Immutable.
}
// Keep implements kernfs.Inode.Keep.
@@ -378,9 +442,100 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry
return fd.VFSFileDescription(), nil
}
-// DecRef implements kernfs.Inode.DecRef.
-func (d *dir) DecRef(ctx context.Context) {
- d.dirRefs.DecRef(func() { d.Destroy(ctx) })
+// NewDir implements kernfs.Inode.NewDir.
+func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
+ // "Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable."
+ // -- Linux, kernel/cgroup.c:cgroup_mkdir().
+ if strings.Contains(name, "\n") {
+ return nil, linuxerr.EINVAL
+ }
+ return d.OrderedChildren.Inserter(name, func() kernfs.Inode {
+ d.IncLinks(1)
+ return d.fs.newCgroupInode(ctx, auth.CredentialsFromContext(ctx))
+ })
+}
+
+// Rename implements kernfs.Inode.Rename. Cgroupfs only allows renaming of
+// cgroup directories, and the rename may only change the name within the same
+// parent. See linux, kernel/cgroup.c:cgroup_rename().
+func (d *dir) Rename(ctx context.Context, oldname, newname string, child, dst kernfs.Inode) error {
+ if _, ok := child.(*cgroupInode); !ok {
+ // Not a cgroup directory. Control files are backed by different types.
+ return linuxerr.ENOTDIR
+ }
+
+ dstCGInode, ok := dst.(*cgroupInode)
+ if !ok {
+ // Not a cgroup inode, so definitely can't be *this* inode.
+ return linuxerr.EIO
+ }
+ // Note: We're intentionally comparing addresses, since two different dirs
+ // could plausibly be identical in memory, but would occupy different
+ // locations in memory.
+ if d != &dstCGInode.dir {
+ // Destination dir is a different cgroup inode. Cross directory renames
+ // aren't allowed.
+ return linuxerr.EIO
+ }
+
+ // Rename moves oldname to newname within d. Proceed.
+ return d.OrderedChildren.Rename(ctx, oldname, newname, child, dst)
+}
+
+// Unlink implements kernfs.Inode.Unlink. Cgroupfs disallows unlink, as the only
+// files in the filesystem are control files, which can't be deleted.
+func (d *dir) Unlink(ctx context.Context, name string, child kernfs.Inode) error {
+ return linuxerr.EPERM
+}
+
+// hasChildrenLocked returns whether the cgroup dir contains any objects that
+// prevent it from being deleted.
+func (d *dir) hasChildrenLocked() bool {
+ // Subdirs take a link on the parent, so checks if there are any direct
+ // children cgroups. Exclude the dir's self link and the link from ".".
+ if d.InodeAttrs.Links()-2 > 0 {
+ return true
+ }
+ return len(d.cgi.ts) > 0
+}
+
+// HasChildren implements kernfs.Inode.HasChildren.
+//
+// The empty check for a cgroupfs directory is unlike a regular directory since
+// a cgroupfs directory will always have control files. A cgroupfs directory can
+// be deleted if cgroup contains no tasks and has no sub-cgroups.
+func (d *dir) HasChildren() bool {
+ d.fs.tasksMu.RLock()
+ defer d.fs.tasksMu.RUnlock()
+ return d.hasChildrenLocked()
+}
+
+// RmDir implements kernfs.Inode.RmDir.
+func (d *dir) RmDir(ctx context.Context, name string, child kernfs.Inode) error {
+ // Unlike a normal directory, we need to recheck if d is empty again, since
+ // vfs/kernfs can't stop tasks from entering or leaving the cgroup.
+ d.fs.tasksMu.RLock()
+ defer d.fs.tasksMu.RUnlock()
+
+ cgi, ok := child.(*cgroupInode)
+ if !ok {
+ return linuxerr.ENOTDIR
+ }
+ if cgi.dir.hasChildrenLocked() {
+ return linuxerr.ENOTEMPTY
+ }
+
+ // Disallow deletion of the effective root cgroup.
+ if cgi == d.fs.effectiveRoot.Inode().(*cgroupInode) {
+ ctx.Warningf("Cannot delete initial cgroup for new tasks %q", d.fs.effectiveRoot.FSLocalPath())
+ return linuxerr.EBUSY
+ }
+
+ err := d.OrderedChildren.RmDir(ctx, name, child)
+ if err == nil {
+ d.InodeAttrs.DecLinks()
+ }
+ return err
}
// controllerFile represents a generic control file that appears within a cgroup
diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs_state_autogen.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs_state_autogen.go
index 3142ab6f8..aa40bb193 100644
--- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs_state_autogen.go
+++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs_state_autogen.go
@@ -41,7 +41,6 @@ func (c *cgroupInode) StateTypeName() string {
func (c *cgroupInode) StateFields() []string {
return []string{
"dir",
- "fs",
"ts",
}
}
@@ -52,8 +51,7 @@ func (c *cgroupInode) beforeSave() {}
func (c *cgroupInode) StateSave(stateSinkObject state.Sink) {
c.beforeSave()
stateSinkObject.Save(0, &c.dir)
- stateSinkObject.Save(1, &c.fs)
- stateSinkObject.Save(2, &c.ts)
+ stateSinkObject.Save(1, &c.ts)
}
func (c *cgroupInode) afterLoad() {}
@@ -61,8 +59,7 @@ func (c *cgroupInode) afterLoad() {}
// +checklocksignore
func (c *cgroupInode) StateLoad(stateSourceObject state.Source) {
stateSourceObject.Load(0, &c.dir)
- stateSourceObject.Load(1, &c.fs)
- stateSourceObject.Load(2, &c.ts)
+ stateSourceObject.Load(1, &c.ts)
}
func (d *cgroupProcsData) StateTypeName() string {
@@ -143,6 +140,7 @@ func (i *InternalData) StateTypeName() string {
func (i *InternalData) StateFields() []string {
return []string{
"DefaultControlValues",
+ "InitialCgroupPath",
}
}
@@ -152,6 +150,7 @@ func (i *InternalData) beforeSave() {}
func (i *InternalData) StateSave(stateSinkObject state.Sink) {
i.beforeSave()
stateSinkObject.Save(0, &i.DefaultControlValues)
+ stateSinkObject.Save(1, &i.InitialCgroupPath)
}
func (i *InternalData) afterLoad() {}
@@ -159,6 +158,7 @@ func (i *InternalData) afterLoad() {}
// +checklocksignore
func (i *InternalData) StateLoad(stateSourceObject state.Source) {
stateSourceObject.Load(0, &i.DefaultControlValues)
+ stateSourceObject.Load(1, &i.InitialCgroupPath)
}
func (fs *filesystem) StateTypeName() string {
@@ -174,6 +174,7 @@ func (fs *filesystem) StateFields() []string {
"kcontrollers",
"numCgroups",
"root",
+ "effectiveRoot",
}
}
@@ -189,6 +190,7 @@ func (fs *filesystem) StateSave(stateSinkObject state.Sink) {
stateSinkObject.Save(4, &fs.kcontrollers)
stateSinkObject.Save(5, &fs.numCgroups)
stateSinkObject.Save(6, &fs.root)
+ stateSinkObject.Save(7, &fs.effectiveRoot)
}
func (fs *filesystem) afterLoad() {}
@@ -202,6 +204,7 @@ func (fs *filesystem) StateLoad(stateSourceObject state.Source) {
stateSourceObject.Load(4, &fs.kcontrollers)
stateSourceObject.Load(5, &fs.numCgroups)
stateSourceObject.Load(6, &fs.root)
+ stateSourceObject.Load(7, &fs.effectiveRoot)
}
func (i *implStatFS) StateTypeName() string {
@@ -231,7 +234,7 @@ func (d *dir) StateTypeName() string {
func (d *dir) StateFields() []string {
return []string{
- "dirRefs",
+ "InodeNoopRefCount",
"InodeAlwaysValid",
"InodeAttrs",
"InodeNotSymlink",
@@ -239,6 +242,8 @@ func (d *dir) StateFields() []string {
"OrderedChildren",
"implStatFS",
"locks",
+ "fs",
+ "cgi",
}
}
@@ -247,7 +252,7 @@ func (d *dir) beforeSave() {}
// +checklocksignore
func (d *dir) StateSave(stateSinkObject state.Sink) {
d.beforeSave()
- stateSinkObject.Save(0, &d.dirRefs)
+ stateSinkObject.Save(0, &d.InodeNoopRefCount)
stateSinkObject.Save(1, &d.InodeAlwaysValid)
stateSinkObject.Save(2, &d.InodeAttrs)
stateSinkObject.Save(3, &d.InodeNotSymlink)
@@ -255,13 +260,15 @@ func (d *dir) StateSave(stateSinkObject state.Sink) {
stateSinkObject.Save(5, &d.OrderedChildren)
stateSinkObject.Save(6, &d.implStatFS)
stateSinkObject.Save(7, &d.locks)
+ stateSinkObject.Save(8, &d.fs)
+ stateSinkObject.Save(9, &d.cgi)
}
func (d *dir) afterLoad() {}
// +checklocksignore
func (d *dir) StateLoad(stateSourceObject state.Source) {
- stateSourceObject.Load(0, &d.dirRefs)
+ stateSourceObject.Load(0, &d.InodeNoopRefCount)
stateSourceObject.Load(1, &d.InodeAlwaysValid)
stateSourceObject.Load(2, &d.InodeAttrs)
stateSourceObject.Load(3, &d.InodeNotSymlink)
@@ -269,6 +276,8 @@ func (d *dir) StateLoad(stateSourceObject state.Source) {
stateSourceObject.Load(5, &d.OrderedChildren)
stateSourceObject.Load(6, &d.implStatFS)
stateSourceObject.Load(7, &d.locks)
+ stateSourceObject.Load(8, &d.fs)
+ stateSourceObject.Load(9, &d.cgi)
}
func (c *controllerFile) StateTypeName() string {
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index a42fc79b4..b96dc9ef7 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -26,7 +26,6 @@ import (
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
- "gvisor.dev/gvisor/pkg/syserror"
)
// InodeNoopRefCount partially implements the Inode interface, specifically the
@@ -234,6 +233,11 @@ func (a *InodeAttrs) Mode() linux.FileMode {
return linux.FileMode(atomic.LoadUint32(&a.mode))
}
+// Links returns the link count.
+func (a *InodeAttrs) Links() uint32 {
+ return atomic.LoadUint32(&a.nlink)
+}
+
// TouchAtime updates a.atime to the current time.
func (a *InodeAttrs) TouchAtime(ctx context.Context, mnt *vfs.Mount) {
if mnt.Flags.NoATime || mnt.ReadOnly() {
@@ -289,7 +293,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
return linuxerr.EPERM
}
if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() {
- return syserror.EISDIR
+ return linuxerr.EISDIR
}
if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
return err
@@ -475,7 +479,7 @@ func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error
s, ok := o.set[name]
if !ok {
- return nil, syserror.ENOENT
+ return nil, linuxerr.ENOENT
}
s.inode.IncRef() // This ref is passed to the dentry upon creation via Init.
@@ -502,6 +506,30 @@ func (o *OrderedChildren) Insert(name string, child Inode) error {
return o.insert(name, child, false)
}
+// Inserter is like Insert, but obtains the child to insert by calling
+// makeChild. makeChild is only called if the insert will succeed. This allows
+// the caller to atomically check and insert a child without having to
+// clean up the child on failure.
+func (o *OrderedChildren) Inserter(name string, makeChild func() Inode) (Inode, error) {
+ o.mu.Lock()
+ defer o.mu.Unlock()
+ if _, ok := o.set[name]; ok {
+ return nil, linuxerr.EEXIST
+ }
+
+ // Note: We must not fail after we call makeChild().
+
+ child := makeChild()
+ s := &slot{
+ name: name,
+ inode: child,
+ static: false,
+ }
+ o.order.PushBack(s)
+ o.set[name] = s
+ return child, nil
+}
+
// insert inserts child into o.
//
// Precondition: Caller must be holding a ref on child if static is true.
@@ -559,7 +587,7 @@ func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, n
func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error {
s, ok := o.set[name]
if !ok {
- return syserror.ENOENT
+ return linuxerr.ENOENT
}
if s.inode != child {
panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child))
@@ -746,5 +774,5 @@ type InodeNoStatFS struct{}
// StatFS implements Inode.StatFS.
func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
- return linux.Statfs{}, syserror.ENOSYS
+ return linux.Statfs{}, linuxerr.ENOSYS
}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 0e2867d49..90c8b75d1 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -66,6 +66,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
)
// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
@@ -542,6 +543,63 @@ func (d *Dentry) FSLocalPath() string {
return b.String()
}
+// WalkDentryTree traverses p in the dentry tree for this filesystem. Note that
+// this only traverses the dentry tree and is not a general path traversal. No
+// symlinks and dynamic children are resolved, and no permission checks are
+// performed. The caller is responsible for ensuring the returned Dentry exists
+// for an appropriate lifetime.
+//
+// p is interpreted starting at d, and may be absolute or relative (absolute vs
+// relative paths both refer to the same target here, since p is absolute from
+// d). p may contain "." and "..", but will not allow traversal above d (similar
+// to ".." at the root dentry).
+//
+// This is useful for filesystem internals, where the filesystem may not be
+// mounted yet. For a mounted filesystem, use GetDentryAt.
+func (d *Dentry) WalkDentryTree(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (*Dentry, error) {
+ d.fs.mu.RLock()
+ defer d.fs.processDeferredDecRefs(ctx)
+ defer d.fs.mu.RUnlock()
+
+ target := d
+
+ for pit := p.Begin; pit.Ok(); pit = pit.Next() {
+ pc := pit.String()
+
+ switch {
+ case target == nil:
+ return nil, syserror.ENOENT
+ case pc == ".":
+ // No-op, consume component and continue.
+ case pc == "..":
+ if target == d {
+ // Don't let .. traverse above the start point of the walk.
+ continue
+ }
+ target = target.parent
+ // Parent doesn't need revalidation since we revalidated it on the
+ // way to the child, and we're still holding fs.mu.
+ default:
+ var err error
+
+ d.dirMu.Lock()
+ target, err = d.fs.revalidateChildLocked(ctx, vfsObj, target, pc, target.children[pc])
+ d.dirMu.Unlock()
+
+ if err != nil {
+ return nil, err
+ }
+ }
+ }
+
+ if target == nil {
+ return nil, syserror.ENOENT
+ }
+
+ target.IncRef()
+ return target, nil
+}
+
// The Inode interface maps filesystem-level operations that operate on paths to
// equivalent operations on specific filesystem nodes.
//
@@ -667,12 +725,15 @@ type inodeDirectory interface {
// RmDir removes an empty child directory from this directory
// inode. Implementations must update the parent directory's link count,
// if required. Implementations are not responsible for checking that child
- // is a directory, checking for an empty directory.
+ // is a directory, or checking for an empty directory.
RmDir(ctx context.Context, name string, child Inode) error
// Rename is called on the source directory containing an inode being
- // renamed. child should point to the resolved child in the source
- // directory.
+ // renamed. child points to the resolved child in the source directory.
+ // dstDir is guaranteed to be a directory inode.
+ //
+ // On a successful call to Rename, the caller updates the dentry tree to
+ // reflect the name change.
//
// Precondition: Caller must serialize concurrent calls to Rename.
Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error
diff --git a/pkg/sentry/kernel/cgroup.go b/pkg/sentry/kernel/cgroup.go
index c93ef6ac1..a0e291f58 100644
--- a/pkg/sentry/kernel/cgroup.go
+++ b/pkg/sentry/kernel/cgroup.go
@@ -196,6 +196,7 @@ func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Files
// uniqueness of controllers enforced by Register, drop the
// dying hierarchy now. The eventual unregister by the FS
// teardown will become a no-op.
+ r.unregisterLocked(h.id)
return nil
}
return h.fs