diff options
author | gVisor bot <gvisor-bot@google.com> | 2021-08-12 00:26:22 +0000 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2021-08-12 00:26:22 +0000 |
commit | 3486648e75a831840219939f41ac89ef64dea0f3 (patch) | |
tree | a7afe5244eed1fdfc9d3ccb550bf526c45e35a83 /pkg/sentry/fsimpl/cgroupfs | |
parent | ddd6507953d90452314553fedce6bbf7f372c635 (diff) | |
parent | a50596874a4971167f97a05181363e91292a2885 (diff) |
Merge release-20210726.0-50-ga50596874 (automated)
Diffstat (limited to 'pkg/sentry/fsimpl/cgroupfs')
-rw-r--r-- | pkg/sentry/fsimpl/cgroupfs/base.go | 9 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/cgroupfs/cgroupfs.go | 181 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/cgroupfs/cgroupfs_state_autogen.go | 25 |
3 files changed, 189 insertions, 26 deletions
diff --git a/pkg/sentry/fsimpl/cgroupfs/base.go b/pkg/sentry/fsimpl/cgroupfs/base.go index 4290ffe0d..71bb0a9c8 100644 --- a/pkg/sentry/fsimpl/cgroupfs/base.go +++ b/pkg/sentry/fsimpl/cgroupfs/base.go @@ -88,7 +88,6 @@ type controller interface { // +stateify savable type cgroupInode struct { dir - fs *filesystem // ts is the list of tasks in this cgroup. The kernel is responsible for // removing tasks from this list before they're destroyed, so any tasks on @@ -102,9 +101,10 @@ var _ kernel.CgroupImpl = (*cgroupInode)(nil) func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode { c := &cgroupInode{ - fs: fs, - ts: make(map[*kernel.Task]struct{}), + dir: dir{fs: fs}, + ts: make(map[*kernel.Task]struct{}), } + c.dir.cgi = c contents := make(map[string]kernfs.Inode) contents["cgroup.procs"] = fs.newControllerFile(ctx, creds, &cgroupProcsData{c}) @@ -115,8 +115,7 @@ func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credential } c.dir.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555)) - c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) - c.dir.InitRefs() + c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) c.dir.IncLinks(c.dir.OrderedChildren.Populate(contents)) atomic.AddUint64(&fs.numCgroups, 1) diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go index 22c8b7fda..edc3b50b9 100644 --- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go +++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go @@ -32,7 +32,8 @@ // controllers associated with them. // // Since cgroupfs doesn't allow hardlinks, there is a unique mapping between -// cgroupfs dentries and inodes. +// cgroupfs dentries and inodes. Thus, cgroupfs inodes don't need to be ref +// counted and exist until they're unlinked once or the FS is destroyed. // // # Synchronization // @@ -48,10 +49,11 @@ // Lock order: // // kernel.CgroupRegistry.mu -// cgroupfs.filesystem.mu -// kernel.TaskSet.mu -// kernel.Task.mu -// cgroupfs.filesystem.tasksMu. +// kernfs.filesystem.mu +// kernel.TaskSet.mu +// kernel.Task.mu +// cgroupfs.filesystem.tasksMu. +// cgroupfs.dir.OrderedChildren.mu package cgroupfs import ( @@ -63,6 +65,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -108,6 +111,7 @@ type FilesystemType struct{} // +stateify savable type InternalData struct { DefaultControlValues map[string]int64 + InitialCgroupPath string } // filesystem implements vfs.FilesystemImpl and kernel.cgroupFS. @@ -134,6 +138,11 @@ type filesystem struct { numCgroups uint64 // Protected by atomic ops. root *kernfs.Dentry + // effectiveRoot is the initial cgroup new tasks are created in. Unless + // overwritten by internal mount options, root == effectiveRoot. If + // effectiveRoot != root, an extra reference is held on effectiveRoot for + // the lifetime of the filesystem. + effectiveRoot *kernfs.Dentry // tasksMu serializes task membership changes across all cgroups within a // filesystem. @@ -229,6 +238,9 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs := vfsfs.Impl().(*filesystem) ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID) fs.root.IncRef() + if fs.effectiveRoot != fs.root { + fs.effectiveRoot.IncRef() + } return vfsfs, fs.root.VFSDentry(), nil } @@ -245,8 +257,8 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt var defaults map[string]int64 if opts.InternalData != nil { - ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults) defaults = opts.InternalData.(*InternalData).DefaultControlValues + ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults) } for _, ty := range wantControllers { @@ -286,6 +298,14 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt var rootD kernfs.Dentry rootD.InitRoot(&fs.Filesystem, root) fs.root = &rootD + fs.effectiveRoot = fs.root + + if err := fs.prepareInitialCgroup(ctx, vfsObj, opts); err != nil { + ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: failed to prepare initial cgroup: %v", err) + rootD.DecRef(ctx) + fs.VFSFilesystem().DecRef(ctx) + return nil, nil, err + } // Register controllers. The registry may be modified concurrently, so if we // get an error, we raced with someone else who registered the same @@ -303,10 +323,47 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return fs.VFSFilesystem(), rootD.VFSDentry(), nil } +// prepareInitialCgroup creates the initial cgroup according to opts. An initial +// cgroup is optional, and if not specified, this function is a no-op. +func (fs *filesystem) prepareInitialCgroup(ctx context.Context, vfsObj *vfs.VirtualFilesystem, opts vfs.GetFilesystemOptions) error { + if opts.InternalData == nil { + return nil + } + initPathStr := opts.InternalData.(*InternalData).InitialCgroupPath + if initPathStr == "" { + return nil + } + ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path: %v", initPathStr) + initPath := fspath.Parse(initPathStr) + if !initPath.Absolute || !initPath.HasComponents() { + ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path invalid: %+v", initPath) + return linuxerr.EINVAL + } + + // Have initial cgroup target, create the tree. + cgDir := fs.root.Inode().(*cgroupInode) + for pit := initPath.Begin; pit.Ok(); pit = pit.Next() { + cgDirI, err := cgDir.NewDir(ctx, pit.String(), vfs.MkdirOptions{}) + if err != nil { + return err + } + cgDir = cgDirI.(*cgroupInode) + } + + // Walk to target dentry. + initDentry, err := fs.root.WalkDentryTree(ctx, vfsObj, initPath) + if err != nil { + ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup dentry not found: %v", err) + return linuxerr.ENOENT + } + fs.effectiveRoot = initDentry // Reference from WalkDentryTree transferred here. + return nil +} + func (fs *filesystem) rootCgroup() kernel.Cgroup { return kernel.Cgroup{ - Dentry: fs.root, - CgroupImpl: fs.root.Inode().(kernel.CgroupImpl), + Dentry: fs.effectiveRoot, + CgroupImpl: fs.effectiveRoot.Inode().(kernel.CgroupImpl), } } @@ -320,6 +377,10 @@ func (fs *filesystem) Release(ctx context.Context) { r.Unregister(fs.hierarchyID) } + if fs.root != fs.effectiveRoot { + fs.effectiveRoot.DecRef(ctx) + } + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } @@ -346,15 +407,18 @@ func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error // // +stateify savable type dir struct { - dirRefs + kernfs.InodeNoopRefCount kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeNotSymlink - kernfs.InodeDirectoryNoNewChildren // TODO(b/183137098): Implement mkdir. + kernfs.InodeDirectoryNoNewChildren kernfs.OrderedChildren implStatFS locks vfs.FileLocks + + fs *filesystem // Immutable. + cgi *cgroupInode // Immutable. } // Keep implements kernfs.Inode.Keep. @@ -378,9 +442,100 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry return fd.VFSFileDescription(), nil } -// DecRef implements kernfs.Inode.DecRef. -func (d *dir) DecRef(ctx context.Context) { - d.dirRefs.DecRef(func() { d.Destroy(ctx) }) +// NewDir implements kernfs.Inode.NewDir. +func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { + // "Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable." + // -- Linux, kernel/cgroup.c:cgroup_mkdir(). + if strings.Contains(name, "\n") { + return nil, linuxerr.EINVAL + } + return d.OrderedChildren.Inserter(name, func() kernfs.Inode { + d.IncLinks(1) + return d.fs.newCgroupInode(ctx, auth.CredentialsFromContext(ctx)) + }) +} + +// Rename implements kernfs.Inode.Rename. Cgroupfs only allows renaming of +// cgroup directories, and the rename may only change the name within the same +// parent. See linux, kernel/cgroup.c:cgroup_rename(). +func (d *dir) Rename(ctx context.Context, oldname, newname string, child, dst kernfs.Inode) error { + if _, ok := child.(*cgroupInode); !ok { + // Not a cgroup directory. Control files are backed by different types. + return linuxerr.ENOTDIR + } + + dstCGInode, ok := dst.(*cgroupInode) + if !ok { + // Not a cgroup inode, so definitely can't be *this* inode. + return linuxerr.EIO + } + // Note: We're intentionally comparing addresses, since two different dirs + // could plausibly be identical in memory, but would occupy different + // locations in memory. + if d != &dstCGInode.dir { + // Destination dir is a different cgroup inode. Cross directory renames + // aren't allowed. + return linuxerr.EIO + } + + // Rename moves oldname to newname within d. Proceed. + return d.OrderedChildren.Rename(ctx, oldname, newname, child, dst) +} + +// Unlink implements kernfs.Inode.Unlink. Cgroupfs disallows unlink, as the only +// files in the filesystem are control files, which can't be deleted. +func (d *dir) Unlink(ctx context.Context, name string, child kernfs.Inode) error { + return linuxerr.EPERM +} + +// hasChildrenLocked returns whether the cgroup dir contains any objects that +// prevent it from being deleted. +func (d *dir) hasChildrenLocked() bool { + // Subdirs take a link on the parent, so checks if there are any direct + // children cgroups. Exclude the dir's self link and the link from ".". + if d.InodeAttrs.Links()-2 > 0 { + return true + } + return len(d.cgi.ts) > 0 +} + +// HasChildren implements kernfs.Inode.HasChildren. +// +// The empty check for a cgroupfs directory is unlike a regular directory since +// a cgroupfs directory will always have control files. A cgroupfs directory can +// be deleted if cgroup contains no tasks and has no sub-cgroups. +func (d *dir) HasChildren() bool { + d.fs.tasksMu.RLock() + defer d.fs.tasksMu.RUnlock() + return d.hasChildrenLocked() +} + +// RmDir implements kernfs.Inode.RmDir. +func (d *dir) RmDir(ctx context.Context, name string, child kernfs.Inode) error { + // Unlike a normal directory, we need to recheck if d is empty again, since + // vfs/kernfs can't stop tasks from entering or leaving the cgroup. + d.fs.tasksMu.RLock() + defer d.fs.tasksMu.RUnlock() + + cgi, ok := child.(*cgroupInode) + if !ok { + return linuxerr.ENOTDIR + } + if cgi.dir.hasChildrenLocked() { + return linuxerr.ENOTEMPTY + } + + // Disallow deletion of the effective root cgroup. + if cgi == d.fs.effectiveRoot.Inode().(*cgroupInode) { + ctx.Warningf("Cannot delete initial cgroup for new tasks %q", d.fs.effectiveRoot.FSLocalPath()) + return linuxerr.EBUSY + } + + err := d.OrderedChildren.RmDir(ctx, name, child) + if err == nil { + d.InodeAttrs.DecLinks() + } + return err } // controllerFile represents a generic control file that appears within a cgroup diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs_state_autogen.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs_state_autogen.go index 3142ab6f8..aa40bb193 100644 --- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs_state_autogen.go +++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs_state_autogen.go @@ -41,7 +41,6 @@ func (c *cgroupInode) StateTypeName() string { func (c *cgroupInode) StateFields() []string { return []string{ "dir", - "fs", "ts", } } @@ -52,8 +51,7 @@ func (c *cgroupInode) beforeSave() {} func (c *cgroupInode) StateSave(stateSinkObject state.Sink) { c.beforeSave() stateSinkObject.Save(0, &c.dir) - stateSinkObject.Save(1, &c.fs) - stateSinkObject.Save(2, &c.ts) + stateSinkObject.Save(1, &c.ts) } func (c *cgroupInode) afterLoad() {} @@ -61,8 +59,7 @@ func (c *cgroupInode) afterLoad() {} // +checklocksignore func (c *cgroupInode) StateLoad(stateSourceObject state.Source) { stateSourceObject.Load(0, &c.dir) - stateSourceObject.Load(1, &c.fs) - stateSourceObject.Load(2, &c.ts) + stateSourceObject.Load(1, &c.ts) } func (d *cgroupProcsData) StateTypeName() string { @@ -143,6 +140,7 @@ func (i *InternalData) StateTypeName() string { func (i *InternalData) StateFields() []string { return []string{ "DefaultControlValues", + "InitialCgroupPath", } } @@ -152,6 +150,7 @@ func (i *InternalData) beforeSave() {} func (i *InternalData) StateSave(stateSinkObject state.Sink) { i.beforeSave() stateSinkObject.Save(0, &i.DefaultControlValues) + stateSinkObject.Save(1, &i.InitialCgroupPath) } func (i *InternalData) afterLoad() {} @@ -159,6 +158,7 @@ func (i *InternalData) afterLoad() {} // +checklocksignore func (i *InternalData) StateLoad(stateSourceObject state.Source) { stateSourceObject.Load(0, &i.DefaultControlValues) + stateSourceObject.Load(1, &i.InitialCgroupPath) } func (fs *filesystem) StateTypeName() string { @@ -174,6 +174,7 @@ func (fs *filesystem) StateFields() []string { "kcontrollers", "numCgroups", "root", + "effectiveRoot", } } @@ -189,6 +190,7 @@ func (fs *filesystem) StateSave(stateSinkObject state.Sink) { stateSinkObject.Save(4, &fs.kcontrollers) stateSinkObject.Save(5, &fs.numCgroups) stateSinkObject.Save(6, &fs.root) + stateSinkObject.Save(7, &fs.effectiveRoot) } func (fs *filesystem) afterLoad() {} @@ -202,6 +204,7 @@ func (fs *filesystem) StateLoad(stateSourceObject state.Source) { stateSourceObject.Load(4, &fs.kcontrollers) stateSourceObject.Load(5, &fs.numCgroups) stateSourceObject.Load(6, &fs.root) + stateSourceObject.Load(7, &fs.effectiveRoot) } func (i *implStatFS) StateTypeName() string { @@ -231,7 +234,7 @@ func (d *dir) StateTypeName() string { func (d *dir) StateFields() []string { return []string{ - "dirRefs", + "InodeNoopRefCount", "InodeAlwaysValid", "InodeAttrs", "InodeNotSymlink", @@ -239,6 +242,8 @@ func (d *dir) StateFields() []string { "OrderedChildren", "implStatFS", "locks", + "fs", + "cgi", } } @@ -247,7 +252,7 @@ func (d *dir) beforeSave() {} // +checklocksignore func (d *dir) StateSave(stateSinkObject state.Sink) { d.beforeSave() - stateSinkObject.Save(0, &d.dirRefs) + stateSinkObject.Save(0, &d.InodeNoopRefCount) stateSinkObject.Save(1, &d.InodeAlwaysValid) stateSinkObject.Save(2, &d.InodeAttrs) stateSinkObject.Save(3, &d.InodeNotSymlink) @@ -255,13 +260,15 @@ func (d *dir) StateSave(stateSinkObject state.Sink) { stateSinkObject.Save(5, &d.OrderedChildren) stateSinkObject.Save(6, &d.implStatFS) stateSinkObject.Save(7, &d.locks) + stateSinkObject.Save(8, &d.fs) + stateSinkObject.Save(9, &d.cgi) } func (d *dir) afterLoad() {} // +checklocksignore func (d *dir) StateLoad(stateSourceObject state.Source) { - stateSourceObject.Load(0, &d.dirRefs) + stateSourceObject.Load(0, &d.InodeNoopRefCount) stateSourceObject.Load(1, &d.InodeAlwaysValid) stateSourceObject.Load(2, &d.InodeAttrs) stateSourceObject.Load(3, &d.InodeNotSymlink) @@ -269,6 +276,8 @@ func (d *dir) StateLoad(stateSourceObject state.Source) { stateSourceObject.Load(5, &d.OrderedChildren) stateSourceObject.Load(6, &d.implStatFS) stateSourceObject.Load(7, &d.locks) + stateSourceObject.Load(8, &d.fs) + stateSourceObject.Load(9, &d.cgi) } func (c *controllerFile) StateTypeName() string { |