// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package cgroupfs implements cgroupfs. // // A cgroup is a collection of tasks on the system, organized into a tree-like // structure similar to a filesystem directory tree. In fact, each cgroup is // represented by a directory on cgroupfs, and is manipulated through control // files in the directory. // // All cgroups on a system are organized into hierarchies. Hierarchies are a // distinct tree of cgroups, with a common set of controllers. One or more // cgroupfs mounts may point to each hierarchy. These mounts provide a common // view into the same tree of cgroups. // // A controller (also known as a "resource controller", or a cgroup "subsystem") // determines the behaviour of each cgroup. // // In addition to cgroupfs, the kernel has a cgroup registry that tracks // system-wide state related to cgroups such as active hierarchies and the // controllers associated with them. // // Since cgroupfs doesn't allow hardlinks, there is a unique mapping between // cgroupfs dentries and inodes. Thus, cgroupfs inodes don't need to be ref // counted and exist until they're unlinked once or the FS is destroyed. // // # Synchronization // // Cgroup hierarchy creation and destruction is protected by the // kernel.CgroupRegistry.mu. Once created, a hierarchy's set of controllers, the // filesystem associated with it, and the root cgroup for the hierarchy are // immutable. // // Membership of tasks within cgroups is protected by // cgroupfs.filesystem.tasksMu. Tasks also maintain a set of all cgroups they're // in, and this list is protected by Task.mu. // // Lock order: // // kernel.CgroupRegistry.mu // kernfs.filesystem.mu // kernel.TaskSet.mu // kernel.Task.mu // cgroupfs.filesystem.tasksMu. // cgroupfs.dir.OrderedChildren.mu package cgroupfs import ( "fmt" "sort" "strconv" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) const ( // Name is the default filesystem name. Name = "cgroup" readonlyFileMode = linux.FileMode(0444) writableFileMode = linux.FileMode(0644) defaultMaxCachedDentries = uint64(1000) ) const ( controllerCPU = kernel.CgroupControllerType("cpu") controllerCPUAcct = kernel.CgroupControllerType("cpuacct") controllerCPUSet = kernel.CgroupControllerType("cpuset") controllerJob = kernel.CgroupControllerType("job") controllerMemory = kernel.CgroupControllerType("memory") ) var allControllers = []kernel.CgroupControllerType{ controllerCPU, controllerCPUAcct, controllerCPUSet, controllerJob, controllerMemory, } // SupportedMountOptions is the set of supported mount options for cgroupfs. var SupportedMountOptions = []string{"all", "cpu", "cpuacct", "cpuset", "job", "memory"} // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct{} // InternalData contains internal data passed in to the cgroupfs mount via // vfs.GetFilesystemOptions.InternalData. // // +stateify savable type InternalData struct { DefaultControlValues map[string]int64 InitialCgroupPath string } // filesystem implements vfs.FilesystemImpl and kernel.cgroupFS. // // +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 // hierarchyID is the id the cgroup registry assigns to this hierarchy. Has // the value kernel.InvalidCgroupHierarchyID until the FS is fully // initialized. // // hierarchyID is immutable after initialization. hierarchyID uint32 // controllers and kcontrollers are both the list of controllers attached to // this cgroupfs. Both lists are the same set of controllers, but typecast // to different interfaces for convenience. Both must stay in sync, and are // immutable. controllers []controller kcontrollers []kernel.CgroupController numCgroups uint64 // Protected by atomic ops. root *kernfs.Dentry // effectiveRoot is the initial cgroup new tasks are created in. Unless // overwritten by internal mount options, root == effectiveRoot. If // effectiveRoot != root, an extra reference is held on effectiveRoot for // the lifetime of the filesystem. effectiveRoot *kernfs.Dentry // tasksMu serializes task membership changes across all cgroups within a // filesystem. tasksMu sync.RWMutex `state:"nosave"` } // InitializeHierarchyID implements kernel.cgroupFS.InitializeHierarchyID. func (fs *filesystem) InitializeHierarchyID(hid uint32) { fs.hierarchyID = hid } // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } // Release implements vfs.FilesystemType.Release. func (FilesystemType) Release(ctx context.Context) {} // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } mopts := vfs.GenericParseMountOptions(opts.Data) maxCachedDentries := defaultMaxCachedDentries if str, ok := mopts["dentry_cache_limit"]; ok { delete(mopts, "dentry_cache_limit") maxCachedDentries, err = strconv.ParseUint(str, 10, 64) if err != nil { ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) return nil, nil, linuxerr.EINVAL } } var wantControllers []kernel.CgroupControllerType if _, ok := mopts["cpu"]; ok { delete(mopts, "cpu") wantControllers = append(wantControllers, controllerCPU) } if _, ok := mopts["cpuacct"]; ok { delete(mopts, "cpuacct") wantControllers = append(wantControllers, controllerCPUAcct) } if _, ok := mopts["cpuset"]; ok { delete(mopts, "cpuset") wantControllers = append(wantControllers, controllerCPUSet) } if _, ok := mopts["job"]; ok { delete(mopts, "job") wantControllers = append(wantControllers, controllerJob) } if _, ok := mopts["memory"]; ok { delete(mopts, "memory") wantControllers = append(wantControllers, controllerMemory) } if _, ok := mopts["all"]; ok { if len(wantControllers) > 0 { ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: other controllers specified with all: %v", wantControllers) return nil, nil, linuxerr.EINVAL } delete(mopts, "all") wantControllers = allControllers } if len(wantControllers) == 0 { // Specifying no controllers implies all controllers. wantControllers = allControllers } if len(mopts) != 0 { ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) return nil, nil, linuxerr.EINVAL } k := kernel.KernelFromContext(ctx) r := k.CgroupRegistry() // "It is not possible to mount the same controller against multiple // cgroup hierarchies. For example, it is not possible to mount both // the cpu and cpuacct controllers against one hierarchy, and to mount // the cpu controller alone against another hierarchy." - man cgroups(7) // // Is there a hierarchy available with all the controllers we want? If so, // this mount is a view into the same hierarchy. // // Note: we're guaranteed to have at least one requested controller, since // no explicit controller name implies all controllers. if vfsfs := r.FindHierarchy(wantControllers); vfsfs != nil { fs := vfsfs.Impl().(*filesystem) ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID) fs.root.IncRef() if fs.effectiveRoot != fs.root { fs.effectiveRoot.IncRef() } return vfsfs, fs.root.VFSDentry(), nil } // No existing hierarchy with the exactly controllers found. Make a new // one. Note that it's possible this mount creation is unsatisfiable, if one // or more of the requested controllers are already on existing // hierarchies. We'll find out about such collisions when we try to register // the new hierarchy later. fs := &filesystem{ devMinor: devMinor, } fs.MaxCachedDentries = maxCachedDentries fs.VFSFilesystem().Init(vfsObj, &fsType, fs) var defaults map[string]int64 if opts.InternalData != nil { defaults = opts.InternalData.(*InternalData).DefaultControlValues ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults) } for _, ty := range wantControllers { var c controller switch ty { case controllerCPU: c = newCPUController(fs, defaults) case controllerCPUAcct: c = newCPUAcctController(fs) case controllerCPUSet: c = newCPUSetController(k, fs) case controllerJob: c = newJobController(fs) case controllerMemory: c = newMemoryController(fs, defaults) default: panic(fmt.Sprintf("Unreachable: unknown cgroup controller %q", ty)) } fs.controllers = append(fs.controllers, c) } if len(defaults) != 0 { // Internal data is always provided at sentry startup and unused values // indicate a problem with the sandbox config. Fail fast. panic(fmt.Sprintf("cgroupfs.FilesystemType.GetFilesystem: unknown internal mount data: %v", defaults)) } // Controllers usually appear in alphabetical order when displayed. Sort it // here now, so it never needs to be sorted elsewhere. sort.Slice(fs.controllers, func(i, j int) bool { return fs.controllers[i].Type() < fs.controllers[j].Type() }) fs.kcontrollers = make([]kernel.CgroupController, 0, len(fs.controllers)) for _, c := range fs.controllers { fs.kcontrollers = append(fs.kcontrollers, c) } root := fs.newCgroupInode(ctx, creds) var rootD kernfs.Dentry rootD.InitRoot(&fs.Filesystem, root) fs.root = &rootD fs.effectiveRoot = fs.root if err := fs.prepareInitialCgroup(ctx, vfsObj, opts); err != nil { ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: failed to prepare initial cgroup: %v", err) rootD.DecRef(ctx) fs.VFSFilesystem().DecRef(ctx) return nil, nil, err } // Register controllers. The registry may be modified concurrently, so if we // get an error, we raced with someone else who registered the same // controllers first. if err := r.Register(fs.kcontrollers, fs); err != nil { ctx.Infof("cgroupfs.FilesystemType.GetFilesystem: failed to register new hierarchy with controllers %v: %v", wantControllers, err) rootD.DecRef(ctx) fs.VFSFilesystem().DecRef(ctx) return nil, nil, linuxerr.EBUSY } // Move all existing tasks to the root of the new hierarchy. k.PopulateNewCgroupHierarchy(fs.rootCgroup()) return fs.VFSFilesystem(), rootD.VFSDentry(), nil } // prepareInitialCgroup creates the initial cgroup according to opts. An initial // cgroup is optional, and if not specified, this function is a no-op. func (fs *filesystem) prepareInitialCgroup(ctx context.Context, vfsObj *vfs.VirtualFilesystem, opts vfs.GetFilesystemOptions) error { if opts.InternalData == nil { return nil } initPathStr := opts.InternalData.(*InternalData).InitialCgroupPath if initPathStr == "" { return nil } ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path: %v", initPathStr) initPath := fspath.Parse(initPathStr) if !initPath.Absolute || !initPath.HasComponents() { ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path invalid: %+v", initPath) return linuxerr.EINVAL } // Have initial cgroup target, create the tree. cgDir := fs.root.Inode().(*cgroupInode) for pit := initPath.Begin; pit.Ok(); pit = pit.Next() { cgDirI, err := cgDir.NewDir(ctx, pit.String(), vfs.MkdirOptions{}) if err != nil { return err } cgDir = cgDirI.(*cgroupInode) } // Walk to target dentry. initDentry, err := fs.root.WalkDentryTree(ctx, vfsObj, initPath) if err != nil { ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup dentry not found: %v", err) return linuxerr.ENOENT } fs.effectiveRoot = initDentry // Reference from WalkDentryTree transferred here. return nil } func (fs *filesystem) rootCgroup() kernel.Cgroup { return kernel.Cgroup{ Dentry: fs.effectiveRoot, CgroupImpl: fs.effectiveRoot.Inode().(kernel.CgroupImpl), } } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { k := kernel.KernelFromContext(ctx) r := k.CgroupRegistry() if fs.hierarchyID != kernel.InvalidCgroupHierarchyID { k.ReleaseCgroupHierarchy(fs.hierarchyID) r.Unregister(fs.hierarchyID) } if fs.root != fs.effectiveRoot { fs.effectiveRoot.DecRef(ctx) } fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { var cnames []string for _, c := range fs.controllers { cnames = append(cnames, string(c.Type())) } return strings.Join(cnames, ",") } // +stateify savable type implStatFS struct{} // StatFS implements kernfs.Inode.StatFS. func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.CGROUP_SUPER_MAGIC), nil } // dir implements kernfs.Inode for a generic cgroup resource controller // directory. Specific controllers extend this to add their own functionality. // // +stateify savable type dir struct { kernfs.InodeNoopRefCount kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.OrderedChildren implStatFS locks vfs.FileLocks fs *filesystem // Immutable. cgi *cgroupInode // Immutable. } // Keep implements kernfs.Inode.Keep. func (*dir) Keep() bool { return true } // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // Open implements kernfs.Inode.Open. func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndStaticEntries, }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } // NewDir implements kernfs.Inode.NewDir. func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { // "Do not accept '\n' to prevent making /proc//cgroup unparsable." // -- Linux, kernel/cgroup.c:cgroup_mkdir(). if strings.Contains(name, "\n") { return nil, linuxerr.EINVAL } return d.OrderedChildren.Inserter(name, func() kernfs.Inode { d.IncLinks(1) return d.fs.newCgroupInode(ctx, auth.CredentialsFromContext(ctx)) }) } // Rename implements kernfs.Inode.Rename. Cgroupfs only allows renaming of // cgroup directories, and the rename may only change the name within the same // parent. See linux, kernel/cgroup.c:cgroup_rename(). func (d *dir) Rename(ctx context.Context, oldname, newname string, child, dst kernfs.Inode) error { if _, ok := child.(*cgroupInode); !ok { // Not a cgroup directory. Control files are backed by different types. return linuxerr.ENOTDIR } dstCGInode, ok := dst.(*cgroupInode) if !ok { // Not a cgroup inode, so definitely can't be *this* inode. return linuxerr.EIO } // Note: We're intentionally comparing addresses, since two different dirs // could plausibly be identical in memory, but would occupy different // locations in memory. if d != &dstCGInode.dir { // Destination dir is a different cgroup inode. Cross directory renames // aren't allowed. return linuxerr.EIO } // Rename moves oldname to newname within d. Proceed. return d.OrderedChildren.Rename(ctx, oldname, newname, child, dst) } // Unlink implements kernfs.Inode.Unlink. Cgroupfs disallows unlink, as the only // files in the filesystem are control files, which can't be deleted. func (d *dir) Unlink(ctx context.Context, name string, child kernfs.Inode) error { return linuxerr.EPERM } // hasChildrenLocked returns whether the cgroup dir contains any objects that // prevent it from being deleted. func (d *dir) hasChildrenLocked() bool { // Subdirs take a link on the parent, so checks if there are any direct // children cgroups. Exclude the dir's self link and the link from ".". if d.InodeAttrs.Links()-2 > 0 { return true } return len(d.cgi.ts) > 0 } // HasChildren implements kernfs.Inode.HasChildren. // // The empty check for a cgroupfs directory is unlike a regular directory since // a cgroupfs directory will always have control files. A cgroupfs directory can // be deleted if cgroup contains no tasks and has no sub-cgroups. func (d *dir) HasChildren() bool { d.fs.tasksMu.RLock() defer d.fs.tasksMu.RUnlock() return d.hasChildrenLocked() } // RmDir implements kernfs.Inode.RmDir. func (d *dir) RmDir(ctx context.Context, name string, child kernfs.Inode) error { // Unlike a normal directory, we need to recheck if d is empty again, since // vfs/kernfs can't stop tasks from entering or leaving the cgroup. d.fs.tasksMu.RLock() defer d.fs.tasksMu.RUnlock() cgi, ok := child.(*cgroupInode) if !ok { return linuxerr.ENOTDIR } if cgi.dir.hasChildrenLocked() { return linuxerr.ENOTEMPTY } // Disallow deletion of the effective root cgroup. if cgi == d.fs.effectiveRoot.Inode().(*cgroupInode) { ctx.Warningf("Cannot delete initial cgroup for new tasks %q", d.fs.effectiveRoot.FSLocalPath()) return linuxerr.EBUSY } err := d.OrderedChildren.RmDir(ctx, name, child) if err == nil { d.InodeAttrs.DecLinks() } return err } // controllerFile represents a generic control file that appears within a cgroup // directory. // // +stateify savable type controllerFile struct { kernfs.DynamicBytesFile } func (fs *filesystem) newControllerFile(ctx context.Context, creds *auth.Credentials, data vfs.DynamicBytesSource) kernfs.Inode { f := &controllerFile{} f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, readonlyFileMode) return f } func (fs *filesystem) newControllerWritableFile(ctx context.Context, creds *auth.Credentials, data vfs.WritableDynamicBytesSource) kernfs.Inode { f := &controllerFile{} f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, writableFileMode) return f } // staticControllerFile represents a generic control file that appears within a // cgroup directory which always returns the same data when read. // staticControllerFiles are not writable. // // +stateify savable type staticControllerFile struct { kernfs.DynamicBytesFile vfs.StaticData } // Note: We let the caller provide the mode so that static files may be used to // fake both readable and writable control files. However, static files are // effectively readonly, as attempting to write to them will return EIO // regardless of the mode. func (fs *filesystem) newStaticControllerFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, data string) kernfs.Inode { f := &staticControllerFile{StaticData: vfs.StaticData{Data: data}} f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), f, mode) return f }