Initial cgroupfs support for subcontainers

Allow creation and management of subcontainers through cgroupfs directory syscalls. Also add a mechanism to specify a default root container to start new jobs in.

This implements the filesystem support for subcontainers, but doesn't implement hierarchical resource accounting or task migration.

PiperOrigin-RevId: 390254870
author: Rahat Mahmood <rahat@google.com> 2021-08-11 17:18:53 -0700
committer: gVisor bot <gvisor-bot@google.com> 2021-08-11 17:21:37 -0700
commit: a50596874a4971167f97a05181363e91292a2885 (patch)
tree: fea65128126ec05e6fe0b96b4a74369bf6aa208b
parent: 09b453cec07bceeb4185bc9bc951efbda366472b (diff)
12 files changed, 380 insertions, 30 deletions
diff --git a/pkg/sentry/fsimpl/cgroupfs/BUILD b/pkg/sentry/fsimpl/cgroupfs/BUILD
index 4c9c5b344..e5fdcc776 100644
--- a/pkg/sentry/fsimpl/cgroupfs/BUILD
+++ b/pkg/sentry/fsimpl/cgroupfs/BUILD
@@ -32,6 +32,7 @@ go_library(
         "//pkg/context",
         "//pkg/coverage",
         "//pkg/errors/linuxerr",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/refs",
         "//pkg/refsvfs2",
@@ -43,7 +44,6 @@ go_library(
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
         "//pkg/sync",
-        "//pkg/syserror",
         "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fsimpl/cgroupfs/base.go b/pkg/sentry/fsimpl/cgroupfs/base.go
index 4290ffe0d..71bb0a9c8 100644
--- a/pkg/sentry/fsimpl/cgroupfs/base.go
+++ b/pkg/sentry/fsimpl/cgroupfs/base.go
@@ -88,7 +88,6 @@ type controller interface {
 // +stateify savable
 type cgroupInode struct {
 	dir
-	fs *filesystem
 
 	// ts is the list of tasks in this cgroup. The kernel is responsible for
 	// removing tasks from this list before they're destroyed, so any tasks on
@@ -102,9 +101,10 @@ var _ kernel.CgroupImpl = (*cgroupInode)(nil)
 
 func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
 	c := &cgroupInode{
-		fs: fs,
-		ts: make(map[*kernel.Task]struct{}),
+		dir: dir{fs: fs},
+		ts:  make(map[*kernel.Task]struct{}),
 	}
+	c.dir.cgi = c
 
 	contents := make(map[string]kernfs.Inode)
 	contents["cgroup.procs"] = fs.newControllerFile(ctx, creds, &cgroupProcsData{c})
@@ -115,8 +115,7 @@ func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credential
 	}
 
 	c.dir.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555))
-	c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	c.dir.InitRefs()
+	c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
 	c.dir.IncLinks(c.dir.OrderedChildren.Populate(contents))
 
 	atomic.AddUint64(&fs.numCgroups, 1)
diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
index 22c8b7fda..edc3b50b9 100644
--- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
+++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
@@ -32,7 +32,8 @@
 // controllers associated with them.
 //
 // Since cgroupfs doesn't allow hardlinks, there is a unique mapping between
-// cgroupfs dentries and inodes.
+// cgroupfs dentries and inodes. Thus, cgroupfs inodes don't need to be ref
+// counted and exist until they're unlinked once or the FS is destroyed.
 //
 // # Synchronization
 //
@@ -48,10 +49,11 @@
 // Lock order:
 //
 // kernel.CgroupRegistry.mu
-//   cgroupfs.filesystem.mu
-//     kernel.TaskSet.mu
-//       kernel.Task.mu
-//         cgroupfs.filesystem.tasksMu.
+//   kernfs.filesystem.mu
+//   kernel.TaskSet.mu
+//     kernel.Task.mu
+//       cgroupfs.filesystem.tasksMu.
+//         cgroupfs.dir.OrderedChildren.mu
 package cgroupfs
 
 import (
@@ -63,6 +65,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/errors/linuxerr"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -108,6 +111,7 @@ type FilesystemType struct{}
 // +stateify savable
 type InternalData struct {
 	DefaultControlValues map[string]int64
+	InitialCgroupPath    string
 }
 
 // filesystem implements vfs.FilesystemImpl and kernel.cgroupFS.
@@ -134,6 +138,11 @@ type filesystem struct {
 	numCgroups uint64 // Protected by atomic ops.
 
 	root *kernfs.Dentry
+	// effectiveRoot is the initial cgroup new tasks are created in. Unless
+	// overwritten by internal mount options, root == effectiveRoot. If
+	// effectiveRoot != root, an extra reference is held on effectiveRoot for
+	// the lifetime of the filesystem.
+	effectiveRoot *kernfs.Dentry
 
 	// tasksMu serializes task membership changes across all cgroups within a
 	// filesystem.
@@ -229,6 +238,9 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		fs := vfsfs.Impl().(*filesystem)
 		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID)
 		fs.root.IncRef()
+		if fs.effectiveRoot != fs.root {
+			fs.effectiveRoot.IncRef()
+		}
 		return vfsfs, fs.root.VFSDentry(), nil
 	}
 
@@ -245,8 +257,8 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 
 	var defaults map[string]int64
 	if opts.InternalData != nil {
-		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults)
 		defaults = opts.InternalData.(*InternalData).DefaultControlValues
+		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults)
 	}
 
 	for _, ty := range wantControllers {
@@ -286,6 +298,14 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	var rootD kernfs.Dentry
 	rootD.InitRoot(&fs.Filesystem, root)
 	fs.root = &rootD
+	fs.effectiveRoot = fs.root
+
+	if err := fs.prepareInitialCgroup(ctx, vfsObj, opts); err != nil {
+		ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: failed to prepare initial cgroup: %v", err)
+		rootD.DecRef(ctx)
+		fs.VFSFilesystem().DecRef(ctx)
+		return nil, nil, err
+	}
 
 	// Register controllers. The registry may be modified concurrently, so if we
 	// get an error, we raced with someone else who registered the same
@@ -303,10 +323,47 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	return fs.VFSFilesystem(), rootD.VFSDentry(), nil
 }
 
+// prepareInitialCgroup creates the initial cgroup according to opts. An initial
+// cgroup is optional, and if not specified, this function is a no-op.
+func (fs *filesystem) prepareInitialCgroup(ctx context.Context, vfsObj *vfs.VirtualFilesystem, opts vfs.GetFilesystemOptions) error {
+	if opts.InternalData == nil {
+		return nil
+	}
+	initPathStr := opts.InternalData.(*InternalData).InitialCgroupPath
+	if initPathStr == "" {
+		return nil
+	}
+	ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path: %v", initPathStr)
+	initPath := fspath.Parse(initPathStr)
+	if !initPath.Absolute || !initPath.HasComponents() {
+		ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup path invalid: %+v", initPath)
+		return linuxerr.EINVAL
+	}
+
+	// Have initial cgroup target, create the tree.
+	cgDir := fs.root.Inode().(*cgroupInode)
+	for pit := initPath.Begin; pit.Ok(); pit = pit.Next() {
+		cgDirI, err := cgDir.NewDir(ctx, pit.String(), vfs.MkdirOptions{})
+		if err != nil {
+			return err
+		}
+		cgDir = cgDirI.(*cgroupInode)
+	}
+
+	// Walk to target dentry.
+	initDentry, err := fs.root.WalkDentryTree(ctx, vfsObj, initPath)
+	if err != nil {
+		ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: initial cgroup dentry not found: %v", err)
+		return linuxerr.ENOENT
+	}
+	fs.effectiveRoot = initDentry // Reference from WalkDentryTree transferred here.
+	return nil
+}
+
 func (fs *filesystem) rootCgroup() kernel.Cgroup {
 	return kernel.Cgroup{
-		Dentry:     fs.root,
-		CgroupImpl: fs.root.Inode().(kernel.CgroupImpl),
+		Dentry:     fs.effectiveRoot,
+		CgroupImpl: fs.effectiveRoot.Inode().(kernel.CgroupImpl),
 	}
 }
 
@@ -320,6 +377,10 @@ func (fs *filesystem) Release(ctx context.Context) {
 		r.Unregister(fs.hierarchyID)
 	}
 
+	if fs.root != fs.effectiveRoot {
+		fs.effectiveRoot.DecRef(ctx)
+	}
+
 	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
 	fs.Filesystem.Release(ctx)
 }
@@ -346,15 +407,18 @@ func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error
 //
 // +stateify savable
 type dir struct {
-	dirRefs
+	kernfs.InodeNoopRefCount
 	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
 	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren // TODO(b/183137098): Implement mkdir.
+	kernfs.InodeDirectoryNoNewChildren
 	kernfs.OrderedChildren
 	implStatFS
 
 	locks vfs.FileLocks
+
+	fs  *filesystem  // Immutable.
+	cgi *cgroupInode // Immutable.
 }
 
 // Keep implements kernfs.Inode.Keep.
@@ -378,9 +442,100 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry
 	return fd.VFSFileDescription(), nil
 }
 
-// DecRef implements kernfs.Inode.DecRef.
-func (d *dir) DecRef(ctx context.Context) {
-	d.dirRefs.DecRef(func() { d.Destroy(ctx) })
+// NewDir implements kernfs.Inode.NewDir.
+func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
+	// "Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable."
+	//   -- Linux, kernel/cgroup.c:cgroup_mkdir().
+	if strings.Contains(name, "\n") {
+		return nil, linuxerr.EINVAL
+	}
+	return d.OrderedChildren.Inserter(name, func() kernfs.Inode {
+		d.IncLinks(1)
+		return d.fs.newCgroupInode(ctx, auth.CredentialsFromContext(ctx))
+	})
+}
+
+// Rename implements kernfs.Inode.Rename. Cgroupfs only allows renaming of
+// cgroup directories, and the rename may only change the name within the same
+// parent. See linux, kernel/cgroup.c:cgroup_rename().
+func (d *dir) Rename(ctx context.Context, oldname, newname string, child, dst kernfs.Inode) error {
+	if _, ok := child.(*cgroupInode); !ok {
+		// Not a cgroup directory. Control files are backed by different types.
+		return linuxerr.ENOTDIR
+	}
+
+	dstCGInode, ok := dst.(*cgroupInode)
+	if !ok {
+		// Not a cgroup inode, so definitely can't be *this* inode.
+		return linuxerr.EIO
+	}
+	// Note: We're intentionally comparing addresses, since two different dirs
+	// could plausibly have identical contents, but would still occupy
+	// different locations in memory.
+	if d != &dstCGInode.dir {
+		// Destination dir is a different cgroup inode. Cross directory renames
+		// aren't allowed.
+		return linuxerr.EIO
+	}
+
+	// Rename moves oldname to newname within d. Proceed.
+	return d.OrderedChildren.Rename(ctx, oldname, newname, child, dst)
+}
+
+// Unlink implements kernfs.Inode.Unlink. Cgroupfs disallows unlink, as the only
+// files in the filesystem are control files, which can't be deleted.
+func (d *dir) Unlink(ctx context.Context, name string, child kernfs.Inode) error {
+	return linuxerr.EPERM
+}
+
+// hasChildrenLocked returns whether the cgroup dir contains any objects that
+// prevent it from being deleted.
+func (d *dir) hasChildrenLocked() bool {
+	// Subdirs take a link on the parent, so check whether there are any direct
+	// child cgroups. Exclude the dir's self link and the link from ".".
+	if d.InodeAttrs.Links()-2 > 0 {
+		return true
+	}
+	return len(d.cgi.ts) > 0
+}
+
+// HasChildren implements kernfs.Inode.HasChildren.
+//
+// The empty check for a cgroupfs directory is unlike a regular directory since
+// a cgroupfs directory will always have control files. A cgroupfs directory can
+// be deleted if the cgroup contains no tasks and has no sub-cgroups.
+func (d *dir) HasChildren() bool {
+	d.fs.tasksMu.RLock()
+	defer d.fs.tasksMu.RUnlock()
+	return d.hasChildrenLocked()
+}
+
+// RmDir implements kernfs.Inode.RmDir.
+func (d *dir) RmDir(ctx context.Context, name string, child kernfs.Inode) error {
+	// Unlike a normal directory, we need to recheck that child is empty here,
+	// since vfs/kernfs can't stop tasks from entering or leaving the cgroup.
+	d.fs.tasksMu.RLock()
+	defer d.fs.tasksMu.RUnlock()
+
+	cgi, ok := child.(*cgroupInode)
+	if !ok {
+		return linuxerr.ENOTDIR
+	}
+	if cgi.dir.hasChildrenLocked() {
+		return linuxerr.ENOTEMPTY
+	}
+
+	// Disallow deletion of the effective root cgroup.
+	if cgi == d.fs.effectiveRoot.Inode().(*cgroupInode) {
+		ctx.Warningf("Cannot delete initial cgroup for new tasks %q", d.fs.effectiveRoot.FSLocalPath())
+		return linuxerr.EBUSY
+	}
+
+	err := d.OrderedChildren.RmDir(ctx, name, child)
+	if err == nil {
+		d.InodeAttrs.DecLinks()
+	}
+	return err
 }
 
 // controllerFile represents a generic control file that appears within a cgroup
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index d53937db6..f676ff7e9 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -137,6 +137,7 @@ go_test(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/errors/linuxerr",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/refs",
         "//pkg/refsvfs2",
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index a42fc79b4..b96dc9ef7 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -26,7 +26,6 @@ import (
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // InodeNoopRefCount partially implements the Inode interface, specifically the
@@ -234,6 +233,11 @@ func (a *InodeAttrs) Mode() linux.FileMode {
 	return linux.FileMode(atomic.LoadUint32(&a.mode))
 }
 
+// Links returns the link count.
+func (a *InodeAttrs) Links() uint32 {
+	return atomic.LoadUint32(&a.nlink)
+}
+
 // TouchAtime updates a.atime to the current time.
 func (a *InodeAttrs) TouchAtime(ctx context.Context, mnt *vfs.Mount) {
 	if mnt.Flags.NoATime || mnt.ReadOnly() {
@@ -289,7 +293,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
 		return linuxerr.EPERM
 	}
 	if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() {
-		return syserror.EISDIR
+		return linuxerr.EISDIR
 	}
 	if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
 		return err
@@ -475,7 +479,7 @@ func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error
 
 	s, ok := o.set[name]
 	if !ok {
-		return nil, syserror.ENOENT
+		return nil, linuxerr.ENOENT
 	}
 
 	s.inode.IncRef() // This ref is passed to the dentry upon creation via Init.
@@ -502,6 +506,30 @@ func (o *OrderedChildren) Insert(name string, child Inode) error {
 	return o.insert(name, child, false)
 }
 
+// Inserter is like Insert, but obtains the child to insert by calling
+// makeChild. makeChild is only called if the insert will succeed. This allows
+// the caller to atomically check and insert a child without having to
+// clean up the child on failure.
+func (o *OrderedChildren) Inserter(name string, makeChild func() Inode) (Inode, error) {
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	if _, ok := o.set[name]; ok {
+		return nil, linuxerr.EEXIST
+	}
+
+	// Note: We must not fail after we call makeChild().
+
+	child := makeChild()
+	s := &slot{
+		name:   name,
+		inode:  child,
+		static: false,
+	}
+	o.order.PushBack(s)
+	o.set[name] = s
+	return child, nil
+}
+
 // insert inserts child into o.
 //
 // Precondition: Caller must be holding a ref on child if static is true.
@@ -559,7 +587,7 @@ func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, n
 func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error {
 	s, ok := o.set[name]
 	if !ok {
-		return syserror.ENOENT
+		return linuxerr.ENOENT
 	}
 	if s.inode != child {
 		panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child))
@@ -746,5 +774,5 @@ type InodeNoStatFS struct{}
 
 // StatFS implements Inode.StatFS.
 func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
-	return linux.Statfs{}, syserror.ENOSYS
+	return linux.Statfs{}, linuxerr.ENOSYS
 }
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 0e2867d49..90c8b75d1 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -66,6 +66,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
@@ -542,6 +543,63 @@ func (d *Dentry) FSLocalPath() string {
 	return b.String()
 }
 
+// WalkDentryTree traverses p in the dentry tree for this filesystem. Note that
+// this only traverses the dentry tree and is not a general path traversal.
+// Neither symlinks nor dynamic children are resolved, and no permission checks
+// are performed. The caller is responsible for ensuring the returned Dentry
+// exists for an appropriate lifetime.
+//
+// p is interpreted starting at d, and may be absolute or relative (absolute vs
+// relative paths both refer to the same target here, since p is absolute from
+// d). p may contain "." and "..", but will not allow traversal above d (similar
+// to ".." at the root dentry).
+//
+// This is useful for filesystem internals, where the filesystem may not be
+// mounted yet. For a mounted filesystem, use GetDentryAt.
+func (d *Dentry) WalkDentryTree(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (*Dentry, error) {
+	d.fs.mu.RLock()
+	defer d.fs.processDeferredDecRefs(ctx)
+	defer d.fs.mu.RUnlock()
+
+	target := d
+
+	for pit := p.Begin; pit.Ok(); pit = pit.Next() {
+		pc := pit.String()
+
+		switch {
+		case target == nil:
+			return nil, syserror.ENOENT
+		case pc == ".":
+			// No-op, consume component and continue.
+		case pc == "..":
+			if target == d {
+				// Don't let .. traverse above the start point of the walk.
+				continue
+			}
+			target = target.parent
+			// Parent doesn't need revalidation since we revalidated it on the
+			// way to the child, and we're still holding fs.mu.
+		default:
+			var err error
+
+			d.dirMu.Lock()
+			target, err = d.fs.revalidateChildLocked(ctx, vfsObj, target, pc, target.children[pc])
+			d.dirMu.Unlock()
+
+			if err != nil {
+				return nil, err
+			}
+		}
+	}
+
+	if target == nil {
+		return nil, syserror.ENOENT
+	}
+
+	target.IncRef()
+	return target, nil
+}
+
 // The Inode interface maps filesystem-level operations that operate on paths to
 // equivalent operations on specific filesystem nodes.
 //
@@ -667,12 +725,15 @@ type inodeDirectory interface {
 	// RmDir removes an empty child directory from this directory
 	// inode. Implementations must update the parent directory's link count,
 	// if required. Implementations are not responsible for checking that child
-	// is a directory, checking for an empty directory.
+	// is a directory, or checking for an empty directory.
 	RmDir(ctx context.Context, name string, child Inode) error
 
 	// Rename is called on the source directory containing an inode being
-	// renamed. child should point to the resolved child in the source
-	// directory.
+	// renamed. child points to the resolved child in the source directory.
+	// dstDir is guaranteed to be a directory inode.
+	//
+	// On a successful call to Rename, the caller updates the dentry tree to
+	// reflect the name change.
 	//
 	// Precondition: Caller must serialize concurrent calls to Rename.
 	Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 609887943..a2aba9321 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/errors/linuxerr"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
@@ -346,3 +347,63 @@ func TestDirFDIterDirents(t *testing.T) {
 		"file1": linux.DT_REG,
 	})
 }
+
+func TestDirWalkDentryTree(t *testing.T) {
+	sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+		return fs.newDir(ctx, creds, 0755, map[string]kernfs.Inode{
+			"dir1": fs.newDir(ctx, creds, 0755, nil),
+			"dir2": fs.newDir(ctx, creds, 0755, map[string]kernfs.Inode{
+				"file1": fs.newFile(ctx, creds, staticFileContent),
+				"dir3":  fs.newDir(ctx, creds, 0755, nil),
+			}),
+		})
+	})
+	defer sys.Destroy()
+
+	testWalk := func(from *kernfs.Dentry, getDentryPath, walkPath string, expectedErr error) {
+		var d *kernfs.Dentry
+		if getDentryPath != "" {
+			pop := sys.PathOpAtRoot(getDentryPath)
+			vd := sys.GetDentryOrDie(pop)
+			defer vd.DecRef(sys.Ctx)
+			d = vd.Dentry().Impl().(*kernfs.Dentry)
+		}
+
+		match, err := from.WalkDentryTree(sys.Ctx, sys.VFS, fspath.Parse(walkPath))
+		if err == nil {
+			defer match.DecRef(sys.Ctx)
+		}
+
+		if err != expectedErr {
+			t.Fatalf("WalkDentryTree from %q to %q (with expected error: %v) unexpected error, want: %v, got: %v", from.FSLocalPath(), walkPath, expectedErr, expectedErr, err)
+		}
+		if expectedErr != nil {
+			return
+		}
+
+		if d != match {
+			t.Fatalf("WalkDentryTree from %q to %q (with expected error: %v) found unexpected dentry; want: %v, got: %v", from.FSLocalPath(), walkPath, expectedErr, d, match)
+		}
+	}
+
+	rootD := sys.Root.Dentry().Impl().(*kernfs.Dentry)
+
+	testWalk(rootD, "dir1", "/dir1", nil)
+	testWalk(rootD, "", "/dir-non-existent", linuxerr.ENOENT)
+	testWalk(rootD, "", "/dir1/child-non-existent", linuxerr.ENOENT)
+	testWalk(rootD, "", "/dir2/inner-non-existent/dir3", linuxerr.ENOENT)
+
+	testWalk(rootD, "dir2/dir3", "/dir2/../dir2/dir3", nil)
+	testWalk(rootD, "dir2/dir3", "/dir2/././dir3", nil)
+	testWalk(rootD, "dir2/dir3", "/dir2/././dir3/.././dir3", nil)
+
+	pop := sys.PathOpAtRoot("dir2")
+	dir2VD := sys.GetDentryOrDie(pop)
+	defer dir2VD.DecRef(sys.Ctx)
+	dir2D := dir2VD.Dentry().Impl().(*kernfs.Dentry)
+
+	testWalk(dir2D, "dir2/dir3", "/dir3", nil)
+	testWalk(dir2D, "dir2/dir3", "/../../../dir3", nil)
+	testWalk(dir2D, "dir2/file1", "/file1", nil)
+	testWalk(dir2D, "dir2/file1", "file1", nil)
+}
diff --git a/pkg/sentry/kernel/cgroup.go b/pkg/sentry/kernel/cgroup.go
index c93ef6ac1..a0e291f58 100644
--- a/pkg/sentry/kernel/cgroup.go
+++ b/pkg/sentry/kernel/cgroup.go
@@ -196,6 +196,7 @@ func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Files
 				// uniqueness of controllers enforced by Register, drop the
 				// dying hierarchy now. The eventual unregister by the FS
 				// teardown will become a no-op.
+				r.unregisterLocked(h.id)
 				return nil
 			}
 			return h.fs
diff --git a/test/syscalls/linux/cgroup.cc b/test/syscalls/linux/cgroup.cc
index f29891571..ca23dfeee 100644
--- a/test/syscalls/linux/cgroup.cc
+++ b/test/syscalls/linux/cgroup.cc
@@ -279,6 +279,23 @@ TEST(Cgroup, UnmountRepeated) {
   EXPECT_THAT(umount(c.Path().c_str()), SyscallFailsWithErrno(EINVAL));
 }
 
+TEST(Cgroup, Create) {
+  SKIP_IF(!CgroupsAvailable());
+  Mounter m(ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()));
+  Cgroup c = ASSERT_NO_ERRNO_AND_VALUE(m.MountCgroupfs(""));
+  ASSERT_NO_ERRNO(c.CreateChild("child1"));
+  EXPECT_TRUE(ASSERT_NO_ERRNO_AND_VALUE(Exists(c.Path())));
+}
+
+TEST(Cgroup, SubcontainerInitiallyEmpty) {
+  SKIP_IF(!CgroupsAvailable());
+  Mounter m(ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()));
+  Cgroup c = ASSERT_NO_ERRNO_AND_VALUE(m.MountCgroupfs(""));
+  Cgroup child = ASSERT_NO_ERRNO_AND_VALUE(c.CreateChild("child1"));
+  auto procs = ASSERT_NO_ERRNO_AND_VALUE(child.Procs());
+  EXPECT_TRUE(procs.empty());
+}
+
 TEST(MemoryCgroup, MemoryUsageInBytes) {
   SKIP_IF(!CgroupsAvailable());
 
diff --git a/test/util/cgroup_util.cc b/test/util/cgroup_util.cc
index 977993f41..df3c57b87 100644
--- a/test/util/cgroup_util.cc
+++ b/test/util/cgroup_util.cc
@@ -25,12 +25,26 @@
 namespace gvisor {
 namespace testing {
 
-Cgroup::Cgroup(std::string_view path) : cgroup_path_(path) {
+Cgroup::Cgroup(absl::string_view path) : cgroup_path_(path) {
   id_ = ++Cgroup::next_id_;
   std::cerr << absl::StreamFormat("[cg#%d] <= %s", id_, cgroup_path_)
             << std::endl;
 }
 
+PosixErrorOr<Cgroup> Cgroup::RecursivelyCreate(absl::string_view path) {
+  RETURN_IF_ERRNO(RecursivelyCreateDir(path));
+  return Cgroup(path);
+}
+
+PosixErrorOr<Cgroup> Cgroup::Create(absl::string_view path) {
+  RETURN_IF_ERRNO(Mkdir(path));
+  return Cgroup(path);
+}
+
+PosixErrorOr<Cgroup> Cgroup::CreateChild(absl::string_view name) const {
+  return Cgroup::Create(JoinPath(Path(), name));
+}
+
 PosixErrorOr<std::string> Cgroup::ReadControlFile(
     absl::string_view name) const {
   std::string buf;
@@ -93,7 +107,7 @@ PosixErrorOr<absl::flat_hash_set<pid_t>> Cgroup::ParsePIDList(
     absl::string_view data) const {
   absl::flat_hash_set<pid_t> res;
   std::vector<absl::string_view> lines = absl::StrSplit(data, '\n');
-  for (const std::string_view& line : lines) {
+  for (const absl::string_view& line : lines) {
     if (line.empty()) {
       continue;
     }
diff --git a/test/util/cgroup_util.h b/test/util/cgroup_util.h
index e3f696a89..ccc7219e3 100644
--- a/test/util/cgroup_util.h
+++ b/test/util/cgroup_util.h
@@ -34,8 +34,20 @@ class Cgroup {
 
   uint64_t id() const { return id_; }
 
+  // RecursivelyCreate creates cgroup specified by path, including all
+  // components leading up to path. Path should end inside a cgroupfs mount. If
+  // path already exists, RecursivelyCreate does nothing and silently succeeds.
+  static PosixErrorOr<Cgroup> RecursivelyCreate(std::string_view path);
+
+  // Creates a new cgroup at path. The parent directory must exist and be a
+  // cgroupfs directory.
+  static PosixErrorOr<Cgroup> Create(std::string_view path);
+
   const std::string& Path() const { return cgroup_path_; }
 
+  // Creates a child cgroup under this cgroup with the given name.
+  PosixErrorOr<Cgroup> CreateChild(std::string_view name) const;
+
   std::string Relpath(absl::string_view leaf) const {
     return JoinPath(cgroup_path_, leaf);
   }
diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc
index 483ae848d..253411858 100644
--- a/test/util/fs_util.cc
+++ b/test/util/fs_util.cc
@@ -201,7 +201,8 @@ PosixError UnlinkAt(const FileDescriptor& dfd, absl::string_view path,
 PosixError Mkdir(absl::string_view path, int mode) {
   int res = mkdir(std::string(path).c_str(), mode);
   if (res < 0) {
-    return PosixError(errno, absl::StrCat("mkdir ", path, " mode ", mode));
+    return PosixError(errno,
+                      absl::StrFormat("mkdir \"%s\" mode %#o", path, mode));
   }
 
   return NoError();
author	Rahat Mahmood <rahat@google.com>	2021-08-11 17:18:53 -0700
committer	gVisor bot <gvisor-bot@google.com>	2021-08-11 17:21:37 -0700
commit	a50596874a4971167f97a05181363e91292a2885 (patch)
tree	fea65128126ec05e6fe0b96b4a74369bf6aa208b
parent	09b453cec07bceeb4185bc9bc951efbda366472b (diff)