7 files changed, 107 insertions, 22 deletions
diff --git a/pkg/sentry/fsimpl/cgroupfs/base.go b/pkg/sentry/fsimpl/cgroupfs/base.go
index 0f54888d8..6512e9cdb 100644
--- a/pkg/sentry/fsimpl/cgroupfs/base.go
+++ b/pkg/sentry/fsimpl/cgroupfs/base.go
@@ -26,7 +26,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -68,11 +67,6 @@ func (c *controllerCommon) Enabled() bool {
 	return true
 }
 
-// Filesystem implements kernel.CgroupController.Filesystem.
-func (c *controllerCommon) Filesystem() *vfs.Filesystem {
-	return c.fs.VFSFilesystem()
-}
-
 // RootCgroup implements kernel.CgroupController.RootCgroup.
 func (c *controllerCommon) RootCgroup() kernel.Cgroup {
 	return c.fs.rootCgroup()
diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
index bd3e69757..54050de3c 100644
--- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
+++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go
@@ -109,7 +109,7 @@ type InternalData struct {
 	DefaultControlValues map[string]int64
 }
 
-// filesystem implements vfs.FilesystemImpl.
+// filesystem implements vfs.FilesystemImpl and kernel.cgroupFS.
 //
 // +stateify savable
 type filesystem struct {
@@ -139,6 +139,11 @@ type filesystem struct {
 	tasksMu sync.RWMutex `state:"nosave"`
 }
 
+// InitializeHierarchyID implements kernel.cgroupFS.InitializeHierarchyID.
+func (fs *filesystem) InitializeHierarchyID(hid uint32) {
+	fs.hierarchyID = hid // No fs lock taken: kernel.cgroupFS requires this be called before fs is visible to vfs.
+}
+
 // Name implements vfs.FilesystemType.Name.
 func (FilesystemType) Name() string {
 	return Name
@@ -284,14 +289,12 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	// Register controllers. The registry may be modified concurrently, so if we
 	// get an error, we raced with someone else who registered the same
 	// controllers first.
-	hid, err := r.Register(fs.kcontrollers)
-	if err != nil {
+	if err := r.Register(fs.kcontrollers, fs); err != nil {
 		ctx.Infof("cgroupfs.FilesystemType.GetFilesystem: failed to register new hierarchy with controllers %v: %v", wantControllers, err)
 		rootD.DecRef(ctx)
 		fs.VFSFilesystem().DecRef(ctx)
 		return nil, nil, syserror.EBUSY
 	}
-	fs.hierarchyID = hid
 
 	// Move all existing tasks to the root of the new hierarchy.
 	k.PopulateNewCgroupHierarchy(fs.rootCgroup())
diff --git a/pkg/sentry/kernel/cgroup.go b/pkg/sentry/kernel/cgroup.go
index 1f1c63f37..0fbf27f64 100644
--- a/pkg/sentry/kernel/cgroup.go
+++ b/pkg/sentry/kernel/cgroup.go
@@ -48,10 +48,6 @@ type CgroupController interface {
 	// attached to. Returned value is valid for the lifetime of the controller.
 	HierarchyID() uint32
 
-	// Filesystem returns the filesystem this controller is attached to.
-	// Returned value is valid for the lifetime of the controller.
-	Filesystem() *vfs.Filesystem
-
 	// RootCgroup returns the root cgroup for this controller. Returned value is
 	// valid for the lifetime of the controller.
 	RootCgroup() Cgroup
@@ -124,6 +120,19 @@ func (h *hierarchy) match(ctypes []CgroupControllerType) bool {
 	return true
 }
 
+// cgroupFS is the public interface to cgroupfs. This lets the kernel package
+// refer to cgroupfs.filesystem methods without directly depending on the
+// cgroupfs package, which would lead to a circular dependency.
+type cgroupFS interface {
+	// VFSFilesystem returns the vfs.Filesystem for the cgroupfs.
+	VFSFilesystem() *vfs.Filesystem
+
+	// InitializeHierarchyID sets the hierarchy ID for this filesystem during
+	// filesystem creation. May only be called before the filesystem is visible
+	// to the vfs layer.
+	InitializeHierarchyID(hid uint32)
+}
+
 // CgroupRegistry tracks the active set of cgroup controllers on the system.
 //
 // +stateify savable
@@ -182,31 +191,35 @@ func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Files
 
 // Register registers the provided set of controllers with the registry as a new
 // hierarchy. If any controller is already registered, the function returns an
-// error without modifying the registry. The hierarchy can be later referenced
-// by the returned id.
-func (r *CgroupRegistry) Register(cs []CgroupController) (uint32, error) {
+// error without modifying the registry. Register sets the hierarchy ID for the
+// filesystem on success.
+func (r *CgroupRegistry) Register(cs []CgroupController, fs cgroupFS) error {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 
 	if len(cs) == 0 {
-		return InvalidCgroupHierarchyID, fmt.Errorf("can't register hierarchy with no controllers")
+		return fmt.Errorf("can't register hierarchy with no controllers")
 	}
 
 	for _, c := range cs {
 		if _, ok := r.controllers[c.Type()]; ok {
-			return InvalidCgroupHierarchyID, fmt.Errorf("controllers may only be mounted on a single hierarchy")
+			return fmt.Errorf("controllers may only be mounted on a single hierarchy")
 		}
 	}
 
 	hid, err := r.nextHierarchyID()
 	if err != nil {
-		return hid, err
+		return err
 	}
 
+	// Must not fail below here, once we publish the hierarchy ID.
+
+	fs.InitializeHierarchyID(hid)
+
 	h := hierarchy{
 		id:          hid,
 		controllers: make(map[CgroupControllerType]CgroupController),
-		fs:          cs[0].Filesystem(),
+		fs:          fs.VFSFilesystem(),
 	}
 	for _, c := range cs {
 		n := c.Type()
@@ -214,7 +227,7 @@ func (r *CgroupRegistry) Register(cs []CgroupController) (uint32, error) {
 		h.controllers[n] = c
 	}
 	r.hierarchies[hid] = h
-	return hid, nil
+	return nil
 }
 
 // Unregister removes a previously registered hierarchy from the registry. If
@@ -253,6 +266,11 @@ func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[C
 	for name, ctl := range r.controllers {
 		if _, ok := ctlSet[name]; !ok {
 			cg := ctl.RootCgroup()
+			// Multiple controllers may share the same hierarchy, so may have
+			// the same root cgroup. Grab a single ref per hierarchy root.
+			if _, ok := cgset[cg]; ok {
+				continue
+			}
 			cg.IncRef() // Ref transferred to caller.
 			cgset[cg] = struct{}{}
 		}
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 94a582256..efed4aeb0 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -4247,10 +4247,12 @@ cc_binary(
         "//test/util:mount_util",
         "@com_google_absl//absl/strings",
         gtest,
+        "//test/util:cleanup",
         "//test/util:posix_error",
         "//test/util:temp_path",
         "//test/util:test_main",
         "//test/util:test_util",
+        "//test/util:thread_util",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
     ],
diff --git a/test/syscalls/linux/cgroup.cc b/test/syscalls/linux/cgroup.cc
index 70ad5868f..a009ade7e 100644
--- a/test/syscalls/linux/cgroup.cc
+++ b/test/syscalls/linux/cgroup.cc
@@ -25,9 +25,11 @@
 #include "absl/strings/str_split.h"
 #include "test/util/capability_util.h"
 #include "test/util/cgroup_util.h"
+#include "test/util/cleanup.h"
 #include "test/util/mount_util.h"
 #include "test/util/temp_path.h"
 #include "test/util/test_util.h"
+#include "test/util/thread_util.h"
 
 namespace gvisor {
 namespace testing {
@@ -192,6 +194,56 @@ TEST(Cgroup, MoptAllMustBeExclusive) {
       SyscallFailsWithErrno(EINVAL));
 }
 
+TEST(Cgroup, MountRace) {
+  SKIP_IF(!CgroupsAvailable());
+
+  TempPath mountpoint = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+  const DisableSave ds;  // Too many syscalls.
+
+  auto mount_thread = [&mountpoint]() {
+    for (int i = 0; i < 100; ++i) {
+      mount("none", mountpoint.path().c_str(), "cgroup", 0, 0);
+    }
+  };
+  std::list<ScopedThread> threads;
+  for (int i = 0; i < 10; ++i) {
+    threads.emplace_back(mount_thread);
+  }
+  for (auto& t : threads) {
+    t.Join();
+  }
+
+  auto cleanup = Cleanup([&mountpoint] {
+    // Issue one umount per attempted mount (10 threads * 100 tries), plus one
+    // extra. Umounts with no matching successful mount fail and are ignored.
+    for (int i = 0; i < (10 * 100) + 1; ++i) {
+      umount(mountpoint.path().c_str());
+    }
+  });
+
+  Cgroup c = Cgroup(mountpoint.path());
+  // c should be a valid cgroup.
+  EXPECT_NO_ERRNO(c.ContainsCallingProcess());
+}
+
+TEST(Cgroup, UnmountRepeated) {
+  SKIP_IF(!CgroupsAvailable());
+
+  const DisableSave ds;  // Too many syscalls.
+
+  Mounter m(ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()));
+  Cgroup c = ASSERT_NO_ERRNO_AND_VALUE(m.MountCgroupfs(""));
+
+  // First unmount should succeed.
+  EXPECT_THAT(umount(c.Path().c_str()), SyscallSucceeds());
+
+  // We unmounted manually above, so drop m's bookkeeping for c.
+  m.release(c);
+
+  EXPECT_THAT(umount(c.Path().c_str()), SyscallFailsWithErrno(EINVAL));
+}
+
 TEST(MemoryCgroup, MemoryUsageInBytes) {
   SKIP_IF(!CgroupsAvailable());
 
diff --git a/test/util/cgroup_util.cc b/test/util/cgroup_util.cc
index 04d4f8de0..977993f41 100644
--- a/test/util/cgroup_util.cc
+++ b/test/util/cgroup_util.cc
@@ -142,6 +142,20 @@ PosixError Mounter::Unmount(const Cgroup& c) {
   return NoError();
 }
 
+void Mounter::release(const Cgroup& c) {  // Stop tracking c's mount state.
+  auto mp = mountpoints_.find(c.id());
+  if (mp != mountpoints_.end()) {
+    mp->second.release();  // NOTE(review): presumably leaves the dir on disk.
+    mountpoints_.erase(mp);
+  }
+
+  auto m = mounts_.find(c.id());
+  if (m != mounts_.end()) {
+    m->second.Release();  // Presumably disarms the umount cleanup; confirm.
+    mounts_.erase(m);
+  }
+}
+
 constexpr char kProcCgroupsHeader[] =
     "#subsys_name\thierarchy\tnum_cgroups\tenabled";
 
diff --git a/test/util/cgroup_util.h b/test/util/cgroup_util.h
index b797a8b24..e3f696a89 100644
--- a/test/util/cgroup_util.h
+++ b/test/util/cgroup_util.h
@@ -83,6 +83,8 @@ class Mounter {
 
   PosixError Unmount(const Cgroup& c);
 
+  void release(const Cgroup& c);  // Stops tracking c's mountpoint and mount.
+
  private:
   // The destruction order of these members avoids errors during cleanup. We
   // first unmount (by executing the mounts_ cleanups), then delete the