5 files changed, 74 insertions, 20 deletions
diff --git a/pkg/sentry/kernel/cgroup.go b/pkg/sentry/kernel/cgroup.go
index 1f1c63f37..c93ef6ac1 100644
--- a/pkg/sentry/kernel/cgroup.go
+++ b/pkg/sentry/kernel/cgroup.go
@@ -48,10 +48,6 @@ type CgroupController interface {
 	// attached to. Returned value is valid for the lifetime of the controller.
 	HierarchyID() uint32
 
-	// Filesystem returns the filesystem this controller is attached to.
-	// Returned value is valid for the lifetime of the controller.
-	Filesystem() *vfs.Filesystem
-
 	// RootCgroup returns the root cgroup for this controller. Returned value is
 	// valid for the lifetime of the controller.
 	RootCgroup() Cgroup
@@ -124,6 +120,19 @@ func (h *hierarchy) match(ctypes []CgroupControllerType) bool {
 	return true
 }
 
+// cgroupFS is the public interface to cgroupfs. This lets the kernel package
+// refer to cgroupfs.filesystem methods without directly depending on the
+// cgroupfs package, which would lead to a circular dependency.
+type cgroupFS interface {
+	// VFSFilesystem returns the vfs.Filesystem for the cgroupfs.
+	VFSFilesystem() *vfs.Filesystem
+
+	// InitializeHierarchyID sets the hierarchy ID for this filesystem during
+	// filesystem creation. May only be called before the filesystem is visible
+	// to the vfs layer.
+	InitializeHierarchyID(hid uint32)
+}
+
 // CgroupRegistry tracks the active set of cgroup controllers on the system.
 //
 // +stateify savable
@@ -172,7 +181,24 @@ func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Files
 
 	for _, h := range r.hierarchies {
 		if h.match(ctypes) {
-			h.fs.IncRef()
+			if !h.fs.TryIncRef() {
+				// Racing with filesystem destruction, namely h.fs.Release.
+				// Since we hold r.mu, we know the hierarchy hasn't been
+				// unregistered yet, but its associated filesystem is tearing
+				// down.
+				//
+				// If we simply indicate the hierarchy wasn't found without
+				// cleaning up the registry, the caller can race with the
+				// unregister and find itself temporarily unable to create a new
+				// hierarchy with a subset of the relevant controllers.
+				//
+				// To keep the result of FindHierarchy consistent with the
+				// uniqueness of controllers enforced by Register, drop the
+				// dying hierarchy now. The eventual unregister by the FS
+				// teardown will become a no-op.
+				r.unregisterLocked(h.id)
+				return nil
+			}
 			return h.fs
 		}
 	}
@@ -182,31 +207,35 @@ func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Files
 
 // Register registers the provided set of controllers with the registry as a new
 // hierarchy. If any controller is already registered, the function returns an
-// error without modifying the registry. The hierarchy can be later referenced
-// by the returned id.
-func (r *CgroupRegistry) Register(cs []CgroupController) (uint32, error) {
+// error without modifying the registry. Register sets the hierarchy ID for the
+// filesystem on success.
+func (r *CgroupRegistry) Register(cs []CgroupController, fs cgroupFS) error {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 
 	if len(cs) == 0 {
-		return InvalidCgroupHierarchyID, fmt.Errorf("can't register hierarchy with no controllers")
+		return fmt.Errorf("can't register hierarchy with no controllers")
 	}
 
 	for _, c := range cs {
 		if _, ok := r.controllers[c.Type()]; ok {
-			return InvalidCgroupHierarchyID, fmt.Errorf("controllers may only be mounted on a single hierarchy")
+			return fmt.Errorf("controllers may only be mounted on a single hierarchy")
 		}
 	}
 
 	hid, err := r.nextHierarchyID()
 	if err != nil {
-		return hid, err
+		return err
 	}
 
+	// Must not fail below here, once we publish the hierarchy ID.
+
+	fs.InitializeHierarchyID(hid)
+
 	h := hierarchy{
 		id:          hid,
 		controllers: make(map[CgroupControllerType]CgroupController),
-		fs:          cs[0].Filesystem(),
+		fs:          fs.VFSFilesystem(),
 	}
 	for _, c := range cs {
 		n := c.Type()
@@ -214,15 +243,20 @@ func (r *CgroupRegistry) Register(cs []CgroupController) (uint32, error) {
 		h.controllers[n] = c
 	}
 	r.hierarchies[hid] = h
-	return hid, nil
+	return nil
 }
 
-// Unregister removes a previously registered hierarchy from the registry. If
-// the controller was not previously registered, Unregister is a no-op.
+// Unregister removes a previously registered hierarchy from the registry. If no
+// such hierarchy is registered, Unregister is a no-op.
 func (r *CgroupRegistry) Unregister(hid uint32) {
 	r.mu.Lock()
-	defer r.mu.Unlock()
+	r.unregisterLocked(hid)
+	r.mu.Unlock()
+}
 
+// Precondition: Caller must hold r.mu.
+// +checklocks:r.mu
+func (r *CgroupRegistry) unregisterLocked(hid uint32) {
 	if h, ok := r.hierarchies[hid]; ok {
 		for name, _ := range h.controllers {
 			delete(r.controllers, name)
@@ -253,6 +287,11 @@ func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[C
 	for name, ctl := range r.controllers {
 		if _, ok := ctlSet[name]; !ok {
 			cg := ctl.RootCgroup()
+			// Multiple controllers may share the same hierarchy, so may have
+			// the same root cgroup. Grab a single ref per hierarchy root.
+			if _, ok := cgset[cg]; ok {
+				continue
+			}
 			cg.IncRef() // Ref transferred to caller.
 			cgset[cg] = struct{}{}
 		}
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 10885688c..62777faa8 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -154,9 +154,11 @@ func (f *FDTable) drop(ctx context.Context, file *fs.File) {
 // dropVFS2 drops the table reference.
 func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) {
 	// Release any POSIX lock possibly held by the FDTable.
-	err := file.UnlockPOSIX(ctx, f, lock.LockRange{0, lock.LockEOF})
-	if err != nil && err != syserror.ENOLCK {
-		panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
+	if file.SupportsLocks() {
+		err := file.UnlockPOSIX(ctx, f, lock.LockRange{0, lock.LockEOF})
+		if err != nil && err != syserror.ENOLCK {
+			panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
+		}
 	}
 
 	// Drop the table's reference.
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index 2d89b9ccd..24e467e93 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -86,6 +86,12 @@ func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error)
 	if n > 0 {
 		p.Notify(waiter.ReadableEvents)
 	}
+	if err == unix.EPIPE {
+		// If we are returning EPIPE send SIGPIPE to the task.
+		if sendSig := linux.SignalNoInfoFuncFromContext(ctx); sendSig != nil {
+			sendSig(linux.SIGPIPE)
+		}
+	}
 	return n, err
 }
 
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index fe2ab1662..3c5bd8ff7 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -702,7 +702,9 @@ func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool {
 	return s.checkCapability(creds)
 }
 
-// destroy destroys the set. Caller must hold 's.mu'.
+// destroy destroys the set.
+//
+// Preconditions: Caller must hold 's.mu'.
 func (s *Set) destroy() {
 	// Notify all waiters. They will fail on the next attempt to execute
 	// operations and return error.
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 70b0699dc..c82d9e82b 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"time"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -113,6 +114,10 @@ func (t *Task) contextValue(key interface{}, isTaskGoroutine bool) interface{} {
 		return t.k.RealtimeClock()
 	case limits.CtxLimits:
 		return t.tg.limits
+	case linux.CtxSignalNoInfoFunc:
+		return func(sig linux.Signal) error {
+			return t.SendSignal(SignalInfoNoInfo(sig, t, t))
+		}
 	case pgalloc.CtxMemoryFile:
 		return t.k.mf
 	case pgalloc.CtxMemoryFileProvider: