summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r--pkg/sentry/kernel/context.go3
-rw-r--r--pkg/sentry/kernel/ipc_namespace.go14
-rw-r--r--pkg/sentry/kernel/ipc_namespace_refs.go118
-rw-r--r--pkg/sentry/kernel/kernel.go11
-rw-r--r--pkg/sentry/kernel/kernel_state_autogen.go39
-rw-r--r--pkg/sentry/kernel/shm/shm.go46
-rw-r--r--pkg/sentry/kernel/task.go4
-rw-r--r--pkg/sentry/kernel/task_clone.go5
-rw-r--r--pkg/sentry/kernel/task_exit.go1
-rw-r--r--pkg/sentry/kernel/task_start.go1
10 files changed, 220 insertions, 22 deletions
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
index dd5f0f5fa..bb94769c4 100644
--- a/pkg/sentry/kernel/context.go
+++ b/pkg/sentry/kernel/context.go
@@ -81,7 +81,8 @@ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace {
}
// IPCNamespaceFromContext returns the IPC namespace in which ctx is executing,
-// or nil if there is no such IPC namespace.
+// or nil if there is no such IPC namespace. It takes a reference on the
+// namespace.
func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace {
if v := ctx.Value(CtxIPCNamespace); v != nil {
return v.(*IPCNamespace)
diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go
index 80a070d7e..3f34ee0db 100644
--- a/pkg/sentry/kernel/ipc_namespace.go
+++ b/pkg/sentry/kernel/ipc_namespace.go
@@ -15,6 +15,7 @@
package kernel
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/semaphore"
"gvisor.dev/gvisor/pkg/sentry/kernel/shm"
@@ -24,6 +25,8 @@ import (
//
// +stateify savable
type IPCNamespace struct {
+ IPCNamespaceRefs
+
// User namespace which owns this IPC namespace. Immutable.
userNS *auth.UserNamespace
@@ -33,11 +36,13 @@ type IPCNamespace struct {
// NewIPCNamespace creates a new IPC namespace.
func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace {
- return &IPCNamespace{
+ ns := &IPCNamespace{
userNS: userNS,
semaphores: semaphore.NewRegistry(userNS),
shms: shm.NewRegistry(userNS),
}
+ ns.EnableLeakCheck()
+ return ns
}
// SemaphoreRegistry returns the semaphore set registry for this namespace.
@@ -50,6 +55,13 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry {
return i.shms
}
+// DecRef implements refs_vfs2.RefCounter.DecRef.
+func (i *IPCNamespace) DecRef(ctx context.Context) {
+ i.IPCNamespaceRefs.DecRef(func() {
+ i.shms.Release(ctx)
+ })
+}
+
// IPCNamespace returns the task's IPC namespace.
func (t *Task) IPCNamespace() *IPCNamespace {
t.mu.Lock()
diff --git a/pkg/sentry/kernel/ipc_namespace_refs.go b/pkg/sentry/kernel/ipc_namespace_refs.go
new file mode 100644
index 000000000..263919299
--- /dev/null
+++ b/pkg/sentry/kernel/ipc_namespace_refs.go
@@ -0,0 +1,118 @@
+package kernel
+
+import (
+ "fmt"
+ "runtime"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/log"
+ refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
+)
+
+// ownerType is used to customize logging. Note that we use a pointer to T so
+// that we do not copy the entire object when passed as a format parameter.
+var IPCNamespaceownerType *IPCNamespace
+
+// Refs implements refs.RefCounter. It keeps a reference count using atomic
+// operations and calls the destructor when the count reaches zero.
+//
+// Note that the number of references is actually refCount + 1 so that a default
+// zero-value Refs object contains one reference.
+//
+// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in
+// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount.
+// This will allow us to add stack trace information to the leak messages
+// without growing the size of Refs.
+//
+// +stateify savable
+type IPCNamespaceRefs struct {
+ // refCount is composed of two fields:
+ //
+ // [32-bit speculative references]:[32-bit real references]
+ //
+ // Speculative references are used for TryIncRef, to avoid a CompareAndSwap
+ // loop. See IncRef, DecRef and TryIncRef for details of how these fields are
+ // used.
+ refCount int64
+}
+
+func (r *IPCNamespaceRefs) finalize() {
+ var note string
+ switch refs_vfs1.GetLeakMode() {
+ case refs_vfs1.NoLeakChecking:
+ return
+ case refs_vfs1.UninitializedLeakChecking:
+ note = "(Leak checker uninitialized): "
+ }
+ if n := r.ReadRefs(); n != 0 {
+ log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, IPCNamespaceownerType, n)
+ }
+}
+
+// EnableLeakCheck checks for reference leaks when Refs gets garbage collected.
+func (r *IPCNamespaceRefs) EnableLeakCheck() {
+ if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking {
+ runtime.SetFinalizer(r, (*IPCNamespaceRefs).finalize)
+ }
+}
+
+// ReadRefs returns the current number of references. The returned count is
+// inherently racy and is unsafe to use without external synchronization.
+func (r *IPCNamespaceRefs) ReadRefs() int64 {
+
+ return atomic.LoadInt64(&r.refCount) + 1
+}
+
+// IncRef implements refs.RefCounter.IncRef.
+//
+//go:nosplit
+func (r *IPCNamespaceRefs) IncRef() {
+ if v := atomic.AddInt64(&r.refCount, 1); v <= 0 {
+ panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, IPCNamespaceownerType))
+ }
+}
+
+// TryIncRef implements refs.RefCounter.TryIncRef.
+//
+// To do this safely without a loop, a speculative reference is first acquired
+// on the object. This allows multiple concurrent TryIncRef calls to distinguish
+// other TryIncRef calls from genuine references held.
+//
+//go:nosplit
+func (r *IPCNamespaceRefs) TryIncRef() bool {
+ const speculativeRef = 1 << 32
+ v := atomic.AddInt64(&r.refCount, speculativeRef)
+ if int32(v) < 0 {
+
+ atomic.AddInt64(&r.refCount, -speculativeRef)
+ return false
+ }
+
+ atomic.AddInt64(&r.refCount, -speculativeRef+1)
+ return true
+}
+
+// DecRef implements refs.RefCounter.DecRef.
+//
+// Note that speculative references are counted here. Since they were added
+// prior to real references reaching zero, they will successfully convert to
+// real references. In other words, we see speculative references only in the
+// following case:
+//
+// A: TryIncRef [speculative increase => sees non-negative references]
+// B: DecRef [real decrease]
+// A: TryIncRef [transform speculative to real]
+//
+//go:nosplit
+func (r *IPCNamespaceRefs) DecRef(destroy func()) {
+ switch v := atomic.AddInt64(&r.refCount, -1); {
+ case v < -1:
+ panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, IPCNamespaceownerType))
+
+ case v == -1:
+
+ if destroy != nil {
+ destroy()
+ }
+ }
+}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 675506269..652cbb732 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -828,7 +828,9 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
case CtxUTSNamespace:
return ctx.args.UTSNamespace
case CtxIPCNamespace:
- return ctx.args.IPCNamespace
+ ipcns := ctx.args.IPCNamespace
+ ipcns.IncRef()
+ return ipcns
case auth.CtxCredentials:
return ctx.args.Credentials
case fs.CtxRoot:
@@ -1374,8 +1376,9 @@ func (k *Kernel) RootUTSNamespace() *UTSNamespace {
return k.rootUTSNamespace
}
-// RootIPCNamespace returns the root IPCNamespace.
+// RootIPCNamespace takes a reference and returns the root IPCNamespace.
func (k *Kernel) RootIPCNamespace() *IPCNamespace {
+ k.rootIPCNamespace.IncRef()
return k.rootIPCNamespace
}
@@ -1636,7 +1639,9 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
case CtxUTSNamespace:
return ctx.k.rootUTSNamespace
case CtxIPCNamespace:
- return ctx.k.rootIPCNamespace
+ ipcns := ctx.k.rootIPCNamespace
+ ipcns.IncRef()
+ return ipcns
case auth.CtxCredentials:
// The supervisor context is global root.
return auth.NewRootCredentials(ctx.k.rootUserNamespace)
diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go
index d8ff396b7..b7d6d4f1c 100644
--- a/pkg/sentry/kernel/kernel_state_autogen.go
+++ b/pkg/sentry/kernel/kernel_state_autogen.go
@@ -242,6 +242,7 @@ func (i *IPCNamespace) StateTypeName() string {
func (i *IPCNamespace) StateFields() []string {
return []string{
+ "IPCNamespaceRefs",
"userNS",
"semaphores",
"shms",
@@ -252,17 +253,42 @@ func (i *IPCNamespace) beforeSave() {}
func (i *IPCNamespace) StateSave(stateSinkObject state.Sink) {
i.beforeSave()
- stateSinkObject.Save(0, &i.userNS)
- stateSinkObject.Save(1, &i.semaphores)
- stateSinkObject.Save(2, &i.shms)
+ stateSinkObject.Save(0, &i.IPCNamespaceRefs)
+ stateSinkObject.Save(1, &i.userNS)
+ stateSinkObject.Save(2, &i.semaphores)
+ stateSinkObject.Save(3, &i.shms)
}
func (i *IPCNamespace) afterLoad() {}
func (i *IPCNamespace) StateLoad(stateSourceObject state.Source) {
- stateSourceObject.Load(0, &i.userNS)
- stateSourceObject.Load(1, &i.semaphores)
- stateSourceObject.Load(2, &i.shms)
+ stateSourceObject.Load(0, &i.IPCNamespaceRefs)
+ stateSourceObject.Load(1, &i.userNS)
+ stateSourceObject.Load(2, &i.semaphores)
+ stateSourceObject.Load(3, &i.shms)
+}
+
+func (r *IPCNamespaceRefs) StateTypeName() string {
+ return "pkg/sentry/kernel.IPCNamespaceRefs"
+}
+
+func (r *IPCNamespaceRefs) StateFields() []string {
+ return []string{
+ "refCount",
+ }
+}
+
+func (r *IPCNamespaceRefs) beforeSave() {}
+
+func (r *IPCNamespaceRefs) StateSave(stateSinkObject state.Sink) {
+ r.beforeSave()
+ stateSinkObject.Save(0, &r.refCount)
+}
+
+func (r *IPCNamespaceRefs) afterLoad() {}
+
+func (r *IPCNamespaceRefs) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &r.refCount)
}
func (k *Kernel) StateTypeName() string {
@@ -2292,6 +2318,7 @@ func init() {
state.Register((*FSContext)(nil))
state.Register((*FSContextRefs)(nil))
state.Register((*IPCNamespace)(nil))
+ state.Register((*IPCNamespaceRefs)(nil))
state.Register((*Kernel)(nil))
state.Register((*SocketRecord)(nil))
state.Register((*SocketRecordVFS1)(nil))
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 00c03585e..ebbebf46b 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -321,9 +321,32 @@ func (r *Registry) remove(s *Shm) {
r.totalPages -= s.effectiveSize / usermem.PageSize
}
+// Release drops the self-reference of each active shm segment in the registry.
+// It is called when the kernel.IPCNamespace containing r is being destroyed.
+func (r *Registry) Release(ctx context.Context) {
+ // Because Shm.DecRef() may acquire the same locks, collect the segments to
+ // release first. Note that this should not race with any updates to r, since
+ // the IPC namespace containing it has no more references.
+ toRelease := make([]*Shm, 0)
+ r.mu.Lock()
+ for _, s := range r.keysToShms {
+ s.mu.Lock()
+ if !s.pendingDestruction {
+ toRelease = append(toRelease, s)
+ }
+ s.mu.Unlock()
+ }
+ r.mu.Unlock()
+
+ for _, s := range toRelease {
+ r.dissociateKey(s)
+ s.DecRef(ctx)
+ }
+}
+
// Shm represents a single shared memory segment.
//
-// Shm segment are backed directly by an allocation from platform memory.
+// Shm segments are backed directly by an allocation from platform memory.
// Segments are always mapped as a whole, greatly simplifying how mappings are
// tracked. However note that mremap and munmap calls may cause the vma for a
// segment to become fragmented; which requires special care when unmapping a
@@ -652,17 +675,20 @@ func (s *Shm) MarkDestroyed(ctx context.Context) {
s.registry.dissociateKey(s)
s.mu.Lock()
- defer s.mu.Unlock()
- if !s.pendingDestruction {
- s.pendingDestruction = true
- // Drop the self-reference so destruction occurs when all
- // external references are gone.
- //
- // N.B. This cannot be the final DecRef, as the caller also
- // holds a reference.
- s.DecRef(ctx)
+ if s.pendingDestruction {
+ s.mu.Unlock()
return
}
+ s.pendingDestruction = true
+ s.mu.Unlock()
+
+ // Drop the self-reference so destruction occurs when all
+ // external references are gone.
+ //
+ // N.B. This cannot be the final DecRef, as the caller also
+ // holds a reference.
+ s.DecRef(ctx)
+ return
}
// checkOwnership verifies whether a segment may be accessed by ctx as an
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index e90a19cfb..037971393 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -656,7 +656,9 @@ func (t *Task) Value(key interface{}) interface{} {
case CtxUTSNamespace:
return t.utsns
case CtxIPCNamespace:
- return t.ipcns
+ ipcns := t.IPCNamespace()
+ ipcns.IncRef()
+ return ipcns
case CtxTask:
return t
case auth.CtxCredentials:
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index fce1064a7..7a053f369 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -203,6 +203,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
// namespace"
ipcns = NewIPCNamespace(userns)
+ } else {
+ ipcns.IncRef()
}
netns := t.NetworkNamespace()
@@ -218,6 +220,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace)
if err != nil {
+ ipcns.DecRef(t)
return 0, nil, err
}
// clone() returns 0 in the child.
@@ -227,6 +230,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
}
if opts.SetTLS {
if !tc.Arch.SetTLS(uintptr(opts.TLS)) {
+ ipcns.DecRef(t)
return 0, nil, syserror.EPERM
}
}
@@ -509,6 +513,7 @@ func (t *Task) Unshare(opts *SharingOptions) error {
}
// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
// namespace"
+ t.ipcns.DecRef(t)
t.ipcns = NewIPCNamespace(creds.UserNamespace)
}
var oldFDTable *FDTable
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index b400a8b41..239551eb6 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -280,6 +280,7 @@ func (*runExitMain) execute(t *Task) taskRunState {
t.mountNamespaceVFS2.DecRef(t)
t.mountNamespaceVFS2 = nil
}
+ t.ipcns.DecRef(t)
t.mu.Unlock()
// If this is the last task to exit from the thread group, release the
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 64c1e120a..6e2ff573a 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -104,6 +104,7 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
cfg.TaskContext.release()
cfg.FSContext.DecRef(t)
cfg.FDTable.DecRef(t)
+ cfg.IPCNamespace.DecRef(t)
if cfg.MountNamespaceVFS2 != nil {
cfg.MountNamespaceVFS2.DecRef(t)
}