diff options
Diffstat (limited to 'pkg/sentry')
-rw-r--r-- | pkg/sentry/kernel/context.go | 3 | ||||
-rw-r--r-- | pkg/sentry/kernel/ipc_namespace.go | 14 | ||||
-rw-r--r-- | pkg/sentry/kernel/ipc_namespace_refs.go | 118 | ||||
-rw-r--r-- | pkg/sentry/kernel/kernel.go | 11 | ||||
-rw-r--r-- | pkg/sentry/kernel/kernel_state_autogen.go | 39 | ||||
-rw-r--r-- | pkg/sentry/kernel/shm/shm.go | 46 | ||||
-rw-r--r-- | pkg/sentry/kernel/task.go | 4 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_clone.go | 5 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_exit.go | 1 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_start.go | 1 |
10 files changed, 220 insertions, 22 deletions
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index dd5f0f5fa..bb94769c4 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -81,7 +81,8 @@ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace { } // IPCNamespaceFromContext returns the IPC namespace in which ctx is executing, -// or nil if there is no such IPC namespace. +// or nil if there is no such IPC namespace. It takes a reference on the +// namespace. func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace { if v := ctx.Value(CtxIPCNamespace); v != nil { return v.(*IPCNamespace) diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 80a070d7e..3f34ee0db 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -15,6 +15,7 @@ package kernel import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" @@ -24,6 +25,8 @@ import ( // // +stateify savable type IPCNamespace struct { + IPCNamespaceRefs + // User namespace which owns this IPC namespace. Immutable. userNS *auth.UserNamespace @@ -33,11 +36,13 @@ type IPCNamespace struct { // NewIPCNamespace creates a new IPC namespace. func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { - return &IPCNamespace{ + ns := &IPCNamespace{ userNS: userNS, semaphores: semaphore.NewRegistry(userNS), shms: shm.NewRegistry(userNS), } + ns.EnableLeakCheck() + return ns } // SemaphoreRegistry returns the semaphore set registry for this namespace. @@ -50,6 +55,13 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry { return i.shms } +// DecRef implements refs_vfs2.RefCounter.DecRef. +func (i *IPCNamespace) DecRef(ctx context.Context) { + i.IPCNamespaceRefs.DecRef(func() { + i.shms.Release(ctx) + }) +} + // IPCNamespace returns the task's IPC namespace. func (t *Task) IPCNamespace() *IPCNamespace { t.mu.Lock() diff --git a/pkg/sentry/kernel/ipc_namespace_refs.go b/pkg/sentry/kernel/ipc_namespace_refs.go new file mode 100644 index 000000000..263919299 --- /dev/null +++ b/pkg/sentry/kernel/ipc_namespace_refs.go @@ -0,0 +1,118 @@ +package kernel + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var IPCNamespaceownerType *IPCNamespace + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type IPCNamespaceRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *IPCNamespaceRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, IPCNamespaceownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *IPCNamespaceRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*IPCNamespaceRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *IPCNamespaceRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *IPCNamespaceRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, IPCNamespaceownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *IPCNamespaceRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *IPCNamespaceRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, IPCNamespaceownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 675506269..652cbb732 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -828,7 +828,9 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { case CtxUTSNamespace: return ctx.args.UTSNamespace case CtxIPCNamespace: - return ctx.args.IPCNamespace + ipcns := ctx.args.IPCNamespace + ipcns.IncRef() + return ipcns case auth.CtxCredentials: return ctx.args.Credentials case fs.CtxRoot: @@ -1374,8 +1376,9 @@ func (k *Kernel) RootUTSNamespace() *UTSNamespace { return k.rootUTSNamespace } -// RootIPCNamespace returns the root IPCNamespace. +// RootIPCNamespace takes a reference and returns the root IPCNamespace. func (k *Kernel) RootIPCNamespace() *IPCNamespace { + k.rootIPCNamespace.IncRef() return k.rootIPCNamespace } @@ -1636,7 +1639,9 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { case CtxUTSNamespace: return ctx.k.rootUTSNamespace case CtxIPCNamespace: - return ctx.k.rootIPCNamespace + ipcns := ctx.k.rootIPCNamespace + ipcns.IncRef() + return ipcns case auth.CtxCredentials: // The supervisor context is global root. return auth.NewRootCredentials(ctx.k.rootUserNamespace) diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go index d8ff396b7..b7d6d4f1c 100644 --- a/pkg/sentry/kernel/kernel_state_autogen.go +++ b/pkg/sentry/kernel/kernel_state_autogen.go @@ -242,6 +242,7 @@ func (i *IPCNamespace) StateTypeName() string { func (i *IPCNamespace) StateFields() []string { return []string{ + "IPCNamespaceRefs", "userNS", "semaphores", "shms", @@ -252,17 +253,42 @@ func (i *IPCNamespace) beforeSave() {} func (i *IPCNamespace) StateSave(stateSinkObject state.Sink) { i.beforeSave() - stateSinkObject.Save(0, &i.userNS) - stateSinkObject.Save(1, &i.semaphores) - stateSinkObject.Save(2, &i.shms) + stateSinkObject.Save(0, &i.IPCNamespaceRefs) + stateSinkObject.Save(1, &i.userNS) + stateSinkObject.Save(2, &i.semaphores) + stateSinkObject.Save(3, &i.shms) } func (i *IPCNamespace) afterLoad() {} func (i *IPCNamespace) StateLoad(stateSourceObject state.Source) { - stateSourceObject.Load(0, &i.userNS) - stateSourceObject.Load(1, &i.semaphores) - stateSourceObject.Load(2, &i.shms) + stateSourceObject.Load(0, &i.IPCNamespaceRefs) + stateSourceObject.Load(1, &i.userNS) + stateSourceObject.Load(2, &i.semaphores) + stateSourceObject.Load(3, &i.shms) +} + +func (r *IPCNamespaceRefs) StateTypeName() string { + return "pkg/sentry/kernel.IPCNamespaceRefs" +} + +func (r *IPCNamespaceRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (r *IPCNamespaceRefs) beforeSave() {} + +func (r *IPCNamespaceRefs) StateSave(stateSinkObject state.Sink) { + r.beforeSave() + stateSinkObject.Save(0, &r.refCount) +} + +func (r *IPCNamespaceRefs) afterLoad() {} + +func (r *IPCNamespaceRefs) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &r.refCount) } func (k *Kernel) StateTypeName() string { @@ -2292,6 +2318,7 @@ func init() { state.Register((*FSContext)(nil)) state.Register((*FSContextRefs)(nil)) state.Register((*IPCNamespace)(nil)) + state.Register((*IPCNamespaceRefs)(nil)) state.Register((*Kernel)(nil)) state.Register((*SocketRecord)(nil)) state.Register((*SocketRecordVFS1)(nil)) diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 00c03585e..ebbebf46b 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -321,9 +321,32 @@ func (r *Registry) remove(s *Shm) { r.totalPages -= s.effectiveSize / usermem.PageSize } +// Release drops the self-reference of each active shm segment in the registry. +// It is called when the kernel.IPCNamespace containing r is being destroyed. +func (r *Registry) Release(ctx context.Context) { + // Because Shm.DecRef() may acquire the same locks, collect the segments to + // release first. Note that this should not race with any updates to r, since + // the IPC namespace containing it has no more references. + toRelease := make([]*Shm, 0) + r.mu.Lock() + for _, s := range r.keysToShms { + s.mu.Lock() + if !s.pendingDestruction { + toRelease = append(toRelease, s) + } + s.mu.Unlock() + } + r.mu.Unlock() + + for _, s := range toRelease { + r.dissociateKey(s) + s.DecRef(ctx) + } +} + // Shm represents a single shared memory segment. // -// Shm segment are backed directly by an allocation from platform memory. +// Shm segments are backed directly by an allocation from platform memory. // Segments are always mapped as a whole, greatly simplifying how mappings are // tracked. However note that mremap and munmap calls may cause the vma for a // segment to become fragmented; which requires special care when unmapping a @@ -652,17 +675,20 @@ func (s *Shm) MarkDestroyed(ctx context.Context) { s.registry.dissociateKey(s) s.mu.Lock() - defer s.mu.Unlock() - if !s.pendingDestruction { - s.pendingDestruction = true - // Drop the self-reference so destruction occurs when all - // external references are gone. - // - // N.B. This cannot be the final DecRef, as the caller also - // holds a reference. - s.DecRef(ctx) + if s.pendingDestruction { + s.mu.Unlock() return } + s.pendingDestruction = true + s.mu.Unlock() + + // Drop the self-reference so destruction occurs when all + // external references are gone. + // + // N.B. This cannot be the final DecRef, as the caller also + // holds a reference. + s.DecRef(ctx) + return } // checkOwnership verifies whether a segment may be accessed by ctx as an diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index e90a19cfb..037971393 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -656,7 +656,9 @@ func (t *Task) Value(key interface{}) interface{} { case CtxUTSNamespace: return t.utsns case CtxIPCNamespace: - return t.ipcns + ipcns := t.IPCNamespace() + ipcns.IncRef() + return ipcns case CtxTask: return t case auth.CtxCredentials: diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index fce1064a7..7a053f369 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -203,6 +203,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" ipcns = NewIPCNamespace(userns) + } else { + ipcns.IncRef() } netns := t.NetworkNamespace() @@ -218,6 +220,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) if err != nil { + ipcns.DecRef(t) return 0, nil, err } // clone() returns 0 in the child. @@ -227,6 +230,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } if opts.SetTLS { if !tc.Arch.SetTLS(uintptr(opts.TLS)) { + ipcns.DecRef(t) return 0, nil, syserror.EPERM } } @@ -509,6 +513,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" + t.ipcns.DecRef(t) t.ipcns = NewIPCNamespace(creds.UserNamespace) } var oldFDTable *FDTable diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index b400a8b41..239551eb6 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -280,6 +280,7 @@ func (*runExitMain) execute(t *Task) taskRunState { t.mountNamespaceVFS2.DecRef(t) t.mountNamespaceVFS2 = nil } + t.ipcns.DecRef(t) t.mu.Unlock() // If this is the last task to exit from the thread group, release the diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 64c1e120a..6e2ff573a 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -104,6 +104,7 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { cfg.TaskContext.release() cfg.FSContext.DecRef(t) cfg.FDTable.DecRef(t) + cfg.IPCNamespace.DecRef(t) if cfg.MountNamespaceVFS2 != nil { cfg.MountNamespaceVFS2.DecRef(t) } |