From a7b7b7b9804e9968c1fed5f7b3849233f585a88b Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 14 Oct 2020 00:11:00 -0700 Subject: Fix shm reference leak. All shm segments in an IPC namespace should be released once that namespace is destroyed. Add reference counting to IPCNamespace so that once the last task with a reference on it exits, we can trigger a destructor that will clean up all shm segments that have not been explicitly freed by the application. PiperOrigin-RevId: 337032977 --- pkg/sentry/kernel/BUILD | 12 ++++++++++ pkg/sentry/kernel/context.go | 3 ++- pkg/sentry/kernel/ipc_namespace.go | 14 +++++++++++- pkg/sentry/kernel/kernel.go | 11 ++++++--- pkg/sentry/kernel/shm/BUILD | 1 + pkg/sentry/kernel/shm/shm.go | 46 +++++++++++++++++++++++++++++--------- pkg/sentry/kernel/task.go | 4 +++- pkg/sentry/kernel/task_clone.go | 5 +++++ pkg/sentry/kernel/task_exit.go | 1 + pkg/sentry/kernel/task_start.go | 1 + 10 files changed, 82 insertions(+), 16 deletions(-) (limited to 'pkg/sentry/kernel') diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 5de70aecb..9a24c6bdb 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -96,6 +96,17 @@ go_template_instance( }, ) +go_template_instance( + name = "ipc_namespace_refs", + out = "ipc_namespace_refs.go", + package = "kernel", + prefix = "IPCNamespace", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "IPCNamespace", + }, +) + go_template_instance( name = "process_group_refs", out = "process_group_refs.go", @@ -137,6 +148,7 @@ go_library( "fs_context.go", "fs_context_refs.go", "ipc_namespace.go", + "ipc_namespace_refs.go", "kcov.go", "kcov_unsafe.go", "kernel.go", diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index dd5f0f5fa..bb94769c4 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -81,7 +81,8 @@ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace { } // IPCNamespaceFromContext returns the IPC namespace in which ctx is executing, -// or nil if there is no such IPC namespace. +// or nil if there is no such IPC namespace. It takes a reference on the +// namespace. func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace { if v := ctx.Value(CtxIPCNamespace); v != nil { return v.(*IPCNamespace) diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 80a070d7e..3f34ee0db 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -15,6 +15,7 @@ package kernel import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" @@ -24,6 +25,8 @@ import ( // // +stateify savable type IPCNamespace struct { + IPCNamespaceRefs + // User namespace which owns this IPC namespace. Immutable. userNS *auth.UserNamespace @@ -33,11 +36,13 @@ type IPCNamespace struct { // NewIPCNamespace creates a new IPC namespace. func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { - return &IPCNamespace{ + ns := &IPCNamespace{ userNS: userNS, semaphores: semaphore.NewRegistry(userNS), shms: shm.NewRegistry(userNS), } + ns.EnableLeakCheck() + return ns } // SemaphoreRegistry returns the semaphore set registry for this namespace. @@ -50,6 +55,13 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry { return i.shms } +// DecRef implements refs_vfs2.RefCounter.DecRef. +func (i *IPCNamespace) DecRef(ctx context.Context) { + i.IPCNamespaceRefs.DecRef(func() { + i.shms.Release(ctx) + }) +} + // IPCNamespace returns the task's IPC namespace. func (t *Task) IPCNamespace() *IPCNamespace { t.mu.Lock() diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 675506269..652cbb732 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -828,7 +828,9 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { case CtxUTSNamespace: return ctx.args.UTSNamespace case CtxIPCNamespace: - return ctx.args.IPCNamespace + ipcns := ctx.args.IPCNamespace + ipcns.IncRef() + return ipcns case auth.CtxCredentials: return ctx.args.Credentials case fs.CtxRoot: @@ -1374,8 +1376,9 @@ func (k *Kernel) RootUTSNamespace() *UTSNamespace { return k.rootUTSNamespace } -// RootIPCNamespace returns the root IPCNamespace. +// RootIPCNamespace takes a reference and returns the root IPCNamespace. func (k *Kernel) RootIPCNamespace() *IPCNamespace { + k.rootIPCNamespace.IncRef() return k.rootIPCNamespace } @@ -1636,7 +1639,9 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { case CtxUTSNamespace: return ctx.k.rootUTSNamespace case CtxIPCNamespace: - return ctx.k.rootIPCNamespace + ipcns := ctx.k.rootIPCNamespace + ipcns.IncRef() + return ipcns case auth.CtxCredentials: // The supervisor context is global root. return auth.NewRootCredentials(ctx.k.rootUserNamespace) diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index b7e4b480d..f8a382fd8 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -27,6 +27,7 @@ go_library( "//pkg/context", "//pkg/log", "//pkg/refs", + "//pkg/refs_vfs2", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 00c03585e..ebbebf46b 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -321,9 +321,32 @@ func (r *Registry) remove(s *Shm) { r.totalPages -= s.effectiveSize / usermem.PageSize } +// Release drops the self-reference of each active shm segment in the registry. +// It is called when the kernel.IPCNamespace containing r is being destroyed. +func (r *Registry) Release(ctx context.Context) { + // Because Shm.DecRef() may acquire the same locks, collect the segments to + // release first. Note that this should not race with any updates to r, since + // the IPC namespace containing it has no more references. + toRelease := make([]*Shm, 0) + r.mu.Lock() + for _, s := range r.keysToShms { + s.mu.Lock() + if !s.pendingDestruction { + toRelease = append(toRelease, s) + } + s.mu.Unlock() + } + r.mu.Unlock() + + for _, s := range toRelease { + r.dissociateKey(s) + s.DecRef(ctx) + } +} + // Shm represents a single shared memory segment. // -// Shm segment are backed directly by an allocation from platform memory. +// Shm segments are backed directly by an allocation from platform memory. // Segments are always mapped as a whole, greatly simplifying how mappings are // tracked. However note that mremap and munmap calls may cause the vma for a // segment to become fragmented; which requires special care when unmapping a @@ -652,17 +675,20 @@ func (s *Shm) MarkDestroyed(ctx context.Context) { s.registry.dissociateKey(s) s.mu.Lock() - defer s.mu.Unlock() - if !s.pendingDestruction { - s.pendingDestruction = true - // Drop the self-reference so destruction occurs when all - // external references are gone. - // - // N.B. This cannot be the final DecRef, as the caller also - // holds a reference. - s.DecRef(ctx) + if s.pendingDestruction { + s.mu.Unlock() return } + s.pendingDestruction = true + s.mu.Unlock() + + // Drop the self-reference so destruction occurs when all + // external references are gone. + // + // N.B. This cannot be the final DecRef, as the caller also + // holds a reference. + s.DecRef(ctx) + return } // checkOwnership verifies whether a segment may be accessed by ctx as an diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index e90a19cfb..037971393 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -656,7 +656,9 @@ func (t *Task) Value(key interface{}) interface{} { case CtxUTSNamespace: return t.utsns case CtxIPCNamespace: - return t.ipcns + ipcns := t.IPCNamespace() + ipcns.IncRef() + return ipcns case CtxTask: return t case auth.CtxCredentials: diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index fce1064a7..7a053f369 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -203,6 +203,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" ipcns = NewIPCNamespace(userns) + } else { + ipcns.IncRef() } netns := t.NetworkNamespace() @@ -218,6 +220,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) if err != nil { + ipcns.DecRef(t) return 0, nil, err } // clone() returns 0 in the child. @@ -227,6 +230,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } if opts.SetTLS { if !tc.Arch.SetTLS(uintptr(opts.TLS)) { + ipcns.DecRef(t) return 0, nil, syserror.EPERM } } @@ -509,6 +513,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" + t.ipcns.DecRef(t) t.ipcns = NewIPCNamespace(creds.UserNamespace) } var oldFDTable *FDTable diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index b400a8b41..239551eb6 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -280,6 +280,7 @@ func (*runExitMain) execute(t *Task) taskRunState { t.mountNamespaceVFS2.DecRef(t) t.mountNamespaceVFS2 = nil } + t.ipcns.DecRef(t) t.mu.Unlock() // If this is the last task to exit from the thread group, release the diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 64c1e120a..6e2ff573a 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -104,6 +104,7 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { cfg.TaskContext.release() cfg.FSContext.DecRef(t) cfg.FDTable.DecRef(t) + cfg.IPCNamespace.DecRef(t) if cfg.MountNamespaceVFS2 != nil { cfg.MountNamespaceVFS2.DecRef(t) } -- cgit v1.2.3