summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel/task_exit.go
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel/task_exit.go')
-rw-r--r--pkg/sentry/kernel/task_exit.go1159
1 files changed, 1159 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
new file mode 100644
index 000000000..158e665d3
--- /dev/null
+++ b/pkg/sentry/kernel/task_exit.go
@@ -0,0 +1,1159 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// This file implements the task exit cycle:
+//
+// - Tasks are asynchronously requested to exit with Task.Kill.
+//
+// - When able, the task goroutine enters the exit path starting from state
+// runExit.
+//
+// - Other tasks observe completed exits with Task.Wait (which implements the
+// wait*() family of syscalls).
+
+import (
+ "errors"
+ "fmt"
+ "strconv"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
// An ExitStatus is a value communicated from an exiting task or thread group
// to the party that reaps it.
//
// +stateify savable
type ExitStatus struct {
	// Code is the numeric value passed to the call to exit or exit_group that
	// caused the exit. If the exit was not caused by such a call, Code is 0.
	Code int

	// Signo is the signal that caused the exit. If the exit was not caused by
	// a signal, Signo is 0.
	Signo int
}

// Signaled returns true if the ExitStatus indicates that the exiting task or
// thread group was killed by a signal.
func (es ExitStatus) Signaled() bool {
	return es.Signo != 0
}

// Status returns the numeric representation of the ExitStatus returned by e.g.
// the wait4() system call.
func (es ExitStatus) Status() uint32 {
	// Low byte carries the signal number; the next byte carries the exit
	// code, matching the encoding consumed by the WEXITSTATUS/WTERMSIG
	// macros.
	code := uint32(es.Code) & 0xff
	signo := uint32(es.Signo) & 0xff
	return code<<8 | signo
}

// ShellExitCode returns the numeric exit code that Bash would return for an
// exit status of es.
func (es ExitStatus) ShellExitCode() int {
	if !es.Signaled() {
		return es.Code
	}
	// Fatal signals are reported by the shell as 128 + signal number.
	return 128 + es.Signo
}
+
// TaskExitState represents a step in the task exit path.
//
// "Exiting" and "exited" are often ambiguous; prefer to name specific states.
type TaskExitState int

const (
	// TaskExitNone indicates that the task has not begun exiting.
	TaskExitNone TaskExitState = iota

	// TaskExitInitiated indicates that the task goroutine has entered the exit
	// path, and the task is no longer eligible to participate in group stops
	// or group signal handling. TaskExitInitiated is analogous to Linux's
	// PF_EXITING.
	TaskExitInitiated

	// TaskExitZombie indicates that the task has released its resources, and
	// the task no longer prevents a sibling thread from completing execve.
	TaskExitZombie

	// TaskExitDead indicates that the task's thread IDs have been released,
	// and the task no longer prevents its thread group leader from being
	// reaped. ("Reaping" refers to the transitioning of a task from
	// TaskExitZombie to TaskExitDead.)
	TaskExitDead
)

// String implements fmt.Stringer.
func (t TaskExitState) String() string {
	// Names of the known states, indexed by state value.
	names := [...]string{
		TaskExitNone:      "TaskExitNone",
		TaskExitInitiated: "TaskExitInitiated",
		TaskExitZombie:    "TaskExitZombie",
		TaskExitDead:      "TaskExitDead",
	}
	if t >= 0 && int(t) < len(names) {
		return names[t]
	}
	// Unknown states render as their numeric value.
	return strconv.Itoa(int(t))
}
+
// killLocked marks t as killed by enqueueing a SIGKILL, without causing the
// thread-group-affecting side effects SIGKILL usually has.
//
// Preconditions: The signal mutex must be locked.
func (t *Task) killLocked() {
	// Clear killable stops so the task goroutine can reach the exit path.
	if t.stop != nil && t.stop.Killable() {
		t.endInternalStopLocked()
	}
	t.pendingSignals.enqueue(&arch.SignalInfo{
		Signo: int32(linux.SIGKILL),
		// Linux just sets SIGKILL in the pending signal bitmask without
		// enqueueing an actual siginfo, such that
		// kernel/signal.c:collect_signal() initializes si_code to SI_USER.
		Code: arch.SignalInfoUser,
	}, nil)
	// Ensure the task goroutine notices the pending SIGKILL promptly.
	t.interrupt()
}
+
// killed returns true if t has a SIGKILL pending. killed is analogous to
// Linux's fatal_signal_pending().
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) killed() bool {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	return t.killedLocked()
}

// killedLocked returns true if t has a SIGKILL pending.
//
// Preconditions: The signal mutex must be locked.
func (t *Task) killedLocked() bool {
	return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0
}
+
// PrepareExit indicates an exit with status es.
//
// The status is recorded on t and later reported to waiters via exit
// notification (see exitNotificationSignal).
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) PrepareExit(es ExitStatus) {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	t.exitStatus = es
}
+
// PrepareGroupExit indicates a group exit with status es to t's thread group.
//
// PrepareGroupExit is analogous to Linux's do_group_exit(), with two
// differences: it does not tail-call do_exit(), and it *does* set
// Task.exitStatus. (Linux does not do so until within do_exit(), since it
// reuses exit_code for ptrace.)
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) PrepareGroupExit(es ExitStatus) {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	if t.tg.exiting || t.tg.execing != nil {
		// Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e.
		// this "group exit" is being executed by the killed sibling of an
		// execing task, then Task.Execve never set t.tg.exitStatus, so it's
		// still the zero value. This is consistent with Linux, both in intent
		// ("all other threads ... report death as if they exited via _exit(2)
		// with exit code 0" - ptrace(2), "execve under ptrace") and in
		// implementation (compare fs/exec.c:de_thread() =>
		// kernel/signal.c:zap_other_threads() and
		// kernel/exit.c:do_group_exit() =>
		// include/linux/sched.h:signal_group_exit()).
		t.exitStatus = t.tg.exitStatus
		return
	}
	t.tg.exiting = true
	t.tg.exitStatus = es
	t.exitStatus = es
	// Kill all other tasks in the thread group; they will observe
	// t.tg.exiting on their own exit paths.
	for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
		if sibling != t {
			sibling.killLocked()
		}
	}
}
+
// Kill requests that all tasks in ts exit as if group exiting with status es.
// Kill does not wait for tasks to exit.
//
// Kill has no analogue in Linux; it's provided for save/restore only.
func (ts *TaskSet) Kill(es ExitStatus) {
	ts.mu.Lock()
	defer ts.mu.Unlock()
	ts.Root.exiting = true
	for t := range ts.Root.tids {
		t.tg.signalHandlers.mu.Lock()
		// Record the group exit status once per thread group; killLocked is
		// still invoked for every task in the group.
		if !t.tg.exiting {
			t.tg.exiting = true
			t.tg.exitStatus = es
		}
		t.killLocked()
		t.tg.signalHandlers.mu.Unlock()
	}
}
+
// advanceExitStateLocked checks that t's current exit state is oldExit, then
// sets it to newExit. If t's current exit state is not oldExit,
// advanceExitStateLocked panics.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) {
	if t.exitState != oldExit {
		// Exit states must advance strictly in order (None -> Initiated ->
		// Zombie -> Dead); anything else is a kernel bug.
		panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState))
	}
	t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit)
	t.exitState = newExit
}
+
// runExit is the entry point into the task exit path.
//
// +stateify savable
type runExit struct{}

// execute performs ptrace exit handling (t.ptraceExit), then hands off to
// runExitMain.
func (*runExit) execute(t *Task) taskRunState {
	t.ptraceExit()
	return (*runExitMain)(nil)
}
+
// runExitMain performs the bulk of task exit: it releases the task's
// resources, unblocks a vfork parent, detaches tracees, and reparents the
// task's children.
//
// +stateify savable
type runExitMain struct{}

func (*runExitMain) execute(t *Task) taskRunState {
	lastExiter := t.exitThreadGroup()

	// If the task has a cleartid, and the thread group wasn't killed by a
	// signal, handle that before releasing the MM.
	if t.cleartid != 0 {
		t.tg.signalHandlers.mu.Lock()
		signaled := t.tg.exiting && t.tg.exitStatus.Signaled()
		t.tg.signalHandlers.mu.Unlock()
		if !signaled {
			// Zero the cleartid word in user memory and wake one futex
			// waiter on it (the CLONE_CHILD_CLEARTID contract).
			if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil {
				t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1)
			}
			// If the CopyOut fails, there's nothing we can do.
		}
	}

	// Deactivate the address space and update max RSS before releasing the
	// task's MM.
	t.Deactivate()
	t.tg.pidns.owner.mu.Lock()
	t.updateRSSLocked()
	t.tg.pidns.owner.mu.Unlock()
	t.mu.Lock()
	t.tc.release()
	t.mu.Unlock()

	// Releasing the MM unblocks a blocked CLONE_VFORK parent.
	t.unstopVforkParent()

	// Drop references to the filesystem context and file descriptor table.
	t.fsc.DecRef()
	t.fds.DecRef()

	// If this is the last task to exit from the thread group, release the
	// thread group's resources.
	if lastExiter {
		t.tg.release()
	}

	// Detach tracees.
	t.exitPtrace()

	// Reparent the task's children.
	t.exitChildren()

	// Don't tail-call runExitNotify, as exitChildren may have initiated a stop
	// to wait for a PID namespace to die.
	return (*runExitNotify)(nil)
}
+
// exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread
// group that it is no longer eligible to participate in group activities. It
// returns true if t is the last task in its thread group to call
// exitThreadGroup.
func (t *Task) exitThreadGroup() bool {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	t.tg.signalHandlers.mu.Lock()
	// Can't defer unlock: signalStop (called below) must run with the signal
	// mutex unlocked.

	t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated)
	t.tg.activeTasks--
	last := t.tg.activeTasks == 0

	// Ensure that someone will handle the signals we can't.
	t.setSignalMaskLocked(^linux.SignalSet(0))

	// Check if this task's exit interacts with an initiated group stop.
	if !t.groupStopPending {
		t.tg.signalHandlers.mu.Unlock()
		return last
	}
	t.groupStopPending = false
	sig := t.tg.groupStopSignal
	notifyParent := t.participateGroupStopLocked()
	// signalStop must be called with t's signal mutex unlocked.
	t.tg.signalHandlers.mu.Unlock()
	if notifyParent && t.tg.leader.parent != nil {
		t.tg.leader.parent.signalStop(t, arch.CLD_STOPPED, int32(sig))
		t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop)
	}
	return last
}
+
// exitChildren reparents t's children to a suitable surviving task, sending
// parent-death signals where requested. If no reparent target exists (the
// namespace's init is exiting), it instead kills every other thread group in
// t's PID namespace.
func (t *Task) exitChildren() {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	newParent := t.findReparentTargetLocked()
	if newParent == nil {
		// "If the init process of a PID namespace terminates, the kernel
		// terminates all of the processes in the namespace via a SIGKILL
		// signal." - pid_namespaces(7)
		t.Debugf("Init process terminating, killing namespace")
		t.tg.pidns.exiting = true
		for other := range t.tg.pidns.tgids {
			if other == t.tg {
				continue
			}
			other.signalHandlers.mu.Lock()
			other.leader.sendSignalLocked(&arch.SignalInfo{
				Signo: int32(linux.SIGKILL),
			}, true /* group */)
			other.signalHandlers.mu.Unlock()
		}
		// TODO(b/37722272): The init process waits for all processes in the
		// namespace to exit before completing its own exit
		// (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all
		// other tasks in the namespace are dead, except possibly for this
		// thread group's leader (which can't be reaped until this task exits).
	}
	// This is correct even if newParent is nil (it ensures that children don't
	// wait for a parent to reap them.)
	for c := range t.children {
		// Deliver the parent-death signal (set via prctl(PR_SET_PDEATHSIG))
		// before reparenting.
		if sig := c.ParentDeathSignal(); sig != 0 {
			siginfo := &arch.SignalInfo{
				Signo: int32(sig),
				Code:  arch.SignalInfoUser,
			}
			siginfo.SetPid(int32(c.tg.pidns.tids[t]))
			siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow()))
			c.tg.signalHandlers.mu.Lock()
			c.sendSignalLocked(siginfo, true /* group */)
			c.tg.signalHandlers.mu.Unlock()
		}
		c.reparentLocked(newParent)
		if newParent != nil {
			newParent.children[c] = struct{}{}
		}
	}
}
+
// findReparentTargetLocked returns the task to which t's children should be
// reparented. If no such task exists, findReparentTargetLocked returns nil.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) findReparentTargetLocked() *Task {
	// Reparent to any sibling in the same thread group that hasn't begun
	// exiting.
	if t2 := t.tg.anyNonExitingTaskLocked(); t2 != nil {
		return t2
	}
	// "A child process that is orphaned within the namespace will be
	// reparented to [the init process for the namespace] ..." -
	// pid_namespaces(7)
	if init := t.tg.pidns.tasks[InitTID]; init != nil {
		return init.tg.anyNonExitingTaskLocked()
	}
	return nil
}
+
+func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task {
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ if t.exitState == TaskExitNone {
+ return t
+ }
+ }
+ return nil
+}
+
// reparentLocked changes t's parent. The new parent may be nil.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) reparentLocked(parent *Task) {
	oldParent := t.parent
	t.parent = parent
	// If a thread group leader's parent changes, reset the thread group's
	// termination signal to SIGCHLD and re-check exit notification. (Compare
	// kernel/exit.c:reparent_leader().)
	if t != t.tg.leader {
		return
	}
	if oldParent == nil && parent == nil {
		return
	}
	// Moving between tasks in the same thread group changes nothing the
	// parent thread group can observe.
	if oldParent != nil && parent != nil && oldParent.tg == parent.tg {
		return
	}
	t.tg.terminationSignal = linux.SIGCHLD
	if t.exitParentNotified && !t.exitParentAcked {
		// Re-send the pending exit notification to the new parent.
		t.exitParentNotified = false
		t.exitNotifyLocked(false)
	}
}
+
+// When a task exits, other tasks in the system, notably the task's parent and
+// ptracer, may want to be notified. The exit notification system ensures that
+// interested tasks receive signals and/or are woken from blocking calls to
+// wait*() syscalls; these notifications must be resolved before exiting tasks
+// can be reaped and disappear from the system.
+//
+// Each task may have a parent task and/or a tracer task. If both a parent and
+// a tracer exist, they may be the same task, different tasks in the same
+// thread group, or tasks in different thread groups. (In the last case, Linux
+// refers to the task as being ptrace-reparented due to an implementation
+// detail; we avoid this terminology to avoid confusion.)
+//
+// A thread group is *empty* if all non-leader tasks in the thread group are
+// dead, and the leader is either a zombie or dead. The exit of a thread group
+// leader is never waitable - by either the parent or tracer - until the thread
+// group is empty.
+//
+// There are a few ways for an exit notification to be resolved:
+//
+// - The exit notification may be acknowledged by a call to Task.Wait with
+// WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall).
+//
+// - If the notified party is the parent, and the parent thread group is not
+// also the tracer thread group, and the notification signal is SIGCHLD, the
+// parent may explicitly ignore the notification (see quote in exitNotify).
+// Note that it's possible for the notified party to ignore the signal in other
+// cases, but the notification is only resolved under the above conditions.
+// (Actually, there is one exception; see the last paragraph of the "leader,
+// has tracer, tracer thread group is parent thread group" case below.)
+//
+// - If the notified party is the parent, and the parent does not exist, the
+// notification is resolved as if ignored. (This is only possible in the
+// sentry. In Linux, the only task / thread group without a parent is global
+// init, and killing global init causes a kernel panic.)
+//
+// - If the notified party is a tracer, the tracer may detach the traced task.
+// (Zombie tasks cannot be ptrace-attached, so the reverse is not possible.)
+//
+// In addition, if the notified party is the parent, the parent may exit and
+// cause the notifying task to be reparented to another thread group. This does
+// not resolve the notification; instead, the notification must be resent to
+// the new parent.
+//
+// The series of notifications generated for a given task's exit depend on
+// whether it is a thread group leader; whether the task is ptraced; and, if
+// so, whether the tracer thread group is the same as the parent thread group.
+//
+// - Non-leader, no tracer: No notification is generated; the task is reaped
+// immediately.
+//
+// - Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer
+// notification is resolved (by waiting or detaching), the task is reaped. (For
+// non-leaders, whether the tracer and parent thread groups are the same is
+// irrelevant.)
+//
+// - Leader, no tracer: The task remains a zombie, with no notification sent,
+// until all other tasks in the thread group are dead. (In Linux terms, this
+// condition is indicated by include/linux/sched.h:thread_group_empty(); tasks
+// are removed from their thread_group list in kernel/exit.c:release_task() =>
+// __exit_signal() => __unhash_process().) Then the thread group's termination
+// signal is sent to the parent. When the parent notification is resolved (by
+// waiting or ignoring), the task is reaped.
+//
+// - Leader, has tracer, tracer thread group is not parent thread group:
+// SIGCHLD is sent to the tracer. When the tracer notification is resolved (by
+// waiting or detaching), and all other tasks in the thread group are dead, the
+// thread group's termination signal is sent to the parent. (Note that the
+// tracer cannot resolve the exit notification by waiting until the thread
+// group is empty.) When the parent notification is resolved, the task is
+// reaped.
+//
+// - Leader, has tracer, tracer thread group is parent thread group:
+//
+// If all other tasks in the thread group are dead, the thread group's
+// termination signal is sent to the parent. At this point, the notification
+// can only be resolved by waiting. If the parent detaches from the task as a
+// tracer, the notification is not resolved, but the notification can now be
+// resolved by waiting or ignoring. When the parent notification is resolved,
+// the task is reaped.
+//
+// If at least one task in the thread group is not dead, SIGCHLD is sent to the
+// parent. At this point, the notification cannot be resolved at all; once the
+// thread group becomes empty, it can be resolved only by waiting. If the
+// parent detaches from the task as a tracer before all remaining tasks die,
+// then exit notification proceeds as in the case where the leader never had a
+// tracer. If the parent detaches from the task as a tracer after all remaining
+// tasks die, the notification is not resolved, but the notification can now be
+// resolved by waiting or ignoring. When the parent notification is resolved,
+// the task is reaped.
+//
+// In both of the above cases, when the parent detaches from the task as a
+// tracer while the thread group is empty, whether or not the parent resolves
+// the notification by ignoring it is based on the parent's SIGCHLD signal
+// action, whether or not the thread group's termination signal is SIGCHLD
+// (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()).
+//
+// There is one final wrinkle: A leader can become a non-leader due to a
+// sibling execve. In this case, the execing thread detaches the leader's
+// tracer (if one exists) and reaps the leader immediately. In Linux, this is
+// in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked().
+
// runExitNotify is the final state in the task exit path.
//
// +stateify savable
type runExitNotify struct{}

// execute advances t to TaskExitZombie, unblocks a sibling execve that was
// waiting for t to exit, performs exit notification, and then returns nil,
// terminating the task goroutine.
func (*runExitNotify) execute(t *Task) taskRunState {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie)
	t.tg.liveTasks--
	// Check if this completes a sibling's execve.
	if t.tg.execing != nil && t.tg.liveTasks == 1 {
		// execing blocks the addition of new tasks to the thread group, so
		// the sole living task must be the execing one.
		e := t.tg.execing
		e.tg.signalHandlers.mu.Lock()
		if _, ok := e.stop.(*execStop); ok {
			e.endInternalStopLocked()
		}
		e.tg.signalHandlers.mu.Unlock()
	}
	t.exitNotifyLocked(false)
	// The task goroutine will now exit.
	return nil
}
+
// exitNotifyLocked is called after changes to t's state that affect exit
// notification.
//
// If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace;
// thanks to Linux's haphazard implementation of this functionality, such cases
// determine whether parent notifications are ignored based on the parent's
// handling of SIGCHLD, regardless of what the exited task's thread group's
// termination signal is.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) exitNotifyLocked(fromPtraceDetach bool) {
	if t.exitState != TaskExitZombie {
		return
	}
	if !t.exitTracerNotified {
		t.exitTracerNotified = true
		tracer := t.Tracer()
		if tracer == nil {
			t.exitTracerAcked = true
		} else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg {
			// Don't set exitParentNotified if t is non-leader, even if the
			// tracer is in the parent thread group, so that if the parent
			// detaches the following call to exitNotifyLocked passes through
			// the !exitParentNotified case below and causes t to be reaped
			// immediately.
			//
			// Tracer notification doesn't care about
			// SIG_IGN/SA_NOCLDWAIT.
			tracer.tg.signalHandlers.mu.Lock()
			tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */)
			tracer.tg.signalHandlers.mu.Unlock()
			// Wake EventTraceeStop waiters as well since this task will never
			// ptrace-stop again.
			tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop)
		} else {
			// t is a leader and the tracer is in the parent thread group.
			t.exitParentNotified = true
			sig := linux.SIGCHLD
			if t.tg.tasksCount == 1 {
				sig = t.tg.terminationSignal
			}
			// This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either
			// (in Linux, the check in do_notify_parent() is gated by
			// !tsk->ptrace.)
			t.parent.tg.signalHandlers.mu.Lock()
			t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */)
			t.parent.tg.signalHandlers.mu.Unlock()
			// See below for rationale for this event mask.
			t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
		}
	}
	if t.exitTracerAcked && !t.exitParentNotified {
		if t != t.tg.leader {
			t.exitParentNotified = true
			t.exitParentAcked = true
		} else if t.tg.tasksCount == 1 {
			t.exitParentNotified = true
			if t.parent == nil {
				t.exitParentAcked = true
			} else {
				// "POSIX.1-2001 specifies that if the disposition of SIGCHLD is
				// set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see
				// sigaction(2)), then children that terminate do not become
				// zombies and a call to wait() or waitpid() will block until all
				// children have terminated, and then fail with errno set to
				// ECHILD. (The original POSIX standard left the behavior of
				// setting SIGCHLD to SIG_IGN unspecified. Note that even though
				// the default disposition of SIGCHLD is "ignore", explicitly
				// setting the disposition to SIG_IGN results in different
				// treatment of zombie process children.) Linux 2.6 conforms to
				// this specification." - wait(2)
				//
				// Some undocumented Linux-specific details:
				//
				// - All of the above is ignored if the termination signal isn't
				// SIGCHLD.
				//
				// - SA_NOCLDWAIT causes the leader to be immediately reaped, but
				// does not suppress the SIGCHLD.
				signalParent := t.tg.terminationSignal.IsValid()
				t.parent.tg.signalHandlers.mu.Lock()
				if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach {
					if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok {
						if act.Handler == arch.SignalActIgnore {
							t.exitParentAcked = true
							signalParent = false
						} else if act.Flags&arch.SignalFlagNoCldWait != 0 {
							t.exitParentAcked = true
						}
					}
				}
				if signalParent {
					t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */)
				}
				t.parent.tg.signalHandlers.mu.Unlock()
				// If a task in the parent was waiting for a child group stop
				// or continue, it needs to be notified of the exit, because
				// there may be no remaining eligible tasks (so that wait
				// should return ECHILD).
				t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
			}
		}
	}
	if t.exitTracerAcked && t.exitParentAcked {
		t.advanceExitStateLocked(TaskExitZombie, TaskExitDead)
		// Release t's thread IDs in its PID namespace and all ancestors.
		for ns := t.tg.pidns; ns != nil; ns = ns.parent {
			tid := ns.tids[t]
			delete(ns.tasks, tid)
			delete(ns.tids, t)
			if t == t.tg.leader {
				delete(ns.tgids, t.tg)
			}
		}
		t.tg.exitedCPUStats.Accumulate(t.CPUStats())
		t.tg.ioUsage.Accumulate(t.ioUsage)
		t.tg.signalHandlers.mu.Lock()
		t.tg.tasks.Remove(t)
		t.tg.tasksCount--
		tc := t.tg.tasksCount
		t.tg.signalHandlers.mu.Unlock()
		if tc == 1 && t != t.tg.leader {
			// Our fromPtraceDetach doesn't matter here (in Linux terms, this
			// is via a call to release_task()).
			t.tg.leader.exitNotifyLocked(false)
		} else if tc == 0 {
			t.tg.processGroup.decRefWithParent(t.tg.parentPG())
		}
		if t.parent != nil {
			delete(t.parent.children, t)
			t.parent = nil
		}
	}
}
+
// exitNotificationSignal returns the SignalInfo that should accompany an exit
// notification for t delivered to receiver, with the PID and UID translated
// into receiver's namespaces.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.SignalInfo {
	info := &arch.SignalInfo{
		Signo: int32(sig),
	}
	info.SetPid(int32(receiver.tg.pidns.tids[t]))
	info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
	if t.exitStatus.Signaled() {
		info.Code = arch.CLD_KILLED
		info.SetStatus(int32(t.exitStatus.Signo))
	} else {
		info.Code = arch.CLD_EXITED
		info.SetStatus(int32(t.exitStatus.Code))
	}
	// TODO(b/72102453): Set utime, stime.
	return info
}
+
// ExitStatus returns t's exit status, which is only guaranteed to be
// meaningful if t.ExitState() != TaskExitNone.
//
// Locks are taken in the same order as elsewhere in the exit path: the
// TaskSet mutex, then the signal mutex.
func (t *Task) ExitStatus() ExitStatus {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	return t.exitStatus
}
+
// ExitStatus returns the exit status that would be returned by a consuming
// wait*() on tg.
//
// If the thread group has begun a group exit, its group exit status takes
// precedence over the leader's own exit status.
func (tg *ThreadGroup) ExitStatus() ExitStatus {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	if tg.exiting {
		return tg.exitStatus
	}
	return tg.leader.exitStatus
}
+
// TerminationSignal returns the thread group's termination signal, which is
// the signal sent to the leader's parent on exit.
func (tg *ThreadGroup) TerminationSignal() linux.Signal {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	return tg.terminationSignal
}
+
// Task events that can be waited for. Events are reported to waiters via
// ThreadGroup.eventQueue.
const (
	// EventExit represents an exit notification generated for a child thread
	// group leader or a tracee under the conditions specified in the comment
	// above runExitNotify.
	EventExit waiter.EventMask = 1 << iota

	// EventChildGroupStop occurs when a child thread group completes a group
	// stop (i.e. all tasks in the child thread group have entered a stopped
	// state as a result of a group stop).
	EventChildGroupStop

	// EventTraceeStop occurs when a task that is ptraced by a task in the
	// notified thread group enters a ptrace stop (see ptrace(2)).
	EventTraceeStop

	// EventGroupContinue occurs when a child thread group, or a thread group
	// whose leader is ptraced by a task in the notified thread group, that had
	// initiated or completed a group stop leaves the group stop, due to the
	// child thread group or any task in the child thread group being sent
	// SIGCONT.
	EventGroupContinue
)
+
// WaitOptions controls the behavior of Task.Wait.
//
// If both SpecificTID and SpecificPGID are set, both constraints must be
// satisfied for a task to be eligible.
type WaitOptions struct {
	// If SpecificTID is non-zero, only events from the task with thread ID
	// SpecificTID are eligible to be waited for. SpecificTID is resolved in
	// the PID namespace of the waiter (the method receiver of Task.Wait). If
	// no such task exists, or that task would not otherwise be eligible to be
	// waited for by the waiting task, then there are no waitable tasks and
	// Wait will return ECHILD.
	SpecificTID ThreadID

	// If SpecificPGID is non-zero, only events from ThreadGroups with a
	// matching ProcessGroupID are eligible to be waited for. (Same
	// constraints as SpecificTID apply.)
	SpecificPGID ProcessGroupID

	// Terminology note: Per waitpid(2), "a clone child is one which delivers
	// no signal, or a signal other than SIGCHLD to its parent upon
	// termination." In Linux, termination signal is technically a per-task
	// property rather than a per-thread-group property. However, clone()
	// forces no termination signal for tasks created with CLONE_THREAD, and
	// execve() resets the termination signal to SIGCHLD, so all
	// non-group-leader threads have no termination signal and are therefore
	// "clone tasks".

	// If NonCloneTasks is true, events from non-clone tasks are eligible to be
	// waited for.
	NonCloneTasks bool

	// If CloneTasks is true, events from clone tasks are eligible to be waited
	// for.
	CloneTasks bool

	// If SiblingChildren is true, events from children tasks of any task
	// in the thread group of the waiter are eligible to be waited for.
	SiblingChildren bool

	// Events is a bitwise combination of the events defined above that specify
	// what events are of interest to the call to Wait.
	Events waiter.EventMask

	// If ConsumeEvent is true, the Wait should consume the event such that it
	// cannot be returned by a future Wait. Note that if a task exit is
	// consumed in this way, in most cases the task will be reaped.
	ConsumeEvent bool

	// If BlockInterruptErr is not nil, Wait will block until either an event
	// is available or there are no tasks that could produce a waitable event;
	// if that blocking is interrupted, Wait returns BlockInterruptErr. If
	// BlockInterruptErr is nil, Wait will not block.
	BlockInterruptErr error
}
+
// matchesTask returns true if t is eligible to be waited for under the
// options o, with t's thread ID resolved in pidns.
//
// Preconditions: The TaskSet mutex must be locked (for reading or writing).
func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace, tracee bool) bool {
	if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] {
		return false
	}
	if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] {
		return false
	}
	// Tracees are always eligible.
	if tracee {
		return true
	}
	// Thread group leaders with termination signal SIGCHLD are "non-clone"
	// tasks; everything else is a "clone" task (see the terminology note in
	// WaitOptions).
	if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD {
		return o.NonCloneTasks
	}
	return o.CloneTasks
}
+
// ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g.
// waitpid(WNOHANG)) that find no waitable events, but determine that waitable
// events may exist in the future. (In contrast, if a non-blocking or blocking
// Wait determines that there are no tasks that can produce a waitable event,
// Task.Wait returns ECHILD.)
//
// waitOnce returns ErrNoWaitableEvent when eligible tasks exist but none of
// them has a pending event.
var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events")
+
// WaitResult contains information about a waited-for event. It is returned by
// Task.Wait.
type WaitResult struct {
	// Task is the task that reported the event.
	Task *Task

	// TID is the thread ID of Task in the PID namespace of the task that
	// called Wait (that is, the method receiver of the call to Task.Wait). TID
	// is provided because consuming exit waits cause the thread ID to be
	// deallocated.
	TID ThreadID

	// UID is the real UID of Task in the user namespace of the task that
	// called Wait.
	UID auth.UID

	// Event is exactly one of the events defined above.
	Event waiter.EventMask

	// Status is the numeric status associated with the event.
	Status uint32
}
+
+// Wait waits for an event from a thread group that is a child of t's thread
+// group, or a task in such a thread group, or a task that is ptraced by t,
+// subject to the options specified in opts.
+func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) {
+	if opts.BlockInterruptErr == nil {
+		// Non-blocking wait: a single scan is all we need.
+		return t.waitOnce(opts)
+	}
+	// Blocking wait: register for event notifications *before* the first
+	// scan so that events arriving between scans wake us up.
+	entry, notifyCh := waiter.NewChannelEntry(nil)
+	t.tg.eventQueue.EventRegister(&entry, opts.Events)
+	defer t.tg.eventQueue.EventUnregister(&entry)
+	for {
+		wr, err := t.waitOnce(opts)
+		if err != ErrNoWaitableEvent {
+			// This includes the success case (err == nil).
+			return wr, err
+		}
+		if blockErr := t.Block(notifyCh); blockErr != nil {
+			return wr, syserror.ConvertIntr(blockErr, opts.BlockInterruptErr)
+		}
+	}
+}
+
+func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) {
+ anyWaitableTasks := false
+
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+
+ if opts.SiblingChildren {
+ // We can wait on the children and tracees of any task in the
+ // same thread group.
+ for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() {
+ wr, any := t.waitParentLocked(opts, parent)
+ if wr != nil {
+ return wr, nil
+ }
+ anyWaitableTasks = anyWaitableTasks || any
+ }
+ } else {
+ // We can only wait on this task.
+ var wr *WaitResult
+ wr, anyWaitableTasks = t.waitParentLocked(opts, t)
+ if wr != nil {
+ return wr, nil
+ }
+ }
+
+ if anyWaitableTasks {
+ return nil, ErrNoWaitableEvent
+ }
+ return nil, syserror.ECHILD
+}
+
+// waitParentLocked attempts to collect one waitable event, subject to opts,
+// from the children and ptrace-tracees of parent on behalf of t. It also
+// reports whether any matching tasks exist that could produce a waitable
+// event in the future; the caller uses this to distinguish "try again later"
+// (ErrNoWaitableEvent) from "nothing to wait for" (ECHILD).
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) waitParentLocked(opts *WaitOptions, parent *Task) (*WaitResult, bool) {
+	anyWaitableTasks := false
+
+	// First scan parent's children.
+	for child := range parent.children {
+		if !opts.matchesTask(child, parent.tg.pidns, false) {
+			continue
+		}
+		// Non-leaders don't notify parents on exit and aren't eligible to
+		// be waited on.
+		if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked {
+			anyWaitableTasks = true
+			if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil {
+				return wr, anyWaitableTasks
+			}
+		}
+		// Check for group stops and continues. Tasks that have passed
+		// TaskExitInitiated can no longer participate in group stops.
+		if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 {
+			continue
+		}
+		if child.exitState >= TaskExitInitiated {
+			continue
+		}
+		// If the waiter is in the same thread group as the task's
+		// tracer, do not report its group stops; they will be reported
+		// as ptrace stops instead. This also skips checking for group
+		// continues, but they'll be checked for when scanning tracees
+		// below. (Per kernel/exit.c:wait_consider_task(): "If a
+		// ptracer wants to distinguish the two events for its own
+		// children, it should create a separate process which takes
+		// the role of real parent.")
+		if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg {
+			continue
+		}
+		anyWaitableTasks = true
+		if opts.Events&EventChildGroupStop != 0 {
+			if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil {
+				return wr, anyWaitableTasks
+			}
+		}
+		if opts.Events&EventGroupContinue != 0 {
+			if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil {
+				return wr, anyWaitableTasks
+			}
+		}
+	}
+	// Then scan parent's ptrace-tracees.
+	for tracee := range parent.ptraceTracees {
+		if !opts.matchesTask(tracee, parent.tg.pidns, true) {
+			continue
+		}
+		// Non-leaders do notify tracers on exit.
+		if opts.Events&EventExit != 0 && !tracee.exitTracerAcked {
+			anyWaitableTasks = true
+			if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil {
+				return wr, anyWaitableTasks
+			}
+		}
+		if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 {
+			continue
+		}
+		if tracee.exitState >= TaskExitInitiated {
+			continue
+		}
+		anyWaitableTasks = true
+		if opts.Events&EventTraceeStop != 0 {
+			if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil {
+				return wr, anyWaitableTasks
+			}
+		}
+		if opts.Events&EventGroupContinue != 0 {
+			if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil {
+				return wr, anyWaitableTasks
+			}
+		}
+	}
+
+	return nil, anyWaitableTasks
+}
+
+// waitCollectZombieLocked attempts to collect an exit event from target on
+// behalf of t, either as target's ptracer (asPtracer == true) or as a member
+// of the thread group of target's parent. If opts.ConsumeEvent is set, the
+// exit is acknowledged: tracees are detached, parent-side accounting (child
+// CPU stats, child max RSS) is accumulated, and target.exitNotifyLocked is
+// invoked, which may cause target to be reaped.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult {
+	if asPtracer && !target.exitTracerNotified {
+		return nil
+	}
+	if !asPtracer && !target.exitParentNotified {
+		return nil
+	}
+	// Zombied thread group leaders are never waitable until their thread group
+	// is otherwise empty. Usually this is caught by the
+	// target.exitParentNotified check above, but if t is both (in the thread
+	// group of) target's tracer and parent, asPtracer may be true.
+	if target == target.tg.leader && target.tg.tasksCount != 1 {
+		return nil
+	}
+	// Capture the result fields before any consuming side effects below.
+	pid := t.tg.pidns.tids[target]
+	uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
+	status := target.exitStatus.Status()
+	if !opts.ConsumeEvent {
+		return &WaitResult{
+			Task:   target,
+			TID:    pid,
+			UID:    uid,
+			Event:  EventExit,
+			Status: status,
+		}
+	}
+	// Surprisingly, the exit status reported by a non-consuming wait can
+	// differ from that reported by a consuming wait; the latter will return
+	// the group exit code if one is available.
+	if target.tg.exiting {
+		status = target.tg.exitStatus.Status()
+	}
+	// t may be (in the thread group of) target's parent, tracer, or both. We
+	// don't need to check for !exitTracerAcked because tracees are detached
+	// here, and we don't need to check for !exitParentAcked because zombies
+	// will be reaped here.
+	if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified {
+		target.exitTracerAcked = true
+		target.ptraceTracer.Store((*Task)(nil))
+		delete(t.ptraceTracees, target)
+	}
+	if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified {
+		target.exitParentAcked = true
+		if target == target.tg.leader {
+			// target.tg.exitedCPUStats doesn't include target.CPUStats() yet,
+			// and won't until after target.exitNotifyLocked() (maybe). Include
+			// target.CPUStats() explicitly. This is consistent with Linux,
+			// which accounts an exited task's cputime to its thread group in
+			// kernel/exit.c:release_task() => __exit_signal(), and uses
+			// thread_group_cputime_adjusted() in wait_task_zombie().
+			t.tg.childCPUStats.Accumulate(target.CPUStats())
+			t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats)
+			t.tg.childCPUStats.Accumulate(target.tg.childCPUStats)
+			// Update t's child max resident set size. The size will be the maximum
+			// of this thread's size and all its childrens' sizes.
+			if t.tg.childMaxRSS < target.tg.maxRSS {
+				t.tg.childMaxRSS = target.tg.maxRSS
+			}
+			if t.tg.childMaxRSS < target.tg.childMaxRSS {
+				t.tg.childMaxRSS = target.tg.childMaxRSS
+			}
+		}
+	}
+	target.exitNotifyLocked(false)
+	return &WaitResult{
+		Task:   target,
+		TID:    pid,
+		UID:    uid,
+		Event:  EventExit,
+		Status: status,
+	}
+}
+
+// updateRSSLocked updates t.tg.maxRSS from t's memory manager's maximum
+// resident set size.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) updateRSSLocked() {
+	mmMaxRSS := t.MemoryManager().MaxResidentSetSize()
+	if mmMaxRSS > t.tg.maxRSS {
+		t.tg.maxRSS = mmMaxRSS
+	}
+}
+
+// waitCollectChildGroupStopLocked attempts to collect a group-stop event from
+// target's thread group on behalf of t.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult {
+	sh := target.tg.signalHandlers
+	sh.mu.Lock()
+	defer sh.mu.Unlock()
+	if !target.tg.groupStopWaitable {
+		return nil
+	}
+	wr := &WaitResult{
+		Task:  target,
+		TID:   t.tg.pidns.tids[target],
+		UID:   target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow(),
+		Event: EventChildGroupStop,
+		// There is no name for these status constants.
+		Status: (uint32(target.tg.groupStopSignal)&0xff)<<8 | 0x7f,
+	}
+	if opts.ConsumeEvent {
+		target.tg.groupStopWaitable = false
+	}
+	return wr
+}
+
+// waitCollectGroupContinueLocked attempts to collect a group-continue event
+// from target's thread group on behalf of t.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult {
+	sh := target.tg.signalHandlers
+	sh.mu.Lock()
+	defer sh.mu.Unlock()
+	if !target.tg.groupContWaitable {
+		return nil
+	}
+	wr := &WaitResult{
+		Task:  target,
+		TID:   t.tg.pidns.tids[target],
+		UID:   target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow(),
+		Event: EventGroupContinue,
+		// There is no name for this status constant.
+		Status: 0xffff,
+	}
+	if opts.ConsumeEvent {
+		target.tg.groupContWaitable = false
+	}
+	return wr
+}
+
+// waitCollectTraceeStopLocked attempts to collect a ptrace-stop event from
+// target on behalf of t.
+//
+// Preconditions: The TaskSet mutex must be locked for writing.
+func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult {
+	sh := target.tg.signalHandlers
+	sh.mu.Lock()
+	defer sh.mu.Unlock()
+	// target must be in a ptrace stop; the type assertion also rejects a nil
+	// target.stop.
+	if _, ok := target.stop.(*ptraceStop); !ok {
+		return nil
+	}
+	// A zero ptraceCode means the stop has already been collected.
+	code := target.ptraceCode
+	if code == 0 {
+		return nil
+	}
+	wr := &WaitResult{
+		Task:   target,
+		TID:    t.tg.pidns.tids[target],
+		UID:    target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow(),
+		Event:  EventTraceeStop,
+		Status: uint32(code)<<8 | 0x7f,
+	}
+	if opts.ConsumeEvent {
+		target.ptraceCode = 0
+	}
+	return wr
+}
+
+// ExitState returns t's current progress through the exit path.
+func (t *Task) ExitState() TaskExitState {
+	t.tg.pidns.owner.mu.RLock()
+	es := t.exitState
+	t.tg.pidns.owner.mu.RUnlock()
+	return es
+}
+
+// ParentDeathSignal returns t's parent death signal.
+func (t *Task) ParentDeathSignal() linux.Signal {
+	t.mu.Lock()
+	sig := t.parentDeathSignal
+	t.mu.Unlock()
+	return sig
+}
+
+// SetParentDeathSignal sets t's parent death signal.
+func (t *Task) SetParentDeathSignal(sig linux.Signal) {
+	t.mu.Lock()
+	t.parentDeathSignal = sig
+	t.mu.Unlock()
+}