// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

// This file implements the task exit cycle:
//
// - Tasks are asynchronously requested to exit with Task.Kill.
//
// - When able, the task goroutine enters the exit path starting from state
// runExit.
//
// - Other tasks observe completed exits with Task.Wait (which implements the
// wait*() family of syscalls).

import (
	"errors"
	"fmt"
	"strconv"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/waiter"
)

// An ExitStatus is a value communicated from an exiting task or thread group
// to the party that reaps it.
//
// An exit is considered "signaled" when Signo is non-zero (see Signaled).
//
// +stateify savable
type ExitStatus struct {
	// Code is the numeric value passed to the call to exit or exit_group that
	// caused the exit. If the exit was not caused by such a call, Code is 0.
	Code int

	// Signo is the signal that caused the exit. If the exit was not caused by
	// a signal, Signo is 0.
	Signo int
}

// Signaled returns true if the ExitStatus indicates that the exiting task or
// thread group was killed by a signal, i.e. if Signo is non-zero.
func (es ExitStatus) Signaled() bool {
	return es.Signo != 0
}

// Status returns the numeric representation of the ExitStatus returned by e.g.
// the wait4() system call.
//
// The low 8 bits of Code are placed in bits 8-15 of the result and the low 8
// bits of Signo in bits 0-7.
func (es ExitStatus) Status() uint32 { return ((uint32(es.Code) & 0xff) << 8) | (uint32(es.Signo) & 0xff) } // ShellExitCode returns the numeric exit code that Bash would return for an // exit status of es. func (es ExitStatus) ShellExitCode() int { if es.Signaled() { return 128 + es.Signo } return es.Code } // TaskExitState represents a step in the task exit path. // // "Exiting" and "exited" are often ambiguous; prefer to name specific states. type TaskExitState int const ( // TaskExitNone indicates that the task has not begun exiting. TaskExitNone TaskExitState = iota // TaskExitInitiated indicates that the task goroutine has entered the exit // path, and the task is no longer eligible to participate in group stops // or group signal handling. TaskExitInitiated is analogous to Linux's // PF_EXITING. TaskExitInitiated // TaskExitZombie indicates that the task has released its resources, and // the task no longer prevents a sibling thread from completing execve. TaskExitZombie // TaskExitDead indicates that the task's thread IDs have been released, // and the task no longer prevents its thread group leader from being // reaped. ("Reaping" refers to the transitioning of a task from // TaskExitZombie to TaskExitDead.) TaskExitDead ) // String implements fmt.Stringer. func (t TaskExitState) String() string { switch t { case TaskExitNone: return "TaskExitNone" case TaskExitInitiated: return "TaskExitInitiated" case TaskExitZombie: return "TaskExitZombie" case TaskExitDead: return "TaskExitDead" default: return strconv.Itoa(int(t)) } } // killLocked marks t as killed by enqueueing a SIGKILL, without causing the // thread-group-affecting side effects SIGKILL usually has. // // Preconditions: The signal mutex must be locked. func (t *Task) killLocked() { // Clear killable stops. 
if t.stop != nil && t.stop.Killable() { t.endInternalStopLocked() } t.pendingSignals.enqueue(&arch.SignalInfo{ Signo: int32(linux.SIGKILL), // Linux just sets SIGKILL in the pending signal bitmask without // enqueueing an actual siginfo, such that // kernel/signal.c:collect_signal() initializes si_code to SI_USER. Code: arch.SignalInfoUser, }, nil) t.interrupt() } // killed returns true if t has a SIGKILL pending. killed is analogous to // Linux's fatal_signal_pending(). // // Preconditions: The caller must be running on the task goroutine. func (t *Task) killed() bool { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() return t.killedLocked() } func (t *Task) killedLocked() bool { return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0 } // PrepareExit indicates an exit with status es. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) PrepareExit(es ExitStatus) { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.exitStatus = es } // PrepareGroupExit indicates a group exit with status es to t's thread group. // // PrepareGroupExit is analogous to Linux's do_group_exit(), except that it // does not tail-call do_exit(), except that it *does* set Task.exitStatus. // (Linux does not do so until within do_exit(), since it reuses exit_code for // ptrace.) // // Preconditions: The caller must be running on the task goroutine. func (t *Task) PrepareGroupExit(es ExitStatus) { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() if t.tg.exiting || t.tg.execing != nil { // Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e. // this "group exit" is being executed by the killed sibling of an // execing task, then Task.Execve never set t.tg.exitStatus, so it's // still the zero value. This is consistent with Linux, both in intent // ("all other threads ... 
report death as if they exited via _exit(2) // with exit code 0" - ptrace(2), "execve under ptrace") and in // implementation (compare fs/exec.c:de_thread() => // kernel/signal.c:zap_other_threads() and // kernel/exit.c:do_group_exit() => // include/linux/sched.h:signal_group_exit()). t.exitStatus = t.tg.exitStatus return } t.tg.exiting = true t.tg.exitStatus = es t.exitStatus = es for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { if sibling != t { sibling.killLocked() } } } // Kill requests that all tasks in ts exit as if group exiting with status es. // Kill does not wait for tasks to exit. // // Kill has no analogue in Linux; it's provided for save/restore only. func (ts *TaskSet) Kill(es ExitStatus) { ts.mu.Lock() defer ts.mu.Unlock() ts.Root.exiting = true for t := range ts.Root.tids { t.tg.signalHandlers.mu.Lock() if !t.tg.exiting { t.tg.exiting = true t.tg.exitStatus = es } t.killLocked() t.tg.signalHandlers.mu.Unlock() } } // advanceExitStateLocked checks that t's current exit state is oldExit, then // sets it to newExit. If t's current exit state is not oldExit, // advanceExitStateLocked panics. // // Preconditions: The TaskSet mutex must be locked. func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) { if t.exitState != oldExit { panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState)) } t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit) t.exitState = newExit } // runExit is the entry point into the task exit path. 
//
// +stateify savable
type runExit struct{}

// execute calls t.ptraceExit and then hands the task over to runExitMain.
func (*runExit) execute(t *Task) taskRunState {
	t.ptraceExit()
	return (*runExitMain)(nil)
}

// runExitMain is the run state that follows runExit. It releases the
// resources the exiting task holds and then hands over to runExitNotify.
//
// +stateify savable
type runExitMain struct{}

// execute performs the main body of task exit. The steps below run in a fixed
// order; later steps rely on the effects of earlier ones (see the inline
// comments), so do not reorder them.
func (*runExitMain) execute(t *Task) taskRunState {
	t.traceExitEvent()
	// lastExiter is true if t is the last task in its thread group to call
	// exitThreadGroup; it gates the thread group release further down.
	lastExiter := t.exitThreadGroup()

	t.ResetKcov()

	// If the task has a cleartid, and the thread group wasn't killed by a
	// signal, handle that before releasing the MM.
	if t.cleartid != 0 {
		// The thread group's exit state is read under the signal mutex.
		t.tg.signalHandlers.mu.Lock()
		signaled := t.tg.exiting && t.tg.exitStatus.Signaled()
		t.tg.signalHandlers.mu.Unlock()
		if !signaled {
			// Write a zero thread ID to the cleartid address and, only if
			// that write succeeded, wake the futex at that address.
			// NOTE(review): the trailing argument 1 is presumably the number
			// of waiters to wake - confirm against Futex().Wake.
			zero := ThreadID(0)
			if _, err := zero.CopyOut(t, t.cleartid); err == nil {
				t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1)
			}
			// If the CopyOut fails, there's nothing we can do.
		}
	}

	// Handle the robust futex list.
	t.exitRobustList()

	// Deactivate the address space and update max RSS before releasing the
	// task's MM.
	t.Deactivate()
	t.tg.pidns.owner.mu.Lock()
	t.updateRSSLocked()
	t.tg.pidns.owner.mu.Unlock()
	t.mu.Lock()
	t.image.release()
	t.mu.Unlock()

	// Releasing the MM unblocks a blocked CLONE_VFORK parent.
	t.unstopVforkParent()

	t.fsContext.DecRef(t)
	t.fdTable.DecRef(t)

	// Drop the mount namespace (if any) and IPC namespace references under
	// the task mutex. The mount namespace field is cleared after its
	// reference is dropped.
	t.mu.Lock()
	if t.mountNamespaceVFS2 != nil {
		t.mountNamespaceVFS2.DecRef(t)
		t.mountNamespaceVFS2 = nil
	}
	t.ipcns.DecRef(t)
	t.mu.Unlock()

	// If this is the last task to exit from the thread group, release the
	// thread group's resources.
	if lastExiter {
		t.tg.Release(t)
	}

	// Detach tracees.
	t.exitPtrace()

	// Reparent the task's children.
	t.exitChildren()

	// Don't tail-call runExitNotify, as exitChildren may have initiated a stop
	// to wait for a PID namespace to die.
	return (*runExitNotify)(nil)
}

// exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread
// group that it is no longer eligible to participate in group activities. It
// returns true if t is the last task in its thread group to call
// exitThreadGroup.
func (t *Task) exitThreadGroup() bool { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() t.tg.signalHandlers.mu.Lock() // Can't defer unlock: see below. t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated) t.tg.activeTasks-- last := t.tg.activeTasks == 0 // Ensure that someone will handle the signals we can't. t.setSignalMaskLocked(^linux.SignalSet(0)) // Check if this task's exit interacts with an initiated group stop. if !t.groupStopPending { t.tg.signalHandlers.mu.Unlock() return last } t.groupStopPending = false sig := t.tg.groupStopSignal notifyParent := t.participateGroupStopLocked() // signalStop must be called with t's signal mutex unlocked. t.tg.signalHandlers.mu.Unlock() if notifyParent && t.tg.leader.parent != nil { t.tg.leader.parent.signalStop(t, arch.CLD_STOPPED, int32(sig)) t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) } return last } func (t *Task) exitChildren() { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() newParent := t.findReparentTargetLocked() if newParent == nil { // "If the init process of a PID namespace terminates, the kernel // terminates all of the processes in the namespace via a SIGKILL // signal." - pid_namespaces(7) t.Debugf("Init process terminating, killing namespace") t.tg.pidns.exiting = true for other := range t.tg.pidns.tgids { if other == t.tg { continue } other.signalHandlers.mu.Lock() other.leader.sendSignalLocked(&arch.SignalInfo{ Signo: int32(linux.SIGKILL), }, true /* group */) other.signalHandlers.mu.Unlock() } // TODO(b/37722272): The init process waits for all processes in the // namespace to exit before completing its own exit // (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all // other tasks in the namespace are dead, except possibly for this // thread group's leader (which can't be reaped until this task exits). } // This is correct even if newParent is nil (it ensures that children don't // wait for a parent to reap them.) 
for c := range t.children { if sig := c.ParentDeathSignal(); sig != 0 { siginfo := &arch.SignalInfo{ Signo: int32(sig), Code: arch.SignalInfoUser, } siginfo.SetPID(int32(c.tg.pidns.tids[t])) siginfo.SetUID(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow())) c.tg.signalHandlers.mu.Lock() c.sendSignalLocked(siginfo, true /* group */) c.tg.signalHandlers.mu.Unlock() } c.reparentLocked(newParent) if newParent != nil { newParent.children[c] = struct{}{} } } } // findReparentTargetLocked returns the task to which t's children should be // reparented. If no such task exists, findNewParentLocked returns nil. // // Preconditions: The TaskSet mutex must be locked. func (t *Task) findReparentTargetLocked() *Task { // Reparent to any sibling in the same thread group that hasn't begun // exiting. if t2 := t.tg.anyNonExitingTaskLocked(); t2 != nil { return t2 } // "A child process that is orphaned within the namespace will be // reparented to [the init process for the namespace] ..." - // pid_namespaces(7) if init := t.tg.pidns.tasks[InitTID]; init != nil { return init.tg.anyNonExitingTaskLocked() } return nil } func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task { for t := tg.tasks.Front(); t != nil; t = t.Next() { if t.exitState == TaskExitNone { return t } } return nil } // reparentLocked changes t's parent. The new parent may be nil. // // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) reparentLocked(parent *Task) { oldParent := t.parent t.parent = parent if oldParent != nil { delete(oldParent.children, t) } if parent != nil { parent.children[t] = struct{}{} } // If a thread group leader's parent changes, reset the thread group's // termination signal to SIGCHLD and re-check exit notification. (Compare // kernel/exit.c:reparent_leader().) 
if t != t.tg.leader { return } if oldParent == nil && parent == nil { return } if oldParent != nil && parent != nil && oldParent.tg == parent.tg { return } t.tg.terminationSignal = linux.SIGCHLD if t.exitParentNotified && !t.exitParentAcked { t.exitParentNotified = false t.exitNotifyLocked(false) } } // When a task exits, other tasks in the system, notably the task's parent and // ptracer, may want to be notified. The exit notification system ensures that // interested tasks receive signals and/or are woken from blocking calls to // wait*() syscalls; these notifications must be resolved before exiting tasks // can be reaped and disappear from the system. // // Each task may have a parent task and/or a tracer task. If both a parent and // a tracer exist, they may be the same task, different tasks in the same // thread group, or tasks in different thread groups. (In the last case, Linux // refers to the task as being ptrace-reparented due to an implementation // detail; we avoid this terminology to avoid confusion.) // // A thread group is *empty* if all non-leader tasks in the thread group are // dead, and the leader is either a zombie or dead. The exit of a thread group // leader is never waitable - by either the parent or tracer - until the thread // group is empty. // // There are a few ways for an exit notification to be resolved: // // - The exit notification may be acknowledged by a call to Task.Wait with // WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall). // // - If the notified party is the parent, and the parent thread group is not // also the tracer thread group, and the notification signal is SIGCHLD, the // parent may explicitly ignore the notification (see quote in exitNotify). // Note that it's possible for the notified party to ignore the signal in other // cases, but the notification is only resolved under the above conditions. 
// (Actually, there is one exception; see the last paragraph of the "leader,
// has tracer, tracer thread group is parent thread group" case below.)
//
// - If the notified party is the parent, and the parent does not exist, the
// notification is resolved as if ignored. (This is only possible in the
// sentry. In Linux, the only task / thread group without a parent is global
// init, and killing global init causes a kernel panic.)
//
// - If the notified party is a tracer, the tracer may detach the traced task.
// (Zombie tasks cannot be ptrace-attached, so the reverse is not possible.)
//
// In addition, if the notified party is the parent, the parent may exit and
// cause the notifying task to be reparented to another thread group. This does
// not resolve the notification; instead, the notification must be resent to
// the new parent.
//
// The series of notifications generated for a given task's exit depends on
// whether it is a thread group leader; whether the task is ptraced; and, if
// so, whether the tracer thread group is the same as the parent thread group.
//
// - Non-leader, no tracer: No notification is generated; the task is reaped
// immediately.
//
// - Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer
// notification is resolved (by waiting or detaching), the task is reaped. (For
// non-leaders, whether the tracer and parent thread groups are the same is
// irrelevant.)
//
// - Leader, no tracer: The task remains a zombie, with no notification sent,
// until all other tasks in the thread group are dead. (In Linux terms, this
// condition is indicated by include/linux/sched.h:thread_group_empty(); tasks
// are removed from their thread_group list in kernel/exit.c:release_task() =>
// __exit_signal() => __unhash_process().) Then the thread group's termination
// signal is sent to the parent. When the parent notification is resolved (by
// waiting or ignoring), the task is reaped.
// // - Leader, has tracer, tracer thread group is not parent thread group: // SIGCHLD is sent to the tracer. When the tracer notification is resolved (by // waiting or detaching), and all other tasks in the thread group are dead, the // thread group's termination signal is sent to the parent. (Note that the // tracer cannot resolve the exit notification by waiting until the thread // group is empty.) When the parent notification is resolved, the task is // reaped. // // - Leader, has tracer, tracer thread group is parent thread group: // // If all other tasks in the thread group are dead, the thread group's // termination signal is sent to the parent. At this point, the notification // can only be resolved by waiting. If the parent detaches from the task as a // tracer, the notification is not resolved, but the notification can now be // resolved by waiting or ignoring. When the parent notification is resolved, // the task is reaped. // // If at least one task in the thread group is not dead, SIGCHLD is sent to the // parent. At this point, the notification cannot be resolved at all; once the // thread group becomes empty, it can be resolved only by waiting. If the // parent detaches from the task as a tracer before all remaining tasks die, // then exit notification proceeds as in the case where the leader never had a // tracer. If the parent detaches from the task as a tracer after all remaining // tasks die, the notification is not resolved, but the notification can now be // resolved by waiting or ignoring. When the parent notification is resolved, // the task is reaped. // // In both of the above cases, when the parent detaches from the task as a // tracer while the thread group is empty, whether or not the parent resolves // the notification by ignoring it is based on the parent's SIGCHLD signal // action, whether or not the thread group's termination signal is SIGCHLD // (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()). 
// // There is one final wrinkle: A leader can become a non-leader due to a // sibling execve. In this case, the execing thread detaches the leader's // tracer (if one exists) and reaps the leader immediately. In Linux, this is // in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked(). // +stateify savable type runExitNotify struct{} func (*runExitNotify) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie) t.tg.liveTasks-- // Check if this completes a sibling's execve. if t.tg.execing != nil && t.tg.liveTasks == 1 { // execing blocks the addition of new tasks to the thread group, so // the sole living task must be the execing one. e := t.tg.execing e.tg.signalHandlers.mu.Lock() if _, ok := e.stop.(*execStop); ok { e.endInternalStopLocked() } e.tg.signalHandlers.mu.Unlock() } t.exitNotifyLocked(false) // The task goroutine will now exit. return nil } // exitNotifyLocked is called after changes to t's state that affect exit // notification. // // If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace; // thanks to Linux's haphazard implementation of this functionality, such cases // determine whether parent notifications are ignored based on the parent's // handling of SIGCHLD, regardless of what the exited task's thread group's // termination signal is. // // Preconditions: The TaskSet mutex must be locked for writing. 
func (t *Task) exitNotifyLocked(fromPtraceDetach bool) {
	// Exit notification only applies to zombies: tasks that have not yet
	// reached TaskExitZombie, or that are already dead, are left alone.
	if t.exitState != TaskExitZombie {
		return
	}

	// Step 1: tracer notification. This runs at most once per task.
	if !t.exitTracerNotified {
		t.exitTracerNotified = true
		tracer := t.Tracer()
		if tracer == nil {
			// No tracer: nothing to notify, so treat it as acknowledged.
			t.exitTracerAcked = true
		} else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg {
			// Don't set exitParentNotified if t is non-leader, even if the
			// tracer is in the parent thread group, so that if the parent
			// detaches the following call to exitNotifyLocked passes through
			// the !exitParentNotified case below and causes t to be reaped
			// immediately.
			//
			// Tracer notification doesn't care about
			// SIG_IGN/SA_NOCLDWAIT.
			tracer.tg.signalHandlers.mu.Lock()
			tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */)
			tracer.tg.signalHandlers.mu.Unlock()
			// Wake EventTraceeStop waiters as well since this task will never
			// ptrace-stop again.
			tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop)
		} else {
			// t is a leader and the tracer is in the parent thread group.
			t.exitParentNotified = true
			// The termination signal is only used once t is the sole
			// remaining task in its thread group; otherwise SIGCHLD is sent.
			sig := linux.SIGCHLD
			if t.tg.tasksCount == 1 {
				sig = t.tg.terminationSignal
			}
			// This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either
			// (in Linux, the check in do_notify_parent() is gated by
			// !tsk->ptrace.)
			t.parent.tg.signalHandlers.mu.Lock()
			t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */)
			t.parent.tg.signalHandlers.mu.Unlock()
			// See below for rationale for this event mask.
			t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
		}
	}

	// Step 2: parent notification, once the tracer notification is resolved.
	if t.exitTracerAcked && !t.exitParentNotified {
		if t != t.tg.leader {
			// Non-leaders never notify the parent; resolve immediately.
			t.exitParentNotified = true
			t.exitParentAcked = true
		} else if t.tg.tasksCount == 1 {
			// t is a leader and the only task left in its thread group.
			t.exitParentNotified = true
			if t.parent == nil {
				t.exitParentAcked = true
			} else {
				// "POSIX.1-2001 specifies that if the disposition of SIGCHLD is
				// set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see
				// sigaction(2)), then children that terminate do not become
				// zombies and a call to wait() or waitpid() will block until all
				// children have terminated, and then fail with errno set to
				// ECHILD. (The original POSIX standard left the behavior of
				// setting SIGCHLD to SIG_IGN unspecified. Note that even though
				// the default disposition of SIGCHLD is "ignore", explicitly
				// setting the disposition to SIG_IGN results in different
				// treatment of zombie process children.) Linux 2.6 conforms to
				// this specification." - wait(2)
				//
				// Some undocumented Linux-specific details:
				//
				// - All of the above is ignored if the termination signal isn't
				// SIGCHLD.
				//
				// - SA_NOCLDWAIT causes the leader to be immediately reaped, but
				// does not suppress the SIGCHLD.
				signalParent := t.tg.terminationSignal.IsValid()
				t.parent.tg.signalHandlers.mu.Lock()
				if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach {
					if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok {
						if act.Handler == arch.SignalActIgnore {
							t.exitParentAcked = true
							signalParent = false
						} else if act.Flags&arch.SignalFlagNoCldWait != 0 {
							t.exitParentAcked = true
						}
					}
				}
				if signalParent {
					t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */)
				}
				t.parent.tg.signalHandlers.mu.Unlock()
				// If a task in the parent was waiting for a child group stop
				// or continue, it needs to be notified of the exit, because
				// there may be no remaining eligible tasks (so that wait
				// should return ECHILD).
				t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
			}
		}
	}

	// Step 3: once both notifications are resolved, reap t.
	if t.exitTracerAcked && t.exitParentAcked {
		t.advanceExitStateLocked(TaskExitZombie, TaskExitDead)
		// Release t's thread IDs in its own PID namespace and in every
		// ancestor namespace; a leader also releases the thread group ID.
		for ns := t.tg.pidns; ns != nil; ns = ns.parent {
			tid := ns.tids[t]
			delete(ns.tasks, tid)
			delete(ns.tids, t)
			if t == t.tg.leader {
				delete(ns.tgids, t.tg)
			}
		}
		// Fold t's CPU and I/O usage into the thread group's totals.
		t.tg.exitedCPUStats.Accumulate(t.CPUStats())
		t.tg.ioUsage.Accumulate(t.ioUsage)
		t.tg.signalHandlers.mu.Lock()
		t.tg.tasks.Remove(t)
		t.tg.tasksCount--
		tc := t.tg.tasksCount
		t.tg.signalHandlers.mu.Unlock()
		if tc == 1 && t != t.tg.leader {
			// Our fromPtraceDetach doesn't matter here (in Linux terms, this
			// is via a call to release_task()).
			t.tg.leader.exitNotifyLocked(false)
		} else if tc == 0 {
			t.tg.processGroup.decRefWithParent(t.tg.parentPG())
		}
		// Unlink t from its parent, if it still has one.
		if t.parent != nil {
			delete(t.parent.children, t)
			t.parent = nil
		}
	}
}

// exitNotificationSignal returns the siginfo used to notify receiver of t's
// exit with signal sig. The PID is t's thread ID in receiver's PID namespace
// and the UID is t's real UID mapped into receiver's user namespace. If t's
// exit status is signaled, the code is CLD_KILLED and the status is the signal
// number; otherwise the code is CLD_EXITED and the status is the exit code.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.SignalInfo {
	info := &arch.SignalInfo{
		Signo: int32(sig),
	}
	info.SetPID(int32(receiver.tg.pidns.tids[t]))
	info.SetUID(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
	if t.exitStatus.Signaled() {
		info.Code = arch.CLD_KILLED
		info.SetStatus(int32(t.exitStatus.Signo))
	} else {
		info.Code = arch.CLD_EXITED
		info.SetStatus(int32(t.exitStatus.Code))
	}
	// TODO(b/72102453): Set utime, stime.
	return info
}

// ExitStatus returns t's exit status, which is only guaranteed to be
// meaningful if t.ExitState() != TaskExitNone.
//
// The status is read with the TaskSet mutex held for reading and the signal
// mutex held.
func (t *Task) ExitStatus() ExitStatus {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	return t.exitStatus
}

// ExitStatus returns the exit status that would be returned by a consuming
// wait*() on tg.
func (tg *ThreadGroup) ExitStatus() ExitStatus { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() tg.signalHandlers.mu.Lock() defer tg.signalHandlers.mu.Unlock() if tg.exiting { return tg.exitStatus } return tg.leader.exitStatus } // TerminationSignal returns the thread group's termination signal. func (tg *ThreadGroup) TerminationSignal() linux.Signal { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() return tg.terminationSignal } // Task events that can be waited for. const ( // EventExit represents an exit notification generated for a child thread // group leader or a tracee under the conditions specified in the comment // above runExitNotify. EventExit waiter.EventMask = 1 << iota // EventChildGroupStop occurs when a child thread group completes a group // stop (i.e. all tasks in the child thread group have entered a stopped // state as a result of a group stop). EventChildGroupStop // EventTraceeStop occurs when a task that is ptraced by a task in the // notified thread group enters a ptrace stop (see ptrace(2)). EventTraceeStop // EventGroupContinue occurs when a child thread group, or a thread group // whose leader is ptraced by a task in the notified thread group, that had // initiated or completed a group stop leaves the group stop, due to the // child thread group or any task in the child thread group being sent // SIGCONT. EventGroupContinue ) // WaitOptions controls the behavior of Task.Wait. type WaitOptions struct { // If SpecificTID is non-zero, only events from the task with thread ID // SpecificTID are eligible to be waited for. SpecificTID is resolved in // the PID namespace of the waiter (the method receiver of Task.Wait). If // no such task exists, or that task would not otherwise be eligible to be // waited for by the waiting task, then there are no waitable tasks and // Wait will return ECHILD. 
SpecificTID ThreadID // If SpecificPGID is non-zero, only events from ThreadGroups with a // matching ProcessGroupID are eligible to be waited for. (Same // constraints as SpecificTID apply.) SpecificPGID ProcessGroupID // Terminology note: Per waitpid(2), "a clone child is one which delivers // no signal, or a signal other than SIGCHLD to its parent upon // termination." In Linux, termination signal is technically a per-task // property rather than a per-thread-group property. However, clone() // forces no termination signal for tasks created with CLONE_THREAD, and // execve() resets the termination signal to SIGCHLD, so all // non-group-leader threads have no termination signal and are therefore // "clone tasks". // If NonCloneTasks is true, events from non-clone tasks are eligible to be // waited for. NonCloneTasks bool // If CloneTasks is true, events from clone tasks are eligible to be waited // for. CloneTasks bool // If SiblingChildren is true, events from children tasks of any task // in the thread group of the waiter are eligible to be waited for. SiblingChildren bool // Events is a bitwise combination of the events defined above that specify // what events are of interest to the call to Wait. Events waiter.EventMask // If ConsumeEvent is true, the Wait should consume the event such that it // cannot be returned by a future Wait. Note that if a task exit is // consumed in this way, in most cases the task will be reaped. ConsumeEvent bool // If BlockInterruptErr is not nil, Wait will block until either an event // is available or there are no tasks that could produce a waitable event; // if that blocking is interrupted, Wait returns BlockInterruptErr. If // BlockInterruptErr is nil, Wait will not block. BlockInterruptErr error } // Preconditions: The TaskSet mutex must be locked (for reading or writing). 
func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace, tracee bool) bool { if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] { return false } if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] { return false } // Tracees are always eligible. if tracee { return true } if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD { return o.NonCloneTasks } return o.CloneTasks } // ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g. // waitpid(WNOHANG)) that find no waitable events, but determine that waitable // events may exist in the future. (In contrast, if a non-blocking or blocking // Wait determines that there are no tasks that can produce a waitable event, // Task.Wait returns ECHILD.) var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events") // WaitResult contains information about a waited-for event. type WaitResult struct { // Task is the task that reported the event. Task *Task // TID is the thread ID of Task in the PID namespace of the task that // called Wait (that is, the method receiver of the call to Task.Wait). TID // is provided because consuming exit waits cause the thread ID to be // deallocated. TID ThreadID // UID is the real UID of Task in the user namespace of the task that // called Wait. UID auth.UID // Event is exactly one of the events defined above. Event waiter.EventMask // Status is the numeric status associated with the event. Status uint32 } // Wait waits for an event from a thread group that is a child of t's thread // group, or a task in such a thread group, or a task that is ptraced by t, // subject to the options specified in opts. 
func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) { if opts.BlockInterruptErr == nil { return t.waitOnce(opts) } w, ch := waiter.NewChannelEntry(nil) t.tg.eventQueue.EventRegister(&w, opts.Events) defer t.tg.eventQueue.EventUnregister(&w) for { wr, err := t.waitOnce(opts) if err != ErrNoWaitableEvent { // This includes err == nil. return wr, err } if err := t.Block(ch); err != nil { return wr, syserror.ConvertIntr(err, opts.BlockInterruptErr) } } } func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) { anyWaitableTasks := false t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if opts.SiblingChildren { // We can wait on the children and tracees of any task in the // same thread group. for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() { wr, any := t.waitParentLocked(opts, parent) if wr != nil { return wr, nil } anyWaitableTasks = anyWaitableTasks || any } } else { // We can only wait on this task. var wr *WaitResult wr, anyWaitableTasks = t.waitParentLocked(opts, t) if wr != nil { return wr, nil } } if anyWaitableTasks { return nil, ErrNoWaitableEvent } return nil, syserror.ECHILD } // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) waitParentLocked(opts *WaitOptions, parent *Task) (*WaitResult, bool) { anyWaitableTasks := false for child := range parent.children { if !opts.matchesTask(child, parent.tg.pidns, false) { continue } // Non-leaders don't notify parents on exit and aren't eligible to // be waited on. if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked { anyWaitableTasks = true if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil { return wr, anyWaitableTasks } } // Check for group stops and continues. Tasks that have passed // TaskExitInitiated can no longer participate in group stops. 
if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 { continue } if child.exitState >= TaskExitInitiated { continue } // If the waiter is in the same thread group as the task's // tracer, do not report its group stops; they will be reported // as ptrace stops instead. This also skips checking for group // continues, but they'll be checked for when scanning tracees // below. (Per kernel/exit.c:wait_consider_task(): "If a // ptracer wants to distinguish the two events for its own // children, it should create a separate process which takes // the role of real parent.") if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg { continue } anyWaitableTasks = true if opts.Events&EventChildGroupStop != 0 { if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil { return wr, anyWaitableTasks } } if opts.Events&EventGroupContinue != 0 { if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil { return wr, anyWaitableTasks } } } for tracee := range parent.ptraceTracees { if !opts.matchesTask(tracee, parent.tg.pidns, true) { continue } // Non-leaders do notify tracers on exit. if opts.Events&EventExit != 0 && !tracee.exitTracerAcked { anyWaitableTasks = true if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil { return wr, anyWaitableTasks } } if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 { continue } if tracee.exitState >= TaskExitInitiated { continue } anyWaitableTasks = true if opts.Events&EventTraceeStop != 0 { if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil { return wr, anyWaitableTasks } } if opts.Events&EventGroupContinue != 0 { if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil { return wr, anyWaitableTasks } } } return nil, anyWaitableTasks } // Preconditions: The TaskSet mutex must be locked for writing. 
func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult { if asPtracer && !target.exitTracerNotified { return nil } if !asPtracer && !target.exitParentNotified { return nil } // Zombied thread group leaders are never waitable until their thread group // is otherwise empty. Usually this is caught by the // target.exitParentNotified check above, but if t is both (in the thread // group of) target's tracer and parent, asPtracer may be true. if target == target.tg.leader && target.tg.tasksCount != 1 { return nil } pid := t.tg.pidns.tids[target] uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() status := target.exitStatus.Status() if !opts.ConsumeEvent { return &WaitResult{ Task: target, TID: pid, UID: uid, Event: EventExit, Status: status, } } // Surprisingly, the exit status reported by a non-consuming wait can // differ from that reported by a consuming wait; the latter will return // the group exit code if one is available. if target.tg.exiting { status = target.tg.exitStatus.Status() } // t may be (in the thread group of) target's parent, tracer, or both. We // don't need to check for !exitTracerAcked because tracees are detached // here, and we don't need to check for !exitParentAcked because zombies // will be reaped here. if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified { target.exitTracerAcked = true target.ptraceTracer.Store((*Task)(nil)) delete(t.ptraceTracees, target) } if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified { target.exitParentAcked = true if target == target.tg.leader { // target.tg.exitedCPUStats doesn't include target.CPUStats() yet, // and won't until after target.exitNotifyLocked() (maybe). Include // target.CPUStats() explicitly. 
This is consistent with Linux, // which accounts an exited task's cputime to its thread group in // kernel/exit.c:release_task() => __exit_signal(), and uses // thread_group_cputime_adjusted() in wait_task_zombie(). t.tg.childCPUStats.Accumulate(target.CPUStats()) t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats) t.tg.childCPUStats.Accumulate(target.tg.childCPUStats) // Update t's child max resident set size. The size will be the maximum // of this thread's size and all its childrens' sizes. if t.tg.childMaxRSS < target.tg.maxRSS { t.tg.childMaxRSS = target.tg.maxRSS } if t.tg.childMaxRSS < target.tg.childMaxRSS { t.tg.childMaxRSS = target.tg.childMaxRSS } } } target.exitNotifyLocked(false) return &WaitResult{ Task: target, TID: pid, UID: uid, Event: EventExit, Status: status, } } // updateRSSLocked updates t.tg.maxRSS. // // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) updateRSSLocked() { if mmMaxRSS := t.MemoryManager().MaxResidentSetSize(); t.tg.maxRSS < mmMaxRSS { t.tg.maxRSS = mmMaxRSS } } // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult { target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() if !target.tg.groupStopWaitable { return nil } pid := t.tg.pidns.tids[target] uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() sig := target.tg.groupStopSignal if opts.ConsumeEvent { target.tg.groupStopWaitable = false } return &WaitResult{ Task: target, TID: pid, UID: uid, Event: EventChildGroupStop, // There is no name for these status constants. Status: (uint32(sig)&0xff)<<8 | 0x7f, } } // Preconditions: The TaskSet mutex must be locked for writing. 
func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult { target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() if !target.tg.groupContWaitable { return nil } pid := t.tg.pidns.tids[target] uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() if opts.ConsumeEvent { target.tg.groupContWaitable = false } return &WaitResult{ Task: target, TID: pid, UID: uid, Event: EventGroupContinue, Status: 0xffff, } } // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult { target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() if target.stop == nil { return nil } if _, ok := target.stop.(*ptraceStop); !ok { return nil } if target.ptraceCode == 0 { return nil } pid := t.tg.pidns.tids[target] uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() code := target.ptraceCode if opts.ConsumeEvent { target.ptraceCode = 0 } return &WaitResult{ Task: target, TID: pid, UID: uid, Event: EventTraceeStop, Status: uint32(code)<<8 | 0x7f, } } // ExitState returns t's current progress through the exit path. func (t *Task) ExitState() TaskExitState { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() return t.exitState } // ParentDeathSignal returns t's parent death signal. func (t *Task) ParentDeathSignal() linux.Signal { t.mu.Lock() defer t.mu.Unlock() return t.parentDeathSignal } // SetParentDeathSignal sets t's parent death signal. func (t *Task) SetParentDeathSignal(sig linux.Signal) { t.mu.Lock() defer t.mu.Unlock() t.parentDeathSignal = sig }