diff options
-rw-r--r-- | pkg/sentry/kernel/ptrace.go | 242 | ||||
-rw-r--r-- | pkg/sentry/kernel/sessions.go | 2 | ||||
-rw-r--r-- | pkg/sentry/kernel/task.go | 42 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_exit.go | 26 | ||||
-rw-r--r-- | pkg/sentry/kernel/task_signals.go | 220 | ||||
-rw-r--r-- | pkg/sentry/kernel/thread_group.go | 47 | ||||
-rw-r--r-- | test/syscalls/linux/ptrace.cc | 135 |
7 files changed, 496 insertions, 218 deletions
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index fa7a0d141..e8043bf8a 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -193,6 +193,10 @@ type ptraceStop struct { // If frozen is true, the stopped task's tracer is currently operating on // it, so Task.Kill should not remove the stop. frozen bool + + // If listen is true, the stopped task's tracer invoked PTRACE_LISTEN, so + // ptraceFreeze should fail. + listen bool } // Killable implements TaskStop.Killable. @@ -216,11 +220,11 @@ func (t *Task) beginPtraceStopLocked() bool { // is what prevents tasks from entering ptrace-stops after being killed. // Note that if t was SIGKILLed and beingPtraceStopLocked is being called // for PTRACE_EVENT_EXIT, the task will have dequeued the signal before - // entering the exit path, so t.killable() will no longer return true. This - // is consistent with Linux: "Bugs: ... A SIGKILL signal may still cause a - // PTRACE_EVENT_EXIT stop before actual signal death. This may be changed - // in the future; SIGKILL is meant to always immediately kill tasks even - // under ptrace. Last confirmed on Linux 3.13." - ptrace(2) + // entering the exit path, so t.killedLocked() will no longer return true. + // This is consistent with Linux: "Bugs: ... A SIGKILL signal may still + // cause a PTRACE_EVENT_EXIT stop before actual signal death. This may be + // changed in the future; SIGKILL is meant to always immediately kill tasks + // even under ptrace. Last confirmed on Linux 3.13." - ptrace(2) if t.killedLocked() { return false } @@ -230,6 +234,10 @@ func (t *Task) beginPtraceStopLocked() bool { // Preconditions: The TaskSet mutex must be locked. func (t *Task) ptraceTrapLocked(code int32) { + // This is unconditional in ptrace_stop(). 
+ t.tg.signalHandlers.mu.Lock() + t.trapStopPending = false + t.tg.signalHandlers.mu.Unlock() t.ptraceCode = code t.ptraceSiginfo = &arch.SignalInfo{ Signo: int32(linux.SIGTRAP), @@ -260,6 +268,9 @@ func (t *Task) ptraceFreeze() bool { if !ok { return false } + if s.listen { + return false + } s.frozen = true return true } @@ -273,6 +284,12 @@ func (t *Task) ptraceUnfreeze() { // preventing its thread group from completing execve. t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() + t.ptraceUnfreezeLocked() +} + +// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be +// locked. +func (t *Task) ptraceUnfreezeLocked() { // Do this even if the task has been killed to ensure a panic if t.stop is // nil or not a ptraceStop. t.stop.(*ptraceStop).frozen = false @@ -336,8 +353,9 @@ func (t *Task) ptraceTraceme() error { return nil } -// ptraceAttach implements ptrace(PTRACE_ATTACH, target). t is the caller. -func (t *Task) ptraceAttach(target *Task) error { +// ptraceAttach implements ptrace(PTRACE_ATTACH, target) if seize is false, and +// ptrace(PTRACE_SEIZE, target, 0, opts) if seize is true. t is the caller. +func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { if t.tg == target.tg { return syserror.EPERM } @@ -355,19 +373,31 @@ func (t *Task) ptraceAttach(target *Task) error { if target.exitState >= TaskExitZombie { return syserror.EPERM } + if seize { + if err := t.ptraceSetOptionsLocked(opts); err != nil { + return syserror.EIO + } + } target.ptraceTracer.Store(t) t.ptraceTracees[target] = struct{}{} + target.ptraceSeized = seize target.tg.signalHandlers.mu.Lock() - target.sendSignalLocked(&arch.SignalInfo{ - Signo: int32(linux.SIGSTOP), - Code: arch.SignalInfoUser, - }, false /* group */) + // "Unlike PTRACE_ATTACH, PTRACE_SEIZE does not stop the process." 
- + // ptrace(2) + if !seize { + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + Code: arch.SignalInfoUser, + }, false /* group */) + } // Undocumented Linux feature: If the tracee is already group-stopped (and // consequently will not report the SIGSTOP just sent), force it to leave // and re-enter the stop so that it will switch to a ptrace-stop. if target.stop == (*groupStop)(nil) { - target.groupStopRequired = true + target.trapStopPending = true target.endInternalStopLocked() + // TODO: Linux blocks ptrace_attach() until the task has + // entered the ptrace-stop (or exited) via JOBCTL_TRAPPING. } target.tg.signalHandlers.mu.Unlock() return nil @@ -418,6 +448,7 @@ func (t *Task) exitPtrace() { // // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) forgetTracerLocked() { + t.ptraceSeized = false t.ptraceOpts = ptraceOptions{} t.ptraceSyscallMode = ptraceSyscallNone t.ptraceSinglestep = false @@ -426,21 +457,25 @@ func (t *Task) forgetTracerLocked() { t.exitTracerAcked = true t.exitNotifyLocked(true) } - // If t is ptrace-stopped, but its thread group is in a group stop and t is - // eligible to participate, make it do so. This is essentially the reverse - // of the special case in ptraceAttach, which converts a group stop to a - // ptrace stop. ("Handling of restart from group-stop is currently buggy, - // but the "as planned" behavior is to leave tracee stopped and waiting for - // SIGCONT." - ptrace(2)) t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() - if t.stop == nil { - return + // Unset t.trapStopPending, which might have been set by PTRACE_INTERRUPT. If + // it wasn't, it will be reset via t.groupStopPending after the following. + t.trapStopPending = false + // If t's thread group is in a group stop and t is eligible to participate, + // make it do so. This is essentially the reverse of the special case in + // ptraceAttach, which converts a group stop to a ptrace stop. 
("Handling + // of restart from group-stop is currently buggy, but the "as planned" + // behavior is to leave tracee stopped and waiting for SIGCONT." - + // ptrace(2)) + if (t.tg.groupStopComplete || t.tg.groupStopPendingCount != 0) && !t.groupStopPending && t.exitState < TaskExitInitiated { + t.groupStopPending = true + // t already participated in the group stop when it unset + // groupStopPending. + t.groupStopAcknowledged = true + t.interrupt() } if _, ok := t.stop.(*ptraceStop); ok { - if t.exitState < TaskExitInitiated && t.tg.groupStopPhase >= groupStopInitiated { - t.groupStopRequired = true - } t.endInternalStopLocked() } } @@ -460,9 +495,9 @@ func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool { // The tracer might change this signal into a stop signal, in which case // any SIGCONT received after the signal was originally dequeued should // cancel it. This is consistent with Linux. - if t.tg.groupStopPhase == groupStopNone { - t.tg.groupStopPhase = groupStopDequeued - } + t.tg.groupStopDequeued = true + // This is unconditional in ptrace_stop(). + t.trapStopPending = false // Can't lock the TaskSet mutex while holding a signal mutex. t.tg.signalHandlers.mu.Unlock() defer t.tg.signalHandlers.mu.Lock() @@ -612,22 +647,27 @@ func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions if tracer != nil { child.ptraceTracer.Store(tracer) tracer.ptraceTracees[child] = struct{}{} + // "The "seized" behavior ... is inherited by children that are + // automatically attached using PTRACE_O_TRACEFORK, + // PTRACE_O_TRACEVFORK, and PTRACE_O_TRACECLONE." - ptrace(2) + child.ptraceSeized = t.ptraceSeized // "Flags are inherited by new tracees created and "auto-attached" // via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or - // PTRACE_O_TRACECLONE options." + // PTRACE_O_TRACECLONE options." 
- ptrace(2) child.ptraceOpts = t.ptraceOpts child.tg.signalHandlers.mu.Lock() - // If the child is PT_SEIZED (currently not possible in the sentry - // because PTRACE_SEIZE is unimplemented, but for future - // reference), Linux just sets JOBCTL_TRAP_STOP instead, so the - // child skips signal-delivery-stop and goes directly to - // group-stop. - // - // The child will self-t.interrupt() when its task goroutine starts + // "PTRACE_SEIZE: ... Automatically attached children stop with + // PTRACE_EVENT_STOP and WSTOPSIG(status) returns SIGTRAP instead + // of having SIGSTOP signal delivered to them." - ptrace(2) + if child.ptraceSeized { + child.trapStopPending = true + } else { + child.pendingSignals.enqueue(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + }, nil) + } + // The child will self-interrupt() when its task goroutine starts // running, so we don't have to. - child.pendingSignals.enqueue(&arch.SignalInfo{ - Signo: int32(linux.SIGSTOP), - }, nil) child.tg.signalHandlers.mu.Unlock() } } @@ -681,6 +721,9 @@ func (t *Task) ptraceExec(oldTID ThreadID) { // Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0 // (SI_USER). This signal may be blocked by signal mask, and thus may be // delivered (much) later." 
- ptrace(2) + if t.ptraceSeized { + return + } t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.sendSignalLocked(&arch.SignalInfo{ @@ -749,6 +792,57 @@ func (t *Task) ptraceKill(target *Task) error { return nil } +func (t *Task) ptraceInterrupt(target *Task) error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.Tracer() != t { + return syserror.ESRCH + } + if !target.ptraceSeized { + return syserror.EIO + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.killedLocked() || target.exitState >= TaskExitInitiated { + return nil + } + target.trapStopPending = true + if s, ok := target.stop.(*ptraceStop); ok && s.listen { + target.endInternalStopLocked() + } + target.interrupt() + return nil +} + +// Preconditions: The TaskSet mutex must be locked for writing. t must have a +// tracer. +func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { + const valid = uintptr(linux.PTRACE_O_EXITKILL | + linux.PTRACE_O_TRACESYSGOOD | + linux.PTRACE_O_TRACECLONE | + linux.PTRACE_O_TRACEEXEC | + linux.PTRACE_O_TRACEEXIT | + linux.PTRACE_O_TRACEFORK | + linux.PTRACE_O_TRACESECCOMP | + linux.PTRACE_O_TRACEVFORK | + linux.PTRACE_O_TRACEVFORKDONE) + if opts&^valid != 0 { + return syserror.EINVAL + } + t.ptraceOpts = ptraceOptions{ + ExitKill: opts&linux.PTRACE_O_EXITKILL != 0, + SysGood: opts&linux.PTRACE_O_TRACESYSGOOD != 0, + TraceClone: opts&linux.PTRACE_O_TRACECLONE != 0, + TraceExec: opts&linux.PTRACE_O_TRACEEXEC != 0, + TraceExit: opts&linux.PTRACE_O_TRACEEXIT != 0, + TraceFork: opts&linux.PTRACE_O_TRACEFORK != 0, + TraceSeccomp: opts&linux.PTRACE_O_TRACESECCOMP != 0, + TraceVfork: opts&linux.PTRACE_O_TRACEVFORK != 0, + TraceVforkDone: opts&linux.PTRACE_O_TRACEVFORKDONE != 0, + } + return nil +} + // Ptrace implements the ptrace system call. func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { // PTRACE_TRACEME ignores all other arguments. 
@@ -762,16 +856,23 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { return syserror.ESRCH } - // PTRACE_ATTACH (and PTRACE_SEIZE, which is unimplemented) do not require - // that target is not already a tracee. - if req == linux.PTRACE_ATTACH { - return t.ptraceAttach(target) + // PTRACE_ATTACH and PTRACE_SEIZE do not require that target is not already + // a tracee. + if req == linux.PTRACE_ATTACH || req == linux.PTRACE_SEIZE { + seize := req == linux.PTRACE_SEIZE + if seize && addr != 0 { + return syserror.EIO + } + return t.ptraceAttach(target, seize, uintptr(data)) } - // PTRACE_KILL (and PTRACE_INTERRUPT, which is unimplemented) require that - // the target is a tracee, but does not require that it is ptrace-stopped. + // PTRACE_KILL and PTRACE_INTERRUPT require that the target is a tracee, + // but do not require that it is ptrace-stopped. if req == linux.PTRACE_KILL { return t.ptraceKill(target) } + if req == linux.PTRACE_INTERRUPT { + return t.ptraceInterrupt(target) + } // All other ptrace requests require that the target is a ptrace-stopped // tracee, and freeze the ptrace-stop so the tracee can be operated on. t.tg.pidns.owner.mu.RLock() @@ -801,6 +902,8 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { t.UninterruptibleSleepFinish(true) // Resuming commands end the ptrace stop, but only if successful. + // PTRACE_LISTEN ends the ptrace stop if trapNotifyPending is already set on the + // target. 
switch req { case linux.PTRACE_DETACH: if err := t.ptraceDetach(target, linux.Signal(data)); err != nil { @@ -808,37 +911,65 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { return err } return nil + case linux.PTRACE_CONT: if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil + case linux.PTRACE_SYSCALL: if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil + case linux.PTRACE_SINGLESTEP: if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil + case linux.PTRACE_SYSEMU: if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil + case linux.PTRACE_SYSEMU_SINGLESTEP: if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil + + case linux.PTRACE_LISTEN: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !target.ptraceSeized { + return syserror.EIO + } + if target.ptraceSiginfo == nil { + return syserror.EIO + } + if target.ptraceSiginfo.Code>>8 != linux.PTRACE_EVENT_STOP { + return syserror.EIO + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.trapNotifyPending { + target.endInternalStopLocked() + } else { + target.stop.(*ptraceStop).listen = true + target.ptraceUnfreezeLocked() + } + return nil } + // All other ptrace requests expect us to unfreeze the stop. 
defer target.ptraceUnfreeze() @@ -958,30 +1089,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { case linux.PTRACE_SETOPTIONS: t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() - validOpts := uintptr(linux.PTRACE_O_EXITKILL | - linux.PTRACE_O_TRACESYSGOOD | - linux.PTRACE_O_TRACECLONE | - linux.PTRACE_O_TRACEEXEC | - linux.PTRACE_O_TRACEEXIT | - linux.PTRACE_O_TRACEFORK | - linux.PTRACE_O_TRACESECCOMP | - linux.PTRACE_O_TRACEVFORK | - linux.PTRACE_O_TRACEVFORKDONE) - if uintptr(data)&^validOpts != 0 { - return syserror.EINVAL - } - target.ptraceOpts = ptraceOptions{ - ExitKill: data&linux.PTRACE_O_EXITKILL != 0, - SysGood: data&linux.PTRACE_O_TRACESYSGOOD != 0, - TraceClone: data&linux.PTRACE_O_TRACECLONE != 0, - TraceExec: data&linux.PTRACE_O_TRACEEXEC != 0, - TraceExit: data&linux.PTRACE_O_TRACEEXIT != 0, - TraceFork: data&linux.PTRACE_O_TRACEFORK != 0, - TraceSeccomp: data&linux.PTRACE_O_TRACESECCOMP != 0, - TraceVfork: data&linux.PTRACE_O_TRACEVFORK != 0, - TraceVforkDone: data&linux.PTRACE_O_TRACEVFORKDONE != 0, - } - return nil + return target.ptraceSetOptionsLocked(uintptr(data)) case linux.PTRACE_GETEVENTMSG: t.tg.pidns.owner.mu.RLock() diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 6fd65f2b0..ae6daac60 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -204,7 +204,7 @@ func (pg *ProcessGroup) handleOrphan() { return } tg.signalHandlers.mu.Lock() - if tg.groupStopPhase == groupStopComplete { + if tg.groupStopComplete { hasStopped = true } tg.signalHandlers.mu.Unlock() diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index e9f133c0b..f958aba26 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -133,28 +133,42 @@ type Task struct { // signalStack is exclusive to the task goroutine. signalStack arch.SignalStack - // If groupStopRequired is true, the task should enter a group stop in the - // interrupt path. 
groupStopRequired is not redundant with - // tg.groupStopPhase != groupStopNone, because ptrace allows tracers to - // resume individual tasks from a group stop without ending the group stop - // as a whole. + // If groupStopPending is true, the task should participate in a group + // stop in the interrupt path. // - // groupStopRequired is analogous to JOBCTL_TRAP_STOP in Linux, except that - // Linux only uses that flag for ptraced tasks. + // groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux. // - // groupStopRequired is protected by the signal mutex. - groupStopRequired bool + // groupStopPending is protected by the signal mutex. + groupStopPending bool // If groupStopAcknowledged is true, the task has already acknowledged that // it is entering the most recent group stop that has been initiated on its - // thread group. groupStopAcknowledged is only meaningful if - // tg.groupStopPhase == groupStopInitiated. + // thread group. // // groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux. // // groupStopAcknowledged is protected by the signal mutex. groupStopAcknowledged bool + // If trapStopPending is true, the task goroutine should enter a + // PTRACE_INTERRUPT-induced stop from the interrupt path. + // + // trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that + // Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects + // JOBCTL_STOP_PENDING. + // + // trapStopPending is protected by the signal mutex. + trapStopPending bool + + // If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group + // stop has begun or ended since the last time the task entered a + // ptrace-stop from the group-stop path. + // + // trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux. + // + // trapNotifyPending is protected by the signal mutex. + trapNotifyPending bool + // If stop is not nil, it is the internally-initiated condition that // currently prevents the task goroutine from running. 
// @@ -296,6 +310,12 @@ type Task struct { // ptraceTracees is protected by the TaskSet mutex. ptraceTracees map[*Task]struct{} + // ptraceSeized is true if ptraceTracer attached to this task with + // PTRACE_SEIZE. + // + // ptraceSeized is protected by the TaskSet mutex. + ptraceSeized bool + // ptraceOpts contains ptrace options explicitly set by the tracer. If // ptraceTracer is nil, ptraceOpts is expected to be the zero value. // diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 791cc9831..b9c558ccb 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -122,7 +122,6 @@ func (t *Task) killLocked() { if t.stop != nil && t.stop.Killable() { t.endInternalStopLocked() } - t.groupStopRequired = false t.pendingSignals.enqueue(&arch.SignalInfo{ Signo: int32(linux.SIGKILL), // Linux just sets SIGKILL in the pending signal bitmask without @@ -304,33 +303,16 @@ func (t *Task) exitThreadGroup() bool { t.setSignalMaskLocked(^linux.SignalSet(0)) // Check if this task's exit interacts with an initiated group stop. - if t.tg.groupStopPhase != groupStopInitiated { + if !t.groupStopPending { t.tg.signalHandlers.mu.Unlock() return last } - if t.groupStopAcknowledged { - // Un-acknowledge the group stop. - t.tg.groupStopCount-- - t.groupStopAcknowledged = false - // If the group stop wasn't complete before, then there is still at - // least one other task that hasn't acknowledged the group stop, so - // it is still not complete now. - t.tg.signalHandlers.mu.Unlock() - return last - } - if t.tg.groupStopCount != t.tg.activeTasks { - t.tg.signalHandlers.mu.Unlock() - return last - } - t.Debugf("Completing group stop") - t.tg.groupStopPhase = groupStopComplete - t.tg.groupStopWaitable = true + t.groupStopPending = false sig := t.tg.groupStopSignal - t.tg.groupContNotify = false - t.tg.groupContWaitable = false + notifyParent := t.participateGroupStopLocked() // signalStop must be called with t's signal mutex unlocked. 
t.tg.signalHandlers.mu.Unlock() - if t.tg.leader.parent != nil { + if notifyParent && t.tg.leader.parent != nil { t.tg.leader.parent.signalStop(t, arch.CLD_STOPPED, int32(sig)) t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) } diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 583acddb1..6a204aa59 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -748,48 +748,21 @@ type groupStop struct{} // Killable implements TaskStop.Killable. func (*groupStop) Killable() bool { return true } -type groupStopPhase int - -const ( - // groupStopNone indicates that a thread group is not in, or attempting to - // enter or leave, a group stop. - groupStopNone groupStopPhase = iota - - // groupStopDequeued indicates that at least one task in a thread group has - // dequeued a stop signal (or dequeued any signal and entered a - // signal-delivery-stop as a result, which allows ptrace to change the - // signal into a stop signal), but temporarily dropped the signal mutex - // without initiating the group stop. - // - // groupStopDequeued is analogous to JOBCTL_STOP_DEQUEUED in Linux. - groupStopDequeued - - // groupStopInitiated indicates that a task in a thread group has initiated - // a group stop, but not all tasks in the thread group have acknowledged - // entering the group stop. - // - // groupStopInitiated is represented by JOBCTL_STOP_PENDING && - // !SIGNAL_STOP_STOPPED in Linux. - groupStopInitiated - - // groupStopComplete indicates that all tasks in a thread group have - // acknowledged entering the group stop, and the last one to do so has - // notified the thread group's parent. - // - // groupStopComplete is represented by JOBCTL_STOP_PENDING && - // SIGNAL_STOP_STOPPED in Linux. - groupStopComplete -) - // initiateGroupStop attempts to initiate a group stop based on a // previously-dequeued stop signal. // // Preconditions: The caller must be running on the task goroutine. 
func (t *Task) initiateGroupStop(info *arch.SignalInfo) { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() - if t.tg.groupStopPhase != groupStopDequeued { - t.Debugf("Signal %d: not stopping thread group: lost to racing signal", info.Signo) + if t.groupStopPending { + t.Debugf("Signal %d: not stopping thread group: lost to racing stop signal", info.Signo) + return + } + if !t.tg.groupStopDequeued { + t.Debugf("Signal %d: not stopping thread group: lost to racing SIGCONT", info.Signo) return } if t.tg.exiting { @@ -800,15 +773,27 @@ func (t *Task) initiateGroupStop(info *arch.SignalInfo) { t.Debugf("Signal %d: not stopping thread group: lost to racing execve", info.Signo) return } - t.Debugf("Signal %d: stopping thread group", info.Signo) - t.tg.groupStopPhase = groupStopInitiated - t.tg.groupStopSignal = linux.Signal(info.Signo) - t.tg.groupStopCount = 0 + if !t.tg.groupStopComplete { + t.tg.groupStopSignal = linux.Signal(info.Signo) + } + t.tg.groupStopPendingCount = 0 for t2 := t.tg.tasks.Front(); t2 != nil; t2 = t2.Next() { - t2.groupStopRequired = true + if t2.killedLocked() || t2.exitState >= TaskExitInitiated { + t2.groupStopPending = false + continue + } + t2.groupStopPending = true t2.groupStopAcknowledged = false + if t2.ptraceSeized { + t2.trapNotifyPending = true + if s, ok := t2.stop.(*ptraceStop); ok && s.listen { + t2.endInternalStopLocked() + } + } t2.interrupt() + t.tg.groupStopPendingCount++ } + t.Debugf("Signal %d: stopping %d threads in thread group", info.Signo, t.tg.groupStopPendingCount) } // endGroupStopLocked ensures that all prior stop signals received by tg are @@ -820,37 +805,77 @@ func (tg *ThreadGroup) endGroupStopLocked(broadcast bool) { // Discard all previously-queued stop signals. 
linux.ForEachSignal(StopSignals, tg.discardSpecificLocked) - if tg.groupStopPhase != groupStopNone { - tg.leader.Debugf("Ending group stop currently in phase %d", tg.groupStopPhase) - if tg.groupStopPhase == groupStopInitiated || tg.groupStopPhase == groupStopComplete { - tg.groupStopSignal = 0 - for t := tg.tasks.Front(); t != nil; t = t.Next() { - if _, ok := t.stop.(*groupStop); ok { - t.endInternalStopLocked() - } + if tg.groupStopPendingCount == 0 && !tg.groupStopComplete { + return + } + + completeStr := "incomplete" + if tg.groupStopComplete { + completeStr = "complete" + } + tg.leader.Debugf("Ending %s group stop with %d threads pending", completeStr, tg.groupStopPendingCount) + for t := tg.tasks.Front(); t != nil; t = t.Next() { + t.groupStopPending = false + if t.ptraceSeized { + t.trapNotifyPending = true + if s, ok := t.stop.(*ptraceStop); ok && s.listen { + t.endInternalStopLocked() } - if broadcast { - // Instead of notifying the parent here, set groupContNotify so - // that one of the continuing tasks does so. (Linux does - // something similar.) The reason we do this is to keep locking - // sane. In order to send a signal to the parent, we need to - // lock its signal mutex, but we're already holding tg's signal - // mutex, and the TaskSet mutex must be locked for writing for - // us to hold two signal mutexes. Since we don't want to - // require this for endGroupStopLocked (which is called from - // signal-sending paths), nor do we want to lose atomicity by - // releasing the mutexes we're already holding, just let the - // continuing thread group deal with it. - tg.groupContNotify = true - tg.groupContInterrupted = tg.groupStopPhase == groupStopInitiated - tg.groupContWaitable = true + } else { + if _, ok := t.stop.(*groupStop); ok { + t.endInternalStopLocked() } } - // If groupStopPhase was groupStopDequeued, setting it to groupStopNone - // will cause following calls to initiateGroupStop to recognize that - // the group stop has been cancelled. 
- tg.groupStopPhase = groupStopNone } + if broadcast { + // Instead of notifying the parent here, set groupContNotify so that + // one of the continuing tasks does so. (Linux does something similar.) + // The reason we do this is to keep locking sane. In order to send a + // signal to the parent, we need to lock its signal mutex, but we're + // already holding tg's signal mutex, and the TaskSet mutex must be + // locked for writing for us to hold two signal mutexes. Since we don't + // want to require this for endGroupStopLocked (which is called from + // signal-sending paths), nor do we want to lose atomicity by releasing + // the mutexes we're already holding, just let the continuing thread + // group deal with it. + tg.groupContNotify = true + tg.groupContInterrupted = !tg.groupStopComplete + tg.groupContWaitable = true + } + // Unsetting groupStopDequeued will cause racing calls to initiateGroupStop + // to recognize that the group stop has been cancelled. + tg.groupStopDequeued = false + tg.groupStopSignal = 0 + tg.groupStopPendingCount = 0 + tg.groupStopComplete = false + tg.groupStopWaitable = false +} + +// participateGroupStopLocked is called to handle thread group side effects +// after t unsets t.groupStopPending. The caller must handle task side effects +// (e.g. placing the task goroutine into the group stop). It returns true if +// the caller must notify t.tg.leader's parent of a completed group stop (which +// participateGroupStopLocked cannot do due to holding the wrong locks). +// +// Preconditions: The signal mutex must be locked. 
+func (t *Task) participateGroupStopLocked() bool { + if t.groupStopAcknowledged { + return false + } + t.groupStopAcknowledged = true + t.tg.groupStopPendingCount-- + if t.tg.groupStopPendingCount != 0 { + return false + } + if t.tg.groupStopComplete { + return false + } + t.Debugf("Completing group stop") + t.tg.groupStopComplete = true + t.tg.groupStopWaitable = true + t.tg.groupContNotify = false + t.tg.groupContWaitable = false + return true } // signalStop sends a signal to t's thread group of a new group stop, group @@ -899,7 +924,7 @@ func (*runInterrupt) execute(t *Task) taskRunState { // leader's) tracer are in the same thread group, deduplicate // notifications. notifyParent := t.tg.leader.parent != nil - if tracer := t.tg.leader.ptraceTracer.Load().(*Task); tracer != nil { + if tracer := t.tg.leader.Tracer(); tracer != nil { if notifyParent && tracer.tg == t.tg.leader.parent.tg { notifyParent = false } @@ -938,23 +963,21 @@ func (*runInterrupt) execute(t *Task) taskRunState { return (*runInterrupt)(nil) } - // Do we need to enter a group stop? - if t.groupStopRequired { - t.groupStopRequired = false + // Do we need to enter a group stop or related ptrace stop? This path is + // analogous to Linux's kernel/signal.c:get_signal() => do_signal_stop() + // (with ptrace enabled) and do_jobctl_trap(). 
+ if t.groupStopPending || t.trapStopPending || t.trapNotifyPending { sig := t.tg.groupStopSignal notifyParent := false - if !t.groupStopAcknowledged { - t.groupStopAcknowledged = true - t.tg.groupStopCount++ - if t.tg.groupStopCount == t.tg.activeTasks { - t.Debugf("Completing group stop") - notifyParent = true - t.tg.groupStopPhase = groupStopComplete - t.tg.groupStopWaitable = true - t.tg.groupContNotify = false - t.tg.groupContWaitable = false - } + if t.groupStopPending { + t.groupStopPending = false + // We care about t.tg.groupStopSignal (for tracer notification) + // even if this doesn't complete a group stop, so keep the + // value of sig we've already read. + notifyParent = t.participateGroupStopLocked() } + t.trapStopPending = false + t.trapNotifyPending = false // Drop the signal mutex so we can take the TaskSet mutex. t.tg.signalHandlers.mu.Unlock() @@ -963,8 +986,26 @@ func (*runInterrupt) execute(t *Task) taskRunState { notifyParent = false } if tracer := t.Tracer(); tracer != nil { - t.ptraceCode = int32(sig) - t.ptraceSiginfo = nil + if t.ptraceSeized { + if sig == 0 { + sig = linux.SIGTRAP + } + // "If tracee was attached using PTRACE_SEIZE, group-stop is + // indicated by PTRACE_EVENT_STOP: status>>16 == + // PTRACE_EVENT_STOP. This allows detection of group-stops + // without requiring an extra PTRACE_GETSIGINFO call." 
- + // "Group-stop", ptrace(2) + t.ptraceCode = int32(sig) | linux.PTRACE_EVENT_STOP<<8 + t.ptraceSiginfo = &arch.SignalInfo{ + Signo: int32(sig), + Code: t.ptraceCode, + } + t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + } else { + t.ptraceCode = int32(sig) + t.ptraceSiginfo = nil + } if t.beginPtraceStopLocked() { tracer.signalStop(t, arch.CLD_STOPPED, int32(sig)) // For consistency with Linux, if the parent and tracer are in the @@ -994,12 +1035,11 @@ func (*runInterrupt) execute(t *Task) taskRunState { // Are there signals pending? if info := t.dequeueSignalLocked(t.signalMask); info != nil { - if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 && t.tg.groupStopPhase == groupStopNone { - // Indicate that we've dequeued a stop signal before - // unlocking the signal mutex; initiateGroupStop will check - // that the phase hasn't changed (or is at least another - // "stop signal dequeued" phase) after relocking it. - t.tg.groupStopPhase = groupStopDequeued + if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 { + // Indicate that we've dequeued a stop signal before unlocking the + // signal mutex; initiateGroupStop will check for races with + // endGroupStopLocked after relocking it. + t.tg.groupStopDequeued = true } if t.ptraceSignalLocked(info) { // Dequeueing the signal action must wait until after the diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index d7652f57c..1b7b74319 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -60,25 +60,35 @@ type ThreadGroup struct { // pendingSignals is protected by the signal mutex. pendingSignals pendingSignals - // groupStopPhase indicates the state of a group stop in progress on the - // thread group, if any. 
+ // If groupStopDequeued is true, a task in the thread group has dequeued a + // stop signal, but has not yet initiated the group stop. // - // groupStopPhase is protected by the signal mutex. - groupStopPhase groupStopPhase + // groupStopDequeued is analogous to Linux's JOBCTL_STOP_DEQUEUED. + // + // groupStopDequeued is protected by the signal mutex. + groupStopDequeued bool // groupStopSignal is the signal that caused a group stop to be initiated. - // groupStopSignal is only meaningful if groupStopPhase is - // groupStopInitiated or groupStopComplete. // // groupStopSignal is protected by the signal mutex. groupStopSignal linux.Signal - // groupStopCount is the number of non-exited tasks in the thread group - // that have acknowledged an initiated group stop. groupStopCount is only - // meaningful if groupStopPhase is groupStopInitiated. + // groupStopPendingCount is the number of active tasks in the thread group + // for which Task.groupStopPending is set. + // + // groupStopPendingCount is analogous to Linux's + // signal_struct::group_stop_count. // - // groupStopCount is protected by the signal mutex. - groupStopCount int + // groupStopPendingCount is protected by the signal mutex. + groupStopPendingCount int + + // If groupStopComplete is true, groupStopPendingCount transitioned from + // non-zero to zero without an intervening SIGCONT. + // + // groupStopComplete is analogous to Linux's SIGNAL_STOP_STOPPED. + // + // groupStopComplete is protected by the signal mutex. + groupStopComplete bool // If groupStopWaitable is true, the thread group is indicating a waitable // group stop event (as defined by EventChildGroupStop). @@ -91,14 +101,9 @@ type ThreadGroup struct { // If groupContNotify is true, then a SIGCONT has recently ended a group // stop on this thread group, and the first task to observe it should - // notify its parent. - // - // groupContNotify is protected by the signal mutex. 
- groupContNotify bool - - // If groupContNotify is true, groupContInterrupted is true iff SIGCONT - // ended a group stop in phase groupStopInitiated. If groupContNotify is - // false, groupContInterrupted is meaningless. + // notify its parent. groupContInterrupted is true iff SIGCONT ended an + // incomplete group stop. If groupContNotify is false, groupContInterrupted is + // meaningless. // // Analogues in Linux: // @@ -110,7 +115,9 @@ type ThreadGroup struct { // // - !groupContNotify is represented by neither flag being set. // - // groupContInterrupted is protected by the signal mutex. + // groupContNotify and groupContInterrupted are protected by the signal + // mutex. + groupContNotify bool groupContInterrupted bool // If groupContWaitable is true, the thread group is indicating a waitable diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc index 6f1701aef..6d5c425d8 100644 --- a/test/syscalls/linux/ptrace.cc +++ b/test/syscalls/linux/ptrace.cc @@ -44,6 +44,20 @@ namespace testing { namespace { +// PTRACE_GETSIGMASK and PTRACE_SETSIGMASK are not defined until glibc 2.23 +// (fb53a27c5741 "Add new header definitions from Linux 4.4 (plus older ptrace +// definitions)"). +constexpr auto kPtraceGetSigMask = static_cast<__ptrace_request>(0x420a); +constexpr auto kPtraceSetSigMask = static_cast<__ptrace_request>(0x420b); + +// PTRACE_SYSEMU is not defined until glibc 2.27 (c48831d0eebf "linux/x86: sync +// sys/ptrace.h with Linux 4.14 [BZ #22433]"). +constexpr auto kPtraceSysemu = static_cast<__ptrace_request>(31); + +// PTRACE_EVENT_STOP is not defined until glibc 2.26 (3f67d1a7021e "Add Linux +// PTRACE_EVENT_STOP"). +constexpr int kPtraceEventStop = 128; + // Sends sig to the current process with tgkill(2). // // glibc's raise(2) may change the signal mask before sending the signal. 
These @@ -146,10 +160,6 @@ TEST(PtraceTest, AttachParent_PeekData_PokeData_SignalSuppression) { } TEST(PtraceTest, GetSigMask) { - // <sys/user.h> doesn't define these until Linux 4.4, even though the features - // were added in 3.11. - constexpr auto kPtraceGetSigMask = static_cast<enum __ptrace_request>(0x420a); - constexpr auto kPtraceSetSigMask = static_cast<enum __ptrace_request>(0x420b); // glibc and the Linux kernel define a sigset_t with different sizes. To avoid // creating a kernel_sigset_t and recreating all the modification functions // (sigemptyset, etc), we just hardcode the kernel sigset size. @@ -878,9 +888,7 @@ TEST(PtraceTest, Sysemu_PokeUser) { << " status " << status; // Suppress the SIGSTOP and wait for the child to enter syscall-enter-stop - // for its first exit_group syscall. glibc doesn't necessarily define - // PTRACE_SYSEMU. - constexpr auto kPtraceSysemu = static_cast<__ptrace_request>(31); + // for its first exit_group syscall. ASSERT_THAT(ptrace(kPtraceSysemu, child_pid, 0, 0), SyscallSucceeds()); ASSERT_THAT(waitpid(child_pid, &status, 0), SyscallSucceedsWithValue(child_pid)); @@ -999,6 +1007,119 @@ TEST(PtraceTest, ERESTART_NoRandomSave) { } #endif // defined(__x86_64__) +TEST(PtraceTest, Seize_Interrupt_Listen) { + volatile long child_should_spin = 1; + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + while (child_should_spin) { + SleepSafe(absl::Seconds(1)); + } + _exit(1); + } + + // In parent process. + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Attach to the child with PTRACE_SEIZE; doing so should not stop the child. + ASSERT_THAT(ptrace(PTRACE_SEIZE, child_pid, 0, 0), SyscallSucceeds()); + int status; + EXPECT_THAT(waitpid(child_pid, &status, WNOHANG), + SyscallSucceedsWithValue(0)); + + // Stop the child with PTRACE_INTERRUPT. 
+ ASSERT_THAT(ptrace(PTRACE_INTERRUPT, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_EQ(SIGTRAP | (kPtraceEventStop << 8), status >> 8); + + // Unset child_should_spin to verify that the child never leaves the spin + // loop. + ASSERT_THAT(ptrace(PTRACE_POKEDATA, child_pid, &child_should_spin, 0), + SyscallSucceeds()); + + // Send SIGSTOP to the child, then resume it, allowing it to proceed to + // signal-delivery-stop. + ASSERT_THAT(kill(child_pid, SIGSTOP), SyscallSucceeds()); + ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << " status " << status; + + // Release the child from signal-delivery-stop without suppressing the + // SIGSTOP, causing it to enter group-stop. + ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, SIGSTOP), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_EQ(SIGSTOP | (kPtraceEventStop << 8), status >> 8); + + // "The state of the tracee after PTRACE_LISTEN is somewhat of a gray area: it + // is not in any ptrace-stop (ptrace commands won't work on it, and it will + // deliver waitpid(2) notifications), but it also may be considered 'stopped' + // because it is not executing instructions (is not scheduled), and if it was + // in group-stop before PTRACE_LISTEN, it will not respond to signals until + // SIGCONT is received." - ptrace(2). 
+ ASSERT_THAT(ptrace(PTRACE_LISTEN, child_pid, 0, 0), SyscallSucceeds()); + EXPECT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), + SyscallFailsWithErrno(ESRCH)); + EXPECT_THAT(waitpid(child_pid, &status, WNOHANG), + SyscallSucceedsWithValue(0)); + EXPECT_THAT(kill(child_pid, SIGTERM), SyscallSucceeds()); + absl::SleepFor(absl::Seconds(1)); + EXPECT_THAT(waitpid(child_pid, &status, WNOHANG), + SyscallSucceedsWithValue(0)); + + // Send SIGCONT to the child, causing it to leave group-stop and re-trap due + // to PTRACE_LISTEN. + EXPECT_THAT(kill(child_pid, SIGCONT), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_EQ(SIGTRAP | (kPtraceEventStop << 8), status >> 8); + + // Detach the child and expect it to exit due to the SIGTERM we sent while + // it was stopped by PTRACE_LISTEN. + ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGTERM) + << " status " << status; +} + +TEST(PtraceTest, Interrupt_Listen_RequireSeize) { + pid_t const child_pid = fork(); + if (child_pid == 0) { + // In child process. + TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0); + MaybeSave(); + raise(SIGSTOP); + _exit(0); + } + // In parent process. + ASSERT_THAT(child_pid, SyscallSucceeds()); + + // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop. + int status; + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) + << " status " << status; + + // PTRACE_INTERRUPT and PTRACE_LISTEN should fail since the child wasn't + // attached with PTRACE_SEIZE, leaving the child in signal-delivery-stop. 
+ EXPECT_THAT(ptrace(PTRACE_INTERRUPT, child_pid, 0, 0), + SyscallFailsWithErrno(EIO)); + EXPECT_THAT(ptrace(PTRACE_LISTEN, child_pid, 0, 0), + SyscallFailsWithErrno(EIO)); + + // Suppress SIGSTOP and detach from the child, expecting it to exit normally. + ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds()); + ASSERT_THAT(waitpid(child_pid, &status, 0), + SyscallSucceedsWithValue(child_pid)); + EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) + << " status " << status; +} + } // namespace } // namespace testing |