diff options
Diffstat (limited to 'pkg/sentry/kernel/ptrace.go')
-rw-r--r-- | pkg/sentry/kernel/ptrace.go | 1105 |
1 files changed, 1105 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go new file mode 100644 index 000000000..4423e7efd --- /dev/null +++ b/pkg/sentry/kernel/ptrace.go @@ -0,0 +1,1105 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernel + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// ptraceOptions are the subset of options controlling a task's ptrace behavior +// that are set by ptrace(PTRACE_SETOPTIONS). +// +// +stateify savable +type ptraceOptions struct { + // ExitKill is true if the tracee should be sent SIGKILL when the tracer + // exits. + ExitKill bool + + // If SysGood is true, set bit 7 in the signal number for + // syscall-entry-stop and syscall-exit-stop traps delivered to this task's + // tracer. + SysGood bool + + // TraceClone is true if the tracer wants to receive PTRACE_EVENT_CLONE + // events. + TraceClone bool + + // TraceExec is true if the tracer wants to receive PTRACE_EVENT_EXEC + // events. + TraceExec bool + + // TraceExit is true if the tracer wants to receive PTRACE_EVENT_EXIT + // events. + TraceExit bool + + // TraceFork is true if the tracer wants to receive PTRACE_EVENT_FORK + // events. + TraceFork bool + + // TraceSeccomp is true if the tracer wants to receive PTRACE_EVENT_SECCOMP + // events. + TraceSeccomp bool + + // TraceVfork is true if the tracer wants to receive PTRACE_EVENT_VFORK + // events. + TraceVfork bool + + // TraceVforkDone is true if the tracer wants to receive + // PTRACE_EVENT_VFORK_DONE events. + TraceVforkDone bool +} + +// ptraceSyscallMode controls the behavior of a ptraced task at syscall entry +// and exit. +type ptraceSyscallMode int + +const ( + // ptraceSyscallNone indicates that the task has never ptrace-stopped, or + // that it was resumed from its last ptrace-stop by PTRACE_CONT or + // PTRACE_DETACH. The task's syscalls will not be intercepted. + ptraceSyscallNone ptraceSyscallMode = iota + + // ptraceSyscallIntercept indicates that the task was resumed from its last + // ptrace-stop by PTRACE_SYSCALL. The next time the task enters or exits a + // syscall, a ptrace-stop will occur. + ptraceSyscallIntercept + + // ptraceSyscallEmu indicates that the task was resumed from its last + // ptrace-stop by PTRACE_SYSEMU or PTRACE_SYSEMU_SINGLESTEP. The next time + // the task enters a syscall, the syscall will be skipped, and a + // ptrace-stop will occur. + ptraceSyscallEmu +) + +// CanTrace checks that t is permitted to access target's state, as defined by +// ptrace(2), subsection "Ptrace access mode checking". If attach is true, it +// checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access +// mode PTRACE_MODE_READ. +func (t *Task) CanTrace(target *Task, attach bool) bool { + // "1. If the calling thread and the target thread are in the same thread + // group, access is always allowed." - ptrace(2) + // + // Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access() + // should not deny sub-threads", first released in Linux 3.12), the rule + // only applies if t and target are the same task. But, as that commit + // message puts it, "[any] security check is pointless when the tasks share + // the same ->mm." + if t.tg == target.tg { + return true + } + + // """ + // 2. If the access mode specifies PTRACE_MODE_FSCREDS (ED: snipped, + // doesn't exist until Linux 4.5). + // + // Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the + // caller's real UID and GID for the checks in the next step. (Most APIs + // that check the caller's UID and GID use the effective IDs. For + // historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs + // instead.) + // + // 3. Deny access if neither of the following is true: + // + // - The real, effective, and saved-set user IDs of the target match the + // caller's user ID, *and* the real, effective, and saved-set group IDs of + // the target match the caller's group ID. + // + // - The caller has the CAP_SYS_PTRACE capability in the user namespace of + // the target. + // + // 4. Deny access if the target process "dumpable" attribute has a value + // other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in + // prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in + // the user namespace of the target process. + // + // 5. The kernel LSM security_ptrace_access_check() interface is invoked to + // see if ptrace access is permitted. The results depend on the LSM(s). The + // implementation of this interface in the commoncap LSM performs the + // following steps: + // + // a) If the access mode includes PTRACE_MODE_FSCREDS, then use the + // caller's effective capability set; otherwise (the access mode specifies + // PTRACE_MODE_REALCREDS, so) use the caller's permitted capability set. + // + // b) Deny access if neither of the following is true: + // + // - The caller and the target process are in the same user namespace, and + // the caller's capabilities are a proper superset of the target process's + // permitted capabilities. + // + // - The caller has the CAP_SYS_PTRACE capability in the target process's + // user namespace. + // + // Note that the commoncap LSM does not distinguish between + // PTRACE_MODE_READ and PTRACE_MODE_ATTACH. (ED: From earlier in this + // section: "the commoncap LSM ... is always invoked".) + // """ + callerCreds := t.Credentials() + targetCreds := target.Credentials() + if callerCreds.HasCapabilityIn(linux.CAP_SYS_PTRACE, targetCreds.UserNamespace) { + return true + } + if cuid := callerCreds.RealKUID; cuid != targetCreds.RealKUID || cuid != targetCreds.EffectiveKUID || cuid != targetCreds.SavedKUID { + return false + } + if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID { + return false + } + // TODO(b/31916171): dumpability check + if callerCreds.UserNamespace != targetCreds.UserNamespace { + return false + } + if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 { + return false + } + // TODO: Yama LSM + return true +} + +// Tracer returns t's ptrace Tracer. +func (t *Task) Tracer() *Task { + return t.ptraceTracer.Load().(*Task) +} + +// hasTracer returns true if t has a ptrace tracer attached. +func (t *Task) hasTracer() bool { + // This isn't just inlined into callers so that if Task.Tracer() turns out + // to be too expensive because of e.g. interface conversion, we can switch + // to having a separate atomic flag more easily. + return t.Tracer() != nil +} + +// ptraceStop is a TaskStop placed on tasks in a ptrace-stop. +// +// +stateify savable +type ptraceStop struct { + // If frozen is true, the stopped task's tracer is currently operating on + // it, so Task.Kill should not remove the stop. + frozen bool + + // If listen is true, the stopped task's tracer invoked PTRACE_LISTEN, so + // ptraceFreeze should fail. + listen bool +} + +// Killable implements TaskStop.Killable. +func (s *ptraceStop) Killable() bool { + return !s.frozen +} + +// beginPtraceStopLocked initiates an unfrozen ptrace-stop on t. If t has been +// killed, the stop is skipped, and beginPtraceStopLocked returns false. +// +// beginPtraceStopLocked does not signal t's tracer or wake it if it is +// waiting. +// +// Preconditions: The TaskSet mutex must be locked. The caller must be running +// on the task goroutine. +func (t *Task) beginPtraceStopLocked() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // This is analogous to Linux's kernel/signal.c:ptrace_stop() => ... => + // kernel/sched/core.c:__schedule() => signal_pending_state() check, which + // is what prevents tasks from entering ptrace-stops after being killed. + // Note that if t was SIGKILLed and beingPtraceStopLocked is being called + // for PTRACE_EVENT_EXIT, the task will have dequeued the signal before + // entering the exit path, so t.killedLocked() will no longer return true. + // This is consistent with Linux: "Bugs: ... A SIGKILL signal may still + // cause a PTRACE_EVENT_EXIT stop before actual signal death. This may be + // changed in the future; SIGKILL is meant to always immediately kill tasks + // even under ptrace. Last confirmed on Linux 3.13." - ptrace(2) + if t.killedLocked() { + return false + } + t.beginInternalStopLocked(&ptraceStop{}) + return true +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceTrapLocked(code int32) { + // This is unconditional in ptrace_stop(). + t.tg.signalHandlers.mu.Lock() + t.trapStopPending = false + t.tg.signalHandlers.mu.Unlock() + t.ptraceCode = code + t.ptraceSiginfo = &arch.SignalInfo{ + Signo: int32(linux.SIGTRAP), + Code: code, + } + t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t])) + t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) + if t.beginPtraceStopLocked() { + tracer := t.Tracer() + tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP)) + tracer.tg.eventQueue.Notify(EventTraceeStop) + } +} + +// ptraceFreeze checks if t is in a ptraceStop. If so, it freezes the +// ptraceStop, temporarily preventing it from being removed by a concurrent +// Task.Kill, and returns true. Otherwise it returns false. +// +// Preconditions: The TaskSet mutex must be locked. The caller must be running +// on the task goroutine of t's tracer. +func (t *Task) ptraceFreeze() bool { + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + if t.stop == nil { + return false + } + s, ok := t.stop.(*ptraceStop) + if !ok { + return false + } + if s.listen { + return false + } + s.frozen = true + return true +} + +// ptraceUnfreeze ends the effect of a previous successful call to +// ptraceFreeze. +// +// Preconditions: t must be in a frozen ptraceStop. +func (t *Task) ptraceUnfreeze() { + // t.tg.signalHandlers is stable because t is in a frozen ptrace-stop, + // preventing its thread group from completing execve. + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.ptraceUnfreezeLocked() +} + +// Preconditions: t must be in a frozen ptraceStop. t's signal mutex must be +// locked. +func (t *Task) ptraceUnfreezeLocked() { + // Do this even if the task has been killed to ensure a panic if t.stop is + // nil or not a ptraceStop. + t.stop.(*ptraceStop).frozen = false + if t.killedLocked() { + t.endInternalStopLocked() + } +} + +// ptraceUnstop implements ptrace request PTRACE_CONT, PTRACE_SYSCALL, +// PTRACE_SINGLESTEP, PTRACE_SYSEMU, or PTRACE_SYSEMU_SINGLESTEP depending on +// mode and singlestep. +// +// Preconditions: t must be in a frozen ptrace stop. +// +// Postconditions: If ptraceUnstop returns nil, t will no longer be in a ptrace +// stop. +func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error { + if sig != 0 && !sig.IsValid() { + return syserror.EIO + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + t.ptraceCode = int32(sig) + t.ptraceSyscallMode = mode + t.ptraceSinglestep = singlestep + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.endInternalStopLocked() + return nil +} + +func (t *Task) ptraceTraceme() error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if t.hasTracer() { + return syserror.EPERM + } + if t.parent == nil { + // In Linux, only init can not have a parent, and init is assumed never + // to invoke PTRACE_TRACEME. In the sentry, TGID 1 is an arbitrary user + // application that may invoke PTRACE_TRACEME; having no parent can + // also occur if all tasks in the parent thread group have exited, and + // failed to find a living thread group to reparent to. The former case + // is treated as if TGID 1 has an exited parent in an invisible + // ancestor PID namespace that is an owner of the root user namespace + // (and consequently has CAP_SYS_PTRACE), and the latter case is a + // special form of the exited parent case below. In either case, + // returning nil here is correct. + return nil + } + if !t.parent.CanTrace(t, true) { + return syserror.EPERM + } + if t.parent.exitState != TaskExitNone { + // Fail silently, as if we were successfully attached but then + // immediately detached. This is consistent with Linux. + return nil + } + t.ptraceTracer.Store(t.parent) + t.parent.ptraceTracees[t] = struct{}{} + return nil +} + +// ptraceAttach implements ptrace(PTRACE_ATTACH, target) if seize is false, and +// ptrace(PTRACE_SEIZE, target, 0, opts) if seize is true. t is the caller. +func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { + if t.tg == target.tg { + return syserror.EPERM + } + if !t.CanTrace(target, true) { + return syserror.EPERM + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.hasTracer() { + return syserror.EPERM + } + // Attaching to zombies and dead tasks is not permitted; the exit + // notification logic relies on this. Linux allows attaching to PF_EXITING + // tasks, though. + if target.exitState >= TaskExitZombie { + return syserror.EPERM + } + if seize { + if err := target.ptraceSetOptionsLocked(opts); err != nil { + return syserror.EIO + } + } + target.ptraceTracer.Store(t) + t.ptraceTracees[target] = struct{}{} + target.ptraceSeized = seize + target.tg.signalHandlers.mu.Lock() + // "Unlike PTRACE_ATTACH, PTRACE_SEIZE does not stop the process." - + // ptrace(2) + if !seize { + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + Code: arch.SignalInfoUser, + }, false /* group */) + } + // Undocumented Linux feature: If the tracee is already group-stopped (and + // consequently will not report the SIGSTOP just sent), force it to leave + // and re-enter the stop so that it will switch to a ptrace-stop. + if target.stop == (*groupStop)(nil) { + target.trapStopPending = true + target.endInternalStopLocked() + // TODO(jamieliu): Linux blocks ptrace_attach() until the task has + // entered the ptrace-stop (or exited) via JOBCTL_TRAPPING. + } + target.tg.signalHandlers.mu.Unlock() + return nil +} + +// ptraceDetach implements ptrace(PTRACE_DETACH, target, 0, sig). t is the +// caller. +// +// Preconditions: target must be a tracee of t in a frozen ptrace stop. +// +// Postconditions: If ptraceDetach returns nil, target will no longer be in a +// ptrace stop. +func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error { + if sig != 0 && !sig.IsValid() { + return syserror.EIO + } + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + target.ptraceCode = int32(sig) + target.forgetTracerLocked() + delete(t.ptraceTracees, target) + return nil +} + +// exitPtrace is called in the exit path to detach all of t's tracees. +func (t *Task) exitPtrace() { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + for target := range t.ptraceTracees { + if target.ptraceOpts.ExitKill { + target.tg.signalHandlers.mu.Lock() + target.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGKILL), + }, false /* group */) + target.tg.signalHandlers.mu.Unlock() + } + // Leave ptraceCode unchanged so that if the task is ptrace-stopped, it + // observes the ptraceCode it set before it entered the stop. I believe + // this is consistent with Linux. + target.forgetTracerLocked() + } + // "nil maps cannot be saved" + t.ptraceTracees = make(map[*Task]struct{}) +} + +// forgetTracerLocked detaches t's tracer and ensures that t is no longer +// ptrace-stopped. +// +// Preconditions: The TaskSet mutex must be locked for writing. +func (t *Task) forgetTracerLocked() { + t.ptraceSeized = false + t.ptraceOpts = ptraceOptions{} + t.ptraceSyscallMode = ptraceSyscallNone + t.ptraceSinglestep = false + t.ptraceTracer.Store((*Task)(nil)) + if t.exitTracerNotified && !t.exitTracerAcked { + t.exitTracerAcked = true + t.exitNotifyLocked(true) + } + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + // Unset t.trapStopPending, which might have been set by PTRACE_INTERRUPT. If + // it wasn't, it will be reset via t.groupStopPending after the following. + t.trapStopPending = false + // If t's thread group is in a group stop and t is eligible to participate, + // make it do so. This is essentially the reverse of the special case in + // ptraceAttach, which converts a group stop to a ptrace stop. ("Handling + // of restart from group-stop is currently buggy, but the "as planned" + // behavior is to leave tracee stopped and waiting for SIGCONT." - + // ptrace(2)) + if (t.tg.groupStopComplete || t.tg.groupStopPendingCount != 0) && !t.groupStopPending && t.exitState < TaskExitInitiated { + t.groupStopPending = true + // t already participated in the group stop when it unset + // groupStopPending. + t.groupStopAcknowledged = true + t.interrupt() + } + if _, ok := t.stop.(*ptraceStop); ok { + t.endInternalStopLocked() + } +} + +// ptraceSignalLocked is called after signal dequeueing to check if t should +// enter ptrace signal-delivery-stop. +// +// Preconditions: The signal mutex must be locked. The caller must be running +// on the task goroutine. +func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool { + if linux.Signal(info.Signo) == linux.SIGKILL { + return false + } + if !t.hasTracer() { + return false + } + // The tracer might change this signal into a stop signal, in which case + // any SIGCONT received after the signal was originally dequeued should + // cancel it. This is consistent with Linux. + t.tg.groupStopDequeued = true + // This is unconditional in ptrace_stop(). + t.trapStopPending = false + // Can't lock the TaskSet mutex while holding a signal mutex. + t.tg.signalHandlers.mu.Unlock() + defer t.tg.signalHandlers.mu.Lock() + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + tracer := t.Tracer() + if tracer == nil { + return false + } + t.ptraceCode = info.Signo + t.ptraceSiginfo = info + t.Debugf("Entering signal-delivery-stop for signal %d", info.Signo) + if t.beginPtraceStopLocked() { + tracer.signalStop(t, arch.CLD_TRAPPED, info.Signo) + tracer.tg.eventQueue.Notify(EventTraceeStop) + } + return true +} + +// ptraceSeccomp is called when a seccomp-bpf filter returns action +// SECCOMP_RET_TRACE to check if t should enter PTRACE_EVENT_SECCOMP stop. data +// is the lower 16 bits of the filter's return value. +func (t *Task) ptraceSeccomp(data uint16) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceSeccomp { + return false + } + t.Debugf("Entering PTRACE_EVENT_SECCOMP stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_SECCOMP, uint64(data)) + return true +} + +// ptraceSyscallEnter is called immediately before entering a syscall to check +// if t should enter ptrace syscall-enter-stop. +func (t *Task) ptraceSyscallEnter() (taskRunState, bool) { + if !t.hasTracer() { + return nil, false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + switch t.ptraceSyscallMode { + case ptraceSyscallNone: + return nil, false + case ptraceSyscallIntercept: + t.Debugf("Entering syscall-enter-stop from PTRACE_SYSCALL") + t.ptraceSyscallStopLocked() + return (*runSyscallAfterSyscallEnterStop)(nil), true + case ptraceSyscallEmu: + t.Debugf("Entering syscall-enter-stop from PTRACE_SYSEMU") + t.ptraceSyscallStopLocked() + return (*runSyscallAfterSysemuStop)(nil), true + } + panic(fmt.Sprintf("Unknown ptraceSyscallMode: %v", t.ptraceSyscallMode)) +} + +// ptraceSyscallExit is called immediately after leaving a syscall to check if +// t should enter ptrace syscall-exit-stop. +func (t *Task) ptraceSyscallExit() { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if t.ptraceSyscallMode != ptraceSyscallIntercept { + return + } + t.Debugf("Entering syscall-exit-stop") + t.ptraceSyscallStopLocked() +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceSyscallStopLocked() { + code := int32(linux.SIGTRAP) + if t.ptraceOpts.SysGood { + code |= 0x80 + } + t.ptraceTrapLocked(code) +} + +type ptraceCloneKind int32 + +const ( + // ptraceCloneKindClone represents a call to Task.Clone where + // TerminationSignal is not SIGCHLD and Vfork is false. + ptraceCloneKindClone ptraceCloneKind = iota + + // ptraceCloneKindFork represents a call to Task.Clone where + // TerminationSignal is SIGCHLD and Vfork is false. + ptraceCloneKindFork + + // ptraceCloneKindVfork represents a call to Task.Clone where Vfork is + // true. + ptraceCloneKindVfork +) + +// ptraceClone is called at the end of a clone or fork syscall to check if t +// should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK +// stop. child is the new task. +func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + event := false + if !opts.Untraced { + switch kind { + case ptraceCloneKindClone: + if t.ptraceOpts.TraceClone { + t.Debugf("Entering PTRACE_EVENT_CLONE stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child])) + event = true + } + case ptraceCloneKindFork: + if t.ptraceOpts.TraceFork { + t.Debugf("Entering PTRACE_EVENT_FORK stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child])) + event = true + } + case ptraceCloneKindVfork: + if t.ptraceOpts.TraceVfork { + t.Debugf("Entering PTRACE_EVENT_VFORK stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child])) + event = true + } + default: + panic(fmt.Sprintf("Unknown ptraceCloneKind: %v", kind)) + } + } + // "If the PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or PTRACE_O_TRACECLONE + // options are in effect, then children created by, respectively, vfork(2) + // or clone(2) with the CLONE_VFORK flag, fork(2) or clone(2) with the exit + // signal set to SIGCHLD, and other kinds of clone(2), are automatically + // attached to the same tracer which traced their parent. SIGSTOP is + // delivered to the children, causing them to enter signal-delivery-stop + // after they exit the system call which created them." - ptrace(2) + // + // clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is + // confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() => + // include/linux/ptrace.h:ptrace_init_task(). + if event || opts.InheritTracer { + tracer := t.Tracer() + if tracer != nil { + child.ptraceTracer.Store(tracer) + tracer.ptraceTracees[child] = struct{}{} + // "The "seized" behavior ... is inherited by children that are + // automatically attached using PTRACE_O_TRACEFORK, + // PTRACE_O_TRACEVFORK, and PTRACE_O_TRACECLONE." - ptrace(2) + child.ptraceSeized = t.ptraceSeized + // "Flags are inherited by new tracees created and "auto-attached" + // via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or + // PTRACE_O_TRACECLONE options." - ptrace(2) + child.ptraceOpts = t.ptraceOpts + child.tg.signalHandlers.mu.Lock() + // "PTRACE_SEIZE: ... Automatically attached children stop with + // PTRACE_EVENT_STOP and WSTOPSIG(status) returns SIGTRAP instead + // of having SIGSTOP signal delivered to them." - ptrace(2) + if child.ptraceSeized { + child.trapStopPending = true + } else { + child.pendingSignals.enqueue(&arch.SignalInfo{ + Signo: int32(linux.SIGSTOP), + }, nil) + } + // The child will self-interrupt() when its task goroutine starts + // running, so we don't have to. + child.tg.signalHandlers.mu.Unlock() + } + } + return event +} + +// ptraceVforkDone is called after the end of a vfork stop to check if t should +// enter PTRACE_EVENT_VFORK_DONE stop. child is the new task's thread ID in t's +// PID namespace. +func (t *Task) ptraceVforkDone(child ThreadID) bool { + if !t.hasTracer() { + return false + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceVforkDone { + return false + } + t.Debugf("Entering PTRACE_EVENT_VFORK_DONE stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK_DONE, uint64(child)) + return true +} + +// ptraceExec is called at the end of an execve syscall to check if t should +// enter PTRACE_EVENT_EXEC stop. oldTID is t's thread ID, in its *tracer's* PID +// namespace, prior to the execve. (If t did not have a tracer at the time +// oldTID was read, oldTID may be 0. This is consistent with Linux.) +func (t *Task) ptraceExec(oldTID ThreadID) { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + // Recheck with the TaskSet mutex locked. Most ptrace points don't need to + // do this because detaching resets ptrace options, but PTRACE_EVENT_EXEC + // is special because both TraceExec and !TraceExec do something if a + // tracer is attached. + if !t.hasTracer() { + return + } + if t.ptraceOpts.TraceExec { + t.Debugf("Entering PTRACE_EVENT_EXEC stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_EXEC, uint64(oldTID)) + return + } + // "If the PTRACE_O_TRACEEXEC option is not in effect for the execing + // tracee, and if the tracee was PTRACE_ATTACHed rather that [sic] + // PTRACE_SEIZEd, the kernel delivers an extra SIGTRAP to the tracee after + // execve(2) returns. This is an ordinary signal (similar to one which can + // be generated by `kill -TRAP`, not a special kind of ptrace-stop. + // Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0 + // (SI_USER). This signal may be blocked by signal mask, and thus may be + // delivered (much) later." - ptrace(2) + if t.ptraceSeized { + return + } + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + t.sendSignalLocked(&arch.SignalInfo{ + Signo: int32(linux.SIGTRAP), + Code: arch.SignalInfoUser, + }, false /* group */) +} + +// ptraceExit is called early in the task exit path to check if t should enter +// PTRACE_EVENT_EXIT stop. +func (t *Task) ptraceExit() { + if !t.hasTracer() { + return + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !t.ptraceOpts.TraceExit { + return + } + t.tg.signalHandlers.mu.Lock() + status := t.exitStatus.Status() + t.tg.signalHandlers.mu.Unlock() + t.Debugf("Entering PTRACE_EVENT_EXIT stop") + t.ptraceEventLocked(linux.PTRACE_EVENT_EXIT, uint64(status)) +} + +// Preconditions: The TaskSet mutex must be locked. +func (t *Task) ptraceEventLocked(event int32, msg uint64) { + t.ptraceEventMsg = msg + // """ + // PTRACE_EVENT stops are observed by the tracer as waitpid(2) returning + // with WIFSTOPPED(status), and WSTOPSIG(status) returns SIGTRAP. An + // additional bit is set in the higher byte of the status word: the value + // status>>8 will be + // + // (SIGTRAP | PTRACE_EVENT_foo << 8). + // + // ... + // + // """ - ptrace(2) + t.ptraceTrapLocked(int32(linux.SIGTRAP) | (event << 8)) +} + +// ptraceKill implements ptrace(PTRACE_KILL, target). t is the caller. +func (t *Task) ptraceKill(target *Task) error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.Tracer() != t { + return syserror.ESRCH + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + // "This operation is deprecated; do not use it! Instead, send a SIGKILL + // directly using kill(2) or tgkill(2). The problem with PTRACE_KILL is + // that it requires the tracee to be in signal-delivery-stop, otherwise it + // may not work (i.e., may complete successfully but won't kill the + // tracee)." - ptrace(2) + if target.stop == nil { + return nil + } + if _, ok := target.stop.(*ptraceStop); !ok { + return nil + } + target.ptraceCode = int32(linux.SIGKILL) + target.endInternalStopLocked() + return nil +} + +func (t *Task) ptraceInterrupt(target *Task) error { + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + if target.Tracer() != t { + return syserror.ESRCH + } + if !target.ptraceSeized { + return syserror.EIO + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.killedLocked() || target.exitState >= TaskExitInitiated { + return nil + } + target.trapStopPending = true + if s, ok := target.stop.(*ptraceStop); ok && s.listen { + target.endInternalStopLocked() + } + target.interrupt() + return nil +} + +// Preconditions: The TaskSet mutex must be locked for writing. t must have a +// tracer. +func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { + const valid = uintptr(linux.PTRACE_O_EXITKILL | + linux.PTRACE_O_TRACESYSGOOD | + linux.PTRACE_O_TRACECLONE | + linux.PTRACE_O_TRACEEXEC | + linux.PTRACE_O_TRACEEXIT | + linux.PTRACE_O_TRACEFORK | + linux.PTRACE_O_TRACESECCOMP | + linux.PTRACE_O_TRACEVFORK | + linux.PTRACE_O_TRACEVFORKDONE) + if opts&^valid != 0 { + return syserror.EINVAL + } + t.ptraceOpts = ptraceOptions{ + ExitKill: opts&linux.PTRACE_O_EXITKILL != 0, + SysGood: opts&linux.PTRACE_O_TRACESYSGOOD != 0, + TraceClone: opts&linux.PTRACE_O_TRACECLONE != 0, + TraceExec: opts&linux.PTRACE_O_TRACEEXEC != 0, + TraceExit: opts&linux.PTRACE_O_TRACEEXIT != 0, + TraceFork: opts&linux.PTRACE_O_TRACEFORK != 0, + TraceSeccomp: opts&linux.PTRACE_O_TRACESECCOMP != 0, + TraceVfork: opts&linux.PTRACE_O_TRACEVFORK != 0, + TraceVforkDone: opts&linux.PTRACE_O_TRACEVFORKDONE != 0, + } + return nil +} + +// Ptrace implements the ptrace system call. +func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error { + // PTRACE_TRACEME ignores all other arguments. + if req == linux.PTRACE_TRACEME { + return t.ptraceTraceme() + } + // All other ptrace requests operate on a current or future tracee + // specified by pid. + target := t.tg.pidns.TaskWithID(pid) + if target == nil { + return syserror.ESRCH + } + + // PTRACE_ATTACH and PTRACE_SEIZE do not require that target is not already + // a tracee. + if req == linux.PTRACE_ATTACH || req == linux.PTRACE_SEIZE { + seize := req == linux.PTRACE_SEIZE + if seize && addr != 0 { + return syserror.EIO + } + return t.ptraceAttach(target, seize, uintptr(data)) + } + // PTRACE_KILL and PTRACE_INTERRUPT require that the target is a tracee, + // but does not require that it is ptrace-stopped. + if req == linux.PTRACE_KILL { + return t.ptraceKill(target) + } + if req == linux.PTRACE_INTERRUPT { + return t.ptraceInterrupt(target) + } + // All other ptrace requests require that the target is a ptrace-stopped + // tracee, and freeze the ptrace-stop so the tracee can be operated on. + t.tg.pidns.owner.mu.RLock() + if target.Tracer() != t { + t.tg.pidns.owner.mu.RUnlock() + return syserror.ESRCH + } + if !target.ptraceFreeze() { + t.tg.pidns.owner.mu.RUnlock() + // "Most ptrace commands (all except PTRACE_ATTACH, PTRACE_SEIZE, + // PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the + // tracee to be in a ptrace-stop, otherwise they fail with ESRCH." - + // ptrace(2) + return syserror.ESRCH + } + t.tg.pidns.owner.mu.RUnlock() + // Even if the target has a ptrace-stop active, the tracee's task goroutine + // may not yet have reached Task.doStop; wait for it to do so. This is safe + // because there's no way for target to initiate a ptrace-stop and then + // block (by calling Task.block) before entering it. + // + // Caveat: If tasks were just restored, the tracee's first call to + // Task.Activate (in Task.run) occurs before its first call to Task.doStop, + // which may block if the tracer's address space is active. + t.UninterruptibleSleepStart(true) + target.waitGoroutineStoppedOrExited() + t.UninterruptibleSleepFinish(true) + + // Resuming commands end the ptrace stop, but only if successful. + // PTRACE_LISTEN ends the ptrace stop if trapNotifyPending is already set on the + // target. + switch req { + case linux.PTRACE_DETACH: + if err := t.ptraceDetach(target, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_CONT: + if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SYSCALL: + if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SINGLESTEP: + if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SYSEMU: + if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_SYSEMU_SINGLESTEP: + if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil { + target.ptraceUnfreeze() + return err + } + return nil + + case linux.PTRACE_LISTEN: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if !target.ptraceSeized { + return syserror.EIO + } + if target.ptraceSiginfo == nil { + return syserror.EIO + } + if target.ptraceSiginfo.Code>>8 != linux.PTRACE_EVENT_STOP { + return syserror.EIO + } + target.tg.signalHandlers.mu.Lock() + defer target.tg.signalHandlers.mu.Unlock() + if target.trapNotifyPending { + target.endInternalStopLocked() + } else { + target.stop.(*ptraceStop).listen = true + target.ptraceUnfreezeLocked() + } + return nil + } + + // All other ptrace requests expect us to unfreeze the stop. + defer target.ptraceUnfreeze() + + switch req { + case linux.PTRACE_PEEKTEXT, linux.PTRACE_PEEKDATA: + // "At the system call level, the PTRACE_PEEKTEXT, PTRACE_PEEKDATA, and + // PTRACE_PEEKUSER requests have a different API: they store the result + // at the address specified by the data parameter, and the return value + // is the error flag." - ptrace(2) + word := t.Arch().Native(0) + if _, err := usermem.CopyObjectIn(t, target.MemoryManager(), addr, word, usermem.IOOpts{ + IgnorePermissions: true, + }); err != nil { + return err + } + _, err := t.CopyOut(data, word) + return err + + case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA: + _, err := usermem.CopyObjectOut(t, target.MemoryManager(), addr, t.Arch().Native(uintptr(data)), usermem.IOOpts{ + IgnorePermissions: true, + }) + return err + + case linux.PTRACE_GETREGSET: + // "Read the tracee's registers. addr specifies, in an + // architecture-dependent way, the type of registers to be read. ... + // data points to a struct iovec, which describes the destination + // buffer's location and length. On return, the kernel modifies iov.len + // to indicate the actual number of bytes returned." - ptrace(2) + ars, err := t.CopyInIovecs(data, 1) + if err != nil { + return err + } + ar := ars.Head() + n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: ar.Start, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }, int(ar.Length())) + if err != nil { + return err + } + + // Update iovecs to represent the range of the written register set. + end, ok := ar.Start.AddLength(uint64(n)) + if !ok { + panic(fmt.Sprintf("%#x + %#x overflows. Invalid reg size > %#x", ar.Start, n, ar.Length())) + } + ar.End = end + return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + + case linux.PTRACE_SETREGSET: + ars, err := t.CopyInIovecs(data, 1) + if err != nil { + return err + } + ar := ars.Head() + n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{ + Ctx: t, + IO: t.MemoryManager(), + Addr: ar.Start, + Opts: usermem.IOOpts{ + AddressSpaceActive: true, + }, + }, int(ar.Length())) + if err != nil { + return err + } + ar.End -= usermem.Addr(n) + return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar)) + + case linux.PTRACE_GETSIGINFO: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if target.ptraceSiginfo == nil { + return syserror.EINVAL + } + _, err := t.CopyOut(data, target.ptraceSiginfo) + return err + + case linux.PTRACE_SETSIGINFO: + var info arch.SignalInfo + if _, err := t.CopyIn(data, &info); err != nil { + return err + } + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + if target.ptraceSiginfo == nil { + return syserror.EINVAL + } + target.ptraceSiginfo = &info + return nil + + case linux.PTRACE_GETSIGMASK: + if addr != linux.SignalSetSize { + return syserror.EINVAL + } + _, err := t.CopyOut(data, target.SignalMask()) + return err + + case linux.PTRACE_SETSIGMASK: + if addr != linux.SignalSetSize { + return syserror.EINVAL + } + var mask linux.SignalSet + if _, err := t.CopyIn(data, &mask); err != nil { + return err + } + // The target's task goroutine is stopped, so this is safe: + target.SetSignalMask(mask &^ UnblockableSignals) + return nil + + case linux.PTRACE_SETOPTIONS: + t.tg.pidns.owner.mu.Lock() + defer t.tg.pidns.owner.mu.Unlock() + return target.ptraceSetOptionsLocked(uintptr(data)) + + case linux.PTRACE_GETEVENTMSG: + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + _, err := t.CopyOut(usermem.Addr(data), target.ptraceEventMsg) + return err + + // PEEKSIGINFO is unimplemented but seems to have no users anywhere. + + default: + return t.ptraceArch(target, req, addr, data) + } +} |