// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// ptraceOptions are the subset of options controlling a task's ptrace behavior
// that are set by ptrace(PTRACE_SETOPTIONS).
//
// +stateify savable
type ptraceOptions struct {
	// ExitKill is true if the tracee should be sent SIGKILL when the tracer
	// exits.
	ExitKill bool

	// If SysGood is true, set bit 7 in the signal number for
	// syscall-entry-stop and syscall-exit-stop traps delivered to this task's
	// tracer.
	SysGood bool

	// TraceClone is true if the tracer wants to receive PTRACE_EVENT_CLONE
	// events.
	TraceClone bool

	// TraceExec is true if the tracer wants to receive PTRACE_EVENT_EXEC
	// events.
	TraceExec bool

	// TraceExit is true if the tracer wants to receive PTRACE_EVENT_EXIT
	// events.
	TraceExit bool

	// TraceFork is true if the tracer wants to receive PTRACE_EVENT_FORK
	// events.
	TraceFork bool

	// TraceSeccomp is true if the tracer wants to receive PTRACE_EVENT_SECCOMP
	// events.
	TraceSeccomp bool

	// TraceVfork is true if the tracer wants to receive PTRACE_EVENT_VFORK
	// events.
	TraceVfork bool

	// TraceVforkDone is true if the tracer wants to receive
	// PTRACE_EVENT_VFORK_DONE events.
	TraceVforkDone bool
}

// ptraceSyscallMode controls the behavior of a ptraced task at syscall entry
// and exit.
type ptraceSyscallMode int

const (
	// ptraceSyscallNone indicates that the task has never ptrace-stopped, or
	// that it was resumed from its last ptrace-stop by PTRACE_CONT or
	// PTRACE_DETACH. The task's syscalls will not be intercepted.
	ptraceSyscallNone ptraceSyscallMode = iota

	// ptraceSyscallIntercept indicates that the task was resumed from its last
	// ptrace-stop by PTRACE_SYSCALL. The next time the task enters or exits a
	// syscall, a ptrace-stop will occur.
	ptraceSyscallIntercept

	// ptraceSyscallEmu indicates that the task was resumed from its last
	// ptrace-stop by PTRACE_SYSEMU or PTRACE_SYSEMU_SINGLESTEP. The next time
	// the task enters a syscall, the syscall will be skipped, and a
	// ptrace-stop will occur.
	ptraceSyscallEmu
)

// CanTrace checks that t is permitted to access target's state, as defined by
// ptrace(2), subsection "Ptrace access mode checking". If attach is true, it
// checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access
// mode PTRACE_MODE_READ.
//
// In Linux, ptrace access restrictions may be configured by LSMs. While we do
// not support LSMs, we do add additional restrictions based on the commoncap
// and YAMA LSMs.
//
// TODO(gvisor.dev/issue/212): The result of CanTrace is immediately stale (e.g., a
// racing setuid(2) may change traceability). This may pose a risk when a task
// changes from traceable to not traceable. This is only problematic across
// execve, where privileges may increase.
//
// We currently do not implement privileged executables (set-user/group-ID bits
// and file capabilities), so that case is not reachable.
//
// CanTrace must not be called with the TaskSet mutex held; callers that
// already hold it should use canTraceLocked instead, which performs the same
// checks without acquiring the mutex.
func (t *Task) CanTrace(target *Task, attach bool) bool {
	// "If the calling thread and the target thread are in the same thread
	// group, access is always allowed." - ptrace(2)
	//
	// Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access()
	// should not deny sub-threads", first released in Linux 3.12), the rule
	// only applies if t and target are the same task. But, as that commit
	// message puts it, "[any] security check is pointless when the tasks share
	// the same ->mm."
	if t.tg == target.tg {
		return true
	}

	if !t.canTraceStandard(target, attach) {
		return false
	}

	// YAMA only supported for vfs2.
	if !VFS2Enabled {
		return true
	}

	if atomic.LoadInt32(&t.k.YAMAPtraceScope) == linux.YAMA_SCOPE_RELATIONAL {
		// The YAMA relationship checks walk task ancestry and read the
		// tracer relationship, both of which require the TaskSet mutex.
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		if !t.canTraceYAMALocked(target) {
			return false
		}
	}
	return true
}

// canTraceLocked is the same as CanTrace, except the caller must already hold
// the TaskSet mutex (for reading or writing).
func (t *Task) canTraceLocked(target *Task, attach bool) bool {
	if t.tg == target.tg {
		return true
	}

	if !t.canTraceStandard(target, attach) {
		return false
	}

	// YAMA only supported for vfs2.
	if !VFS2Enabled {
		return true
	}

	if atomic.LoadInt32(&t.k.YAMAPtraceScope) == linux.YAMA_SCOPE_RELATIONAL {
		if !t.canTraceYAMALocked(target) {
			return false
		}
	}
	return true
}

// canTraceStandard performs standard ptrace access checks as defined by
// kernel/ptrace.c:__ptrace_may_access as well as the commoncap LSM
// implementation of the security_ptrace_access_check() interface, which is
// always invoked.
func (t *Task) canTraceStandard(target *Task, attach bool) bool {
	// """
	// TODO(gvisor.dev/issue/260): 1. If the access mode specifies
	// PTRACE_MODE_FSCREDS (ED: snipped, doesn't exist until Linux 4.5).
	//
	// Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the
	// caller's real UID and GID for the checks in the next step. (Most APIs
	// that check the caller's UID and GID use the effective IDs. For
	// historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs
	// instead.)
	//
	// 2. Deny access if neither of the following is true:
	//
	// - The real, effective, and saved-set user IDs of the target match the
	// caller's user ID, *and* the real, effective, and saved-set group IDs of
	// the target match the caller's group ID.
	//
	// - The caller has the CAP_SYS_PTRACE capability in the user namespace of
	// the target.
	//
	// 3. Deny access if the target process "dumpable" attribute has a value
	// other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in
	// prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in
	// the user namespace of the target process.
	//
	// 4. The commoncap LSM performs the following steps:
	//
	// a) If the access mode includes PTRACE_MODE_FSCREDS, then use the
	// caller's effective capability set; otherwise (the access mode specifies
	// PTRACE_MODE_REALCREDS, so) use the caller's permitted capability set.
	//
	// b) Deny access if neither of the following is true:
	//
	// - The caller and the target process are in the same user namespace, and
	// the caller's capabilities are a proper superset of the target process's
	// permitted capabilities.
	//
	// - The caller has the CAP_SYS_PTRACE capability in the target process's
	// user namespace.
	//
	// Note that the commoncap LSM does not distinguish between
	// PTRACE_MODE_READ and PTRACE_MODE_ATTACH. (ED: From earlier in this
	// section: "the commoncap LSM ... is always invoked".)
	// """
	//
	// NOTE(review): the attach parameter is currently unused in this body;
	// presumably it is reserved for the PTRACE_MODE_FSCREDS distinction
	// referenced by the TODO above.
	callerCreds := t.Credentials()
	targetCreds := target.Credentials()
	// CAP_SYS_PTRACE in the target's user namespace short-circuits all of
	// the credential and dumpability checks below.
	if callerCreds.HasCapabilityIn(linux.CAP_SYS_PTRACE, targetCreds.UserNamespace) {
		return true
	}
	if cuid := callerCreds.RealKUID; cuid != targetCreds.RealKUID || cuid != targetCreds.EffectiveKUID || cuid != targetCreds.SavedKUID {
		return false
	}
	if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID {
		return false
	}
	var targetMM *mm.MemoryManager
	target.WithMuLocked(func(t *Task) {
		targetMM = t.MemoryManager()
	})
	if targetMM != nil && targetMM.Dumpability() != mm.UserDumpable {
		return false
	}
	if callerCreds.UserNamespace != targetCreds.UserNamespace {
		return false
	}
	if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 {
		return false
	}
	return true
}

// canTraceYAMALocked performs ptrace access checks as defined by the YAMA LSM
// implementation of the security_ptrace_access_check() interface, with YAMA
// configured to mode 1. This is a common default among various Linux
// distributions.
//
// It only permits the tracer to proceed if one of the following conditions is
// met:
//
// a) The tracer is already attached to the tracee.
//
// b) The target is a descendant of the tracer.
//
// c) The target has explicitly given permission to the tracer through the
// PR_SET_PTRACER prctl.
//
// d) The tracer has CAP_SYS_PTRACE.
//
// See security/yama/yama_lsm.c:yama_ptrace_access_check.
//
// Precondition: the TaskSet mutex must be locked (for reading or writing).
func (t *Task) canTraceYAMALocked(target *Task) bool {
	// Condition (a): the tracer's thread group is already attached.
	if tracer := target.Tracer(); tracer != nil {
		if tracer.tg == t.tg {
			return true
		}
	}
	// Condition (b): the target descends from the tracer.
	if target.isYAMADescendantOfLocked(t) {
		return true
	}
	// Condition (c): a PR_SET_PTRACER exception covers this tracer.
	if target.hasYAMAExceptionForLocked(t) {
		return true
	}
	// Condition (d): CAP_SYS_PTRACE in the target's user namespace.
	if t.HasCapabilityIn(linux.CAP_SYS_PTRACE, target.UserNamespace()) {
		return true
	}
	return false
}

// Determines whether t is considered a descendant of ancestor for the purposes
// of YAMA permissions (specifically, whether t's thread group is descended from
// ancestor's).
//
// Precondition: the TaskSet mutex must be locked (for reading or writing).
func (t *Task) isYAMADescendantOfLocked(ancestor *Task) bool {
	// Walk the parent chain, comparing thread group leaders so that any
	// thread in ancestor's thread group counts as the ancestor.
	walker := t
	for walker != nil {
		if walker.tg.leader == ancestor.tg.leader {
			return true
		}
		walker = walker.parent
	}
	return false
}

// hasYAMAExceptionForLocked returns true if a PR_SET_PTRACER exception exists
// permitting tracer (or any of its descendants) to trace t. A nil recorded
// tracer means PR_SET_PTRACER_ANY: any task may trace t.
//
// Precondition: the TaskSet mutex must be locked (for reading or writing).
func (t *Task) hasYAMAExceptionForLocked(tracer *Task) bool {
	allowed, ok := t.k.ptraceExceptions[t]
	if !ok {
		return false
	}
	return allowed == nil || tracer.isYAMADescendantOfLocked(allowed)
}

// ClearYAMAException removes any YAMA exception with t as the tracee.
func (t *Task) ClearYAMAException() {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	// Exceptions are keyed by thread group leader, not by individual task.
	tracee := t.tg.leader
	delete(t.k.ptraceExceptions, tracee)
}

// SetYAMAException creates a YAMA exception allowing all descendants of tracer
// to trace t. If tracer is nil, then any task is allowed to trace t.
//
// If there was an existing exception, it is overwritten with the new one.
func (t *Task) SetYAMAException(tracer *Task) {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()

	tracee := t.tg.leader
	// Mark both sides so that exitPtrace knows to clean up the exception
	// map when either task exits.
	tracee.ptraceYAMAExceptionAdded = true
	if tracer != nil {
		tracer.ptraceYAMAExceptionAdded = true
	}

	t.k.ptraceExceptions[tracee] = tracer
}

// Tracer returns t's ptrace Tracer.
func (t *Task) Tracer() *Task {
	return t.ptraceTracer.Load().(*Task)
}

// hasTracer returns true if t has a ptrace tracer attached.
func (t *Task) hasTracer() bool {
	// This isn't just inlined into callers so that if Task.Tracer() turns out
	// to be too expensive because of e.g. interface conversion, we can switch
	// to having a separate atomic flag more easily.
	return t.Tracer() != nil
}

// ptraceStop is a TaskStop placed on tasks in a ptrace-stop.
//
// +stateify savable
type ptraceStop struct {
	// If frozen is true, the stopped task's tracer is currently operating on
	// it, so Task.Kill should not remove the stop.
	frozen bool

	// If listen is true, the stopped task's tracer invoked PTRACE_LISTEN, so
	// ptraceFreeze should fail.
	listen bool
}

// Killable implements TaskStop.Killable.
func (s *ptraceStop) Killable() bool {
	// A frozen stop is being operated on by the tracer and must not be
	// removed by a kill.
	return !s.frozen
}

// beginPtraceStopLocked initiates an unfrozen ptrace-stop on t. If t has been
// killed, the stop is skipped, and beginPtraceStopLocked returns false.
//
// beginPtraceStopLocked does not signal t's tracer or wake it if it is
// waiting.
//
// Preconditions:
// * The TaskSet mutex must be locked.
// * The caller must be running on the task goroutine.
func (t *Task) beginPtraceStopLocked() bool {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	// This is analogous to Linux's kernel/signal.c:ptrace_stop() => ... =>
	// kernel/sched/core.c:__schedule() => signal_pending_state() check, which
	// is what prevents tasks from entering ptrace-stops after being killed.
	// Note that if t was SIGKILLed and beingPtraceStopLocked is being called
	// for PTRACE_EVENT_EXIT, the task will have dequeued the signal before
	// entering the exit path, so t.killedLocked() will no longer return true.
	// This is consistent with Linux: "Bugs: ... A SIGKILL signal may still
	// cause a PTRACE_EVENT_EXIT stop before actual signal death. This may be
	// changed in the future; SIGKILL is meant to always immediately kill tasks
	// even under ptrace. Last confirmed on Linux 3.13." - ptrace(2)
	if t.killedLocked() {
		return false
	}
	t.beginInternalStopLocked(&ptraceStop{})
	return true
}

// ptraceTrapLocked preps t for, and enters, a SIGTRAP-flavored ptrace-stop
// with the given si_code, notifying the tracer if the stop is entered.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) ptraceTrapLocked(code int32) {
	// This is unconditional in ptrace_stop().
	t.tg.signalHandlers.mu.Lock()
	t.trapStopPending = false
	t.tg.signalHandlers.mu.Unlock()
	t.ptraceCode = code
	t.ptraceSiginfo = &arch.SignalInfo{
		Signo: int32(linux.SIGTRAP),
		Code:  code,
	}
	t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t]))
	t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
	if t.beginPtraceStopLocked() {
		tracer := t.Tracer()
		tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP))
		tracer.tg.eventQueue.Notify(EventTraceeStop)
	}
}

// ptraceFreeze checks if t is in a ptraceStop. If so, it freezes the
// ptraceStop, temporarily preventing it from being removed by a concurrent
// Task.Kill, and returns true. Otherwise it returns false.
//
// Preconditions:
// * The TaskSet mutex must be locked.
// * The caller must be running on the task goroutine of t's tracer.
func (t *Task) ptraceFreeze() bool {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	if t.stop == nil {
		return false
	}
	s, ok := t.stop.(*ptraceStop)
	if !ok {
		return false
	}
	// Per the ptraceStop contract, a stop in which the tracer invoked
	// PTRACE_LISTEN must not be frozen.
	if s.listen {
		return false
	}
	s.frozen = true
	return true
}

// ptraceUnfreeze ends the effect of a previous successful call to
// ptraceFreeze.
//
// Preconditions: t must be in a frozen ptraceStop.
func (t *Task) ptraceUnfreeze() {
	// t.tg.signalHandlers is stable because t is in a frozen ptrace-stop,
	// preventing its thread group from completing execve.
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	t.ptraceUnfreezeLocked()
}

// Preconditions:
// * t must be in a frozen ptraceStop.
// * t's signal mutex must be locked.
func (t *Task) ptraceUnfreezeLocked() {
	// Do this even if the task has been killed to ensure a panic if t.stop is
	// nil or not a ptraceStop.
	t.stop.(*ptraceStop).frozen = false
	// A kill received while the stop was frozen could not remove it
	// (Killable() was false); end the stop now on the kill's behalf.
	if t.killedLocked() {
		t.endInternalStopLocked()
	}
}

// ptraceUnstop implements ptrace request PTRACE_CONT, PTRACE_SYSCALL,
// PTRACE_SINGLESTEP, PTRACE_SYSEMU, or PTRACE_SYSEMU_SINGLESTEP depending on
// mode and singlestep.
//
// Preconditions: t must be in a frozen ptrace stop.
//
// Postconditions: If ptraceUnstop returns nil, t will no longer be in a ptrace
// stop.
func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error {
	if sig != 0 && !sig.IsValid() {
		return syserror.EIO
	}
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	t.ptraceCode = int32(sig)
	t.ptraceSyscallMode = mode
	t.ptraceSinglestep = singlestep
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	t.endInternalStopLocked()
	return nil
}

// ptraceTraceme implements ptrace(PTRACE_TRACEME), attaching t's parent as
// t's tracer.
func (t *Task) ptraceTraceme() error {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	if t.hasTracer() {
		return syserror.EPERM
	}
	if t.parent == nil {
		// In Linux, only init can not have a parent, and init is assumed never
		// to invoke PTRACE_TRACEME. In the sentry, TGID 1 is an arbitrary user
		// application that may invoke PTRACE_TRACEME; having no parent can
		// also occur if all tasks in the parent thread group have exited, and
		// failed to find a living thread group to reparent to. The former case
		// is treated as if TGID 1 has an exited parent in an invisible
		// ancestor PID namespace that is an owner of the root user namespace
		// (and consequently has CAP_SYS_PTRACE), and the latter case is a
		// special form of the exited parent case below. In either case,
		// returning nil here is correct.
		return nil
	}
	// Note the direction: it is the parent (the would-be tracer) whose
	// access to t is checked.
	if !t.parent.canTraceLocked(t, true) {
		return syserror.EPERM
	}
	if t.parent.exitState != TaskExitNone {
		// Fail silently, as if we were successfully attached but then
		// immediately detached. This is consistent with Linux.
		return nil
	}
	t.ptraceTracer.Store(t.parent)
	t.parent.ptraceTracees[t] = struct{}{}
	return nil
}

// ptraceAttach implements ptrace(PTRACE_ATTACH, target) if seize is false, and
// ptrace(PTRACE_SEIZE, target, 0, opts) if seize is true. t is the caller.
func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error {
	if t.tg == target.tg {
		return syserror.EPERM
	}
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	if !t.canTraceLocked(target, true) {
		return syserror.EPERM
	}
	if target.hasTracer() {
		return syserror.EPERM
	}
	// Attaching to zombies and dead tasks is not permitted; the exit
	// notification logic relies on this. Linux allows attaching to PF_EXITING
	// tasks, though.
	if target.exitState >= TaskExitZombie {
		return syserror.EPERM
	}
	if seize {
		// PTRACE_SEIZE sets ptrace options at attach time; invalid options
		// surface as EIO per ptrace(2).
		if err := target.ptraceSetOptionsLocked(opts); err != nil {
			return syserror.EIO
		}
	}
	target.ptraceTracer.Store(t)
	t.ptraceTracees[target] = struct{}{}
	target.ptraceSeized = seize
	target.tg.signalHandlers.mu.Lock()
	// "Unlike PTRACE_ATTACH, PTRACE_SEIZE does not stop the process." -
	// ptrace(2)
	if !seize {
		target.sendSignalLocked(&arch.SignalInfo{
			Signo: int32(linux.SIGSTOP),
			Code:  arch.SignalInfoUser,
		}, false /* group */)
	}
	// Undocumented Linux feature: If the tracee is already group-stopped (and
	// consequently will not report the SIGSTOP just sent), force it to leave
	// and re-enter the stop so that it will switch to a ptrace-stop.
	if target.stop == (*groupStop)(nil) {
		target.trapStopPending = true
		target.endInternalStopLocked()
		// TODO(jamieliu): Linux blocks ptrace_attach() until the task has
		// entered the ptrace-stop (or exited) via JOBCTL_TRAPPING.
	}
	target.tg.signalHandlers.mu.Unlock()
	return nil
}

// ptraceDetach implements ptrace(PTRACE_DETACH, target, 0, sig). t is the
// caller.
//
// Preconditions: target must be a tracee of t in a frozen ptrace stop.
//
// Postconditions: If ptraceDetach returns nil, target will no longer be in a
// ptrace stop.
func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error {
	if sig != 0 && !sig.IsValid() {
		return syserror.EIO
	}
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	// sig, if non-zero, is delivered to the tracee on resumption via
	// ptraceCode, matching ptrace(2)'s PTRACE_DETACH data argument.
	target.ptraceCode = int32(sig)
	target.forgetTracerLocked()
	delete(t.ptraceTracees, target)
	return nil
}

// exitPtrace is called in the exit path to detach all of t's tracees.
func (t *Task) exitPtrace() {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	for target := range t.ptraceTracees {
		// Honor PTRACE_O_EXITKILL: kill tracees whose tracer is exiting.
		if target.ptraceOpts.ExitKill {
			target.tg.signalHandlers.mu.Lock()
			target.sendSignalLocked(&arch.SignalInfo{
				Signo: int32(linux.SIGKILL),
			}, false /* group */)
			target.tg.signalHandlers.mu.Unlock()
		}
		// Leave ptraceCode unchanged so that if the task is ptrace-stopped, it
		// observes the ptraceCode it set before it entered the stop. I believe
		// this is consistent with Linux.
		target.forgetTracerLocked()
	}
	// "nil maps cannot be saved"
	t.ptraceTracees = make(map[*Task]struct{})

	if t.ptraceYAMAExceptionAdded {
		// Remove any exception naming t as tracee, and any exceptions
		// naming t as the permitted tracer.
		delete(t.k.ptraceExceptions, t)
		for tracee, tracer := range t.k.ptraceExceptions {
			if tracer == t {
				delete(t.k.ptraceExceptions, tracee)
			}
		}
	}
}

// forgetTracerLocked detaches t's tracer and ensures that t is no longer
// ptrace-stopped.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) forgetTracerLocked() {
	t.ptraceSeized = false
	t.ptraceOpts = ptraceOptions{}
	t.ptraceSyscallMode = ptraceSyscallNone
	t.ptraceSinglestep = false
	t.ptraceTracer.Store((*Task)(nil))
	// If the tracer was notified of t's exit but never acknowledged it,
	// acknowledge on its behalf so exit notification can proceed.
	if t.exitTracerNotified && !t.exitTracerAcked {
		t.exitTracerAcked = true
		t.exitNotifyLocked(true)
	}
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	// Unset t.trapStopPending, which might have been set by PTRACE_INTERRUPT. If
	// it wasn't, it will be reset via t.groupStopPending after the following.
	t.trapStopPending = false
	// If t's thread group is in a group stop and t is eligible to participate,
	// make it do so. This is essentially the reverse of the special case in
	// ptraceAttach, which converts a group stop to a ptrace stop. ("Handling
	// of restart from group-stop is currently buggy, but the "as planned"
	// behavior is to leave tracee stopped and waiting for SIGCONT." -
	// ptrace(2))
	if (t.tg.groupStopComplete || t.tg.groupStopPendingCount != 0) && !t.groupStopPending && t.exitState < TaskExitInitiated {
		t.groupStopPending = true
		// t already participated in the group stop when it unset
		// groupStopPending.
		t.groupStopAcknowledged = true
		t.interrupt()
	}
	if _, ok := t.stop.(*ptraceStop); ok {
		t.endInternalStopLocked()
	}
}

// ptraceSignalLocked is called after signal dequeueing to check if t should
// enter ptrace signal-delivery-stop.
//
// Preconditions:
// * The signal mutex must be locked.
// * The caller must be running on the task goroutine.
func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool {
	if linux.Signal(info.Signo) == linux.SIGKILL {
		return false
	}
	if !t.hasTracer() {
		return false
	}
	// The tracer might change this signal into a stop signal, in which case
	// any SIGCONT received after the signal was originally dequeued should
	// cancel it. This is consistent with Linux.
	t.tg.groupStopDequeued = true
	// This is unconditional in ptrace_stop().
	t.trapStopPending = false
	// Can't lock the TaskSet mutex while holding a signal mutex.
	// Note the deliberate inversion: the signal mutex is dropped here and
	// re-acquired on return via defer.
	t.tg.signalHandlers.mu.Unlock()
	defer t.tg.signalHandlers.mu.Lock()
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	// Re-read the tracer now that the TaskSet mutex is held; a concurrent
	// detach may have cleared it while the signal mutex was dropped.
	tracer := t.Tracer()
	if tracer == nil {
		return false
	}
	t.ptraceCode = info.Signo
	t.ptraceSiginfo = info
	t.Debugf("Entering signal-delivery-stop for signal %d", info.Signo)
	if t.beginPtraceStopLocked() {
		tracer.signalStop(t, arch.CLD_TRAPPED, info.Signo)
		tracer.tg.eventQueue.Notify(EventTraceeStop)
	}
	return true
}

// ptraceSeccomp is called when a seccomp-bpf filter returns action
// SECCOMP_RET_TRACE to check if t should enter PTRACE_EVENT_SECCOMP stop. data
// is the lower 16 bits of the filter's return value.
func (t *Task) ptraceSeccomp(data uint16) bool {
	if !t.hasTracer() {
		return false
	}
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	if !t.ptraceOpts.TraceSeccomp {
		return false
	}
	t.Debugf("Entering PTRACE_EVENT_SECCOMP stop")
	t.ptraceEventLocked(linux.PTRACE_EVENT_SECCOMP, uint64(data))
	return true
}

// ptraceSyscallEnter is called immediately before entering a syscall to check
// if t should enter ptrace syscall-enter-stop.
func (t *Task) ptraceSyscallEnter() (taskRunState, bool) {
	if !t.hasTracer() {
		return nil, false
	}
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	switch t.ptraceSyscallMode {
	case ptraceSyscallNone:
		return nil, false
	case ptraceSyscallIntercept:
		t.Debugf("Entering syscall-enter-stop from PTRACE_SYSCALL")
		t.ptraceSyscallStopLocked()
		return (*runSyscallAfterSyscallEnterStop)(nil), true
	case ptraceSyscallEmu:
		t.Debugf("Entering syscall-enter-stop from PTRACE_SYSEMU")
		t.ptraceSyscallStopLocked()
		return (*runSyscallAfterSysemuStop)(nil), true
	}
	panic(fmt.Sprintf("Unknown ptraceSyscallMode: %v", t.ptraceSyscallMode))
}

// ptraceSyscallExit is called immediately after leaving a syscall to check if
// t should enter ptrace syscall-exit-stop.
func (t *Task) ptraceSyscallExit() {
	if !t.hasTracer() {
		return
	}
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	// Only PTRACE_SYSCALL produces a syscall-exit-stop; PTRACE_SYSEMU
	// (ptraceSyscallEmu) stops on entry only.
	if t.ptraceSyscallMode != ptraceSyscallIntercept {
		return
	}
	t.Debugf("Entering syscall-exit-stop")
	t.ptraceSyscallStopLocked()
}

// Preconditions: The TaskSet mutex must be locked.
func (t *Task) ptraceSyscallStopLocked() {
	code := int32(linux.SIGTRAP)
	// PTRACE_O_TRACESYSGOOD: set bit 7 so the tracer can distinguish
	// syscall stops from genuine SIGTRAPs.
	if t.ptraceOpts.SysGood {
		code |= 0x80
	}
	t.ptraceTrapLocked(code)
}

// ptraceCloneKind classifies a clone(2)-family syscall for the purpose of
// selecting which PTRACE_EVENT (and which PTRACE_O_TRACE* option) applies.
type ptraceCloneKind int32

const (
	// ptraceCloneKindClone represents a call to Task.Clone where
	// TerminationSignal is not SIGCHLD and Vfork is false.
	ptraceCloneKindClone ptraceCloneKind = iota

	// ptraceCloneKindFork represents a call to Task.Clone where
	// TerminationSignal is SIGCHLD and Vfork is false.
	ptraceCloneKindFork

	// ptraceCloneKindVfork represents a call to Task.Clone where Vfork is
	// true.
	ptraceCloneKindVfork
)

// ptraceClone is called at the end of a clone or fork syscall to check if t
// should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK
// stop. child is the new task.
func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool {
	if !t.hasTracer() {
		return false
	}
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	event := false
	if !opts.Untraced {
		switch kind {
		case ptraceCloneKindClone:
			if t.ptraceOpts.TraceClone {
				t.Debugf("Entering PTRACE_EVENT_CLONE stop")
				t.ptraceEventLocked(linux.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child]))
				event = true
			}
		case ptraceCloneKindFork:
			if t.ptraceOpts.TraceFork {
				t.Debugf("Entering PTRACE_EVENT_FORK stop")
				t.ptraceEventLocked(linux.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child]))
				event = true
			}
		case ptraceCloneKindVfork:
			if t.ptraceOpts.TraceVfork {
				t.Debugf("Entering PTRACE_EVENT_VFORK stop")
				t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child]))
				event = true
			}
		default:
			panic(fmt.Sprintf("Unknown ptraceCloneKind: %v", kind))
		}
	}
	// "If the PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or PTRACE_O_TRACECLONE
	// options are in effect, then children created by, respectively, vfork(2)
	// or clone(2) with the CLONE_VFORK flag, fork(2) or clone(2) with the exit
	// signal set to SIGCHLD, and other kinds of clone(2), are automatically
	// attached to the same tracer which traced their parent. SIGSTOP is
	// delivered to the children, causing them to enter signal-delivery-stop
	// after they exit the system call which created them." - ptrace(2)
	//
	// clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is
	// confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() =>
	// include/linux/ptrace.h:ptrace_init_task().
	if event || opts.InheritTracer {
		tracer := t.Tracer()
		if tracer != nil {
			child.ptraceTracer.Store(tracer)
			tracer.ptraceTracees[child] = struct{}{}
			// "The "seized" behavior ... is inherited by children that are
			// automatically attached using PTRACE_O_TRACEFORK,
			// PTRACE_O_TRACEVFORK, and PTRACE_O_TRACECLONE." - ptrace(2)
			child.ptraceSeized = t.ptraceSeized
			// "Flags are inherited by new tracees created and "auto-attached"
			// via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or
			// PTRACE_O_TRACECLONE options." - ptrace(2)
			child.ptraceOpts = t.ptraceOpts
			child.tg.signalHandlers.mu.Lock()
			// "PTRACE_SEIZE: ... Automatically attached children stop with
			// PTRACE_EVENT_STOP and WSTOPSIG(status) returns SIGTRAP instead
			// of having SIGSTOP signal delivered to them." - ptrace(2)
			if child.ptraceSeized {
				child.trapStopPending = true
			} else {
				child.pendingSignals.enqueue(&arch.SignalInfo{
					Signo: int32(linux.SIGSTOP),
				}, nil)
			}
			// The child will self-interrupt() when its task goroutine starts
			// running, so we don't have to.
			child.tg.signalHandlers.mu.Unlock()
		}
	}
	return event
}

// ptraceVforkDone is called after the end of a vfork stop to check if t should
// enter PTRACE_EVENT_VFORK_DONE stop. child is the new task's thread ID in t's
// PID namespace.
func (t *Task) ptraceVforkDone(child ThreadID) bool {
	if !t.hasTracer() {
		return false
	}
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	if !t.ptraceOpts.TraceVforkDone {
		return false
	}
	t.Debugf("Entering PTRACE_EVENT_VFORK_DONE stop")
	t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK_DONE, uint64(child))
	return true
}

// ptraceExec is called at the end of an execve syscall to check if t should
// enter PTRACE_EVENT_EXEC stop. oldTID is t's thread ID, in its *tracer's* PID
// namespace, prior to the execve. (If t did not have a tracer at the time
// oldTID was read, oldTID may be 0. This is consistent with Linux.)
func (t *Task) ptraceExec(oldTID ThreadID) {
	if !t.hasTracer() {
		return
	}
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	// Recheck with the TaskSet mutex locked. Most ptrace points don't need to
	// do this because detaching resets ptrace options, but PTRACE_EVENT_EXEC
	// is special because both TraceExec and !TraceExec do something if a
	// tracer is attached.
	if !t.hasTracer() {
		return
	}
	if t.ptraceOpts.TraceExec {
		t.Debugf("Entering PTRACE_EVENT_EXEC stop")
		t.ptraceEventLocked(linux.PTRACE_EVENT_EXEC, uint64(oldTID))
		return
	}
	// "If the PTRACE_O_TRACEEXEC option is not in effect for the execing
	// tracee, and if the tracee was PTRACE_ATTACHed rather that [sic]
	// PTRACE_SEIZEd, the kernel delivers an extra SIGTRAP to the tracee after
	// execve(2) returns. This is an ordinary signal (similar to one which can
	// be generated by `kill -TRAP`, not a special kind of ptrace-stop.
	// Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0
	// (SI_USER). This signal may be blocked by signal mask, and thus may be
	// delivered (much) later." - ptrace(2)
	if t.ptraceSeized {
		return
	}
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	t.sendSignalLocked(&arch.SignalInfo{
		Signo: int32(linux.SIGTRAP),
		Code:  arch.SignalInfoUser,
	}, false /* group */)
}

// ptraceExit is called early in the task exit path to check if t should enter
// PTRACE_EVENT_EXIT stop.
func (t *Task) ptraceExit() {
	if !t.hasTracer() {
		return
	}
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	if !t.ptraceOpts.TraceExit {
		return
	}
	// Read the exit status under the signal mutex; it is reported to the
	// tracer as the event message.
	t.tg.signalHandlers.mu.Lock()
	status := t.exitStatus.Status()
	t.tg.signalHandlers.mu.Unlock()
	t.Debugf("Entering PTRACE_EVENT_EXIT stop")
	t.ptraceEventLocked(linux.PTRACE_EVENT_EXIT, uint64(status))
}

// Preconditions: The TaskSet mutex must be locked.
func (t *Task) ptraceEventLocked(event int32, msg uint64) {
	// msg is retrievable by the tracer via PTRACE_GETEVENTMSG.
	t.ptraceEventMsg = msg
	// """
	// PTRACE_EVENT stops are observed by the tracer as waitpid(2) returning
	// with WIFSTOPPED(status), and WSTOPSIG(status) returns SIGTRAP. An
	// additional bit is set in the higher byte of the status word: the value
	// status>>8 will be
	//
	//   (SIGTRAP | PTRACE_EVENT_foo << 8).
	//
	// ...
	//
	// """ - ptrace(2)
	t.ptraceTrapLocked(int32(linux.SIGTRAP) | (event << 8))
}

// ptraceKill implements ptrace(PTRACE_KILL, target). t is the caller.
func (t *Task) ptraceKill(target *Task) error {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	if target.Tracer() != t {
		return syserror.ESRCH
	}
	target.tg.signalHandlers.mu.Lock()
	defer target.tg.signalHandlers.mu.Unlock()
	// "This operation is deprecated; do not use it! Instead, send a SIGKILL
	// directly using kill(2) or tgkill(2). The problem with PTRACE_KILL is
	// that it requires the tracee to be in signal-delivery-stop, otherwise it
	// may not work (i.e., may complete successfully but won't kill the
	// tracee)." - ptrace(2)
	if target.stop == nil {
		return nil
	}
	if _, ok := target.stop.(*ptraceStop); !ok {
		return nil
	}
	target.ptraceCode = int32(linux.SIGKILL)
	target.endInternalStopLocked()
	return nil
}

// ptraceInterrupt implements ptrace(PTRACE_INTERRUPT, target). t is the
// caller; target must have been attached with PTRACE_SEIZE.
func (t *Task) ptraceInterrupt(target *Task) error {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	if target.Tracer() != t {
		return syserror.ESRCH
	}
	// PTRACE_INTERRUPT is only valid for seized tracees.
	if !target.ptraceSeized {
		return syserror.EIO
	}
	target.tg.signalHandlers.mu.Lock()
	defer target.tg.signalHandlers.mu.Unlock()
	if target.killedLocked() || target.exitState >= TaskExitInitiated {
		return nil
	}
	target.trapStopPending = true
	// A tracee in a PTRACE_LISTEN stop is restarted so that it can re-enter
	// a trap stop.
	if s, ok := target.stop.(*ptraceStop); ok && s.listen {
		target.endInternalStopLocked()
	}
	target.interrupt()
	return nil
}

// Preconditions:
// * The TaskSet mutex must be locked for writing.
// * t must have a tracer.
// ptraceSetOptionsLocked replaces t's ptrace options with those specified by
// the PTRACE_O_* bits in opts, returning EINVAL if any unsupported bit is
// set.
func (t *Task) ptraceSetOptionsLocked(opts uintptr) error {
	const valid = uintptr(linux.PTRACE_O_EXITKILL |
		linux.PTRACE_O_TRACESYSGOOD |
		linux.PTRACE_O_TRACECLONE |
		linux.PTRACE_O_TRACEEXEC |
		linux.PTRACE_O_TRACEEXIT |
		linux.PTRACE_O_TRACEFORK |
		linux.PTRACE_O_TRACESECCOMP |
		linux.PTRACE_O_TRACEVFORK |
		linux.PTRACE_O_TRACEVFORKDONE)
	if opts&^valid != 0 {
		return syserror.EINVAL
	}
	// Note that the options are replaced wholesale, not merged: bits absent
	// from opts are cleared.
	t.ptraceOpts = ptraceOptions{
		ExitKill:       opts&linux.PTRACE_O_EXITKILL != 0,
		SysGood:        opts&linux.PTRACE_O_TRACESYSGOOD != 0,
		TraceClone:     opts&linux.PTRACE_O_TRACECLONE != 0,
		TraceExec:      opts&linux.PTRACE_O_TRACEEXEC != 0,
		TraceExit:      opts&linux.PTRACE_O_TRACEEXIT != 0,
		TraceFork:      opts&linux.PTRACE_O_TRACEFORK != 0,
		TraceSeccomp:   opts&linux.PTRACE_O_TRACESECCOMP != 0,
		TraceVfork:     opts&linux.PTRACE_O_TRACEVFORK != 0,
		TraceVforkDone: opts&linux.PTRACE_O_TRACEVFORKDONE != 0,
	}
	return nil
}

// Ptrace implements the ptrace system call.
func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error {
	// PTRACE_TRACEME ignores all other arguments.
	if req == linux.PTRACE_TRACEME {
		return t.ptraceTraceme()
	}
	// All other ptrace requests operate on a current or future tracee
	// specified by pid.
	target := t.tg.pidns.TaskWithID(pid)
	if target == nil {
		return syserror.ESRCH
	}

	// PTRACE_ATTACH and PTRACE_SEIZE do not require that target is not already
	// a tracee.
	if req == linux.PTRACE_ATTACH || req == linux.PTRACE_SEIZE {
		seize := req == linux.PTRACE_SEIZE
		// For PTRACE_SEIZE, addr must be 0; data carries the initial ptrace
		// options.
		if seize && addr != 0 {
			return syserror.EIO
		}
		return t.ptraceAttach(target, seize, uintptr(data))
	}
	// PTRACE_KILL and PTRACE_INTERRUPT require that the target is a tracee,
	// but does not require that it is ptrace-stopped.
	if req == linux.PTRACE_KILL {
		return t.ptraceKill(target)
	}
	if req == linux.PTRACE_INTERRUPT {
		return t.ptraceInterrupt(target)
	}
	// All other ptrace requests require that the target is a ptrace-stopped
	// tracee, and freeze the ptrace-stop so the tracee can be operated on.
	t.tg.pidns.owner.mu.RLock()
	if target.Tracer() != t {
		t.tg.pidns.owner.mu.RUnlock()
		return syserror.ESRCH
	}
	if !target.ptraceFreeze() {
		t.tg.pidns.owner.mu.RUnlock()
		// "Most ptrace commands (all except PTRACE_ATTACH, PTRACE_SEIZE,
		// PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the
		// tracee to be in a ptrace-stop, otherwise they fail with ESRCH." -
		// ptrace(2)
		return syserror.ESRCH
	}
	t.tg.pidns.owner.mu.RUnlock()
	// Even if the target has a ptrace-stop active, the tracee's task goroutine
	// may not yet have reached Task.doStop; wait for it to do so. This is safe
	// because there's no way for target to initiate a ptrace-stop and then
	// block (by calling Task.block) before entering it.
	//
	// Caveat: If tasks were just restored, the tracee's first call to
	// Task.Activate (in Task.run) occurs before its first call to Task.doStop,
	// which may block if the tracer's address space is active.
	t.UninterruptibleSleepStart(true)
	target.waitGoroutineStoppedOrExited()
	t.UninterruptibleSleepFinish(true)

	// Resuming commands end the ptrace stop, but only if successful.
	// PTRACE_LISTEN ends the ptrace stop if trapNotifyPending is already set on the
	// target.
	//
	// Note that on the error paths below the stop must be explicitly
	// unfrozen, since the success paths end the stop themselves.
	switch req {
	case linux.PTRACE_DETACH:
		if err := t.ptraceDetach(target, linux.Signal(data)); err != nil {
			target.ptraceUnfreeze()
			return err
		}
		return nil

	case linux.PTRACE_CONT:
		if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil {
			target.ptraceUnfreeze()
			return err
		}
		return nil

	case linux.PTRACE_SYSCALL:
		if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil {
			target.ptraceUnfreeze()
			return err
		}
		return nil

	case linux.PTRACE_SINGLESTEP:
		if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil {
			target.ptraceUnfreeze()
			return err
		}
		return nil

	case linux.PTRACE_SYSEMU:
		if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil {
			target.ptraceUnfreeze()
			return err
		}
		return nil

	case linux.PTRACE_SYSEMU_SINGLESTEP:
		if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil {
			target.ptraceUnfreeze()
			return err
		}
		return nil

	case linux.PTRACE_LISTEN:
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		// PTRACE_LISTEN is only valid for seized tracees that are in a
		// PTRACE_EVENT_STOP group-stop.
		if !target.ptraceSeized {
			return syserror.EIO
		}
		if target.ptraceSiginfo == nil {
			return syserror.EIO
		}
		if target.ptraceSiginfo.Code>>8 != linux.PTRACE_EVENT_STOP {
			return syserror.EIO
		}
		target.tg.signalHandlers.mu.Lock()
		defer target.tg.signalHandlers.mu.Unlock()
		if target.trapNotifyPending {
			target.endInternalStopLocked()
		} else {
			target.stop.(*ptraceStop).listen = true
			target.ptraceUnfreezeLocked()
		}
		return nil
	}

	// All other ptrace requests expect us to unfreeze the stop.
	defer target.ptraceUnfreeze()

	switch req {
	case linux.PTRACE_PEEKTEXT, linux.PTRACE_PEEKDATA:
		// "At the system call level, the PTRACE_PEEKTEXT, PTRACE_PEEKDATA, and
		// PTRACE_PEEKUSER requests have a different API: they store the result
		// at the address specified by the data parameter, and the return value
		// is the error flag." - ptrace(2)
		word := t.Arch().Native(0)
		// Read one word from the tracee (ignoring page permissions), then
		// write it to data in the tracer's address space.
		if _, err := word.CopyIn(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr); err != nil {
			return err
		}
		_, err := word.CopyOut(t, data)
		return err

	case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA:
		// data is the word to store; write it at addr in the tracee's address
		// space, ignoring page permissions.
		word := t.Arch().Native(uintptr(data))
		_, err := word.CopyOut(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr)
		return err

	case linux.PTRACE_GETREGSET:
		// "Read the tracee's registers. addr specifies, in an
		// architecture-dependent way, the type of registers to be read. ...
		// data points to a struct iovec, which describes the destination
		// buffer's location and length. On return, the kernel modifies iov.len
		// to indicate the actual number of bytes returned." - ptrace(2)
		ars, err := t.CopyInIovecs(data, 1)
		if err != nil {
			return err
		}

		t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())

		ar := ars.Head()
		n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{
			Ctx:  t,
			IO:   t.MemoryManager(),
			Addr: ar.Start,
			Opts: usermem.IOOpts{
				AddressSpaceActive: true,
			},
		}, int(ar.Length()))
		if err != nil {
			return err
		}

		// Update iovecs to represent the range of the written register set.
		end, ok := ar.Start.AddLength(uint64(n))
		if !ok {
			panic(fmt.Sprintf("%#x + %#x overflows. Invalid reg size > %#x", ar.Start, n, ar.Length()))
		}
		ar.End = end
		return t.CopyOutIovecs(data, hostarch.AddrRangeSeqOf(ar))

	case linux.PTRACE_SETREGSET:
		ars, err := t.CopyInIovecs(data, 1)
		if err != nil {
			return err
		}

		mm := t.MemoryManager()
		t.p.PullFullState(mm.AddressSpace(), t.Arch())

		ar := ars.Head()
		n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{
			Ctx:  t,
			IO:   mm,
			Addr: ar.Start,
			Opts: usermem.IOOpts{
				AddressSpaceActive: true,
			},
		}, int(ar.Length()))
		if err != nil {
			return err
		}
		t.p.FullStateChanged()
		// Shrink the iovec by the number of bytes consumed.
		ar.End -= hostarch.Addr(n)
		return t.CopyOutIovecs(data, hostarch.AddrRangeSeqOf(ar))

	case linux.PTRACE_GETSIGINFO:
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		if target.ptraceSiginfo == nil {
			return syserror.EINVAL
		}
		_, err := target.ptraceSiginfo.CopyOut(t, data)
		return err

	case linux.PTRACE_SETSIGINFO:
		var info arch.SignalInfo
		if _, err := info.CopyIn(t, data); err != nil {
			return err
		}
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		if target.ptraceSiginfo == nil {
			return syserror.EINVAL
		}
		target.ptraceSiginfo = &info
		return nil

	case linux.PTRACE_GETSIGMASK:
		// addr carries the size of the signal set, which must match exactly.
		if addr != linux.SignalSetSize {
			return syserror.EINVAL
		}
		mask := target.SignalMask()
		_, err := mask.CopyOut(t, data)
		return err

	case linux.PTRACE_SETSIGMASK:
		if addr != linux.SignalSetSize {
			return syserror.EINVAL
		}
		var mask linux.SignalSet
		if _, err := mask.CopyIn(t, data); err != nil {
			return err
		}
		// The target's task goroutine is stopped, so this is safe:
		target.SetSignalMask(mask &^ UnblockableSignals)
		return nil

	case linux.PTRACE_SETOPTIONS:
		t.tg.pidns.owner.mu.Lock()
		defer t.tg.pidns.owner.mu.Unlock()
		return target.ptraceSetOptionsLocked(uintptr(data))

	case linux.PTRACE_GETEVENTMSG:
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		_, err := primitive.CopyUint64Out(t, hostarch.Addr(data), target.ptraceEventMsg)
		return err

	// PEEKSIGINFO is unimplemented but seems to have no users anywhere.

	default:
		// Remaining requests are architecture-specific.
		return t.ptraceArch(target, req, addr, data)
	}
}