// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// ptraceOptions are the subset of options controlling a task's ptrace behavior
// that are set by ptrace(PTRACE_SETOPTIONS).
//
// Each field mirrors one PTRACE_O_* flag. The zero value has every option
// disabled.
//
// +stateify savable
type ptraceOptions struct {
	// ExitKill is true if the tracee should be sent SIGKILL when the tracer
	// exits (PTRACE_O_EXITKILL).
	ExitKill bool

	// If SysGood is true, set bit 7 in the signal number for
	// syscall-entry-stop and syscall-exit-stop traps delivered to this task's
	// tracer (PTRACE_O_TRACESYSGOOD).
	SysGood bool

	// TraceClone is true if the tracer wants to receive PTRACE_EVENT_CLONE
	// events (PTRACE_O_TRACECLONE).
	TraceClone bool

	// TraceExec is true if the tracer wants to receive PTRACE_EVENT_EXEC
	// events (PTRACE_O_TRACEEXEC).
	TraceExec bool

	// TraceExit is true if the tracer wants to receive PTRACE_EVENT_EXIT
	// events (PTRACE_O_TRACEEXIT).
	TraceExit bool

	// TraceFork is true if the tracer wants to receive PTRACE_EVENT_FORK
	// events (PTRACE_O_TRACEFORK).
	TraceFork bool

	// TraceSeccomp is true if the tracer wants to receive PTRACE_EVENT_SECCOMP
	// events (PTRACE_O_TRACESECCOMP).
	TraceSeccomp bool

	// TraceVfork is true if the tracer wants to receive PTRACE_EVENT_VFORK
	// events (PTRACE_O_TRACEVFORK).
	TraceVfork bool

	// TraceVforkDone is true if the tracer wants to receive
	// PTRACE_EVENT_VFORK_DONE events (PTRACE_O_TRACEVFORKDONE).
	TraceVforkDone bool
}

// ptraceSyscallMode controls the behavior of a ptraced task at syscall entry
// and exit.
//
// The zero value is ptraceSyscallNone, i.e. syscalls are not intercepted.
type ptraceSyscallMode int

const (
	// ptraceSyscallNone indicates that the task has never ptrace-stopped, or
	// that it was resumed from its last ptrace-stop by PTRACE_CONT or
	// PTRACE_DETACH. The task's syscalls will not be intercepted.
	ptraceSyscallNone ptraceSyscallMode = iota

	// ptraceSyscallIntercept indicates that the task was resumed from its last
	// ptrace-stop by PTRACE_SYSCALL. The next time the task enters or exits a
	// syscall, a ptrace-stop will occur.
	ptraceSyscallIntercept

	// ptraceSyscallEmu indicates that the task was resumed from its last
	// ptrace-stop by PTRACE_SYSEMU or PTRACE_SYSEMU_SINGLESTEP. The next time
	// the task enters a syscall, the syscall will be skipped, and a
	// ptrace-stop will occur.
	ptraceSyscallEmu
)

// CanTrace reports whether t is permitted to access target's state, following
// ptrace(2), subsection "Ptrace access mode checking". attach selects the
// access mode being checked: PTRACE_MODE_ATTACH if true, PTRACE_MODE_READ
// otherwise. (The checks implemented below do not currently differ between
// the two modes.)
//
// NOTE(b/30815691): The result of CanTrace is immediately stale (e.g., a
// racing setuid(2) may change traceability). This may pose a risk when a task
// changes from traceable to not traceable. This is only problematic across
// execve, where privileges may increase.
//
// We currently do not implement privileged executables (set-user/group-ID bits
// and file capabilities), so that case is not reachable.
func (t *Task) CanTrace(target *Task, attach bool) bool {
	// "1. If the calling thread and the target thread are in the same thread
	// group, access is always allowed." - ptrace(2)
	//
	// Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access()
	// should not deny sub-threads", first released in Linux 3.12), the rule
	// only applies if t and target are the same task. But, as that commit
	// message puts it, "[any] security check is pointless when the tasks share
	// the same ->mm."
	if target.tg == t.tg {
		return true
	}

	// """
	// 2. If the access mode specifies PTRACE_MODE_FSCREDS (ED: snipped,
	// doesn't exist until Linux 4.5).
	//
	// Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the
	// caller's real UID and GID for the checks in the next step. (Most APIs
	// that check the caller's UID and GID use the effective IDs. For
	// historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs
	// instead.)
	//
	// 3. Deny access if neither of the following is true:
	//
	// - The real, effective, and saved-set user IDs of the target match the
	// caller's user ID, *and* the real, effective, and saved-set group IDs of
	// the target match the caller's group ID.
	//
	// - The caller has the CAP_SYS_PTRACE capability in the user namespace of
	// the target.
	//
	// 4. Deny access if the target process "dumpable" attribute has a value
	// other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in
	// prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in
	// the user namespace of the target process.
	//
	// 5. The kernel LSM security_ptrace_access_check() interface is invoked to
	// see if ptrace access is permitted. The results depend on the LSM(s). The
	// implementation of this interface in the commoncap LSM performs the
	// following steps:
	//
	// a) If the access mode includes PTRACE_MODE_FSCREDS, then use the
	// caller's effective capability set; otherwise (the access mode specifies
	// PTRACE_MODE_REALCREDS, so) use the caller's permitted capability set.
	//
	// b) Deny access if neither of the following is true:
	//
	// - The caller and the target process are in the same user namespace, and
	// the caller's capabilities are a proper superset of the target process's
	// permitted capabilities.
	//
	// - The caller has the CAP_SYS_PTRACE capability in the target process's
	// user namespace.
	//
	// Note that the commoncap LSM does not distinguish between
	// PTRACE_MODE_READ and PTRACE_MODE_ATTACH. (ED: From earlier in this
	// section: "the commoncap LSM ... is always invoked".)
	// """
	caller, tgt := t.Credentials(), target.Credentials()

	// CAP_SYS_PTRACE in the target's user namespace satisfies steps 3, 4 and
	// 5 at once.
	if caller.HasCapabilityIn(linux.CAP_SYS_PTRACE, tgt.UserNamespace) {
		return true
	}

	// Step 3: every one of the target's UIDs and GIDs must equal the caller's
	// real UID and GID respectively.
	uid, gid := caller.RealKUID, caller.RealKGID
	uidsMatch := uid == tgt.RealKUID && uid == tgt.EffectiveKUID && uid == tgt.SavedKUID
	gidsMatch := gid == tgt.RealKGID && gid == tgt.EffectiveKGID && gid == tgt.SavedKGID
	if !uidsMatch || !gidsMatch {
		return false
	}

	// Step 4: the target's address space, if it still has one, must be
	// user-dumpable.
	var tmm *mm.MemoryManager
	target.WithMuLocked(func(locked *Task) {
		tmm = locked.MemoryManager()
	})
	if tmm != nil && tmm.Dumpability() != mm.UserDumpable {
		return false
	}

	// Step 5b: same user namespace, and the target holds no permitted
	// capability that the caller lacks.
	sameUserNS := caller.UserNamespace == tgt.UserNamespace
	return sameUserNS && tgt.PermittedCaps&^caller.PermittedCaps == 0
}

// Tracer returns the task that is currently t's ptrace tracer. The stored
// value is asserted to be a *Task.
func (t *Task) Tracer() *Task {
	stored := t.ptraceTracer.Load()
	return stored.(*Task)
}

// hasTracer reports whether a ptrace tracer is currently attached to t.
func (t *Task) hasTracer() bool {
	// This is kept as its own method, rather than inlined at call sites, so
	// that if Task.Tracer() proves too expensive (e.g. due to interface
	// conversion) it can be replaced by a dedicated atomic flag in one place.
	tracer := t.Tracer()
	return tracer != nil
}

// ptraceStop is the TaskStop installed on a task while it is in a
// ptrace-stop.
//
// +stateify savable
type ptraceStop struct {
	// frozen is set while the stopped task's tracer is operating on it;
	// Task.Kill must not remove the stop during that time.
	frozen bool

	// listen is set once the stopped task's tracer has invoked
	// PTRACE_LISTEN; ptraceFreeze fails while it is set.
	listen bool
}

// Killable implements TaskStop.Killable. The stop may be removed by a kill
// only while it is not frozen.
func (s *ptraceStop) Killable() bool {
	frozen := s.frozen
	return !frozen
}

// beginPtraceStopLocked starts an unfrozen ptrace-stop on t and returns true.
// If t has already been killed, no stop is started and it returns false.
//
// The tracer is neither signaled nor woken here; that is the caller's job.
//
// Preconditions:
// * The TaskSet mutex must be locked.
// * The caller must be running on the task goroutine.
func (t *Task) beginPtraceStopLocked() bool {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	// This is analogous to Linux's kernel/signal.c:ptrace_stop() => ... =>
	// kernel/sched/core.c:__schedule() => signal_pending_state() check, which
	// is what prevents tasks from entering ptrace-stops after being killed.
	// Note that if t was SIGKILLed and beginPtraceStopLocked is being called
	// for PTRACE_EVENT_EXIT, the task will have dequeued the signal before
	// entering the exit path, so t.killedLocked() will no longer return true.
	// This is consistent with Linux: "Bugs: ... A SIGKILL signal may still
	// cause a PTRACE_EVENT_EXIT stop before actual signal death. This may be
	// changed in the future; SIGKILL is meant to always immediately kill tasks
	// even under ptrace. Last confirmed on Linux 3.13." - ptrace(2)
	killed := t.killedLocked()
	if !killed {
		t.beginInternalStopLocked(&ptraceStop{})
	}
	return !killed
}

// ptraceTrapLocked records a SIGTRAP ptrace-stop with the given code on t
// (as both t.ptraceCode and the si_code of t.ptraceSiginfo), attempts to
// enter the stop, and, if the stop was entered, notifies t's tracer.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) ptraceTrapLocked(code int32) {
	// This is unconditional in ptrace_stop().
	t.tg.signalHandlers.mu.Lock()
	t.trapStopPending = false
	t.tg.signalHandlers.mu.Unlock()

	info := &arch.SignalInfo{
		Signo: int32(linux.SIGTRAP),
		Code:  code,
	}
	t.ptraceCode = code
	t.ptraceSiginfo = info
	info.SetPID(int32(t.tg.pidns.tids[t]))
	info.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))

	// A killed task skips the stop, in which case there is nothing to report.
	if !t.beginPtraceStopLocked() {
		return
	}
	tracer := t.Tracer()
	tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP))
	tracer.tg.eventQueue.Notify(EventTraceeStop)
}

// ptraceFreeze freezes t's ptraceStop, if t is in one, so that a concurrent
// Task.Kill cannot remove it, and returns true. It returns false, changing
// nothing, if t has no stop, is in some other kind of stop, or is in a
// ptraceStop on which PTRACE_LISTEN is in effect.
//
// Preconditions:
// * The TaskSet mutex must be locked.
// * The caller must be running on the task goroutine of t's tracer.
func (t *Task) ptraceFreeze() bool {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	// The assertion fails both when t.stop is nil and when it holds a
	// different stop type.
	stop, isPtraceStop := t.stop.(*ptraceStop)
	if !isPtraceStop || stop.listen {
		return false
	}
	stop.frozen = true
	return true
}

// ptraceUnfreeze ends the effect of a previous successful call to
// ptraceFreeze. It acquires t's signal mutex and delegates to
// ptraceUnfreezeLocked.
//
// Preconditions: t must be in a frozen ptraceStop.
func (t *Task) ptraceUnfreeze() {
	// t.tg.signalHandlers is stable because t is in a frozen ptrace-stop,
	// preventing its thread group from completing execve.
	t.tg.signalHandlers.mu.Lock()
	// The unlock is deferred so that the mutex is released even if
	// ptraceUnfreezeLocked panics because the precondition was violated.
	defer t.tg.signalHandlers.mu.Unlock()
	t.ptraceUnfreezeLocked()
}

// ptraceUnfreezeLocked clears the frozen flag on t's ptraceStop. If t was
// killed while frozen, the stop is ended as well.
//
// Preconditions:
// * t must be in a frozen ptraceStop.
// * t's signal mutex must be locked.
func (t *Task) ptraceUnfreezeLocked() {
	// The assertion is performed before looking at whether t was killed, so
	// that a nil or non-ptraceStop t.stop always panics.
	stop := t.stop.(*ptraceStop)
	stop.frozen = false
	if !t.killedLocked() {
		return
	}
	t.endInternalStopLocked()
}

// ptraceUnstop implements ptrace request PTRACE_CONT, PTRACE_SYSCALL,
// PTRACE_SINGLESTEP, PTRACE_SYSEMU, or PTRACE_SYSEMU_SINGLESTEP depending on
// mode and singlestep.
//
// sig must be either 0 or a valid signal number; otherwise ptraceUnstop
// returns EIO without modifying t. On success sig is stored as t.ptraceCode.
//
// Preconditions: t must be in a frozen ptrace stop.
//
// Postconditions: If ptraceUnstop returns nil, t will no longer be in a ptrace
// stop.
func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error {
	// Validate before taking any locks or touching task state.
	if sig != 0 && !sig.IsValid() {
		return syserror.EIO
	}
	// The TaskSet mutex is taken (for writing) before the signal mutex; the
	// deferred unlocks release them in the reverse order.
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	t.ptraceCode = int32(sig)
	t.ptraceSyscallMode = mode
	t.ptraceSinglestep = singlestep
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	// End the ptrace-stop so that t resumes with the settings recorded above.
	t.endInternalStopLocked()
	return nil
}

// ptraceTraceme implements ptrace(PTRACE_TRACEME): it makes t's parent the
// tracer of t. It returns EPERM if t already has a tracer or if the parent is
// not permitted to trace t, and nil otherwise (including the cases where no
// tracer ends up attached because the parent is absent or exiting).
func (t *Task) ptraceTraceme() error {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	parent := t.parent
	// The cases are evaluated in order; the first one that matches decides
	// the result.
	switch {
	case t.hasTracer():
		return syserror.EPERM
	case parent == nil:
		// In Linux, only init can not have a parent, and init is assumed never
		// to invoke PTRACE_TRACEME. In the sentry, TGID 1 is an arbitrary user
		// application that may invoke PTRACE_TRACEME; having no parent can
		// also occur if all tasks in the parent thread group have exited, and
		// failed to find a living thread group to reparent to. The former case
		// is treated as if TGID 1 has an exited parent in an invisible
		// ancestor PID namespace that is an owner of the root user namespace
		// (and consequently has CAP_SYS_PTRACE), and the latter case is a
		// special form of the exited parent case below. In either case,
		// returning nil here is correct.
		return nil
	case !parent.CanTrace(t, true):
		return syserror.EPERM
	case parent.exitState != TaskExitNone:
		// Fail silently, as if we were successfully attached but then
		// immediately detached. This is consistent with Linux.
		return nil
	}
	t.ptraceTracer.Store(parent)
	parent.ptraceTracees[t] = struct{}{}
	return nil
}

// ptraceAttach implements ptrace(PTRACE_ATTACH, target) when seize is false
// and ptrace(PTRACE_SEIZE, target, 0, opts) when seize is true. t is the
// caller and becomes target's tracer on success.
//
// It returns EPERM if target is in t's thread group, fails the access check,
// already has a tracer, or is a zombie or dead; it returns EIO if seize is
// true and opts is rejected.
func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error {
	// A task may not trace its own thread group, nor a task it has no
	// ptrace access to.
	if t.tg == target.tg || !t.CanTrace(target, true) {
		return syserror.EPERM
	}
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	// Only one tracer at a time.
	//
	// Attaching to zombies and dead tasks is not permitted; the exit
	// notification logic relies on this. Linux allows attaching to PF_EXITING
	// tasks, though.
	if target.hasTracer() || target.exitState >= TaskExitZombie {
		return syserror.EPERM
	}
	// PTRACE_SEIZE carries the initial option set; any failure to apply it is
	// reported as EIO.
	if seize && target.ptraceSetOptionsLocked(opts) != nil {
		return syserror.EIO
	}
	target.ptraceTracer.Store(t)
	t.ptraceTracees[target] = struct{}{}
	target.ptraceSeized = seize

	target.tg.signalHandlers.mu.Lock()
	// "Unlike PTRACE_ATTACH, PTRACE_SEIZE does not stop the process." -
	// ptrace(2)
	if !seize {
		stopInfo := &arch.SignalInfo{
			Signo: int32(linux.SIGSTOP),
			Code:  arch.SignalInfoUser,
		}
		target.sendSignalLocked(stopInfo, false /* group */)
	}
	// Undocumented Linux feature: If the tracee is already group-stopped (and
	// consequently will not report the SIGSTOP just sent), force it to leave
	// and re-enter the stop so that it will switch to a ptrace-stop.
	if target.stop == (*groupStop)(nil) {
		target.trapStopPending = true
		target.endInternalStopLocked()
		// TODO(jamieliu): Linux blocks ptrace_attach() until the task has
		// entered the ptrace-stop (or exited) via JOBCTL_TRAPPING.
	}
	target.tg.signalHandlers.mu.Unlock()
	return nil
}

// ptraceDetach implements ptrace(PTRACE_DETACH, target, 0, sig). t is the
// caller.
//
// sig must be either 0 or a valid signal number; otherwise ptraceDetach
// returns EIO without modifying target. On success sig is stored as
// target.ptraceCode.
//
// Preconditions: target must be a tracee of t in a frozen ptrace stop.
//
// Postconditions: If ptraceDetach returns nil, target will no longer be in a
// ptrace stop.
func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error {
	// Validate before taking any locks or touching task state.
	if sig != 0 && !sig.IsValid() {
		return syserror.EIO
	}
	// forgetTracerLocked requires the TaskSet mutex to be locked for writing.
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	target.ptraceCode = int32(sig)
	// Clear target's tracer state and take it out of its ptrace-stop, then
	// drop it from t's set of tracees.
	target.forgetTracerLocked()
	delete(t.ptraceTracees, target)
	return nil
}

// exitPtrace is called in the exit path of t to detach every task that t is
// tracing. Tracees that have the ExitKill option set are sent SIGKILL before
// being detached.
func (t *Task) exitPtrace() {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	for tracee := range t.ptraceTracees {
		if tracee.ptraceOpts.ExitKill {
			kill := &arch.SignalInfo{
				Signo: int32(linux.SIGKILL),
			}
			tracee.tg.signalHandlers.mu.Lock()
			tracee.sendSignalLocked(kill, false /* group */)
			tracee.tg.signalHandlers.mu.Unlock()
		}
		// Leave ptraceCode unchanged so that if the task is ptrace-stopped, it
		// observes the ptraceCode it set before it entered the stop. I believe
		// this is consistent with Linux.
		tracee.forgetTracerLocked()
	}
	// Replace the set with an empty one rather than nil:
	// "nil maps cannot be saved"
	t.ptraceTracees = map[*Task]struct{}{}
}

// forgetTracerLocked removes t's tracer, resets all of t's tracer-controlled
// state to its defaults, and makes sure t does not remain in a ptrace-stop.
// It does not remove t from the former tracer's set of tracees.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) forgetTracerLocked() {
	t.ptraceSeized = false
	t.ptraceOpts = ptraceOptions{}
	t.ptraceSyscallMode = ptraceSyscallNone
	t.ptraceSinglestep = false
	t.ptraceTracer.Store((*Task)(nil))

	// If the tracer was notified of t's exit but never acknowledged it,
	// treat the detach as the acknowledgement.
	if t.exitTracerNotified && !t.exitTracerAcked {
		t.exitTracerAcked = true
		t.exitNotifyLocked(true)
	}

	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	// Unset t.trapStopPending, which might have been set by PTRACE_INTERRUPT. If
	// it wasn't, it will be reset via t.groupStopPending after the following.
	t.trapStopPending = false

	// If t's thread group is in a group stop and t is eligible to participate,
	// make it do so. This is essentially the reverse of the special case in
	// ptraceAttach, which converts a group stop to a ptrace stop. ("Handling
	// of restart from group-stop is currently buggy, but the "as planned"
	// behavior is to leave tracee stopped and waiting for SIGCONT." -
	// ptrace(2))
	groupStopActive := t.tg.groupStopComplete || t.tg.groupStopPendingCount != 0
	eligible := !t.groupStopPending && t.exitState < TaskExitInitiated
	if groupStopActive && eligible {
		t.groupStopPending = true
		// t already participated in the group stop when it unset
		// groupStopPending.
		t.groupStopAcknowledged = true
		t.interrupt()
	}

	// Only a ptrace-stop is ended here; any other kind of stop is left alone.
	_, inPtraceStop := t.stop.(*ptraceStop)
	if inPtraceStop {
		t.endInternalStopLocked()
	}
}

// ptraceSignalLocked is called after signal dequeueing to check if t should
// enter ptrace signal-delivery-stop.
//
// It returns false if info is SIGKILL or if t has no tracer, in which case no
// stop is attempted. It returns true if t still had a tracer once the TaskSet
// mutex was held; note that true is returned even when the stop itself was
// skipped because t had been killed.
//
// The signal mutex is temporarily released while the TaskSet mutex is held,
// and is held again when ptraceSignalLocked returns (on every path that
// released it).
//
// Preconditions:
// * The signal mutex must be locked.
// * The caller must be running on the task goroutine.
func (t *Task) ptraceSignalLocked(info *arch.SignalInfo) bool {
	// SIGKILL never causes a signal-delivery-stop.
	if linux.Signal(info.Signo) == linux.SIGKILL {
		return false
	}
	// Cheap check made without the TaskSet mutex; repeated below under it.
	if !t.hasTracer() {
		return false
	}
	// The tracer might change this signal into a stop signal, in which case
	// any SIGCONT received after the signal was originally dequeued should
	// cancel it. This is consistent with Linux.
	t.tg.groupStopDequeued = true
	// This is unconditional in ptrace_stop().
	t.trapStopPending = false
	// Can't lock the TaskSet mutex while holding a signal mutex.
	//
	// Deferred calls run in reverse order, so on return the TaskSet mutex is
	// released first and the signal mutex is reacquired afterwards.
	t.tg.signalHandlers.mu.Unlock()
	defer t.tg.signalHandlers.mu.Lock()
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	// The tracer may have detached between the unlocked check above and
	// acquiring the TaskSet mutex.
	tracer := t.Tracer()
	if tracer == nil {
		return false
	}
	t.ptraceCode = info.Signo
	t.ptraceSiginfo = info
	t.Debugf("Entering signal-delivery-stop for signal %d", info.Signo)
	// The tracer is only notified if the stop was actually entered.
	if t.beginPtraceStopLocked() {
		tracer.signalStop(t, arch.CLD_TRAPPED, info.Signo)
		tracer.tg.eventQueue.Notify(EventTraceeStop)
	}
	return true
}

// ptraceSeccomp is called when a seccomp-bpf filter returns action
// SECCOMP_RET_TRACE. If t has a tracer with the TraceSeccomp option set, t is
// sent into a PTRACE_EVENT_SECCOMP stop and ptraceSeccomp returns true;
// otherwise it returns false. data is the lower 16 bits of the filter's
// return value and becomes the event message.
func (t *Task) ptraceSeccomp(data uint16) bool {
	if !t.hasTracer() {
		return false
	}
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	wanted := t.ptraceOpts.TraceSeccomp
	if wanted {
		t.Debugf("Entering PTRACE_EVENT_SECCOMP stop")
		t.ptraceEventLocked(linux.PTRACE_EVENT_SECCOMP, uint64(data))
	}
	return wanted
}

// ptraceSyscallEnter is called immediately before entering a syscall to check
// if t should enter ptrace syscall-enter-stop.
//
// If t has no tracer or its syscalls are not being intercepted, it returns
// (nil, false). Otherwise it triggers the stop and returns the run state to
// continue with after the stop, together with true.
func (t *Task) ptraceSyscallEnter() (taskRunState, bool) {
	if !t.hasTracer() {
		return nil, false
	}
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	mode := t.ptraceSyscallMode
	var next taskRunState
	switch mode {
	case ptraceSyscallNone:
		return nil, false
	case ptraceSyscallIntercept:
		t.Debugf("Entering syscall-enter-stop from PTRACE_SYSCALL")
		next = (*runSyscallAfterSyscallEnterStop)(nil)
	case ptraceSyscallEmu:
		t.Debugf("Entering syscall-enter-stop from PTRACE_SYSEMU")
		next = (*runSyscallAfterSysemuStop)(nil)
	default:
		panic(fmt.Sprintf("Unknown ptraceSyscallMode: %v", mode))
	}
	// Both intercepting modes stop in the same way; they differ only in the
	// run state that follows.
	t.ptraceSyscallStopLocked()
	return next, true
}

// ptraceSyscallExit is called immediately after leaving a syscall. It sends t
// into a ptrace syscall-exit-stop if t has a tracer and was last resumed in
// ptraceSyscallIntercept mode.
func (t *Task) ptraceSyscallExit() {
	if !t.hasTracer() {
		return
	}
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	if t.ptraceSyscallMode == ptraceSyscallIntercept {
		t.Debugf("Entering syscall-exit-stop")
		t.ptraceSyscallStopLocked()
	}
}

// ptraceSyscallStopLocked triggers a syscall-stop trap on t. The trap code is
// SIGTRAP, with bit 7 (0x80) additionally set when the SysGood option is
// enabled.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) ptraceSyscallStopLocked() {
	trapCode := int32(linux.SIGTRAP)
	if !t.ptraceOpts.SysGood {
		t.ptraceTrapLocked(trapCode)
		return
	}
	t.ptraceTrapLocked(trapCode | 0x80)
}

// ptraceCloneKind classifies a call to Task.Clone for the purpose of choosing
// which ptrace event (clone, fork, or vfork) it corresponds to.
type ptraceCloneKind int32

const (
	// ptraceCloneKindClone represents a call to Task.Clone where
	// TerminationSignal is not SIGCHLD and Vfork is false.
	ptraceCloneKindClone ptraceCloneKind = iota

	// ptraceCloneKindFork represents a call to Task.Clone where
	// TerminationSignal is SIGCHLD and Vfork is false.
	ptraceCloneKindFork

	// ptraceCloneKindVfork represents a call to Task.Clone where Vfork is
	// true.
	ptraceCloneKindVfork
)

// ptraceClone is called at the end of a clone or fork syscall to check if t
// should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK
// stop. child is the new task.
//
// kind selects which of the three events applies. If opts.Untraced is set no
// event is generated. When an event is generated, or opts.InheritTracer is
// set, child is additionally attached to t's tracer and inherits t's seized
// state and ptrace options.
//
// ptraceClone returns true if one of the event stops was requested for t, and
// false otherwise (including when t has no tracer).
//
// The TaskSet mutex is taken for writing, not reading: auto-attaching child
// inserts into the tracer's ptraceTracees map and writes child's ptrace
// state. Every other writer of that state holds the TaskSet mutex for
// writing, and with only a read lock two tracees of the same tracer could
// insert into the map concurrently.
func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool {
	if !t.hasTracer() {
		return false
	}
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	event := false
	if !opts.Untraced {
		switch kind {
		case ptraceCloneKindClone:
			if t.ptraceOpts.TraceClone {
				t.Debugf("Entering PTRACE_EVENT_CLONE stop")
				t.ptraceEventLocked(linux.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child]))
				event = true
			}
		case ptraceCloneKindFork:
			if t.ptraceOpts.TraceFork {
				t.Debugf("Entering PTRACE_EVENT_FORK stop")
				t.ptraceEventLocked(linux.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child]))
				event = true
			}
		case ptraceCloneKindVfork:
			if t.ptraceOpts.TraceVfork {
				t.Debugf("Entering PTRACE_EVENT_VFORK stop")
				t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child]))
				event = true
			}
		default:
			panic(fmt.Sprintf("Unknown ptraceCloneKind: %v", kind))
		}
	}
	// "If the PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or PTRACE_O_TRACECLONE
	// options are in effect, then children created by, respectively, vfork(2)
	// or clone(2) with the CLONE_VFORK flag, fork(2) or clone(2) with the exit
	// signal set to SIGCHLD, and other kinds of clone(2), are automatically
	// attached to the same tracer which traced their parent. SIGSTOP is
	// delivered to the children, causing them to enter signal-delivery-stop
	// after they exit the system call which created them." - ptrace(2)
	//
	// clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is
	// confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() =>
	// include/linux/ptrace.h:ptrace_init_task().
	if event || opts.InheritTracer {
		// Re-read the tracer under the TaskSet mutex; it may have detached
		// since the unlocked hasTracer check above.
		tracer := t.Tracer()
		if tracer != nil {
			child.ptraceTracer.Store(tracer)
			tracer.ptraceTracees[child] = struct{}{}
			// "The "seized" behavior ... is inherited by children that are
			// automatically attached using PTRACE_O_TRACEFORK,
			// PTRACE_O_TRACEVFORK, and PTRACE_O_TRACECLONE." - ptrace(2)
			child.ptraceSeized = t.ptraceSeized
			// "Flags are inherited by new tracees created and "auto-attached"
			// via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or
			// PTRACE_O_TRACECLONE options." - ptrace(2)
			child.ptraceOpts = t.ptraceOpts
			child.tg.signalHandlers.mu.Lock()
			// "PTRACE_SEIZE: ... Automatically attached children stop with
			// PTRACE_EVENT_STOP and WSTOPSIG(status) returns SIGTRAP instead
			// of having SIGSTOP signal delivered to them." - ptrace(2)
			if child.ptraceSeized {
				child.trapStopPending = true
			} else {
				child.pendingSignals.enqueue(&arch.SignalInfo{
					Signo: int32(linux.SIGSTOP),
				}, nil)
			}
			// The child will self-interrupt() when its task goroutine starts
			// running, so we don't have to.
			child.tg.signalHandlers.mu.Unlock()
		}
	}
	return event
}

// ptraceVforkDone is called after the end of a vfork stop. If t has a tracer
// with the TraceVforkDone option set, t is sent into a
// PTRACE_EVENT_VFORK_DONE stop and ptraceVforkDone returns true; otherwise it
// returns false. child is the new task's thread ID in t's PID namespace and
// becomes the event message.
func (t *Task) ptraceVforkDone(child ThreadID) bool {
	if !t.hasTracer() {
		return false
	}
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	wanted := t.ptraceOpts.TraceVforkDone
	if wanted {
		t.Debugf("Entering PTRACE_EVENT_VFORK_DONE stop")
		t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK_DONE, uint64(child))
	}
	return wanted
}

// ptraceExec is called at the end of an execve syscall. If t is traced with
// the TraceExec option it enters a PTRACE_EVENT_EXEC stop; if it is traced
// without that option and was not seized, it is sent a plain SIGTRAP instead.
// oldTID is t's thread ID, in its *tracer's* PID namespace, prior to the
// execve. (If t did not have a tracer at the time oldTID was read, oldTID may
// be 0. This is consistent with Linux.)
func (t *Task) ptraceExec(oldTID ThreadID) {
	if !t.hasTracer() {
		return
	}
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	switch {
	case !t.hasTracer():
		// Recheck with the TaskSet mutex locked. Most ptrace points don't need
		// to do this because detaching resets ptrace options, but
		// PTRACE_EVENT_EXEC is special because both TraceExec and !TraceExec
		// do something if a tracer is attached.
		return
	case t.ptraceOpts.TraceExec:
		t.Debugf("Entering PTRACE_EVENT_EXEC stop")
		t.ptraceEventLocked(linux.PTRACE_EVENT_EXEC, uint64(oldTID))
		return
	case t.ptraceSeized:
		// Seized tracees do not get the legacy SIGTRAP described below.
		return
	}
	// "If the PTRACE_O_TRACEEXEC option is not in effect for the execing
	// tracee, and if the tracee was PTRACE_ATTACHed rather that [sic]
	// PTRACE_SEIZEd, the kernel delivers an extra SIGTRAP to the tracee after
	// execve(2) returns. This is an ordinary signal (similar to one which can
	// be generated by `kill -TRAP`, not a special kind of ptrace-stop.
	// Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0
	// (SI_USER). This signal may be blocked by signal mask, and thus may be
	// delivered (much) later." - ptrace(2)
	trap := &arch.SignalInfo{
		Signo: int32(linux.SIGTRAP),
		Code:  arch.SignalInfoUser,
	}
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	t.sendSignalLocked(trap, false /* group */)
}

// ptraceExit is called early in the task exit path. If t has a tracer with
// the TraceExit option set, t is sent into a PTRACE_EVENT_EXIT stop whose
// event message is t's exit status.
func (t *Task) ptraceExit() {
	if !t.hasTracer() {
		return
	}
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	if !t.ptraceOpts.TraceExit {
		return
	}
	// The exit status is read under the signal mutex, which is released
	// again before the stop is started.
	var msg uint64
	t.tg.signalHandlers.mu.Lock()
	msg = uint64(t.exitStatus.Status())
	t.tg.signalHandlers.mu.Unlock()
	t.Debugf("Entering PTRACE_EVENT_EXIT stop")
	t.ptraceEventLocked(linux.PTRACE_EVENT_EXIT, msg)
}

// ptraceEventLocked stores msg as t's ptrace event message and triggers a
// PTRACE_EVENT stop for event on t.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) ptraceEventLocked(event int32, msg uint64) {
	// """
	// PTRACE_EVENT stops are observed by the tracer as waitpid(2) returning
	// with WIFSTOPPED(status), and WSTOPSIG(status) returns SIGTRAP. An
	// additional bit is set in the higher byte of the status word: the value
	// status>>8 will be
	//
	//   (SIGTRAP | PTRACE_EVENT_foo << 8).
	//
	// ...
	//
	// """ - ptrace(2)
	trapCode := int32(linux.SIGTRAP) | (event << 8)
	t.ptraceEventMsg = msg
	t.ptraceTrapLocked(trapCode)
}

// ptraceKill implements ptrace(PTRACE_KILL, target). t is the caller. It
// returns ESRCH if t is not target's tracer. Otherwise it returns nil, and
// has an effect only if target is currently in a ptrace-stop.
func (t *Task) ptraceKill(target *Task) error {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	if target.Tracer() != t {
		return syserror.ESRCH
	}
	target.tg.signalHandlers.mu.Lock()
	defer target.tg.signalHandlers.mu.Unlock()
	// "This operation is deprecated; do not use it! Instead, send a SIGKILL
	// directly using kill(2) or tgkill(2). The problem with PTRACE_KILL is
	// that it requires the tracee to be in signal-delivery-stop, otherwise it
	// may not work (i.e., may complete successfully but won't kill the
	// tracee)." - ptrace(2)
	//
	// The assertion fails both when target has no stop and when it is in a
	// stop of another kind; either way the request silently does nothing.
	if _, inPtraceStop := target.stop.(*ptraceStop); inPtraceStop {
		target.ptraceCode = int32(linux.SIGKILL)
		target.endInternalStopLocked()
	}
	return nil
}

// ptraceInterrupt implements ptrace(PTRACE_INTERRUPT, target). t is the
// caller. It returns ESRCH if t is not target's tracer and EIO if target was
// not seized. Otherwise it returns nil; unless target has been killed or has
// begun exiting, a trap stop is marked pending and target is interrupted.
func (t *Task) ptraceInterrupt(target *Task) error {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	switch {
	case target.Tracer() != t:
		return syserror.ESRCH
	case !target.ptraceSeized:
		return syserror.EIO
	}
	target.tg.signalHandlers.mu.Lock()
	defer target.tg.signalHandlers.mu.Unlock()
	if target.killedLocked() || target.exitState >= TaskExitInitiated {
		return nil
	}
	target.trapStopPending = true
	// A tracee sitting in a PTRACE_LISTEN stop is taken out of it.
	stop, inPtraceStop := target.stop.(*ptraceStop)
	if inPtraceStop && stop.listen {
		target.endInternalStopLocked()
	}
	target.interrupt()
	return nil
}

// ptraceSetOptionsLocked replaces t's ptrace options with those encoded by
// the PTRACE_O_* bits in opts. If opts contains any bit outside the supported
// set, it returns EINVAL and leaves t's options unchanged.
//
// Preconditions:
// * The TaskSet mutex must be locked for writing.
// * t must have a tracer.
func (t *Task) ptraceSetOptionsLocked(opts uintptr) error {
	const valid = uintptr(linux.PTRACE_O_EXITKILL |
		linux.PTRACE_O_TRACESYSGOOD |
		linux.PTRACE_O_TRACECLONE |
		linux.PTRACE_O_TRACEEXEC |
		linux.PTRACE_O_TRACEEXIT |
		linux.PTRACE_O_TRACEFORK |
		linux.PTRACE_O_TRACESECCOMP |
		linux.PTRACE_O_TRACEVFORK |
		linux.PTRACE_O_TRACEVFORKDONE)
	if unsupported := opts &^ valid; unsupported != 0 {
		return syserror.EINVAL
	}
	// isSet reports whether flag is present in opts.
	isSet := func(flag uintptr) bool {
		return opts&flag != 0
	}
	t.ptraceOpts = ptraceOptions{
		ExitKill:       isSet(linux.PTRACE_O_EXITKILL),
		SysGood:        isSet(linux.PTRACE_O_TRACESYSGOOD),
		TraceClone:     isSet(linux.PTRACE_O_TRACECLONE),
		TraceExec:      isSet(linux.PTRACE_O_TRACEEXEC),
		TraceExit:      isSet(linux.PTRACE_O_TRACEEXIT),
		TraceFork:      isSet(linux.PTRACE_O_TRACEFORK),
		TraceSeccomp:   isSet(linux.PTRACE_O_TRACESECCOMP),
		TraceVfork:     isSet(linux.PTRACE_O_TRACEVFORK),
		TraceVforkDone: isSet(linux.PTRACE_O_TRACEVFORKDONE),
	}
	return nil
}

// Ptrace implements the ptrace system call.
//
// req is the PTRACE_* request. pid names the target task in t's PID namespace
// and is ignored for PTRACE_TRACEME. The meaning of addr and data depends on
// req.
//
// ESRCH is returned if pid does not name a task. For every request other than
// PTRACE_TRACEME, PTRACE_ATTACH, PTRACE_SEIZE, PTRACE_KILL and
// PTRACE_INTERRUPT, ESRCH is also returned if t is not the target's tracer or
// if the target's ptrace-stop cannot be frozen. Requests not handled here are
// forwarded to t.ptraceArch.
func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
	// PTRACE_TRACEME ignores all other arguments.
	if req == linux.PTRACE_TRACEME {
		return t.ptraceTraceme()
	}
	// All other ptrace requests operate on a current or future tracee
	// specified by pid.
	target := t.tg.pidns.TaskWithID(pid)
	if target == nil {
		return syserror.ESRCH
	}

	// PTRACE_ATTACH and PTRACE_SEIZE do not require that target is not already
	// a tracee.
	if req == linux.PTRACE_ATTACH || req == linux.PTRACE_SEIZE {
		seize := req == linux.PTRACE_SEIZE
		if seize && addr != 0 {
			return syserror.EIO
		}
		return t.ptraceAttach(target, seize, uintptr(data))
	}
	// PTRACE_KILL and PTRACE_INTERRUPT require that the target is a tracee,
	// but do not require that it is ptrace-stopped.
	if req == linux.PTRACE_KILL {
		return t.ptraceKill(target)
	}
	if req == linux.PTRACE_INTERRUPT {
		return t.ptraceInterrupt(target)
	}
	// All other ptrace requests require that the target is a ptrace-stopped
	// tracee, and freeze the ptrace-stop so the tracee can be operated on.
	t.tg.pidns.owner.mu.RLock()
	if target.Tracer() != t {
		t.tg.pidns.owner.mu.RUnlock()
		return syserror.ESRCH
	}
	if !target.ptraceFreeze() {
		t.tg.pidns.owner.mu.RUnlock()
		// "Most ptrace commands (all except PTRACE_ATTACH, PTRACE_SEIZE,
		// PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the
		// tracee to be in a ptrace-stop, otherwise they fail with ESRCH." -
		// ptrace(2)
		return syserror.ESRCH
	}
	t.tg.pidns.owner.mu.RUnlock()
	// Even if the target has a ptrace-stop active, the tracee's task goroutine
	// may not yet have reached Task.doStop; wait for it to do so. This is safe
	// because there's no way for target to initiate a ptrace-stop and then
	// block (by calling Task.block) before entering it.
	//
	// Caveat: If tasks were just restored, the tracee's first call to
	// Task.Activate (in Task.run) occurs before its first call to Task.doStop,
	// which may block if the tracer's address space is active.
	t.UninterruptibleSleepStart(true)
	target.waitGoroutineStoppedOrExited()
	t.UninterruptibleSleepFinish(true)

	// Resuming commands end the ptrace stop, but only if successful; on
	// failure the stop is kept and merely unfrozen again.
	// PTRACE_LISTEN ends the ptrace stop if trapNotifyPending is already set on the
	// target.
	switch req {
	case linux.PTRACE_DETACH:
		if err := t.ptraceDetach(target, linux.Signal(data)); err != nil {
			target.ptraceUnfreeze()
			return err
		}
		return nil

	case linux.PTRACE_CONT:
		if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil {
			target.ptraceUnfreeze()
			return err
		}
		return nil

	case linux.PTRACE_SYSCALL:
		if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil {
			target.ptraceUnfreeze()
			return err
		}
		return nil

	case linux.PTRACE_SINGLESTEP:
		if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil {
			target.ptraceUnfreeze()
			return err
		}
		return nil

	case linux.PTRACE_SYSEMU:
		if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil {
			target.ptraceUnfreeze()
			return err
		}
		return nil

	case linux.PTRACE_SYSEMU_SINGLESTEP:
		if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil {
			target.ptraceUnfreeze()
			return err
		}
		return nil

	case linux.PTRACE_LISTEN:
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		// PTRACE_LISTEN is rejected unless the tracee was seized and its
		// recorded siginfo carries PTRACE_EVENT_STOP.
		if !target.ptraceSeized ||
			target.ptraceSiginfo == nil ||
			target.ptraceSiginfo.Code>>8 != linux.PTRACE_EVENT_STOP {
			// As with the resuming commands above, a failed request must not
			// leave the ptrace-stop frozen. The signal mutex is not held yet,
			// so the locking variant of unfreeze is used.
			target.ptraceUnfreeze()
			return syserror.EIO
		}
		target.tg.signalHandlers.mu.Lock()
		defer target.tg.signalHandlers.mu.Unlock()
		if target.trapNotifyPending {
			target.endInternalStopLocked()
		} else {
			target.stop.(*ptraceStop).listen = true
			target.ptraceUnfreezeLocked()
		}
		return nil
	}

	// All other ptrace requests expect us to unfreeze the stop.
	defer target.ptraceUnfreeze()

	switch req {
	case linux.PTRACE_PEEKTEXT, linux.PTRACE_PEEKDATA:
		// "At the system call level, the PTRACE_PEEKTEXT, PTRACE_PEEKDATA, and
		// PTRACE_PEEKUSER requests have a different API: they store the result
		// at the address specified by the data parameter, and the return value
		// is the error flag." - ptrace(2)
		word := t.Arch().Native(0)
		// Read one native word from the tracee at addr, ignoring page
		// permissions, then store it in the tracer's memory at data.
		if _, err := word.CopyIn(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr); err != nil {
			return err
		}
		_, err := word.CopyOut(t, data)
		return err

	case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA:
		// The word to write is the data argument itself; it is stored in the
		// tracee's memory at addr, ignoring page permissions.
		word := t.Arch().Native(uintptr(data))
		_, err := word.CopyOut(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr)
		return err

	case linux.PTRACE_GETREGSET:
		// "Read the tracee's registers. addr specifies, in an
		// architecture-dependent way, the type of registers to be read. ...
		// data points to a struct iovec, which describes the destination
		// buffer's location and length. On return, the kernel modifies iov.len
		// to indicate the actual number of bytes returned." - ptrace(2)
		ars, err := t.CopyInIovecs(data, 1)
		if err != nil {
			return err
		}

		// NOTE(review): this pulls full state for t (the caller), while the
		// registers read below belong to target — confirm this is intended.
		t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())

		ar := ars.Head()
		n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{
			Ctx:  t,
			IO:   t.MemoryManager(),
			Addr: ar.Start,
			Opts: usermem.IOOpts{
				AddressSpaceActive: true,
			},
		}, int(ar.Length()))
		if err != nil {
			return err
		}

		// Update iovecs to represent the range of the written register set.
		end, ok := ar.Start.AddLength(uint64(n))
		if !ok {
			panic(fmt.Sprintf("%#x + %#x overflows. Invalid reg size > %#x", ar.Start, n, ar.Length()))
		}
		ar.End = end
		return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar))

	case linux.PTRACE_SETREGSET:
		// "Modify the tracee's registers. The meaning of addr and data is
		// analogous to PTRACE_GETREGSET." - ptrace(2)
		ars, err := t.CopyInIovecs(data, 1)
		if err != nil {
			return err
		}

		// Named tracerMM so that the imported mm package is not shadowed.
		tracerMM := t.MemoryManager()
		// NOTE(review): as in PTRACE_GETREGSET, full state is pulled (and
		// marked changed below) for t rather than target — confirm this is
		// intended.
		t.p.PullFullState(tracerMM.AddressSpace(), t.Arch())

		ar := ars.Head()
		n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{
			Ctx:  t,
			IO:   tracerMM,
			Addr: ar.Start,
			Opts: usermem.IOOpts{
				AddressSpaceActive: true,
			},
		}, int(ar.Length()))
		if err != nil {
			return err
		}
		t.p.FullStateChanged()

		// Update iovecs to represent the range of the register set that was
		// consumed, so that iov.len reports the number of bytes used, exactly
		// as PTRACE_GETREGSET does.
		end, ok := ar.Start.AddLength(uint64(n))
		if !ok {
			panic(fmt.Sprintf("%#x + %#x overflows. Invalid reg size > %#x", ar.Start, n, ar.Length()))
		}
		ar.End = end
		return t.CopyOutIovecs(data, usermem.AddrRangeSeqOf(ar))

	case linux.PTRACE_GETSIGINFO:
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		if target.ptraceSiginfo == nil {
			return syserror.EINVAL
		}
		_, err := target.ptraceSiginfo.CopyOut(t, data)
		return err

	case linux.PTRACE_SETSIGINFO:
		// Copy the new siginfo in before taking the TaskSet mutex.
		var info arch.SignalInfo
		if _, err := info.CopyIn(t, data); err != nil {
			return err
		}
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		// Siginfo can only be replaced if the target already has one.
		if target.ptraceSiginfo == nil {
			return syserror.EINVAL
		}
		target.ptraceSiginfo = &info
		return nil

	case linux.PTRACE_GETSIGMASK:
		// addr carries the size of the signal set supplied by the caller.
		if addr != linux.SignalSetSize {
			return syserror.EINVAL
		}
		mask := target.SignalMask()
		_, err := mask.CopyOut(t, data)
		return err

	case linux.PTRACE_SETSIGMASK:
		// addr carries the size of the signal set supplied by the caller.
		if addr != linux.SignalSetSize {
			return syserror.EINVAL
		}
		var mask linux.SignalSet
		if _, err := mask.CopyIn(t, data); err != nil {
			return err
		}
		// The target's task goroutine is stopped, so this is safe:
		target.SetSignalMask(mask &^ UnblockableSignals)
		return nil

	case linux.PTRACE_SETOPTIONS:
		// ptraceSetOptionsLocked requires the TaskSet mutex to be locked for
		// writing.
		t.tg.pidns.owner.mu.Lock()
		defer t.tg.pidns.owner.mu.Unlock()
		return target.ptraceSetOptionsLocked(uintptr(data))

	case linux.PTRACE_GETEVENTMSG:
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		_, err := primitive.CopyUint64Out(t, data, target.ptraceEventMsg)
		return err

	// PEEKSIGINFO is unimplemented but seems to have no users anywhere.

	default:
		return t.ptraceArch(target, req, addr, data)
	}
}