summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel/task_exec.go
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel/task_exec.go')
-rw-r--r--pkg/sentry/kernel/task_exec.go262
1 files changed, 262 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
new file mode 100644
index 000000000..5d1425d5c
--- /dev/null
+++ b/pkg/sentry/kernel/task_exec.go
@@ -0,0 +1,262 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// This file implements the machinery behind the execve() syscall. In brief, a
+// thread executes an execve() by killing all other threads in its thread
+// group, assuming the leader's identity, and then switching process images.
+//
+// This design is effectively mandated by Linux. From ptrace(2):
+//
+// """
+// execve(2) under ptrace
+// When one thread in a multithreaded process calls execve(2), the
+// kernel destroys all other threads in the process, and resets the
+// thread ID of the execing thread to the thread group ID (process ID).
+// (Or, to put things another way, when a multithreaded process does an
+// execve(2), at completion of the call, it appears as though the
+// execve(2) occurred in the thread group leader, regardless of which
+// thread did the execve(2).) This resetting of the thread ID looks
+// very confusing to tracers:
+//
+// * All other threads stop in PTRACE_EVENT_EXIT stop, if the
+// PTRACE_O_TRACEEXIT option was turned on. Then all other threads
+// except the thread group leader report death as if they exited via
+// _exit(2) with exit code 0.
+//
+// * The execing tracee changes its thread ID while it is in the
+// execve(2). (Remember, under ptrace, the "pid" returned from
+// waitpid(2), or fed into ptrace calls, is the tracee's thread ID.)
+// That is, the tracee's thread ID is reset to be the same as its
+// process ID, which is the same as the thread group leader's thread
+// ID.
+//
+// * Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC
+// option was turned on.
+//
+// * If the thread group leader has reported its PTRACE_EVENT_EXIT stop
+// by this time, it appears to the tracer that the dead thread leader
+// "reappears from nowhere". (Note: the thread group leader does not
+// report death via WIFEXITED(status) until there is at least one
+// other live thread. This eliminates the possibility that the
+// tracer will see it dying and then reappearing.) If the thread
+// group leader was still alive, for the tracer this may look as if
+// thread group leader returns from a different system call than it
+// entered, or even "returned from a system call even though it was
+// not in any system call". If the thread group leader was not
+// traced (or was traced by a different tracer), then during
+// execve(2) it will appear as if it has become a tracee of the
+// tracer of the execing tracee.
+//
+// All of the above effects are the artifacts of the thread ID change in
+// the tracee.
+// """
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// execStop is a TaskStop that a task sets on itself when it wants to execve
+// and is waiting for the other tasks in its thread group to exit first.
+//
+// +stateify savable
+type execStop struct{}
+
+// Killable implements TaskStop.Killable.
+func (*execStop) Killable() bool { return true }
+
+// Execve implements the execve(2) syscall by killing all other tasks in its
+// thread group and switching to newTC. Execve always takes ownership of newTC.
+//
+// Preconditions: The caller must be running Task.doSyscallInvoke on the task
+// goroutine.
+func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) {
+ t.tg.pidns.owner.mu.Lock()
+ defer t.tg.pidns.owner.mu.Unlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+
+ if t.tg.exiting || t.tg.execing != nil {
+ // We lost to a racing group-exit, kill, or exec from another thread
+ // and should just exit.
+ newTC.release()
+ return nil, syserror.EINTR
+ }
+
+ // Cancel any racing group stops.
+ t.tg.endGroupStopLocked(false)
+
+ // If the task has any siblings, they have to exit before the exec can
+ // continue.
+ t.tg.execing = t
+ if t.tg.tasks.Front() != t.tg.tasks.Back() {
+ // "[All] other threads except the thread group leader report death as
+ // if they exited via _exit(2) with exit code 0." - ptrace(2)
+ for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
+ if t != sibling {
+ sibling.killLocked()
+ }
+ }
+ // The last sibling to exit will wake t.
+ t.beginInternalStopLocked((*execStop)(nil))
+ }
+
+ return &SyscallControl{next: &runSyscallAfterExecStop{newTC}, ignoreReturn: true}, nil
+}
+
+// The runSyscallAfterExecStop state continues execve(2) after all siblings of
+// a thread in the execve syscall have exited.
+//
+// +stateify savable
+type runSyscallAfterExecStop struct {
+ tc *TaskContext
+}
+
+func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
+ t.tg.pidns.owner.mu.Lock()
+ t.tg.execing = nil
+ if t.killed() {
+ t.tg.pidns.owner.mu.Unlock()
+ r.tc.release()
+ return (*runInterrupt)(nil)
+ }
+ // We are the thread group leader now. Save our old thread ID for
+ // PTRACE_EVENT_EXEC. This is racy in that if a tracer attaches after this
+ // point it will get a PID of 0, but this is consistent with Linux.
+ oldTID := ThreadID(0)
+ if tracer := t.Tracer(); tracer != nil {
+ oldTID = tracer.tg.pidns.tids[t]
+ }
+ t.promoteLocked()
+ // "POSIX timers are not preserved (timer_create(2))." - execve(2). Handle
+ // this first since POSIX timers are protected by the signal mutex, which
+ // we're about to change. Note that we have to stop and destroy timers
+ // without holding any mutexes to avoid circular lock ordering.
+ var its []*IntervalTimer
+ t.tg.signalHandlers.mu.Lock()
+ for _, it := range t.tg.timers {
+ its = append(its, it)
+ }
+ t.tg.timers = make(map[linux.TimerID]*IntervalTimer)
+ t.tg.signalHandlers.mu.Unlock()
+ t.tg.pidns.owner.mu.Unlock()
+ for _, it := range its {
+ it.DestroyTimer()
+ }
+ t.tg.pidns.owner.mu.Lock()
+ // "During an execve(2), the dispositions of handled signals are reset to
+ // the default; the dispositions of ignored signals are left unchanged. ...
+ // [The] signal mask is preserved across execve(2). ... [The] pending
+ // signal set is preserved across an execve(2)." - signal(7)
+ //
+ // Details:
+ //
+ // - If the thread group is sharing its signal handlers with another thread
+ // group via CLONE_SIGHAND, execve forces the signal handlers to be copied
+ // (see Linux's fs/exec.c:de_thread). We're not reference-counting signal
+ // handlers, so we always make a copy.
+ //
+ // - "Disposition" only means sigaction::sa_handler/sa_sigaction; flags,
+ // restorer (if present), and mask are always reset. (See Linux's
+ // fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.)
+ t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec()
+ t.endStopCond.L = &t.tg.signalHandlers.mu
+ // "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2)
+ t.signalStack = arch.SignalStack{Flags: arch.SignalStackFlagDisable}
+ // "The termination signal is reset to SIGCHLD (see clone(2))."
+ t.tg.terminationSignal = linux.SIGCHLD
+ // execed indicates that the process can no longer join a process group
+ // in some scenarios (namely, the parent call setpgid(2) on the child).
+ // See the JoinProcessGroup function in sessions.go for more context.
+ t.tg.execed = true
+ // Maximum RSS is preserved across execve(2).
+ t.updateRSSLocked()
+ // Restartable sequence state is discarded.
+ t.rseqPreempted = false
+ t.rseqCPUAddr = 0
+ t.rseqCPU = -1
+ t.tg.rscr.Store(&RSEQCriticalRegion{})
+ t.tg.pidns.owner.mu.Unlock()
+
+ // Remove FDs with the CloseOnExec flag set.
+ t.fds.RemoveIf(func(file *fs.File, flags FDFlags) bool {
+ return flags.CloseOnExec
+ })
+
+ // Switch to the new process.
+ t.MemoryManager().Deactivate()
+ t.mu.Lock()
+ // Update credentials to reflect the execve. This should precede switching
+ // MMs to ensure that dumpability has been reset first, if needed.
+ t.updateCredsForExecLocked()
+ t.tc.release()
+ t.tc = *r.tc
+ t.mu.Unlock()
+ t.unstopVforkParent()
+ // NOTE(b/30316266): All locks must be dropped prior to calling Activate.
+ t.MemoryManager().Activate()
+
+ t.ptraceExec(oldTID)
+ return (*runSyscallExit)(nil)
+}
+
+// promoteLocked makes t the leader of its thread group. If t is already the
+// thread group leader, promoteLocked is a no-op.
+//
+// Preconditions: All other tasks in t's thread group, including the existing
+// leader (if it is not t), have reached TaskExitZombie. The TaskSet mutex must
+// be locked for writing.
+func (t *Task) promoteLocked() {
+ oldLeader := t.tg.leader
+ if t == oldLeader {
+ return
+ }
+ // Swap the leader's TIDs with the execing task's. The latter will be
+ // released when the old leader is reaped below.
+ for ns := t.tg.pidns; ns != nil; ns = ns.parent {
+ oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader]
+ ns.tids[oldLeader] = oldTID
+ ns.tids[t] = leaderTID
+ ns.tasks[oldTID] = oldLeader
+ ns.tasks[leaderTID] = t
+ // Neither the ThreadGroup nor TGID change, so no need to
+ // update ns.tgids.
+ }
+
+ // Inherit the old leader's start time.
+ oldStartTime := oldLeader.StartTime()
+ t.mu.Lock()
+ t.startTime = oldStartTime
+ t.mu.Unlock()
+
+ t.tg.leader = t
+ t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t])
+ t.updateLogPrefixLocked()
+ // Reap the original leader. If it has a tracer, detach it instead of
+ // waiting for it to acknowledge the original leader's death.
+ oldLeader.exitParentNotified = true
+ oldLeader.exitParentAcked = true
+ if tracer := oldLeader.Tracer(); tracer != nil {
+ delete(tracer.ptraceTracees, oldLeader)
+ oldLeader.forgetTracerLocked()
+ // Notify the tracer that it will no longer be receiving these events
+ // from the tracee.
+ tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue)
+ }
+ oldLeader.exitNotifyLocked(false)
+}