pkg/sentry/kernel/task_exec.go


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

// This file implements the machinery behind the execve() syscall. In brief, a
// thread executes an execve() by killing all other threads in its thread
// group, assuming the leader's identity, and then switching process images.
//
// This design is effectively mandated by Linux. From ptrace(2):
//
// """
// execve(2) under ptrace
//     When one thread in a multithreaded process calls execve(2), the
//     kernel destroys all other threads in the process, and resets the
//     thread ID of the execing thread to the thread group ID (process ID).
//     (Or, to put things another way, when a multithreaded process does an
//     execve(2), at completion of the call, it appears as though the
//     execve(2) occurred in the thread group leader, regardless of which
//     thread did the execve(2).)  This resetting of the thread ID looks
//     very confusing to tracers:
//
//     *  All other threads stop in PTRACE_EVENT_EXIT stop, if the
//        PTRACE_O_TRACEEXIT option was turned on.  Then all other threads
//        except the thread group leader report death as if they exited via
//        _exit(2) with exit code 0.
//
//     *  The execing tracee changes its thread ID while it is in the
//        execve(2).  (Remember, under ptrace, the "pid" returned from
//        waitpid(2), or fed into ptrace calls, is the tracee's thread ID.)
//        That is, the tracee's thread ID is reset to be the same as its
//        process ID, which is the same as the thread group leader's thread
//        ID.
//
//     *  Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC
//        option was turned on.
//
//     *  If the thread group leader has reported its PTRACE_EVENT_EXIT stop
//        by this time, it appears to the tracer that the dead thread leader
//        "reappears from nowhere".  (Note: the thread group leader does not
//        report death via WIFEXITED(status) until there is at least one
//        other live thread.  This eliminates the possibility that the
//        tracer will see it dying and then reappearing.)  If the thread
//        group leader was still alive, for the tracer this may look as if
//        thread group leader returns from a different system call than it
//        entered, or even "returned from a system call even though it was
//        not in any system call".  If the thread group leader was not
//        traced (or was traced by a different tracer), then during
//        execve(2) it will appear as if it has become a tracee of the
//        tracer of the execing tracee.
//
//     All of the above effects are the artifacts of the thread ID change in
//     the tracee.
// """

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
)

// execStop is a TaskStop that a task sets on itself when it wants to execve
// and is waiting for the other tasks in its thread group to exit first.
//
// +stateify savable
type execStop struct{}

// Killable implements TaskStop.Killable.
func (*execStop) Killable() bool { return true }

// Execve implements the execve(2) syscall by killing all other tasks in its
// thread group and switching to newImage. Execve always takes ownership of
// newImage.
//
// Preconditions: The caller must be running Task.doSyscallInvoke on the task
// goroutine.
func (t *Task) Execve(newImage *TaskImage) (*SyscallControl, error) {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()

	if t.tg.exiting || t.tg.execing != nil {
		// We lost to a racing group-exit, kill, or exec from another thread
		// and should just exit.
		newImage.release()
		return nil, syserror.EINTR
	}

	// Cancel any racing group stops.
	t.tg.endGroupStopLocked(false)

	// If the task has any siblings, they have to exit before the exec can
	// continue.
	t.tg.execing = t
	if t.tg.tasks.Front() != t.tg.tasks.Back() {
		// "[All] other threads except the thread group leader report death as
		// if they exited via _exit(2) with exit code 0." - ptrace(2)
		for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
			if t != sibling {
				sibling.killLocked()
			}
		}
		// The last sibling to exit will wake t.
		t.beginInternalStopLocked((*execStop)(nil))
	}

	return &SyscallControl{next: &runSyscallAfterExecStop{newImage}, ignoreReturn: true}, nil
}

// The runSyscallAfterExecStop state continues execve(2) after all siblings of
// a thread in the execve syscall have exited.
//
// +stateify savable
type runSyscallAfterExecStop struct {
	image *TaskImage
}

func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
	t.traceExecEvent(r.image)
	t.tg.pidns.owner.mu.Lock()
	t.tg.execing = nil
	if t.killed() {
		t.tg.pidns.owner.mu.Unlock()
		r.image.release()
		return (*runInterrupt)(nil)
	}
	// We are the thread group leader now. Save our old thread ID for
	// PTRACE_EVENT_EXEC. This is racy in that if a tracer attaches after this
	// point it will get a PID of 0, but this is consistent with Linux.
	oldTID := ThreadID(0)
	if tracer := t.Tracer(); tracer != nil {
		oldTID = tracer.tg.pidns.tids[t]
	}
	t.promoteLocked()
	// "POSIX timers are not preserved (timer_create(2))." - execve(2). Handle
	// this first since POSIX timers are protected by the signal mutex, which
	// we're about to change. Note that we have to stop and destroy timers
	// without holding any mutexes to avoid circular lock ordering.
	var its []*IntervalTimer
	t.tg.signalHandlers.mu.Lock()
	for _, it := range t.tg.timers {
		its = append(its, it)
	}
	t.tg.timers = make(map[linux.TimerID]*IntervalTimer)
	t.tg.signalHandlers.mu.Unlock()
	t.tg.pidns.owner.mu.Unlock()
	for _, it := range its {
		it.DestroyTimer()
	}
	t.tg.pidns.owner.mu.Lock()
	// "During an execve(2), the dispositions of handled signals are reset to
	// the default; the dispositions of ignored signals are left unchanged. ...
	// [The] signal mask is preserved across execve(2). ... [The] pending
	// signal set is preserved across an execve(2)." - signal(7)
	//
	// Details:
	//
	// - If the thread group is sharing its signal handlers with another thread
	// group via CLONE_SIGHAND, execve forces the signal handlers to be copied
	// (see Linux's fs/exec.c:de_thread). We're not reference-counting signal
	// handlers, so we always make a copy.
	//
	// - "Disposition" only means sigaction::sa_handler/sa_sigaction; flags,
	// restorer (if present), and mask are always reset. (See Linux's
	// fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.)
	t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec()
	t.endStopCond.L = &t.tg.signalHandlers.mu
	// "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2)
	t.signalStack = linux.SignalStack{Flags: linux.SS_DISABLE}
	// "The termination signal is reset to SIGCHLD (see clone(2))."
	t.tg.terminationSignal = linux.SIGCHLD
	// execed indicates that the process can no longer join a process group
	// in some scenarios (namely, the parent call setpgid(2) on the child).
	// See the JoinProcessGroup function in sessions.go for more context.
	t.tg.execed = true
	// Maximum RSS is preserved across execve(2).
	t.updateRSSLocked()
	// Restartable sequence state is discarded.
	t.rseqPreempted = false
	t.rseqCPU = -1
	t.rseqAddr = 0
	t.rseqSignature = 0
	t.oldRSeqCPUAddr = 0
	t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
	t.tg.pidns.owner.mu.Unlock()

	oldFDTable := t.fdTable
	t.fdTable = t.fdTable.Fork(t)
	oldFDTable.DecRef(t)

	// Remove FDs with the CloseOnExec flag set.
	t.fdTable.RemoveIf(t, func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
		return flags.CloseOnExec
	})

	// Handle the robust futex list.
	t.exitRobustList()

	// NOTE(b/30815691): We currently do not implement privileged
	// executables (set-user/group-ID bits and file capabilities). This
	// allows us to unconditionally enable user dumpability on the new mm.
	// See fs/exec.c:setup_new_exec.
	r.image.MemoryManager.SetDumpability(mm.UserDumpable)

	// Switch to the new process.
	t.MemoryManager().Deactivate()
	t.mu.Lock()
	// Update credentials to reflect the execve. This should precede switching
	// MMs to ensure that dumpability has been reset first, if needed.
	t.updateCredsForExecLocked()
	t.image.release()
	t.image = *r.image
	t.mu.Unlock()
	t.unstopVforkParent()
	t.p.FullStateChanged()
	// NOTE(b/30316266): All locks must be dropped prior to calling Activate.
	t.MemoryManager().Activate(t)

	t.ptraceExec(oldTID)
	return (*runSyscallExit)(nil)
}

// promoteLocked makes t the leader of its thread group. If t is already the
// thread group leader, promoteLocked is a no-op.
//
// Preconditions:
// * All other tasks in t's thread group, including the existing leader (if it
//   is not t), have reached TaskExitZombie.
// * The TaskSet mutex must be locked for writing.
func (t *Task) promoteLocked() {
	oldLeader := t.tg.leader
	if t == oldLeader {
		return
	}
	// Swap the leader's TIDs with the execing task's. The latter will be
	// released when the old leader is reaped below.
	for ns := t.tg.pidns; ns != nil; ns = ns.parent {
		oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader]
		ns.tids[oldLeader] = oldTID
		ns.tids[t] = leaderTID
		ns.tasks[oldTID] = oldLeader
		ns.tasks[leaderTID] = t
		// Neither the ThreadGroup nor TGID change, so no need to
		// update ns.tgids.
	}

	// Inherit the old leader's start time.
	oldStartTime := oldLeader.StartTime()
	t.mu.Lock()
	t.startTime = oldStartTime
	t.mu.Unlock()

	t.tg.leader = t
	t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t])
	t.updateInfoLocked()
	// Reap the original leader. If it has a tracer, detach it instead of
	// waiting for it to acknowledge the original leader's death.
	oldLeader.exitParentNotified = true
	oldLeader.exitParentAcked = true
	if tracer := oldLeader.Tracer(); tracer != nil {
		delete(tracer.ptraceTracees, oldLeader)
		oldLeader.forgetTracerLocked()
		// Notify the tracer that it will no longer be receiving these events
		// from the tracee.
		tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue)
	}
	oldLeader.exitNotifyLocked(false)
}