1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
|
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package kernel
import (
"bytes"
"runtime"
"runtime/trace"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/usermem"
)
// taskRunState is one reified state of the task state machine. README.md
// describes the machine; run_states.dot holds the canonical list of run
// states and of the transitions between them.
//
// Every possible state is defined by the kernel package, so the set is closed
// and enumerable. A discriminated union would be the natural representation,
// but Go has no sum types, so an interface is used instead.
//
// As with TaskStop, a taskRunState that carries no data should be represented
// as a typecast nil so that producing it does not allocate.
type taskRunState interface {
	// execute runs the code belonging to this state against t and returns
	// the state that follows. A nil result tells the task goroutine to exit.
	//
	// An implementation may tail-call the following state's execute
	// directly. Doing so skips both the conversion of that state to an
	// interface object and the check for stops, and is valid only when the
	// tail-call cannot recurse.
	execute(t *Task) taskRunState
}
// run is the body of the task goroutine: it drives t from one run state to
// the next until a state's execute returns nil, then retires the goroutine.
//
// threadID is a dummy value, set to the task's TID in the root PID namespace,
// that exists only so it shows up in stack dumps. The goroutine belonging to
// a given task can be identified by searching for that value as Task.run()'s
// argument.
func (t *Task) run(threadID uintptr) {
	// t.blockingTimer is built here rather than in Task.afterLoad() because,
	// during restore, kernel.timekeeper.SetClocks() has not been called yet
	// at that point, so the timer cannot be reconstructed there.
	notifier, timerChan := ktime.NewChannelNotifier()
	t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), notifier)
	defer t.blockingTimer.Destroy()
	t.blockingTimerChan = timerChan

	// Activate our address space. The matching t.Deactivate is performed in
	// the exit path (runExitMain.execute), so that when
	// Platform.CooperativelySharesAddressSpace() == true the AddressSpace is
	// given up before the task goroutine finishes executing.
	t.Activate()

	// The task always starts out interrupted: a newly-started task must
	// check whether it participates in a group stop, and a task resuming
	// after restore was interrupted by the save.
	t.interruptSelf()

	for {
		// Why the stop check comes first:
		//
		// - A freshly-started task that is stopped must not do anything
		// before it enters the stop.
		//
		// - When taskRunState.execute returns nil, the task goroutine must
		// exit without checking for a stop.
		//
		// - Task.Start does not start Task.run when t.runState is nil, so
		// calling execute right after doStop is safe.
		t.doStop()
		if t.runState = t.runState.execute(t); t.runState != nil {
			continue
		}

		// No following state: account for the goroutine going away and
		// release everything that was waiting on it.
		t.accountTaskGoroutineEnter(TaskGoroutineNonexistent)
		t.goroutineStopped.Done()
		t.tg.liveGoroutines.Done()
		t.tg.pidns.owner.liveGoroutines.Done()
		t.tg.pidns.owner.runningGoroutines.Done()
		t.p.Release()

		// The stack trace for dead variables may not be correct, so keep
		// the argument alive until the very end.
		runtime.KeepAlive(threadID)
		return
	}
}
// doStop is called by Task.run to block until the task is not stopped.
//
// If t.stopCount is zero it returns immediately. Otherwise it deactivates the
// task, records the goroutine as stopped, and waits on t.endStopCond until
// t.stopCount is no longer positive. Every step is undone by a matching defer,
// so the undo order on return is the reverse of the setup order below.
func (t *Task) doStop() {
// Fast path: nothing to wait for. This check runs without the signal mutex
// held, hence the atomic load.
if atomic.LoadInt32(&t.stopCount) == 0 {
return
}
t.Deactivate()
// NOTE(b/30316266): t.Activate() must be called without any locks held, so
// this defer must precede the defer for unlocking the signal mutex.
defer t.Activate()
t.accountTaskGoroutineEnter(TaskGoroutineStopped)
defer t.accountTaskGoroutineLeave(TaskGoroutineStopped)
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
// While stopped, this goroutine does not count as running; the count is
// restored on return.
t.tg.pidns.owner.runningGoroutines.Add(-1)
defer t.tg.pidns.owner.runningGoroutines.Add(1)
// Dropping goroutineStopped here is what lets
// waitGoroutineStoppedOrExited observe the stop; it is raised again once
// the stop ends.
t.goroutineStopped.Add(-1)
defer t.goroutineStopped.Add(1)
// The signal mutex is held here, so stopCount is read directly.
// NOTE(review): presumably endStopCond is bound to the signal mutex and
// releases it while waiting — confirm against its construction.
for t.stopCount > 0 {
t.endStopCond.Wait()
}
}
// handleCPUIDInstruction checks whether the instruction at the task's current
// instruction pointer is CPUID and, if it is, emulates it and moves the
// instruction pointer past it.
//
// It returns nil when a CPUID instruction was emulated. It returns
// platform.ErrContextSignal when CPUID emulation is not supported, when the
// instruction bytes cannot be copied in, or when those bytes are not CPUID.
func (*runApp) handleCPUIDInstruction(t *Task) error {
	if len(arch.CPUIDInstruction) == 0 {
		// There is no CPUID emulation here, yet this path is still
		// reachable: the ptrace platform also returns
		// ErrContextSignalCPUID on page faults. See
		// pkg/sentry/platform/ptrace/ptrace.go:context.Switch for more
		// details.
		return platform.ErrContextSignal
	}

	// Read as many bytes as the CPUID encoding occupies from the current
	// instruction pointer and compare them with that encoding.
	region := trace.StartRegion(t.traceContext, cpuidRegion)
	want := arch.CPUIDInstruction[:]
	got := make([]byte, len(want))
	if _, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &got); err != nil || !bytes.Equal(want, got) {
		// Not an actual CPUID, but the copy-in was still required.
		region.End()
		return platform.ErrContextSignal
	}

	// Emulate the instruction, then skip over it.
	t.Arch().CPUIDEmulate(t)
	t.Arch().SetIP(t.Arch().IP() + uintptr(len(want)))
	region.End()
	return nil
}
// The runApp state checks for interrupts before executing untrusted
// application code.
//
// runApp carries no data; its execute method hands the state around as the
// typecast nil (*runApp)(nil).
//
// +stateify savable
type runApp struct{}
// execute implements taskRunState.execute for the runApp state. One call
// performs a single pass of the application loop:
//
//   - If the task is interrupted, hand over to runInterrupt.
//   - Restart a syscall whose restart errno was never translated, and restore
//     a saved signal mask.
//   - Apply restartable-sequence (rseq) bookkeeping after a preemption.
//   - Set the single-step flag if a tracer requested single-stepping.
//   - Switch to the application with t.p.Switch and dispatch on its result.
//
// It returns (*runInterrupt)(nil) when the task is interrupted, the result of
// t.doSyscall() when the application made a system call, the result of
// t.doVsyscall() for an emulated vsyscall, (*runExit)(nil) after an unexpected
// Switch error, and (*runApp)(nil) in every other case so that the run loop
// re-enters this state.
func (app *runApp) execute(t *Task) taskRunState {
if t.interrupted() {
// Checkpointing instructs tasks to stop by sending an interrupt, so we
// must check for stops before entering runInterrupt (instead of
// tail-calling it).
return (*runInterrupt)(nil)
}
// We're about to switch to the application again. If there's still an
// unhandled SyscallRestartErrno that wasn't translated to an EINTR,
// restart the syscall that was interrupted. If there's a saved signal
// mask, restore it. (Note that restoring the saved signal mask may unblock
// a pending signal, causing another interruption, but that signal should
// not interact with the interrupted syscall.)
if t.haveSyscallReturn {
if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
if sre == ERESTART_RESTARTBLOCK {
t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
t.Arch().RestartSyscallWithRestartBlock()
} else {
t.Debugf("Restarting syscall %d after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
t.Arch().RestartSyscall()
}
}
t.haveSyscallReturn = false
}
if t.haveSavedSignalMask {
t.SetSignalMask(t.savedSignalMask)
t.haveSavedSignalMask = false
// Restoring the mask may have unblocked a pending signal, so check for
// an interruption again before running the application.
if t.interrupted() {
return (*runInterrupt)(nil)
}
}
// Apply restartable sequences.
if t.rseqPreempted {
t.rseqPreempted = false
if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 {
// Linux writes the CPU on every preemption. We only do
// so if it changed. Thus we may delay delivery of
// SIGSEGV if rseqAddr/oldRSeqCPUAddr is invalid.
cpu := int32(hostcpu.GetCPU())
if t.rseqCPU != cpu {
t.rseqCPU = cpu
if err := t.rseqCopyOutCPU(); err != nil {
t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
t.forceSignal(linux.SIGSEGV, false)
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
// Re-enter the task run loop for signal delivery.
return (*runApp)(nil)
}
if err := t.oldRSeqCopyOutCPU(); err != nil {
t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err)
t.forceSignal(linux.SIGSEGV, false)
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
// Re-enter the task run loop for signal delivery.
return (*runApp)(nil)
}
}
}
t.rseqInterrupt()
}
// Check if we need to enable single-stepping. Tracers expect that the
// kernel preserves the value of the single-step flag set by PTRACE_SETREGS
// whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this
// includes our ptrace platform, by the way), so we should only clear the
// single-step flag if we're responsible for setting it. (clearSinglestep
// is therefore analogous to Linux's TIF_FORCED_TF.)
//
// Strictly speaking, we should also not clear the single-step flag if we
// single-step through an instruction that sets the single-step flag
// (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their
// own TF. (Famous last words, I know.)
clearSinglestep := false
if t.hasTracer() {
// t.ptraceSinglestep is read with the owner's mutex read-locked.
t.tg.pidns.owner.mu.RLock()
if t.ptraceSinglestep {
// Remember whether the flag was already set before we set it; it is
// cleared after the switch only if it was not.
clearSinglestep = !t.Arch().SingleStep()
t.Arch().SetSingleStep()
}
t.tg.pidns.owner.mu.RUnlock()
}
// Switch to the application. The switch is bracketed by a trace region and
// by task goroutine accounting, so the goroutine is accounted as
// TaskGoroutineRunningApp for exactly the duration of the switch.
region := trace.StartRegion(t.traceContext, runRegion)
t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU)
t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
region.End()
// Undo the single-step flag only if it was set above on the tracer's
// behalf.
if clearSinglestep {
t.Arch().ClearSingleStep()
}
switch err {
case nil:
// Handle application system call.
return t.doSyscall()
case platform.ErrContextInterrupt:
// Interrupted by platform.Context.Interrupt(). Re-enter the run
// loop to figure out why.
return (*runApp)(nil)
case platform.ErrContextSignalCPUID:
if err := app.handleCPUIDInstruction(t); err == nil {
// Resume execution.
return (*runApp)(nil)
}
// The instruction at the given RIP was not a CPUID, and we
// fall through to the default signal delivery behavior below.
fallthrough
case platform.ErrContextSignal:
// Looks like a signal has been delivered to us. If it's a synchronous
// signal (SEGV, SIGBUS, etc.), it should be sent to the application
// thread that received it.
sig := linux.Signal(info.Signo)
// Was it a fault that we should handle internally? If so, this wasn't
// an application-generated signal and we should continue execution
// normally.
if at.Any() {
region := trace.StartRegion(t.traceContext, faultRegion)
addr := usermem.Addr(info.Addr())
// This err shadows the Switch error; it is the fault-handling result and
// is inspected again below to distinguish bus errors.
err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack()))
region.End()
if err == nil {
// The fault was handled appropriately.
// We can resume running the application.
return (*runApp)(nil)
}
// Is this a vsyscall that we need to emulate?
//
// Note that we don't track vsyscalls as part of a
// specific trace region. This is because regions don't
// stack, and the actual system call will count as a
// region. We should be able to easily identify
// vsyscalls by having a <fault><syscall> pair.
if at.Execute {
if sysno, ok := t.tc.st.LookupEmulate(addr); ok {
return t.doVsyscall(addr, sysno)
}
}
// Faults are common, log only at debug level.
t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err)
t.DebugDumpState()
// Continue to signal handling.
//
// Convert a BusError error to a SIGBUS from a SIGSEGV. All
// other info bits stay the same (address, etc.).
if _, ok := err.(*memmap.BusError); ok {
sig = linux.SIGBUS
info.Signo = int32(linux.SIGBUS)
}
}
switch sig {
case linux.SIGILL, linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP:
// Synchronous signal. Send it to ourselves. Assume the signal is
// legitimate and force it (work around the signal being ignored or
// blocked) like Linux does. Conveniently, this is even the correct
// behavior for SIGTRAP from single-stepping.
t.forceSignal(linux.Signal(sig), false /* unconditional */)
t.SendSignal(info)
case platform.SignalInterrupt:
// Assume that a call to platform.Context.Interrupt() misfired.
case linux.SIGPROF:
// It's a profiling interrupt: there's not much
// we can do. We've already paid a decent cost
// by intercepting the signal, at this point we
// simply ignore it.
default:
// Asynchronous signal. Let the system deal with it.
t.k.sendExternalSignal(info, "application")
}
// Whatever the signal was, re-enter the run loop; a signal queued above
// is picked up by the interrupt check at the top of execute.
return (*runApp)(nil)
case platform.ErrContextCPUPreempted:
// Ensure that rseq critical sections are interrupted and per-thread
// CPU values are updated before the next platform.Context.Switch().
t.rseqPreempted = true
return (*runApp)(nil)
default:
// What happened? Can't continue.
t.Warningf("Unexpected SwitchToApp error: %v", err)
t.PrepareExit(ExitStatus{Code: ExtractErrno(err, -1)})
return (*runExit)(nil)
}
}
// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits.
//
// It waits on t.goroutineStopped, which doStop lowers for the duration of a
// stop and which Task.run marks Done when the task goroutine exits.
func (t *Task) waitGoroutineStoppedOrExited() {
t.goroutineStopped.Wait()
}
// WaitExited blocks until all task goroutines in tg have exited.
//
// WaitExited does not correspond to anything in Linux; it's provided so that
// external callers of Kernel.CreateProcess can wait for the created thread
// group to terminate.
//
// It waits on tg.liveGoroutines, which Task.run marks Done as each task
// goroutine in the thread group exits.
func (tg *ThreadGroup) WaitExited() {
tg.liveGoroutines.Wait()
}
// Yield yields the processor for the calling task.
//
// It atomically increments t.yieldCount and then calls runtime.Gosched, which
// lets other goroutines run before this one resumes.
func (t *Task) Yield() {
atomic.AddUint64(&t.yieldCount, 1)
runtime.Gosched()
}
|