Diffstat (limited to 'pkg/sentry/kernel/task_run.go')
-rw-r--r--  pkg/sentry/kernel/task_run.go  346
1 file changed, 346 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
new file mode 100644
index 000000000..94ce5582b
--- /dev/null
+++ b/pkg/sentry/kernel/task_run.go
@@ -0,0 +1,346 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"bytes"
+	"runtime"
+	"sync/atomic"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// A taskRunState is a reified state in the task state machine. See README.md
+// for details. The canonical list of all run states, as well as transitions
+// between them, is given in run_states.dot.
+//
+// The set of possible states is enumerable and completely defined by the
+// kernel package, so taskRunState would ideally be represented by a
+// discriminated union. However, Go does not support sum types.
+//
+// Hence, as with TaskStop, data-free taskRunStates should be represented as
+// typecast nils to avoid unnecessary allocation.
+type taskRunState interface {
+	// execute executes the code associated with this state over the given
+	// task and returns the following state. If execute returns nil, the
+	// task goroutine should exit.
+	//
+	// It is valid to tail-call a following state's execute to avoid the
+	// overhead of converting the following state to an interface object and
+	// checking for stops, provided that the tail-call cannot recurse.
+	execute(*Task) taskRunState
+}
+
+// run runs the task goroutine.
+//
+// threadID is a dummy value set to the task's TID in the root PID namespace
+// to make it visible in stack dumps. A goroutine for a given task can be
+// identified by searching for Task.run()'s argument value.
+func (t *Task) run(threadID uintptr) {
+	// Construct t.blockingTimer here. We do this here because we can't
+	// reconstruct t.blockingTimer during restore in Task.afterLoad(),
+	// because kernel.timekeeper.SetClocks() hasn't been called yet.
+	blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier()
+	t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier)
+	defer t.blockingTimer.Destroy()
+	t.blockingTimerChan = blockingTimerChan
+
+	// Activate our address space.
+	t.Activate()
+	// The corresponding t.Deactivate occurs in the exit path
+	// (runExitMain.execute) so that when
+	// Platform.CooperativelySharesAddressSpace() == true, we give up the
+	// AddressSpace before the task goroutine finishes executing.
+
+	// Ensure that thread group timers for execution time reflect that this
+	// task now exists.
+	t.tg.tm.kick()
+
+	// If this is a newly-started task, it should check for participation in
+	// group stops. If this is a task resuming after restore, it was
+	// interrupted by saving. In either case, the task is initially
+	// interrupted.
+	t.interruptSelf()
+
+	for {
+		// Explanation for this ordering:
+		//
+		// - A freshly-started task that is stopped should not do anything
+		// before it enters the stop.
+		//
+		// - If taskRunState.execute returns nil, the task goroutine should
+		// exit without checking for a stop.
+		//
+		// - Task.Start won't start Task.run if t.runState is nil, so this
+		// ordering is safe.
+		t.doStop()
+		t.runState = t.runState.execute(t)
+		if t.runState == nil {
+			t.accountTaskGoroutineEnter(TaskGoroutineNonexistent)
+			t.goroutineStopped.Done()
+			t.tg.liveGoroutines.Done()
+			t.tg.pidns.owner.liveGoroutines.Done()
+			t.tg.pidns.owner.runningGoroutines.Done()
+
+			// Keep the argument alive because the stack trace for dead
+			// variables may not be correct.
+			runtime.KeepAlive(threadID)
+			return
+		}
+	}
+}
+
+// doStop is called by Task.run to block until the task is not stopped.
+func (t *Task) doStop() {
+	if atomic.LoadInt32(&t.stopCount) == 0 {
+		return
+	}
+	t.Deactivate()
+	// NOTE: t.Activate() must be called without any locks held, so this
+	// defer must precede the defer for unlocking the signal mutex.
+	defer t.Activate()
+	t.accountTaskGoroutineEnter(TaskGoroutineStopped)
+	defer t.accountTaskGoroutineLeave(TaskGoroutineStopped)
+	t.tg.signalHandlers.mu.Lock()
+	defer t.tg.signalHandlers.mu.Unlock()
+	t.tg.pidns.owner.runningGoroutines.Add(-1)
+	defer t.tg.pidns.owner.runningGoroutines.Add(1)
+	t.goroutineStopped.Add(-1)
+	defer t.goroutineStopped.Add(1)
+	for t.stopCount > 0 {
+		t.endStopCond.Wait()
+	}
+}
+
+// The runApp state checks for interrupts before executing untrusted
+// application code.
+type runApp struct{}
+
+func (*runApp) execute(t *Task) taskRunState {
+	if t.interrupted() {
+		// Checkpointing instructs tasks to stop by sending an interrupt,
+		// so we must check for stops before entering runInterrupt (instead
+		// of tail-calling it).
+		return (*runInterrupt)(nil)
+	}
+
+	// We're about to switch to the application again. If there's still an
+	// unhandled SyscallRestartErrno that wasn't translated to an EINTR,
+	// restart the syscall that was interrupted. If there's a saved signal
+	// mask, restore it. (Note that restoring the saved signal mask may
+	// unblock a pending signal, causing another interruption, but that
+	// signal should not interact with the interrupted syscall.)
+	if t.haveSyscallReturn {
+		if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
+			if sre == ERESTART_RESTARTBLOCK {
+				t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
+				t.Arch().RestartSyscallWithRestartBlock()
+			} else {
+				t.Debugf("Restarting syscall %d after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
+				t.Arch().RestartSyscall()
+			}
+		}
+		t.haveSyscallReturn = false
+	}
+	if t.haveSavedSignalMask {
+		t.SetSignalMask(t.savedSignalMask)
+		t.haveSavedSignalMask = false
+		if t.interrupted() {
+			return (*runInterrupt)(nil)
+		}
+	}
+
+	// Apply restartable sequences.
+	if t.rseqPreempted {
+		t.rseqPreempted = false
+		if t.rseqCPUAddr != 0 {
+			if err := t.rseqCopyOutCPU(); err != nil {
+				t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err)
+				t.forceSignal(linux.SIGSEGV, false)
+				t.SendSignal(sigPriv(linux.SIGSEGV))
+				// Re-enter the task run loop for signal delivery.
+				return (*runApp)(nil)
+			}
+		}
+		t.rseqInterrupt()
+	}
+
+	// Check if we need to enable single-stepping. Tracers expect that the
+	// kernel preserves the value of the single-step flag set by
+	// PTRACE_SETREGS whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP
+	// is used (this includes our ptrace platform, by the way), so we should
+	// only clear the single-step flag if we're responsible for setting it.
+	// (clearSinglestep is therefore analogous to Linux's TIF_FORCED_TF.)
+	//
+	// Strictly speaking, we should also not clear the single-step flag if we
+	// single-step through an instruction that sets the single-step flag
+	// (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their
+	// own TF. (Famous last words, I know.)
+	clearSinglestep := false
+	if t.hasTracer() {
+		t.tg.pidns.owner.mu.RLock()
+		if t.ptraceSinglestep {
+			clearSinglestep = !t.Arch().SingleStep()
+			t.Arch().SetSingleStep()
+		}
+		t.tg.pidns.owner.mu.RUnlock()
+	}
+
+	t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
+	info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU)
+	t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
+
+	if clearSinglestep {
+		t.Arch().ClearSingleStep()
+	}
+
+	switch err {
+	case nil:
+		// Handle application system call.
+		return t.doSyscall()
+
+	case platform.ErrContextInterrupt:
+		// Interrupted by platform.Context.Interrupt(). Re-enter the run
+		// loop to figure out why.
+		return (*runApp)(nil)
+
+	case platform.ErrContextSignal:
+		// Looks like a signal has been delivered to us. If it's a
+		// synchronous signal (SEGV, SIGBUS, etc.), it should be sent to the
+		// application thread that received it.
+		sig := linux.Signal(info.Signo)
+
+		// Was it a fault that we should handle internally? If so, this
+		// wasn't an application-generated signal and we should continue
+		// execution normally.
+		if at.Any() {
+			addr := usermem.Addr(info.Addr())
+			err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack()))
+			if err == nil {
+				// The fault was handled appropriately. We can resume
+				// running the application.
+				return (*runApp)(nil)
+			}
+
+			// Is this a vsyscall that we need to emulate?
+			if at.Execute {
+				if sysno, ok := t.tc.st.LookupEmulate(addr); ok {
+					return t.doVsyscall(addr, sysno)
+				}
+			}
+
+			// The JVM will trigger these errors constantly, so don't spam
+			// logs with this error.
+			if err == syserror.EFAULT || err == syserror.EPERM {
+				t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err)
+			} else {
+				t.Warningf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err)
+			}
+			t.DebugDumpState()
+
+			// Continue to signal handling.
+			//
+			// Convert a BusError to a SIGBUS from a SIGSEGV. All other info
+			// bits stay the same (address, etc.).
+			if _, ok := err.(*memmap.BusError); ok {
+				sig = linux.SIGBUS
+				info.Signo = int32(linux.SIGBUS)
+			}
+		}
+
+		switch sig {
+		case linux.SIGILL:
+			// N.B. The debug stuff here is arguably expensive. Don't fret.
+			// This gets called about 5 times for a typical application, if
+			// that.
+			t.Debugf("SIGILL @ %x", t.Arch().IP())
+
+			// Is this a CPUID instruction?
+			expected := arch.CPUIDInstruction[:]
+			found := make([]byte, len(expected))
+			_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
+			if err == nil && bytes.Equal(expected, found) {
+				// Skip the cpuid instruction.
+				t.Arch().CPUIDEmulate(t)
+				t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
+				break
+			}
+
+			// Treat it like any other synchronous signal.
+			fallthrough
+
+		case linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP:
+			// Synchronous signal. Send it to ourselves. Assume the signal
+			// is legitimate and force it (work around the signal being
+			// ignored or blocked) like Linux does. Conveniently, this is
+			// even the correct behavior for SIGTRAP from single-stepping.
+			t.forceSignal(linux.Signal(sig), false /* unconditional */)
+			t.SendSignal(info)
+
+		case platform.SignalInterrupt:
+			// Assume that a call to platform.Context.Interrupt() misfired.
+
+		case linux.SIGPROF:
+			// It's a profiling interrupt: there's not much we can do. We've
+			// already paid a decent cost by intercepting the signal, at
+			// this point we simply ignore it.
+
+		default:
+			// Asynchronous signal. Let the system deal with it.
+			t.k.sendExternalSignal(info, "application")
+		}
+
+		return (*runApp)(nil)
+
+	case platform.ErrContextCPUPreempted:
+		// Ensure that RSEQ critical sections are interrupted and per-thread
+		// CPU values are updated before the next platform.Context.Switch().
+		t.rseqPreempted = true
+		return (*runApp)(nil)
+
+	default:
+		// What happened? Can't continue.
+		t.Warningf("Unexpected SwitchToApp error: %v", err)
+		t.PrepareExit(ExitStatus{Code: t.ExtractErrno(err, -1)})
+		return (*runExit)(nil)
+	}
+}
+
+// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or
+// exits.
+func (t *Task) waitGoroutineStoppedOrExited() {
+	t.goroutineStopped.Wait()
+}
+
+// WaitExited blocks until all task goroutines in tg have exited.
+//
+// WaitExited does not correspond to anything in Linux; it's provided so
+// that external callers of Kernel.CreateProcess can wait for the created
+// thread group to terminate.
+func (tg *ThreadGroup) WaitExited() {
+	tg.liveGoroutines.Wait()
+}
+
+// Yield yields the processor for the calling task.
+func (t *Task) Yield() {
+	atomic.AddUint64(&t.yieldCount, 1)
+	runtime.Gosched()
+}
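
A note on the typecast-nil pattern this file leans on: returning (*runApp)(nil) works because a typed nil pointer still satisfies an interface, and its methods can be called as long as they never dereference the receiver. A minimal standalone sketch of the same state machine shape (task, running, and exiting are invented names for illustration, not part of this package):

package main

import "fmt"

// taskState mirrors taskRunState: execute returns the next state,
// and nil means "exit the loop".
type taskState interface {
	execute(*task) taskState
}

type task struct{ steps int }

// Data-free states are empty structs; a typed nil pointer such as
// (*running)(nil) satisfies taskState without allocating, which is
// the trick the taskRunState comment describes.
type running struct{}
type exiting struct{}

func (*running) execute(t *task) taskState {
	t.steps++
	if t.steps == 3 {
		return (*exiting)(nil) // transition, still allocation-free
	}
	return (*running)(nil) // stay in this state
}

func (*exiting) execute(t *task) taskState {
	fmt.Println("exiting after", t.steps, "steps")
	return nil // tells the run loop to stop
}

func main() {
	t := &task{}
	var s taskState = (*running)(nil)
	for s != nil {
		s = s.execute(t)
	}
}

Keeping transitions allocation-free matters here because the run loop executes on every syscall, fault, and interrupt a task takes.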
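
The stack of paired operations and defers in doStop reduces to a condition-variable gate: undo the "running" accounting, park until stopCount drains to zero, then restore everything in reverse order on the way out. A simplified sketch under that reading (stopGate and its methods are hypothetical, not sentry API):

package main

import (
	"fmt"
	"sync"
	"time"
)

// stopGate reduces Task.doStop to its core: a goroutine parks on a
// condition variable while stopCount is nonzero.
type stopGate struct {
	mu        sync.Mutex
	cond      *sync.Cond
	stopCount int
}

func newStopGate() *stopGate {
	g := &stopGate{}
	g.cond = sync.NewCond(&g.mu)
	return g
}

func (g *stopGate) beginStop() {
	g.mu.Lock()
	g.stopCount++
	g.mu.Unlock()
}

func (g *stopGate) endStop() {
	g.mu.Lock()
	g.stopCount--
	if g.stopCount == 0 {
		g.cond.Broadcast() // wake every parked goroutine
	}
	g.mu.Unlock()
}

// doStop re-checks the predicate after every wakeup, just as the
// real loop re-tests t.stopCount around t.endStopCond.Wait().
func (g *stopGate) doStop() {
	g.mu.Lock()
	for g.stopCount > 0 {
		g.cond.Wait()
	}
	g.mu.Unlock()
}

func main() {
	g := newStopGate()
	g.beginStop()
	go func() {
		time.Sleep(10 * time.Millisecond)
		g.endStop()
	}()
	g.doStop() // blocks until endStop runs
	fmt.Println("resumed")
}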
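
The SIGILL branch recognizes CPUID by fetching the bytes at the faulting IP and comparing them against the instruction's encoding; CPUID is the two-byte opcode 0F A2 on x86. A self-contained sketch of just that comparison (isCPUID is an invented helper; in the sentry the fetch is a t.CopyIn from application memory rather than a slice):

package main

import (
	"bytes"
	"fmt"
)

// cpuidOpcode is the x86 CPUID instruction encoding (0F A2), the
// role played by arch.CPUIDInstruction in the diff above.
var cpuidOpcode = []byte{0x0f, 0xa2}

// isCPUID reports whether the instruction bytes fetched from the
// faulting IP begin with a CPUID instruction.
func isCPUID(text []byte) bool {
	return len(text) >= len(cpuidOpcode) &&
		bytes.Equal(text[:len(cpuidOpcode)], cpuidOpcode)
}

func main() {
	fmt.Println(isCPUID([]byte{0x0f, 0xa2})) // true: CPUID
	fmt.Println(isCPUID([]byte{0xcd, 0x80})) // false: INT 0x80
}

On a match the sentry emulates the instruction and bumps the IP past it; on a miss the SIGILL falls through to ordinary synchronous-signal delivery.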
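
Finally, the threadID argument plus runtime.KeepAlive is a debugging trick rather than logic: a goroutine's arguments appear in runtime stack dumps, so a dummy argument kept live until return labels each task goroutine with its TID. A sketch (argument values printed in tracebacks can be approximate on recent Go releases, so treat the label as best-effort):

package main

import (
	"fmt"
	"runtime"
	"time"
)

// run never reads threadID, but the value shows up in stack dumps
// as run's argument; KeepAlive stops the compiler from treating it
// as dead before the goroutine returns.
func run(threadID uintptr) {
	time.Sleep(50 * time.Millisecond)
	runtime.KeepAlive(threadID)
}

func main() {
	go run(42)
	buf := make([]byte, 1<<16)
	n := runtime.Stack(buf, true) // dump all goroutine stacks
	fmt.Printf("%s", buf[:n])     // look for main.run(0x2a)
}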