diff options
author | Adin Scannell <ascannell@google.com> | 2020-03-30 12:36:30 -0700 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2020-03-30 12:37:57 -0700 |
commit | 3fac85da951f9f56d0232718ea7584250cf11f31 (patch) | |
tree | aef282ad8f9dda532fb7d629d6d4aa58ab5e7487 /pkg/sentry/platform | |
parent | 4aee3706406d6b102540ad5bea272b7c893da827 (diff) |
kvm: handle exit reasons even under EINTR.
In the case of other signals (preemption), inject a normal bounce and
defer the signal until the vCPU has been returned from guest mode.
PiperOrigin-RevId: 303799678
Diffstat (limited to 'pkg/sentry/platform')
-rw-r--r-- | pkg/sentry/platform/kvm/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/bluepill.go | 12 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/bluepill_unsafe.go | 97 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/kvm_const.go | 1 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/kvm_test.go | 64 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine.go | 9 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 25 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine_arm64_unsafe.go | 26 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine_unsafe.go | 41 |
9 files changed, 177 insertions, 99 deletions
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 159f7eafd..e27f57536 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -70,6 +70,7 @@ go_test( "requires-kvm", ], deps = [ + "//pkg/procid", "//pkg/sentry/arch", "//pkg/sentry/platform", "//pkg/sentry/platform/kvm/testutil", diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index 4b23f7803..555b5fa96 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -46,6 +46,14 @@ var ( // bounceSignalMask has only bounceSignal set. bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1)) + // otherSignalsMask includes all other signals that will be cause the + // vCPU to exit during execution. + // + // Currently, this includes the preemption signal and the profiling + // signal. In general, these should be signals whose delivery actually + // influences the way the program executes as the switch can be costly. + otherSignalsMask = uint64(1<<(uint64(syscall.SIGURG)-1)) | uint64(1<<(uint64(syscall.SIGPROF)-1)) + // bounce is the interrupt vector used to return to the kernel. bounce = uint32(ring0.VirtualizationException) @@ -86,8 +94,8 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) { } func init() { - // Install the handler. - if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil { + // Install the handler, masking all signals. + if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler, ^uint64(0)); err != nil { panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) } diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 9add7c944..4e9d80765 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -24,6 +24,7 @@ import ( "syscall" "unsafe" + "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sentry/arch" ) @@ -58,6 +59,19 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { return &((*arch.UContext64)(context).MContext) } +// injectInterrupt is a helper to inject an interrupt. +// +//go:nosplit +func injectInterrupt(c *vCPU) { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_INTERRUPT, + uintptr(unsafe.Pointer(&bounce))); errno != 0 { + throw("interrupt injection failed") + } +} + // bluepillHandler is called from the signal stub. // // The world may be stopped while this is executing, and it executes on the @@ -69,6 +83,9 @@ func bluepillHandler(context unsafe.Pointer) { // Sanitize the registers; interrupts must always be disabled. c := bluepillArchEnter(bluepillArchContext(context)) + // Enable preemption. + c.setSignalMask(true) + // Increment the number of switches. atomic.AddUint32(&c.switches, 1) @@ -89,6 +106,9 @@ func bluepillHandler(context unsafe.Pointer) { // interrupted KVM. Since we're in a signal handler // currently, all signals are masked and the signal // must have been delivered directly to this thread. + // + // We will not be able to actually do subsequent + // KVM_RUNs until this signal is processed. timeout := syscall.Timespec{} sig, _, errno := syscall.RawSyscall6( syscall.SYS_RT_SIGTIMEDWAIT, @@ -98,12 +118,24 @@ func bluepillHandler(context unsafe.Pointer) { 8, // sigset size. 0, 0) if errno == syscall.EAGAIN { - continue - } - if errno != 0 { + // If weren't able to process this signal, then + // it must not have been in the bounceMask. By + // elimination, it must have been the + // preemption signal. We can't process this + // signal right now, so we need to disable + // preemption until the interrupt is actually + // handled. + c.setSignalMask(false) + // Note that there is a waiter for this vCPU. + // This will cause the vCPU to exit at some + // point in the future (releasing the user lock + // and guest mode). + atomicbitops.OrUint32(&c.state, vCPUWaiter) + } else if errno != 0 { + // We only expect success or a timeout. throw("error waiting for pending signal") - } - if sig != uintptr(bounceSignal) { + } else if sig != uintptr(bounceSignal) { + // Only the bounce should be processed. throw("unexpected signal") } @@ -114,11 +146,10 @@ func bluepillHandler(context unsafe.Pointer) { // ready. if c.runData.readyForInterruptInjection == 0 { c.runData.requestInterruptWindow = 1 - continue // Rerun vCPU. } else { - // Force injection below; the vCPU is ready. - c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN + injectInterrupt(c) } + continue // Rerun vCPU. case syscall.EFAULT: // If a fault is not serviceable due to the host // backing pages having page permissions, instead of an @@ -137,6 +168,30 @@ func bluepillHandler(context unsafe.Pointer) { } switch c.runData.exitReason { + case _KVM_EXIT_HLT: + // Copy out registers. + bluepillArchExit(c, bluepillArchContext(context)) + + // Return to the vCPUReady state; notify any waiters. + user := atomic.LoadUint32(&c.state) & vCPUUser + switch atomic.SwapUint32(&c.state, user) { + case user | vCPUGuest: // Expected case. + case user | vCPUGuest | vCPUWaiter: + c.notify() + default: + throw("invalid state") + } + return + case _KVM_EXIT_IRQ_WINDOW_OPEN: + // Inject an interrupt now. + injectInterrupt(c) + // Clear previous injection request. + c.runData.requestInterruptWindow = 0 + case _KVM_EXIT_INTR: + // This is fine, it is the normal exit reason during + // signal delivery. However, we still need to handle + // other potential exit reasons *combined* with EINTR, + // so this switch must be hit even after the above. case _KVM_EXIT_EXCEPTION: c.die(bluepillArchContext(context), "exception") return @@ -155,20 +210,6 @@ func bluepillHandler(context unsafe.Pointer) { case _KVM_EXIT_DEBUG: c.die(bluepillArchContext(context), "debug") return - case _KVM_EXIT_HLT: - // Copy out registers. - bluepillArchExit(c, bluepillArchContext(context)) - - // Return to the vCPUReady state; notify any waiters. - user := atomic.LoadUint32(&c.state) & vCPUUser - switch atomic.SwapUint32(&c.state, user) { - case user | vCPUGuest: // Expected case. - case user | vCPUGuest | vCPUWaiter: - c.notify() - default: - throw("invalid state") - } - return case _KVM_EXIT_MMIO: // Increment the fault count. atomic.AddUint32(&c.faults, 1) @@ -200,18 +241,6 @@ func bluepillHandler(context unsafe.Pointer) { data[i] = *b } } - case _KVM_EXIT_IRQ_WINDOW_OPEN: - // Interrupt: we must have requested an interrupt - // window; set the interrupt line. - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_INTERRUPT, - uintptr(unsafe.Pointer(&bounce))); errno != 0 { - throw("interrupt injection failed") - } - // Clear previous injection request. - c.runData.requestInterruptWindow = 0 case _KVM_EXIT_SHUTDOWN: c.die(bluepillArchContext(context), "shutdown") return diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index 1d5c77ff4..07d9c9a98 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -48,6 +48,7 @@ const ( _KVM_EXIT_IRQ_WINDOW_OPEN = 0x7 _KVM_EXIT_SHUTDOWN = 0x8 _KVM_EXIT_FAIL_ENTRY = 0x9 + _KVM_EXIT_INTR = 0xa _KVM_EXIT_INTERNAL_ERROR = 0x11 _KVM_EXIT_SYSTEM_EVENT = 0x18 ) diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index c42752d50..d42ba3f24 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -16,12 +16,15 @@ package kvm import ( "math/rand" + "os" "reflect" + "runtime" "sync/atomic" "syscall" "testing" "time" + "gvisor.dev/gvisor/pkg/procid" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" @@ -320,15 +323,18 @@ func TestBounce(t *testing.T) { }) } +// randomSleep is used by some race tests below. +// +// O(hundreds of microseconds) is appropriate to ensure different overlaps and +// different schedules. +func randomSleep() { + if n := rand.Intn(1000); n > 100 { + time.Sleep(time.Duration(n) * time.Microsecond) + } +} + func TestBounceStress(t *testing.T) { applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { - randomSleep := func() { - // O(hundreds of microseconds) is appropriate to ensure - // different overlaps and different schedules. - if n := rand.Intn(1000); n > 100 { - time.Sleep(time.Duration(n) * time.Microsecond) - } - } for i := 0; i < 1000; i++ { // Start an asynchronously executing goroutine that // calls Bounce at pseudo-random point in time. @@ -355,6 +361,50 @@ func TestBounceStress(t *testing.T) { }) } +func TestPreemption(t *testing.T) { + applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + // Lock the main vCPU thread. + runtime.LockOSThread() + pid := os.Getpid() + tid := procid.Current() + running := uint32(1) + defer atomic.StoreUint32(&running, 0) + + // Start generating "preemptions". + go func() { + for atomic.LoadUint32(&running) != 0 { + // Kick via a preemption: best effort. + syscall.Tgkill(pid, int(tid), syscall.SIGURG) + randomSleep() + } + }() + + for i := 0; i < 1000; i++ { + randomSleep() + var si arch.SignalInfo + if _, err := c.SwitchToUser(ring0.SwitchOpts{ + Registers: regs, + FloatingPointState: dummyFPState, + PageTables: pt, + }, &si); err != platform.ErrContextInterrupt { + t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) + } + // Was this caused by a preemption signal? + if got := atomic.LoadUint32(&c.state); got&vCPUGuest != 0 && got&vCPUWaiter == 0 { + continue + } + c.unlock() + // Should have dropped from guest mode, processed preemption. + if got := atomic.LoadUint32(&c.state); got != vCPUReady { + t.Errorf("vCPU not in ready state: got %v", got) + } + randomSleep() + c.lock() + } + return false + }) +} + func TestInvalidate(t *testing.T) { var data uintptr // Used below. applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index f1afc74dc..345b71e8f 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -108,6 +108,9 @@ type vCPU struct { // This is a bitmask of the three fields (vCPU*) described above. state uint32 + // signalMask is the vCPU signal mask. + signalMask uint64 + // runData for this vCPU. runData *runData @@ -121,6 +124,7 @@ type vCPU struct { // vCPUArchState is the architecture-specific state. vCPUArchState + // dieState is the temporary state associated with throwing exceptions. dieState dieState } @@ -153,11 +157,6 @@ func (m *machine) newVCPU() *vCPU { c.CPU.Init(&m.kernel, c) m.vCPUsByID[c.id] = c - // Ensure the signal mask is correct. - if err := c.setSignalMask(); err != nil { - panic(fmt.Sprintf("error setting signal mask: %v", err)) - } - // Map the run data. runData, err := mapRunData(int(fd)) if err != nil { diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 7156c245f..52286e56d 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -111,31 +111,6 @@ func (c *vCPU) setSystemTime() error { return nil } -// setSignalMask sets the vCPU signal mask. -// -// This must be called prior to running the vCPU. -func (c *vCPU) setSignalMask() error { - // The layout of this structure implies that it will not necessarily be - // the same layout chosen by the Go compiler. It gets fudged here. - var data struct { - length uint32 - mask1 uint32 - mask2 uint32 - _ uint32 - } - data.length = 8 // Fixed sigset size. - data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) - data.mask2 = ^uint32(bounceSignalMask >> 32) - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_SIGNAL_MASK, - uintptr(unsafe.Pointer(&data))); errno != 0 { - return fmt.Errorf("error setting signal mask: %v", errno) - } - return nil -} - // setUserRegisters sets user registers in the vCPU. func (c *vCPU) setUserRegisters(uregs *userRegs) error { if _, _, errno := syscall.RawSyscall( diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index b531f2f85..185eeb4f0 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -268,32 +268,6 @@ func (c *vCPU) setSystemTime() error { return nil } -// setSignalMask sets the vCPU signal mask. -// -// This must be called prior to running the vCPU. -func (c *vCPU) setSignalMask() error { - // The layout of this structure implies that it will not necessarily be - // the same layout chosen by the Go compiler. It gets fudged here. - var data struct { - length uint32 - mask1 uint32 - mask2 uint32 - _ uint32 - } - data.length = 8 // Fixed sigset size. - data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) - data.mask2 = ^uint32(bounceSignalMask >> 32) - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_SIGNAL_MASK, - uintptr(unsafe.Pointer(&data))); errno != 0 { - return fmt.Errorf("error setting signal mask: %v", errno) - } - - return nil -} - // SwitchToUser unpacks architectural-details. func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) { // Check for canonical addresses. diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index f04be2ab5..e4de0a889 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -87,6 +87,47 @@ func unmapRunData(r *runData) error { return nil } +// setSignalMask sets the vCPU signal mask. +// +// This will be called from the bluepill handler, and therefore must not +// perform any allocation. +// +//go:nosplit +func (c *vCPU) setSignalMask(enableOthers bool) { + // The signal mask is either: + // *) Only the bounce signal, which we need to use to execute the + // machine state up until the bounce interrupt can be processed. + // or + // *) All signals, which is the default state unless we need to + // continue execution to exit guest mode (the case above). + mask := bounceSignalMask + if enableOthers { + mask |= otherSignalsMask + } + if c.signalMask == mask { + return // Already set. + } + + // The layout of this structure implies that it will not necessarily be + // the same layout chosen by the Go compiler. It gets fudged here. + var data struct { + length uint32 + mask1 uint32 + mask2 uint32 + _ uint32 + } + data.length = 8 // Fixed sigset size. + data.mask1 = ^uint32(mask & 0xffffffff) + data.mask2 = ^uint32(mask >> 32) + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_SIGNAL_MASK, + uintptr(unsafe.Pointer(&data))); errno != 0 { + throw("setSignal mask failed") + } +} + // atomicAddressSpace is an atomic address space pointer. type atomicAddressSpace struct { pointer unsafe.Pointer |