diff options
Diffstat (limited to 'pkg/sentry/platform')
-rw-r--r-- | pkg/sentry/platform/kvm/bluepill.go | 12 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/bluepill_unsafe.go | 97 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/kvm_const.go | 1 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine.go | 9 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 25 | ||||
-rwxr-xr-x | pkg/sentry/platform/kvm/machine_arm64_unsafe.go | 26 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine_unsafe.go | 41 |
7 files changed, 119 insertions, 92 deletions
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index 4b23f7803..555b5fa96 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -46,6 +46,14 @@ var ( // bounceSignalMask has only bounceSignal set. bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1)) + // otherSignalsMask includes all other signals that will cause the + // vCPU to exit during execution. + // + // Currently, this includes the preemption signal and the profiling + // signal. In general, these should be signals whose delivery actually + // influences the way the program executes as the switch can be costly. + otherSignalsMask = uint64(1<<(uint64(syscall.SIGURG)-1)) | uint64(1<<(uint64(syscall.SIGPROF)-1)) + // bounce is the interrupt vector used to return to the kernel. bounce = uint32(ring0.VirtualizationException) @@ -86,8 +94,8 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) { } func init() { - // Install the handler. - if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil { + // Install the handler, masking all signals. + if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler, ^uint64(0)); err != nil { panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err)) } diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 9add7c944..4e9d80765 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -24,6 +24,7 @@ import ( "syscall" "unsafe" + "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sentry/arch" ) @@ -58,6 +59,19 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { return &((*arch.UContext64)(context).MContext) } +// injectInterrupt is a helper to inject an interrupt. 
+// +//go:nosplit +func injectInterrupt(c *vCPU) { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_INTERRUPT, + uintptr(unsafe.Pointer(&bounce))); errno != 0 { + throw("interrupt injection failed") + } +} + // bluepillHandler is called from the signal stub. // // The world may be stopped while this is executing, and it executes on the @@ -69,6 +83,9 @@ func bluepillHandler(context unsafe.Pointer) { // Sanitize the registers; interrupts must always be disabled. c := bluepillArchEnter(bluepillArchContext(context)) + // Enable preemption. + c.setSignalMask(true) + // Increment the number of switches. atomic.AddUint32(&c.switches, 1) @@ -89,6 +106,9 @@ func bluepillHandler(context unsafe.Pointer) { // interrupted KVM. Since we're in a signal handler // currently, all signals are masked and the signal // must have been delivered directly to this thread. + // + // We will not be able to actually do subsequent + // KVM_RUNs until this signal is processed. timeout := syscall.Timespec{} sig, _, errno := syscall.RawSyscall6( syscall.SYS_RT_SIGTIMEDWAIT, @@ -98,12 +118,24 @@ func bluepillHandler(context unsafe.Pointer) { 8, // sigset size. 0, 0) if errno == syscall.EAGAIN { - continue - } - if errno != 0 { + // If we weren't able to process this signal, then + // it must not have been in the bounceMask. By + // elimination, it must have been the + // preemption signal. We can't process this + // signal right now, so we need to disable + // preemption until the interrupt is actually + // handled. + c.setSignalMask(false) + // Note that there is a waiter for this vCPU. + // This will cause the vCPU to exit at some + // point in the future (releasing the user lock + // and guest mode). + atomicbitops.OrUint32(&c.state, vCPUWaiter) + } else if errno != 0 { + // We only expect success or a timeout. 
throw("error waiting for pending signal") - } - if sig != uintptr(bounceSignal) { + } else if sig != uintptr(bounceSignal) { + // Only the bounce should be processed. throw("unexpected signal") } @@ -114,11 +146,10 @@ func bluepillHandler(context unsafe.Pointer) { // ready. if c.runData.readyForInterruptInjection == 0 { c.runData.requestInterruptWindow = 1 - continue // Rerun vCPU. } else { - // Force injection below; the vCPU is ready. - c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN + injectInterrupt(c) } + continue // Rerun vCPU. case syscall.EFAULT: // If a fault is not serviceable due to the host // backing pages having page permissions, instead of an @@ -137,6 +168,30 @@ func bluepillHandler(context unsafe.Pointer) { } switch c.runData.exitReason { + case _KVM_EXIT_HLT: + // Copy out registers. + bluepillArchExit(c, bluepillArchContext(context)) + + // Return to the vCPUReady state; notify any waiters. + user := atomic.LoadUint32(&c.state) & vCPUUser + switch atomic.SwapUint32(&c.state, user) { + case user | vCPUGuest: // Expected case. + case user | vCPUGuest | vCPUWaiter: + c.notify() + default: + throw("invalid state") + } + return + case _KVM_EXIT_IRQ_WINDOW_OPEN: + // Inject an interrupt now. + injectInterrupt(c) + // Clear previous injection request. + c.runData.requestInterruptWindow = 0 + case _KVM_EXIT_INTR: + // This is fine, it is the normal exit reason during + // signal delivery. However, we still need to handle + // other potential exit reasons *combined* with EINTR, + // so this switch must be hit even after the above. case _KVM_EXIT_EXCEPTION: c.die(bluepillArchContext(context), "exception") return @@ -155,20 +210,6 @@ func bluepillHandler(context unsafe.Pointer) { case _KVM_EXIT_DEBUG: c.die(bluepillArchContext(context), "debug") return - case _KVM_EXIT_HLT: - // Copy out registers. - bluepillArchExit(c, bluepillArchContext(context)) - - // Return to the vCPUReady state; notify any waiters. 
- user := atomic.LoadUint32(&c.state) & vCPUUser - switch atomic.SwapUint32(&c.state, user) { - case user | vCPUGuest: // Expected case. - case user | vCPUGuest | vCPUWaiter: - c.notify() - default: - throw("invalid state") - } - return case _KVM_EXIT_MMIO: // Increment the fault count. atomic.AddUint32(&c.faults, 1) @@ -200,18 +241,6 @@ func bluepillHandler(context unsafe.Pointer) { data[i] = *b } } - case _KVM_EXIT_IRQ_WINDOW_OPEN: - // Interrupt: we must have requested an interrupt - // window; set the interrupt line. - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_INTERRUPT, - uintptr(unsafe.Pointer(&bounce))); errno != 0 { - throw("interrupt injection failed") - } - // Clear previous injection request. - c.runData.requestInterruptWindow = 0 case _KVM_EXIT_SHUTDOWN: c.die(bluepillArchContext(context), "shutdown") return diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index 1d5c77ff4..07d9c9a98 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -48,6 +48,7 @@ const ( _KVM_EXIT_IRQ_WINDOW_OPEN = 0x7 _KVM_EXIT_SHUTDOWN = 0x8 _KVM_EXIT_FAIL_ENTRY = 0x9 + _KVM_EXIT_INTR = 0xa _KVM_EXIT_INTERNAL_ERROR = 0x11 _KVM_EXIT_SYSTEM_EVENT = 0x18 ) diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index f1afc74dc..345b71e8f 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -108,6 +108,9 @@ type vCPU struct { // This is a bitmask of the three fields (vCPU*) described above. state uint32 + // signalMask is the vCPU signal mask. + signalMask uint64 + // runData for this vCPU. runData *runData @@ -121,6 +124,7 @@ type vCPU struct { // vCPUArchState is the architecture-specific state. vCPUArchState + // dieState is the temporary state associated with throwing exceptions. 
dieState dieState } @@ -153,11 +157,6 @@ func (m *machine) newVCPU() *vCPU { c.CPU.Init(&m.kernel, c) m.vCPUsByID[c.id] = c - // Ensure the signal mask is correct. - if err := c.setSignalMask(); err != nil { - panic(fmt.Sprintf("error setting signal mask: %v", err)) - } - // Map the run data. runData, err := mapRunData(int(fd)) if err != nil { diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go index 7156c245f..52286e56d 100644 --- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -111,31 +111,6 @@ func (c *vCPU) setSystemTime() error { return nil } -// setSignalMask sets the vCPU signal mask. -// -// This must be called prior to running the vCPU. -func (c *vCPU) setSignalMask() error { - // The layout of this structure implies that it will not necessarily be - // the same layout chosen by the Go compiler. It gets fudged here. - var data struct { - length uint32 - mask1 uint32 - mask2 uint32 - _ uint32 - } - data.length = 8 // Fixed sigset size. - data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) - data.mask2 = ^uint32(bounceSignalMask >> 32) - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_SIGNAL_MASK, - uintptr(unsafe.Pointer(&data))); errno != 0 { - return fmt.Errorf("error setting signal mask: %v", errno) - } - return nil -} - // setUserRegisters sets user registers in the vCPU. func (c *vCPU) setUserRegisters(uregs *userRegs) error { if _, _, errno := syscall.RawSyscall( diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index b531f2f85..185eeb4f0 100755 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -268,32 +268,6 @@ func (c *vCPU) setSystemTime() error { return nil } -// setSignalMask sets the vCPU signal mask. -// -// This must be called prior to running the vCPU. 
-func (c *vCPU) setSignalMask() error { - // The layout of this structure implies that it will not necessarily be - // the same layout chosen by the Go compiler. It gets fudged here. - var data struct { - length uint32 - mask1 uint32 - mask2 uint32 - _ uint32 - } - data.length = 8 // Fixed sigset size. - data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) - data.mask2 = ^uint32(bounceSignalMask >> 32) - if _, _, errno := syscall.RawSyscall( - syscall.SYS_IOCTL, - uintptr(c.fd), - _KVM_SET_SIGNAL_MASK, - uintptr(unsafe.Pointer(&data))); errno != 0 { - return fmt.Errorf("error setting signal mask: %v", errno) - } - - return nil -} - // SwitchToUser unpacks architectural-details. func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) { // Check for canonical addresses. diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go index f04be2ab5..e4de0a889 100644 --- a/pkg/sentry/platform/kvm/machine_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -87,6 +87,47 @@ func unmapRunData(r *runData) error { return nil } +// setSignalMask sets the vCPU signal mask. +// +// This will be called from the bluepill handler, and therefore must not +// perform any allocation. +// +//go:nosplit +func (c *vCPU) setSignalMask(enableOthers bool) { + // The signal mask is either: + // *) Only the bounce signal, which we need to use to execute the + // machine state up until the bounce interrupt can be processed. + // or + // *) All signals, which is the default state unless we need to + // continue execution to exit guest mode (the case above). + mask := bounceSignalMask + if enableOthers { + mask |= otherSignalsMask + } + if c.signalMask == mask { + return // Already set. + } + + // The layout of this structure implies that it will not necessarily be + // the same layout chosen by the Go compiler. It gets fudged here. 
+ var data struct { + length uint32 + mask1 uint32 + mask2 uint32 + _ uint32 + } + data.length = 8 // Fixed sigset size. + data.mask1 = ^uint32(mask & 0xffffffff) + data.mask2 = ^uint32(mask >> 32) + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_SIGNAL_MASK, + uintptr(unsafe.Pointer(&data))); errno != 0 { + throw("setSignal mask failed") + } +} + // atomicAddressSpace is an atomic address space pointer. type atomicAddressSpace struct { pointer unsafe.Pointer |