diff options
-rw-r--r-- | pkg/sentry/platform/kvm/bluepill_unsafe.go | 20 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/kvm_const.go | 1 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine_amd64.go | 53 |
3 files changed, 52 insertions, 22 deletions
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 2c1e098d7..216d4b4b6 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -61,8 +61,9 @@ func bluepillHandler(context unsafe.Pointer) { } for { - _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) - if errno == syscall.EINTR { + switch _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0); errno { + case 0: // Expected case. + case syscall.EINTR: // First, we process whatever pending signal // interrupted KVM. Since we're in a signal handler // currently, all signals are masked and the signal @@ -93,7 +94,20 @@ func bluepillHandler(context unsafe.Pointer) { // Force injection below; the vCPU is ready. c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN } - } else if errno != 0 { + case syscall.EFAULT: + // If a fault is not serviceable due to the host + // backing pages having page permissions, instead of an + // MMIO exit we receive EFAULT from the run ioctl. We + // always inject an NMI here since we may be in kernel + // mode and have interrupts disabled. + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_NMI, 0); errno != 0 { + throw("NMI injection failed") + } + continue // Rerun vCPU. 
+ default: throw("run failed") } diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go index 0ec6a4a00..c819fd16f 100644 --- a/pkg/sentry/platform/kvm/kvm_const.go +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -24,6 +24,7 @@ const ( _KVM_CREATE_VCPU = 0xae41 _KVM_SET_TSS_ADDR = 0xae47 _KVM_RUN = 0xae80 + _KVM_NMI = 0xae9a _KVM_INTERRUPT = 0x4004ae86 _KVM_SET_MSRS = 0x4008ae89 _KVM_SET_USER_MEMORY_REGION = 0x4020ae46 diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 4e42f2c87..f583f68f7 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -97,6 +97,29 @@ func (c *vCPU) initArchState() error { return c.setSystemTime() } +// fault generates an appropriate fault return. +// +//go:nosplit +func (c *vCPU) fault(signal int32) (*arch.SignalInfo, usermem.AccessType, error) { + bluepill(c) // Probably no-op, but may not be. + faultAddr := ring0.ReadCR2() + code, user := c.ErrorCode() + if !user { + // The last fault serviced by this CPU was not a user + // fault, so we can't reliably trust the faultAddr or + // the code provided here. We need to re-execute. + return nil, usermem.NoAccess, platform.ErrContextInterrupt + } + info := &arch.SignalInfo{Signo: signal} + info.SetAddr(uint64(faultAddr)) + accessType := usermem.AccessType{ + Read: code&(1<<1) == 0, + Write: code&(1<<1) != 0, + Execute: code&(1<<4) != 0, + } + return info, accessType, platform.ErrContextSignal +} + // SwitchToUser unpacks architectural-details. func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetables.PageTables, flags ring0.Flags) (*arch.SignalInfo, usermem.AccessType, error) { // See below. @@ -116,29 +139,13 @@ func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetab // Fast path: system call executed. 
return nil, usermem.NoAccess, nil + case ring0.PageFault: + return c.fault(int32(syscall.SIGSEGV)) + case ring0.Debug, ring0.Breakpoint: info := &arch.SignalInfo{Signo: int32(syscall.SIGTRAP)} return info, usermem.AccessType{}, platform.ErrContextSignal - case ring0.PageFault: - bluepill(c) // Probably no-op, but may not be. - faultAddr := ring0.ReadCR2() - code, user := c.ErrorCode() - if !user { - // The last fault serviced by this CPU was not a user - // fault, so we can't reliably trust the faultAddr or - // the code provided here. We need to re-execute. - return nil, usermem.NoAccess, platform.ErrContextInterrupt - } - info := &arch.SignalInfo{Signo: int32(syscall.SIGSEGV)} - info.SetAddr(uint64(faultAddr)) - accessType := usermem.AccessType{ - Read: code&(1<<1) == 0, - Write: code&(1<<1) != 0, - Execute: code&(1<<4) != 0, - } - return info, accessType, platform.ErrContextSignal - case ring0.GeneralProtectionFault: if !ring0.IsCanonical(regs.Rip) { // If the RIP is non-canonical, it's a SEGV. @@ -160,6 +167,14 @@ func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetab case ring0.Vector(bounce): return nil, usermem.NoAccess, platform.ErrContextInterrupt + case ring0.NMI: + // An NMI is generated only when a fault is not serviceable by + // KVM itself, so we think some mapping is writeable but it's + // really not. This could happen, e.g. if some file is + // truncated (and would generate a SIGBUS) and we map it + // directly into the instance. + return c.fault(int32(syscall.SIGBUS)) + default: panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) } |