summary refs log tree commit diff homepage
diff options
context:
space:
mode:
-rw-r--r-- pkg/sentry/platform/kvm/bluepill_unsafe.go 20
-rw-r--r-- pkg/sentry/platform/kvm/kvm_const.go 1
-rw-r--r-- pkg/sentry/platform/kvm/machine_amd64.go 53
3 files changed, 52 insertions, 22 deletions
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 2c1e098d7..216d4b4b6 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -61,8 +61,9 @@ func bluepillHandler(context unsafe.Pointer) {
}
for {
- _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0)
- if errno == syscall.EINTR {
+ switch _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0); errno {
+ case 0: // Expected case.
+ case syscall.EINTR:
// First, we process whatever pending signal
// interrupted KVM. Since we're in a signal handler
// currently, all signals are masked and the signal
@@ -93,7 +94,20 @@ func bluepillHandler(context unsafe.Pointer) {
// Force injection below; the vCPU is ready.
c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
}
- } else if errno != 0 {
+ case syscall.EFAULT:
+ // If a fault is not serviceable because the permissions on the
+ // host backing pages do not allow the access, then instead of an
+ // MMIO exit we receive EFAULT from the run ioctl. We
+ // always inject an NMI here since we may be in kernel
+ // mode and have interrupts disabled.
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_NMI, 0); errno != 0 {
+ throw("NMI injection failed")
+ }
+ continue // Rerun vCPU.
+ default:
throw("run failed")
}
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index 0ec6a4a00..c819fd16f 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -24,6 +24,7 @@ const (
_KVM_CREATE_VCPU = 0xae41
_KVM_SET_TSS_ADDR = 0xae47
_KVM_RUN = 0xae80
+ _KVM_NMI = 0xae9a
_KVM_INTERRUPT = 0x4004ae86
_KVM_SET_MSRS = 0x4008ae89
_KVM_SET_USER_MEMORY_REGION = 0x4020ae46
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 4e42f2c87..f583f68f7 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -97,6 +97,29 @@ func (c *vCPU) initArchState() error {
return c.setSystemTime()
}
+// fault generates an appropriate fault return for the given signal.
+//
+// It calls bluepill(c), then reads the faulting address from CR2 and the
+// error code from the vCPU. If the last fault serviced was not a user fault,
+// it returns a nil SignalInfo, usermem.NoAccess and
+// platform.ErrContextInterrupt so that the access is re-executed. Otherwise
+// it returns a SignalInfo whose Signo is signal and whose address is the
+// faulting address, the access type decoded from the error code, and
+// platform.ErrContextSignal.
+//
+//go:nosplit
+func (c *vCPU) fault(signal int32) (*arch.SignalInfo, usermem.AccessType, error) {
+ bluepill(c) // Probably no-op, but may not be.
+ faultAddr := ring0.ReadCR2()
+ code, user := c.ErrorCode()
+ if !user {
+ // The last fault serviced by this CPU was not a user
+ // fault, so we can't reliably trust the faultAddr or
+ // the code provided here. We need to re-execute.
+ return nil, usermem.NoAccess, platform.ErrContextInterrupt
+ }
+ // Report the fault as the requested signal at the faulting address.
+ info := &arch.SignalInfo{Signo: signal}
+ info.SetAddr(uint64(faultAddr))
+ // Bit 1 of the error code distinguishes writes (set) from reads
+ // (clear); bit 4 set marks the access as an execute.
+ accessType := usermem.AccessType{
+ Read: code&(1<<1) == 0,
+ Write: code&(1<<1) != 0,
+ Execute: code&(1<<4) != 0,
+ }
+ return info, accessType, platform.ErrContextSignal
+}
+
// SwitchToUser unpacks architectural-details.
func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetables.PageTables, flags ring0.Flags) (*arch.SignalInfo, usermem.AccessType, error) {
// See below.
@@ -116,29 +139,13 @@ func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetab
// Fast path: system call executed.
return nil, usermem.NoAccess, nil
+ case ring0.PageFault:
+ return c.fault(int32(syscall.SIGSEGV))
+
case ring0.Debug, ring0.Breakpoint:
info := &arch.SignalInfo{Signo: int32(syscall.SIGTRAP)}
return info, usermem.AccessType{}, platform.ErrContextSignal
- case ring0.PageFault:
- bluepill(c) // Probably no-op, but may not be.
- faultAddr := ring0.ReadCR2()
- code, user := c.ErrorCode()
- if !user {
- // The last fault serviced by this CPU was not a user
- // fault, so we can't reliably trust the faultAddr or
- // the code provided here. We need to re-execute.
- return nil, usermem.NoAccess, platform.ErrContextInterrupt
- }
- info := &arch.SignalInfo{Signo: int32(syscall.SIGSEGV)}
- info.SetAddr(uint64(faultAddr))
- accessType := usermem.AccessType{
- Read: code&(1<<1) == 0,
- Write: code&(1<<1) != 0,
- Execute: code&(1<<4) != 0,
- }
- return info, accessType, platform.ErrContextSignal
-
case ring0.GeneralProtectionFault:
if !ring0.IsCanonical(regs.Rip) {
// If the RIP is non-canonical, it's a SEGV.
@@ -160,6 +167,14 @@ func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetab
case ring0.Vector(bounce):
return nil, usermem.NoAccess, platform.ErrContextInterrupt
+ case ring0.NMI:
+ // An NMI is generated only when a fault is not serviceable by
+ // KVM itself, so we think some mapping is writeable but it's
+ // really not. This could happen, e.g. if some file is
+ // truncated (and would generate a SIGBUS) and we map it
+ // directly into the instance.
+ return c.fault(int32(syscall.SIGBUS))
+
default:
panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
}