summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/platform/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/platform/kvm')
-rw-r--r--pkg/sentry/platform/kvm/BUILD1
-rw-r--r--pkg/sentry/platform/kvm/bluepill.go12
-rw-r--r--pkg/sentry/platform/kvm/bluepill_unsafe.go97
-rw-r--r--pkg/sentry/platform/kvm/kvm_const.go1
-rw-r--r--pkg/sentry/platform/kvm/kvm_test.go64
-rw-r--r--pkg/sentry/platform/kvm/machine.go9
-rw-r--r--pkg/sentry/platform/kvm/machine_amd64_unsafe.go25
-rw-r--r--pkg/sentry/platform/kvm/machine_arm64_unsafe.go26
-rw-r--r--pkg/sentry/platform/kvm/machine_unsafe.go41
9 files changed, 177 insertions, 99 deletions
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 159f7eafd..e27f57536 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -70,6 +70,7 @@ go_test(
"requires-kvm",
],
deps = [
+ "//pkg/procid",
"//pkg/sentry/arch",
"//pkg/sentry/platform",
"//pkg/sentry/platform/kvm/testutil",
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
index 4b23f7803..555b5fa96 100644
--- a/pkg/sentry/platform/kvm/bluepill.go
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -46,6 +46,14 @@ var (
// bounceSignalMask has only bounceSignal set.
bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1))
+ // otherSignalsMask includes all other signals that will be cause the
+ // vCPU to exit during execution.
+ //
+ // Currently, this includes the preemption signal and the profiling
+ // signal. In general, these should be signals whose delivery actually
+ // influences the way the program executes as the switch can be costly.
+ otherSignalsMask = uint64(1<<(uint64(syscall.SIGURG)-1)) | uint64(1<<(uint64(syscall.SIGPROF)-1))
+
// bounce is the interrupt vector used to return to the kernel.
bounce = uint32(ring0.VirtualizationException)
@@ -86,8 +94,8 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) {
}
func init() {
- // Install the handler.
- if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil {
+ // Install the handler, masking all signals.
+ if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler, ^uint64(0)); err != nil {
panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err))
}
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 9add7c944..4e9d80765 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -24,6 +24,7 @@ import (
"syscall"
"unsafe"
+ "gvisor.dev/gvisor/pkg/atomicbitops"
"gvisor.dev/gvisor/pkg/sentry/arch"
)
@@ -58,6 +59,19 @@ func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
return &((*arch.UContext64)(context).MContext)
}
+// injectInterrupt is a helper to inject an interrupt.
+//
+//go:nosplit
+func injectInterrupt(c *vCPU) {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_INTERRUPT,
+ uintptr(unsafe.Pointer(&bounce))); errno != 0 {
+ throw("interrupt injection failed")
+ }
+}
+
// bluepillHandler is called from the signal stub.
//
// The world may be stopped while this is executing, and it executes on the
@@ -69,6 +83,9 @@ func bluepillHandler(context unsafe.Pointer) {
// Sanitize the registers; interrupts must always be disabled.
c := bluepillArchEnter(bluepillArchContext(context))
+ // Enable preemption.
+ c.setSignalMask(true)
+
// Increment the number of switches.
atomic.AddUint32(&c.switches, 1)
@@ -89,6 +106,9 @@ func bluepillHandler(context unsafe.Pointer) {
// interrupted KVM. Since we're in a signal handler
// currently, all signals are masked and the signal
// must have been delivered directly to this thread.
+ //
+ // We will not be able to actually do subsequent
+ // KVM_RUNs until this signal is processed.
timeout := syscall.Timespec{}
sig, _, errno := syscall.RawSyscall6(
syscall.SYS_RT_SIGTIMEDWAIT,
@@ -98,12 +118,24 @@ func bluepillHandler(context unsafe.Pointer) {
8, // sigset size.
0, 0)
if errno == syscall.EAGAIN {
- continue
- }
- if errno != 0 {
+ // If weren't able to process this signal, then
+ // it must not have been in the bounceMask. By
+ // elimination, it must have been the
+ // preemption signal. We can't process this
+ // signal right now, so we need to disable
+ // preemption until the interrupt is actually
+ // handled.
+ c.setSignalMask(false)
+ // Note that there is a waiter for this vCPU.
+ // This will cause the vCPU to exit at some
+ // point in the future (releasing the user lock
+ // and guest mode).
+ atomicbitops.OrUint32(&c.state, vCPUWaiter)
+ } else if errno != 0 {
+ // We only expect success or a timeout.
throw("error waiting for pending signal")
- }
- if sig != uintptr(bounceSignal) {
+ } else if sig != uintptr(bounceSignal) {
+ // Only the bounce should be processed.
throw("unexpected signal")
}
@@ -114,11 +146,10 @@ func bluepillHandler(context unsafe.Pointer) {
// ready.
if c.runData.readyForInterruptInjection == 0 {
c.runData.requestInterruptWindow = 1
- continue // Rerun vCPU.
} else {
- // Force injection below; the vCPU is ready.
- c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
+ injectInterrupt(c)
}
+ continue // Rerun vCPU.
case syscall.EFAULT:
// If a fault is not serviceable due to the host
// backing pages having page permissions, instead of an
@@ -137,6 +168,30 @@ func bluepillHandler(context unsafe.Pointer) {
}
switch c.runData.exitReason {
+ case _KVM_EXIT_HLT:
+ // Copy out registers.
+ bluepillArchExit(c, bluepillArchContext(context))
+
+ // Return to the vCPUReady state; notify any waiters.
+ user := atomic.LoadUint32(&c.state) & vCPUUser
+ switch atomic.SwapUint32(&c.state, user) {
+ case user | vCPUGuest: // Expected case.
+ case user | vCPUGuest | vCPUWaiter:
+ c.notify()
+ default:
+ throw("invalid state")
+ }
+ return
+ case _KVM_EXIT_IRQ_WINDOW_OPEN:
+ // Inject an interrupt now.
+ injectInterrupt(c)
+ // Clear previous injection request.
+ c.runData.requestInterruptWindow = 0
+ case _KVM_EXIT_INTR:
+ // This is fine, it is the normal exit reason during
+ // signal delivery. However, we still need to handle
+ // other potential exit reasons *combined* with EINTR,
+ // so this switch must be hit even after the above.
case _KVM_EXIT_EXCEPTION:
c.die(bluepillArchContext(context), "exception")
return
@@ -155,20 +210,6 @@ func bluepillHandler(context unsafe.Pointer) {
case _KVM_EXIT_DEBUG:
c.die(bluepillArchContext(context), "debug")
return
- case _KVM_EXIT_HLT:
- // Copy out registers.
- bluepillArchExit(c, bluepillArchContext(context))
-
- // Return to the vCPUReady state; notify any waiters.
- user := atomic.LoadUint32(&c.state) & vCPUUser
- switch atomic.SwapUint32(&c.state, user) {
- case user | vCPUGuest: // Expected case.
- case user | vCPUGuest | vCPUWaiter:
- c.notify()
- default:
- throw("invalid state")
- }
- return
case _KVM_EXIT_MMIO:
// Increment the fault count.
atomic.AddUint32(&c.faults, 1)
@@ -200,18 +241,6 @@ func bluepillHandler(context unsafe.Pointer) {
data[i] = *b
}
}
- case _KVM_EXIT_IRQ_WINDOW_OPEN:
- // Interrupt: we must have requested an interrupt
- // window; set the interrupt line.
- if _, _, errno := syscall.RawSyscall(
- syscall.SYS_IOCTL,
- uintptr(c.fd),
- _KVM_INTERRUPT,
- uintptr(unsafe.Pointer(&bounce))); errno != 0 {
- throw("interrupt injection failed")
- }
- // Clear previous injection request.
- c.runData.requestInterruptWindow = 0
case _KVM_EXIT_SHUTDOWN:
c.die(bluepillArchContext(context), "shutdown")
return
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index 1d5c77ff4..07d9c9a98 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -48,6 +48,7 @@ const (
_KVM_EXIT_IRQ_WINDOW_OPEN = 0x7
_KVM_EXIT_SHUTDOWN = 0x8
_KVM_EXIT_FAIL_ENTRY = 0x9
+ _KVM_EXIT_INTR = 0xa
_KVM_EXIT_INTERNAL_ERROR = 0x11
_KVM_EXIT_SYSTEM_EVENT = 0x18
)
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index c42752d50..d42ba3f24 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -16,12 +16,15 @@ package kvm
import (
"math/rand"
+ "os"
"reflect"
+ "runtime"
"sync/atomic"
"syscall"
"testing"
"time"
+ "gvisor.dev/gvisor/pkg/procid"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
@@ -320,15 +323,18 @@ func TestBounce(t *testing.T) {
})
}
+// randomSleep is used by some race tests below.
+//
+// O(hundreds of microseconds) is appropriate to ensure different overlaps and
+// different schedules.
+func randomSleep() {
+ if n := rand.Intn(1000); n > 100 {
+ time.Sleep(time.Duration(n) * time.Microsecond)
+ }
+}
+
func TestBounceStress(t *testing.T) {
applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
- randomSleep := func() {
- // O(hundreds of microseconds) is appropriate to ensure
- // different overlaps and different schedules.
- if n := rand.Intn(1000); n > 100 {
- time.Sleep(time.Duration(n) * time.Microsecond)
- }
- }
for i := 0; i < 1000; i++ {
// Start an asynchronously executing goroutine that
// calls Bounce at pseudo-random point in time.
@@ -355,6 +361,50 @@ func TestBounceStress(t *testing.T) {
})
}
+func TestPreemption(t *testing.T) {
+ applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ // Lock the main vCPU thread.
+ runtime.LockOSThread()
+ pid := os.Getpid()
+ tid := procid.Current()
+ running := uint32(1)
+ defer atomic.StoreUint32(&running, 0)
+
+ // Start generating "preemptions".
+ go func() {
+ for atomic.LoadUint32(&running) != 0 {
+ // Kick via a preemption: best effort.
+ syscall.Tgkill(pid, int(tid), syscall.SIGURG)
+ randomSleep()
+ }
+ }()
+
+ for i := 0; i < 1000; i++ {
+ randomSleep()
+ var si arch.SignalInfo
+ if _, err := c.SwitchToUser(ring0.SwitchOpts{
+ Registers: regs,
+ FloatingPointState: dummyFPState,
+ PageTables: pt,
+ }, &si); err != platform.ErrContextInterrupt {
+ t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt)
+ }
+ // Was this caused by a preemption signal?
+ if got := atomic.LoadUint32(&c.state); got&vCPUGuest != 0 && got&vCPUWaiter == 0 {
+ continue
+ }
+ c.unlock()
+ // Should have dropped from guest mode, processed preemption.
+ if got := atomic.LoadUint32(&c.state); got != vCPUReady {
+ t.Errorf("vCPU not in ready state: got %v", got)
+ }
+ randomSleep()
+ c.lock()
+ }
+ return false
+ })
+}
+
func TestInvalidate(t *testing.T) {
var data uintptr // Used below.
applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index f1afc74dc..345b71e8f 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -108,6 +108,9 @@ type vCPU struct {
// This is a bitmask of the three fields (vCPU*) described above.
state uint32
+ // signalMask is the vCPU signal mask.
+ signalMask uint64
+
// runData for this vCPU.
runData *runData
@@ -121,6 +124,7 @@ type vCPU struct {
// vCPUArchState is the architecture-specific state.
vCPUArchState
+ // dieState is the temporary state associated with throwing exceptions.
dieState dieState
}
@@ -153,11 +157,6 @@ func (m *machine) newVCPU() *vCPU {
c.CPU.Init(&m.kernel, c)
m.vCPUsByID[c.id] = c
- // Ensure the signal mask is correct.
- if err := c.setSignalMask(); err != nil {
- panic(fmt.Sprintf("error setting signal mask: %v", err))
- }
-
// Map the run data.
runData, err := mapRunData(int(fd))
if err != nil {
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index 7156c245f..52286e56d 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -111,31 +111,6 @@ func (c *vCPU) setSystemTime() error {
return nil
}
-// setSignalMask sets the vCPU signal mask.
-//
-// This must be called prior to running the vCPU.
-func (c *vCPU) setSignalMask() error {
- // The layout of this structure implies that it will not necessarily be
- // the same layout chosen by the Go compiler. It gets fudged here.
- var data struct {
- length uint32
- mask1 uint32
- mask2 uint32
- _ uint32
- }
- data.length = 8 // Fixed sigset size.
- data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
- data.mask2 = ^uint32(bounceSignalMask >> 32)
- if _, _, errno := syscall.RawSyscall(
- syscall.SYS_IOCTL,
- uintptr(c.fd),
- _KVM_SET_SIGNAL_MASK,
- uintptr(unsafe.Pointer(&data))); errno != 0 {
- return fmt.Errorf("error setting signal mask: %v", errno)
- }
- return nil
-}
-
// setUserRegisters sets user registers in the vCPU.
func (c *vCPU) setUserRegisters(uregs *userRegs) error {
if _, _, errno := syscall.RawSyscall(
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index b531f2f85..185eeb4f0 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -268,32 +268,6 @@ func (c *vCPU) setSystemTime() error {
return nil
}
-// setSignalMask sets the vCPU signal mask.
-//
-// This must be called prior to running the vCPU.
-func (c *vCPU) setSignalMask() error {
- // The layout of this structure implies that it will not necessarily be
- // the same layout chosen by the Go compiler. It gets fudged here.
- var data struct {
- length uint32
- mask1 uint32
- mask2 uint32
- _ uint32
- }
- data.length = 8 // Fixed sigset size.
- data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
- data.mask2 = ^uint32(bounceSignalMask >> 32)
- if _, _, errno := syscall.RawSyscall(
- syscall.SYS_IOCTL,
- uintptr(c.fd),
- _KVM_SET_SIGNAL_MASK,
- uintptr(unsafe.Pointer(&data))); errno != 0 {
- return fmt.Errorf("error setting signal mask: %v", errno)
- }
-
- return nil
-}
-
// SwitchToUser unpacks architectural-details.
func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) {
// Check for canonical addresses.
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index f04be2ab5..e4de0a889 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -87,6 +87,47 @@ func unmapRunData(r *runData) error {
return nil
}
+// setSignalMask sets the vCPU signal mask.
+//
+// This will be called from the bluepill handler, and therefore must not
+// perform any allocation.
+//
+//go:nosplit
+func (c *vCPU) setSignalMask(enableOthers bool) {
+ // The signal mask is either:
+ // *) Only the bounce signal, which we need to use to execute the
+ // machine state up until the bounce interrupt can be processed.
+ // or
+ // *) All signals, which is the default state unless we need to
+ // continue execution to exit guest mode (the case above).
+ mask := bounceSignalMask
+ if enableOthers {
+ mask |= otherSignalsMask
+ }
+ if c.signalMask == mask {
+ return // Already set.
+ }
+
+ // The layout of this structure implies that it will not necessarily be
+ // the same layout chosen by the Go compiler. It gets fudged here.
+ var data struct {
+ length uint32
+ mask1 uint32
+ mask2 uint32
+ _ uint32
+ }
+ data.length = 8 // Fixed sigset size.
+ data.mask1 = ^uint32(mask & 0xffffffff)
+ data.mask2 = ^uint32(mask >> 32)
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_SIGNAL_MASK,
+ uintptr(unsafe.Pointer(&data))); errno != 0 {
+ throw("setSignal mask failed")
+ }
+}
+
// atomicAddressSpace is an atomic address space pointer.
type atomicAddressSpace struct {
pointer unsafe.Pointer