 pkg/sentry/platform/kvm/BUILD                 |   2
 pkg/sentry/platform/kvm/address_space.go      |   2
 pkg/sentry/platform/kvm/bluepill_unsafe.go    |  27
 pkg/sentry/platform/kvm/kvm_test.go           |  20
 pkg/sentry/platform/kvm/machine.go            | 243
 pkg/sentry/platform/kvm/machine_amd64.go      |   1
 pkg/sentry/platform/kvm/machine_unsafe.go     |  36
 7 files changed, 183 insertions(+), 148 deletions(-)
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index d902e344a..adc43c21b 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -51,6 +51,7 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
+ "//pkg/atomicbitops",
"//pkg/cpuid",
"//pkg/log",
"//pkg/sentry/arch",
@@ -63,7 +64,6 @@ go_library(
"//pkg/sentry/platform/safecopy",
"//pkg/sentry/time",
"//pkg/sentry/usermem",
- "//pkg/tmutex",
],
)
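
The dependency swap above (//pkg/tmutex out, //pkg/atomicbitops in) is the heart of the change: vCPU ownership is no longer a try-lockable mutex plus a separate state word, but a single uint32 bitmask updated with atomic operations. The following is a minimal, self-contained sketch of that scheme using only sync/atomic; the constant names mirror the diff, but the program itself is illustrative and not part of the package.

    // state_sketch.go: vCPU ownership as a single atomic bitmask.
    package main

    import (
        "fmt"
        "sync/atomic"
    )

    const (
        vCPUReady  uint32 = 0      // No bits set: free to be stolen.
        vCPUUser   uint32 = 1 << 0 // Owned by a thread heading to user mode.
        vCPUGuest  uint32 = 1 << 1 // Currently executing in guest mode.
        vCPUWaiter uint32 = 1 << 2 // Someone is waiting on a transition.
    )

    func main() {
        var state uint32

        // Acquisition (Get): only a completely idle vCPU may be stolen,
        // so the CAS requires the exact vCPUReady value.
        if atomic.CompareAndSwapUint32(&state, vCPUReady, vCPUUser) {
            fmt.Printf("acquired: %03b\n", atomic.LoadUint32(&state))
        }

        // Guest entry (bluepill handler): swap in the guest bit while
        // keeping the user bit.
        old := atomic.SwapUint32(&state, vCPUUser|vCPUGuest)
        fmt.Printf("entered guest, previous: %03b\n", old)

        // Release (unlock): the happy path is one CAS that drops only
        // the user bit, leaving the vCPU parked in guest mode.
        if atomic.CompareAndSwapUint32(&state, vCPUUser|vCPUGuest, vCPUGuest) {
            fmt.Printf("released: %03b\n", atomic.LoadUint32(&state))
        }
    }

Because all three facts live in one word, a waiter can register itself with a compare-and-swap against the exact state it observed, which is what bounce() in machine.go relies on below.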
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
index 2302f78e1..3d57ae0cb 100644
--- a/pkg/sentry/platform/kvm/address_space.go
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -57,7 +57,7 @@ func (as *addressSpace) Invalidate() {
c := key.(*vCPU)
v := value.(*uint32)
atomic.StoreUint32(v, 0) // Invalidation required.
- c.Bounce() // Force a kernel transition.
+ c.BounceToKernel() // Force a kernel transition.
return true // Keep iterating.
})
}
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 9e252af64..2c1e098d7 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -51,15 +51,13 @@ func bluepillHandler(context unsafe.Pointer) {
// Increment the number of switches.
atomic.AddUint32(&c.switches, 1)
- // Store vCPUGuest.
- //
- // This is fine even if we're not in guest mode yet. In this signal
- // handler, we'll already have all the relevant signals blocked, so an
- // interrupt is only deliverable when we actually execute the KVM_RUN.
- //
- // The state will be returned to vCPUReady by Phase2.
- if state := atomic.SwapUintptr(&c.state, vCPUGuest); state != vCPUReady {
- throw("vCPU not in ready state")
+ // Mark this as guest mode.
+ switch atomic.SwapUint32(&c.state, vCPUGuest|vCPUUser) {
+ case vCPUUser: // Expected case.
+ case vCPUUser | vCPUWaiter:
+ c.notify()
+ default:
+ throw("invalid state")
}
for {
@@ -118,11 +116,12 @@ func bluepillHandler(context unsafe.Pointer) {
// Copy out registers.
bluepillArchExit(c, bluepillArchContext(context))
- // Notify any waiters.
- switch state := atomic.SwapUintptr(&c.state, vCPUReady); state {
- case vCPUGuest:
- case vCPUWaiter:
- c.notify() // Safe from handler.
+ // Return to the vCPUReady state; notify any waiters.
+ user := atomic.LoadUint32(&c.state) & vCPUUser
+ switch atomic.SwapUint32(&c.state, user) {
+ case user | vCPUGuest: // Expected case.
+ case user | vCPUGuest | vCPUWaiter:
+ c.notify()
default:
throw("invalid state")
}
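
Pulled out of the signal handler, the two swaps above look roughly like the standalone sketch below. enterGuest and exitGuest are illustrative names only (the real transitions happen inline in bluepillHandler), and notify() is reduced to a print where the real code performs a futex wake.

    // transitions_sketch.go: the guest entry/exit swaps and waiter handling.
    package main

    import (
        "fmt"
        "sync/atomic"
    )

    const (
        vCPUUser   uint32 = 1 << 0
        vCPUGuest  uint32 = 1 << 1
        vCPUWaiter uint32 = 1 << 2
    )

    type vCPU struct{ state uint32 }

    func (c *vCPU) notify() { fmt.Println("waiter notified") }

    // enterGuest mirrors the first hunk: the handler always runs with
    // the user bit held, so any other previous value is a bug.
    func (c *vCPU) enterGuest() {
        switch atomic.SwapUint32(&c.state, vCPUGuest|vCPUUser) {
        case vCPUUser: // Expected case.
        case vCPUUser | vCPUWaiter:
            c.notify() // A bouncer was waiting for guest entry.
        default:
            panic("invalid state")
        }
    }

    // exitGuest mirrors the second hunk: drop the guest and waiter
    // bits, preserving only whether user mode still owns the vCPU.
    func (c *vCPU) exitGuest() {
        user := atomic.LoadUint32(&c.state) & vCPUUser
        switch atomic.SwapUint32(&c.state, user) {
        case user | vCPUGuest: // Expected case.
        case user | vCPUGuest | vCPUWaiter:
            c.notify() // A bouncer was waiting for guest exit.
        default:
            panic("invalid state")
        }
    }

    func main() {
        c := &vCPU{state: vCPUUser | vCPUWaiter}
        c.enterGuest() // Notifies the registered waiter.
        c.exitGuest()
        fmt.Printf("final state: %03b\n", c.state)
    }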
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index 61cfdd8fd..778a6d187 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -17,6 +17,7 @@ package kvm
import (
"math/rand"
"reflect"
+ "sync/atomic"
"syscall"
"testing"
"time"
@@ -84,7 +85,7 @@ func bluepillTest(t testHarness, fn func(*vCPU)) {
func TestKernelSyscall(t *testing.T) {
bluepillTest(t, func(c *vCPU) {
redpill() // Leave guest mode.
- if got := c.State(); got != vCPUReady {
+ if got := atomic.LoadUint32(&c.state); got != vCPUUser {
t.Errorf("vCPU not in ready state: got %v", got)
}
})
@@ -102,7 +103,7 @@ func TestKernelFault(t *testing.T) {
hostFault() // Ensure recovery works.
bluepillTest(t, func(c *vCPU) {
hostFault()
- if got := c.State(); got != vCPUReady {
+ if got := atomic.LoadUint32(&c.state); got != vCPUUser {
t.Errorf("vCPU not in ready state: got %v", got)
}
})
@@ -229,7 +230,7 @@ func TestBounce(t *testing.T) {
applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
go func() {
time.Sleep(time.Millisecond)
- c.Bounce()
+ c.BounceToKernel()
}()
if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextInterrupt {
t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt)
@@ -239,7 +240,7 @@ func TestBounce(t *testing.T) {
applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
go func() {
time.Sleep(time.Millisecond)
- c.Bounce()
+ c.BounceToKernel()
}()
if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != platform.ErrContextInterrupt {
t.Errorf("application full restore: got %v, wanted %v", err, platform.ErrContextInterrupt)
@@ -264,17 +265,15 @@ func TestBounceStress(t *testing.T) {
// kernel is in various stages of the switch.
go func() {
randomSleep()
- c.Bounce()
+ c.BounceToKernel()
}()
randomSleep()
- // Execute the switch.
if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextInterrupt {
t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt)
}
- // Simulate work.
- c.Unlock()
+ c.unlock()
randomSleep()
- c.Lock()
+ c.lock()
}
return false
})
@@ -289,8 +288,7 @@ func TestInvalidate(t *testing.T) {
}
// Unmap the page containing data & invalidate.
pt.Unmap(usermem.Addr(reflect.ValueOf(&data).Pointer() & ^uintptr(usermem.PageSize-1)), usermem.PageSize)
- c.Invalidate() // Ensure invalidation.
- if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextSignal {
+ if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFlush); err != platform.ErrContextSignal {
t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextSignal)
}
return false
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index a5be0cee3..7a962e316 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -21,11 +21,11 @@ import (
"sync/atomic"
"syscall"
+ "gvisor.googlesource.com/gvisor/pkg/atomicbitops"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
- "gvisor.googlesource.com/gvisor/pkg/tmutex"
)
// machine contains state associated with the VM as a whole.
@@ -57,20 +57,19 @@ type machine struct {
}
const (
- // vCPUReady is the lock value for an available vCPU.
- //
- // Legal transitions: vCPUGuest (bluepill).
- vCPUReady uintptr = iota
+ // vCPUReady is an alias for all the below clear.
+ vCPUReady uint32 = 0
+
+ // vCPUUser indicates that the vCPU is in or about to enter user mode.
+ vCPUUser uint32 = 1 << 0
// vCPUGuest indicates the vCPU is in guest mode.
- //
- // Legal transition: vCPUReady (bluepill), vCPUWaiter (wait).
- vCPUGuest
+ vCPUGuest uint32 = 1 << 1
- // vCPUWaiter indicates that the vCPU should be released.
+ // vCPUWaiter indicates that there is a waiter.
//
- // Legal transition: vCPUReady (bluepill).
- vCPUWaiter
+ // If this is set, then notify must be called on any state transitions.
+ vCPUWaiter uint32 = 1 << 2
)
// vCPU is a single KVM vCPU.
@@ -93,17 +92,16 @@ type vCPU struct {
// faults is a count of world faults (informational only).
faults uint32
- // state is the vCPU state; all are described above.
- state uintptr
+ // state is the vCPU state.
+ //
+ // This is a bitmask of the three flags (vCPU*) described above.
+ state uint32
// runData for this vCPU.
runData *runData
// machine associated with this vCPU.
machine *machine
-
- // mu applies across get/put; it does not protect the above.
- mu tmutex.Mutex
}
// newMachine returns a new VM context.
@@ -145,7 +143,6 @@ func newMachine(vm int, vCPUs int) (*machine, error) {
fd: int(fd),
machine: m,
}
- c.mu.Init()
c.CPU.Init(m.kernel)
c.CPU.KernelSyscall = bluepillSyscall
c.CPU.KernelException = bluepillException
@@ -253,27 +250,17 @@ func (m *machine) Destroy() {
// Ensure the vCPU is not still running in guest mode. This is
// possible iff teardown has been done by other threads, and
// somehow a single thread has not executed any system calls.
- c.wait()
-
- // Teardown the vCPU itself.
- switch state := c.State(); state {
- case vCPUReady:
- // Note that the runData may not be mapped if an error
- // occurs during the middle of initialization.
- if c.runData != nil {
- if err := unmapRunData(c.runData); err != nil {
- panic(fmt.Sprintf("error unmapping rundata: %v", err))
- }
- }
- if err := syscall.Close(int(c.fd)); err != nil {
- panic(fmt.Sprintf("error closing vCPU fd: %v", err))
+ c.BounceToHost()
+
+ // Note that the runData may not be mapped if an error occurs
+ // during the middle of initialization.
+ if c.runData != nil {
+ if err := unmapRunData(c.runData); err != nil {
+ panic(fmt.Sprintf("error unmapping rundata: %v", err))
}
- case vCPUGuest, vCPUWaiter:
- // Should never happen; waited above.
- panic("vCPU disposed in guest state")
- default:
- // Should never happen; not a valid state.
- panic(fmt.Sprintf("vCPU in invalid state: %v", state))
+ }
+ if err := syscall.Close(int(c.fd)); err != nil {
+ panic(fmt.Sprintf("error closing vCPU fd: %v", err))
}
}
@@ -296,14 +283,19 @@ func (m *machine) Get() (*vCPU, error) {
for {
// Check for an exact match.
- if c := m.vCPUs[tid]; c != nil && c.mu.TryLock() {
+ if c := m.vCPUs[tid]; c != nil {
+ c.lock()
m.mu.Unlock()
return c, nil
}
// Scan for an available vCPU.
for origTID, c := range m.vCPUs {
- if c.LockInState(vCPUReady) {
+ // We can only steal a vCPU that is in the vCPUReady
+ // state. That is, it must not be heading to user mode
+ // with some other thread, have a waiter registered, or
+ // be in guest mode already.
+ if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
delete(m.vCPUs, origTID)
m.vCPUs[tid] = c
m.mu.Unlock()
@@ -317,96 +309,151 @@ func (m *machine) Get() (*vCPU, error) {
}
}
- // Everything is busy executing user code (locked).
+ // Everything is already in guest mode.
//
- // We hold the pool lock here, so we should be able to kick something
- // out of kernel mode and have it bounce into host mode when it tries
- // to grab the vCPU again.
+ // We hold the pool lock here, so we should be able to kick
+ // something out of kernel mode and have it bounce into host
+ // mode when it tries to grab the vCPU again.
for _, c := range m.vCPUs {
- if c.State() != vCPUWaiter {
- c.Bounce()
- }
+ c.BounceToHost()
}
- // Give other threads an opportunity to run.
+ // Give other threads an opportunity to run. We don't yield the
+ // pool lock above, so if they try to regrab the lock we will
+ // serialize at this point. This is extreme, but we don't
+ // expect to exhaust all vCPUs frequently.
yield()
}
}
// Put puts the current vCPU.
func (m *machine) Put(c *vCPU) {
- c.Unlock()
+ c.unlock()
runtime.UnlockOSThread()
}
-// State returns the current state.
-func (c *vCPU) State() uintptr {
- return atomic.LoadUintptr(&c.state)
-}
-
-// Lock locks the vCPU.
-func (c *vCPU) Lock() {
- c.mu.Lock()
-}
-
-// Invalidate invalidates caches.
-func (c *vCPU) Invalidate() {
+// lock marks the vCPU as in user mode.
+//
+// This should only be called directly when known to be safe, i.e. when
+// the vCPU is owned by the current TID with no chance of theft.
+//
+//go:nosplit
+func (c *vCPU) lock() {
+ atomicbitops.OrUint32(&c.state, vCPUUser)
}
-// LockInState locks the vCPU if it is in the given state and TryLock succeeds.
-func (c *vCPU) LockInState(state uintptr) bool {
- if c.State() == state && c.mu.TryLock() {
- if c.State() != state {
- c.mu.Unlock()
- return false
- }
- return true
+// unlock clears the vCPUUser bit.
+//
+//go:nosplit
+func (c *vCPU) unlock() {
+ if atomic.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) {
+ // Happy path: no exits are forced, and we can continue
+ // executing on our merry way with a single atomic access.
+ return
}
- return false
-}
-// Unlock unlocks the given vCPU.
-func (c *vCPU) Unlock() {
- // Ensure we're out of guest mode, if necessary.
- if c.State() == vCPUWaiter {
- redpill() // Force guest mode exit.
+ // Clear the lock.
+ origState := atomic.LoadUint32(&c.state)
+ atomicbitops.AndUint32(&c.state, ^vCPUUser)
+ switch origState {
+ case vCPUUser:
+ // Normal state.
+ case vCPUUser | vCPUGuest | vCPUWaiter:
+ // Force a transition: this must trigger a notification when we
+ // return from guest mode.
+ redpill()
+ case vCPUUser | vCPUWaiter:
+ // Waiting for the lock to be released; the responsibility is
+ // on us to notify the waiter and clear the associated bit.
+ atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
+ c.notify()
+ default:
+ panic("invalid state")
}
- c.mu.Unlock()
}
// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
+//
+//go:nosplit
func (c *vCPU) NotifyInterrupt() {
- c.Bounce()
+ c.BounceToKernel()
}
// pid is used below in bounce.
var pid = syscall.Getpid()
-// Bounce ensures that the vCPU bounces back to the kernel.
+// bounce forces a return to the kernel or to host mode.
//
-// In practice, this means returning EAGAIN from running user code. The vCPU
-// will be unlocked and relock, and the kernel is guaranteed to check for
-// interrupt notifications (e.g. injected via Notify) and invalidations.
-func (c *vCPU) Bounce() {
+// This effectively unwinds the state machine.
+func (c *vCPU) bounce(forceGuestExit bool) {
for {
- if c.mu.TryLock() {
- // We know that the vCPU must be in the kernel already,
- // because the lock was not acquired. We specifically
- // don't want to call bounce in this case, because it's
- // not necessary to knock the vCPU out of guest mode.
- c.mu.Unlock()
+ switch state := atomic.LoadUint32(&c.state); state {
+ case vCPUReady, vCPUWaiter:
+ // There is nothing to be done, we're already in the
+ // kernel pre-acquisition. The Bounce criteria have
+ // been satisfied.
return
+ case vCPUUser:
+ // We need to register a waiter for the actual guest
+ // transition. When the transition takes place, then we
+ // can inject an interrupt to ensure a return to host
+ // mode.
+ atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter)
+ case vCPUUser | vCPUWaiter:
+ // Wait for the transition to guest mode. This should
+ // come from the bluepill handler.
+ c.waitUntilNot(state)
+ case vCPUGuest, vCPUUser | vCPUGuest:
+ if state == vCPUGuest && !forceGuestExit {
+ // The vCPU is already not acquired, so there's
+ // no need to do a fresh injection here.
+ return
+ }
+ // The vCPU is running in the guest (in either user or
+ // kernel mode). Attempt to register a notification on
+ // change.
+ if !atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter) {
+ break // Retry.
+ }
+ for {
+ // We need to spin here until the signal is
+ // delivered, because Tgkill can return EAGAIN
+ // under memory pressure. Since we already
+ // marked ourselves as a waiter, we need to
+ // ensure that a signal is actually delivered.
+ if err := syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal); err == nil {
+ break
+ } else if err.(syscall.Errno) == syscall.EAGAIN {
+ continue
+ } else {
+ // Nothing else should be returned by tgkill.
+ panic(fmt.Sprintf("unexpected tgkill error: %v", err))
+ }
+ }
+ case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter:
+ if state == vCPUGuest|vCPUWaiter && !forceGuestExit {
+ // See above.
+ return
+ }
+ // Wait for the transition. This again should happen
+ // from the bluepill handler, but on the way out.
+ c.waitUntilNot(state)
+ default:
+ // Should not happen: the above is exhaustive.
+ panic("invalid state")
}
+ }
+}
- if state := c.State(); state == vCPUGuest || state == vCPUWaiter {
- // We know that the vCPU was in guest mode, so a single signal
- // interruption will guarantee that a transition takes place.
- syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal)
- return
- }
+// BounceToKernel ensures that the vCPU bounces back to the kernel.
+//
+//go:nosplit
+func (c *vCPU) BounceToKernel() {
+ c.bounce(false)
+}
- // Someone holds the lock, but the vCPU is not yet transitioned
- // into guest mode. It's in the critical section; give it time.
- yield()
- }
+// BounceToHost ensures that the vCPU is in host mode.
+//
+//go:nosplit
+func (c *vCPU) BounceToHost() {
+ c.bounce(true)
}
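
The most delicate part of the new bounce() is the signal-injection loop: once the waiter bit has been registered, a signal must actually reach the vCPU thread, and Tgkill can transiently fail with EAGAIN under memory pressure. A standalone, Linux-only sketch of that retry shape follows; kickThread is a hypothetical helper (not part of the package) that signals the current thread with SIGURG purely to exercise the loop.

    // kick_sketch.go: retry tgkill until the signal is queued.
    package main

    import (
        "fmt"
        "runtime"
        "syscall"
    )

    // kickThread keeps re-sending sig to (pid, tid) until the kernel
    // accepts it. Giving up on EAGAIN would strand a registered waiter.
    func kickThread(pid, tid int, sig syscall.Signal) {
        for {
            switch err := syscall.Tgkill(pid, tid, sig); err {
            case nil:
                return
            case syscall.EAGAIN:
                continue // Transient (e.g. memory pressure); retry.
            default:
                panic(fmt.Sprintf("unexpected tgkill error: %v", err))
            }
        }
    }

    func main() {
        // Signal the current thread with a signal the Go runtime
        // already handles, purely to exercise the loop.
        runtime.LockOSThread()
        defer runtime.UnlockOSThread()
        kickThread(syscall.Getpid(), syscall.Gettid(), syscall.SIGURG)
        fmt.Println("signal delivered")
    }

In the diff, this loop only runs after the CompareAndSwapUint32 that records the waiter bit, so a delivered signal is always paired with a later waitUntilNot on the next pass of the bounce loop.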
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index fe4d31702..4e42f2c87 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -158,7 +158,6 @@ func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetab
return info, usermem.AccessType{}, platform.ErrContextSignal
case ring0.Vector(bounce):
- redpill() // Bail and reacqire.
return nil, usermem.NoAccess, platform.ErrContextInterrupt
default:
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index da67e23f6..9f7fcd135 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -16,7 +16,6 @@ package kvm
import (
"fmt"
- "sync/atomic"
"syscall"
"unsafe"
@@ -69,7 +68,7 @@ func unmapRunData(r *runData) error {
return nil
}
-// notify notifies that the vCPU has returned to host mode.
+// notify notifies that the vCPU has transitioned modes.
//
// This may be called by a signal handler and therefore throws on error.
//
@@ -86,27 +85,20 @@ func (c *vCPU) notify() {
}
}
-// wait waits for the vCPU to return to host mode.
+// waitUntilNot waits for the vCPU to transition modes.
+//
+// The vCPUWaiter bit should have been set in the observed state after
+// performing an appropriate action to cause a transition (e.g. interrupt
+// injection).
//
// This panics on error.
-func (c *vCPU) wait() {
- if !atomic.CompareAndSwapUintptr(&c.state, vCPUGuest, vCPUWaiter) {
- return // Nothing to wait for.
- }
- for {
- _, _, errno := syscall.Syscall6(
- syscall.SYS_FUTEX,
- uintptr(unsafe.Pointer(&c.state)),
- linux.FUTEX_WAIT,
- uintptr(vCPUWaiter), // Expected value.
- 0, 0, 0)
- if errno == syscall.EINTR {
- continue
- } else if errno == syscall.EAGAIN {
- break
- } else if errno != 0 {
- panic("futex wait error")
- }
- break
+func (c *vCPU) waitUntilNot(state uint32) {
+ _, _, errno := syscall.Syscall6(
+ syscall.SYS_FUTEX,
+ uintptr(unsafe.Pointer(&c.state)),
+ linux.FUTEX_WAIT,
+ uintptr(state),
+ 0, 0, 0)
+ if errno != 0 && errno != syscall.EINTR && errno != syscall.EAGAIN {
+ panic("futex wait error")
}
}
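
For reference, the futex pairing behind waitUntilNot and notify can be exercised standalone. The sketch below is Linux-only, spells out the raw FUTEX_WAIT/FUTEX_WAKE values that pkg/abi/linux provides in the real code, and adds a defensive re-check loop that the package itself leaves to the caller (bounce() re-reads the state on every iteration).

    // futex_sketch.go: block until a state word changes, then wake.
    package main

    import (
        "fmt"
        "sync/atomic"
        "syscall"
        "time"
        "unsafe"
    )

    const (
        futexWait uintptr = 0 // linux.FUTEX_WAIT
        futexWake uintptr = 1 // linux.FUTEX_WAKE
    )

    // waitUntilNot returns once *addr no longer holds observed. EINTR
    // and EAGAIN both mean "look again": a signal arrived, or the
    // value changed before we went to sleep.
    func waitUntilNot(addr *uint32, observed uint32) {
        for atomic.LoadUint32(addr) == observed {
            _, _, errno := syscall.Syscall6(
                syscall.SYS_FUTEX,
                uintptr(unsafe.Pointer(addr)),
                futexWait,
                uintptr(observed), // Sleep only if still unchanged.
                0, 0, 0)
            if errno != 0 && errno != syscall.EINTR && errno != syscall.EAGAIN {
                panic("futex wait error")
            }
        }
    }

    // notify wakes a waiter parked on addr after a transition.
    func notify(addr *uint32) {
        if _, _, errno := syscall.Syscall6(
            syscall.SYS_FUTEX,
            uintptr(unsafe.Pointer(addr)),
            futexWake,
            1, // Wake a single waiter; enough for this demo.
            0, 0, 0); errno != 0 {
            panic("futex wake error")
        }
    }

    func main() {
        state := uint32(1)
        go func() {
            time.Sleep(10 * time.Millisecond)
            atomic.StoreUint32(&state, 0) // The transition...
            notify(&state)                // ...and the wake-up.
        }()
        waitUntilNot(&state, 1)
        fmt.Println("observed transition")
    }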