path: root/pkg/sentry/platform/kvm/machine.go
author    Googler <noreply@google.com>  2018-04-27 10:37:02 -0700
committer Adin Scannell <ascannell@google.com>  2018-04-28 01:44:26 -0400
commit    d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree      54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/platform/kvm/machine.go
parent    f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
Check in gVisor.
PiperOrigin-RevId: 194583126
Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/sentry/platform/kvm/machine.go')
-rw-r--r--  pkg/sentry/platform/kvm/machine.go  412
1 files changed, 412 insertions, 0 deletions
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
new file mode 100644
index 000000000..a5be0cee3
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -0,0 +1,412 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+ "runtime"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/tmutex"
+)
+
+// machine contains state associated with the VM as a whole.
+type machine struct {
+ // fd is the vm fd.
+ fd int
+
+ // nextSlot is the next slot for setMemoryRegion.
+ //
+ // This must be accessed atomically. If nextSlot is ^uint32(0), then
+ // slots are currently being updated, and the caller should retry.
+ nextSlot uint32
+
+ // kernel is the set of global structures.
+ kernel *ring0.Kernel
+
+ // mappingCache is used for mapPhysical.
+ mappingCache sync.Map
+
+ // mu protects vCPUs.
+ mu sync.Mutex
+
+ // vCPUs are the machine vCPUs.
+ //
+ // This is eventually keyed by system TID, but is initially indexed by
+ // the negative vCPU id. This is merely an optimization, so even if
+ // collisions were possible here (they are not), it wouldn't matter anyway.
+ vCPUs map[uint64]*vCPU
+}
+
+const (
+ // vCPUReady is the lock value for an available vCPU.
+ //
+ // Legal transition: vCPUGuest (bluepill).
+ vCPUReady uintptr = iota
+
+ // vCPUGuest indicates the vCPU is in guest mode.
+ //
+ // Legal transitions: vCPUReady (bluepill), vCPUWaiter (wait).
+ vCPUGuest
+
+ // vCPUWaiter indicates that the vCPU should be released.
+ //
+ // Legal transition: vCPUReady (bluepill).
+ vCPUWaiter
+)
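+
+// In summary, the legal state transitions described above are:
+//
+//	vCPUReady  --bluepill--> vCPUGuest
+//	vCPUGuest  --bluepill--> vCPUReady
+//	vCPUGuest  --wait------> vCPUWaiter
+//	vCPUWaiter --bluepill--> vCPUReady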
+
+// vCPU is a single KVM vCPU.
+type vCPU struct {
+ // CPU is the kernel CPU data.
+ //
+ // This must be the first element of this structure; it is referenced
+ // by the bluepill code (see bluepill_amd64.s).
+ ring0.CPU
+
+ // fd is the vCPU fd.
+ fd int
+
+ // tid is the last set tid.
+ tid uint64
+
+ // switches is a count of world switches (informational only).
+ switches uint32
+
+ // faults is a count of world faults (informational only).
+ faults uint32
+
+ // state is the vCPU state; all are described above.
+ state uintptr
+
+ // runData for this vCPU.
+ runData *runData
+
+ // machine associated with this vCPU.
+ machine *machine
+
+ // mu applies across get/put; it does not protect the above.
+ mu tmutex.Mutex
+}
+
+// newMachine returns a new VM context.
+func newMachine(vm int, vCPUs int) (*machine, error) {
+ // Create the machine.
+ m := &machine{
+ fd: vm,
+ vCPUs: make(map[uint64]*vCPU),
+ }
+ if vCPUs > _KVM_NR_VCPUS {
+ // Hard cap at KVM's limit.
+ vCPUs = _KVM_NR_VCPUS
+ }
+ if n := 2 * runtime.NumCPU(); vCPUs > n {
+ // Cap at twice the number of logical CPUs. Otherwise we're
+ // just wasting memory and thrashing. (There may be scheduling
+ // issues when you've got > n active threads.)
+ vCPUs = n
+ }
+ m.kernel = ring0.New(ring0.KernelOpts{
+ PageTables: pagetables.New(m, pagetablesOpts),
+ })
+
+ // Initialize architecture state.
+ if err := m.initArchState(vCPUs); err != nil {
+ m.Destroy()
+ return nil, err
+ }
+
+ // Create all the vCPUs.
+ for id := 0; id < vCPUs; id++ {
+ // Create the vCPU.
+ fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(vm), _KVM_CREATE_VCPU, uintptr(id))
+ if errno != 0 {
+ m.Destroy()
+ return nil, fmt.Errorf("error creating VCPU: %v", errno)
+ }
+ c := &vCPU{
+ fd: int(fd),
+ machine: m,
+ }
+ c.mu.Init()
+ c.CPU.Init(m.kernel)
+ c.CPU.KernelSyscall = bluepillSyscall
+ c.CPU.KernelException = bluepillException
+ m.vCPUs[uint64(-id)] = c // See above.
+
+ // Ensure the signal mask is correct.
+ if err := c.setSignalMask(); err != nil {
+ m.Destroy()
+ return nil, err
+ }
+
+ // Initialize architecture state.
+ if err := c.initArchState(); err != nil {
+ m.Destroy()
+ return nil, err
+ }
+
+ // Map the run data.
+ runData, err := mapRunData(int(fd))
+ if err != nil {
+ m.Destroy()
+ return nil, err
+ }
+ c.runData = runData
+ }
+
+ // Apply the physical mappings. Note that these mappings may point to
+ // guest physical addresses that are not actually available. These
+ // physical pages are mapped on demand, see kernel_unsafe.go.
+ applyPhysicalRegions(func(pr physicalRegion) bool {
+ // Map everything in the lower half.
+ m.kernel.PageTables.Map(usermem.Addr(pr.virtual), pr.length, false /* kernel */, usermem.AnyAccess, pr.physical)
+ // And keep everything in the upper half.
+ kernelAddr := usermem.Addr(ring0.KernelStartAddress | pr.virtual)
+ m.kernel.PageTables.Map(kernelAddr, pr.length, false /* kernel */, usermem.AnyAccess, pr.physical)
+ return true // Keep iterating.
+ })
+
+ // Ensure that the currently mapped virtual regions are actually
+ // available in the VM. Note that this doesn't guarantee no future
+ // faults, however it should guarantee that everything is available to
+ // ensure successful vCPU entry.
+ applyVirtualRegions(func(vr virtualRegion) {
+ if excludeVirtualRegion(vr) {
+ return // skip region.
+ }
+ for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
+ physical, length, ok := TranslateToPhysical(virtual)
+ if !ok {
+ // This must be an invalid region that was
+ // knocked out by creation of the physical map.
+ return
+ }
+ if virtual+length > vr.virtual+vr.length {
+ // Cap the length to the end of the area.
+ length = vr.virtual + vr.length - virtual
+ }
+
+ // Ensure the physical range is mapped.
+ m.mapPhysical(physical, length)
+ virtual += length
+ }
+ })
+
+ // Ensure the machine is cleaned up properly.
+ runtime.SetFinalizer(m, (*machine).Destroy)
+ return m, nil
+}
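+
+// A minimal construction sketch (illustrative only; the kvmFd value and the
+// _KVM_CREATE_VM constant are assumed to come from the package's KVM setup
+// code, not from this file):
+//
+//	vm, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(kvmFd), _KVM_CREATE_VM, 0)
+//	if errno != 0 {
+//		return nil, errno
+//	}
+//	m, err := newMachine(int(vm), runtime.NumCPU())
+//	if err != nil {
+//		return nil, err
+//	}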
+
+// mapPhysical checks for the mapping of a physical range, and installs one if
+// not available. This attempts to be efficient for calls in the hot path.
+//
+// This panics on error.
+func (m *machine) mapPhysical(physical, length uintptr) {
+ for end := physical + length; physical < end; {
+ _, physicalStart, length, ok := calculateBluepillFault(m, physical)
+ if !ok {
+ // Should never happen.
+ panic("mapPhysical on unknown physical address")
+ }
+
+ if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok {
+ // Not present in the cache; requires setting the slot.
+ if _, ok := handleBluepillFault(m, physical); !ok {
+ panic("handleBluepillFault failed")
+ }
+ }
+
+ // Move to the next chunk.
+ physical = physicalStart + length
+ }
+}
+
+// Destroy frees associated resources.
+//
+// Destroy should only be called once all active users of the machine are gone.
+// The machine object should not be used after calling Destroy.
+//
+// Precondition: all vCPUs must be returned to the machine.
+func (m *machine) Destroy() {
+ runtime.SetFinalizer(m, nil)
+
+ // Destroy vCPUs.
+ for _, c := range m.vCPUs {
+ // Ensure the vCPU is not still running in guest mode. This is
+ // possible only if teardown has been done by other threads and,
+ // somehow, a single thread has not executed any system calls.
+ c.wait()
+
+ // Teardown the vCPU itself.
+ switch state := c.State(); state {
+ case vCPUReady:
+ // Note that the runData may not be mapped if an error
+ // occurs during the middle of initialization.
+ if c.runData != nil {
+ if err := unmapRunData(c.runData); err != nil {
+ panic(fmt.Sprintf("error unmapping rundata: %v", err))
+ }
+ }
+ if err := syscall.Close(int(c.fd)); err != nil {
+ panic(fmt.Sprintf("error closing vCPU fd: %v", err))
+ }
+ case vCPUGuest, vCPUWaiter:
+ // Should never happen; waited above.
+ panic("vCPU disposed in guest state")
+ default:
+ // Should never happen; not a valid state.
+ panic(fmt.Sprintf("vCPU in invalid state: %v", state))
+ }
+ }
+
+ // Release host mappings.
+ if m.kernel.PageTables != nil {
+ m.kernel.PageTables.Release()
+ }
+
+ // vCPUs are gone: teardown machine state.
+ if err := syscall.Close(m.fd); err != nil {
+ panic(fmt.Sprintf("error closing VM fd: %v", err))
+ }
+}
+
+// Get gets an available vCPU.
+func (m *machine) Get() (*vCPU, error) {
+ runtime.LockOSThread()
+ tid := procid.Current()
+ m.mu.Lock()
+
+ for {
+ // Check for an exact match.
+ if c := m.vCPUs[tid]; c != nil && c.mu.TryLock() {
+ m.mu.Unlock()
+ return c, nil
+ }
+
+ // Scan for an available vCPU.
+ for origTID, c := range m.vCPUs {
+ if c.LockInState(vCPUReady) {
+ delete(m.vCPUs, origTID)
+ m.vCPUs[tid] = c
+ m.mu.Unlock()
+
+ // We need to reload thread-local segments as
+ // we have origTID != tid and the vCPU state
+ // may be stale.
+ c.loadSegments()
+ atomic.StoreUint64(&c.tid, tid)
+ return c, nil
+ }
+ }
+
+ // Everything is busy executing user code (locked).
+ //
+ // We hold the pool lock here, so we should be able to kick something
+ // out of kernel mode and have it bounce into host mode when it tries
+ // to grab the vCPU again.
+ for _, c := range m.vCPUs {
+ if c.State() != vCPUWaiter {
+ c.Bounce()
+ }
+ }
+
+ // Give other threads an opportunity to run.
+ yield()
+ }
+}
+
+// Put releases the given vCPU and unlocks the OS thread.
+func (m *machine) Put(c *vCPU) {
+ c.Unlock()
+ runtime.UnlockOSThread()
+}
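+
+// A minimal usage sketch for Get/Put (illustrative only; the bluepill entry
+// point is assumed from the surrounding package rather than defined here):
+//
+//	c, err := m.Get() // locks the OS thread and an available vCPU
+//	if err != nil {
+//		return err
+//	}
+//	defer m.Put(c) // releases the vCPU and unlocks the OS thread
+//	bluepill(c)    // switch this thread into guest mode on c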
+
+// State returns the current state.
+func (c *vCPU) State() uintptr {
+ return atomic.LoadUintptr(&c.state)
+}
+
+// Lock locks the vCPU.
+func (c *vCPU) Lock() {
+ c.mu.Lock()
+}
+
+// Invalidate invalidates caches.
+func (c *vCPU) Invalidate() {
+}
+
+// LockInState locks the vCPU if it is in the given state and TryLock succeeds.
+func (c *vCPU) LockInState(state uintptr) bool {
+ if c.State() == state && c.mu.TryLock() {
+ if c.State() != state {
+ c.mu.Unlock()
+ return false
+ }
+ return true
+ }
+ return false
+}
+
+// Unlock unlocks the given vCPU.
+func (c *vCPU) Unlock() {
+ // Ensure we're out of guest mode, if necessary.
+ if c.State() == vCPUWaiter {
+ redpill() // Force guest mode exit.
+ }
+ c.mu.Unlock()
+}
+
+// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
+func (c *vCPU) NotifyInterrupt() {
+ c.Bounce()
+}
+
+// pid is used below in Bounce.
+var pid = syscall.Getpid()
+
+// Bounce ensures that the vCPU bounces back to the kernel.
+//
+// In practice, this means returning EAGAIN from running user code. The vCPU
+// will be unlocked and relocked, and the kernel is guaranteed to check for
+// interrupt notifications (e.g. injected via Notify) and invalidations.
+func (c *vCPU) Bounce() {
+ for {
+ if c.mu.TryLock() {
+ // We know that the vCPU must be in the kernel already,
+ // because the lock was not already held. We specifically
+ // don't want to call bounce in this case, because it's
+ // not necessary to knock the vCPU out of guest mode.
+ c.mu.Unlock()
+ return
+ }
+
+ if state := c.State(); state == vCPUGuest || state == vCPUWaiter {
+ // We know that the vCPU was in guest mode, so a single signal
+ // interruption will guarantee that a transition takes place.
+ syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal)
+ return
+ }
+
+ // Someone holds the lock, but the vCPU has not yet transitioned
+ // into guest mode. It's in the critical section; give it time.
+ yield()
+ }
+}