diff options
Diffstat (limited to 'pkg/sentry/platform/kvm/machine.go')
-rw-r--r-- | pkg/sentry/platform/kvm/machine.go | 525 |
1 files changed, 525 insertions, 0 deletions
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kvm

import (
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/atomicbitops"
	"gvisor.googlesource.com/gvisor/pkg/log"
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// machine contains state associated with the VM as a whole.
type machine struct {
	// fd is the vm fd.
	//
	// It is closed by Destroy.
	fd int

	// nextSlot is the next slot for setMemoryRegion.
	//
	// This must be accessed atomically. If nextSlot is ^uint32(0), then
	// slots are currently being updated, and the caller should retry.
	nextSlot uint32

	// kernel is the set of global structures.
	kernel ring0.Kernel

	// mappingCache is used for mapPhysical.
	//
	// mapPhysical stores the start address of each physical region it has
	// installed as a key (with the value true), so that regions already
	// present can be skipped on later calls.
	mappingCache sync.Map

	// mu protects vCPUs.
	//
	// It is also the lock associated with the available condition
	// variable (see newMachine).
	mu sync.RWMutex

	// available is notified when vCPUs are available.
	//
	// Put signals it; Get waits on it when no vCPU can be acquired.
	available sync.Cond

	// vCPUs are the machine vCPUs.
	//
	// These are populated dynamically. The map is keyed by the value of
	// procid.Current() for the thread that last acquired the vCPU via Get;
	// an entry is re-keyed when a vCPU is handed to a different thread.
	vCPUs map[uint64]*vCPU

	// vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID.
	//
	// Entries are added by newVCPU.
	vCPUsByID map[int]*vCPU

	// maxVCPUs is the maximum number of vCPUs supported by the machine.
	//
	// It is set by newMachine from the KVM_CAP_MAX_VCPUS extension check,
	// falling back to _KVM_NR_VCPUS if that query fails.
	maxVCPUs int
}

const (
	// vCPUReady is an alias for all the below clear.
	vCPUReady uint32 = 0

	// vCPUUser indicates that the vCPU is in or about to enter user mode.
	vCPUUser uint32 = 1 << 0

	// vCPUGuest indicates the vCPU is in guest mode.
	vCPUGuest uint32 = 1 << 1

	// vCPUWaiter indicates that there is a waiter.
	//
	// If this is set, then notify must be called on any state transitions.
	vCPUWaiter uint32 = 1 << 2
)

// vCPU is a single KVM vCPU.
type vCPU struct {
	// CPU is the kernel CPU data.
	//
	// This must be the first element of this structure, it is referenced
	// by the bluepill code (see bluepill_amd64.s).
	ring0.CPU

	// id is the vCPU id.
	//
	// It is the value passed to KVM_CREATE_VCPU and the key used in
	// machine.vCPUsByID.
	id int

	// fd is the vCPU fd.
	fd int

	// tid is the last set tid.
	//
	// bounce loads this atomically to pick the target thread for the
	// bounce signal.
	tid uint64

	// switches is a count of world switches (informational only).
	switches uint32

	// faults is a count of world faults (informational only).
	faults uint32

	// state is the vCPU state.
	//
	// This is a bitmask of the three fields (vCPU*) described above. It is
	// read and modified with atomic operations only.
	state uint32

	// runData for this vCPU.
	//
	// It is mapped in newVCPU and unmapped in machine.Destroy; it may be
	// nil if initialization did not complete.
	runData *runData

	// machine associated with this vCPU.
	machine *machine

	// active is the current addressSpace: this is set and read atomically,
	// it is used to elide unnecessary interrupts due to invalidations.
	active atomicAddressSpace

	// vCPUArchState is the architecture-specific state.
	vCPUArchState

	// dieState is scratch state used by vCPU.die (see the dieState type).
	dieState dieState
}

// dieState holds preallocated state used while a vCPU is dying.
type dieState struct {
	// message is thrown from die.
	message string

	// guestRegs is used to store register state during vCPU.die() to prevent
	// allocation inside nosplit function.
	guestRegs userRegs
}

// newVCPU creates and returns a new vCPU.
//
// The new vCPU is registered in vCPUsByID but not in vCPUs; the caller is
// responsible for the latter. This panics if any step of the setup fails.
//
// Precondition: mu must be held.
+func (m *machine) newVCPU() *vCPU { + id := len(m.vCPUs) + + // Create the vCPU. + fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id)) + if errno != 0 { + panic(fmt.Sprintf("error creating new vCPU: %v", errno)) + } + + c := &vCPU{ + id: id, + fd: int(fd), + machine: m, + } + c.CPU.Init(&m.kernel, c) + m.vCPUsByID[c.id] = c + + // Ensure the signal mask is correct. + if err := c.setSignalMask(); err != nil { + panic(fmt.Sprintf("error setting signal mask: %v", err)) + } + + // Map the run data. + runData, err := mapRunData(int(fd)) + if err != nil { + panic(fmt.Sprintf("error mapping run data: %v", err)) + } + c.runData = runData + + // Initialize architecture state. + if err := c.initArchState(); err != nil { + panic(fmt.Sprintf("error initialization vCPU state: %v", err)) + } + + return c // Done. +} + +// newMachine returns a new VM context. +func newMachine(vm int) (*machine, error) { + // Create the machine. + m := &machine{ + fd: vm, + vCPUs: make(map[uint64]*vCPU), + vCPUsByID: make(map[int]*vCPU), + } + m.available.L = &m.mu + m.kernel.Init(ring0.KernelOpts{ + PageTables: pagetables.New(newAllocator()), + }) + + maxVCPUs, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) + if errno != 0 { + m.maxVCPUs = _KVM_NR_VCPUS + } else { + m.maxVCPUs = int(maxVCPUs) + } + log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs) + + // Apply the physical mappings. Note that these mappings may point to + // guest physical addresses that are not actually available. These + // physical pages are mapped on demand, see kernel_unsafe.go. + applyPhysicalRegions(func(pr physicalRegion) bool { + // Map everything in the lower half. + m.kernel.PageTables.Map( + usermem.Addr(pr.virtual), + pr.length, + pagetables.MapOpts{AccessType: usermem.AnyAccess}, + pr.physical) + + // And keep everything in the upper half. 
+ m.kernel.PageTables.Map( + usermem.Addr(ring0.KernelStartAddress|pr.virtual), + pr.length, + pagetables.MapOpts{AccessType: usermem.AnyAccess}, + pr.physical) + + return true // Keep iterating. + }) + + // Ensure that the currently mapped virtual regions are actually + // available in the VM. Note that this doesn't guarantee no future + // faults, however it should guarantee that everything is available to + // ensure successful vCPU entry. + applyVirtualRegions(func(vr virtualRegion) { + if excludeVirtualRegion(vr) { + return // skip region. + } + for virtual := vr.virtual; virtual < vr.virtual+vr.length; { + physical, length, ok := translateToPhysical(virtual) + if !ok { + // This must be an invalid region that was + // knocked out by creation of the physical map. + return + } + if virtual+length > vr.virtual+vr.length { + // Cap the length to the end of the area. + length = vr.virtual + vr.length - virtual + } + + // Ensure the physical range is mapped. + m.mapPhysical(physical, length) + virtual += length + } + }) + + // Initialize architecture state. + if err := m.initArchState(); err != nil { + m.Destroy() + return nil, err + } + + // Ensure the machine is cleaned up properly. + runtime.SetFinalizer(m, (*machine).Destroy) + return m, nil +} + +// mapPhysical checks for the mapping of a physical range, and installs one if +// not available. This attempts to be efficient for calls in the hot path. +// +// This panics on error. +func (m *machine) mapPhysical(physical, length uintptr) { + for end := physical + length; physical < end; { + _, physicalStart, length, ok := calculateBluepillFault(physical) + if !ok { + // Should never happen. + panic("mapPhysical on unknown physical address") + } + + if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok { + // Not present in the cache; requires setting the slot. + if _, ok := handleBluepillFault(m, physical); !ok { + panic("handleBluepillFault failed") + } + } + + // Move to the next chunk. 
+ physical = physicalStart + length + } +} + +// Destroy frees associated resources. +// +// Destroy should only be called once all active users of the machine are gone. +// The machine object should not be used after calling Destroy. +// +// Precondition: all vCPUs must be returned to the machine. +func (m *machine) Destroy() { + runtime.SetFinalizer(m, nil) + + // Destroy vCPUs. + for _, c := range m.vCPUs { + // Ensure the vCPU is not still running in guest mode. This is + // possible iff teardown has been done by other threads, and + // somehow a single thread has not executed any system calls. + c.BounceToHost() + + // Note that the runData may not be mapped if an error occurs + // during the middle of initialization. + if c.runData != nil { + if err := unmapRunData(c.runData); err != nil { + panic(fmt.Sprintf("error unmapping rundata: %v", err)) + } + } + if err := syscall.Close(int(c.fd)); err != nil { + panic(fmt.Sprintf("error closing vCPU fd: %v", err)) + } + } + + // vCPUs are gone: teardown machine state. + if err := syscall.Close(m.fd); err != nil { + panic(fmt.Sprintf("error closing VM fd: %v", err)) + } +} + +// Get gets an available vCPU. +func (m *machine) Get() *vCPU { + runtime.LockOSThread() + tid := procid.Current() + m.mu.RLock() + + // Check for an exact match. + if c := m.vCPUs[tid]; c != nil { + c.lock() + m.mu.RUnlock() + return c + } + + // The happy path failed. We now proceed to acquire an exclusive lock + // (because the vCPU map may change), and scan all available vCPUs. + m.mu.RUnlock() + m.mu.Lock() + + for { + // Scan for an available vCPU. + for origTID, c := range m.vCPUs { + if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) { + delete(m.vCPUs, origTID) + m.vCPUs[tid] = c + m.mu.Unlock() + c.loadSegments(tid) + return c + } + } + + // Create a new vCPU (maybe). 
+ if len(m.vCPUs) < m.maxVCPUs { + c := m.newVCPU() + c.lock() + m.vCPUs[tid] = c + m.mu.Unlock() + c.loadSegments(tid) + return c + } + + // Scan for something not in user mode. + for origTID, c := range m.vCPUs { + if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) { + continue + } + + // The vCPU is not be able to transition to + // vCPUGuest|vCPUUser or to vCPUUser because that + // transition requires holding the machine mutex, as we + // do now. There is no path to register a waiter on + // just the vCPUReady state. + for { + c.waitUntilNot(vCPUGuest | vCPUWaiter) + if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) { + break + } + } + + // Steal the vCPU. + delete(m.vCPUs, origTID) + m.vCPUs[tid] = c + m.mu.Unlock() + c.loadSegments(tid) + return c + } + + // Everything is executing in user mode. Wait until something + // is available. Note that signaling the condition variable + // will have the extra effect of kicking the vCPUs out of guest + // mode if that's where they were. + m.available.Wait() + } +} + +// Put puts the current vCPU. +func (m *machine) Put(c *vCPU) { + c.unlock() + runtime.UnlockOSThread() + m.available.Signal() +} + +// newDirtySet returns a new dirty set. +func (m *machine) newDirtySet() *dirtySet { + return &dirtySet{ + vCPUs: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64), + } +} + +// lock marks the vCPU as in user mode. +// +// This should only be called directly when known to be safe, i.e. when +// the vCPU is owned by the current TID with no chance of theft. +// +//go:nosplit +func (c *vCPU) lock() { + atomicbitops.OrUint32(&c.state, vCPUUser) +} + +// unlock clears the vCPUUser bit. +// +//go:nosplit +func (c *vCPU) unlock() { + if atomic.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) { + // Happy path: no exits are forced, and we can continue + // executing on our merry way with a single atomic access. + return + } + + // Clear the lock. 
+ origState := atomic.LoadUint32(&c.state) + atomicbitops.AndUint32(&c.state, ^vCPUUser) + switch origState { + case vCPUUser: + // Normal state. + case vCPUUser | vCPUGuest | vCPUWaiter: + // Force a transition: this must trigger a notification when we + // return from guest mode. + c.notify() + case vCPUUser | vCPUWaiter: + // Waiting for the lock to be released; the responsibility is + // on us to notify the waiter and clear the associated bit. + atomicbitops.AndUint32(&c.state, ^vCPUWaiter) + c.notify() + default: + panic("invalid state") + } +} + +// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. +// +//go:nosplit +func (c *vCPU) NotifyInterrupt() { + c.BounceToKernel() +} + +// pid is used below in bounce. +var pid = syscall.Getpid() + +// bounce forces a return to the kernel or to host mode. +// +// This effectively unwinds the state machine. +func (c *vCPU) bounce(forceGuestExit bool) { + for { + switch state := atomic.LoadUint32(&c.state); state { + case vCPUReady, vCPUWaiter: + // There is nothing to be done, we're already in the + // kernel pre-acquisition. The Bounce criteria have + // been satisfied. + return + case vCPUUser: + // We need to register a waiter for the actual guest + // transition. When the transition takes place, then we + // can inject an interrupt to ensure a return to host + // mode. + atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter) + case vCPUUser | vCPUWaiter: + // Wait for the transition to guest mode. This should + // come from the bluepill handler. + c.waitUntilNot(state) + case vCPUGuest, vCPUUser | vCPUGuest: + if state == vCPUGuest && !forceGuestExit { + // The vCPU is already not acquired, so there's + // no need to do a fresh injection here. + return + } + // The vCPU is in user or kernel mode. Attempt to + // register a notification on change. + if !atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter) { + break // Retry. 
+ } + for { + // We need to spin here until the signal is + // delivered, because Tgkill can return EAGAIN + // under memory pressure. Since we already + // marked ourselves as a waiter, we need to + // ensure that a signal is actually delivered. + if err := syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal); err == nil { + break + } else if err.(syscall.Errno) == syscall.EAGAIN { + continue + } else { + // Nothing else should be returned by tgkill. + panic(fmt.Sprintf("unexpected tgkill error: %v", err)) + } + } + case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter: + if state == vCPUGuest|vCPUWaiter && !forceGuestExit { + // See above. + return + } + // Wait for the transition. This again should happen + // from the bluepill handler, but on the way out. + c.waitUntilNot(state) + default: + // Should not happen: the above is exhaustive. + panic("invalid state") + } + } +} + +// BounceToKernel ensures that the vCPU bounces back to the kernel. +// +//go:nosplit +func (c *vCPU) BounceToKernel() { + c.bounce(false) +} + +// BounceToHost ensures that the vCPU is in host mode. +// +//go:nosplit +func (c *vCPU) BounceToHost() { + c.bounce(true) +} |