Diffstat (limited to 'pkg/sentry/platform/kvm/machine.go')
-rw-r--r-- pkg/sentry/platform/kvm/machine.go | 525
1 file changed, 525 insertions, 0 deletions
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
new file mode 100644
index 000000000..f5953b96e
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -0,0 +1,525 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+ "runtime"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/atomicbitops"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// machine contains state associated with the VM as a whole.
+type machine struct {
+ // fd is the vm fd.
+ fd int
+
+ // nextSlot is the next slot for setMemoryRegion.
+ //
+ // This must be accessed atomically. If nextSlot is ^uint32(0), then
+ // slots are currently being updated, and the caller should retry.
+ nextSlot uint32
+
+ // kernel is the set of global structures.
+ kernel ring0.Kernel
+
+ // mappingCache is used for mapPhysical.
+ mappingCache sync.Map
+
+ // mu protects vCPUs.
+ mu sync.RWMutex
+
+ // available is notified when vCPUs are available.
+ available sync.Cond
+
+ // vCPUs are the machine vCPUs.
+ //
+ // These are populated dynamically.
+ vCPUs map[uint64]*vCPU
+
+ // vCPUsByID are the machine vCPUs, indexed by the vCPU's ID.
+ vCPUsByID map[int]*vCPU
+
+ // maxVCPUs is the maximum number of vCPUs supported by the machine.
+ maxVCPUs int
+}
+
+const (
+ // vCPUReady indicates that all of the below bits are clear.
+ vCPUReady uint32 = 0
+
+ // vCPUUser indicates that the vCPU is in or about to enter user mode.
+ vCPUUser uint32 = 1 << 0
+
+ // vCPUGuest indicates the vCPU is in guest mode.
+ vCPUGuest uint32 = 1 << 1
+
+ // vCPUWaiter indicates that there is a waiter.
+ //
+ // If this is set, then notify must be called on any state transitions.
+ vCPUWaiter uint32 = 1 << 2
+)
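+
+// As an illustrative sketch of how these bits are used (not normative; see
+// Get, lock and unlock below for the authoritative transitions), a vCPU's
+// lifetime looks roughly like:
+//
+//	// Acquire: Ready -> User; only succeeds if no bits are set.
+//	atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser)
+//
+//	// While executing guest code the state is vCPUUser|vCPUGuest. A thread
+//	// that wants to steal or bounce the vCPU also sets vCPUWaiter so that
+//	// it is notified on the next state transition.
+//
+//	// Release: clear vCPUUser and, if vCPUWaiter was set, call notify().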
+
+// vCPU is a single KVM vCPU.
+type vCPU struct {
+ // CPU is the kernel CPU data.
+ //
+ // This must be the first element of this structure; it is referenced
+ // by the bluepill code (see bluepill_amd64.s).
+ ring0.CPU
+
+ // id is the vCPU id.
+ id int
+
+ // fd is the vCPU fd.
+ fd int
+
+ // tid is the last set tid.
+ tid uint64
+
+ // switches is a count of world switches (informational only).
+ switches uint32
+
+ // faults is a count of world faults (informational only).
+ faults uint32
+
+ // state is the vCPU state.
+ //
+ // This is a bitmask of the vCPU* flags described above.
+ state uint32
+
+ // runData for this vCPU.
+ runData *runData
+
+ // machine associated with this vCPU.
+ machine *machine
+
+ // active is the current addressSpace: it is set and read atomically and
+ // is used to elide unnecessary interrupts due to invalidations.
+ active atomicAddressSpace
+
+ // vCPUArchState is the architecture-specific state.
+ vCPUArchState
+
+ dieState dieState
+}
+
+type dieState struct {
+ // message is thrown from die.
+ message string
+
+ // guestRegs is used to store register state during vCPU.die() to prevent
+ // allocation inside a nosplit function.
+ guestRegs userRegs
+}
+
+// newVCPU creates and returns a new vCPU.
+//
+// Precondition: mu must be held.
+func (m *machine) newVCPU() *vCPU {
+ id := len(m.vCPUs)
+
+ // Create the vCPU.
+ fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
+ if errno != 0 {
+ panic(fmt.Sprintf("error creating new vCPU: %v", errno))
+ }
+
+ c := &vCPU{
+ id: id,
+ fd: int(fd),
+ machine: m,
+ }
+ c.CPU.Init(&m.kernel, c)
+ m.vCPUsByID[c.id] = c
+
+ // Ensure the signal mask is correct.
+ if err := c.setSignalMask(); err != nil {
+ panic(fmt.Sprintf("error setting signal mask: %v", err))
+ }
+
+ // Map the run data.
+ runData, err := mapRunData(int(fd))
+ if err != nil {
+ panic(fmt.Sprintf("error mapping run data: %v", err))
+ }
+ c.runData = runData
+
+ // Initialize architecture state.
+ if err := c.initArchState(); err != nil {
+ panic(fmt.Sprintf("error initialization vCPU state: %v", err))
+ }
+
+ return c // Done.
+}
+
+// newMachine returns a new VM context.
+func newMachine(vm int) (*machine, error) {
+ // Create the machine.
+ m := &machine{
+ fd: vm,
+ vCPUs: make(map[uint64]*vCPU),
+ vCPUsByID: make(map[int]*vCPU),
+ }
+ m.available.L = &m.mu
+ m.kernel.Init(ring0.KernelOpts{
+ PageTables: pagetables.New(newAllocator()),
+ })
+
+ maxVCPUs, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
+ if errno != 0 {
+ m.maxVCPUs = _KVM_NR_VCPUS
+ } else {
+ m.maxVCPUs = int(maxVCPUs)
+ }
+ log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
+
+ // Apply the physical mappings. Note that these mappings may point to
+ // guest physical addresses that are not actually available. These
+// physical pages are mapped on demand; see kernel_unsafe.go.
+ applyPhysicalRegions(func(pr physicalRegion) bool {
+ // Map everything in the lower half.
+ m.kernel.PageTables.Map(
+ usermem.Addr(pr.virtual),
+ pr.length,
+ pagetables.MapOpts{AccessType: usermem.AnyAccess},
+ pr.physical)
+
+ // And keep everything in the upper half.
+ m.kernel.PageTables.Map(
+ usermem.Addr(ring0.KernelStartAddress|pr.virtual),
+ pr.length,
+ pagetables.MapOpts{AccessType: usermem.AnyAccess},
+ pr.physical)
+
+ return true // Keep iterating.
+ })
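+
+ // For illustration only (hypothetical values): a physical region with
+ // pr.virtual == 0x400000 and pr.length == 0x200000 is therefore mapped
+ // twice, at 0x400000 and at ring0.KernelStartAddress|0x400000, so the
+ // same physical pages are reachable from both the lower (user) half and
+ // the upper (kernel) half of the address space.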
+
+ // Ensure that the currently mapped virtual regions are actually
+ // available in the VM. Note that this doesn't guarantee no future
+ // faults; however, it should guarantee that everything is available to
+ // ensure successful vCPU entry.
+ applyVirtualRegions(func(vr virtualRegion) {
+ if excludeVirtualRegion(vr) {
+ return // skip region.
+ }
+ for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
+ physical, length, ok := translateToPhysical(virtual)
+ if !ok {
+ // This must be an invalid region that was
+ // knocked out by creation of the physical map.
+ return
+ }
+ if virtual+length > vr.virtual+vr.length {
+ // Cap the length to the end of the area.
+ length = vr.virtual + vr.length - virtual
+ }
+
+ // Ensure the physical range is mapped.
+ m.mapPhysical(physical, length)
+ virtual += length
+ }
+ })
+
+ // Initialize architecture state.
+ if err := m.initArchState(); err != nil {
+ m.Destroy()
+ return nil, err
+ }
+
+ // Ensure the machine is cleaned up properly.
+ runtime.SetFinalizer(m, (*machine).Destroy)
+ return m, nil
+}
+
+// mapPhysical checks for the mapping of a physical range, and installs one if
+// not available. This attempts to be efficient for calls in the hot path.
+//
+// This panics on error.
+func (m *machine) mapPhysical(physical, length uintptr) {
+ for end := physical + length; physical < end; {
+ _, physicalStart, length, ok := calculateBluepillFault(physical)
+ if !ok {
+ // Should never happen.
+ panic("mapPhysical on unknown physical address")
+ }
+
+ if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok {
+ // Not present in the cache; requires setting the slot.
+ if _, ok := handleBluepillFault(m, physical); !ok {
+ panic("handleBluepillFault failed")
+ }
+ }
+
+ // Move to the next chunk.
+ physical = physicalStart + length
+ }
+}
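+
+// For example (illustrative only): two mapPhysical calls that land in the
+// same physical block hit the mappingCache on the second pass, so
+// handleBluepillFault (which sets the backing memory slot) runs at most
+// once per block.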
+
+// Destroy frees associated resources.
+//
+// Destroy should only be called once all active users of the machine are gone.
+// The machine object should not be used after calling Destroy.
+//
+// Precondition: all vCPUs must be returned to the machine.
+func (m *machine) Destroy() {
+ runtime.SetFinalizer(m, nil)
+
+ // Destroy vCPUs.
+ for _, c := range m.vCPUs {
+ // Ensure the vCPU is not still running in guest mode. This is
+ // possible iff teardown has been done by other threads, and
+ // somehow a single thread has not executed any system calls.
+ c.BounceToHost()
+
+ // Note that the runData may not be mapped if an error occurs
+ // during the middle of initialization.
+ if c.runData != nil {
+ if err := unmapRunData(c.runData); err != nil {
+ panic(fmt.Sprintf("error unmapping rundata: %v", err))
+ }
+ }
+ if err := syscall.Close(int(c.fd)); err != nil {
+ panic(fmt.Sprintf("error closing vCPU fd: %v", err))
+ }
+ }
+
+ // vCPUs are gone: teardown machine state.
+ if err := syscall.Close(m.fd); err != nil {
+ panic(fmt.Sprintf("error closing VM fd: %v", err))
+ }
+}
+
+// Get gets an available vCPU.
+func (m *machine) Get() *vCPU {
+ runtime.LockOSThread()
+ tid := procid.Current()
+ m.mu.RLock()
+
+ // Check for an exact match.
+ if c := m.vCPUs[tid]; c != nil {
+ c.lock()
+ m.mu.RUnlock()
+ return c
+ }
+
+ // The happy path failed. We now proceed to acquire an exclusive lock
+ // (because the vCPU map may change), and scan all available vCPUs.
+ m.mu.RUnlock()
+ m.mu.Lock()
+
+ for {
+ // Scan for an available vCPU.
+ for origTID, c := range m.vCPUs {
+ if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
+ delete(m.vCPUs, origTID)
+ m.vCPUs[tid] = c
+ m.mu.Unlock()
+ c.loadSegments(tid)
+ return c
+ }
+ }
+
+ // Create a new vCPU (maybe).
+ if len(m.vCPUs) < m.maxVCPUs {
+ c := m.newVCPU()
+ c.lock()
+ m.vCPUs[tid] = c
+ m.mu.Unlock()
+ c.loadSegments(tid)
+ return c
+ }
+
+ // Scan for something not in user mode.
+ for origTID, c := range m.vCPUs {
+ if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) {
+ continue
+ }
+
+ // The vCPU will not be able to transition to
+ // vCPUGuest|vCPUUser or to vCPUUser because that
+ // transition requires holding the machine mutex, as we
+ // do now. There is no path to register a waiter on
+ // just the vCPUReady state.
+ for {
+ c.waitUntilNot(vCPUGuest | vCPUWaiter)
+ if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
+ break
+ }
+ }
+
+ // Steal the vCPU.
+ delete(m.vCPUs, origTID)
+ m.vCPUs[tid] = c
+ m.mu.Unlock()
+ c.loadSegments(tid)
+ return c
+ }
+
+ // Everything is executing in user mode. Wait until something
+ // is available. Note that signaling the condition variable
+ // will have the extra effect of kicking the vCPUs out of guest
+ // mode if that's where they were.
+ m.available.Wait()
+ }
+}
+
+// Put puts the current vCPU.
+func (m *machine) Put(c *vCPU) {
+ c.unlock()
+ runtime.UnlockOSThread()
+ m.available.Signal()
+}
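+
+// A minimal usage sketch for Get/Put (illustrative only; real callers live
+// elsewhere in this package and enter the guest via the bluepill handler,
+// see bluepill_amd64.s):
+//
+//	c := m.Get()   // locks the OS thread and acquires a vCPU for this TID
+//	defer m.Put(c) // marks the vCPU available again and wakes waiters
+//	// ... run code on c, e.g. switch into the guest and back ...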
+
+// newDirtySet returns a new dirty set.
+func (m *machine) newDirtySet() *dirtySet {
+ return &dirtySet{
+ vCPUs: make([]uint64, (m.maxVCPUs+63)/64),
+ }
+}
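+
+// As a worked example of the sizing above (illustrative only): with
+// maxVCPUs == 65, (65+63)/64 == 2, so the set holds two uint64 words. The
+// per-vCPU bit layout (word id/64, bit id%64) is assumed here; it is
+// defined where the dirty set is actually marked, not in this file.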
+
+// lock marks the vCPU as in user mode.
+//
+// This should only be called directly when known to be safe, i.e. when
+// the vCPU is owned by the current TID with no chance of theft.
+//
+//go:nosplit
+func (c *vCPU) lock() {
+ atomicbitops.OrUint32(&c.state, vCPUUser)
+}
+
+// unlock clears the vCPUUser bit.
+//
+//go:nosplit
+func (c *vCPU) unlock() {
+ if atomic.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) {
+ // Happy path: no exits are forced, and we can continue
+ // executing on our merry way with a single atomic access.
+ return
+ }
+
+ // Clear the lock.
+ origState := atomic.LoadUint32(&c.state)
+ atomicbitops.AndUint32(&c.state, ^vCPUUser)
+ switch origState {
+ case vCPUUser:
+ // Normal state.
+ case vCPUUser | vCPUGuest | vCPUWaiter:
+ // Force a transition: this must trigger a notification when we
+ // return from guest mode.
+ c.notify()
+ case vCPUUser | vCPUWaiter:
+ // Waiting for the lock to be released; the responsibility is
+ // on us to notify the waiter and clear the associated bit.
+ atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
+ c.notify()
+ default:
+ panic("invalid state")
+ }
+}
+
+// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
+//
+//go:nosplit
+func (c *vCPU) NotifyInterrupt() {
+ c.BounceToKernel()
+}
+
+// pid is used below in bounce.
+var pid = syscall.Getpid()
+
+// bounce forces a return to the kernel or to host mode.
+//
+// This effectively unwinds the state machine.
+func (c *vCPU) bounce(forceGuestExit bool) {
+ for {
+ switch state := atomic.LoadUint32(&c.state); state {
+ case vCPUReady, vCPUWaiter:
+ // There is nothing to be done; we're already in the
+ // kernel pre-acquisition. The Bounce criteria have
+ // been satisfied.
+ return
+ case vCPUUser:
+ // We need to register a waiter for the actual guest
+ // transition. When the transition takes place, then we
+ // can inject an interrupt to ensure a return to host
+ // mode.
+ atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter)
+ case vCPUUser | vCPUWaiter:
+ // Wait for the transition to guest mode. This should
+ // come from the bluepill handler.
+ c.waitUntilNot(state)
+ case vCPUGuest, vCPUUser | vCPUGuest:
+ if state == vCPUGuest && !forceGuestExit {
+ // The vCPU is already not acquired, so there's
+ // no need to do a fresh injection here.
+ return
+ }
+ // The vCPU is in user or kernel mode. Attempt to
+ // register a notification on change.
+ if !atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter) {
+ break // Retry.
+ }
+ for {
+ // We need to spin here until the signal is
+ // delivered, because Tgkill can return EAGAIN
+ // under memory pressure. Since we already
+ // marked ourselves as a waiter, we need to
+ // ensure that a signal is actually delivered.
+ if err := syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal); err == nil {
+ break
+ } else if err.(syscall.Errno) == syscall.EAGAIN {
+ continue
+ } else {
+ // Nothing else should be returned by tgkill.
+ panic(fmt.Sprintf("unexpected tgkill error: %v", err))
+ }
+ }
+ case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter:
+ if state == vCPUGuest|vCPUWaiter && !forceGuestExit {
+ // See above.
+ return
+ }
+ // Wait for the transition. This again should happen
+ // from the bluepill handler, but on the way out.
+ c.waitUntilNot(state)
+ default:
+ // Should not happen: the above is exhaustive.
+ panic("invalid state")
+ }
+ }
+}
+
+// BounceToKernel ensures that the vCPU bounces back to the kernel.
+//
+//go:nosplit
+func (c *vCPU) BounceToKernel() {
+ c.bounce(false)
+}
+
+// BounceToHost ensures that the vCPU is in host mode.
+//
+//go:nosplit
+func (c *vCPU) BounceToHost() {
+ c.bounce(true)
+}