author     Googler <noreply@google.com>             2018-04-27 10:37:02 -0700
committer  Adin Scannell <ascannell@google.com>     2018-04-28 01:44:26 -0400
commit     d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree       54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/platform/kvm/machine.go
parent     f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
Check in gVisor.
PiperOrigin-RevId: 194583126
Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/sentry/platform/kvm/machine.go')
-rw-r--r--  pkg/sentry/platform/kvm/machine.go  412
1 file changed, 412 insertions, 0 deletions
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
new file mode 100644
index 000000000..a5be0cee3
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -0,0 +1,412 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+    "fmt"
+    "runtime"
+    "sync"
+    "sync/atomic"
+    "syscall"
+
+    "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
+    "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+    "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+    "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+    "gvisor.googlesource.com/gvisor/pkg/tmutex"
+)
+
+// machine contains state associated with the VM as a whole.
+type machine struct {
+    // fd is the vm fd.
+    fd int
+
+    // nextSlot is the next slot for setMemoryRegion.
+    //
+    // This must be accessed atomically. If nextSlot is ^uint32(0), then
+    // slots are currently being updated, and the caller should retry.
+    nextSlot uint32
+
+    // kernel is the set of global structures.
+    kernel *ring0.Kernel
+
+    // mappingCache is used for mapPhysical.
+    mappingCache sync.Map
+
+    // mu protects vCPUs.
+    mu sync.Mutex
+
+    // vCPUs are the machine vCPUs.
+    //
+    // This is eventually keyed by system TID, but is initially indexed by
+    // the negative vCPU id. This is merely an optimization, so while
+    // collisions here are not possible, it wouldn't matter anyways.
+    vCPUs map[uint64]*vCPU
+}
+
+const (
+    // vCPUReady is the lock value for an available vCPU.
+    //
+    // Legal transitions: vCPUGuest (bluepill).
+    vCPUReady uintptr = iota
+
+    // vCPUGuest indicates the vCPU is in guest mode.
+    //
+    // Legal transition: vCPUReady (bluepill), vCPUWaiter (wait).
+    vCPUGuest
+
+    // vCPUWaiter indicates that the vCPU should be released.
+    //
+    // Legal transition: vCPUReady (bluepill).
+    vCPUWaiter
+)
+
+// vCPU is a single KVM vCPU.
+type vCPU struct {
+    // CPU is the kernel CPU data.
+    //
+    // This must be the first element of this structure, it is referenced
+    // by the bluepill code (see bluepill_amd64.s).
+    ring0.CPU
+
+    // fd is the vCPU fd.
+    fd int
+
+    // tid is the last set tid.
+    tid uint64
+
+    // switches is a count of world switches (informational only).
+    switches uint32
+
+    // faults is a count of world faults (informational only).
+    faults uint32
+
+    // state is the vCPU state; all are described above.
+    state uintptr
+
+    // runData for this vCPU.
+    runData *runData
+
+    // machine associated with this vCPU.
+    machine *machine
+
+    // mu applies across get/put; it does not protect the above.
+    mu tmutex.Mutex
+}
+
+// newMachine returns a new VM context.
+func newMachine(vm int, vCPUs int) (*machine, error) {
+    // Create the machine.
+    m := &machine{
+        fd:    vm,
+        vCPUs: make(map[uint64]*vCPU),
+    }
+    if vCPUs > _KVM_NR_VCPUS {
+        // Hard cap at KVM's limit.
+        vCPUs = _KVM_NR_VCPUS
+    }
+    if n := 2 * runtime.NumCPU(); vCPUs > n {
+        // Cap at twice the number of physical cores. Otherwise we're
+        // just wasting memory and thrashing. (There may be scheduling
+        // issues when you've got > n active threads.)
+        vCPUs = n
+    }
+    m.kernel = ring0.New(ring0.KernelOpts{
+        PageTables: pagetables.New(m, pagetablesOpts),
+    })
+
+    // Initialize architecture state.
+    if err := m.initArchState(vCPUs); err != nil {
+        m.Destroy()
+        return nil, err
+    }
+
+    // Create all the vCPUs.
+    for id := 0; id < vCPUs; id++ {
+        // Create the vCPU.
+        fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(vm), _KVM_CREATE_VCPU, uintptr(id))
+        if errno != 0 {
+            m.Destroy()
+            return nil, fmt.Errorf("error creating VCPU: %v", errno)
+        }
+        c := &vCPU{
+            fd:      int(fd),
+            machine: m,
+        }
+        c.mu.Init()
+        c.CPU.Init(m.kernel)
+        c.CPU.KernelSyscall = bluepillSyscall
+        c.CPU.KernelException = bluepillException
+        m.vCPUs[uint64(-id)] = c // See above.
+
+        // Ensure the signal mask is correct.
+        if err := c.setSignalMask(); err != nil {
+            m.Destroy()
+            return nil, err
+        }
+
+        // Initialize architecture state.
+        if err := c.initArchState(); err != nil {
+            m.Destroy()
+            return nil, err
+        }
+
+        // Map the run data.
+        runData, err := mapRunData(int(fd))
+        if err != nil {
+            m.Destroy()
+            return nil, err
+        }
+        c.runData = runData
+    }
+
+    // Apply the physical mappings. Note that these mappings may point to
+    // guest physical addresses that are not actually available. These
+    // physical pages are mapped on demand, see kernel_unsafe.go.
+    applyPhysicalRegions(func(pr physicalRegion) bool {
+        // Map everything in the lower half.
+        m.kernel.PageTables.Map(usermem.Addr(pr.virtual), pr.length, false /* kernel */, usermem.AnyAccess, pr.physical)
+        // And keep everything in the upper half.
+        kernelAddr := usermem.Addr(ring0.KernelStartAddress | pr.virtual)
+        m.kernel.PageTables.Map(kernelAddr, pr.length, false /* kernel */, usermem.AnyAccess, pr.physical)
+        return true // Keep iterating.
+    })
+
+    // Ensure that the currently mapped virtual regions are actually
+    // available in the VM. Note that this doesn't guarantee no future
+    // faults, however it should guarantee that everything is available to
+    // ensure successful vCPU entry.
+    applyVirtualRegions(func(vr virtualRegion) {
+        if excludeVirtualRegion(vr) {
+            return // skip region.
+        }
+        for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
+            physical, length, ok := TranslateToPhysical(virtual)
+            if !ok {
+                // This must be an invalid region that was
+                // knocked out by creation of the physical map.
+                return
+            }
+            if virtual+length > vr.virtual+vr.length {
+                // Cap the length to the end of the area.
+                length = vr.virtual + vr.length - virtual
+            }
+
+            // Ensure the physical range is mapped.
+            m.mapPhysical(physical, length)
+            virtual += length
+        }
+    })
+
+    // Ensure the machine is cleaned up properly.
+    runtime.SetFinalizer(m, (*machine).Destroy)
+    return m, nil
+}
+
+// mapPhysical checks for the mapping of a physical range, and installs one if
+// not available. This attempts to be efficient for calls in the hot path.
+//
+// This panics on error.
+func (m *machine) mapPhysical(physical, length uintptr) {
+    for end := physical + length; physical < end; {
+        _, physicalStart, length, ok := calculateBluepillFault(m, physical)
+        if !ok {
+            // Should never happen.
+            panic("mapPhysical on unknown physical address")
+        }
+
+        if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok {
+            // Not present in the cache; requires setting the slot.
+            if _, ok := handleBluepillFault(m, physical); !ok {
+                panic("handleBluepillFault failed")
+            }
+        }
+
+        // Move to the next chunk.
+        physical = physicalStart + length
+    }
+}
+
+// Destroy frees associated resources.
+//
+// Destroy should only be called once all active users of the machine are gone.
+// The machine object should not be used after calling Destroy.
+//
+// Precondition: all vCPUs must be returned to the machine.
+func (m *machine) Destroy() {
+    runtime.SetFinalizer(m, nil)
+
+    // Destroy vCPUs.
+    for _, c := range m.vCPUs {
+        // Ensure the vCPU is not still running in guest mode. This is
+        // possible iff teardown has been done by other threads, and
+        // somehow a single thread has not executed any system calls.
+        c.wait()
+
+        // Teardown the vCPU itself.
+        switch state := c.State(); state {
+        case vCPUReady:
+            // Note that the runData may not be mapped if an error
+            // occurs during the middle of initialization.
+            if c.runData != nil {
+                if err := unmapRunData(c.runData); err != nil {
+                    panic(fmt.Sprintf("error unmapping rundata: %v", err))
+                }
+            }
+            if err := syscall.Close(int(c.fd)); err != nil {
+                panic(fmt.Sprintf("error closing vCPU fd: %v", err))
+            }
+        case vCPUGuest, vCPUWaiter:
+            // Should never happen; waited above.
+            panic("vCPU disposed in guest state")
+        default:
+            // Should never happen; not a valid state.
+            panic(fmt.Sprintf("vCPU in invalid state: %v", state))
+        }
+    }
+
+    // Release host mappings.
+    if m.kernel.PageTables != nil {
+        m.kernel.PageTables.Release()
+    }
+
+    // vCPUs are gone: teardown machine state.
+    if err := syscall.Close(m.fd); err != nil {
+        panic(fmt.Sprintf("error closing VM fd: %v", err))
+    }
+}
+
+// Get gets an available vCPU.
+func (m *machine) Get() (*vCPU, error) {
+    runtime.LockOSThread()
+    tid := procid.Current()
+    m.mu.Lock()
+
+    for {
+        // Check for an exact match.
+        if c := m.vCPUs[tid]; c != nil && c.mu.TryLock() {
+            m.mu.Unlock()
+            return c, nil
+        }
+
+        // Scan for an available vCPU.
+        for origTID, c := range m.vCPUs {
+            if c.LockInState(vCPUReady) {
+                delete(m.vCPUs, origTID)
+                m.vCPUs[tid] = c
+                m.mu.Unlock()
+
+                // We need to reload thread-local segments as
+                // we have origTID != tid and the vCPU state
+                // may be stale.
+                c.loadSegments()
+                atomic.StoreUint64(&c.tid, tid)
+                return c, nil
+            }
+        }
+
+        // Everything is busy executing user code (locked).
+        //
+        // We hold the pool lock here, so we should be able to kick something
+        // out of kernel mode and have it bounce into host mode when it tries
+        // to grab the vCPU again.
+        for _, c := range m.vCPUs {
+            if c.State() != vCPUWaiter {
+                c.Bounce()
+            }
+        }
+
+        // Give other threads an opportunity to run.
+        yield()
+    }
+}
+
+// Put puts the current vCPU.
+func (m *machine) Put(c *vCPU) {
+    c.Unlock()
+    runtime.UnlockOSThread()
+}
+
+// State returns the current state.
+func (c *vCPU) State() uintptr {
+    return atomic.LoadUintptr(&c.state)
+}
+
+// Lock locks the vCPU.
+func (c *vCPU) Lock() {
+    c.mu.Lock()
+}
+
+// Invalidate invalidates caches.
+func (c *vCPU) Invalidate() {
+}
+
+// LockInState locks the vCPU if it is in the given state and TryLock succeeds.
+func (c *vCPU) LockInState(state uintptr) bool {
+    if c.State() == state && c.mu.TryLock() {
+        if c.State() != state {
+            c.mu.Unlock()
+            return false
+        }
+        return true
+    }
+    return false
+}
+
+// Unlock unlocks the given vCPU.
+func (c *vCPU) Unlock() {
+    // Ensure we're out of guest mode, if necessary.
+    if c.State() == vCPUWaiter {
+        redpill() // Force guest mode exit.
+    }
+    c.mu.Unlock()
+}
+
+// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
+func (c *vCPU) NotifyInterrupt() {
+    c.Bounce()
+}
+
+// pid is used below in bounce.
+var pid = syscall.Getpid()
+
+// Bounce ensures that the vCPU bounces back to the kernel.
+//
+// In practice, this means returning EAGAIN from running user code. The vCPU
+// will be unlocked and relock, and the kernel is guaranteed to check for
+// interrupt notifications (e.g. injected via Notify) and invalidations.
+func (c *vCPU) Bounce() {
+    for {
+        if c.mu.TryLock() {
+            // We know that the vCPU must be in the kernel already,
+            // because the lock was not acquired. We specifically
+            // don't want to call bounce in this case, because it's
+            // not necessary to knock the vCPU out of guest mode.
+            c.mu.Unlock()
+            return
+        }
+
+        if state := c.State(); state == vCPUGuest || state == vCPUWaiter {
+            // We know that the vCPU was in guest mode, so a single signal
+            // interruption will guarantee that a transition takes place.
+            syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal)
+            return
+        }
+
+        // Someone holds the lock, but the vCPU is not yet transitioned
+        // into guest mode. It's in the critical section; give it time.
+        yield()
+    }
+}
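
For context, a minimal usage sketch of the vCPU pool added by this change (illustrative only, not part of the commit): it assumes code living in the same kvm package, a hypothetical exampleRun caller and a vmFd obtained from KVM_CREATE_VM, and it elides the bluepill entry into guest mode.

    // Sketch: acquire the machine and a vCPU, then return both.
    // exampleRun and vmFd are hypothetical; newMachine, Get, Put,
    // Destroy, and Bounce are the functions added in the diff above.
    func exampleRun(vmFd int) error {
        m, err := newMachine(vmFd, runtime.NumCPU())
        if err != nil {
            return err
        }
        defer m.Destroy() // Precondition: all vCPUs returned first (Put runs before this).

        c, err := m.Get() // Pins the OS thread and locks a ready vCPU.
        if err != nil {
            return err
        }
        defer m.Put(c) // Unlocks the vCPU and unpins the OS thread.

        // Guest entry (bluepill) and user code execution would happen here;
        // another thread may call c.Bounce() (e.g. via NotifyInterrupt) to
        // force a return to host mode.
        return nil
    }

Note the design visible in Get: the pool is re-keyed by the caller's TID, so a thread that reacquires a vCPU hits the exact-match fast path; it only falls back to scanning for a ready vCPU, bouncing busy ones, and yielding when nothing is available.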