path: root/pkg/sentry/platform/kvm
author    gVisor bot <gvisor-bot@google.com> 2019-06-02 06:44:55 +0000
committer gVisor bot <gvisor-bot@google.com> 2019-06-02 06:44:55 +0000
commit    ceb0d792f328d1fc0692197d8856a43c3936a571 (patch)
tree      83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/platform/kvm
parent    deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff)
parent    216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff)
Merge 216da0b7 (automated)
Diffstat (limited to 'pkg/sentry/platform/kvm')
-rw-r--r-- pkg/sentry/platform/kvm/address_space.go          | 234
-rw-r--r-- pkg/sentry/platform/kvm/allocator.go              |  76
-rw-r--r-- pkg/sentry/platform/kvm/bluepill.go               |  82
-rw-r--r-- pkg/sentry/platform/kvm/bluepill_amd64.go         | 141
-rw-r--r-- pkg/sentry/platform/kvm/bluepill_amd64.s          |  93
-rw-r--r-- pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go  |  56
-rw-r--r-- pkg/sentry/platform/kvm/bluepill_fault.go         | 127
-rw-r--r-- pkg/sentry/platform/kvm/bluepill_unsafe.go        | 213
-rw-r--r-- pkg/sentry/platform/kvm/context.go                |  87
-rw-r--r-- pkg/sentry/platform/kvm/kvm.go                    | 143
-rw-r--r-- pkg/sentry/platform/kvm/kvm_amd64.go              | 213
-rw-r--r-- pkg/sentry/platform/kvm/kvm_amd64_unsafe.go       |  77
-rw-r--r-- pkg/sentry/platform/kvm/kvm_const.go              |  64
-rwxr-xr-x pkg/sentry/platform/kvm/kvm_state_autogen.go      |   4
-rw-r--r-- pkg/sentry/platform/kvm/machine.go                | 525
-rw-r--r-- pkg/sentry/platform/kvm/machine_amd64.go          | 357
-rw-r--r-- pkg/sentry/platform/kvm/machine_amd64_unsafe.go   | 161
-rw-r--r-- pkg/sentry/platform/kvm/machine_unsafe.go         | 160
-rw-r--r-- pkg/sentry/platform/kvm/physical_map.go           | 224
-rw-r--r-- pkg/sentry/platform/kvm/virtual_map.go            | 113
20 files changed, 3150 insertions, 0 deletions
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
new file mode 100644
index 000000000..689122175
--- /dev/null
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -0,0 +1,234 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "sync"
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/atomicbitops"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// dirtySet tracks vCPUs for invalidation.
+type dirtySet struct {
+ vCPUs []uint64
+}
+
+// forEach iterates over all CPUs in the dirty set.
+func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
+ m.mu.RLock()
+ defer m.mu.RUnlock()
+
+ for index := range ds.vCPUs {
+ mask := atomic.SwapUint64(&ds.vCPUs[index], 0)
+ if mask != 0 {
+ for bit := 0; bit < 64; bit++ {
+ if mask&(1<<uint64(bit)) == 0 {
+ continue
+ }
+ id := 64*index + bit
+ fn(m.vCPUsByID[id])
+ }
+ }
+ }
+}
+
+// mark marks the given vCPU as dirty and returns whether it was previously
+// clean. Being previously clean implies that a flush is needed on entry.
+func (ds *dirtySet) mark(c *vCPU) bool {
+ index := uint64(c.id) / 64
+ bit := uint64(1) << uint(c.id%64)
+
+ oldValue := atomic.LoadUint64(&ds.vCPUs[index])
+ if oldValue&bit != 0 {
+ return false // Not clean.
+ }
+
+ // Set the bit unilaterally, and ensure that a flush takes place. Note
+ // that it's possible for races to occur here, but since the flush is
+ // taking place long after these lines there's no race in practice.
+ atomicbitops.OrUint64(&ds.vCPUs[index], bit)
+ return true // Previously clean.
+}
+
+// addressSpace is a wrapper for PageTables.
+type addressSpace struct {
+ platform.NoAddressSpaceIO
+
+ // mu is the lock for modifications to the address space.
+ //
+ // Note that the page tables themselves are not locked.
+ mu sync.Mutex
+
+ // machine is the underlying machine.
+ machine *machine
+
+ // pageTables are for this particular address space.
+ pageTables *pagetables.PageTables
+
+ // dirtySet is the set of dirty vCPUs.
+ dirtySet *dirtySet
+}
+
+// invalidate is the implementation for Invalidate.
+func (as *addressSpace) invalidate() {
+ as.dirtySet.forEach(as.machine, func(c *vCPU) {
+ if c.active.get() == as { // If this happens to be active,
+ c.BounceToKernel() // ... force a kernel transition.
+ }
+ })
+}
+
+// Invalidate interrupts all dirty contexts.
+func (as *addressSpace) Invalidate() {
+ as.mu.Lock()
+ defer as.mu.Unlock()
+ as.invalidate()
+}
+
+// Touch adds the given vCPU to the dirty list.
+//
+// The return value indicates whether a flush is required.
+func (as *addressSpace) Touch(c *vCPU) bool {
+ return as.dirtySet.mark(c)
+}
+
+type hostMapEntry struct {
+ addr uintptr
+ length uintptr
+}
+
+func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
+ for m.length > 0 {
+ physical, length, ok := translateToPhysical(m.addr)
+ if !ok {
+ panic("unable to translate segment")
+ }
+ if length > m.length {
+ length = m.length
+ }
+
+ // Ensure that this map has physical mappings. If the page does
+ // not have physical mappings, the KVM module may inject
+ // spurious exceptions when emulation fails (i.e. it tries to
+ // emulate because the RIP is pointed at those pages).
+ as.machine.mapPhysical(physical, length)
+
+ // Install the page table mappings. Note that the ordering is
+ // important; if the pagetable mappings were installed before
+ // ensuring the physical pages were available, then some other
+ // thread could theoretically access them.
+ //
+ // Due to the way KVM's shadow paging implementation works,
+ // modifications to the page tables while in host mode may not
+ // be trapped, leading to the shadow pages being out of sync.
+ // Therefore, we need to ensure that we are in guest mode for
+ // page table modifications. See the call to bluepill, below.
+ as.machine.retryInGuest(func() {
+ inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
+ AccessType: at,
+ User: true,
+ }, physical) || inv
+ })
+ m.addr += length
+ m.length -= length
+ addr += usermem.Addr(length)
+ }
+
+ return inv
+}
+
+// MapFile implements platform.AddressSpace.MapFile.
+func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error {
+ as.mu.Lock()
+ defer as.mu.Unlock()
+
+ // Get mappings in the sentry's address space, which are guaranteed to be
+ // valid as long as a reference is held on the mapped pages (which is in
+ // turn required by AddressSpace.MapFile precondition).
+ //
+ // If precommit is true, we will touch mappings to commit them, so ensure
+ // that mappings are readable from sentry context.
+ //
+ // We don't execute from application file-mapped memory, and guest page
+ // tables don't care if we have execute permission (but they do need pages
+ // to be readable).
+ bs, err := f.MapInternal(fr, usermem.AccessType{
+ Read: at.Read || at.Execute || precommit,
+ Write: at.Write,
+ })
+ if err != nil {
+ return err
+ }
+
+ // Map the mappings in the sentry's address space (guest physical memory)
+ // into the application's address space (guest virtual memory).
+ inv := false
+ for !bs.IsEmpty() {
+ b := bs.Head()
+ bs = bs.Tail()
+ // Since fr was page-aligned, b should also be page-aligned. We do the
+ // lookup in our host page tables for this translation.
+ if precommit {
+ s := b.ToSlice()
+ for i := 0; i < len(s); i += usermem.PageSize {
+ _ = s[i] // Touch to commit.
+ }
+ }
+ prev := as.mapHost(addr, hostMapEntry{
+ addr: b.Addr(),
+ length: uintptr(b.Len()),
+ }, at)
+ inv = inv || prev
+ addr += usermem.Addr(b.Len())
+ }
+ if inv {
+ as.invalidate()
+ }
+
+ return nil
+}
+
+// Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
+func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) {
+ as.mu.Lock()
+ defer as.mu.Unlock()
+
+ // See above re: retryInGuest.
+ var prev bool
+ as.machine.retryInGuest(func() {
+ prev = as.pageTables.Unmap(addr, uintptr(length)) || prev
+ })
+ if prev {
+ as.invalidate()
+
+ // Recycle any freed intermediate pages.
+ as.pageTables.Allocator.Recycle()
+ }
+}
+
+// Release releases the page tables.
+func (as *addressSpace) Release() {
+ as.Unmap(0, ^uint64(0))
+
+ // Free all pages from the allocator.
+ as.pageTables.Allocator.(allocator).base.Drain()
+
+ // Drop all cached machine references.
+ as.machine.dropPageTables(as.pageTables)
+}
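Aside: the dirtySet above packs one bit per vCPU into uint64 words; mark sets bit id%64 of word id/64, and forEach atomically swaps each word to zero before walking its set bits. The standalone sketch below reproduces that bookkeeping with only the standard library (a CAS loop stands in for gVisor's atomicbitops.OrUint64); bitSet, mark and drain are illustrative names, not part of this package.

package main

import (
	"fmt"
	"sync/atomic"
)

// bitSet tracks small integer IDs, one bit per ID, packed into uint64 words.
type bitSet struct {
	words []uint64
}

// mark sets the bit for id and reports whether it was previously clear.
func (s *bitSet) mark(id int) bool {
	word, bit := id/64, uint64(1)<<uint(id%64)
	for {
		old := atomic.LoadUint64(&s.words[word])
		if old&bit != 0 {
			return false // Already marked.
		}
		if atomic.CompareAndSwapUint64(&s.words[word], old, old|bit) {
			return true // Previously clear.
		}
	}
}

// drain clears the set and calls fn for every ID that was marked.
func (s *bitSet) drain(fn func(id int)) {
	for w := range s.words {
		mask := atomic.SwapUint64(&s.words[w], 0)
		for bit := 0; bit < 64; bit++ {
			if mask&(1<<uint(bit)) != 0 {
				fn(64*w + bit)
			}
		}
	}
}

func main() {
	s := &bitSet{words: make([]uint64, 2)}        // Room for 128 IDs.
	fmt.Println(s.mark(3), s.mark(3), s.mark(70)) // true false true
	s.drain(func(id int) { fmt.Println("dirty:", id) })
}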
diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/allocator.go
new file mode 100644
index 000000000..42bcc9733
--- /dev/null
+++ b/pkg/sentry/platform/kvm/allocator.go
@@ -0,0 +1,76 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+)
+
+type allocator struct {
+ base *pagetables.RuntimeAllocator
+}
+
+// newAllocator returns a new allocator.
+func newAllocator() allocator {
+ return allocator{
+ base: pagetables.NewRuntimeAllocator(),
+ }
+}
+
+// NewPTEs implements pagetables.Allocator.NewPTEs.
+//
+//go:nosplit
+func (a allocator) NewPTEs() *pagetables.PTEs {
+ return a.base.NewPTEs()
+}
+
+// PhysicalFor returns the physical address for a set of PTEs.
+//
+//go:nosplit
+func (a allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr {
+ virtual := a.base.PhysicalFor(ptes)
+ physical, _, ok := translateToPhysical(virtual)
+ if !ok {
+ panic(fmt.Sprintf("PhysicalFor failed for %p", ptes))
+ }
+ return physical
+}
+
+// LookupPTEs implements pagetables.Allocator.LookupPTEs.
+//
+//go:nosplit
+func (a allocator) LookupPTEs(physical uintptr) *pagetables.PTEs {
+ virtualStart, physicalStart, _, ok := calculateBluepillFault(physical)
+ if !ok {
+ panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical))
+ }
+ return a.base.LookupPTEs(virtualStart + (physical - physicalStart))
+}
+
+// FreePTEs implements pagetables.Allocator.FreePTEs.
+//
+//go:nosplit
+func (a allocator) FreePTEs(ptes *pagetables.PTEs) {
+ a.base.FreePTEs(ptes)
+}
+
+// Recycle implements pagetables.Allocator.Recycle.
+//
+//go:nosplit
+func (a allocator) Recycle() {
+ a.base.Recycle()
+}
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
new file mode 100644
index 000000000..a926e6f8b
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -0,0 +1,82 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+ "reflect"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy"
+)
+
+// bluepill enters guest mode.
+func bluepill(*vCPU)
+
+// sighandler is the signal entry point.
+func sighandler()
+
+// dieTrampoline is the assembly trampoline. This calls dieHandler.
+//
+// This uses an architecture-specific calling convention, documented in
+// dieArchSetup and the assembly implementation for dieTrampoline.
+func dieTrampoline()
+
+var (
+ // savedHandler is a pointer to the previous handler.
+ //
+ // This is called by bluepillHandler.
+ savedHandler uintptr
+
+ // dieTrampolineAddr is the address of dieTrampoline.
+ dieTrampolineAddr uintptr
+)
+
+// dieHandler is called by dieTrampoline.
+//
+//go:nosplit
+func dieHandler(c *vCPU) {
+ throw(c.dieState.message)
+}
+
+// die is called to set the vCPU up to panic.
+//
+// This loads vCPU state, and sets up a call for the trampoline.
+//
+//go:nosplit
+func (c *vCPU) die(context *arch.SignalContext64, msg string) {
+ // Save the death message, which will be thrown.
+ c.dieState.message = msg
+
+ // Reload all registers to have an accurate stack trace when we return
+ // to host mode. This means that the stack should be unwound correctly.
+ if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 {
+ throw(msg)
+ }
+
+ // Setup the trampoline.
+ dieArchSetup(c, context, &c.dieState.guestRegs)
+}
+
+func init() {
+ // Install the handler.
+ if err := safecopy.ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil {
+ panic(fmt.Sprintf("Unable to set handler for signal %d: %v", syscall.SIGSEGV, err))
+ }
+
+ // Extract the address for the trampoline.
+ dieTrampolineAddr = reflect.ValueOf(dieTrampoline).Pointer()
+}
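Aside: init above records the entry address of the assembly dieTrampoline via reflect.ValueOf(fn).Pointer(). A minimal sketch of just that technique on an ordinary Go function; target here is a placeholder, not anything from this package.

package main

import (
	"fmt"
	"reflect"
)

func target() {}

func main() {
	// reflect.Value.Pointer on a func value returns the function's entry
	// point as a raw uintptr (here just printed).
	addr := reflect.ValueOf(target).Pointer()
	fmt.Printf("entry address of target: %#x\n", addr)
}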
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go
new file mode 100644
index 000000000..c258408f9
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.go
@@ -0,0 +1,141 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+)
+
+var (
+ // bounceSignal is the signal used for bouncing KVM.
+ //
+ // We use SIGCHLD because it is not masked by the runtime, and
+ // it will be ignored properly by other parts of the kernel.
+ bounceSignal = syscall.SIGCHLD
+
+ // bounceSignalMask has only bounceSignal set.
+ bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1))
+
+ // bounce is the interrupt vector used to return to the kernel.
+ bounce = uint32(ring0.VirtualizationException)
+)
+
+// redpill on amd64 invokes a syscall with -1.
+//
+//go:nosplit
+func redpill() {
+ syscall.RawSyscall(^uintptr(0), 0, 0, 0)
+}
+
+// bluepillArchEnter is called during bluepillEnter.
+//
+//go:nosplit
+func bluepillArchEnter(context *arch.SignalContext64) *vCPU {
+ c := vCPUPtr(uintptr(context.Rax))
+ regs := c.CPU.Registers()
+ regs.R8 = context.R8
+ regs.R9 = context.R9
+ regs.R10 = context.R10
+ regs.R11 = context.R11
+ regs.R12 = context.R12
+ regs.R13 = context.R13
+ regs.R14 = context.R14
+ regs.R15 = context.R15
+ regs.Rdi = context.Rdi
+ regs.Rsi = context.Rsi
+ regs.Rbp = context.Rbp
+ regs.Rbx = context.Rbx
+ regs.Rdx = context.Rdx
+ regs.Rax = context.Rax
+ regs.Rcx = context.Rcx
+ regs.Rsp = context.Rsp
+ regs.Rip = context.Rip
+ regs.Eflags = context.Eflags
+ regs.Eflags &^= uint64(ring0.KernelFlagsClear)
+ regs.Eflags |= ring0.KernelFlagsSet
+ regs.Cs = uint64(ring0.Kcode)
+ regs.Ds = uint64(ring0.Udata)
+ regs.Es = uint64(ring0.Udata)
+ regs.Ss = uint64(ring0.Kdata)
+ return c
+}
+
+// KernelSyscall handles kernel syscalls.
+//
+//go:nosplit
+func (c *vCPU) KernelSyscall() {
+ regs := c.Registers()
+ if regs.Rax != ^uint64(0) {
+ regs.Rip -= 2 // Rewind.
+ }
+ // We only trigger a bluepill entry in the bluepill function, and can
+ // therefore be guaranteed that there is no floating point state to be
+ // loaded on resuming from halt. We only worry about saving on exit.
+ ring0.SaveFloatingPoint((*byte)(c.floatingPointState))
+ ring0.Halt()
+ ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment.
+}
+
+// KernelException handles kernel exceptions.
+//
+//go:nosplit
+func (c *vCPU) KernelException(vector ring0.Vector) {
+ regs := c.Registers()
+ if vector == ring0.Vector(bounce) {
+ // These should not interrupt kernel execution; point the Rip
+ // to zero to ensure that we get a reasonable panic when we
+ // attempt to return and a full stack trace.
+ regs.Rip = 0
+ }
+ // See above.
+ ring0.SaveFloatingPoint((*byte)(c.floatingPointState))
+ ring0.Halt()
+ ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment.
+}
+
+// bluepillArchExit is called from bluepillHandler when exiting guest mode.
+//
+//go:nosplit
+func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
+ regs := c.CPU.Registers()
+ context.R8 = regs.R8
+ context.R9 = regs.R9
+ context.R10 = regs.R10
+ context.R11 = regs.R11
+ context.R12 = regs.R12
+ context.R13 = regs.R13
+ context.R14 = regs.R14
+ context.R15 = regs.R15
+ context.Rdi = regs.Rdi
+ context.Rsi = regs.Rsi
+ context.Rbp = regs.Rbp
+ context.Rbx = regs.Rbx
+ context.Rdx = regs.Rdx
+ context.Rax = regs.Rax
+ context.Rcx = regs.Rcx
+ context.Rsp = regs.Rsp
+ context.Rip = regs.Rip
+ context.Eflags = regs.Eflags
+
+ // Set the context pointer to the saved floating point state. This is
+ // where the guest data has been serialized, the kernel will restore
+ // from this new pointer value.
+ context.Fpstate = uint64(uintptrValue((*byte)(c.floatingPointState)))
+}
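Aside: bounceSignalMask above is the sigset_t bit for bounceSignal, computed as 1 << (signum - 1) because Linux signal numbers are 1-based. A quick standalone check of that arithmetic (assuming linux/amd64, where syscall.SIGCHLD is 17, so the mask has bit 16 set):

package main

import (
	"fmt"
	"syscall"
)

func main() {
	sig := syscall.SIGCHLD
	mask := uint64(1 << (uint64(sig) - 1))
	fmt.Printf("signal %d -> mask %#x\n", sig, mask) // signal 17 -> mask 0x10000
}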
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s
new file mode 100644
index 000000000..2bc34a435
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.s
@@ -0,0 +1,93 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// VCPU_CPU is the location of the CPU in the vCPU struct.
+//
+// This is guaranteed to be zero.
+#define VCPU_CPU 0x0
+
+// CPU_SELF is the self reference in ring0's percpu.
+//
+// This is guaranteed to be zero.
+#define CPU_SELF 0x0
+
+// Context offsets.
+//
+// Only limited use of the context is done in the assembly stub below, most is
+// done in the Go handlers. However, the RIP must be examined.
+#define CONTEXT_RAX 0x90
+#define CONTEXT_RIP 0xa8
+#define CONTEXT_FP 0xe0
+
+// CLI is the literal byte for the disable interrupts instruction.
+//
+// This is checked as the source of the fault.
+#define CLI $0xfa
+
+// See bluepill.go.
+TEXT ·bluepill(SB),NOSPLIT,$0
+begin:
+ MOVQ vcpu+0(FP), AX
+ LEAQ VCPU_CPU(AX), BX
+ BYTE CLI;
+check_vcpu:
+ MOVQ CPU_SELF(GS), CX
+ CMPQ BX, CX
+ JE right_vCPU
+wrong_vcpu:
+ CALL ·redpill(SB)
+ JMP begin
+right_vCPU:
+ RET
+
+// sighandler: see bluepill.go for documentation.
+//
+// The arguments are the following:
+//
+// DI - The signal number.
+// SI - Pointer to siginfo_t structure.
+// DX - Pointer to ucontext structure.
+//
+TEXT ·sighandler(SB),NOSPLIT,$0
+ // Check if the signal is from the kernel.
+ MOVQ $0x80, CX
+ CMPL CX, 0x8(SI)
+ JNE fallback
+
+ // Check if RIP is disable interrupts.
+ MOVQ CONTEXT_RIP(DX), CX
+ CMPQ CX, $0x0
+ JE fallback
+ CMPB 0(CX), CLI
+ JNE fallback
+
+ // Call the bluepillHandler.
+ PUSHQ DX // First argument (context).
+ CALL ·bluepillHandler(SB) // Call the handler.
+ POPQ DX // Discard the argument.
+ RET
+
+fallback:
+ // Jump to the previous signal handler.
+ XORQ CX, CX
+ MOVQ ·savedHandler(SB), AX
+ JMP AX
+
+// dieTrampoline: see bluepill.go, bluepill_amd64_unsafe.go for documentation.
+TEXT ·dieTrampoline(SB),NOSPLIT,$0
+ PUSHQ BX // First argument (vCPU).
+ PUSHQ AX // Fake the old RIP as caller.
+ JMP ·dieHandler(SB)
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
new file mode 100644
index 000000000..92fde7ee0
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -0,0 +1,56 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+)
+
+// bluepillArchContext returns the arch-specific context.
+//
+//go:nosplit
+func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
+ return &((*arch.UContext64)(context).MContext)
+}
+
+// dieArchSetup initializes the state for dieTrampoline.
+//
+// The amd64 dieTrampoline requires the vCPU to be set in BX, and the last RIP
+// to be in AX. The trampoline then simulates a call to dieHandler from the
+// provided RIP.
+//
+//go:nosplit
+func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
+ // If the vCPU is in user mode, we set the stack to the stored stack
+ // value in the vCPU itself. We don't want to unwind the user stack.
+ if guestRegs.RFLAGS&ring0.UserFlagsSet == ring0.UserFlagsSet {
+ regs := c.CPU.Registers()
+ context.Rax = regs.Rax
+ context.Rsp = regs.Rsp
+ context.Rbp = regs.Rbp
+ } else {
+ context.Rax = guestRegs.RIP
+ context.Rsp = guestRegs.RSP
+ context.Rbp = guestRegs.RBP
+ context.Eflags = guestRegs.RFLAGS
+ }
+ context.Rbx = uint64(uintptr(unsafe.Pointer(c)))
+ context.Rip = uint64(dieTrampolineAddr)
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go
new file mode 100644
index 000000000..3c452f5ba
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_fault.go
@@ -0,0 +1,127 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+const (
+ // faultBlockSize is the size used for servicing memory faults.
+ //
+ // This should be large enough to avoid frequent faults and avoid using
+ // all available KVM slots (~512), but small enough that KVM does not
+ // complain about slot sizes (~4GB). See handleBluepillFault for how
+ // this block is used.
+ faultBlockSize = 2 << 30
+
+ // faultBlockMask is the mask for the fault blocks.
+ //
+ // This must be typed to avoid overflow complaints (ugh).
+ faultBlockMask = ^uintptr(faultBlockSize - 1)
+)
+
+// yield yields the CPU.
+//
+//go:nosplit
+func yield() {
+ syscall.RawSyscall(syscall.SYS_SCHED_YIELD, 0, 0, 0)
+}
+
+// calculateBluepillFault calculates the fault address range.
+//
+//go:nosplit
+func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, length uintptr, ok bool) {
+ alignedPhysical := physical &^ uintptr(usermem.PageSize-1)
+ for _, pr := range physicalRegions {
+ end := pr.physical + pr.length
+ if physical < pr.physical || physical >= end {
+ continue
+ }
+
+ // Adjust the block to match our size.
+ physicalStart = alignedPhysical & faultBlockMask
+ if physicalStart < pr.physical {
+ // Bound the starting point to the start of the region.
+ physicalStart = pr.physical
+ }
+ virtualStart = pr.virtual + (physicalStart - pr.physical)
+ physicalEnd := physicalStart + faultBlockSize
+ if physicalEnd > end {
+ physicalEnd = end
+ }
+ length = physicalEnd - physicalStart
+ return virtualStart, physicalStart, length, true
+ }
+
+ return 0, 0, 0, false
+}
+
+// handleBluepillFault handles a physical fault.
+//
+// The corresponding virtual address is returned. This may throw on error.
+//
+//go:nosplit
+func handleBluepillFault(m *machine, physical uintptr) (uintptr, bool) {
+ // Paging fault: we need to map the underlying physical pages for this
+ // fault. This all has to be done in this function because we're in a
+ // signal handler context. (We can't call any functions that might
+ // split the stack.)
+ virtualStart, physicalStart, length, ok := calculateBluepillFault(physical)
+ if !ok {
+ return 0, false
+ }
+
+ // Set the KVM slot.
+ //
+ // First, we need to acquire the exclusive right to set a slot. See
+ // machine.nextSlot for information about the protocol.
+ slot := atomic.SwapUint32(&m.nextSlot, ^uint32(0))
+ for slot == ^uint32(0) {
+ yield() // Race with another call.
+ slot = atomic.SwapUint32(&m.nextSlot, ^uint32(0))
+ }
+ errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart)
+ if errno == 0 {
+ // Successfully added region; we can increment nextSlot and
+ // allow another set to proceed here.
+ atomic.StoreUint32(&m.nextSlot, slot+1)
+ return virtualStart + (physical - physicalStart), true
+ }
+
+ // Release our slot (still available).
+ atomic.StoreUint32(&m.nextSlot, slot)
+
+ switch errno {
+ case syscall.EEXIST:
+ // The region already exists. It's possible that we raced with
+ // another vCPU here. We just revert nextSlot and return true,
+ // because this must have been satisfied by some other vCPU.
+ return virtualStart + (physical - physicalStart), true
+ case syscall.EINVAL:
+ throw("set memory region failed; out of slots")
+ case syscall.ENOMEM:
+ throw("set memory region failed: out of memory")
+ case syscall.EFAULT:
+ throw("set memory region failed: invalid physical range")
+ default:
+ throw("set memory region failed: unknown reason")
+ }
+
+ panic("unreachable")
+}
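Aside: the slot-reservation protocol above serializes setMemoryRegion callers by swapping nextSlot with the sentinel ^uint32(0); losers spin (yielding) until the winner either publishes slot+1 or restores the old value. The sketch below reproduces that protocol with a stubbed install step; claimSlot is an illustrative helper, and runtime.Gosched stands in for the raw sched_yield call used above.

package main

import (
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"
)

const sentinel = ^uint32(0)

// claimSlot acquires the exclusive right to use the next slot, calls install,
// and either publishes slot+1 (success) or restores slot (failure).
func claimSlot(next *uint32, install func(slot uint32) bool) (uint32, bool) {
	slot := atomic.SwapUint32(next, sentinel)
	for slot == sentinel {
		runtime.Gosched() // Another caller holds the reservation.
		slot = atomic.SwapUint32(next, sentinel)
	}
	if install(slot) {
		atomic.StoreUint32(next, slot+1) // Publish; allow the next caller in.
		return slot, true
	}
	atomic.StoreUint32(next, slot) // Release; the slot is still available.
	return 0, false
}

func main() {
	var next uint32
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			slot, _ := claimSlot(&next, func(uint32) bool { return true })
			fmt.Println("installed region in slot", slot)
		}()
	}
	wg.Wait()
}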
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
new file mode 100644
index 000000000..7e8e9f42a
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -0,0 +1,213 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build go1.12
+// +build !go1.14
+
+// Check go:linkname function signatures when updating Go version.
+
+package kvm
+
+import (
+ "sync/atomic"
+ "syscall"
+ "unsafe"
+)
+
+//go:linkname throw runtime.throw
+func throw(string)
+
+// vCPUPtr returns a CPU for the given address.
+//
+//go:nosplit
+func vCPUPtr(addr uintptr) *vCPU {
+ return (*vCPU)(unsafe.Pointer(addr))
+}
+
+// bytePtr returns a *byte for the given address.
+//
+//go:nosplit
+func bytePtr(addr uintptr) *byte {
+ return (*byte)(unsafe.Pointer(addr))
+}
+
+// uintptrValue returns a uintptr for the given address.
+//
+//go:nosplit
+func uintptrValue(addr *byte) uintptr {
+ return (uintptr)(unsafe.Pointer(addr))
+}
+
+// bluepillHandler is called from the signal stub.
+//
+// The world may be stopped while this is executing, and it executes on the
+// signal stack. It should only execute raw system calls and functions that are
+// explicitly marked go:nosplit.
+//
+//go:nosplit
+func bluepillHandler(context unsafe.Pointer) {
+ // Sanitize the registers; interrupts must always be disabled.
+ c := bluepillArchEnter(bluepillArchContext(context))
+
+ // Increment the number of switches.
+ atomic.AddUint32(&c.switches, 1)
+
+ // Mark this as guest mode.
+ switch atomic.SwapUint32(&c.state, vCPUGuest|vCPUUser) {
+ case vCPUUser: // Expected case.
+ case vCPUUser | vCPUWaiter:
+ c.notify()
+ default:
+ throw("invalid state")
+ }
+
+ for {
+ switch _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0); errno {
+ case 0: // Expected case.
+ case syscall.EINTR:
+ // First, we process whatever pending signal
+ // interrupted KVM. Since we're in a signal handler
+ // currently, all signals are masked and the signal
+ // must have been delivered directly to this thread.
+ sig, _, errno := syscall.RawSyscall6(
+ syscall.SYS_RT_SIGTIMEDWAIT,
+ uintptr(unsafe.Pointer(&bounceSignalMask)),
+ 0, // siginfo.
+ 0, // timeout.
+ 8, // sigset size.
+ 0, 0)
+ if errno != 0 {
+ throw("error waiting for pending signal")
+ }
+ if sig != uintptr(bounceSignal) {
+ throw("unexpected signal")
+ }
+
+ // Check whether the current state of the vCPU is ready
+ // for interrupt injection. Because we don't have a
+ // PIC, we can't inject an interrupt while they are
+ // masked. We need to request a window if it's not
+ // ready.
+ if c.runData.readyForInterruptInjection == 0 {
+ c.runData.requestInterruptWindow = 1
+ continue // Rerun vCPU.
+ } else {
+ // Force injection below; the vCPU is ready.
+ c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
+ }
+ case syscall.EFAULT:
+ // If a fault cannot be serviced because the host
+ // backing pages lack the required permissions, we
+ // receive EFAULT from the run ioctl instead of an MMIO exit. We
+ // always inject an NMI here since we may be in kernel
+ // mode and have interrupts disabled.
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_NMI, 0); errno != 0 {
+ throw("NMI injection failed")
+ }
+ continue // Rerun vCPU.
+ default:
+ throw("run failed")
+ }
+
+ switch c.runData.exitReason {
+ case _KVM_EXIT_EXCEPTION:
+ c.die(bluepillArchContext(context), "exception")
+ return
+ case _KVM_EXIT_IO:
+ c.die(bluepillArchContext(context), "I/O")
+ return
+ case _KVM_EXIT_INTERNAL_ERROR:
+ // An internal error is typically thrown when emulation
+ // fails. This can occur via the MMIO path below (and
+ // it might fail because we have multiple regions that
+ // are not mapped). We would actually prefer that no
+ // emulation occur, and don't mind at all if it fails.
+ case _KVM_EXIT_HYPERCALL:
+ c.die(bluepillArchContext(context), "hypercall")
+ return
+ case _KVM_EXIT_DEBUG:
+ c.die(bluepillArchContext(context), "debug")
+ return
+ case _KVM_EXIT_HLT:
+ // Copy out registers.
+ bluepillArchExit(c, bluepillArchContext(context))
+
+ // Return to the vCPUReady state; notify any waiters.
+ user := atomic.LoadUint32(&c.state) & vCPUUser
+ switch atomic.SwapUint32(&c.state, user) {
+ case user | vCPUGuest: // Expected case.
+ case user | vCPUGuest | vCPUWaiter:
+ c.notify()
+ default:
+ throw("invalid state")
+ }
+ return
+ case _KVM_EXIT_MMIO:
+ // Increment the fault count.
+ atomic.AddUint32(&c.faults, 1)
+
+ // For MMIO, the physical address is the first data item.
+ physical := uintptr(c.runData.data[0])
+ virtual, ok := handleBluepillFault(c.machine, physical)
+ if !ok {
+ c.die(bluepillArchContext(context), "invalid physical address")
+ return
+ }
+
+ // We now need to fill in the data appropriately. KVM
+ // expects us to provide the result of the given MMIO
+ // operation in the runData struct. This is safe
+ // because, if a fault occurs here, the same fault
+ // would have occurred in guest mode. The kernel should
+ // not create invalid page table mappings.
+ data := (*[8]byte)(unsafe.Pointer(&c.runData.data[1]))
+ length := (uintptr)((uint32)(c.runData.data[2]))
+ write := (uint8)(((c.runData.data[2] >> 32) & 0xff)) != 0
+ for i := uintptr(0); i < length; i++ {
+ b := bytePtr(uintptr(virtual) + i)
+ if write {
+ // Write to the given address.
+ *b = data[i]
+ } else {
+ // Read from the given address.
+ data[i] = *b
+ }
+ }
+ case _KVM_EXIT_IRQ_WINDOW_OPEN:
+ // Interrupt: we must have requested an interrupt
+ // window; set the interrupt line.
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_INTERRUPT,
+ uintptr(unsafe.Pointer(&bounce))); errno != 0 {
+ throw("interrupt injection failed")
+ }
+ // Clear previous injection request.
+ c.runData.requestInterruptWindow = 0
+ case _KVM_EXIT_SHUTDOWN:
+ c.die(bluepillArchContext(context), "shutdown")
+ return
+ case _KVM_EXIT_FAIL_ENTRY:
+ c.die(bluepillArchContext(context), "entry failed")
+ return
+ default:
+ c.die(bluepillArchContext(context), "unknown")
+ return
+ }
+ }
+}
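Aside: for _KVM_EXIT_MMIO the handler above reads the exit details straight out of runData.data — data[0] is the guest physical address, data[1] holds the payload bytes, and data[2] packs the access length in its low 32 bits with the is-write flag at bits 32..39. A small sketch of that decoding step; decodeMMIO is an illustrative helper and the sample values are made up.

package main

import (
	"fmt"
	"unsafe"
)

// decodeMMIO unpacks the fields the bluepill handler reads out of
// runData.data for a _KVM_EXIT_MMIO exit.
func decodeMMIO(data *[32]uint64) (phys uint64, payload *[8]byte, length uint32, write bool) {
	phys = data[0]
	payload = (*[8]byte)(unsafe.Pointer(&data[1]))
	length = uint32(data[2])
	write = (data[2]>>32)&0xff != 0
	return
}

func main() {
	var data [32]uint64
	data[0] = 0xfee000b0    // Example physical address.
	data[1] = 0xdeadbeef    // Payload bytes (little-endian).
	data[2] = (1 << 32) | 4 // Write of length 4.
	phys, payload, length, write := decodeMMIO(&data)
	fmt.Printf("phys=%#x len=%d write=%v first byte=%#x\n",
		phys, length, write, payload[0])
}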
diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go
new file mode 100644
index 000000000..0eb0020f7
--- /dev/null
+++ b/pkg/sentry/platform/kvm/context.go
@@ -0,0 +1,87 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/interrupt"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// context is an implementation of the platform context.
+//
+// This is a thin wrapper around the machine.
+type context struct {
+ // machine is the parent machine, and is immutable.
+ machine *machine
+
+ // info is the arch.SignalInfo cached for this context.
+ info arch.SignalInfo
+
+ // interrupt is the interrupt context.
+ interrupt interrupt.Forwarder
+}
+
+// Switch runs the provided context in the given address space.
+func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) {
+ localAS := as.(*addressSpace)
+
+ // Grab a vCPU.
+ cpu := c.machine.Get()
+
+ // Enable interrupts (i.e. calls to vCPU.Notify).
+ if !c.interrupt.Enable(cpu) {
+ c.machine.Put(cpu) // Already preempted.
+ return nil, usermem.NoAccess, platform.ErrContextInterrupt
+ }
+
+ // Set the active address space.
+ //
+ // This must be done prior to the call to Touch below. If the address
+ // space is invalidated between this line and the call below, we will
+ // flag on entry anyways. When the active address space below is
+ // cleared, it indicates that we don't need an explicit interrupt and
+ // that the flush can occur naturally on the next user entry.
+ cpu.active.set(localAS)
+
+ // Prepare switch options.
+ switchOpts := ring0.SwitchOpts{
+ Registers: &ac.StateData().Regs,
+ FloatingPointState: (*byte)(ac.FloatingPointData()),
+ PageTables: localAS.pageTables,
+ Flush: localAS.Touch(cpu),
+ FullRestore: ac.FullRestore(),
+ }
+
+ // Take the blue pill.
+ at, err := cpu.SwitchToUser(switchOpts, &c.info)
+
+ // Clear the address space.
+ cpu.active.set(nil)
+
+ // Release resources.
+ c.machine.Put(cpu)
+
+ // All done.
+ c.interrupt.Disable()
+ return &c.info, at, err
+}
+
+// Interrupt interrupts the running context.
+func (c *context) Interrupt() {
+ c.interrupt.NotifyInterrupt()
+}
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
new file mode 100644
index 000000000..ed0521c3f
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -0,0 +1,143 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kvm provides a kvm-based implementation of the platform interface.
+package kvm
+
+import (
+ "fmt"
+ "os"
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// KVM represents a lightweight VM context.
+type KVM struct {
+ platform.NoCPUPreemptionDetection
+
+ // machine is the backing VM.
+ machine *machine
+}
+
+var (
+ globalOnce sync.Once
+ globalErr error
+)
+
+// OpenDevice opens the KVM device at /dev/kvm and returns the File.
+func OpenDevice() (*os.File, error) {
+ f, err := os.OpenFile("/dev/kvm", syscall.O_RDWR, 0)
+ if err != nil {
+ return nil, fmt.Errorf("error opening /dev/kvm: %v", err)
+ }
+ return f, nil
+}
+
+// New returns a new KVM-based implementation of the platform interface.
+func New(deviceFile *os.File) (*KVM, error) {
+ fd := deviceFile.Fd()
+
+ // Ensure global initialization is done.
+ globalOnce.Do(func() {
+ physicalInit()
+ globalErr = updateSystemValues(int(fd))
+ ring0.Init(cpuid.HostFeatureSet())
+ })
+ if globalErr != nil {
+ return nil, globalErr
+ }
+
+ // Create a new VM fd.
+ vm, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, fd, _KVM_CREATE_VM, 0)
+ if errno != 0 {
+ return nil, fmt.Errorf("creating VM: %v", errno)
+ }
+ // We are done with the device file.
+ deviceFile.Close()
+
+ // Create a VM context.
+ machine, err := newMachine(int(vm))
+ if err != nil {
+ return nil, err
+ }
+
+ // All set.
+ return &KVM{
+ machine: machine,
+ }, nil
+}
+
+// SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO.
+func (*KVM) SupportsAddressSpaceIO() bool {
+ return false
+}
+
+// CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace.
+func (*KVM) CooperativelySchedulesAddressSpace() bool {
+ return false
+}
+
+// MapUnit implements platform.Platform.MapUnit.
+func (*KVM) MapUnit() uint64 {
+ // We greedily create PTEs in MapFile, so extremely large mappings can
+ // be expensive. Not _that_ expensive since we allow super pages, but
+ // even so it can get out of hand if you're creating multi-terabyte
+ // mappings. For this reason, we limit mappings to an arbitrary 16MB.
+ return 16 << 20
+}
+
+// MinUserAddress returns the lowest available address.
+func (*KVM) MinUserAddress() usermem.Addr {
+ return usermem.PageSize
+}
+
+// MaxUserAddress returns the first address that may not be used.
+func (*KVM) MaxUserAddress() usermem.Addr {
+ return usermem.Addr(ring0.MaximumUserAddress)
+}
+
+// NewAddressSpace returns a new pagetable root.
+func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) {
+ // Allocate page tables and install system mappings.
+ pageTables := pagetables.New(newAllocator())
+ applyPhysicalRegions(func(pr physicalRegion) bool {
+ // Map the kernel in the upper half.
+ pageTables.Map(
+ usermem.Addr(ring0.KernelStartAddress|pr.virtual),
+ pr.length,
+ pagetables.MapOpts{AccessType: usermem.AnyAccess},
+ pr.physical)
+ return true // Keep iterating.
+ })
+
+ // Return the new address space.
+ return &addressSpace{
+ machine: k.machine,
+ pageTables: pageTables,
+ dirtySet: k.machine.newDirtySet(),
+ }, nil, nil
+}
+
+// NewContext returns an interruptible context.
+func (k *KVM) NewContext() platform.Context {
+ return &context{
+ machine: k.machine,
+ }
+}
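Aside: after global initialization, New above obtains a VM by opening /dev/kvm and issuing KVM_CREATE_VM on it. A standalone sketch of just those two steps (it needs a host with a usable /dev/kvm; the _KVM_CREATE_VM constant is copied from kvm_const.go further down):

package main

import (
	"fmt"
	"os"
	"syscall"
)

const _KVM_CREATE_VM = 0xae01

func main() {
	f, err := os.OpenFile("/dev/kvm", syscall.O_RDWR, 0)
	if err != nil {
		fmt.Println("opening /dev/kvm:", err)
		return
	}
	defer f.Close()

	// KVM_CREATE_VM returns a new file descriptor representing the VM.
	vm, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, f.Fd(), _KVM_CREATE_VM, 0)
	if errno != 0 {
		fmt.Println("KVM_CREATE_VM failed:", errno)
		return
	}
	defer syscall.Close(int(vm))
	fmt.Println("created VM with fd", vm)
}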
diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go
new file mode 100644
index 000000000..61493ccaf
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_amd64.go
@@ -0,0 +1,213 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+)
+
+// userMemoryRegion is a region of physical memory.
+//
+// This mirrors kvm_memory_region.
+type userMemoryRegion struct {
+ slot uint32
+ flags uint32
+ guestPhysAddr uint64
+ memorySize uint64
+ userspaceAddr uint64
+}
+
+// userRegs represents KVM user registers.
+//
+// This mirrors kvm_regs.
+type userRegs struct {
+ RAX uint64
+ RBX uint64
+ RCX uint64
+ RDX uint64
+ RSI uint64
+ RDI uint64
+ RSP uint64
+ RBP uint64
+ R8 uint64
+ R9 uint64
+ R10 uint64
+ R11 uint64
+ R12 uint64
+ R13 uint64
+ R14 uint64
+ R15 uint64
+ RIP uint64
+ RFLAGS uint64
+}
+
+// systemRegs represents KVM system registers.
+//
+// This mirrors kvm_sregs.
+type systemRegs struct {
+ CS segment
+ DS segment
+ ES segment
+ FS segment
+ GS segment
+ SS segment
+ TR segment
+ LDT segment
+ GDT descriptor
+ IDT descriptor
+ CR0 uint64
+ CR2 uint64
+ CR3 uint64
+ CR4 uint64
+ CR8 uint64
+ EFER uint64
+ apicBase uint64
+ interruptBitmap [(_KVM_NR_INTERRUPTS + 63) / 64]uint64
+}
+
+// segment is the expanded form of a segment register.
+//
+// This mirrors kvm_segment.
+type segment struct {
+ base uint64
+ limit uint32
+ selector uint16
+ typ uint8
+ present uint8
+ DPL uint8
+ DB uint8
+ S uint8
+ L uint8
+ G uint8
+ AVL uint8
+ unusable uint8
+ _ uint8
+}
+
+// Clear clears the segment and marks it unusable.
+func (s *segment) Clear() {
+ *s = segment{unusable: 1}
+}
+
+// selector is a segment selector.
+type selector uint16
+
+// tobool is a simple helper.
+func tobool(x ring0.SegmentDescriptorFlags) uint8 {
+ if x != 0 {
+ return 1
+ }
+ return 0
+}
+
+// Load loads the segment described by d into the segment s.
+//
+// The argument sel is recorded as the segment selector index.
+func (s *segment) Load(d *ring0.SegmentDescriptor, sel ring0.Selector) {
+ flag := d.Flags()
+ if flag&ring0.SegmentDescriptorPresent == 0 {
+ s.Clear()
+ return
+ }
+ s.base = uint64(d.Base())
+ s.limit = d.Limit()
+ s.typ = uint8((flag>>8)&0xF) | 1
+ s.S = tobool(flag & ring0.SegmentDescriptorSystem)
+ s.DPL = uint8(d.DPL())
+ s.present = tobool(flag & ring0.SegmentDescriptorPresent)
+ s.AVL = tobool(flag & ring0.SegmentDescriptorAVL)
+ s.L = tobool(flag & ring0.SegmentDescriptorLong)
+ s.DB = tobool(flag & ring0.SegmentDescriptorDB)
+ s.G = tobool(flag & ring0.SegmentDescriptorG)
+ if s.L != 0 {
+ s.limit = 0xffffffff
+ }
+ s.unusable = 0
+ s.selector = uint16(sel)
+}
+
+// descriptor describes a region of physical memory.
+//
+// It corresponds to the pseudo-descriptor used in the x86 LGDT and LIDT
+// instructions, and mirrors kvm_dtable.
+type descriptor struct {
+ base uint64
+ limit uint16
+ _ [3]uint16
+}
+
+// modelControlRegister is an MSR entry.
+//
+// This mirrors kvm_msr_entry.
+type modelControlRegister struct {
+ index uint32
+ _ uint32
+ data uint64
+}
+
+// modelControlRegisters is a collection of MSRs.
+//
+// This mirrors kvm_msrs.
+type modelControlRegisters struct {
+ nmsrs uint32
+ _ uint32
+ entries [16]modelControlRegister
+}
+
+// runData is the run structure. This may be mapped for synchronous register
+// access (although that doesn't appear to be supported by my kernel at least).
+//
+// This mirrors kvm_run.
+type runData struct {
+ requestInterruptWindow uint8
+ _ [7]uint8
+
+ exitReason uint32
+ readyForInterruptInjection uint8
+ ifFlag uint8
+ _ [2]uint8
+
+ cr8 uint64
+ apicBase uint64
+
+ // This is the union data for exits. Interpretation depends entirely on
+ // the exitReason above (see vCPU code for more information).
+ data [32]uint64
+}
+
+// cpuidEntry is a single CPUID entry.
+//
+// This mirrors kvm_cpuid_entry2.
+type cpuidEntry struct {
+ function uint32
+ index uint32
+ flags uint32
+ eax uint32
+ ebx uint32
+ ecx uint32
+ edx uint32
+ _ [3]uint32
+}
+
+// cpuidEntries is a collection of CPUID entries.
+//
+// This mirrors kvm_cpuid2.
+type cpuidEntries struct {
+ nr uint32
+ _ uint32
+ entries [_KVM_NR_CPUID_ENTRIES]cpuidEntry
+}
diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go
new file mode 100644
index 000000000..46c4b9113
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go
@@ -0,0 +1,77 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "fmt"
+ "syscall"
+ "unsafe"
+)
+
+var (
+ runDataSize int
+ hasGuestPCID bool
+ cpuidSupported = cpuidEntries{nr: _KVM_NR_CPUID_ENTRIES}
+)
+
+func updateSystemValues(fd int) error {
+ // Extract the mmap size.
+ sz, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_GET_VCPU_MMAP_SIZE, 0)
+ if errno != 0 {
+ return fmt.Errorf("getting VCPU mmap size: %v", errno)
+ }
+
+ // Save the data.
+ runDataSize = int(sz)
+
+ // Must do the dance to figure out the number of entries.
+ _, _, errno = syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(fd),
+ _KVM_GET_SUPPORTED_CPUID,
+ uintptr(unsafe.Pointer(&cpuidSupported)))
+ if errno != 0 && errno != syscall.ENOMEM {
+ // Some other error occurred.
+ return fmt.Errorf("getting supported CPUID: %v", errno)
+ }
+
+ // The number should now be correct.
+ _, _, errno = syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(fd),
+ _KVM_GET_SUPPORTED_CPUID,
+ uintptr(unsafe.Pointer(&cpuidSupported)))
+ if errno != 0 {
+ // Didn't work with the right number.
+ return fmt.Errorf("getting supported CPUID (2nd attempt): %v", errno)
+ }
+
+ // Calculate whether guestPCID is supported.
+ //
+ // FIXME(ascannell): These should go through the much more pleasant
+ // cpuid package interfaces, once a way to accept raw kvm CPUID entries
+ // is plumbed (or some rough equivalent).
+ for i := 0; i < int(cpuidSupported.nr); i++ {
+ entry := cpuidSupported.entries[i]
+ if entry.function == 1 && entry.index == 0 && entry.ecx&(1<<17) != 0 {
+ hasGuestPCID = true // Found matching PCID in guest feature set.
+ }
+ }
+
+ // Success.
+ return nil
+}
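Aside: the PCID probe above scans the KVM-reported CPUID entries for leaf 1 (function == 1, index == 0) and tests ECX bit 17. The same bit test on a hand-built entry, with cpuidLeaf as a stripped-down, illustrative stand-in for the kvm_cpuid_entry2 mirror:

package main

import "fmt"

// cpuidLeaf is a stripped-down stand-in for the cpuidEntry mirror above.
type cpuidLeaf struct {
	function, index, eax, ebx, ecx, edx uint32
}

// hasPCID reports whether leaf 1 advertises PCID (ECX bit 17).
func hasPCID(entries []cpuidLeaf) bool {
	for _, e := range entries {
		if e.function == 1 && e.index == 0 && e.ecx&(1<<17) != 0 {
			return true
		}
	}
	return false
}

func main() {
	entries := []cpuidLeaf{{function: 1, index: 0, ecx: 1 << 17}}
	fmt.Println("PCID supported:", hasPCID(entries))
}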
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
new file mode 100644
index 000000000..d05f05c29
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -0,0 +1,64 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+// KVM ioctls.
+//
+// Only the ioctls we need in Go appear here; some additional ioctls are used
+// within the assembly stubs (KVM_INTERRUPT, etc.).
+const (
+ _KVM_CREATE_VM = 0xae01
+ _KVM_GET_VCPU_MMAP_SIZE = 0xae04
+ _KVM_CREATE_VCPU = 0xae41
+ _KVM_SET_TSS_ADDR = 0xae47
+ _KVM_RUN = 0xae80
+ _KVM_NMI = 0xae9a
+ _KVM_CHECK_EXTENSION = 0xae03
+ _KVM_INTERRUPT = 0x4004ae86
+ _KVM_SET_MSRS = 0x4008ae89
+ _KVM_SET_USER_MEMORY_REGION = 0x4020ae46
+ _KVM_SET_REGS = 0x4090ae82
+ _KVM_SET_SREGS = 0x4138ae84
+ _KVM_GET_REGS = 0x8090ae81
+ _KVM_GET_SUPPORTED_CPUID = 0xc008ae05
+ _KVM_SET_CPUID2 = 0x4008ae90
+ _KVM_SET_SIGNAL_MASK = 0x4004ae8b
+)
+
+// KVM exit reasons.
+const (
+ _KVM_EXIT_EXCEPTION = 0x1
+ _KVM_EXIT_IO = 0x2
+ _KVM_EXIT_HYPERCALL = 0x3
+ _KVM_EXIT_DEBUG = 0x4
+ _KVM_EXIT_HLT = 0x5
+ _KVM_EXIT_MMIO = 0x6
+ _KVM_EXIT_IRQ_WINDOW_OPEN = 0x7
+ _KVM_EXIT_SHUTDOWN = 0x8
+ _KVM_EXIT_FAIL_ENTRY = 0x9
+ _KVM_EXIT_INTERNAL_ERROR = 0x11
+)
+
+// KVM capability options.
+const (
+ _KVM_CAP_MAX_VCPUS = 0x42
+)
+
+// KVM limits.
+const (
+ _KVM_NR_VCPUS = 0xff
+ _KVM_NR_INTERRUPTS = 0x100
+ _KVM_NR_CPUID_ENTRIES = 0x100
+)
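Aside: these values follow the standard Linux _IOC ioctl encoding — bits 0..7 are the command number, 8..15 the type (0xae for KVM), 16..29 the argument size, and 30..31 the direction. Decoding them doubles as a sanity check on the mirrored structs: _KVM_SET_REGS = 0x4090ae82 encodes a 144-byte (0x90) write, matching the 18 uint64 fields of userRegs above. A small decoder sketch:

package main

import "fmt"

// decodeIoctl splits a Linux ioctl number into its _IOC fields.
func decodeIoctl(cmd uint32) (dir, size, typ, nr uint32) {
	return cmd >> 30, (cmd >> 16) & 0x3fff, (cmd >> 8) & 0xff, cmd & 0xff
}

func main() {
	for name, cmd := range map[string]uint32{
		"KVM_RUN":      0xae80,
		"KVM_SET_REGS": 0x4090ae82,
		"KVM_GET_REGS": 0x8090ae81,
	} {
		dir, size, typ, nr := decodeIoctl(cmd)
		fmt.Printf("%-12s dir=%d size=%#x type=%#x nr=%#x\n", name, dir, size, typ, nr)
	}
}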
diff --git a/pkg/sentry/platform/kvm/kvm_state_autogen.go b/pkg/sentry/platform/kvm/kvm_state_autogen.go
new file mode 100755
index 000000000..5ab0e0735
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package kvm
+
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
new file mode 100644
index 000000000..f5953b96e
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -0,0 +1,525 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+ "runtime"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/atomicbitops"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// machine contains state associated with the VM as a whole.
+type machine struct {
+ // fd is the vm fd.
+ fd int
+
+ // nextSlot is the next slot for setMemoryRegion.
+ //
+ // This must be accessed atomically. If nextSlot is ^uint32(0), then
+ // slots are currently being updated, and the caller should retry.
+ nextSlot uint32
+
+ // kernel is the set of global structures.
+ kernel ring0.Kernel
+
+ // mappingCache is used for mapPhysical.
+ mappingCache sync.Map
+
+ // mu protects vCPUs.
+ mu sync.RWMutex
+
+ // available is notified when vCPUs are available.
+ available sync.Cond
+
+ // vCPUs are the machine vCPUs.
+ //
+ // These are populated dynamically.
+ vCPUs map[uint64]*vCPU
+
+ // vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID.
+ vCPUsByID map[int]*vCPU
+
+ // maxVCPUs is the maximum number of vCPUs supported by the machine.
+ maxVCPUs int
+}
+
+const (
+ // vCPUReady is an alias for all the below clear.
+ vCPUReady uint32 = 0
+
+ // vCPUUser indicates that the vCPU is in or about to enter user mode.
+ vCPUUser uint32 = 1 << 0
+
+ // vCPUGuest indicates the vCPU is in guest mode.
+ vCPUGuest uint32 = 1 << 1
+
+ // vCPUWaiter indicates that there is a waiter.
+ //
+ // If this is set, then notify must be called on any state transitions.
+ vCPUWaiter uint32 = 1 << 2
+)
+
+// vCPU is a single KVM vCPU.
+type vCPU struct {
+ // CPU is the kernel CPU data.
+ //
+ // This must be the first element of this structure, it is referenced
+ // by the bluepill code (see bluepill_amd64.s).
+ ring0.CPU
+
+ // id is the vCPU id.
+ id int
+
+ // fd is the vCPU fd.
+ fd int
+
+ // tid is the last set tid.
+ tid uint64
+
+ // switches is a count of world switches (informational only).
+ switches uint32
+
+ // faults is a count of world faults (informational only).
+ faults uint32
+
+ // state is the vCPU state.
+ //
+ // This is a bitmask of the three fields (vCPU*) described above.
+ state uint32
+
+ // runData for this vCPU.
+ runData *runData
+
+ // machine associated with this vCPU.
+ machine *machine
+
+ // active is the current addressSpace: this is set and read atomically;
+ // it is used to elide unnecessary interrupts due to invalidations.
+ active atomicAddressSpace
+
+ // vCPUArchState is the architecture-specific state.
+ vCPUArchState
+
+ dieState dieState
+}
+
+type dieState struct {
+ // message is thrown from die.
+ message string
+
+ // guestRegs is used to store register state during vCPU.die() to prevent
+ // allocation inside nosplit function.
+ guestRegs userRegs
+}
+
+// newVCPU creates and returns a new vCPU.
+//
+// Precondition: mu must be held.
+func (m *machine) newVCPU() *vCPU {
+ id := len(m.vCPUs)
+
+ // Create the vCPU.
+ fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
+ if errno != 0 {
+ panic(fmt.Sprintf("error creating new vCPU: %v", errno))
+ }
+
+ c := &vCPU{
+ id: id,
+ fd: int(fd),
+ machine: m,
+ }
+ c.CPU.Init(&m.kernel, c)
+ m.vCPUsByID[c.id] = c
+
+ // Ensure the signal mask is correct.
+ if err := c.setSignalMask(); err != nil {
+ panic(fmt.Sprintf("error setting signal mask: %v", err))
+ }
+
+ // Map the run data.
+ runData, err := mapRunData(int(fd))
+ if err != nil {
+ panic(fmt.Sprintf("error mapping run data: %v", err))
+ }
+ c.runData = runData
+
+ // Initialize architecture state.
+ if err := c.initArchState(); err != nil {
+ panic(fmt.Sprintf("error initialization vCPU state: %v", err))
+ }
+
+ return c // Done.
+}
+
+// newMachine returns a new VM context.
+func newMachine(vm int) (*machine, error) {
+ // Create the machine.
+ m := &machine{
+ fd: vm,
+ vCPUs: make(map[uint64]*vCPU),
+ vCPUsByID: make(map[int]*vCPU),
+ }
+ m.available.L = &m.mu
+ m.kernel.Init(ring0.KernelOpts{
+ PageTables: pagetables.New(newAllocator()),
+ })
+
+ maxVCPUs, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
+ if errno != 0 {
+ m.maxVCPUs = _KVM_NR_VCPUS
+ } else {
+ m.maxVCPUs = int(maxVCPUs)
+ }
+ log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
+
+ // Apply the physical mappings. Note that these mappings may point to
+ // guest physical addresses that are not actually available. These
+ // physical pages are mapped on demand, see kernel_unsafe.go.
+ applyPhysicalRegions(func(pr physicalRegion) bool {
+ // Map everything in the lower half.
+ m.kernel.PageTables.Map(
+ usermem.Addr(pr.virtual),
+ pr.length,
+ pagetables.MapOpts{AccessType: usermem.AnyAccess},
+ pr.physical)
+
+ // And keep everything in the upper half.
+ m.kernel.PageTables.Map(
+ usermem.Addr(ring0.KernelStartAddress|pr.virtual),
+ pr.length,
+ pagetables.MapOpts{AccessType: usermem.AnyAccess},
+ pr.physical)
+
+ return true // Keep iterating.
+ })
+
+ // Ensure that the currently mapped virtual regions are actually
+ // available in the VM. Note that this doesn't guarantee no future
+ // faults, however it should guarantee that everything is available to
+ // ensure successful vCPU entry.
+ applyVirtualRegions(func(vr virtualRegion) {
+ if excludeVirtualRegion(vr) {
+ return // skip region.
+ }
+ for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
+ physical, length, ok := translateToPhysical(virtual)
+ if !ok {
+ // This must be an invalid region that was
+ // knocked out by creation of the physical map.
+ return
+ }
+ if virtual+length > vr.virtual+vr.length {
+ // Cap the length to the end of the area.
+ length = vr.virtual + vr.length - virtual
+ }
+
+ // Ensure the physical range is mapped.
+ m.mapPhysical(physical, length)
+ virtual += length
+ }
+ })
+
+ // Initialize architecture state.
+ if err := m.initArchState(); err != nil {
+ m.Destroy()
+ return nil, err
+ }
+
+ // Ensure the machine is cleaned up properly.
+ runtime.SetFinalizer(m, (*machine).Destroy)
+ return m, nil
+}
+
+// mapPhysical checks for the mapping of a physical range, and installs one if
+// not available. This attempts to be efficient for calls in the hot path.
+//
+// This panics on error.
+func (m *machine) mapPhysical(physical, length uintptr) {
+ for end := physical + length; physical < end; {
+ _, physicalStart, length, ok := calculateBluepillFault(physical)
+ if !ok {
+ // Should never happen.
+ panic("mapPhysical on unknown physical address")
+ }
+
+ if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok {
+ // Not present in the cache; requires setting the slot.
+ if _, ok := handleBluepillFault(m, physical); !ok {
+ panic("handleBluepillFault failed")
+ }
+ }
+
+ // Move to the next chunk.
+ physical = physicalStart + length
+ }
+}
+
+// Destroy frees associated resources.
+//
+// Destroy should only be called once all active users of the machine are gone.
+// The machine object should not be used after calling Destroy.
+//
+// Precondition: all vCPUs must be returned to the machine.
+func (m *machine) Destroy() {
+ runtime.SetFinalizer(m, nil)
+
+ // Destroy vCPUs.
+ for _, c := range m.vCPUs {
+ // Ensure the vCPU is not still running in guest mode. This is
+ // possible iff teardown has been done by other threads, and
+ // somehow a single thread has not executed any system calls.
+ c.BounceToHost()
+
+ // Note that the runData may not be mapped if an error occurs
+ // during the middle of initialization.
+ if c.runData != nil {
+ if err := unmapRunData(c.runData); err != nil {
+ panic(fmt.Sprintf("error unmapping rundata: %v", err))
+ }
+ }
+ if err := syscall.Close(int(c.fd)); err != nil {
+ panic(fmt.Sprintf("error closing vCPU fd: %v", err))
+ }
+ }
+
+ // vCPUs are gone: teardown machine state.
+ if err := syscall.Close(m.fd); err != nil {
+ panic(fmt.Sprintf("error closing VM fd: %v", err))
+ }
+}
+
+// Get gets an available vCPU.
+func (m *machine) Get() *vCPU {
+ runtime.LockOSThread()
+ tid := procid.Current()
+ m.mu.RLock()
+
+ // Check for an exact match.
+ if c := m.vCPUs[tid]; c != nil {
+ c.lock()
+ m.mu.RUnlock()
+ return c
+ }
+
+ // The happy path failed. We now proceed to acquire an exclusive lock
+ // (because the vCPU map may change), and scan all available vCPUs.
+ m.mu.RUnlock()
+ m.mu.Lock()
+
+ for {
+ // Scan for an available vCPU.
+ for origTID, c := range m.vCPUs {
+ if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
+ delete(m.vCPUs, origTID)
+ m.vCPUs[tid] = c
+ m.mu.Unlock()
+ c.loadSegments(tid)
+ return c
+ }
+ }
+
+ // Create a new vCPU (maybe).
+ if len(m.vCPUs) < m.maxVCPUs {
+ c := m.newVCPU()
+ c.lock()
+ m.vCPUs[tid] = c
+ m.mu.Unlock()
+ c.loadSegments(tid)
+ return c
+ }
+
+ // Scan for something not in user mode.
+ for origTID, c := range m.vCPUs {
+ if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) {
+ continue
+ }
+
+ // The vCPU is not able to transition to
+ // vCPUGuest|vCPUUser or to vCPUUser because that
+ // transition requires holding the machine mutex, as we
+ // do now. There is no path to register a waiter on
+ // just the vCPUReady state.
+ for {
+ c.waitUntilNot(vCPUGuest | vCPUWaiter)
+ if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
+ break
+ }
+ }
+
+ // Steal the vCPU.
+ delete(m.vCPUs, origTID)
+ m.vCPUs[tid] = c
+ m.mu.Unlock()
+ c.loadSegments(tid)
+ return c
+ }
+
+ // Everything is executing in user mode. Wait until something
+ // is available. Note that signaling the condition variable
+ // will have the extra effect of kicking the vCPUs out of guest
+ // mode if that's where they were.
+ m.available.Wait()
+ }
+}
+
+// Put puts the current vCPU.
+func (m *machine) Put(c *vCPU) {
+ c.unlock()
+ runtime.UnlockOSThread()
+ m.available.Signal()
+}
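
A usage sketch (hypothetical helper, assuming the surrounding kvm package): callers pair Get with Put and stay on the same OS thread in between, since Get locks the calling goroutine to its thread; retryInGuest in machine_amd64.go follows the same pattern.

    // withVCPU is illustrative only; it shows the intended Get/Put pairing.
    func withVCPU(m *machine, fn func(c *vCPU)) {
        c := m.Get()   // Locks the OS thread and marks the vCPU as in use.
        defer m.Put(c) // Releases the vCPU and signals the condition variable used by Get.
        fn(c)
    }
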
+
+// newDirtySet returns a new dirty set.
+func (m *machine) newDirtySet() *dirtySet {
+ return &dirtySet{
+ vCPUs: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64),
+ }
+}
+
+// lock marks the vCPU as in user mode.
+//
+// This should only be called directly when known to be safe, i.e. when
+// the vCPU is owned by the current TID with no chance of theft.
+//
+//go:nosplit
+func (c *vCPU) lock() {
+ atomicbitops.OrUint32(&c.state, vCPUUser)
+}
+
+// unlock clears the vCPUUser bit.
+//
+//go:nosplit
+func (c *vCPU) unlock() {
+ if atomic.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) {
+ // Happy path: no exits are forced, and we can continue
+ // executing on our merry way with a single atomic access.
+ return
+ }
+
+ // Clear the lock.
+ origState := atomic.LoadUint32(&c.state)
+ atomicbitops.AndUint32(&c.state, ^vCPUUser)
+ switch origState {
+ case vCPUUser:
+ // Normal state.
+ case vCPUUser | vCPUGuest | vCPUWaiter:
+ // Force a transition: this must trigger a notification when we
+ // return from guest mode.
+ c.notify()
+ case vCPUUser | vCPUWaiter:
+ // Waiting for the lock to be released; the responsibility is
+ // on us to notify the waiter and clear the associated bit.
+ atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
+ c.notify()
+ default:
+ panic("invalid state")
+ }
+}
+
+// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
+//
+//go:nosplit
+func (c *vCPU) NotifyInterrupt() {
+ c.BounceToKernel()
+}
+
+// pid is used below in bounce.
+var pid = syscall.Getpid()
+
+// bounce forces a return to the kernel or to host mode.
+//
+// This effectively unwinds the state machine.
+func (c *vCPU) bounce(forceGuestExit bool) {
+ for {
+ switch state := atomic.LoadUint32(&c.state); state {
+ case vCPUReady, vCPUWaiter:
+ // There is nothing to be done, we're already in the
+ // kernel pre-acquisition. The Bounce criteria have
+ // been satisfied.
+ return
+ case vCPUUser:
+ // We need to register a waiter for the actual guest
+ // transition. When the transition takes place, then we
+ // can inject an interrupt to ensure a return to host
+ // mode.
+ atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter)
+ case vCPUUser | vCPUWaiter:
+ // Wait for the transition to guest mode. This should
+ // come from the bluepill handler.
+ c.waitUntilNot(state)
+ case vCPUGuest, vCPUUser | vCPUGuest:
+ if state == vCPUGuest && !forceGuestExit {
+ // The vCPU is already not acquired, so there's
+ // no need to do a fresh injection here.
+ return
+ }
+ // The vCPU is in user or kernel mode. Attempt to
+ // register a notification on change.
+ if !atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter) {
+ break // Retry.
+ }
+ for {
+ // We need to spin here until the signal is
+ // delivered, because Tgkill can return EAGAIN
+ // under memory pressure. Since we already
+ // marked ourselves as a waiter, we need to
+ // ensure that a signal is actually delivered.
+ if err := syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal); err == nil {
+ break
+ } else if err.(syscall.Errno) == syscall.EAGAIN {
+ continue
+ } else {
+ // Nothing else should be returned by tgkill.
+ panic(fmt.Sprintf("unexpected tgkill error: %v", err))
+ }
+ }
+ case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter:
+ if state == vCPUGuest|vCPUWaiter && !forceGuestExit {
+ // See above.
+ return
+ }
+ // Wait for the transition. This again should happen
+ // from the bluepill handler, but on the way out.
+ c.waitUntilNot(state)
+ default:
+ // Should not happen: the above is exhaustive.
+ panic("invalid state")
+ }
+ }
+}
+
+// BounceToKernel ensures that the vCPU bounces back to the kernel.
+//
+//go:nosplit
+func (c *vCPU) BounceToKernel() {
+ c.bounce(false)
+}
+
+// BounceToHost ensures that the vCPU is in host mode.
+//
+//go:nosplit
+func (c *vCPU) BounceToHost() {
+ c.bounce(true)
+}
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
new file mode 100644
index 000000000..b6821122a
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -0,0 +1,357 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "fmt"
+ "reflect"
+ "runtime/debug"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// initArchState initializes architecture-specific state.
+func (m *machine) initArchState() error {
+ // Set the legacy TSS address. This address is covered by the reserved
+ // range (up to 4GB). In fact, this is one of the main reasons it exists.
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(m.fd),
+ _KVM_SET_TSS_ADDR,
+ uintptr(reservedMemory-(3*usermem.PageSize))); errno != 0 {
+ return errno
+ }
+
+ // Enable CPUID faulting, if possible. Note that this also serves as a
+ // basic platform sanity test, since we will enter guest mode for the
+ // first time here. The recovery is necessary, since if we fail to read
+ // the platform info register, we will return to host mode and
+ // ultimately need to handle a segmentation fault.
+ old := debug.SetPanicOnFault(true)
+ defer func() {
+ recover()
+ debug.SetPanicOnFault(old)
+ }()
+ m.retryInGuest(func() {
+ ring0.SetCPUIDFaulting(true)
+ })
+
+ return nil
+}
+
+type vCPUArchState struct {
+ // PCIDs is the set of PCIDs for this vCPU.
+ //
+ // This starts above fixedKernelPCID.
+ PCIDs *pagetables.PCIDs
+
+ // floatingPointState is the floating point state buffer used in guest
+ // to host transitions. See usage in bluepill_amd64.go.
+ floatingPointState *arch.FloatingPointData
+}
+
+const (
+ // fixedKernelPCID is a fixed kernel PCID used for the kernel page
+ // tables. We must start allocating user PCIDs above this in order to
+ // avoid any conflict (see below).
+ fixedKernelPCID = 1
+
+ // poolPCIDs is the number of PCIDs to record in the database. As this
+ // grows, assignment can take longer, since it is a simple linear scan.
+ // Beyond a relatively small number, there are likely few performance
+ // benefits, since the TLB has likely long since lost any translations
+ // from more than a few PCIDs past.
+ poolPCIDs = 8
+)
+
+// dropPageTables drops cached page table entries.
+func (m *machine) dropPageTables(pt *pagetables.PageTables) {
+ m.mu.Lock()
+ defer m.mu.Unlock()
+
+ // Clear from all PCIDs.
+ for _, c := range m.vCPUs {
+ c.PCIDs.Drop(pt)
+ }
+}
+
+// initArchState initializes architecture-specific state.
+func (c *vCPU) initArchState() error {
+ var (
+ kernelSystemRegs systemRegs
+ kernelUserRegs userRegs
+ )
+
+ // Set base control registers.
+ kernelSystemRegs.CR0 = c.CR0()
+ kernelSystemRegs.CR4 = c.CR4()
+ kernelSystemRegs.EFER = c.EFER()
+
+ // Set the IDT & GDT in the registers.
+ kernelSystemRegs.IDT.base, kernelSystemRegs.IDT.limit = c.IDT()
+ kernelSystemRegs.GDT.base, kernelSystemRegs.GDT.limit = c.GDT()
+ kernelSystemRegs.CS.Load(&ring0.KernelCodeSegment, ring0.Kcode)
+ kernelSystemRegs.DS.Load(&ring0.UserDataSegment, ring0.Udata)
+ kernelSystemRegs.ES.Load(&ring0.UserDataSegment, ring0.Udata)
+ kernelSystemRegs.SS.Load(&ring0.KernelDataSegment, ring0.Kdata)
+ kernelSystemRegs.FS.Load(&ring0.UserDataSegment, ring0.Udata)
+ kernelSystemRegs.GS.Load(&ring0.UserDataSegment, ring0.Udata)
+ tssBase, tssLimit, tss := c.TSS()
+ kernelSystemRegs.TR.Load(tss, ring0.Tss)
+ kernelSystemRegs.TR.base = tssBase
+ kernelSystemRegs.TR.limit = uint32(tssLimit)
+
+ // Point to kernel page tables, with no initial PCID.
+ kernelSystemRegs.CR3 = c.machine.kernel.PageTables.CR3(false, 0)
+
+ // Initialize the PCID database.
+ if hasGuestPCID {
+ // Note that NewPCIDs may return a nil table here, in which
+ // case we simply don't use PCID support (see below). In
+ // practice, this should not happen, however.
+ c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs)
+ }
+
+ // Set the CPUID; this is required before setting system registers,
+ // since KVM will reject several CR4 bits if the CPUID does not
+ // indicate the support is available.
+ if err := c.setCPUID(); err != nil {
+ return err
+ }
+
+ // Set the entrypoint for the kernel.
+ kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer())
+ kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer())
+ kernelUserRegs.RFLAGS = ring0.KernelFlagsSet
+
+ // Set the system registers.
+ if err := c.setSystemRegisters(&kernelSystemRegs); err != nil {
+ return err
+ }
+
+ // Set the user registers.
+ if err := c.setUserRegisters(&kernelUserRegs); err != nil {
+ return err
+ }
+
+ // Allocate some floating point state save area for the local vCPU.
+ // This will be saved prior to leaving the guest, and we restore from
+ // this always. We cannot use the pointer in the context alone because
+ // we don't know how large the area there is in reality.
+ c.floatingPointState = arch.NewFloatingPointData()
+
+ // Set the time offset to the host native time.
+ return c.setSystemTime()
+}
+
+// nonCanonical generates a fault return for a non-canonical address.
+//
+//go:nosplit
+func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.AccessType, error) {
+ *info = arch.SignalInfo{
+ Signo: signal,
+ Code: arch.SignalInfoKernel,
+ }
+ info.SetAddr(addr) // Include address.
+ return usermem.NoAccess, platform.ErrContextSignal
+}
+
+// fault generates an appropriate fault return.
+//
+//go:nosplit
+func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) {
+ bluepill(c) // Probably no-op, but may not be.
+ faultAddr := ring0.ReadCR2()
+ code, user := c.ErrorCode()
+ if !user {
+ // The last fault serviced by this CPU was not a user
+ // fault, so we can't reliably trust the faultAddr or
+ // the code provided here. We need to re-execute.
+ return usermem.NoAccess, platform.ErrContextInterrupt
+ }
+ // Reset the pointed SignalInfo.
+ *info = arch.SignalInfo{Signo: signal}
+ info.SetAddr(uint64(faultAddr))
+ accessType := usermem.AccessType{
+ Read: code&(1<<1) == 0,
+ Write: code&(1<<1) != 0,
+ Execute: code&(1<<4) != 0,
+ }
+ if !accessType.Write && !accessType.Execute {
+ info.Code = 1 // SEGV_MAPERR.
+ } else {
+ info.Code = 2 // SEGV_ACCERR.
+ }
+ return accessType, platform.ErrContextSignal
+}
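
The decoding above follows the x86 page-fault error-code layout: bit 1 set means the faulting access was a write, bit 4 set means an instruction fetch. A standalone restatement, illustrative only:

    // decodePageFaultCode mirrors the AccessType construction in fault above.
    func decodePageFaultCode(code uint64) usermem.AccessType {
        return usermem.AccessType{
            Read:    code&(1<<1) == 0,
            Write:   code&(1<<1) != 0,
            Execute: code&(1<<4) != 0,
        }
    }

For example, an error code of 0x6 (a user-mode write to a non-present page) decodes to a write access, which fault reports as SEGV_ACCERR.
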
+
+// SwitchToUser unpacks architectural-details.
+func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) {
+ // Check for canonical addresses.
+ if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Rip) {
+ return nonCanonical(regs.Rip, int32(syscall.SIGSEGV), info)
+ } else if !ring0.IsCanonical(regs.Rsp) {
+ return nonCanonical(regs.Rsp, int32(syscall.SIGBUS), info)
+ } else if !ring0.IsCanonical(regs.Fs_base) {
+ return nonCanonical(regs.Fs_base, int32(syscall.SIGBUS), info)
+ } else if !ring0.IsCanonical(regs.Gs_base) {
+ return nonCanonical(regs.Gs_base, int32(syscall.SIGBUS), info)
+ }
+
+ // Assign PCIDs.
+ if c.PCIDs != nil {
+ var requireFlushPCID bool // Force a flush?
+ switchOpts.UserPCID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables)
+ switchOpts.KernelPCID = fixedKernelPCID
+ switchOpts.Flush = switchOpts.Flush || requireFlushPCID
+ }
+
+ // See below.
+ var vector ring0.Vector
+
+ // Past this point, stack growth can cause system calls (and a break
+ // from guest mode). So we need to ensure that between the bluepill
+ // call here and the switch call immediately below, no additional
+ // allocations occur.
+ entersyscall()
+ bluepill(c)
+ vector = c.CPU.SwitchToUser(switchOpts)
+ exitsyscall()
+
+ switch vector {
+ case ring0.Syscall, ring0.SyscallInt80:
+ // Fast path: system call executed.
+ return usermem.NoAccess, nil
+
+ case ring0.PageFault:
+ return c.fault(int32(syscall.SIGSEGV), info)
+
+ case ring0.Debug, ring0.Breakpoint:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGTRAP),
+ Code: 1, // TRAP_BRKPT (breakpoint).
+ }
+ info.SetAddr(switchOpts.Registers.Rip) // Include address.
+ return usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.GeneralProtectionFault,
+ ring0.SegmentNotPresent,
+ ring0.BoundRangeExceeded,
+ ring0.InvalidTSS,
+ ring0.StackSegmentFault:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGSEGV),
+ Code: arch.SignalInfoKernel,
+ }
+ info.SetAddr(switchOpts.Registers.Rip) // Include address.
+ if vector == ring0.GeneralProtectionFault {
+ // When CPUID faulting is enabled, we will generate a #GP(0) when
+ // userspace executes a CPUID instruction. This is handled above,
+ // because we need to be able to map and read user memory.
+ return usermem.AccessType{}, platform.ErrContextSignalCPUID
+ }
+ return usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.InvalidOpcode:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGILL),
+ Code: 1, // ILL_ILLOPC (illegal opcode).
+ }
+ info.SetAddr(switchOpts.Registers.Rip) // Include address.
+ return usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.DivideByZero:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGFPE),
+ Code: 1, // FPE_INTDIV (divide by zero).
+ }
+ info.SetAddr(switchOpts.Registers.Rip) // Include address.
+ return usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.Overflow:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGFPE),
+ Code: 2, // FPE_INTOVF (integer overflow).
+ }
+ info.SetAddr(switchOpts.Registers.Rip) // Include address.
+ return usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.X87FloatingPointException,
+ ring0.SIMDFloatingPointException:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGFPE),
+ Code: 7, // FPE_FLTINV (invalid operation).
+ }
+ info.SetAddr(switchOpts.Registers.Rip) // Include address.
+ return usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.Vector(bounce): // ring0.VirtualizationException
+ return usermem.NoAccess, platform.ErrContextInterrupt
+
+ case ring0.AlignmentCheck:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGBUS),
+ Code: 2, // BUS_ADRERR (physical address does not exist).
+ }
+ return usermem.NoAccess, platform.ErrContextSignal
+
+ case ring0.NMI:
+ // An NMI is generated only when a fault is not serviceable by
+ // KVM itself, so we think some mapping is writeable but it's
+ // really not. This could happen, e.g. if some file is
+ // truncated (and would generate a SIGBUS) and we map it
+ // directly into the instance.
+ return c.fault(int32(syscall.SIGBUS), info)
+
+ case ring0.DeviceNotAvailable,
+ ring0.DoubleFault,
+ ring0.CoprocessorSegmentOverrun,
+ ring0.MachineCheck,
+ ring0.SecurityException:
+ fallthrough
+ default:
+ panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
+ }
+}
+
+// retryInGuest runs the given function in guest mode.
+//
+// If the function does not complete in guest mode (due to execution of a
+// system call triggered by a GC stall, for example), then it will be retried. The
+// given function must be idempotent as a result of the retry mechanism.
+func (m *machine) retryInGuest(fn func()) {
+ c := m.Get()
+ defer m.Put(c)
+ for {
+ c.ClearErrorCode() // See below.
+ bluepill(c) // Force guest mode.
+ fn() // Execute the given function.
+ _, user := c.ErrorCode()
+ if user {
+ // If user is set, then we haven't bailed back to host
+ // mode via a kernel exception or system call. We
+ // consider the full function to have executed in guest
+ // mode and we can return.
+ break
+ }
+ }
+}
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
new file mode 100644
index 000000000..06a2e3b0c
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -0,0 +1,161 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "fmt"
+ "sync/atomic"
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+)
+
+// setMemoryRegion initializes a region.
+//
+// This may be called from bluepillHandler, and therefore returns an errno
+// directly (instead of wrapping in an error) to avoid allocations.
+//
+//go:nosplit
+func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno {
+ userRegion := userMemoryRegion{
+ slot: uint32(slot),
+ flags: 0,
+ guestPhysAddr: uint64(physical),
+ memorySize: uint64(length),
+ userspaceAddr: uint64(virtual),
+ }
+
+ // Set the region.
+ _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(m.fd),
+ _KVM_SET_USER_MEMORY_REGION,
+ uintptr(unsafe.Pointer(&userRegion)))
+ return errno
+}
+
+// loadSegments copies the current segments.
+//
+// This may be called from within the signal context and throws on error.
+//
+//go:nosplit
+func (c *vCPU) loadSegments(tid uint64) {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_ARCH_PRCTL,
+ linux.ARCH_GET_FS,
+ uintptr(unsafe.Pointer(&c.CPU.Registers().Fs_base)),
+ 0); errno != 0 {
+ throw("getting FS segment")
+ }
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_ARCH_PRCTL,
+ linux.ARCH_GET_GS,
+ uintptr(unsafe.Pointer(&c.CPU.Registers().Gs_base)),
+ 0); errno != 0 {
+ throw("getting GS segment")
+ }
+ atomic.StoreUint64(&c.tid, tid)
+}
+
+// setCPUID sets the CPUID to be used by the guest.
+func (c *vCPU) setCPUID() error {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_CPUID2,
+ uintptr(unsafe.Pointer(&cpuidSupported))); errno != 0 {
+ return fmt.Errorf("error setting CPUID: %v", errno)
+ }
+ return nil
+}
+
+// setSystemTime sets the TSC for the vCPU.
+//
+// This has to make the call many times in order to minimize the intrinsic
+// error in the offset. Unfortunately KVM does not expose a relative offset via
+// the API, so this is an approximation. We do this via an iterative algorithm.
+// This has the advantage that it can generally deal with highly variable
+// system call times and should converge on the correct offset.
+func (c *vCPU) setSystemTime() error {
+ const (
+ _MSR_IA32_TSC = 0x00000010
+ calibrateTries = 10
+ )
+ registers := modelControlRegisters{
+ nmsrs: 1,
+ }
+ registers.entries[0] = modelControlRegister{
+ index: _MSR_IA32_TSC,
+ }
+ target := uint64(^uint32(0))
+ for done := 0; done < calibrateTries; {
+ start := uint64(time.Rdtsc())
+ registers.entries[0].data = start + target
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_MSRS,
+ uintptr(unsafe.Pointer(&registers))); errno != 0 {
+ return fmt.Errorf("error setting system time: %v", errno)
+ }
+ // See if this is our new minimum call time. Note that this
+ // serves two functions: first, we make sure that we are
+ // accurately predicting the offset we need to set. Second, we
+ // don't want to do the final set on a slow call, which could
+ // produce a really bad result. So we only count calls within
+ // +/- 6.25% of our minimum as attempts.
+ end := uint64(time.Rdtsc())
+ if end < start {
+ continue // Totally bogus.
+ }
+ half := (end - start) / 2
+ if half < target {
+ target = half
+ }
+ if (half - target) < target/8 {
+ done++
+ }
+ }
+ return nil
+}
+
+// setSignalMask sets the vCPU signal mask.
+//
+// This must be called prior to running the vCPU.
+func (c *vCPU) setSignalMask() error {
+ // The layout of this structure implies that it will not necessarily be
+ // the same layout chosen by the Go compiler. It gets fudged here.
+ var data struct {
+ length uint32
+ mask1 uint32
+ mask2 uint32
+ _ uint32
+ }
+ data.length = 8 // Fixed sigset size.
+ data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
+ data.mask2 = ^uint32(bounceSignalMask >> 32)
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_SIGNAL_MASK,
+ uintptr(unsafe.Pointer(&data))); errno != 0 {
+ return fmt.Errorf("error setting signal mask: %v", errno)
+ }
+ return nil
+}
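
The two 32-bit words above are the complement of the 64-bit bounce-signal mask; per the KVM_SET_SIGNAL_MASK semantics, this blocks every signal except the bounce signal while KVM_RUN executes, so only that signal can kick the vCPU out of guest mode. A minimal sketch of the split (the function name and parameter are hypothetical):

    // sigmaskWords reproduces the complement-and-split used in setSignalMask
    // above for an arbitrary 64-bit mask of allowed signals.
    func sigmaskWords(allowed uint64) (lo, hi uint32) {
        return ^uint32(allowed & 0xffffffff), ^uint32(allowed >> 32)
    }
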
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
new file mode 100644
index 000000000..1d3c6d2d6
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -0,0 +1,160 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build go1.12
+// +build !go1.14
+
+// Check go:linkname function signatures when updating Go version.
+
+package kvm
+
+import (
+ "fmt"
+ "sync/atomic"
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+)
+
+//go:linkname entersyscall runtime.entersyscall
+func entersyscall()
+
+//go:linkname exitsyscall runtime.exitsyscall
+func exitsyscall()
+
+// mapRunData maps the vCPU run data.
+func mapRunData(fd int) (*runData, error) {
+ r, _, errno := syscall.RawSyscall6(
+ syscall.SYS_MMAP,
+ 0,
+ uintptr(runDataSize),
+ syscall.PROT_READ|syscall.PROT_WRITE,
+ syscall.MAP_SHARED,
+ uintptr(fd),
+ 0)
+ if errno != 0 {
+ return nil, fmt.Errorf("error mapping runData: %v", errno)
+ }
+ return (*runData)(unsafe.Pointer(r)), nil
+}
+
+// unmapRunData unmaps the vCPU run data.
+func unmapRunData(r *runData) error {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_MUNMAP,
+ uintptr(unsafe.Pointer(r)),
+ uintptr(runDataSize),
+ 0); errno != 0 {
+ return fmt.Errorf("error unmapping runData: %v", errno)
+ }
+ return nil
+}
+
+// setUserRegisters sets user registers in the vCPU.
+func (c *vCPU) setUserRegisters(uregs *userRegs) error {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_REGS,
+ uintptr(unsafe.Pointer(uregs))); errno != 0 {
+ return fmt.Errorf("error setting user registers: %v", errno)
+ }
+ return nil
+}
+
+// getUserRegisters reloads user registers in the vCPU.
+//
+// This is safe to call from a nosplit context.
+//
+//go:nosplit
+func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_GET_REGS,
+ uintptr(unsafe.Pointer(uregs))); errno != 0 {
+ return errno
+ }
+ return 0
+}
+
+// setSystemRegisters sets system registers.
+func (c *vCPU) setSystemRegisters(sregs *systemRegs) error {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_SREGS,
+ uintptr(unsafe.Pointer(sregs))); errno != 0 {
+ return fmt.Errorf("error setting system registers: %v", errno)
+ }
+ return nil
+}
+
+// atomicAddressSpace is an atomic address space pointer.
+type atomicAddressSpace struct {
+ pointer unsafe.Pointer
+}
+
+// set sets the address space value.
+//
+//go:nosplit
+func (a *atomicAddressSpace) set(as *addressSpace) {
+ atomic.StorePointer(&a.pointer, unsafe.Pointer(as))
+}
+
+// get gets the address space value.
+//
+// Note that this should be considered best-effort, and may have changed by the
+// time this function returns.
+//
+//go:nosplit
+func (a *atomicAddressSpace) get() *addressSpace {
+ return (*addressSpace)(atomic.LoadPointer(&a.pointer))
+}
+
+// notify notifies that the vCPU has transitioned modes.
+//
+// This may be called by a signal handler and therefore throws on error.
+//
+//go:nosplit
+func (c *vCPU) notify() {
+ _, _, errno := syscall.RawSyscall6(
+ syscall.SYS_FUTEX,
+ uintptr(unsafe.Pointer(&c.state)),
+ linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG,
+ ^uintptr(0), // Number of waiters.
+ 0, 0, 0)
+ if errno != 0 {
+ throw("futex wake error")
+ }
+}
+
+// waitUntilNot waits for the vCPU to transition modes.
+//
+// The state should have been previously set to vCPUWaiter after performing an
+// appropriate action to cause a transition (e.g. interrupt injection).
+//
+// This panics on error.
+func (c *vCPU) waitUntilNot(state uint32) {
+ _, _, errno := syscall.Syscall6(
+ syscall.SYS_FUTEX,
+ uintptr(unsafe.Pointer(&c.state)),
+ linux.FUTEX_WAIT|linux.FUTEX_PRIVATE_FLAG,
+ uintptr(state),
+ 0, 0, 0)
+ if errno != 0 && errno != syscall.EINTR && errno != syscall.EAGAIN {
+ panic("futex wait error")
+ }
+}
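
notify and waitUntilNot implement the waiter half of the state machine driven by bounce and unlock above: a waiter publishes vCPUWaiter with a compare-and-swap before blocking, so a transition (and its wake-up) cannot be missed; if the state has already moved on, the futex wait returns immediately with EAGAIN. A hypothetical helper showing the pattern:

    // awaitTransition is illustrative only: it mirrors how bounce registers
    // interest in a state change before blocking.
    func awaitTransition(c *vCPU, oldState uint32) {
        if atomic.CompareAndSwapUint32(&c.state, oldState, oldState|vCPUWaiter) {
            c.waitUntilNot(oldState | vCPUWaiter)
        }
    }
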
diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go
new file mode 100644
index 000000000..450eb8201
--- /dev/null
+++ b/pkg/sentry/platform/kvm/physical_map.go
@@ -0,0 +1,224 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+ "sort"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+const (
+ // reservedMemory is a chunk of physical memory reserved starting at
+ // physical address zero. There are some special pages in this region,
+ // so we just call the whole thing off.
+ //
+ // Other architectures may define this to be zero.
+ reservedMemory = 0x100000000
+)
+
+type region struct {
+ virtual uintptr
+ length uintptr
+}
+
+type physicalRegion struct {
+ region
+ physical uintptr
+}
+
+// physicalRegions contains a list of available physical regions.
+//
+// The physical value used in physicalRegions is a number indicating the
+// physical offset, aligned appropriately and starting above reservedMemory.
+var physicalRegions []physicalRegion
+
+// fillAddressSpace fills the host address space with PROT_NONE mappings until
+// we have a host address space size that is less than or equal to the physical
+// address space. This allows us to have an injective host virtual to guest
+// physical mapping.
+//
+// The excluded regions are returned.
+func fillAddressSpace() (excludedRegions []region) {
+ // We can cut vSize in half, because the kernel will be using the top
+ // half and we ignore it while constructing mappings. It's as if we've
+ // already excluded half the possible addresses.
+ vSize := uintptr(1) << ring0.VirtualAddressBits()
+ vSize = vSize >> 1
+
+ // We exclude reservedMemory below from our physical memory size, so it
+ // needs to be dropped here as well. Otherwise, we could end up with
+ // physical addresses that are beyond what is mapped.
+ pSize := uintptr(1) << ring0.PhysicalAddressBits()
+ pSize -= reservedMemory
+
+ // Add specifically excluded regions; see excludeVirtualRegion.
+ applyVirtualRegions(func(vr virtualRegion) {
+ if excludeVirtualRegion(vr) {
+ excludedRegions = append(excludedRegions, vr.region)
+ vSize -= vr.length
+ log.Infof("excluded: virtual [%x,%x)", vr.virtual, vr.virtual+vr.length)
+ }
+ })
+
+ // Do we need any more work?
+ if vSize < pSize {
+ return excludedRegions
+ }
+
+ // Calculate the required space and fill it.
+ //
+ // Note carefully that we add faultBlockSize to required up front, and
+ // on each iteration of the loop below (i.e. each new physical region
+ // we define), we add faultBlockSize again. This is done because the
+ // computation of physical regions will ensure proper alignments with
+ // faultBlockSize, potentially causing up to faultBlockSize bytes in
+ // internal fragmentation for each physical region. So we need to
+ // account for this properly during allocation.
+ requiredAddr, ok := usermem.Addr(vSize - pSize + faultBlockSize).RoundUp()
+ if !ok {
+ panic(fmt.Sprintf(
+ "overflow for vSize (%x) - pSize (%x) + faultBlockSize (%x)",
+ vSize, pSize, faultBlockSize))
+ }
+ required := uintptr(requiredAddr)
+ current := required // Attempted mmap size.
+ for filled := uintptr(0); filled < required && current > 0; {
+ addr, _, errno := syscall.RawSyscall6(
+ syscall.SYS_MMAP,
+ 0, // Suggested address.
+ current,
+ syscall.PROT_NONE,
+ syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE|syscall.MAP_NORESERVE,
+ 0, 0)
+ if errno != 0 {
+ // Attempt half the size; overflow not possible.
+ currentAddr, _ := usermem.Addr(current >> 1).RoundUp()
+ current = uintptr(currentAddr)
+ continue
+ }
+ // We filled a block.
+ filled += current
+ excludedRegions = append(excludedRegions, region{
+ virtual: addr,
+ length: current,
+ })
+ // See comment above.
+ if filled != required {
+ required += faultBlockSize
+ }
+ }
+ if current == 0 {
+ panic("filling address space failed")
+ }
+ sort.Slice(excludedRegions, func(i, j int) bool {
+ return excludedRegions[i].virtual < excludedRegions[j].virtual
+ })
+ for _, r := range excludedRegions {
+ log.Infof("region: virtual [%x,%x)", r.virtual, r.virtual+r.length)
+ }
+ return excludedRegions
+}
+
+// computePhysicalRegions computes physical regions.
+func computePhysicalRegions(excludedRegions []region) (physicalRegions []physicalRegion) {
+ physical := uintptr(reservedMemory)
+ addValidRegion := func(virtual, length uintptr) {
+ if length == 0 {
+ return
+ }
+ if virtual == 0 {
+ virtual += usermem.PageSize
+ length -= usermem.PageSize
+ }
+ if end := virtual + length; end > ring0.MaximumUserAddress {
+ length -= (end - ring0.MaximumUserAddress)
+ }
+ if length == 0 {
+ return
+ }
+ // Round physical up to the same alignment as the virtual
+ // address (with respect to faultBlockSize).
+ if offset := virtual &^ faultBlockMask; physical&^faultBlockMask != offset {
+ if newPhysical := (physical & faultBlockMask) + offset; newPhysical > physical {
+ physical = newPhysical // Round up by only a little bit.
+ } else {
+ physical = ((physical + faultBlockSize) & faultBlockMask) + offset
+ }
+ }
+ physicalRegions = append(physicalRegions, physicalRegion{
+ region: region{
+ virtual: virtual,
+ length: length,
+ },
+ physical: physical,
+ })
+ physical += length
+ }
+ lastExcludedEnd := uintptr(0)
+ for _, r := range excludedRegions {
+ addValidRegion(lastExcludedEnd, r.virtual-lastExcludedEnd)
+ lastExcludedEnd = r.virtual + r.length
+ }
+ addValidRegion(lastExcludedEnd, ring0.MaximumUserAddress-lastExcludedEnd)
+
+ // Dump out all physical regions.
+ for _, r := range physicalRegions {
+ log.Infof("physicalRegion: virtual [%x,%x) => physical [%x,%x)",
+ r.virtual, r.virtual+r.length, r.physical, r.physical+r.length)
+ }
+ return physicalRegions
+}
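
The rounding step in addValidRegion keeps each physical region at the same offset within a fault block as its virtual counterpart, so block-granular slots line up in both spaces. A standalone restatement (assuming, as the bit operations imply, that faultBlockSize is a power of two and faultBlockMask == ^(faultBlockSize - 1)):

    // alignPhysical is illustrative only: it restates the rounding above,
    // advancing physical to the next address that shares virtual's offset
    // within a fault block.
    func alignPhysical(physical, virtual uintptr) uintptr {
        offset := virtual &^ faultBlockMask
        if physical&^faultBlockMask == offset {
            return physical // Already aligned.
        }
        if next := (physical & faultBlockMask) + offset; next > physical {
            return next // Round up by only a little bit.
        }
        return ((physical + faultBlockSize) & faultBlockMask) + offset
    }

For instance, with an illustrative 2GiB faultBlockSize, a virtual start of 0x7f4520300000 (offset 0x20300000 within its block) advances a physical cursor of 0x100000000 to 0x120300000.
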
+
+// physicalInit initializes physical address mappings.
+func physicalInit() {
+ physicalRegions = computePhysicalRegions(fillAddressSpace())
+}
+
+// applyPhysicalRegions applies the given function on physical regions.
+//
+// Iteration continues as long as true is returned. The return value is the
+// return from the last call to fn, or true if there are no entries.
+//
+// Precondition: physicalInit must have been called.
+func applyPhysicalRegions(fn func(pr physicalRegion) bool) bool {
+ for _, pr := range physicalRegions {
+ if !fn(pr) {
+ return false
+ }
+ }
+ return true
+}
+
+// translateToPhysical translates the given virtual address.
+//
+// Precondition: physicalInit must have been called.
+//
+//go:nosplit
+func translateToPhysical(virtual uintptr) (physical uintptr, length uintptr, ok bool) {
+ for _, pr := range physicalRegions {
+ if pr.virtual <= virtual && virtual < pr.virtual+pr.length {
+ physical = pr.physical + (virtual - pr.virtual)
+ length = pr.length - (virtual - pr.virtual)
+ ok = true
+ return
+ }
+ }
+ return
+}
diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go
new file mode 100644
index 000000000..28a1b4414
--- /dev/null
+++ b/pkg/sentry/platform/kvm/virtual_map.go
@@ -0,0 +1,113 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "os"
+ "regexp"
+ "strconv"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+type virtualRegion struct {
+ region
+ accessType usermem.AccessType
+ shared bool
+ offset uintptr
+ filename string
+}
+
+// mapsLine matches a single line from /proc/PID/maps.
+var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2}:[0-9a-f]{2,} [0-9]+\\s+(.*)")
+
+// excludeVirtualRegion returns true if these regions should be excluded from the
+// physical map. Virtual regions need to be excluded if get_user_pages will
+// fail on those addresses, preventing KVM from satisfying EPT faults.
+//
+// This includes the VVAR page because the VVAR page may be mapped as I/O
+// memory. And the VDSO page is knocked out because the VVAR page is not even
+// recorded in /proc/self/maps on older kernels; knocking out the VDSO page
+// prevents code in the VDSO from accessing the VVAR address.
+//
+// This is called by the physical map functions, not applyVirtualRegions.
+func excludeVirtualRegion(r virtualRegion) bool {
+ return r.filename == "[vvar]" || r.filename == "[vdso]"
+}
+
+// applyVirtualRegions parses the process maps file.
+//
+// Unlike mappedRegions, these are not consistent over time.
+func applyVirtualRegions(fn func(vr virtualRegion)) error {
+ // Open /proc/self/maps.
+ f, err := os.Open("/proc/self/maps")
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+
+ // Parse all entries.
+ r := bufio.NewReader(f)
+ for {
+ b, err := r.ReadBytes('\n')
+ if b != nil && len(b) > 0 {
+ m := mapsLine.FindSubmatch(b)
+ if m == nil {
+ // This should not happen: kernel bug?
+ return fmt.Errorf("badly formed line: %v", string(b))
+ }
+ start, err := strconv.ParseUint(string(m[1]), 16, 64)
+ if err != nil {
+ return fmt.Errorf("bad start address: %v", string(b))
+ }
+ end, err := strconv.ParseUint(string(m[2]), 16, 64)
+ if err != nil {
+ return fmt.Errorf("bad end address: %v", string(b))
+ }
+ read := m[3][0] == 'r'
+ write := m[3][1] == 'w'
+ execute := m[3][2] == 'x'
+ shared := m[3][3] == 's'
+ offset, err := strconv.ParseUint(string(m[4]), 16, 64)
+ if err != nil {
+ return fmt.Errorf("bad offset: %v", string(b))
+ }
+ fn(virtualRegion{
+ region: region{
+ virtual: uintptr(start),
+ length: uintptr(end - start),
+ },
+ accessType: usermem.AccessType{
+ Read: read,
+ Write: write,
+ Execute: execute,
+ },
+ shared: shared,
+ offset: uintptr(offset),
+ filename: string(m[5]),
+ })
+ }
+ if err != nil && err == io.EOF {
+ break
+ } else if err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
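
For reference, a worked example of the parsing above with a hypothetical /proc/self/maps line; the capture groups of mapsLine supply the start, end, permissions, offset, and filename fields:

    // Illustrative only: the line
    //
    //   7f0000000000-7f0000021000 r-xp 00001000 08:01 131163   /lib/x86_64-linux-gnu/ld-2.27.so
    //
    // is passed to the callback as the equivalent of:
    var exampleRegion = virtualRegion{
        region: region{
            virtual: 0x7f0000000000,
            length:  0x21000, // end - start.
        },
        accessType: usermem.AccessType{Read: true, Write: false, Execute: true},
        shared:     false, // 'p' (private), not 's'.
        offset:     0x1000,
        filename:   "/lib/x86_64-linux-gnu/ld-2.27.so",
    }
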