author     gVisor bot <gvisor-bot@google.com>  2019-06-02 06:44:55 +0000
committer  gVisor bot <gvisor-bot@google.com>  2019-06-02 06:44:55 +0000
commit     ceb0d792f328d1fc0692197d8856a43c3936a571 (patch)
tree       83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/platform
parent     deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff)
parent     216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff)

Merge 216da0b7 (automated)
Diffstat (limited to 'pkg/sentry/platform')
-rw-r--r--  pkg/sentry/platform/context.go | 36
-rwxr-xr-x  pkg/sentry/platform/file_range.go | 62
-rw-r--r--  pkg/sentry/platform/interrupt/interrupt.go | 96
-rwxr-xr-x  pkg/sentry/platform/interrupt/interrupt_state_autogen.go | 4
-rw-r--r--  pkg/sentry/platform/kvm/address_space.go | 234
-rw-r--r--  pkg/sentry/platform/kvm/allocator.go | 76
-rw-r--r--  pkg/sentry/platform/kvm/bluepill.go | 82
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_amd64.go | 141
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_amd64.s | 93
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go | 56
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_fault.go | 127
-rw-r--r--  pkg/sentry/platform/kvm/bluepill_unsafe.go | 213
-rw-r--r--  pkg/sentry/platform/kvm/context.go | 87
-rw-r--r--  pkg/sentry/platform/kvm/kvm.go | 143
-rw-r--r--  pkg/sentry/platform/kvm/kvm_amd64.go | 213
-rw-r--r--  pkg/sentry/platform/kvm/kvm_amd64_unsafe.go | 77
-rw-r--r--  pkg/sentry/platform/kvm/kvm_const.go | 64
-rwxr-xr-x  pkg/sentry/platform/kvm/kvm_state_autogen.go | 4
-rw-r--r--  pkg/sentry/platform/kvm/machine.go | 525
-rw-r--r--  pkg/sentry/platform/kvm/machine_amd64.go | 357
-rw-r--r--  pkg/sentry/platform/kvm/machine_amd64_unsafe.go | 161
-rw-r--r--  pkg/sentry/platform/kvm/machine_unsafe.go | 160
-rw-r--r--  pkg/sentry/platform/kvm/physical_map.go | 224
-rw-r--r--  pkg/sentry/platform/kvm/virtual_map.go | 113
-rw-r--r--  pkg/sentry/platform/mmap_min_addr.go | 60
-rw-r--r--  pkg/sentry/platform/platform.go | 349
-rwxr-xr-x  pkg/sentry/platform/platform_state_autogen.go | 24
-rw-r--r--  pkg/sentry/platform/procid/procid.go | 21
-rw-r--r--  pkg/sentry/platform/procid/procid_amd64.s | 30
-rw-r--r--  pkg/sentry/platform/procid/procid_arm64.s | 29
-rwxr-xr-x  pkg/sentry/platform/procid/procid_state_autogen.go | 4
-rw-r--r--  pkg/sentry/platform/ptrace/ptrace.go | 238
-rwxr-xr-x  pkg/sentry/platform/ptrace/ptrace_state_autogen.go | 4
-rw-r--r--  pkg/sentry/platform/ptrace/ptrace_unsafe.go | 166
-rw-r--r--  pkg/sentry/platform/ptrace/stub_amd64.s | 114
-rw-r--r--  pkg/sentry/platform/ptrace/stub_unsafe.go | 98
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess.go | 610
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_amd64.go | 104
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_linux.go | 338
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go | 109
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_unsafe.go | 33
-rwxr-xr-x  pkg/sentry/platform/ring0/defs_impl.go | 538
-rw-r--r--  pkg/sentry/platform/ring0/entry_amd64.go | 128
-rwxr-xr-x  pkg/sentry/platform/ring0/entry_impl_amd64.s | 383
-rw-r--r--  pkg/sentry/platform/ring0/kernel.go | 66
-rw-r--r--  pkg/sentry/platform/ring0/kernel_amd64.go | 271
-rw-r--r--  pkg/sentry/platform/ring0/kernel_unsafe.go | 41
-rw-r--r--  pkg/sentry/platform/ring0/lib_amd64.go | 131
-rw-r--r--  pkg/sentry/platform/ring0/lib_amd64.s | 247
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/allocator.go | 122
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go | 53
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pagetables.go | 221
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go | 45
-rwxr-xr-x  pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go | 4
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pagetables_x86.go | 180
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pcids_x86.go | 109
-rwxr-xr-x  pkg/sentry/platform/ring0/pagetables/walker_empty.go | 255
-rwxr-xr-x  pkg/sentry/platform/ring0/pagetables/walker_lookup.go | 255
-rwxr-xr-x  pkg/sentry/platform/ring0/pagetables/walker_map.go | 255
-rwxr-xr-x  pkg/sentry/platform/ring0/pagetables/walker_unmap.go | 255
-rw-r--r--  pkg/sentry/platform/ring0/ring0.go | 16
-rwxr-xr-x  pkg/sentry/platform/ring0/ring0_state_autogen.go | 4
-rw-r--r--  pkg/sentry/platform/safecopy/atomic_amd64.s | 136
-rw-r--r--  pkg/sentry/platform/safecopy/atomic_arm64.s | 126
-rw-r--r--  pkg/sentry/platform/safecopy/memclr_amd64.s | 147
-rw-r--r--  pkg/sentry/platform/safecopy/memclr_arm64.s | 74
-rw-r--r--  pkg/sentry/platform/safecopy/memcpy_amd64.s | 250
-rw-r--r--  pkg/sentry/platform/safecopy/memcpy_arm64.s | 78
-rw-r--r--  pkg/sentry/platform/safecopy/safecopy.go | 144
-rwxr-xr-x  pkg/sentry/platform/safecopy/safecopy_state_autogen.go | 4
-rw-r--r--  pkg/sentry/platform/safecopy/safecopy_unsafe.go | 335
-rw-r--r--  pkg/sentry/platform/safecopy/sighandler_amd64.s | 133
-rw-r--r--  pkg/sentry/platform/safecopy/sighandler_arm64.s | 143
73 files changed, 10828 insertions, 0 deletions
diff --git a/pkg/sentry/platform/context.go b/pkg/sentry/platform/context.go
new file mode 100644
index 000000000..793f57fd7
--- /dev/null
+++ b/pkg/sentry/platform/context.go
@@ -0,0 +1,36 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package platform
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+)
+
+// contextID is the platform package's type for context.Context.Value keys.
+type contextID int
+
+const (
+ // CtxPlatform is a Context.Value key for a Platform.
+ CtxPlatform contextID = iota
+)
+
+// FromContext returns the Platform that is used to execute ctx's application
+// code, or nil if no such Platform exists.
+func FromContext(ctx context.Context) Platform {
+ if v := ctx.Value(CtxPlatform); v != nil {
+ return v.(Platform)
+ }
+ return nil
+}
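
For illustration, a minimal sketch (not part of this change) of how FromContext is typically consumed; lookupPlatform and fallback are hypothetical names:

    package example

    import (
        "gvisor.googlesource.com/gvisor/pkg/sentry/context"
        "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
    )

    // lookupPlatform returns the Platform attached to ctx, or fallback if the
    // context carries none (FromContext returns nil in that case).
    func lookupPlatform(ctx context.Context, fallback platform.Platform) platform.Platform {
        if p := platform.FromContext(ctx); p != nil {
            return p
        }
        return fallback
    }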
diff --git a/pkg/sentry/platform/file_range.go b/pkg/sentry/platform/file_range.go
new file mode 100755
index 000000000..685d360e3
--- /dev/null
+++ b/pkg/sentry/platform/file_range.go
@@ -0,0 +1,62 @@
+package platform
+
+// A FileRange represents a contiguous range of uint64 file offsets.
+//
+// +stateify savable
+type FileRange struct {
+ // Start is the inclusive start of the range.
+ Start uint64
+
+ // End is the exclusive end of the range.
+ End uint64
+}
+
+// WellFormed returns true if r.Start <= r.End. All other methods on a
+// FileRange require that the FileRange is well-formed.
+func (r FileRange) WellFormed() bool {
+ return r.Start <= r.End
+}
+
+// Length returns the length of the range.
+func (r FileRange) Length() uint64 {
+ return r.End - r.Start
+}
+
+// Contains returns true if r contains x.
+func (r FileRange) Contains(x uint64) bool {
+ return r.Start <= x && x < r.End
+}
+
+// Overlaps returns true if r and r2 overlap.
+func (r FileRange) Overlaps(r2 FileRange) bool {
+ return r.Start < r2.End && r2.Start < r.End
+}
+
+// IsSupersetOf returns true if r is a superset of r2; that is, the range r2 is
+// contained within r.
+func (r FileRange) IsSupersetOf(r2 FileRange) bool {
+ return r.Start <= r2.Start && r.End >= r2.End
+}
+
+// Intersect returns a range consisting of the intersection between r and r2.
+// If r and r2 do not overlap, Intersect returns a range with unspecified
+// bounds, but for which Length() == 0.
+func (r FileRange) Intersect(r2 FileRange) FileRange {
+ if r.Start < r2.Start {
+ r.Start = r2.Start
+ }
+ if r.End > r2.End {
+ r.End = r2.End
+ }
+ if r.End < r.Start {
+ r.End = r.Start
+ }
+ return r
+}
+
+// CanSplitAt returns true if it is legal to split a segment spanning the range
+// r at x; that is, splitting at x would produce two ranges, both of which have
+// non-zero length.
+func (r FileRange) CanSplitAt(x uint64) bool {
+ return r.Contains(x) && r.Start < x
+}
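
For illustration, a short sketch (not part of this change) exercising the FileRange interval arithmetic defined above; the values are arbitrary:

    package example

    import "gvisor.googlesource.com/gvisor/pkg/sentry/platform"

    func exampleFileRange() (bool, uint64, bool, bool) {
        a := platform.FileRange{Start: 0x1000, End: 0x3000}
        b := platform.FileRange{Start: 0x2000, End: 0x5000}

        overlaps := a.Overlaps(b)        // true: [0x1000,0x3000) and [0x2000,0x5000) intersect.
        i := a.Intersect(b)              // {Start: 0x2000, End: 0x3000}.
        canSplit := a.CanSplitAt(0x2000) // true: both halves are non-empty.
        noSplit := a.CanSplitAt(0x1000)  // false: splitting at Start leaves an empty half.
        return overlaps, i.Length(), canSplit, noSplit // true, 0x1000, true, false.
    }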
diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go
new file mode 100644
index 000000000..a4651f500
--- /dev/null
+++ b/pkg/sentry/platform/interrupt/interrupt.go
@@ -0,0 +1,96 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package interrupt provides an interrupt helper.
+package interrupt
+
+import (
+ "fmt"
+ "sync"
+)
+
+// Receiver receives interrupt notifications from a Forwarder.
+type Receiver interface {
+ // NotifyInterrupt is called when the Receiver receives an interrupt.
+ NotifyInterrupt()
+}
+
+// Forwarder is a helper for delivering delayed signal interruptions.
+//
+// This helps platform implementations with Interrupt semantics.
+type Forwarder struct {
+ // mu protects the below.
+ mu sync.Mutex
+
+ // dst is the function to be called when NotifyInterrupt() is called. If
+ // dst is nil, pending will be set instead, causing the next call to
+ // Enable() to return false.
+ dst Receiver
+ pending bool
+}
+
+// Enable attempts to enable interrupt forwarding to r. If f has already
+// received an interrupt, Enable does nothing and returns false. Otherwise,
+// future calls to f.NotifyInterrupt() cause r.NotifyInterrupt() to be called,
+// and Enable returns true.
+//
+// Usage:
+//
+// if !f.Enable(r) {
+// // There was an interrupt.
+// return
+// }
+// defer f.Disable()
+//
+// Preconditions: r must not be nil. f must not already be forwarding
+// interrupts to a Receiver.
+func (f *Forwarder) Enable(r Receiver) bool {
+ if r == nil {
+ panic("nil Receiver")
+ }
+ f.mu.Lock()
+ if f.dst != nil {
+ f.mu.Unlock()
+ panic(fmt.Sprintf("already forwarding interrupts to %+v", f.dst))
+ }
+ if f.pending {
+ f.pending = false
+ f.mu.Unlock()
+ return false
+ }
+ f.dst = r
+ f.mu.Unlock()
+ return true
+}
+
+// Disable stops interrupt forwarding. If interrupt forwarding is already
+// disabled, Disable is a no-op.
+func (f *Forwarder) Disable() {
+ f.mu.Lock()
+ f.dst = nil
+ f.mu.Unlock()
+}
+
+// NotifyInterrupt implements Receiver.NotifyInterrupt. If interrupt forwarding
+// is enabled, the configured Receiver will be notified. Otherwise the
+// interrupt will be delivered to the next call to Enable.
+func (f *Forwarder) NotifyInterrupt() {
+ f.mu.Lock()
+ if f.dst != nil {
+ f.dst.NotifyInterrupt()
+ } else {
+ f.pending = true
+ }
+ f.mu.Unlock()
+}
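
For illustration, a sketch (not part of this change) of the Enable/Disable protocol with a hypothetical channel-backed Receiver; real platforms implement Receiver on their execution contexts:

    package example

    import "gvisor.googlesource.com/gvisor/pkg/sentry/platform/interrupt"

    // chanReceiver is a hypothetical Receiver that coalesces interrupts onto
    // a buffered channel.
    type chanReceiver struct {
        ch chan struct{}
    }

    // NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
    func (r *chanReceiver) NotifyInterrupt() {
        select {
        case r.ch <- struct{}{}:
        default: // An interrupt is already pending; interrupts are not counted.
        }
    }

    func runGuarded(f *interrupt.Forwarder, r *chanReceiver, body func()) {
        if !f.Enable(r) {
            return // An interrupt arrived while forwarding was disabled.
        }
        defer f.Disable()
        body() // Concurrent f.NotifyInterrupt() calls now reach r.
    }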
diff --git a/pkg/sentry/platform/interrupt/interrupt_state_autogen.go b/pkg/sentry/platform/interrupt/interrupt_state_autogen.go
new file mode 100755
index 000000000..15e8bacdf
--- /dev/null
+++ b/pkg/sentry/platform/interrupt/interrupt_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package interrupt
+
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
new file mode 100644
index 000000000..689122175
--- /dev/null
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -0,0 +1,234 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "sync"
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/atomicbitops"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// dirtySet tracks vCPUs for invalidation.
+type dirtySet struct {
+ vCPUs []uint64
+}
+
+// forEach iterates over all CPUs in the dirty set.
+func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
+ m.mu.RLock()
+ defer m.mu.RUnlock()
+
+ for index := range ds.vCPUs {
+ mask := atomic.SwapUint64(&ds.vCPUs[index], 0)
+ if mask != 0 {
+ for bit := 0; bit < 64; bit++ {
+ if mask&(1<<uint64(bit)) == 0 {
+ continue
+ }
+ id := 64*index + bit
+ fn(m.vCPUsByID[id])
+ }
+ }
+ }
+}
+
+// mark marks the given vCPU as dirty and returns whether it was previously
+// clean. Being previously clean implies that a flush is needed on entry.
+func (ds *dirtySet) mark(c *vCPU) bool {
+ index := uint64(c.id) / 64
+ bit := uint64(1) << uint(c.id%64)
+
+ oldValue := atomic.LoadUint64(&ds.vCPUs[index])
+ if oldValue&bit != 0 {
+ return false // Not clean.
+ }
+
+ // Set the bit unilaterally, and ensure that a flush takes place. Note
+ // that it's possible for races to occur here, but since the flush is
+ // taking place long after these lines there's no race in practice.
+ atomicbitops.OrUint64(&ds.vCPUs[index], bit)
+ return true // Previously clean.
+}
+
+// addressSpace is a wrapper for PageTables.
+type addressSpace struct {
+ platform.NoAddressSpaceIO
+
+ // mu is the lock for modifications to the address space.
+ //
+ // Note that the page tables themselves are not locked.
+ mu sync.Mutex
+
+ // machine is the underlying machine.
+ machine *machine
+
+ // pageTables are for this particular address space.
+ pageTables *pagetables.PageTables
+
+ // dirtySet is the set of dirty vCPUs.
+ dirtySet *dirtySet
+}
+
+// invalidate is the implementation for Invalidate.
+func (as *addressSpace) invalidate() {
+ as.dirtySet.forEach(as.machine, func(c *vCPU) {
+ if c.active.get() == as { // If this happens to be active,
+ c.BounceToKernel() // ... force a kernel transition.
+ }
+ })
+}
+
+// Invalidate interrupts all dirty contexts.
+func (as *addressSpace) Invalidate() {
+ as.mu.Lock()
+ defer as.mu.Unlock()
+ as.invalidate()
+}
+
+// Touch adds the given vCPU to the dirty list.
+//
+// The return value indicates whether a flush is required.
+func (as *addressSpace) Touch(c *vCPU) bool {
+ return as.dirtySet.mark(c)
+}
+
+type hostMapEntry struct {
+ addr uintptr
+ length uintptr
+}
+
+func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
+ for m.length > 0 {
+ physical, length, ok := translateToPhysical(m.addr)
+ if !ok {
+ panic("unable to translate segment")
+ }
+ if length > m.length {
+ length = m.length
+ }
+
+ // Ensure that this map has physical mappings. If the page does
+ // not have physical mappings, the KVM module may inject
+ // spurious exceptions when emulation fails (i.e. it tries to
+ // emulate because the RIP is pointed at those pages).
+ as.machine.mapPhysical(physical, length)
+
+ // Install the page table mappings. Note that the ordering is
+ // important; if the pagetable mappings were installed before
+ // ensuring the physical pages were available, then some other
+ // thread could theoretically access them.
+ //
+ // Due to the way KVM's shadow paging implementation works,
+ // modifications to the page tables while in host mode may not
+ // be trapped, leading to the shadow pages being out of sync.
+ // Therefore, we need to ensure that we are in guest mode for
+ // page table modifications. See the call to bluepill, below.
+ as.machine.retryInGuest(func() {
+ inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
+ AccessType: at,
+ User: true,
+ }, physical) || inv
+ })
+ m.addr += length
+ m.length -= length
+ addr += usermem.Addr(length)
+ }
+
+ return inv
+}
+
+// MapFile implements platform.AddressSpace.MapFile.
+func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error {
+ as.mu.Lock()
+ defer as.mu.Unlock()
+
+ // Get mappings in the sentry's address space, which are guaranteed to be
+ // valid as long as a reference is held on the mapped pages (which is in
+ // turn required by AddressSpace.MapFile precondition).
+ //
+ // If precommit is true, we will touch mappings to commit them, so ensure
+ // that mappings are readable from sentry context.
+ //
+ // We don't execute from application file-mapped memory, and guest page
+ // tables don't care if we have execute permission (but they do need pages
+ // to be readable).
+ bs, err := f.MapInternal(fr, usermem.AccessType{
+ Read: at.Read || at.Execute || precommit,
+ Write: at.Write,
+ })
+ if err != nil {
+ return err
+ }
+
+ // Map the mappings in the sentry's address space (guest physical memory)
+ // into the application's address space (guest virtual memory).
+ inv := false
+ for !bs.IsEmpty() {
+ b := bs.Head()
+ bs = bs.Tail()
+ // Since fr was page-aligned, b should also be page-aligned. We do the
+ // lookup in our host page tables for this translation.
+ if precommit {
+ s := b.ToSlice()
+ for i := 0; i < len(s); i += usermem.PageSize {
+ _ = s[i] // Touch to commit.
+ }
+ }
+ prev := as.mapHost(addr, hostMapEntry{
+ addr: b.Addr(),
+ length: uintptr(b.Len()),
+ }, at)
+ inv = inv || prev
+ addr += usermem.Addr(b.Len())
+ }
+ if inv {
+ as.invalidate()
+ }
+
+ return nil
+}
+
+// Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
+func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) {
+ as.mu.Lock()
+ defer as.mu.Unlock()
+
+ // See above re: retryInGuest.
+ var prev bool
+ as.machine.retryInGuest(func() {
+ prev = as.pageTables.Unmap(addr, uintptr(length)) || prev
+ })
+ if prev {
+ as.invalidate()
+
+ // Recycle any freed intermediate pages.
+ as.pageTables.Allocator.Recycle()
+ }
+}
+
+// Release releases the page tables.
+func (as *addressSpace) Release() {
+ as.Unmap(0, ^uint64(0))
+
+ // Free all pages from the allocator.
+ as.pageTables.Allocator.(allocator).base.Drain()
+
+ // Drop all cached machine references.
+ as.machine.dropPageTables(as.pageTables)
+}
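
The dirty set above packs one bit per vCPU into 64-bit words; the following standalone sketch (not part of this change) spells out the index arithmetic shared by mark and forEach:

    package example

    // dirtyBitPosition returns the word index and bit mask that track vCPU id,
    // mirroring dirtySet.mark: vCPU 70 lives in word 1, bit 6 (70 == 64*1 + 6).
    func dirtyBitPosition(id int) (index uint64, bit uint64) {
        return uint64(id) / 64, uint64(1) << uint(id%64)
    }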
diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/allocator.go
new file mode 100644
index 000000000..42bcc9733
--- /dev/null
+++ b/pkg/sentry/platform/kvm/allocator.go
@@ -0,0 +1,76 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+)
+
+type allocator struct {
+ base *pagetables.RuntimeAllocator
+}
+
+// newAllocator returns a new, initialized allocator.
+func newAllocator() allocator {
+ return allocator{
+ base: pagetables.NewRuntimeAllocator(),
+ }
+}
+
+// NewPTEs implements pagetables.Allocator.NewPTEs.
+//
+//go:nosplit
+func (a allocator) NewPTEs() *pagetables.PTEs {
+ return a.base.NewPTEs()
+}
+
+// PhysicalFor returns the physical address for a set of PTEs.
+//
+//go:nosplit
+func (a allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr {
+ virtual := a.base.PhysicalFor(ptes)
+ physical, _, ok := translateToPhysical(virtual)
+ if !ok {
+ panic(fmt.Sprintf("PhysicalFor failed for %p", ptes))
+ }
+ return physical
+}
+
+// LookupPTEs implements pagetables.Allocator.LookupPTEs.
+//
+//go:nosplit
+func (a allocator) LookupPTEs(physical uintptr) *pagetables.PTEs {
+ virtualStart, physicalStart, _, ok := calculateBluepillFault(physical)
+ if !ok {
+ panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical))
+ }
+ return a.base.LookupPTEs(virtualStart + (physical - physicalStart))
+}
+
+// FreePTEs implements pagetables.Allocator.FreePTEs.
+//
+//go:nosplit
+func (a allocator) FreePTEs(ptes *pagetables.PTEs) {
+ a.base.FreePTEs(ptes)
+}
+
+// Recycle implements pagetables.Allocator.Recycle.
+//
+//go:nosplit
+func (a allocator) Recycle() {
+ a.base.Recycle()
+}
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
new file mode 100644
index 000000000..a926e6f8b
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -0,0 +1,82 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+ "reflect"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy"
+)
+
+// bluepill enters guest mode.
+func bluepill(*vCPU)
+
+// sighandler is the signal entry point.
+func sighandler()
+
+// dieTrampoline is the assembly trampoline. This calls dieHandler.
+//
+// This uses an architecture-specific calling convention, documented in
+// dieArchSetup and the assembly implementation for dieTrampoline.
+func dieTrampoline()
+
+var (
+ // savedHandler is a pointer to the previous handler.
+ //
+ // This is called by bluepillHandler.
+ savedHandler uintptr
+
+ // dieTrampolineAddr is the address of dieTrampoline.
+ dieTrampolineAddr uintptr
+)
+
+// dieHandler is called by dieTrampoline.
+//
+//go:nosplit
+func dieHandler(c *vCPU) {
+ throw(c.dieState.message)
+}
+
+// die is called to set the vCPU up to panic.
+//
+// This loads vCPU state, and sets up a call for the trampoline.
+//
+//go:nosplit
+func (c *vCPU) die(context *arch.SignalContext64, msg string) {
+ // Save the death message, which will be thrown.
+ c.dieState.message = msg
+
+ // Reload all registers to have an accurate stack trace when we return
+ // to host mode. This means that the stack should be unwound correctly.
+ if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 {
+ throw(msg)
+ }
+
+ // Setup the trampoline.
+ dieArchSetup(c, context, &c.dieState.guestRegs)
+}
+
+func init() {
+ // Install the handler.
+ if err := safecopy.ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil {
+ panic(fmt.Sprintf("Unable to set handler for signal %d: %v", syscall.SIGSEGV, err))
+ }
+
+ // Extract the address for the trampoline.
+ dieTrampolineAddr = reflect.ValueOf(dieTrampoline).Pointer()
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go
new file mode 100644
index 000000000..c258408f9
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.go
@@ -0,0 +1,141 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+)
+
+var (
+ // bounceSignal is the signal used for bouncing KVM.
+ //
+ // We use SIGCHLD because it is not masked by the runtime, and
+ // it will be ignored properly by other parts of the kernel.
+ bounceSignal = syscall.SIGCHLD
+
+ // bounceSignalMask has only bounceSignal set.
+ bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1))
+
+ // bounce is the interrupt vector used to return to the kernel.
+ bounce = uint32(ring0.VirtualizationException)
+)
+
+// redpill on amd64 invokes a syscall with -1.
+//
+//go:nosplit
+func redpill() {
+ syscall.RawSyscall(^uintptr(0), 0, 0, 0)
+}
+
+// bluepillArchEnter is called during bluepillEnter.
+//
+//go:nosplit
+func bluepillArchEnter(context *arch.SignalContext64) *vCPU {
+ c := vCPUPtr(uintptr(context.Rax))
+ regs := c.CPU.Registers()
+ regs.R8 = context.R8
+ regs.R9 = context.R9
+ regs.R10 = context.R10
+ regs.R11 = context.R11
+ regs.R12 = context.R12
+ regs.R13 = context.R13
+ regs.R14 = context.R14
+ regs.R15 = context.R15
+ regs.Rdi = context.Rdi
+ regs.Rsi = context.Rsi
+ regs.Rbp = context.Rbp
+ regs.Rbx = context.Rbx
+ regs.Rdx = context.Rdx
+ regs.Rax = context.Rax
+ regs.Rcx = context.Rcx
+ regs.Rsp = context.Rsp
+ regs.Rip = context.Rip
+ regs.Eflags = context.Eflags
+ regs.Eflags &^= uint64(ring0.KernelFlagsClear)
+ regs.Eflags |= ring0.KernelFlagsSet
+ regs.Cs = uint64(ring0.Kcode)
+ regs.Ds = uint64(ring0.Udata)
+ regs.Es = uint64(ring0.Udata)
+ regs.Ss = uint64(ring0.Kdata)
+ return c
+}
+
+// KernelSyscall handles kernel syscalls.
+//
+//go:nosplit
+func (c *vCPU) KernelSyscall() {
+ regs := c.Registers()
+ if regs.Rax != ^uint64(0) {
+ regs.Rip -= 2 // Rewind.
+ }
+ // We only trigger a bluepill entry in the bluepill function, and can
+ // therefore be guaranteed that there is no floating point state to be
+ // loaded on resuming from halt. We only worry about saving on exit.
+ ring0.SaveFloatingPoint((*byte)(c.floatingPointState))
+ ring0.Halt()
+ ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment.
+}
+
+// KernelException handles kernel exceptions.
+//
+//go:nosplit
+func (c *vCPU) KernelException(vector ring0.Vector) {
+ regs := c.Registers()
+ if vector == ring0.Vector(bounce) {
+ // These should not interrupt kernel execution; point the Rip
+ // to zero to ensure that we get a reasonable panic when we
+ // attempt to return and a full stack trace.
+ regs.Rip = 0
+ }
+ // See above.
+ ring0.SaveFloatingPoint((*byte)(c.floatingPointState))
+ ring0.Halt()
+ ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment.
+}
+
+// bluepillArchExit is called when returning from guest mode (see the
+// _KVM_EXIT_HLT case in bluepillHandler).
+//
+//go:nosplit
+func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
+ regs := c.CPU.Registers()
+ context.R8 = regs.R8
+ context.R9 = regs.R9
+ context.R10 = regs.R10
+ context.R11 = regs.R11
+ context.R12 = regs.R12
+ context.R13 = regs.R13
+ context.R14 = regs.R14
+ context.R15 = regs.R15
+ context.Rdi = regs.Rdi
+ context.Rsi = regs.Rsi
+ context.Rbp = regs.Rbp
+ context.Rbx = regs.Rbx
+ context.Rdx = regs.Rdx
+ context.Rax = regs.Rax
+ context.Rcx = regs.Rcx
+ context.Rsp = regs.Rsp
+ context.Rip = regs.Rip
+ context.Eflags = regs.Eflags
+
+ // Set the context pointer to the saved floating point state. This is
+ // where the guest data has been serialized, the kernel will restore
+ // from this new pointer value.
+ context.Fpstate = uint64(uintptrValue((*byte)(c.floatingPointState)))
+}
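
The bounceSignalMask above follows the Linux rt_sigtimedwait convention that signal N occupies bit N-1 of the sigset; a sketch (not part of this change):

    package example

    import "syscall"

    // sigsetMaskFor returns the kernel sigset bit for sig; for SIGCHLD (17 on
    // Linux/amd64) this is 1 << 16, matching bounceSignalMask.
    func sigsetMaskFor(sig syscall.Signal) uint64 {
        return uint64(1) << (uint64(sig) - 1)
    }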
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s
new file mode 100644
index 000000000..2bc34a435
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.s
@@ -0,0 +1,93 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// VCPU_CPU is the location of the CPU in the vCPU struct.
+//
+// This is guaranteed to be zero.
+#define VCPU_CPU 0x0
+
+// CPU_SELF is the self reference in ring0's percpu.
+//
+// This is guaranteed to be zero.
+#define CPU_SELF 0x0
+
+// Context offsets.
+//
+// Only limited use of the context is done in the assembly stub below, most is
+// done in the Go handlers. However, the RIP must be examined.
+#define CONTEXT_RAX 0x90
+#define CONTEXT_RIP 0xa8
+#define CONTEXT_FP 0xe0
+
+// CLI is the literal byte for the disable interrupts instruction.
+//
+// This is checked as the source of the fault.
+#define CLI $0xfa
+
+// See bluepill.go.
+TEXT ·bluepill(SB),NOSPLIT,$0
+begin:
+ MOVQ vcpu+0(FP), AX
+ LEAQ VCPU_CPU(AX), BX
+ BYTE CLI;
+check_vcpu:
+ MOVQ CPU_SELF(GS), CX
+ CMPQ BX, CX
+ JE right_vCPU
+wrong_vcpu:
+ CALL ·redpill(SB)
+ JMP begin
+right_vCPU:
+ RET
+
+// sighandler: see bluepill.go for documentation.
+//
+// The arguments are the following:
+//
+// DI - The signal number.
+// SI - Pointer to siginfo_t structure.
+// DX - Pointer to ucontext structure.
+//
+TEXT ·sighandler(SB),NOSPLIT,$0
+ // Check if the signal is from the kernel.
+ MOVQ $0x80, CX
+ CMPL CX, 0x8(SI)
+ JNE fallback
+
+ // Check if RIP is disable interrupts.
+ MOVQ CONTEXT_RIP(DX), CX
+ CMPQ CX, $0x0
+ JE fallback
+ CMPB 0(CX), CLI
+ JNE fallback
+
+ // Call the bluepillHandler.
+ PUSHQ DX // First argument (context).
+ CALL ·bluepillHandler(SB) // Call the handler.
+ POPQ DX // Discard the argument.
+ RET
+
+fallback:
+ // Jump to the previous signal handler.
+ XORQ CX, CX
+ MOVQ ·savedHandler(SB), AX
+ JMP AX
+
+// dieTrampoline: see bluepill.go, bluepill_amd64_unsafe.go for documentation.
+TEXT ·dieTrampoline(SB),NOSPLIT,$0
+ PUSHQ BX // First argument (vCPU).
+ PUSHQ AX // Fake the old RIP as caller.
+ JMP ·dieHandler(SB)
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
new file mode 100644
index 000000000..92fde7ee0
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -0,0 +1,56 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+)
+
+// bluepillArchContext returns the arch-specific context.
+//
+//go:nosplit
+func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
+ return &((*arch.UContext64)(context).MContext)
+}
+
+// dieArchSetup initializes the state for dieTrampoline.
+//
+// The amd64 dieTrampoline requires the vCPU to be set in BX, and the last RIP
+// to be in AX. The trampoline then simulates a call to dieHandler from the
+// provided RIP.
+//
+//go:nosplit
+func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
+ // If the vCPU is in user mode, we set the stack to the stored stack
+ // value in the vCPU itself. We don't want to unwind the user stack.
+ if guestRegs.RFLAGS&ring0.UserFlagsSet == ring0.UserFlagsSet {
+ regs := c.CPU.Registers()
+ context.Rax = regs.Rax
+ context.Rsp = regs.Rsp
+ context.Rbp = regs.Rbp
+ } else {
+ context.Rax = guestRegs.RIP
+ context.Rsp = guestRegs.RSP
+ context.Rbp = guestRegs.RBP
+ context.Eflags = guestRegs.RFLAGS
+ }
+ context.Rbx = uint64(uintptr(unsafe.Pointer(c)))
+ context.Rip = uint64(dieTrampolineAddr)
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go
new file mode 100644
index 000000000..3c452f5ba
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_fault.go
@@ -0,0 +1,127 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+const (
+ // faultBlockSize is the size used for servicing memory faults.
+ //
+ // This should be large enough to avoid frequent faults and avoid using
+ // all available KVM slots (~512), but small enough that KVM does not
+ // complain about slot sizes (~4GB). See handleBluepillFault for how
+ // this block is used.
+ faultBlockSize = 2 << 30
+
+ // faultBlockMask is the mask for the fault blocks.
+ //
+ // This must be typed to avoid overflow complaints (ugh).
+ faultBlockMask = ^uintptr(faultBlockSize - 1)
+)
+
+// yield yields the CPU.
+//
+//go:nosplit
+func yield() {
+ syscall.RawSyscall(syscall.SYS_SCHED_YIELD, 0, 0, 0)
+}
+
+// calculateBluepillFault calculates the fault address range.
+//
+//go:nosplit
+func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, length uintptr, ok bool) {
+ alignedPhysical := physical &^ uintptr(usermem.PageSize-1)
+ for _, pr := range physicalRegions {
+ end := pr.physical + pr.length
+ if physical < pr.physical || physical >= end {
+ continue
+ }
+
+ // Adjust the block to match our size.
+ physicalStart = alignedPhysical & faultBlockMask
+ if physicalStart < pr.physical {
+ // Bound the starting point to the start of the region.
+ physicalStart = pr.physical
+ }
+ virtualStart = pr.virtual + (physicalStart - pr.physical)
+ physicalEnd := physicalStart + faultBlockSize
+ if physicalEnd > end {
+ physicalEnd = end
+ }
+ length = physicalEnd - physicalStart
+ return virtualStart, physicalStart, length, true
+ }
+
+ return 0, 0, 0, false
+}
+
+// handleBluepillFault handles a physical fault.
+//
+// The corresponding virtual address is returned. This may throw on error.
+//
+//go:nosplit
+func handleBluepillFault(m *machine, physical uintptr) (uintptr, bool) {
+ // Paging fault: we need to map the underlying physical pages for this
+ // fault. This all has to be done in this function because we're in a
+ // signal handler context. (We can't call any functions that might
+ // split the stack.)
+ virtualStart, physicalStart, length, ok := calculateBluepillFault(physical)
+ if !ok {
+ return 0, false
+ }
+
+ // Set the KVM slot.
+ //
+ // First, we need to acquire the exclusive right to set a slot. See
+ // machine.nextSlot for information about the protocol.
+ slot := atomic.SwapUint32(&m.nextSlot, ^uint32(0))
+ for slot == ^uint32(0) {
+ yield() // Race with another call.
+ slot = atomic.SwapUint32(&m.nextSlot, ^uint32(0))
+ }
+ errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart)
+ if errno == 0 {
+ // Successfully added region; we can increment nextSlot and
+ // allow another set to proceed here.
+ atomic.StoreUint32(&m.nextSlot, slot+1)
+ return virtualStart + (physical - physicalStart), true
+ }
+
+ // Release our slot (still available).
+ atomic.StoreUint32(&m.nextSlot, slot)
+
+ switch errno {
+ case syscall.EEXIST:
+ // The region already exists. It's possible that we raced with
+ // another vCPU here. We just revert nextSlot and return true,
+ // because this must have been satisfied by some other vCPU.
+ return virtualStart + (physical - physicalStart), true
+ case syscall.EINVAL:
+ throw("set memory region failed; out of slots")
+ case syscall.ENOMEM:
+ throw("set memory region failed: out of memory")
+ case syscall.EFAULT:
+ throw("set memory region failed: invalid physical range")
+ default:
+ throw("set memory region failed: unknown reason")
+ }
+
+ panic("unreachable")
+}
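
A worked sketch (not part of this change) of the alignment arithmetic in calculateBluepillFault, using the 2GB faultBlockSize defined above; the sample address is arbitrary:

    package example

    const (
        exampleBlockSize = 2 << 30
        exampleBlockMask = ^uintptr(exampleBlockSize - 1)
        examplePageMask  = uintptr(0xfff) // 4K pages.
    )

    // alignToFaultBlock rounds physical down to its page, then to the
    // enclosing 2GB fault block, as handleBluepillFault requires.
    // E.g. 0x923456789 -> page 0x923456000 -> block 0x900000000.
    func alignToFaultBlock(physical uintptr) uintptr {
        return (physical &^ examplePageMask) & exampleBlockMask
    }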
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
new file mode 100644
index 000000000..7e8e9f42a
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -0,0 +1,213 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build go1.12
+// +build !go1.14
+
+// Check go:linkname function signatures when updating Go version.
+
+package kvm
+
+import (
+ "sync/atomic"
+ "syscall"
+ "unsafe"
+)
+
+//go:linkname throw runtime.throw
+func throw(string)
+
+// vCPUPtr returns a CPU for the given address.
+//
+//go:nosplit
+func vCPUPtr(addr uintptr) *vCPU {
+ return (*vCPU)(unsafe.Pointer(addr))
+}
+
+// bytePtr returns a bytePtr for the given address.
+//
+//go:nosplit
+func bytePtr(addr uintptr) *byte {
+ return (*byte)(unsafe.Pointer(addr))
+}
+
+// uintptrValue returns a uintptr for the given address.
+//
+//go:nosplit
+func uintptrValue(addr *byte) uintptr {
+ return (uintptr)(unsafe.Pointer(addr))
+}
+
+// bluepillHandler is called from the signal stub.
+//
+// The world may be stopped while this is executing, and it executes on the
+// signal stack. It should only execute raw system calls and functions that are
+// explicitly marked go:nosplit.
+//
+//go:nosplit
+func bluepillHandler(context unsafe.Pointer) {
+ // Sanitize the registers; interrupts must always be disabled.
+ c := bluepillArchEnter(bluepillArchContext(context))
+
+ // Increment the number of switches.
+ atomic.AddUint32(&c.switches, 1)
+
+ // Mark this as guest mode.
+ switch atomic.SwapUint32(&c.state, vCPUGuest|vCPUUser) {
+ case vCPUUser: // Expected case.
+ case vCPUUser | vCPUWaiter:
+ c.notify()
+ default:
+ throw("invalid state")
+ }
+
+ for {
+ switch _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0); errno {
+ case 0: // Expected case.
+ case syscall.EINTR:
+ // First, we process whatever pending signal
+ // interrupted KVM. Since we're in a signal handler
+ // currently, all signals are masked and the signal
+ // must have been delivered directly to this thread.
+ sig, _, errno := syscall.RawSyscall6(
+ syscall.SYS_RT_SIGTIMEDWAIT,
+ uintptr(unsafe.Pointer(&bounceSignalMask)),
+ 0, // siginfo.
+ 0, // timeout.
+ 8, // sigset size.
+ 0, 0)
+ if errno != 0 {
+ throw("error waiting for pending signal")
+ }
+ if sig != uintptr(bounceSignal) {
+ throw("unexpected signal")
+ }
+
+ // Check whether the current state of the vCPU is ready
+ // for interrupt injection. Because we don't have a
+ // PIC, we can't inject an interrupt while they are
+ // masked. We need to request a window if it's not
+ // ready.
+ if c.runData.readyForInterruptInjection == 0 {
+ c.runData.requestInterruptWindow = 1
+ continue // Rerun vCPU.
+ } else {
+ // Force injection below; the vCPU is ready.
+ c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
+ }
+ case syscall.EFAULT:
+ // If a fault is not serviceable due to the host
+ // backing pages having page permissions, instead of an
+ // MMIO exit we receive EFAULT from the run ioctl. We
+ // always inject an NMI here since we may be in kernel
+ // mode and have interrupts disabled.
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_NMI, 0); errno != 0 {
+ throw("NMI injection failed")
+ }
+ continue // Rerun vCPU.
+ default:
+ throw("run failed")
+ }
+
+ switch c.runData.exitReason {
+ case _KVM_EXIT_EXCEPTION:
+ c.die(bluepillArchContext(context), "exception")
+ return
+ case _KVM_EXIT_IO:
+ c.die(bluepillArchContext(context), "I/O")
+ return
+ case _KVM_EXIT_INTERNAL_ERROR:
+ // An internal error is typically thrown when emulation
+ // fails. This can occur via the MMIO path below (and
+ // it might fail because we have multiple regions that
+ // are not mapped). We would actually prefer that no
+ // emulation occur, and don't mind at all if it fails.
+ case _KVM_EXIT_HYPERCALL:
+ c.die(bluepillArchContext(context), "hypercall")
+ return
+ case _KVM_EXIT_DEBUG:
+ c.die(bluepillArchContext(context), "debug")
+ return
+ case _KVM_EXIT_HLT:
+ // Copy out registers.
+ bluepillArchExit(c, bluepillArchContext(context))
+
+ // Return to the vCPUReady state; notify any waiters.
+ user := atomic.LoadUint32(&c.state) & vCPUUser
+ switch atomic.SwapUint32(&c.state, user) {
+ case user | vCPUGuest: // Expected case.
+ case user | vCPUGuest | vCPUWaiter:
+ c.notify()
+ default:
+ throw("invalid state")
+ }
+ return
+ case _KVM_EXIT_MMIO:
+ // Increment the fault count.
+ atomic.AddUint32(&c.faults, 1)
+
+ // For MMIO, the physical address is the first data item.
+ physical := uintptr(c.runData.data[0])
+ virtual, ok := handleBluepillFault(c.machine, physical)
+ if !ok {
+ c.die(bluepillArchContext(context), "invalid physical address")
+ return
+ }
+
+ // We now need to fill in the data appropriately. KVM
+ // expects us to provide the result of the given MMIO
+ // operation in the runData struct. This is safe
+ // because, if a fault occurs here, the same fault
+ // would have occurred in guest mode. The kernel should
+ // not create invalid page table mappings.
+ data := (*[8]byte)(unsafe.Pointer(&c.runData.data[1]))
+ length := (uintptr)((uint32)(c.runData.data[2]))
+ write := (uint8)(((c.runData.data[2] >> 32) & 0xff)) != 0
+ for i := uintptr(0); i < length; i++ {
+ b := bytePtr(uintptr(virtual) + i)
+ if write {
+ // Write to the given address.
+ *b = data[i]
+ } else {
+ // Read from the given address.
+ data[i] = *b
+ }
+ }
+ case _KVM_EXIT_IRQ_WINDOW_OPEN:
+ // Interrupt: we must have requested an interrupt
+ // window; set the interrupt line.
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_INTERRUPT,
+ uintptr(unsafe.Pointer(&bounce))); errno != 0 {
+ throw("interrupt injection failed")
+ }
+ // Clear previous injection request.
+ c.runData.requestInterruptWindow = 0
+ case _KVM_EXIT_SHUTDOWN:
+ c.die(bluepillArchContext(context), "shutdown")
+ return
+ case _KVM_EXIT_FAIL_ENTRY:
+ c.die(bluepillArchContext(context), "entry failed")
+ return
+ default:
+ c.die(bluepillArchContext(context), "unknown")
+ return
+ }
+ }
+}
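
In the MMIO exit above, the access length is packed into the low 32 bits of runData.data[2] and the write flag into the byte above them; a standalone sketch (not part of this change) of that decode:

    package example

    // decodeMMIO mirrors the unpacking in bluepillHandler: the low 32 bits of
    // the word carry the access length in bytes, and the next byte is
    // non-zero for a write.
    func decodeMMIO(word uint64) (length uintptr, write bool) {
        length = uintptr(uint32(word))
        write = uint8(word>>32) != 0
        return length, write
    }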
diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go
new file mode 100644
index 000000000..0eb0020f7
--- /dev/null
+++ b/pkg/sentry/platform/kvm/context.go
@@ -0,0 +1,87 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/interrupt"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// context is an implementation of the platform context.
+//
+// This is a thin wrapper around the machine.
+type context struct {
+ // machine is the parent machine, and is immutable.
+ machine *machine
+
+ // info is the arch.SignalInfo cached for this context.
+ info arch.SignalInfo
+
+ // interrupt is the interrupt context.
+ interrupt interrupt.Forwarder
+}
+
+// Switch runs the provided context in the given address space.
+func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) {
+ localAS := as.(*addressSpace)
+
+ // Grab a vCPU.
+ cpu := c.machine.Get()
+
+ // Enable interrupts (i.e. calls to vCPU.Notify).
+ if !c.interrupt.Enable(cpu) {
+ c.machine.Put(cpu) // Already preempted.
+ return nil, usermem.NoAccess, platform.ErrContextInterrupt
+ }
+
+ // Set the active address space.
+ //
+ // This must be done prior to the call to Touch below. If the address
+ // space is invalidated between this line and the call below, we will
+ // flag on entry anyways. When the active address space below is
+ // cleared, it indicates that we don't need an explicit interrupt and
+ // that the flush can occur naturally on the next user entry.
+ cpu.active.set(localAS)
+
+ // Prepare switch options.
+ switchOpts := ring0.SwitchOpts{
+ Registers: &ac.StateData().Regs,
+ FloatingPointState: (*byte)(ac.FloatingPointData()),
+ PageTables: localAS.pageTables,
+ Flush: localAS.Touch(cpu),
+ FullRestore: ac.FullRestore(),
+ }
+
+ // Take the blue pill.
+ at, err := cpu.SwitchToUser(switchOpts, &c.info)
+
+ // Clear the address space.
+ cpu.active.set(nil)
+
+ // Release resources.
+ c.machine.Put(cpu)
+
+ // All done.
+ c.interrupt.Disable()
+ return &c.info, at, err
+}
+
+// Interrupt interrupts the running context.
+func (c *context) Interrupt() {
+ c.interrupt.NotifyInterrupt()
+}
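
For illustration, a sketch (not part of this change) of how a caller might drive Switch; the loop shape and handleSignal are hypothetical simplifications of the sentry's task loop:

    package example

    import (
        "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
        "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
        "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
    )

    func run(ctx platform.Context, as platform.AddressSpace, ac arch.Context,
        handleSignal func(*arch.SignalInfo, usermem.AccessType)) error {
        for {
            si, at, err := ctx.Switch(as, ac, 0)
            switch err {
            case platform.ErrContextInterrupt:
                // ctx.Interrupt() fired; check for pending work and re-enter.
                continue
            case platform.ErrContextSignal:
                handleSignal(si, at) // Page fault or signal to deliver.
            default:
                return err
            }
        }
    }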
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
new file mode 100644
index 000000000..ed0521c3f
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -0,0 +1,143 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kvm provides a kvm-based implementation of the platform interface.
+package kvm
+
+import (
+ "fmt"
+ "os"
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// KVM represents a lightweight VM context.
+type KVM struct {
+ platform.NoCPUPreemptionDetection
+
+ // machine is the backing VM.
+ machine *machine
+}
+
+var (
+ globalOnce sync.Once
+ globalErr error
+)
+
+// OpenDevice opens the KVM device at /dev/kvm and returns the File.
+func OpenDevice() (*os.File, error) {
+ f, err := os.OpenFile("/dev/kvm", syscall.O_RDWR, 0)
+ if err != nil {
+ return nil, fmt.Errorf("error opening /dev/kvm: %v", err)
+ }
+ return f, nil
+}
+
+// New returns a new KVM-based implementation of the platform interface.
+func New(deviceFile *os.File) (*KVM, error) {
+ fd := deviceFile.Fd()
+
+ // Ensure global initialization is done.
+ globalOnce.Do(func() {
+ physicalInit()
+ globalErr = updateSystemValues(int(fd))
+ ring0.Init(cpuid.HostFeatureSet())
+ })
+ if globalErr != nil {
+ return nil, globalErr
+ }
+
+ // Create a new VM fd.
+ vm, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, fd, _KVM_CREATE_VM, 0)
+ if errno != 0 {
+ return nil, fmt.Errorf("creating VM: %v", errno)
+ }
+ // We are done with the device file.
+ deviceFile.Close()
+
+ // Create a VM context.
+ machine, err := newMachine(int(vm))
+ if err != nil {
+ return nil, err
+ }
+
+ // All set.
+ return &KVM{
+ machine: machine,
+ }, nil
+}
+
+// SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO.
+func (*KVM) SupportsAddressSpaceIO() bool {
+ return false
+}
+
+// CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace.
+func (*KVM) CooperativelySchedulesAddressSpace() bool {
+ return false
+}
+
+// MapUnit implements platform.Platform.MapUnit.
+func (*KVM) MapUnit() uint64 {
+ // We greedily create PTEs in MapFile, so extremely large mappings can
+ // be expensive. They are not _that_ expensive, since we allow super
+ // pages, but things can still get out of hand if you're creating
+ // multi-terabyte mappings. For this reason, we limit mappings to an
+ // arbitrary 16MB.
+ return 16 << 20
+}
+
+// MinUserAddress returns the lowest available address.
+func (*KVM) MinUserAddress() usermem.Addr {
+ return usermem.PageSize
+}
+
+// MaxUserAddress returns the first address that may not be used.
+func (*KVM) MaxUserAddress() usermem.Addr {
+ return usermem.Addr(ring0.MaximumUserAddress)
+}
+
+// NewAddressSpace returns a new pagetable root.
+func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) {
+ // Allocate page tables and install system mappings.
+ pageTables := pagetables.New(newAllocator())
+ applyPhysicalRegions(func(pr physicalRegion) bool {
+ // Map the kernel in the upper half.
+ pageTables.Map(
+ usermem.Addr(ring0.KernelStartAddress|pr.virtual),
+ pr.length,
+ pagetables.MapOpts{AccessType: usermem.AnyAccess},
+ pr.physical)
+ return true // Keep iterating.
+ })
+
+ // Return the new address space.
+ return &addressSpace{
+ machine: k.machine,
+ pageTables: pageTables,
+ dirtySet: k.machine.newDirtySet(),
+ }, nil, nil
+}
+
+// NewContext returns an interruptible context.
+func (k *KVM) NewContext() platform.Context {
+ return &context{
+ machine: k.machine,
+ }
+}
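
A construction sketch (not part of this change) tying OpenDevice and New together; note that New consumes the device file:

    package example

    import "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"

    // newKVMPlatform opens /dev/kvm and builds the platform; the device file
    // is closed by New once the VM fd has been created.
    func newKVMPlatform() (*kvm.KVM, error) {
        deviceFile, err := kvm.OpenDevice()
        if err != nil {
            return nil, err
        }
        return kvm.New(deviceFile)
    }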
diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go
new file mode 100644
index 000000000..61493ccaf
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_amd64.go
@@ -0,0 +1,213 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+)
+
+// userMemoryRegion is a region of physical memory.
+//
+// This mirrors kvm_memory_region.
+type userMemoryRegion struct {
+ slot uint32
+ flags uint32
+ guestPhysAddr uint64
+ memorySize uint64
+ userspaceAddr uint64
+}
+
+// userRegs represents KVM user registers.
+//
+// This mirrors kvm_regs.
+type userRegs struct {
+ RAX uint64
+ RBX uint64
+ RCX uint64
+ RDX uint64
+ RSI uint64
+ RDI uint64
+ RSP uint64
+ RBP uint64
+ R8 uint64
+ R9 uint64
+ R10 uint64
+ R11 uint64
+ R12 uint64
+ R13 uint64
+ R14 uint64
+ R15 uint64
+ RIP uint64
+ RFLAGS uint64
+}
+
+// systemRegs represents KVM system registers.
+//
+// This mirrors kvm_sregs.
+type systemRegs struct {
+ CS segment
+ DS segment
+ ES segment
+ FS segment
+ GS segment
+ SS segment
+ TR segment
+ LDT segment
+ GDT descriptor
+ IDT descriptor
+ CR0 uint64
+ CR2 uint64
+ CR3 uint64
+ CR4 uint64
+ CR8 uint64
+ EFER uint64
+ apicBase uint64
+ interruptBitmap [(_KVM_NR_INTERRUPTS + 63) / 64]uint64
+}
+
+// segment is the expanded form of a segment register.
+//
+// This mirrors kvm_segment.
+type segment struct {
+ base uint64
+ limit uint32
+ selector uint16
+ typ uint8
+ present uint8
+ DPL uint8
+ DB uint8
+ S uint8
+ L uint8
+ G uint8
+ AVL uint8
+ unusable uint8
+ _ uint8
+}
+
+// Clear clears the segment and marks it unusable.
+func (s *segment) Clear() {
+ *s = segment{unusable: 1}
+}
+
+// selector is a segment selector.
+type selector uint16
+
+// tobool converts a flag value to a 0/1 byte.
+func tobool(x ring0.SegmentDescriptorFlags) uint8 {
+ if x != 0 {
+ return 1
+ }
+ return 0
+}
+
+// Load loads the segment described by d into the segment s.
+//
+// The argument sel is recorded as the segment selector index.
+func (s *segment) Load(d *ring0.SegmentDescriptor, sel ring0.Selector) {
+ flag := d.Flags()
+ if flag&ring0.SegmentDescriptorPresent == 0 {
+ s.Clear()
+ return
+ }
+ s.base = uint64(d.Base())
+ s.limit = d.Limit()
+ s.typ = uint8((flag>>8)&0xF) | 1
+ s.S = tobool(flag & ring0.SegmentDescriptorSystem)
+ s.DPL = uint8(d.DPL())
+ s.present = tobool(flag & ring0.SegmentDescriptorPresent)
+ s.AVL = tobool(flag & ring0.SegmentDescriptorAVL)
+ s.L = tobool(flag & ring0.SegmentDescriptorLong)
+ s.DB = tobool(flag & ring0.SegmentDescriptorDB)
+ s.G = tobool(flag & ring0.SegmentDescriptorG)
+ if s.L != 0 {
+ s.limit = 0xffffffff
+ }
+ s.unusable = 0
+ s.selector = uint16(sel)
+}
+
+// descriptor describes a region of physical memory.
+//
+// It corresponds to the pseudo-descriptor used in the x86 LGDT and LIDT
+// instructions, and mirrors kvm_dtable.
+type descriptor struct {
+ base uint64
+ limit uint16
+ _ [3]uint16
+}
+
+// modelControlRegister is an MSR entry.
+//
+// This mirrors kvm_msr_entry.
+type modelControlRegister struct {
+ index uint32
+ _ uint32
+ data uint64
+}
+
+// modelControlRegisters is a collection of MSRs.
+//
+// This mirrors kvm_msrs.
+type modelControlRegisters struct {
+ nmsrs uint32
+ _ uint32
+ entries [16]modelControlRegister
+}
+
+// runData is the run structure. This may be mapped for synchronous register
+// access (although that does not appear to be supported by all kernels).
+//
+// This mirrors kvm_run.
+type runData struct {
+ requestInterruptWindow uint8
+ _ [7]uint8
+
+ exitReason uint32
+ readyForInterruptInjection uint8
+ ifFlag uint8
+ _ [2]uint8
+
+ cr8 uint64
+ apicBase uint64
+
+ // This is the union data for exits. Interpretation depends entirely on
+ // the exitReason above (see vCPU code for more information).
+ data [32]uint64
+}
+
+// cpuidEntry is a single CPUID entry.
+//
+// This mirrors kvm_cpuid_entry2.
+type cpuidEntry struct {
+ function uint32
+ index uint32
+ flags uint32
+ eax uint32
+ ebx uint32
+ ecx uint32
+ edx uint32
+ _ [3]uint32
+}
+
+// cpuidEntries is a collection of CPUID entries.
+//
+// This mirrors kvm_cpuid2.
+type cpuidEntries struct {
+ nr uint32
+ _ uint32
+ entries [_KVM_NR_CPUID_ENTRIES]cpuidEntry
+}
diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go
new file mode 100644
index 000000000..46c4b9113
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go
@@ -0,0 +1,77 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "fmt"
+ "syscall"
+ "unsafe"
+)
+
+var (
+ runDataSize int
+ hasGuestPCID bool
+ cpuidSupported = cpuidEntries{nr: _KVM_NR_CPUID_ENTRIES}
+)
+
+func updateSystemValues(fd int) error {
+ // Extract the mmap size.
+ sz, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_GET_VCPU_MMAP_SIZE, 0)
+ if errno != 0 {
+ return fmt.Errorf("getting VCPU mmap size: %v", errno)
+ }
+
+ // Save the data.
+ runDataSize = int(sz)
+
+ // Must do the dance to figure out the number of entries.
+ _, _, errno = syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(fd),
+ _KVM_GET_SUPPORTED_CPUID,
+ uintptr(unsafe.Pointer(&cpuidSupported)))
+ if errno != 0 && errno != syscall.ENOMEM {
+ // Some other error occurred.
+ return fmt.Errorf("getting supported CPUID: %v", errno)
+ }
+
+ // The number should now be correct.
+ _, _, errno = syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(fd),
+ _KVM_GET_SUPPORTED_CPUID,
+ uintptr(unsafe.Pointer(&cpuidSupported)))
+ if errno != 0 {
+ // Didn't work with the right number.
+ return fmt.Errorf("getting supported CPUID (2nd attempt): %v", errno)
+ }
+
+ // Calculate whether guestPCID is supported.
+ //
+ // FIXME(ascannell): These should go through the much more pleasant
+ // cpuid package interfaces, once a way to accept raw kvm CPUID entries
+ // is plumbed (or some rough equivalent).
+ for i := 0; i < int(cpuidSupported.nr); i++ {
+ entry := cpuidSupported.entries[i]
+ if entry.function == 1 && entry.index == 0 && entry.ecx&(1<<17) != 0 {
+ hasGuestPCID = true // Found matching PCID in guest feature set.
+ }
+ }
+
+ // Success.
+ return nil
+}
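+
+// Illustrative only (editorial sketch, not part of this change): the PCID
+// probe above is one instance of a general CPUID feature lookup against
+// cpuidSupported. A hedged generalization; the helper name is hypothetical.
+func cpuidFeature(function, index uint32, reg int, bit uint) bool {
+ for i := 0; i < int(cpuidSupported.nr); i++ {
+ e := cpuidSupported.entries[i]
+ if e.function != function || e.index != index {
+ continue
+ }
+ // reg selects eax=0, ebx=1, ecx=2, edx=3.
+ regs := [4]uint32{e.eax, e.ebx, e.ecx, e.edx}
+ if regs[reg]&(1<<bit) != 0 {
+ return true
+ }
+ }
+ return false
+}
+
+// hasGuestPCID above is then equivalent to cpuidFeature(1, 0, 2, 17).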
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
new file mode 100644
index 000000000..d05f05c29
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -0,0 +1,64 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+// KVM ioctls.
+//
+// Only the ioctls we need in Go appear here; some additional ioctls are used
+// within the assembly stubs (KVM_INTERRUPT, etc.).
+const (
+ _KVM_CREATE_VM = 0xae01
+ _KVM_GET_VCPU_MMAP_SIZE = 0xae04
+ _KVM_CREATE_VCPU = 0xae41
+ _KVM_SET_TSS_ADDR = 0xae47
+ _KVM_RUN = 0xae80
+ _KVM_NMI = 0xae9a
+ _KVM_CHECK_EXTENSION = 0xae03
+ _KVM_INTERRUPT = 0x4004ae86
+ _KVM_SET_MSRS = 0x4008ae89
+ _KVM_SET_USER_MEMORY_REGION = 0x4020ae46
+ _KVM_SET_REGS = 0x4090ae82
+ _KVM_SET_SREGS = 0x4138ae84
+ _KVM_GET_REGS = 0x8090ae81
+ _KVM_GET_SUPPORTED_CPUID = 0xc008ae05
+ _KVM_SET_CPUID2 = 0x4008ae90
+ _KVM_SET_SIGNAL_MASK = 0x4004ae8b
+)
+
+// KVM exit reasons.
+const (
+ _KVM_EXIT_EXCEPTION = 0x1
+ _KVM_EXIT_IO = 0x2
+ _KVM_EXIT_HYPERCALL = 0x3
+ _KVM_EXIT_DEBUG = 0x4
+ _KVM_EXIT_HLT = 0x5
+ _KVM_EXIT_MMIO = 0x6
+ _KVM_EXIT_IRQ_WINDOW_OPEN = 0x7
+ _KVM_EXIT_SHUTDOWN = 0x8
+ _KVM_EXIT_FAIL_ENTRY = 0x9
+ _KVM_EXIT_INTERNAL_ERROR = 0x11
+)
+
+// KVM capability options.
+const (
+ _KVM_CAP_MAX_VCPUS = 0x42
+)
+
+// KVM limits.
+const (
+ _KVM_NR_VCPUS = 0xff
+ _KVM_NR_INTERRUPTS = 0x100
+ _KVM_NR_CPUID_ENTRIES = 0x100
+)
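+
+// Illustrative only (editorial note, not part of this change): these values
+// follow the standard Linux ioctl encoding, dir(2)|size(14)|type(8)|nr(8),
+// with type 0xae (KVMIO). A hedged sketch reconstructing one of them; the
+// constant and helper names below are hypothetical.
+const _IOC_WRITE = 1
+
+func kvmIOW(nr, size uintptr) uintptr {
+ return (uintptr(_IOC_WRITE) << 30) | (size << 16) | (0xae << 8) | nr
+}
+
+// kvmIOW(0x46, 32) == _KVM_SET_USER_MEMORY_REGION (0x4020ae46), since
+// userMemoryRegion is 32 bytes (two uint32 fields and three uint64 fields).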
diff --git a/pkg/sentry/platform/kvm/kvm_state_autogen.go b/pkg/sentry/platform/kvm/kvm_state_autogen.go
new file mode 100755
index 000000000..5ab0e0735
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package kvm
+
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
new file mode 100644
index 000000000..f5953b96e
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -0,0 +1,525 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+ "runtime"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/atomicbitops"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// machine contains state associated with the VM as a whole.
+type machine struct {
+ // fd is the vm fd.
+ fd int
+
+ // nextSlot is the next slot for setMemoryRegion.
+ //
+ // This must be accessed atomically. If nextSlot is ^uint32(0), then
+ // slots are currently being updated, and the caller should retry.
+ nextSlot uint32
+
+ // kernel is the set of global structures.
+ kernel ring0.Kernel
+
+ // mappingCache is used for mapPhysical.
+ mappingCache sync.Map
+
+ // mu protects vCPUs.
+ mu sync.RWMutex
+
+ // available is notified when vCPUs are available.
+ available sync.Cond
+
+ // vCPUs are the machine vCPUs.
+ //
+ // These are populated dynamically.
+ vCPUs map[uint64]*vCPU
+
+ // vCPUsByID are the machine vCPUs, indexed by vCPU ID.
+ vCPUsByID map[int]*vCPU
+
+ // maxVCPUs is the maximum number of vCPUs supported by the machine.
+ maxVCPUs int
+}
+
+const (
+ // vCPUReady is an alias for all the below clear.
+ vCPUReady uint32 = 0
+
+ // vCPUUser indicates that the vCPU is in or about to enter user mode.
+ vCPUUser uint32 = 1 << 0
+
+ // vCPUGuest indicates the vCPU is in guest mode.
+ vCPUGuest uint32 = 1 << 1
+
+ // vCPUWaiter indicates that there is a waiter.
+ //
+ // If this is set, then notify must be called on any state transitions.
+ vCPUWaiter uint32 = 1 << 2
+)
+
+// vCPU is a single KVM vCPU.
+type vCPU struct {
+ // CPU is the kernel CPU data.
+ //
+ // This must be the first element of this structure; it is referenced
+ // by the bluepill code (see bluepill_amd64.s).
+ ring0.CPU
+
+ // id is the vCPU id.
+ id int
+
+ // fd is the vCPU fd.
+ fd int
+
+ // tid is the last set tid.
+ tid uint64
+
+ // switches is a count of world switches (informational only).
+ switches uint32
+
+ // faults is a count of world faults (informational only).
+ faults uint32
+
+ // state is the vCPU state.
+ //
+ // This is a bitmask of the three fields (vCPU*) described above.
+ state uint32
+
+ // runData for this vCPU.
+ runData *runData
+
+ // machine associated with this vCPU.
+ machine *machine
+
+ // active is the current addressSpace: this is set and read atomically,
+ // and is used to elide unnecessary interrupts due to invalidations.
+ active atomicAddressSpace
+
+ // vCPUArchState is the architecture-specific state.
+ vCPUArchState
+
+ dieState dieState
+}
+
+type dieState struct {
+ // message is thrown from die.
+ message string
+
+ // guestRegs is used to store register state during vCPU.die() to
+ // prevent allocation inside a nosplit function.
+ guestRegs userRegs
+}
+
+// newVCPU creates and returns a new vCPU.
+//
+// Precondition: mu must be held.
+func (m *machine) newVCPU() *vCPU {
+ id := len(m.vCPUs)
+
+ // Create the vCPU.
+ fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
+ if errno != 0 {
+ panic(fmt.Sprintf("error creating new vCPU: %v", errno))
+ }
+
+ c := &vCPU{
+ id: id,
+ fd: int(fd),
+ machine: m,
+ }
+ c.CPU.Init(&m.kernel, c)
+ m.vCPUsByID[c.id] = c
+
+ // Ensure the signal mask is correct.
+ if err := c.setSignalMask(); err != nil {
+ panic(fmt.Sprintf("error setting signal mask: %v", err))
+ }
+
+ // Map the run data.
+ runData, err := mapRunData(int(fd))
+ if err != nil {
+ panic(fmt.Sprintf("error mapping run data: %v", err))
+ }
+ c.runData = runData
+
+ // Initialize architecture state.
+ if err := c.initArchState(); err != nil {
+ panic(fmt.Sprintf("error initialization vCPU state: %v", err))
+ }
+
+ return c // Done.
+}
+
+// newMachine returns a new VM context.
+func newMachine(vm int) (*machine, error) {
+ // Create the machine.
+ m := &machine{
+ fd: vm,
+ vCPUs: make(map[uint64]*vCPU),
+ vCPUsByID: make(map[int]*vCPU),
+ }
+ m.available.L = &m.mu
+ m.kernel.Init(ring0.KernelOpts{
+ PageTables: pagetables.New(newAllocator()),
+ })
+
+ maxVCPUs, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
+ if errno != 0 {
+ m.maxVCPUs = _KVM_NR_VCPUS
+ } else {
+ m.maxVCPUs = int(maxVCPUs)
+ }
+ log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
+
+ // Apply the physical mappings. Note that these mappings may point to
+ // guest physical addresses that are not actually available. These
+ // physical pages are mapped on demand, see kernel_unsafe.go.
+ applyPhysicalRegions(func(pr physicalRegion) bool {
+ // Map everything in the lower half.
+ m.kernel.PageTables.Map(
+ usermem.Addr(pr.virtual),
+ pr.length,
+ pagetables.MapOpts{AccessType: usermem.AnyAccess},
+ pr.physical)
+
+ // And keep everything in the upper half.
+ m.kernel.PageTables.Map(
+ usermem.Addr(ring0.KernelStartAddress|pr.virtual),
+ pr.length,
+ pagetables.MapOpts{AccessType: usermem.AnyAccess},
+ pr.physical)
+
+ return true // Keep iterating.
+ })
+
+ // Ensure that the currently mapped virtual regions are actually
+ // available in the VM. Note that this doesn't guarantee no future
+ // faults, however it should guarantee that everything is available to
+ // ensure successful vCPU entry.
+ applyVirtualRegions(func(vr virtualRegion) {
+ if excludeVirtualRegion(vr) {
+ return // skip region.
+ }
+ for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
+ physical, length, ok := translateToPhysical(virtual)
+ if !ok {
+ // This must be an invalid region that was
+ // knocked out by creation of the physical map.
+ return
+ }
+ if virtual+length > vr.virtual+vr.length {
+ // Cap the length to the end of the area.
+ length = vr.virtual + vr.length - virtual
+ }
+
+ // Ensure the physical range is mapped.
+ m.mapPhysical(physical, length)
+ virtual += length
+ }
+ })
+
+ // Initialize architecture state.
+ if err := m.initArchState(); err != nil {
+ m.Destroy()
+ return nil, err
+ }
+
+ // Ensure the machine is cleaned up properly.
+ runtime.SetFinalizer(m, (*machine).Destroy)
+ return m, nil
+}
+
+// mapPhysical checks for the mapping of a physical range, and installs one if
+// not available. This attempts to be efficient for calls in the hot path.
+//
+// This panics on error.
+func (m *machine) mapPhysical(physical, length uintptr) {
+ for end := physical + length; physical < end; {
+ _, physicalStart, length, ok := calculateBluepillFault(physical)
+ if !ok {
+ // Should never happen.
+ panic("mapPhysical on unknown physical address")
+ }
+
+ if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok {
+ // Not present in the cache; requires setting the slot.
+ if _, ok := handleBluepillFault(m, physical); !ok {
+ panic("handleBluepillFault failed")
+ }
+ }
+
+ // Move to the next chunk.
+ physical = physicalStart + length
+ }
+}
+
+// Destroy frees associated resources.
+//
+// Destroy should only be called once all active users of the machine are gone.
+// The machine object should not be used after calling Destroy.
+//
+// Precondition: all vCPUs must be returned to the machine.
+func (m *machine) Destroy() {
+ runtime.SetFinalizer(m, nil)
+
+ // Destroy vCPUs.
+ for _, c := range m.vCPUs {
+ // Ensure the vCPU is not still running in guest mode. This is
+ // possible iff teardown has been done by other threads, and
+ // somehow a single thread has not executed any system calls.
+ c.BounceToHost()
+
+ // Note that the runData may not be mapped if an error occurs
+ // during the middle of initialization.
+ if c.runData != nil {
+ if err := unmapRunData(c.runData); err != nil {
+ panic(fmt.Sprintf("error unmapping rundata: %v", err))
+ }
+ }
+ if err := syscall.Close(int(c.fd)); err != nil {
+ panic(fmt.Sprintf("error closing vCPU fd: %v", err))
+ }
+ }
+
+ // vCPUs are gone: teardown machine state.
+ if err := syscall.Close(m.fd); err != nil {
+ panic(fmt.Sprintf("error closing VM fd: %v", err))
+ }
+}
+
+// Get gets an available vCPU.
+func (m *machine) Get() *vCPU {
+ runtime.LockOSThread()
+ tid := procid.Current()
+ m.mu.RLock()
+
+ // Check for an exact match.
+ if c := m.vCPUs[tid]; c != nil {
+ c.lock()
+ m.mu.RUnlock()
+ return c
+ }
+
+ // The happy path failed. We now proceed to acquire an exclusive lock
+ // (because the vCPU map may change), and scan all available vCPUs.
+ m.mu.RUnlock()
+ m.mu.Lock()
+
+ for {
+ // Scan for an available vCPU.
+ for origTID, c := range m.vCPUs {
+ if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
+ delete(m.vCPUs, origTID)
+ m.vCPUs[tid] = c
+ m.mu.Unlock()
+ c.loadSegments(tid)
+ return c
+ }
+ }
+
+ // Create a new vCPU (maybe).
+ if len(m.vCPUs) < m.maxVCPUs {
+ c := m.newVCPU()
+ c.lock()
+ m.vCPUs[tid] = c
+ m.mu.Unlock()
+ c.loadSegments(tid)
+ return c
+ }
+
+ // Scan for something not in user mode.
+ for origTID, c := range m.vCPUs {
+ if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) {
+ continue
+ }
+
+ // The vCPU is not able to transition to
+ // vCPUGuest|vCPUUser or to vCPUUser because that
+ // transition requires holding the machine mutex, as we
+ // do now. There is no path to register a waiter on
+ // just the vCPUReady state.
+ for {
+ c.waitUntilNot(vCPUGuest | vCPUWaiter)
+ if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
+ break
+ }
+ }
+
+ // Steal the vCPU.
+ delete(m.vCPUs, origTID)
+ m.vCPUs[tid] = c
+ m.mu.Unlock()
+ c.loadSegments(tid)
+ return c
+ }
+
+ // Everything is executing in user mode. Wait until something
+ // is available. Note that signaling the condition variable
+ // will have the extra effect of kicking the vCPUs out of guest
+ // mode if that's where they were.
+ m.available.Wait()
+ }
+}
+
+// Put puts the current vCPU.
+func (m *machine) Put(c *vCPU) {
+ c.unlock()
+ runtime.UnlockOSThread()
+ m.available.Signal()
+}
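+
+// Illustrative only (editorial sketch, not part of this change): callers
+// pair Get and Put around a window of exclusive vCPU ownership; Get also
+// pins the goroutine to its OS thread for the duration. A hedged usage
+// sketch; the helper name is hypothetical.
+func (m *machine) withVCPU(fn func(c *vCPU)) {
+ c := m.Get() // Locks the OS thread and acquires a vCPU.
+ defer m.Put(c) // Releases it and signals any waiters.
+ fn(c)
+}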
+
+// newDirtySet returns a new dirty set.
+func (m *machine) newDirtySet() *dirtySet {
+ return &dirtySet{
+ vCPUs: make([]uint64, (m.maxVCPUs+63)/64),
+ }
+}
+
+// lock marks the vCPU as in user mode.
+//
+// This should only be called directly when known to be safe, i.e. when
+// the vCPU is owned by the current TID with no chance of theft.
+//
+//go:nosplit
+func (c *vCPU) lock() {
+ atomicbitops.OrUint32(&c.state, vCPUUser)
+}
+
+// unlock clears the vCPUUser bit.
+//
+//go:nosplit
+func (c *vCPU) unlock() {
+ if atomic.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) {
+ // Happy path: no exits are forced, and we can continue
+ // executing on our merry way with a single atomic access.
+ return
+ }
+
+ // Clear the lock.
+ origState := atomic.LoadUint32(&c.state)
+ atomicbitops.AndUint32(&c.state, ^vCPUUser)
+ switch origState {
+ case vCPUUser:
+ // Normal state.
+ case vCPUUser | vCPUGuest | vCPUWaiter:
+ // Force a transition: this must trigger a notification when we
+ // return from guest mode.
+ c.notify()
+ case vCPUUser | vCPUWaiter:
+ // Waiting for the lock to be released; the responsibility is
+ // on us to notify the waiter and clear the associated bit.
+ atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
+ c.notify()
+ default:
+ panic("invalid state")
+ }
+}
+
+// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
+//
+//go:nosplit
+func (c *vCPU) NotifyInterrupt() {
+ c.BounceToKernel()
+}
+
+// pid is used below in bounce.
+var pid = syscall.Getpid()
+
+// bounce forces a return to the kernel or to host mode.
+//
+// This effectively unwinds the state machine.
+func (c *vCPU) bounce(forceGuestExit bool) {
+ for {
+ switch state := atomic.LoadUint32(&c.state); state {
+ case vCPUReady, vCPUWaiter:
+ // There is nothing to be done, we're already in the
+ // kernel pre-acquisition. The Bounce criteria have
+ // been satisfied.
+ return
+ case vCPUUser:
+ // We need to register a waiter for the actual guest
+ // transition. When the transition takes place, then we
+ // can inject an interrupt to ensure a return to host
+ // mode.
+ atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter)
+ case vCPUUser | vCPUWaiter:
+ // Wait for the transition to guest mode. This should
+ // come from the bluepill handler.
+ c.waitUntilNot(state)
+ case vCPUGuest, vCPUUser | vCPUGuest:
+ if state == vCPUGuest && !forceGuestExit {
+ // The vCPU is already not acquired, so there's
+ // no need to do a fresh injection here.
+ return
+ }
+ // The vCPU is in user or kernel mode. Attempt to
+ // register a notification on change.
+ if !atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter) {
+ break // Retry.
+ }
+ for {
+ // We need to spin here until the signal is
+ // delivered, because Tgkill can return EAGAIN
+ // under memory pressure. Since we already
+ // marked ourselves as a waiter, we need to
+ // ensure that a signal is actually delivered.
+ if err := syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal); err == nil {
+ break
+ } else if err.(syscall.Errno) == syscall.EAGAIN {
+ continue
+ } else {
+ // Nothing else should be returned by tgkill.
+ panic(fmt.Sprintf("unexpected tgkill error: %v", err))
+ }
+ }
+ case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter:
+ if state == vCPUGuest|vCPUWaiter && !forceGuestExit {
+ // See above.
+ return
+ }
+ // Wait for the transition. This again should happen
+ // from the bluepill handler, but on the way out.
+ c.waitUntilNot(state)
+ default:
+ // Should not happen: the above is exhaustive.
+ panic("invalid state")
+ }
+ }
+}
+
+// BounceToKernel ensures that the vCPU bounces back to the kernel.
+//
+//go:nosplit
+func (c *vCPU) BounceToKernel() {
+ c.bounce(false)
+}
+
+// BounceToHost ensures that the vCPU is in host mode.
+//
+//go:nosplit
+func (c *vCPU) BounceToHost() {
+ c.bounce(true)
+}
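+
+// Illustrative only (editorial note, not part of this change): a hedged
+// summary of the state machine driven by lock, unlock, and bounce above.
+// All transitions are CAS-based on c.state, roughly:
+//
+// vCPUReady --lock--> vCPUUser --bluepill--> vCPUUser|vCPUGuest
+// vCPUUser|vCPUGuest --guest exit--> vCPUUser --unlock--> vCPUReady
+//
+// bounce ORs vCPUWaiter into intermediate states and blocks in waitUntilNot
+// until the transitioning side clears the bit and calls notify.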
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
new file mode 100644
index 000000000..b6821122a
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -0,0 +1,357 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "fmt"
+ "reflect"
+ "runtime/debug"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// initArchState initializes architecture-specific state.
+func (m *machine) initArchState() error {
+ // Set the legacy TSS address. This address is covered by the reserved
+ // range (up to 4GB); in fact, this is one of the main reasons the
+ // reserved range exists.
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(m.fd),
+ _KVM_SET_TSS_ADDR,
+ uintptr(reservedMemory-(3*usermem.PageSize))); errno != 0 {
+ return errno
+ }
+
+ // Enable CPUID faulting, if possible. Note that this also serves as a
+ // basic platform sanity test, since we will enter guest mode for the
+ // first time here. The recovery is necessary, since if we fail to read
+ // the platform info register, we will return to host mode and
+ // ultimately need to handle a segmentation fault.
+ old := debug.SetPanicOnFault(true)
+ defer func() {
+ recover()
+ debug.SetPanicOnFault(old)
+ }()
+ m.retryInGuest(func() {
+ ring0.SetCPUIDFaulting(true)
+ })
+
+ return nil
+}
+
+type vCPUArchState struct {
+ // PCIDs is the set of PCIDs for this vCPU.
+ //
+ // This starts above fixedKernelPCID.
+ PCIDs *pagetables.PCIDs
+
+ // floatingPointState is the floating point state buffer used in guest
+ // to host transitions. See usage in bluepill_amd64.go.
+ floatingPointState *arch.FloatingPointData
+}
+
+const (
+ // fixedKernelPCID is a fixed kernel PCID used for the kernel page
+ // tables. We must start allocating user PCIDs above this in order to
+ // avoid any conflict (see below).
+ fixedKernelPCID = 1
+
+ // poolPCIDs is the number of PCIDs to record in the database. As this
+ // grows, assignment can take longer, since it is a simple linear scan.
+ // Beyond a relatively small number, there are likely few performance
+ // benefits, since the TLB has likely long since lost any translations
+ // from more than a few PCIDs in the past.
+ poolPCIDs = 8
+)
+
+// dropPageTables drops cached page table entries.
+func (m *machine) dropPageTables(pt *pagetables.PageTables) {
+ m.mu.Lock()
+ defer m.mu.Unlock()
+
+ // Clear from all PCIDs.
+ for _, c := range m.vCPUs {
+ c.PCIDs.Drop(pt)
+ }
+}
+
+// initArchState initializes architecture-specific state.
+func (c *vCPU) initArchState() error {
+ var (
+ kernelSystemRegs systemRegs
+ kernelUserRegs userRegs
+ )
+
+ // Set base control registers.
+ kernelSystemRegs.CR0 = c.CR0()
+ kernelSystemRegs.CR4 = c.CR4()
+ kernelSystemRegs.EFER = c.EFER()
+
+ // Set the IDT & GDT in the registers.
+ kernelSystemRegs.IDT.base, kernelSystemRegs.IDT.limit = c.IDT()
+ kernelSystemRegs.GDT.base, kernelSystemRegs.GDT.limit = c.GDT()
+ kernelSystemRegs.CS.Load(&ring0.KernelCodeSegment, ring0.Kcode)
+ kernelSystemRegs.DS.Load(&ring0.UserDataSegment, ring0.Udata)
+ kernelSystemRegs.ES.Load(&ring0.UserDataSegment, ring0.Udata)
+ kernelSystemRegs.SS.Load(&ring0.KernelDataSegment, ring0.Kdata)
+ kernelSystemRegs.FS.Load(&ring0.UserDataSegment, ring0.Udata)
+ kernelSystemRegs.GS.Load(&ring0.UserDataSegment, ring0.Udata)
+ tssBase, tssLimit, tss := c.TSS()
+ kernelSystemRegs.TR.Load(tss, ring0.Tss)
+ kernelSystemRegs.TR.base = tssBase
+ kernelSystemRegs.TR.limit = uint32(tssLimit)
+
+ // Point to kernel page tables, with no initial PCID.
+ kernelSystemRegs.CR3 = c.machine.kernel.PageTables.CR3(false, 0)
+
+ // Initialize the PCID database.
+ if hasGuestPCID {
+ // Note that NewPCIDs may return a nil table here, in which
+ // case we simply don't use PCID support (see below). In
+ // practice, this should not happen, however.
+ c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs)
+ }
+
+ // Set the CPUID; this is required before setting system registers,
+ // since KVM will reject several CR4 bits if the CPUID does not
+ // indicate the support is available.
+ if err := c.setCPUID(); err != nil {
+ return err
+ }
+
+ // Set the entrypoint for the kernel.
+ kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer())
+ kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer())
+ kernelUserRegs.RFLAGS = ring0.KernelFlagsSet
+
+ // Set the system registers.
+ if err := c.setSystemRegisters(&kernelSystemRegs); err != nil {
+ return err
+ }
+
+ // Set the user registers.
+ if err := c.setUserRegisters(&kernelUserRegs); err != nil {
+ return err
+ }
+
+ // Allocate some floating point state save area for the local vCPU.
+ // This will be saved prior to leaving the guest, and we restore from
+ // this always. We cannot use the pointer in the context alone because
+ // we don't know how large the area there is in reality.
+ c.floatingPointState = arch.NewFloatingPointData()
+
+ // Set the time offset to the host native time.
+ return c.setSystemTime()
+}
+
+// nonCanonical generates the signal return for a non-canonical address.
+//
+//go:nosplit
+func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.AccessType, error) {
+ *info = arch.SignalInfo{
+ Signo: signal,
+ Code: arch.SignalInfoKernel,
+ }
+ info.SetAddr(addr) // Include address.
+ return usermem.NoAccess, platform.ErrContextSignal
+}
+
+// fault generates an appropriate fault return.
+//
+//go:nosplit
+func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) {
+ bluepill(c) // Probably no-op, but may not be.
+ faultAddr := ring0.ReadCR2()
+ code, user := c.ErrorCode()
+ if !user {
+ // The last fault serviced by this CPU was not a user
+ // fault, so we can't reliably trust the faultAddr or
+ // the code provided here. We need to re-execute.
+ return usermem.NoAccess, platform.ErrContextInterrupt
+ }
+ // Reset the pointed SignalInfo.
+ *info = arch.SignalInfo{Signo: signal}
+ info.SetAddr(uint64(faultAddr))
+ accessType := usermem.AccessType{
+ Read: code&(1<<1) == 0,
+ Write: code&(1<<1) != 0,
+ Execute: code&(1<<4) != 0,
+ }
+ if !accessType.Write && !accessType.Execute {
+ info.Code = 1 // SEGV_MAPERR.
+ } else {
+ info.Code = 2 // SEGV_ACCERR.
+ }
+ return accessType, platform.ErrContextSignal
+}
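+
+// Illustrative only (editorial note, not part of this change): the error
+// code bits consumed in fault above are the architectural x86 page-fault
+// error code bits:
+//
+// bit 1 (W/R) - set if the access was a write.
+// bit 4 (I/D) - set if the access was an instruction fetch.
+//
+// The Read/Write/Execute decoding above follows directly from these two
+// bits; in this code, reads are reported as SEGV_MAPERR while writes and
+// fetches are reported as SEGV_ACCERR.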
+
+// SwitchToUser unpacks the architectural details and switches to user mode.
+func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) {
+ // Check for canonical addresses.
+ if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Rip) {
+ return nonCanonical(regs.Rip, int32(syscall.SIGSEGV), info)
+ } else if !ring0.IsCanonical(regs.Rsp) {
+ return nonCanonical(regs.Rsp, int32(syscall.SIGBUS), info)
+ } else if !ring0.IsCanonical(regs.Fs_base) {
+ return nonCanonical(regs.Fs_base, int32(syscall.SIGBUS), info)
+ } else if !ring0.IsCanonical(regs.Gs_base) {
+ return nonCanonical(regs.Gs_base, int32(syscall.SIGBUS), info)
+ }
+
+ // Assign PCIDs.
+ if c.PCIDs != nil {
+ var requireFlushPCID bool // Force a flush?
+ switchOpts.UserPCID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables)
+ switchOpts.KernelPCID = fixedKernelPCID
+ switchOpts.Flush = switchOpts.Flush || requireFlushPCID
+ }
+
+ // See below.
+ var vector ring0.Vector
+
+ // Past this point, stack growth can cause system calls (and a break
+ // from guest mode). So we need to ensure that between the bluepill
+ // call here and the switch call immediately below, no additional
+ // allocations occur.
+ entersyscall()
+ bluepill(c)
+ vector = c.CPU.SwitchToUser(switchOpts)
+ exitsyscall()
+
+ switch vector {
+ case ring0.Syscall, ring0.SyscallInt80:
+ // Fast path: system call executed.
+ return usermem.NoAccess, nil
+
+ case ring0.PageFault:
+ return c.fault(int32(syscall.SIGSEGV), info)
+
+ case ring0.Debug, ring0.Breakpoint:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGTRAP),
+ Code: 1, // TRAP_BRKPT (breakpoint).
+ }
+ info.SetAddr(switchOpts.Registers.Rip) // Include address.
+ return usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.GeneralProtectionFault,
+ ring0.SegmentNotPresent,
+ ring0.BoundRangeExceeded,
+ ring0.InvalidTSS,
+ ring0.StackSegmentFault:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGSEGV),
+ Code: arch.SignalInfoKernel,
+ }
+ info.SetAddr(switchOpts.Registers.Rip) // Include address.
+ if vector == ring0.GeneralProtectionFault {
+ // When CPUID faulting is enabled, we will generate a #GP(0) when
+ // userspace executes a CPUID instruction. This is handled above,
+ // because we need to be able to map and read user memory.
+ return usermem.AccessType{}, platform.ErrContextSignalCPUID
+ }
+ return usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.InvalidOpcode:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGILL),
+ Code: 1, // ILL_ILLOPC (illegal opcode).
+ }
+ info.SetAddr(switchOpts.Registers.Rip) // Include address.
+ return usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.DivideByZero:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGFPE),
+ Code: 1, // FPE_INTDIV (divide by zero).
+ }
+ info.SetAddr(switchOpts.Registers.Rip) // Include address.
+ return usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.Overflow:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGFPE),
+ Code: 2, // FPE_INTOVF (integer overflow).
+ }
+ info.SetAddr(switchOpts.Registers.Rip) // Include address.
+ return usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.X87FloatingPointException,
+ ring0.SIMDFloatingPointException:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGFPE),
+ Code: 7, // FPE_FLTINV (invalid operation).
+ }
+ info.SetAddr(switchOpts.Registers.Rip) // Include address.
+ return usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.Vector(bounce): // ring0.VirtualizationException
+ return usermem.NoAccess, platform.ErrContextInterrupt
+
+ case ring0.AlignmentCheck:
+ *info = arch.SignalInfo{
+ Signo: int32(syscall.SIGBUS),
+ Code: 2, // BUS_ADRERR (physical address does not exist).
+ }
+ return usermem.NoAccess, platform.ErrContextSignal
+
+ case ring0.NMI:
+ // An NMI is generated only when a fault is not serviceable by
+ // KVM itself, so we think some mapping is writable but it's
+ // really not. This could happen, e.g. if some file is
+ // truncated (and would generate a SIGBUS) and we map it
+ // directly into the instance.
+ return c.fault(int32(syscall.SIGBUS), info)
+
+ case ring0.DeviceNotAvailable,
+ ring0.DoubleFault,
+ ring0.CoprocessorSegmentOverrun,
+ ring0.MachineCheck,
+ ring0.SecurityException:
+ fallthrough
+ default:
+ panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
+ }
+}
+
+// retryInGuest runs the given function in guest mode.
+//
+// If the function does not complete in guest mode (e.g. because execution of
+// a system call was forced by a GC stall), then it will be retried. The
+// given function must be idempotent as a result of the retry mechanism.
+func (m *machine) retryInGuest(fn func()) {
+ c := m.Get()
+ defer m.Put(c)
+ for {
+ c.ClearErrorCode() // See below.
+ bluepill(c) // Force guest mode.
+ fn() // Execute the given function.
+ _, user := c.ErrorCode()
+ if user {
+ // If user is set, then we haven't bailed back to host
+ // mode via a kernel exception or system call. We
+ // consider the full function to have executed in guest
+ // mode and we can return.
+ break
+ }
+ }
+}
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
new file mode 100644
index 000000000..06a2e3b0c
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -0,0 +1,161 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "fmt"
+ "sync/atomic"
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+)
+
+// setMemoryRegion initializes a region.
+//
+// This may be called from bluepillHandler, and therefore returns an errno
+// directly (instead of wrapping in an error) to avoid allocations.
+//
+//go:nosplit
+func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno {
+ userRegion := userMemoryRegion{
+ slot: uint32(slot),
+ flags: 0,
+ guestPhysAddr: uint64(physical),
+ memorySize: uint64(length),
+ userspaceAddr: uint64(virtual),
+ }
+
+ // Set the region.
+ _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(m.fd),
+ _KVM_SET_USER_MEMORY_REGION,
+ uintptr(unsafe.Pointer(&userRegion)))
+ return errno
+}
+
+// loadSegments copies the current segments.
+//
+// This may be called from within the signal context and throws on error.
+//
+//go:nosplit
+func (c *vCPU) loadSegments(tid uint64) {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_ARCH_PRCTL,
+ linux.ARCH_GET_FS,
+ uintptr(unsafe.Pointer(&c.CPU.Registers().Fs_base)),
+ 0); errno != 0 {
+ throw("getting FS segment")
+ }
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_ARCH_PRCTL,
+ linux.ARCH_GET_GS,
+ uintptr(unsafe.Pointer(&c.CPU.Registers().Gs_base)),
+ 0); errno != 0 {
+ throw("getting GS segment")
+ }
+ atomic.StoreUint64(&c.tid, tid)
+}
+
+// setCPUID sets the CPUID to be used by the guest.
+func (c *vCPU) setCPUID() error {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_CPUID2,
+ uintptr(unsafe.Pointer(&cpuidSupported))); errno != 0 {
+ return fmt.Errorf("error setting CPUID: %v", errno)
+ }
+ return nil
+}
+
+// setSystemTime sets the TSC for the vCPU.
+//
+// This has to make the call many times in order to minimize the intrinsic
+// error in the offset. Unfortunately KVM does not expose a relative offset via
+// the API, so this is an approximation. We do this via an iterative algorithm.
+// This has the advantage that it can generally deal with highly variable
+// system call times and should converge on the correct offset.
+func (c *vCPU) setSystemTime() error {
+ const (
+ _MSR_IA32_TSC = 0x00000010
+ calibrateTries = 10
+ )
+ registers := modelControlRegisters{
+ nmsrs: 1,
+ }
+ registers.entries[0] = modelControlRegister{
+ index: _MSR_IA32_TSC,
+ }
+ target := uint64(^uint32(0))
+ for done := 0; done < calibrateTries; {
+ start := uint64(time.Rdtsc())
+ registers.entries[0].data = start + target
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_MSRS,
+ uintptr(unsafe.Pointer(&registers))); errno != 0 {
+ return fmt.Errorf("error setting system time: %v", errno)
+ }
+ // See if this is our new minimum call time. Note that this
+ // serves two functions: first, we make sure that we are
+ // accurately predicting the offset we need to set. Second, we
+ // don't want to do the final set on a slow call, which could
+ // produce a really bad result. So we count only attempts
+ // within +/- 6.25% of our minimum.
+ end := uint64(time.Rdtsc())
+ if end < start {
+ continue // Totally bogus.
+ }
+ half := (end - start) / 2
+ if half < target {
+ target = half
+ }
+ if (half - target) < target/8 {
+ done++
+ }
+ }
+ return nil
+}
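+
+// Illustrative only (editorial note, not part of this change): a hedged
+// sketch of the calibration loop above. Each attempt measures a round trip
+// and writes the guest TSC as a midpoint prediction:
+//
+// start = rdtsc()
+// guest TSC <- start + target // target ~= best observed half round trip.
+// end = rdtsc()
+// half = (end - start) / 2
+//
+// If half < target, the estimate improves (target = half); only attempts
+// with half within target/8 of the minimum count toward calibrateTries, so
+// the final write used a near-minimal, well-predicted offset.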
+
+// setSignalMask sets the vCPU signal mask.
+//
+// This must be called prior to running the vCPU.
+func (c *vCPU) setSignalMask() error {
+ // The layout required by the kernel here is not necessarily the layout
+ // that the Go compiler would choose, so it gets fudged explicitly below.
+ var data struct {
+ length uint32
+ mask1 uint32
+ mask2 uint32
+ _ uint32
+ }
+ data.length = 8 // Fixed sigset size.
+ data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
+ data.mask2 = ^uint32(bounceSignalMask >> 32)
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_SIGNAL_MASK,
+ uintptr(unsafe.Pointer(&data))); errno != 0 {
+ return fmt.Errorf("error setting signal mask: %v", errno)
+ }
+ return nil
+}
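+
+// Illustrative only (editorial sketch, not part of this change): the mask
+// fudging above is equivalent to splitting the complemented 64-bit signal
+// mask into two 32-bit words, matching the kernel's 8-byte sigset. The
+// helper name is hypothetical.
+func splitSigset(mask uint64) (lo, hi uint32) {
+ return uint32(mask & 0xffffffff), uint32(mask >> 32)
+}
+
+// data.mask1 and data.mask2 above are splitSigset(^bounceSignalMask).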
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
new file mode 100644
index 000000000..1d3c6d2d6
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -0,0 +1,160 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build go1.12
+// +build !go1.14
+
+// Check go:linkname function signatures when updating Go version.
+
+package kvm
+
+import (
+ "fmt"
+ "sync/atomic"
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+)
+
+//go:linkname entersyscall runtime.entersyscall
+func entersyscall()
+
+//go:linkname exitsyscall runtime.exitsyscall
+func exitsyscall()
+
+// mapRunData maps the vCPU run data.
+func mapRunData(fd int) (*runData, error) {
+ r, _, errno := syscall.RawSyscall6(
+ syscall.SYS_MMAP,
+ 0,
+ uintptr(runDataSize),
+ syscall.PROT_READ|syscall.PROT_WRITE,
+ syscall.MAP_SHARED,
+ uintptr(fd),
+ 0)
+ if errno != 0 {
+ return nil, fmt.Errorf("error mapping runData: %v", errno)
+ }
+ return (*runData)(unsafe.Pointer(r)), nil
+}
+
+// unmapRunData unmaps the vCPU run data.
+func unmapRunData(r *runData) error {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_MUNMAP,
+ uintptr(unsafe.Pointer(r)),
+ uintptr(runDataSize),
+ 0); errno != 0 {
+ return fmt.Errorf("error unmapping runData: %v", errno)
+ }
+ return nil
+}
+
+// setUserRegisters sets user registers in the vCPU.
+func (c *vCPU) setUserRegisters(uregs *userRegs) error {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_REGS,
+ uintptr(unsafe.Pointer(uregs))); errno != 0 {
+ return fmt.Errorf("error setting user registers: %v", errno)
+ }
+ return nil
+}
+
+// getUserRegisters reads the user registers from the vCPU.
+//
+// This is safe to call from a nosplit context.
+//
+//go:nosplit
+func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_GET_REGS,
+ uintptr(unsafe.Pointer(uregs))); errno != 0 {
+ return errno
+ }
+ return 0
+}
+
+// setSystemRegisters sets system registers.
+func (c *vCPU) setSystemRegisters(sregs *systemRegs) error {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_SREGS,
+ uintptr(unsafe.Pointer(sregs))); errno != 0 {
+ return fmt.Errorf("error setting system registers: %v", errno)
+ }
+ return nil
+}
+
+// atomicAddressSpace is an atomic address space pointer.
+type atomicAddressSpace struct {
+ pointer unsafe.Pointer
+}
+
+// set sets the address space value.
+//
+//go:nosplit
+func (a *atomicAddressSpace) set(as *addressSpace) {
+ atomic.StorePointer(&a.pointer, unsafe.Pointer(as))
+}
+
+// get gets the address space value.
+//
+// Note that this should be considered best-effort, and may have changed by the
+// time this function returns.
+//
+//go:nosplit
+func (a *atomicAddressSpace) get() *addressSpace {
+ return (*addressSpace)(atomic.LoadPointer(&a.pointer))
+}
+
+// notify notifies that the vCPU has transitioned modes.
+//
+// This may be called by a signal handler and therefore throws on error.
+//
+//go:nosplit
+func (c *vCPU) notify() {
+ _, _, errno := syscall.RawSyscall6(
+ syscall.SYS_FUTEX,
+ uintptr(unsafe.Pointer(&c.state)),
+ linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG,
+ ^uintptr(0), // Number of waiters.
+ 0, 0, 0)
+ if errno != 0 {
+ throw("futex wake error")
+ }
+}
+
+// waitUntilNot waits for the vCPU to transition modes.
+//
+// The state should have been previously set to vCPUWaiter after performing an
+// appropriate action to cause a transition (e.g. interrupt injection).
+//
+// This panics on error.
+func (c *vCPU) waitUntilNot(state uint32) {
+ _, _, errno := syscall.Syscall6(
+ syscall.SYS_FUTEX,
+ uintptr(unsafe.Pointer(&c.state)),
+ linux.FUTEX_WAIT|linux.FUTEX_PRIVATE_FLAG,
+ uintptr(state),
+ 0, 0, 0)
+ if errno != 0 && errno != syscall.EINTR && errno != syscall.EAGAIN {
+ panic("futex wait error")
+ }
+}
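+
+// Illustrative only (editorial note, not part of this change): notify and
+// waitUntilNot form a standard futex pairing over c.state, roughly:
+//
+// waiter: CAS(state, S, S|vCPUWaiter), then waitUntilNot(S|vCPUWaiter).
+// transitioning thread: clears vCPUWaiter on exit, then calls notify().
+//
+// Spurious wakeups (EINTR/EAGAIN) are tolerated because callers such as
+// bounce and Get re-check the state in a loop.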
diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go
new file mode 100644
index 000000000..450eb8201
--- /dev/null
+++ b/pkg/sentry/platform/kvm/physical_map.go
@@ -0,0 +1,224 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+ "sort"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+const (
+ // reservedMemory is a chunk of physical memory reserved starting at
+ // physical address zero. There are some special pages in this region,
+ // so we just call the whole thing off.
+ //
+ // Other architectures may define this to be zero.
+ reservedMemory = 0x100000000
+)
+
+type region struct {
+ virtual uintptr
+ length uintptr
+}
+
+type physicalRegion struct {
+ region
+ physical uintptr
+}
+
+// physicalRegions contains a list of available physical regions.
+//
+// The physical value used in physicalRegions is a number indicating the
+// physical offset, aligned appropriately and starting above reservedMemory.
+var physicalRegions []physicalRegion
+
+// fillAddressSpace fills the host address space with PROT_NONE mappings until
+// we have a host address space size that is less than or equal to the physical
+// address space. This allows us to have an injective host virtual to guest
+// physical mapping.
+//
+// The excluded regions are returned.
+func fillAddressSpace() (excludedRegions []region) {
+ // We can cut vSize in half, because the kernel will be using the top
+ // half and we ignore it while constructing mappings. It's as if we've
+ // already excluded half the possible addresses.
+ vSize := uintptr(1) << ring0.VirtualAddressBits()
+ vSize = vSize >> 1
+
+ // We exclude reservedMemory below from our physical memory size, so it
+ // needs to be dropped here as well. Otherwise, we could end up with
+ // physical addresses that are beyond what is mapped.
+ pSize := uintptr(1) << ring0.PhysicalAddressBits()
+ pSize -= reservedMemory
+
+ // Add specifically excluded regions; see excludeVirtualRegion.
+ applyVirtualRegions(func(vr virtualRegion) {
+ if excludeVirtualRegion(vr) {
+ excludedRegions = append(excludedRegions, vr.region)
+ vSize -= vr.length
+ log.Infof("excluded: virtual [%x,%x)", vr.virtual, vr.virtual+vr.length)
+ }
+ })
+
+ // Do we need any more work?
+ if vSize < pSize {
+ return excludedRegions
+ }
+
+ // Calculate the required space and fill it.
+ //
+ // Note carefully that we add faultBlockSize to required up front, and
+ // on each iteration of the loop below (i.e. each new physical region
+ // we define), we add faultBlockSize again. This is done because the
+ // computation of physical regions will ensure proper alignments with
+ // faultBlockSize, potentially causing up to faultBlockSize bytes in
+ // internal fragmentation for each physical region. So we need to
+ // account for this properly during allocation.
+ requiredAddr, ok := usermem.Addr(vSize - pSize + faultBlockSize).RoundUp()
+ if !ok {
+ panic(fmt.Sprintf(
+ "overflow for vSize (%x) - pSize (%x) + faultBlockSize (%x)",
+ vSize, pSize, faultBlockSize))
+ }
+ required := uintptr(requiredAddr)
+ current := required // Attempted mmap size.
+ for filled := uintptr(0); filled < required && current > 0; {
+ addr, _, errno := syscall.RawSyscall6(
+ syscall.SYS_MMAP,
+ 0, // Suggested address.
+ current,
+ syscall.PROT_NONE,
+ syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE|syscall.MAP_NORESERVE,
+ 0, 0)
+ if errno != 0 {
+ // Attempt half the size; overflow not possible.
+ currentAddr, _ := usermem.Addr(current >> 1).RoundUp()
+ current = uintptr(currentAddr)
+ continue
+ }
+ // We filled a block.
+ filled += current
+ excludedRegions = append(excludedRegions, region{
+ virtual: addr,
+ length: current,
+ })
+ // See comment above.
+ if filled != required {
+ required += faultBlockSize
+ }
+ }
+ if current == 0 {
+ panic("filling address space failed")
+ }
+ sort.Slice(excludedRegions, func(i, j int) bool {
+ return excludedRegions[i].virtual < excludedRegions[j].virtual
+ })
+ for _, r := range excludedRegions {
+ log.Infof("region: virtual [%x,%x)", r.virtual, r.virtual+r.length)
+ }
+ return excludedRegions
+}
+
+// computePhysicalRegions computes physical regions.
+func computePhysicalRegions(excludedRegions []region) (physicalRegions []physicalRegion) {
+ physical := uintptr(reservedMemory)
+ addValidRegion := func(virtual, length uintptr) {
+ if length == 0 {
+ return
+ }
+ if virtual == 0 {
+ virtual += usermem.PageSize
+ length -= usermem.PageSize
+ }
+ if end := virtual + length; end > ring0.MaximumUserAddress {
+ length -= (end - ring0.MaximumUserAddress)
+ }
+ if length == 0 {
+ return
+ }
+ // Round physical up to the same alignment as the virtual
+ // address (with respect to faultBlockSize).
+ if offset := virtual &^ faultBlockMask; physical&^faultBlockMask != offset {
+ if newPhysical := (physical & faultBlockMask) + offset; newPhysical > physical {
+ physical = newPhysical // Round up by only a little bit.
+ } else {
+ physical = ((physical + faultBlockSize) & faultBlockMask) + offset
+ }
+ }
+ physicalRegions = append(physicalRegions, physicalRegion{
+ region: region{
+ virtual: virtual,
+ length: length,
+ },
+ physical: physical,
+ })
+ physical += length
+ }
+ lastExcludedEnd := uintptr(0)
+ for _, r := range excludedRegions {
+ addValidRegion(lastExcludedEnd, r.virtual-lastExcludedEnd)
+ lastExcludedEnd = r.virtual + r.length
+ }
+ addValidRegion(lastExcludedEnd, ring0.MaximumUserAddress-lastExcludedEnd)
+
+ // Dump all of our physical regions.
+ for _, r := range physicalRegions {
+ log.Infof("physicalRegion: virtual [%x,%x) => physical [%x,%x)",
+ r.virtual, r.virtual+r.length, r.physical, r.physical+r.length)
+ }
+ return physicalRegions
+}
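+
+// Illustrative only (editorial note, not part of this change): a worked
+// example of the alignment step above, with an artificially small
+// faultBlockSize of 0x1000 (the real value is defined elsewhere):
+//
+// virtual = 0x5503, physical = 0x9000
+// offset = virtual &^ faultBlockMask = 0x503
+// physical &^ faultBlockMask = 0x000 != 0x503, so realign:
+// newPhysical = (physical & faultBlockMask) + offset = 0x9503 > physical,
+// so physical becomes 0x9503.
+//
+// Virtual and physical then share the same offset within a fault block, so
+// the block-granular slots installed by the bluepill fault handler line up.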
+
+// physicalInit initializes physical address mappings.
+func physicalInit() {
+ physicalRegions = computePhysicalRegions(fillAddressSpace())
+}
+
+// applyPhysicalRegions applies the given function on physical regions.
+//
+// Iteration continues as long as true is returned. The return value is the
+// return from the last call to fn, or true if there are no entries.
+//
+// Precondition: physicalInit must have been called.
+func applyPhysicalRegions(fn func(pr physicalRegion) bool) bool {
+ for _, pr := range physicalRegions {
+ if !fn(pr) {
+ return false
+ }
+ }
+ return true
+}
+
+// translateToPhysical translates the given virtual address.
+//
+// Precondition: physicalInit must have been called.
+//
+//go:nosplit
+func translateToPhysical(virtual uintptr) (physical uintptr, length uintptr, ok bool) {
+ for _, pr := range physicalRegions {
+ if pr.virtual <= virtual && virtual < pr.virtual+pr.length {
+ physical = pr.physical + (virtual - pr.virtual)
+ length = pr.length - (virtual - pr.virtual)
+ ok = true
+ return
+ }
+ }
+ return
+}
diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go
new file mode 100644
index 000000000..28a1b4414
--- /dev/null
+++ b/pkg/sentry/platform/kvm/virtual_map.go
@@ -0,0 +1,113 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "os"
+ "regexp"
+ "strconv"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+type virtualRegion struct {
+ region
+ accessType usermem.AccessType
+ shared bool
+ offset uintptr
+ filename string
+}
+
+// mapsLine matches a single line from /proc/PID/maps.
+var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2}:[0-9a-f]{2,} [0-9]+\\s+(.*)")
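+
+// Illustrative only (editorial note, not part of this change): a synthetic
+// example line and how the capture groups bind:
+//
+// 00400000-0040c000 r-xp 00000000 fd:01 123456 /usr/bin/cat
+// m[1]=00400000 m[2]=0040c000 m[3]=r-xp m[4]=00000000 m[5]=/usr/bin/cat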
+
+// excludeVirtualRegion returns true if the given region should be excluded
+// from the
+// physical map. Virtual regions need to be excluded if get_user_pages will
+// fail on those addresses, preventing KVM from satisfying EPT faults.
+//
+// This includes the VVAR page because the VVAR page may be mapped as I/O
+// memory. And the VDSO page is knocked out because the VVAR page is not even
+// recorded in /proc/self/maps on older kernels; knocking out the VDSO page
+// prevents code in the VDSO from accessing the VVAR address.
+//
+// This is called by the physical map functions, not applyVirtualRegions.
+func excludeVirtualRegion(r virtualRegion) bool {
+ return r.filename == "[vvar]" || r.filename == "[vdso]"
+}
+
+// applyVirtualRegions parses the process maps file.
+//
+// Unlike mappedRegions, these are not consistent over time.
+func applyVirtualRegions(fn func(vr virtualRegion)) error {
+ // Open /proc/self/maps.
+ f, err := os.Open("/proc/self/maps")
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+
+ // Parse all entries.
+ r := bufio.NewReader(f)
+ for {
+ b, err := r.ReadBytes('\n')
+ if len(b) > 0 {
+ m := mapsLine.FindSubmatch(b)
+ if m == nil {
+ // This should not happen: kernel bug?
+ return fmt.Errorf("badly formed line: %v", string(b))
+ }
+ start, err := strconv.ParseUint(string(m[1]), 16, 64)
+ if err != nil {
+ return fmt.Errorf("bad start address: %v", string(b))
+ }
+ end, err := strconv.ParseUint(string(m[2]), 16, 64)
+ if err != nil {
+ return fmt.Errorf("bad end address: %v", string(b))
+ }
+ read := m[3][0] == 'r'
+ write := m[3][1] == 'w'
+ execute := m[3][2] == 'x'
+ shared := m[3][3] == 's'
+ offset, err := strconv.ParseUint(string(m[4]), 16, 64)
+ if err != nil {
+ return fmt.Errorf("bad offset: %v", string(b))
+ }
+ fn(virtualRegion{
+ region: region{
+ virtual: uintptr(start),
+ length: uintptr(end - start),
+ },
+ accessType: usermem.AccessType{
+ Read: read,
+ Write: write,
+ Execute: execute,
+ },
+ shared: shared,
+ offset: uintptr(offset),
+ filename: string(m[5]),
+ })
+ }
+ if err != nil && err == io.EOF {
+ break
+ } else if err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
diff --git a/pkg/sentry/platform/mmap_min_addr.go b/pkg/sentry/platform/mmap_min_addr.go
new file mode 100644
index 000000000..90976735b
--- /dev/null
+++ b/pkg/sentry/platform/mmap_min_addr.go
@@ -0,0 +1,60 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package platform
+
+import (
+ "fmt"
+ "io/ioutil"
+ "strconv"
+ "strings"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// systemMMapMinAddrSource is the source file.
+const systemMMapMinAddrSource = "/proc/sys/vm/mmap_min_addr"
+
+// systemMMapMinAddr is the system's minimum map address.
+var systemMMapMinAddr uint64
+
+// SystemMMapMinAddr returns the minimum system address.
+func SystemMMapMinAddr() usermem.Addr {
+ return usermem.Addr(systemMMapMinAddr)
+}
+
+// MMapMinAddr is a zero-size struct that implements MinUserAddress based on
+// the system minimum address. It is suitable for embedding in platforms that
+// rely on the system mmap, and thus require the system minimum.
+type MMapMinAddr struct {
+}
+
+// MinUserAddress implements Platform.MinUserAddress.
+func (*MMapMinAddr) MinUserAddress() usermem.Addr {
+ return SystemMMapMinAddr()
+}
+
+func init() {
+ // Read the source file.
+ b, err := ioutil.ReadFile(systemMMapMinAddrSource)
+ if err != nil {
+ panic(fmt.Sprintf("couldn't open %s: %v", systemMMapMinAddrSource, err))
+ }
+
+ // Parse the result.
+ systemMMapMinAddr, err = strconv.ParseUint(strings.TrimSpace(string(b)), 10, 64)
+ if err != nil {
+ panic(fmt.Sprintf("couldn't parse %s from %s: %v", string(b), systemMMapMinAddrSource, err))
+ }
+}
diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go
new file mode 100644
index 000000000..ae37276ad
--- /dev/null
+++ b/pkg/sentry/platform/platform.go
@@ -0,0 +1,349 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package platform provides a Platform abstraction.
+//
+// See Platform for more information.
+package platform
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// Platform provides abstractions for execution contexts (Context,
+// AddressSpace).
+type Platform interface {
+ // SupportsAddressSpaceIO returns true if AddressSpaces returned by this
+ // Platform support AddressSpaceIO methods.
+ //
+ // The value returned by SupportsAddressSpaceIO is guaranteed to remain
+ // unchanged over the lifetime of the Platform.
+ SupportsAddressSpaceIO() bool
+
+ // CooperativelySchedulesAddressSpace returns true if the Platform has a
+ // limited number of AddressSpaces, such that mm.MemoryManager.Deactivate
+ // should call AddressSpace.Release when there are no goroutines that
+ // require the mm.MemoryManager to have an active AddressSpace.
+ //
+ // The value returned by CooperativelySchedulesAddressSpace is guaranteed
+ // to remain unchanged over the lifetime of the Platform.
+ CooperativelySchedulesAddressSpace() bool
+
+ // DetectsCPUPreemption returns true if Contexts returned by the Platform
+ // can reliably return ErrContextCPUPreempted.
+ DetectsCPUPreemption() bool
+
+ // MapUnit returns the alignment used for optional mappings into this
+ // platform's AddressSpaces. Higher values indicate lower per-page costs
+ // for AddressSpace.MapFile. As a special case, a MapUnit of 0 indicates
+ // that the cost of AddressSpace.MapFile is effectively independent of the
+ // number of pages mapped. If MapUnit is non-zero, it must be a power-of-2
+ // multiple of usermem.PageSize.
+ MapUnit() uint64
+
+ // MinUserAddress returns the minimum mappable address on this
+ // platform.
+ MinUserAddress() usermem.Addr
+
+ // MaxUserAddress returns the maximum mappable address on this
+ // platform.
+ MaxUserAddress() usermem.Addr
+
+ // NewAddressSpace returns a new memory context for this platform.
+ //
+ // If mappingsID is not nil, the platform may assume that (1) all calls
+ // to NewAddressSpace with the same mappingsID represent the same
+ // (mutable) set of mappings, and (2) the set of mappings has not
+ // changed since the last time AddressSpace.Release was called on an
+ // AddressSpace returned by a call to NewAddressSpace with the same
+ // mappingsID.
+ //
+ // If a new AddressSpace cannot be created immediately, a nil
+ // AddressSpace is returned, along with a channel that is closed when
+ // the caller should retry a call to NewAddressSpace.
+ //
+ // In general, this blocking behavior only occurs when
+ // CooperativelySchedulesAddressSpace (above) returns false.
+ NewAddressSpace(mappingsID interface{}) (AddressSpace, <-chan struct{}, error)
+
+ // NewContext returns a new execution context.
+ NewContext() Context
+
+ // PreemptAllCPUs causes all concurrent calls to Context.Switch(), as well
+ // as the first following call to Context.Switch() for each Context, to
+ // return ErrContextCPUPreempted.
+ //
+ // PreemptAllCPUs is only supported if DetectsCPUPreemption() == true.
+ // Platforms for which this does not hold may panic if PreemptAllCPUs is
+ // called.
+ PreemptAllCPUs() error
+}
+
+// NoCPUPreemptionDetection implements Platform.DetectsCPUPreemption and
+// dependent methods for Platforms that do not support this feature.
+type NoCPUPreemptionDetection struct{}
+
+// DetectsCPUPreemption implements Platform.DetectsCPUPreemption.
+func (NoCPUPreemptionDetection) DetectsCPUPreemption() bool {
+ return false
+}
+
+// PreemptAllCPUs implements Platform.PreemptAllCPUs.
+func (NoCPUPreemptionDetection) PreemptAllCPUs() error {
+ panic("This platform does not support CPU preemption detection")
+}
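A brief illustration of the mixin pattern above, assuming a hypothetical
platform type: embedding NoCPUPreemptionDetection inherits both methods, so
only the remaining Platform methods need custom definitions.

    // hypotheticalPlatform cannot detect CPU preemption, so it embeds the
    // mixin; DetectsCPUPreemption returns false and PreemptAllCPUs panics.
    type hypotheticalPlatform struct {
        NoCPUPreemptionDetection
        // ... remaining platform state ...
    }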
+
+// Context represents the execution context for a single thread.
+type Context interface {
+ // Switch resumes execution of the thread specified by the arch.Context
+ // in the provided address space. This call will block while the thread
+ // is executing.
+ //
+ // If cpu is non-negative, and it is not the number of the CPU that the
+ // thread executes on, Context should return ErrContextCPUPreempted. cpu
+ // can only be non-negative if Platform.DetectsCPUPreemption() is true;
+ // Contexts from Platforms for which this does not hold may ignore cpu, or
+ // panic if cpu is non-negative.
+ //
+ // Switch may return one of the following special errors:
+ //
+ // - nil: The Context invoked a system call.
+ //
+ // - ErrContextSignal: The Context was interrupted by a signal. The
+ // returned *arch.SignalInfo contains information about the signal. If
+ // arch.SignalInfo.Signo == SIGSEGV, the returned usermem.AccessType
+ // contains the access type of the triggering fault. The caller owns
+ // the returned SignalInfo.
+ //
+ // - ErrContextInterrupt: The Context was interrupted by a call to
+ // Interrupt(). Switch() may return ErrContextInterrupt spuriously. In
+ // particular, most implementations of Interrupt() will cause the first
+ // following call to Switch() to return ErrContextInterrupt if there is no
+ // concurrent call to Switch().
+ //
+ // - ErrContextCPUPreempted: See the definition of that error for details.
+ Switch(as AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error)
+
+ // Interrupt interrupts a concurrent call to Switch(), causing it to return
+ // ErrContextInterrupt.
+ Interrupt()
+}
+
+var (
+ // ErrContextSignal is returned by Context.Switch() to indicate that the
+ // Context was interrupted by a signal.
+ ErrContextSignal = fmt.Errorf("interrupted by signal")
+
+ // ErrContextSignalCPUID is equivalent to ErrContextSignal, except that
+ // a check should be done for execution of the CPUID instruction. If
+ // the current instruction pointer is a CPUID instruction, then this
+ // should be emulated appropriately. If not, then the given signal
+ // should be handled per above.
+ ErrContextSignalCPUID = fmt.Errorf("interrupted by signal, possible CPUID")
+
+ // ErrContextInterrupt is returned by Context.Switch() to indicate that the
+ // Context was interrupted by a call to Context.Interrupt().
+ ErrContextInterrupt = fmt.Errorf("interrupted by platform.Context.Interrupt()")
+
+ // ErrContextCPUPreempted is returned by Context.Switch() to indicate that
+ // one of the following occurred:
+ //
+ // - The CPU executing the Context is not the CPU passed to
+ // Context.Switch().
+ //
+ // - The CPU executing the Context may have executed another Context since
+ // the last time it executed this one; or the CPU has previously executed
+ // another Context, and has never executed this one.
+ //
+ // - Platform.PreemptAllCPUs() was called since the last return from
+ // Context.Switch().
+ ErrContextCPUPreempted = fmt.Errorf("interrupted by CPU preemption")
+)
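These errors are typically consumed in a caller's run loop. A minimal sketch,
assuming hypothetical helpers handleSyscall, handleSignal, and currentCPU:

    for {
        si, at, err := ctx.Switch(as, ac, cpu)
        switch err {
        case nil:
            handleSyscall(ac)
        case ErrContextSignal:
            handleSignal(si, at)
        case ErrContextSignalCPUID:
            // Emulate CPUID if the instruction pointer is at a CPUID
            // instruction; otherwise handle as ErrContextSignal.
            handleSignal(si, at)
        case ErrContextInterrupt:
            // Possibly spurious; re-check for pending work and retry.
        case ErrContextCPUPreempted:
            cpu = currentCPU()
        }
    }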
+
+// SignalInterrupt is a signal reserved for use by implementations of
+// Context.Interrupt(). The sentry guarantees that it will ignore delivery of
+// this signal both to Contexts and to the sentry itself, under the assumption
+// that they originate from races with Context.Interrupt().
+//
+// NOTE(b/23420492): The Go runtime only guarantees that a small subset
+// of signals will always be unblocked on all threads, one of which
+// is SIGCHLD.
+const SignalInterrupt = linux.SIGCHLD
+
+// AddressSpace represents a virtual address space in which a Context can
+// execute.
+type AddressSpace interface {
+ // MapFile creates a shared mapping of offsets fr from f at address addr.
+ // Any existing overlapping mappings are silently replaced.
+ //
+ // If precommit is true, the platform should eagerly commit resources (e.g.
+ // physical memory) to the mapping. The precommit flag is advisory and
+ // implementations may choose to ignore it.
+ //
+ // Preconditions: addr and fr must be page-aligned. fr.Length() > 0.
+ // at.Any() == true. At least one reference must be held on all pages in
+ // fr, and must continue to be held as long as pages are mapped.
+ MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, precommit bool) error
+
+ // Unmap unmaps the given range.
+ //
+ // Preconditions: addr is page-aligned. length > 0.
+ Unmap(addr usermem.Addr, length uint64)
+
+ // Release releases this address space. After releasing, a new AddressSpace
+ // must be acquired via platform.NewAddressSpace().
+ Release()
+
+ // AddressSpaceIO methods are supported iff the associated platform's
+ // Platform.SupportsAddressSpaceIO() == true. AddressSpaces for which this
+ // does not hold may panic if AddressSpaceIO methods are invoked.
+ AddressSpaceIO
+}
+
+// AddressSpaceIO supports IO through the memory mappings installed in an
+// AddressSpace.
+//
+// AddressSpaceIO implementors are responsible for ensuring that address ranges
+// are application-mappable.
+type AddressSpaceIO interface {
+ // CopyOut copies len(src) bytes from src to the memory mapped at addr. It
+ // returns the number of bytes copied. If the number of bytes copied is <
+ // len(src), it returns a non-nil error explaining why.
+ CopyOut(addr usermem.Addr, src []byte) (int, error)
+
+ // CopyIn copies len(dst) bytes from the memory mapped at addr to dst.
+ // It returns the number of bytes copied. If the number of bytes copied is
+ // < len(dst), it returns a non-nil error explaining why.
+ CopyIn(addr usermem.Addr, dst []byte) (int, error)
+
+ // ZeroOut sets toZero bytes to 0, starting at addr. It returns the number
+ // of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a
+ // non-nil error explaining why.
+ ZeroOut(addr usermem.Addr, toZero uintptr) (uintptr, error)
+
+ // SwapUint32 atomically sets the uint32 value at addr to new and returns
+ // the previous value.
+ //
+ // Preconditions: addr must be aligned to a 4-byte boundary.
+ SwapUint32(addr usermem.Addr, new uint32) (uint32, error)
+
+ // CompareAndSwapUint32 atomically compares the uint32 value at addr to
+ // old; if they are equal, the value in memory is replaced by new. In
+ // either case, the previous value stored in memory is returned.
+ //
+ // Preconditions: addr must be aligned to a 4-byte boundary.
+ CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error)
+
+ // LoadUint32 atomically loads the uint32 value at addr and returns it.
+ //
+ // Preconditions: addr must be aligned to a 4-byte boundary.
+ LoadUint32(addr usermem.Addr) (uint32, error)
+}
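As an illustration of the atomics above, a lock-free increment on application
memory follows the usual compare-and-swap retry pattern (a hypothetical helper;
addr must be 4-byte aligned):

    func atomicIncrement(io AddressSpaceIO, addr usermem.Addr) (uint32, error) {
        for {
            old, err := io.LoadUint32(addr)
            if err != nil {
                return 0, err
            }
            prev, err := io.CompareAndSwapUint32(addr, old, old+1)
            if err != nil {
                return 0, err
            }
            if prev == old {
                return old + 1, nil
            }
            // Lost a race with a concurrent writer; retry.
        }
    }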
+
+// NoAddressSpaceIO implements AddressSpaceIO methods by panicking.
+type NoAddressSpaceIO struct{}
+
+// CopyOut implements AddressSpaceIO.CopyOut.
+func (NoAddressSpaceIO) CopyOut(addr usermem.Addr, src []byte) (int, error) {
+ panic("This platform does not support AddressSpaceIO")
+}
+
+// CopyIn implements AddressSpaceIO.CopyIn.
+func (NoAddressSpaceIO) CopyIn(addr usermem.Addr, dst []byte) (int, error) {
+ panic("This platform does not support AddressSpaceIO")
+}
+
+// ZeroOut implements AddressSpaceIO.ZeroOut.
+func (NoAddressSpaceIO) ZeroOut(addr usermem.Addr, toZero uintptr) (uintptr, error) {
+ panic("This platform does not support AddressSpaceIO")
+}
+
+// SwapUint32 implements AddressSpaceIO.SwapUint32.
+func (NoAddressSpaceIO) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) {
+ panic("This platform does not support AddressSpaceIO")
+}
+
+// CompareAndSwapUint32 implements AddressSpaceIO.CompareAndSwapUint32.
+func (NoAddressSpaceIO) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) {
+ panic("This platform does not support AddressSpaceIO")
+}
+
+// LoadUint32 implements AddressSpaceIO.LoadUint32.
+func (NoAddressSpaceIO) LoadUint32(addr usermem.Addr) (uint32, error) {
+ panic("This platform does not support AddressSpaceIO")
+}
+
+// SegmentationFault is an error returned by AddressSpaceIO methods when IO
+// fails due to access of an unmapped page, or a mapped page with insufficient
+// permissions.
+type SegmentationFault struct {
+ // Addr is the address at which the fault occurred.
+ Addr usermem.Addr
+}
+
+// Error implements error.Error.
+func (f SegmentationFault) Error() string {
+ return fmt.Sprintf("segmentation fault at %#x", f.Addr)
+}
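Since SegmentationFault is a concrete error type, callers can recover the
faulting address with a type assertion. A short sketch, with handleFault as a
hypothetical helper:

    n, err := io.CopyIn(addr, dst)
    if f, ok := err.(SegmentationFault); ok {
        // Only n bytes were copied; f.Addr is the faulting address.
        handleFault(n, f.Addr)
    }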
+
+// File represents a host file that may be mapped into an AddressSpace.
+type File interface {
+ // All pages in a File are reference-counted.
+
+ // IncRef increments the reference count on all pages in fr.
+ //
+ // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
+ // 0. At least one reference must be held on all pages in fr. (The File
+ // interface does not provide a way to acquire an initial reference;
+ // implementors may define mechanisms for doing so.)
+ IncRef(fr FileRange)
+
+ // DecRef decrements the reference count on all pages in fr.
+ //
+ // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() >
+ // 0. At least one reference must be held on all pages in fr.
+ DecRef(fr FileRange)
+
+ // MapInternal returns a mapping of the given file offsets in the invoking
+ // process' address space for reading and writing.
+ //
+ // Note that fr.Start and fr.End need not be page-aligned.
+ //
+ // Preconditions: fr.Length() > 0. At least one reference must be held on
+ // all pages in fr.
+ //
+ // Postconditions: The returned mapping is valid as long as at least one
+ // reference is held on the mapped pages.
+ MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error)
+
+ // FD returns the file descriptor represented by the File.
+ //
+ // The only permitted operation on the returned file descriptor is to map
+ // pages from it consistent with the requirements of AddressSpace.MapFile.
+ FD() int
+}
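The reference-counting contract implies a bracketed usage pattern. A minimal
sketch, assuming the caller already holds a reference on the pages in fr:

    f.IncRef(fr) // fr is page-aligned and fr.Length() > 0
    defer f.DecRef(fr)
    bs, err := f.MapInternal(fr, usermem.Read)
    if err != nil {
        return err
    }
    // bs remains valid while the reference acquired above is held.
    use(bs) // hypothetical consumer of the safemem.BlockSeq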
+
+// FileRange represents a range of uint64 offsets into a File.
+//
+// type FileRange <generated using go_generics>
+
+// String implements fmt.Stringer.String.
+func (fr FileRange) String() string {
+ return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End)
+}
diff --git a/pkg/sentry/platform/platform_state_autogen.go b/pkg/sentry/platform/platform_state_autogen.go
new file mode 100755
index 000000000..13ea50daf
--- /dev/null
+++ b/pkg/sentry/platform/platform_state_autogen.go
@@ -0,0 +1,24 @@
+// automatically generated by stateify.
+
+package platform
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+func (x *FileRange) beforeSave() {}
+func (x *FileRange) save(m state.Map) {
+ x.beforeSave()
+ m.Save("Start", &x.Start)
+ m.Save("End", &x.End)
+}
+
+func (x *FileRange) afterLoad() {}
+func (x *FileRange) load(m state.Map) {
+ m.Load("Start", &x.Start)
+ m.Load("End", &x.End)
+}
+
+func init() {
+ state.Register("platform.FileRange", (*FileRange)(nil), state.Fns{Save: (*FileRange).save, Load: (*FileRange).load})
+}
diff --git a/pkg/sentry/platform/procid/procid.go b/pkg/sentry/platform/procid/procid.go
new file mode 100644
index 000000000..78b92422c
--- /dev/null
+++ b/pkg/sentry/platform/procid/procid.go
@@ -0,0 +1,21 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package procid provides a way to get the current system thread identifier.
+package procid
+
+// Current returns the current system thread identifier.
+//
+// Precondition: This should only be called with the runtime OS thread locked.
+func Current() uint64
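Because the Go scheduler migrates goroutines between OS threads, the result is
only stable while the thread is locked. A minimal usage sketch:

    runtime.LockOSThread()
    defer runtime.UnlockOSThread()
    tid := procid.Current() // stable until the thread is unlocked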
diff --git a/pkg/sentry/platform/procid/procid_amd64.s b/pkg/sentry/platform/procid/procid_amd64.s
new file mode 100644
index 000000000..30ec8e6e2
--- /dev/null
+++ b/pkg/sentry/platform/procid/procid_amd64.s
@@ -0,0 +1,30 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+// +build go1.8
+// +build !go1.14
+
+#include "textflag.h"
+
+TEXT ·Current(SB),NOSPLIT,$0-8
+ // The offset specified here is the m_procid offset for Go1.8+.
+ // Changes to this offset should be caught by the tests, and major
+ // version changes require an explicit tag change above.
+ MOVQ TLS, AX
+ MOVQ 0(AX)(TLS*1), AX
+ MOVQ 48(AX), AX // g_m (may change in future versions)
+ MOVQ 72(AX), AX // m_procid (may change in future versions)
+ MOVQ AX, ret+0(FP)
+ RET
diff --git a/pkg/sentry/platform/procid/procid_arm64.s b/pkg/sentry/platform/procid/procid_arm64.s
new file mode 100644
index 000000000..e340d9f98
--- /dev/null
+++ b/pkg/sentry/platform/procid/procid_arm64.s
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+// +build go1.8
+// +build !go1.14
+
+#include "textflag.h"
+
+TEXT ·Current(SB),NOSPLIT,$0-8
+ // The offset specified here is the m_procid offset for Go1.8+.
+ // Changes to this offset should be caught by the tests, and major
+ // version changes require an explicit tag change above.
+ MOVD g, R0 // g
+ MOVD 48(R0), R0 // g_m (may change in future versions)
+ MOVD 72(R0), R0 // m_procid (may change in future versions)
+ MOVD R0, ret+0(FP)
+ RET
diff --git a/pkg/sentry/platform/procid/procid_state_autogen.go b/pkg/sentry/platform/procid/procid_state_autogen.go
new file mode 100755
index 000000000..f27a7c510
--- /dev/null
+++ b/pkg/sentry/platform/procid/procid_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package procid
+
diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go
new file mode 100644
index 000000000..6a890dd81
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/ptrace.go
@@ -0,0 +1,238 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ptrace provides a ptrace-based implementation of the platform
+// interface. This is useful for development and testing purposes primarily,
+// and runs on stock kernels without special permissions.
+//
+// In a nutshell, it works as follows:
+//
+// The creation of a new address space creates a new child process with a
+// single thread, which is traced by a single goroutine.
+//
+// A context is just a collection of temporary variables. Calling Switch on a
+// context does the following:
+//
+// Locks the runtime thread.
+//
+// Looks up a traced subprocess thread for the current runtime thread. If
+// none exists, the dedicated goroutine is asked to create a new stopped
+// thread in the subprocess. This stopped subprocess thread is then traced
+// by the current thread and this information is stored for subsequent
+// switches.
+//
+// The context is then bound with information about the subprocess thread
+// so that the context may be appropriately interrupted via a signal.
+//
+// The requested operation is performed in the traced subprocess thread
+// (e.g. set registers, execute, return).
+//
+// Lock order:
+//
+// subprocess.mu
+// context.mu
+package ptrace
+
+import (
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/interrupt"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+var (
+ // stubStart is the link address for our stub, and determines the
+ // maximum user address. This is valid only after a call to stubInit.
+ //
+ // We attempt to link the stub here, and adjust downward as needed.
+ stubStart uintptr = 0x7fffffff0000
+
+ // stubEnd is the first byte past the end of the stub; as with
+ // stubStart, this is valid only after a call to stubInit.
+ stubEnd uintptr
+
+ // stubInitialized controls one-time stub initialization.
+ stubInitialized sync.Once
+)
+
+type context struct {
+ // signalInfo is the signal info, if and when a signal is received.
+ signalInfo arch.SignalInfo
+
+ // interrupt is the interrupt context.
+ interrupt interrupt.Forwarder
+
+ // mu protects the following fields.
+ mu sync.Mutex
+
+ // If lastFaultSP is non-nil, the last context switch was due to a fault
+ // received while executing lastFaultSP. Only context.Switch may set
+ // lastFaultSP to a non-nil value.
+ lastFaultSP *subprocess
+
+ // lastFaultAddr is the last faulting address; this is only meaningful if
+ // lastFaultSP is non-nil.
+ lastFaultAddr usermem.Addr
+
+ // lastFaultIP is the address of the last faulting instruction;
+ // this is also only meaningful if lastFaultSP is non-nil.
+ lastFaultIP usermem.Addr
+}
+
+// Switch runs the provided context in the given address space.
+func (c *context) Switch(as platform.AddressSpace, ac arch.Context, cpu int32) (*arch.SignalInfo, usermem.AccessType, error) {
+ s := as.(*subprocess)
+ isSyscall := s.switchToApp(c, ac)
+
+ var (
+ faultSP *subprocess
+ faultAddr usermem.Addr
+ faultIP usermem.Addr
+ )
+ if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGSEGV {
+ faultSP = s
+ faultAddr = usermem.Addr(c.signalInfo.Addr())
+ faultIP = usermem.Addr(ac.IP())
+ }
+
+ // Update the context to reflect the outcome of this context switch.
+ c.mu.Lock()
+ lastFaultSP := c.lastFaultSP
+ lastFaultAddr := c.lastFaultAddr
+ lastFaultIP := c.lastFaultIP
+ // At this point, c may not yet be in s.contexts, so c.lastFaultSP won't be
+ // updated by s.Unmap(). This is fine; we only need to synchronize with
+ // calls to s.Unmap() that occur after the handling of this fault.
+ c.lastFaultSP = faultSP
+ c.lastFaultAddr = faultAddr
+ c.lastFaultIP = faultIP
+ c.mu.Unlock()
+
+ // Update subprocesses to reflect the outcome of this context switch.
+ if lastFaultSP != faultSP {
+ if lastFaultSP != nil {
+ lastFaultSP.mu.Lock()
+ delete(lastFaultSP.contexts, c)
+ lastFaultSP.mu.Unlock()
+ }
+ if faultSP != nil {
+ faultSP.mu.Lock()
+ faultSP.contexts[c] = struct{}{}
+ faultSP.mu.Unlock()
+ }
+ }
+
+ if isSyscall {
+ return nil, usermem.NoAccess, nil
+ }
+
+ si := c.signalInfo
+
+ if faultSP == nil {
+ // Non-fault signal.
+ return &si, usermem.NoAccess, platform.ErrContextSignal
+ }
+
+ // Got a page fault. Ideally, we'd get the real fault type here, but ptrace
+ // doesn't expose this information. Instead, we use a simple heuristic:
+ //
+ // It was an instruction fault iff the faulting addr == instruction
+ // pointer.
+ //
+ // It was a write fault if the fault is immediately repeated.
+ at := usermem.Read
+ if faultAddr == faultIP {
+ at.Execute = true
+ }
+ if lastFaultSP == faultSP &&
+ lastFaultAddr == faultAddr &&
+ lastFaultIP == faultIP {
+ at.Write = true
+ }
+
+ // Unfortunately, we have to unilaterally return ErrContextSignalCPUID
+ // here, in case this fault was generated by a CPUID exception. There
+ // is no way to distinguish between CPUID-generated faults and regular
+ // page faults.
+ return &si, at, platform.ErrContextSignalCPUID
+}
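The fault-classification heuristic above can be read as a small pure function
(an illustrative distillation, not code from this change):

    func faultAccessType(faultAddr, faultIP usermem.Addr, repeated bool) usermem.AccessType {
        at := usermem.Read
        if faultAddr == faultIP {
            at.Execute = true // the instruction fetch itself faulted
        }
        if repeated {
            at.Write = true // the same fault twice in a row implies a write
        }
        return at
    }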
+
+// Interrupt interrupts the running guest application associated with this context.
+func (c *context) Interrupt() {
+ c.interrupt.NotifyInterrupt()
+}
+
+// PTrace represents a collection of ptrace subprocesses.
+type PTrace struct {
+ platform.MMapMinAddr
+ platform.NoCPUPreemptionDetection
+}
+
+// New returns a new ptrace-based implementation of the platform interface.
+func New() (*PTrace, error) {
+ stubInitialized.Do(func() {
+ // Initialize the stub.
+ stubInit()
+
+ // Create the master process for the global pool. This must be
+ // done before initializing any other processes.
+ master, err := newSubprocess(createStub)
+ if err != nil {
+ // Should never happen.
+ panic("unable to initialize ptrace master: " + err.Error())
+ }
+
+ // Set the master on the globalPool.
+ globalPool.master = master
+ })
+
+ return &PTrace{}, nil
+}
+
+// SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO.
+func (*PTrace) SupportsAddressSpaceIO() bool {
+ return false
+}
+
+// CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace.
+func (*PTrace) CooperativelySchedulesAddressSpace() bool {
+ return false
+}
+
+// MapUnit implements platform.Platform.MapUnit.
+func (*PTrace) MapUnit() uint64 {
+ // The host kernel manages page tables and arbitrary-sized mappings
+ // have effectively the same cost.
+ return 0
+}
+
+// MaxUserAddress returns the first address that may not be used by user
+// applications.
+func (*PTrace) MaxUserAddress() usermem.Addr {
+ return usermem.Addr(stubStart)
+}
+
+// NewAddressSpace returns a new subprocess.
+func (p *PTrace) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) {
+ as, err := newSubprocess(globalPool.master.createStub)
+ return as, nil, err
+}
+
+// NewContext returns an interruptible context.
+func (*PTrace) NewContext() platform.Context {
+ return &context{}
+}
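End to end, a caller wires these pieces together roughly as follows (a sketch
with error handling elided; ac is an arch.Context prepared by the caller, and
cpu is -1 because this platform does not detect preemption):

    p, _ := ptrace.New()
    as, _, _ := p.NewAddressSpace(nil) // the retry channel is always nil here
    defer as.Release()
    ctx := p.NewContext()
    si, at, err := ctx.Switch(as, ac, -1)
    fmt.Printf("switch: si=%v at=%v err=%v\n", si, at, err)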
diff --git a/pkg/sentry/platform/ptrace/ptrace_state_autogen.go b/pkg/sentry/platform/ptrace/ptrace_state_autogen.go
new file mode 100755
index 000000000..ac83a71e7
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/ptrace_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package ptrace
+
diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
new file mode 100644
index 000000000..585f6c1fb
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
@@ -0,0 +1,166 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ptrace
+
+import (
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// GETREGSET/SETREGSET register set types.
+//
+// See include/uapi/linux/elf.h.
+const (
+ // _NT_PRFPREG is for x86 floating-point state without using xsave.
+ _NT_PRFPREG = 0x2
+
+ // _NT_X86_XSTATE is for x86 extended state using xsave.
+ _NT_X86_XSTATE = 0x202
+)
+
+// fpRegSet returns the GETREGSET/SETREGSET register set type to be used.
+func fpRegSet(useXsave bool) uintptr {
+ if useXsave {
+ return _NT_X86_XSTATE
+ }
+ return _NT_PRFPREG
+}
+
+// getRegs sets the regular register set.
+func (t *thread) getRegs(regs *syscall.PtraceRegs) error {
+ _, _, errno := syscall.RawSyscall6(
+ syscall.SYS_PTRACE,
+ syscall.PTRACE_GETREGS,
+ uintptr(t.tid),
+ 0,
+ uintptr(unsafe.Pointer(regs)),
+ 0, 0)
+ if errno != 0 {
+ return errno
+ }
+ return nil
+}
+
+// setRegs sets the regular register set.
+func (t *thread) setRegs(regs *syscall.PtraceRegs) error {
+ _, _, errno := syscall.RawSyscall6(
+ syscall.SYS_PTRACE,
+ syscall.PTRACE_SETREGS,
+ uintptr(t.tid),
+ 0,
+ uintptr(unsafe.Pointer(regs)),
+ 0, 0)
+ if errno != 0 {
+ return errno
+ }
+ return nil
+}
+
+// getFPRegs gets the floating-point data via the GETREGSET ptrace syscall.
+func (t *thread) getFPRegs(fpState *arch.FloatingPointData, fpLen uint64, useXsave bool) error {
+ iovec := syscall.Iovec{
+ Base: (*byte)(fpState),
+ Len: fpLen,
+ }
+ _, _, errno := syscall.RawSyscall6(
+ syscall.SYS_PTRACE,
+ syscall.PTRACE_GETREGSET,
+ uintptr(t.tid),
+ fpRegSet(useXsave),
+ uintptr(unsafe.Pointer(&iovec)),
+ 0, 0)
+ if errno != 0 {
+ return errno
+ }
+ return nil
+}
+
+// setFPRegs sets the floating-point data via the SETREGSET ptrace syscall.
+func (t *thread) setFPRegs(fpState *arch.FloatingPointData, fpLen uint64, useXsave bool) error {
+ iovec := syscall.Iovec{
+ Base: (*byte)(fpState),
+ Len: fpLen,
+ }
+ _, _, errno := syscall.RawSyscall6(
+ syscall.SYS_PTRACE,
+ syscall.PTRACE_SETREGSET,
+ uintptr(t.tid),
+ fpRegSet(useXsave),
+ uintptr(unsafe.Pointer(&iovec)),
+ 0, 0)
+ if errno != 0 {
+ return errno
+ }
+ return nil
+}
+
+// getSignalInfo retrieves information about the signal that caused the stop.
+func (t *thread) getSignalInfo(si *arch.SignalInfo) error {
+ _, _, errno := syscall.RawSyscall6(
+ syscall.SYS_PTRACE,
+ syscall.PTRACE_GETSIGINFO,
+ uintptr(t.tid),
+ 0,
+ uintptr(unsafe.Pointer(si)),
+ 0, 0)
+ if errno != 0 {
+ return errno
+ }
+ return nil
+}
+
+// clone creates a new thread from this one.
+//
+// The returned thread will be stopped and available for any system thread to
+// call attach on it.
+//
+// Precondition: the OS thread must be locked and own t.
+func (t *thread) clone() (*thread, error) {
+ r, ok := usermem.Addr(t.initRegs.Rsp).RoundUp()
+ if !ok {
+ return nil, syscall.EINVAL
+ }
+ rval, err := t.syscallIgnoreInterrupt(
+ &t.initRegs,
+ syscall.SYS_CLONE,
+ arch.SyscallArgument{Value: uintptr(
+ syscall.CLONE_FILES |
+ syscall.CLONE_FS |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_THREAD |
+ syscall.CLONE_PTRACE |
+ syscall.CLONE_VM)},
+ // The stack pointer is just made up, but we have it be
+ // something sensible so the kernel doesn't think we're
+ // up to no good. Which we are.
+ arch.SyscallArgument{Value: uintptr(r)},
+ arch.SyscallArgument{},
+ arch.SyscallArgument{},
+ // We use these registers initially, but really they
+ // could be anything. We're going to stop immediately.
+ arch.SyscallArgument{Value: uintptr(unsafe.Pointer(&t.initRegs))})
+ if err != nil {
+ return nil, err
+ }
+
+ return &thread{
+ tgid: t.tgid,
+ tid: int32(rval),
+ cpu: ^uint32(0),
+ }, nil
+}
diff --git a/pkg/sentry/platform/ptrace/stub_amd64.s b/pkg/sentry/platform/ptrace/stub_amd64.s
new file mode 100644
index 000000000..64c718d21
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/stub_amd64.s
@@ -0,0 +1,114 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+#define SYS_GETPID 39
+#define SYS_EXIT 60
+#define SYS_KILL 62
+#define SYS_GETPPID 110
+#define SYS_PRCTL 157
+
+#define SIGKILL 9
+#define SIGSTOP 19
+
+#define PR_SET_PDEATHSIG 1
+
+// stub bootstraps the child and sends itself SIGSTOP to wait for attach.
+//
+// R15 contains the expected PPID. R15 is used instead of a more typical DI
+// since syscalls will clobber DI and createStub wants to pass a new PPID to
+// grandchildren.
+//
+// This should not be used outside the context of a new ptrace child (as the
+// function is otherwise a bunch of nonsense).
+TEXT ·stub(SB),NOSPLIT,$0
+begin:
+ // N.B. This loop only executes in the context of a single-threaded
+ // fork child.
+
+ MOVQ $SYS_PRCTL, AX
+ MOVQ $PR_SET_PDEATHSIG, DI
+ MOVQ $SIGKILL, SI
+ SYSCALL
+
+ CMPQ AX, $0
+ JNE error
+
+ // If the parent already died before we called PR_SET_PDEATHSIG then
+ // we'll have an unexpected PPID.
+ MOVQ $SYS_GETPPID, AX
+ SYSCALL
+
+ CMPQ AX, $0
+ JL error
+
+ CMPQ AX, R15
+ JNE parent_dead
+
+ MOVQ $SYS_GETPID, AX
+ SYSCALL
+
+ CMPQ AX, $0
+ JL error
+
+ // SIGSTOP to wait for attach.
+ //
+ // The SYSCALL instruction will be used for future syscall injection by
+ // thread.syscall.
+ MOVQ AX, DI
+ MOVQ $SYS_KILL, AX
+ MOVQ $SIGSTOP, SI
+ SYSCALL
+
+ // The tracer may "detach" and/or allow code execution here in three cases:
+ //
+ // 1. New (traced) stub threads are explicitly detached by the
+ // goroutine in newSubprocess. However, they are detached while in
+ // group-stop, so they do not execute code here.
+ //
+ // 2. If a tracer thread exits, it implicitly detaches from the stub,
+ // potentially allowing code execution here. However, the Go runtime
+ // never exits individual threads, so this case never occurs.
+ //
+ // 3. subprocess.createStub clones a new stub process that is untraced,
+ // thus executing this code. We set up the PDEATHSIG before SIGSTOPing
+ // ourselves for attach by the tracer.
+ //
+ // R15 has been updated with the expected PPID.
+ JMP begin
+
+error:
+ // Exit with -errno.
+ MOVQ AX, DI
+ NEGQ DI
+ MOVQ $SYS_EXIT, AX
+ SYSCALL
+ HLT
+
+parent_dead:
+ MOVQ $SYS_EXIT, AX
+ MOVQ $1, DI
+ SYSCALL
+ HLT
+
+// stubCall calls the stub function at the given address with the given PPID.
+//
+// This is a distinct function because stub, above, may be mapped at any
+// arbitrary location, and stub has a specific binary API (see above).
+TEXT ·stubCall(SB),NOSPLIT,$0-16
+ MOVQ addr+0(FP), AX
+ MOVQ pid+8(FP), R15
+ JMP AX
diff --git a/pkg/sentry/platform/ptrace/stub_unsafe.go b/pkg/sentry/platform/ptrace/stub_unsafe.go
new file mode 100644
index 000000000..54d5021a9
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/stub_unsafe.go
@@ -0,0 +1,98 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ptrace
+
+import (
+ "reflect"
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// stub is defined in arch-specific assembly.
+func stub()
+
+// stubCall calls the stub at the given address with the given pid.
+func stubCall(addr, pid uintptr)
+
+// unsafeSlice returns a slice for the given address and length.
+func unsafeSlice(addr uintptr, length int) (slice []byte) {
+ sh := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
+ sh.Data = addr
+ sh.Len = length
+ sh.Cap = length
+ return
+}
+
+// stubInit initializes the stub.
+func stubInit() {
+ // Grab the existing stub.
+ stubBegin := reflect.ValueOf(stub).Pointer()
+ stubLen := int(safecopy.FindEndAddress(stubBegin) - stubBegin)
+ stubSlice := unsafeSlice(stubBegin, stubLen)
+ mapLen := uintptr(stubLen)
+ if offset := mapLen % usermem.PageSize; offset != 0 {
+ mapLen += usermem.PageSize - offset
+ }
+
+ for stubStart > 0 {
+ // Map the target address for the stub.
+ //
+ // We don't use FIXED here because we don't want to unmap
+ // something that may have been there already. We just walk
+ // down the address space until we find a place where the stub
+ // can be placed.
+ addr, _, errno := syscall.RawSyscall6(
+ syscall.SYS_MMAP,
+ stubStart,
+ mapLen,
+ syscall.PROT_WRITE|syscall.PROT_READ,
+ syscall.MAP_PRIVATE|syscall.MAP_ANONYMOUS,
+ 0 /* fd */, 0 /* offset */)
+ if addr != stubStart || errno != 0 {
+ if addr != 0 {
+ // Unmap the region we've mapped accidentally.
+ syscall.RawSyscall(syscall.SYS_MUNMAP, addr, mapLen, 0)
+ }
+
+ // Attempt to begin at a lower address.
+ stubStart -= uintptr(usermem.PageSize)
+ continue
+ }
+
+ // Copy the stub to the address.
+ targetSlice := unsafeSlice(addr, stubLen)
+ copy(targetSlice, stubSlice)
+
+ // Make the stub executable.
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_MPROTECT,
+ stubStart,
+ mapLen,
+ syscall.PROT_EXEC|syscall.PROT_READ); errno != 0 {
+ panic("mprotect failed: " + errno.Error())
+ }
+
+ // Set the end.
+ stubEnd = stubStart + mapLen
+ return
+ }
+
+ // This will happen only if we exhaust the entire address
+ // space, and it will take a long, long time.
+ panic("failed to map stub")
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
new file mode 100644
index 000000000..83b43057f
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -0,0 +1,610 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ptrace
+
+import (
+ "fmt"
+ "os"
+ "runtime"
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// globalPool exists to solve two distinct problems:
+//
+// 1) Subprocesses can't always be killed properly (see Release).
+//
+// 2) Any seccomp filters that have been installed will apply to subprocesses
+// created here. Therefore we use the intermediary (master), which is created
+// on initialization of the platform.
+var globalPool struct {
+ mu sync.Mutex
+ master *subprocess
+ available []*subprocess
+}
+
+// thread is a traced thread; it is a thread identifier.
+//
+// This is a convenience type for defining ptrace operations.
+type thread struct {
+ tgid int32
+ tid int32
+ cpu uint32
+
+ // initRegs are the initial registers for the first thread.
+ //
+ // These are used for the register set for system calls.
+ initRegs syscall.PtraceRegs
+}
+
+// threadPool is a collection of threads.
+type threadPool struct {
+ // mu protects below.
+ mu sync.Mutex
+
+ // threads is the collection of threads.
+ //
+ // This map is indexed by system TID (the calling thread), which will
+ // be the tracer for the given *thread, and therefore capable of using
+ // relevant ptrace calls.
+ threads map[int32]*thread
+}
+
+// lookupOrCreate looks up a given thread or creates one.
+//
+// newThread will generally be subprocess.newThread.
+//
+// Precondition: the runtime OS thread must be locked.
+func (tp *threadPool) lookupOrCreate(currentTID int32, newThread func() *thread) *thread {
+ tp.mu.Lock()
+ t, ok := tp.threads[currentTID]
+ if !ok {
+ // Before creating a new thread, see if we can find a thread
+ // whose system tid has disappeared.
+ //
+ // TODO(b/77216482): Other parts of this package depend on
+ // threads never exiting.
+ for origTID, t := range tp.threads {
+ // Signal zero is an easy existence check.
+ if err := syscall.Tgkill(syscall.Getpid(), int(origTID), 0); err != nil {
+ // This thread has been abandoned; reuse it.
+ delete(tp.threads, origTID)
+ tp.threads[currentTID] = t
+ tp.mu.Unlock()
+ return t
+ }
+ }
+
+ // Create a new thread.
+ t = newThread()
+ tp.threads[currentTID] = t
+ }
+ tp.mu.Unlock()
+ return t
+}
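The signal-zero probe used above is the standard liveness check: tgkill with
signal 0 performs existence and permission checks without delivering anything.
In isolation:

    // err == nil means the thread identified by tid still exists.
    alive := syscall.Tgkill(syscall.Getpid(), int(tid), 0) == nil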
+
+// subprocess is a collection of threads being traced.
+type subprocess struct {
+ platform.NoAddressSpaceIO
+
+ // requests is used to signal creation of new threads.
+ requests chan chan *thread
+
+ // sysemuThreads are reserved for emulation.
+ sysemuThreads threadPool
+
+ // syscallThreads are reserved for syscalls (except clone, which is
+ // handled in the dedicated goroutine corresponding to requests above).
+ syscallThreads threadPool
+
+ // mu protects the following fields.
+ mu sync.Mutex
+
+ // contexts is the set of contexts for which it's possible that
+ // context.lastFaultSP == this subprocess.
+ contexts map[*context]struct{}
+}
+
+// newSubprocess returns a usable subprocess.
+//
+// This will either be a newly created subprocess, or one from the global pool.
+// The create function will be called only in the former case, and is
+// guaranteed to run with the runtime thread locked.
+func newSubprocess(create func() (*thread, error)) (*subprocess, error) {
+ // See Release.
+ globalPool.mu.Lock()
+ if len(globalPool.available) > 0 {
+ sp := globalPool.available[len(globalPool.available)-1]
+ globalPool.available = globalPool.available[:len(globalPool.available)-1]
+ globalPool.mu.Unlock()
+ return sp, nil
+ }
+ globalPool.mu.Unlock()
+
+ // The following goroutine is responsible for creating the first traced
+ // thread, and responding to requests to make additional threads in the
+ // traced process. The process will be killed and reaped when the
+ // request channel is closed, which happens in Release below.
+ errChan := make(chan error)
+ requests := make(chan chan *thread)
+ go func() { // S/R-SAFE: Platform-related.
+ runtime.LockOSThread()
+ defer runtime.UnlockOSThread()
+
+ // Initialize the first thread.
+ firstThread, err := create()
+ if err != nil {
+ errChan <- err
+ return
+ }
+
+ // Ready to handle requests.
+ errChan <- nil
+
+ // Wait for requests to create threads.
+ for r := range requests {
+ t, err := firstThread.clone()
+ if err != nil {
+ // Should not happen: not recoverable.
+ panic(fmt.Sprintf("error initializing first thread: %v", err))
+ }
+
+ // Since the new thread was created with
+ // clone(CLONE_PTRACE), it will begin execution with
+ // SIGSTOP pending and with this thread as its tracer.
+ // (Hopefully nobody tgkilled it with a signal <
+ // SIGSTOP before the SIGSTOP was delivered, in which
+ // case that signal would be delivered before SIGSTOP.)
+ if sig := t.wait(stopped); sig != syscall.SIGSTOP {
+ panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
+ }
+
+ // Detach the thread.
+ t.detach()
+
+ // Return the thread.
+ r <- t
+ }
+
+ // Requests should never be closed.
+ panic("unreachable")
+ }()
+
+ // Wait until error or readiness.
+ if err := <-errChan; err != nil {
+ return nil, err
+ }
+
+ // Ready.
+ sp := &subprocess{
+ requests: requests,
+ sysemuThreads: threadPool{
+ threads: make(map[int32]*thread),
+ },
+ syscallThreads: threadPool{
+ threads: make(map[int32]*thread),
+ },
+ contexts: make(map[*context]struct{}),
+ }
+
+ sp.unmap()
+ return sp, nil
+}
+
+// unmap unmaps non-stub regions of the process.
+//
+// This will panic on failure (which should never happen).
+func (s *subprocess) unmap() {
+ s.Unmap(0, uint64(stubStart))
+ if maximumUserAddress != stubEnd {
+ s.Unmap(usermem.Addr(stubEnd), uint64(maximumUserAddress-stubEnd))
+ }
+}
+
+// Release kills the subprocess.
+//
+// Just kidding! We can't safely coordinate the detaching of all the
+// tracees (since the tracers are random runtime threads, and the process
+// won't exit until the tracers have been notified).
+//
+// Therefore we simply unmap everything in the subprocess and return it to the
+// globalPool. This has the added benefit of reducing creation time for new
+// subprocesses.
+func (s *subprocess) Release() {
+ go func() { // S/R-SAFE: Platform.
+ s.unmap()
+ globalPool.mu.Lock()
+ globalPool.available = append(globalPool.available, s)
+ globalPool.mu.Unlock()
+ }()
+}
+
+// newThread creates a new traced thread.
+//
+// Precondition: the OS thread must be locked.
+func (s *subprocess) newThread() *thread {
+ // Ask the first thread to create a new one.
+ r := make(chan *thread)
+ s.requests <- r
+ t := <-r
+
+ // Attach the subprocess to this one.
+ t.attach()
+
+ // Return the new thread, which is now bound.
+ return t
+}
+
+// attach attaches to the thread.
+func (t *thread) attach() {
+ if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_ATTACH, uintptr(t.tid), 0); errno != 0 {
+ panic(fmt.Sprintf("unable to attach: %v", errno))
+ }
+
+ // PTRACE_ATTACH sends SIGSTOP, and wakes the tracee if it was already
+ // stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of
+ // newSubprocess), so we always expect to see signal-delivery-stop with
+ // SIGSTOP.
+ if sig := t.wait(stopped); sig != syscall.SIGSTOP {
+ panic(fmt.Sprintf("wait failed: expected SIGSTOP, got %v", sig))
+ }
+
+ // Initialize options.
+ t.init()
+
+ // Grab registers.
+ //
+ // Note that we adjust the current register RIP value to be just before
+ // the current system call executed. This depends on the definition of
+ // the stub itself.
+ if err := t.getRegs(&t.initRegs); err != nil {
+ panic(fmt.Sprintf("ptrace get regs failed: %v", err))
+ }
+ t.initRegs.Rip -= initRegsRipAdjustment
+}
+
+// detach detaches from the thread.
+//
+// Because the SIGSTOP is not suppressed, the thread will enter group-stop.
+func (t *thread) detach() {
+ if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(syscall.SIGSTOP), 0, 0); errno != 0 {
+ panic(fmt.Sprintf("can't detach new clone: %v", errno))
+ }
+}
+
+// waitOutcome is used for wait below.
+type waitOutcome int
+
+const (
+ // stopped indicates that the process was stopped.
+ stopped waitOutcome = iota
+
+ // killed indicates that the process was killed.
+ killed
+)
+
+// wait waits for a stop event.
+//
+// Precondition: outcome is a valid waitOutcome.
+func (t *thread) wait(outcome waitOutcome) syscall.Signal {
+ var status syscall.WaitStatus
+
+ for {
+ r, err := syscall.Wait4(int(t.tid), &status, syscall.WALL|syscall.WUNTRACED, nil)
+ if err == syscall.EINTR || err == syscall.EAGAIN {
+ // Wait was interrupted; wait again.
+ continue
+ } else if err != nil {
+ panic(fmt.Sprintf("ptrace wait failed: %v", err))
+ }
+ if int(r) != int(t.tid) {
+ panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid))
+ }
+ switch outcome {
+ case stopped:
+ if !status.Stopped() {
+ panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
+ }
+ stopSig := status.StopSignal()
+ if stopSig == 0 {
+ continue // Spurious stop.
+ }
+ if stopSig == syscall.SIGTRAP {
+ // Re-encode the trap cause the way it's expected.
+ return stopSig | syscall.Signal(status.TrapCause()<<8)
+ }
+ // Not a trap signal.
+ return stopSig
+ case killed:
+ if !status.Exited() && !status.Signaled() {
+ panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
+ }
+ return syscall.Signal(status.ExitStatus())
+ default:
+ // Should not happen.
+ panic(fmt.Sprintf("unknown outcome: %v", outcome))
+ }
+ }
+}
+
+// destroy kills the thread.
+//
+// Note that this should not be used in the general case; the death of threads
+// will typically cause the death of the parent. This is a utility method for
+// manually created threads.
+func (t *thread) destroy() {
+ t.detach()
+ syscall.Tgkill(int(t.tgid), int(t.tid), syscall.Signal(syscall.SIGKILL))
+ t.wait(killed)
+}
+
+// init initializes trace options.
+func (t *thread) init() {
+ // Set our TRACESYSGOOD option to differentiate real SIGTRAPs from syscall traps.
+ _, _, errno := syscall.RawSyscall6(
+ syscall.SYS_PTRACE,
+ syscall.PTRACE_SETOPTIONS,
+ uintptr(t.tid),
+ 0,
+ syscall.PTRACE_O_TRACESYSGOOD,
+ 0, 0)
+ if errno != 0 {
+ panic(fmt.Sprintf("ptrace set options failed: %v", errno))
+ }
+}
+
+// syscall executes a system call cycle in the traced context.
+//
+// This is _not_ for use by application system calls, rather it is for use when
+// a system call must be injected into the remote context (e.g. mmap, munmap).
+// Note that clones are handled separately.
+func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
+ // Set registers.
+ if err := t.setRegs(regs); err != nil {
+ panic(fmt.Sprintf("ptrace set regs failed: %v", err))
+ }
+
+ for {
+ // Execute the syscall instruction.
+ if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0); errno != 0 {
+ panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
+ }
+
+ sig := t.wait(stopped)
+ if sig == (syscallEvent | syscall.SIGTRAP) {
+ // Reached syscall-enter-stop.
+ break
+ } else {
+ // Some other signal caused a thread stop; ignore.
+ continue
+ }
+ }
+
+ // Complete the actual system call.
+ if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0); errno != 0 {
+ panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
+ }
+
+ // Wait for syscall-exit-stop. "[Signal-delivery-stop] never happens
+ // between syscall-enter-stop and syscall-exit-stop; it happens *after*
+ // syscall-exit-stop.)" - ptrace(2), "Syscall-stops"
+ if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) {
+ panic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig))
+ }
+
+ // Grab registers.
+ if err := t.getRegs(regs); err != nil {
+ panic(fmt.Sprintf("ptrace get regs failed: %v", err))
+ }
+
+ return syscallReturnValue(regs)
+}
+
+// syscallIgnoreInterrupt ignores interrupts on the system call thread and
+// restarts the syscall if the kernel indicates that should happen.
+func (t *thread) syscallIgnoreInterrupt(
+ initRegs *syscall.PtraceRegs,
+ sysno uintptr,
+ args ...arch.SyscallArgument) (uintptr, error) {
+ for {
+ regs := createSyscallRegs(initRegs, sysno, args...)
+ rval, err := t.syscall(&regs)
+ switch err {
+ case ERESTARTSYS:
+ continue
+ case ERESTARTNOINTR:
+ continue
+ case ERESTARTNOHAND:
+ continue
+ default:
+ return rval, err
+ }
+ }
+}
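For example, a caller holding a stopped thread could inject getpid into the
subprocess (a hypothetical injection; any syscall number works the same way):

    pid, err := t.syscallIgnoreInterrupt(&t.initRegs, syscall.SYS_GETPID)
    if err != nil {
        panic(fmt.Sprintf("injected getpid failed: %v", err))
    }
    _ = pid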
+
+// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
+func (t *thread) NotifyInterrupt() {
+ syscall.Tgkill(int(t.tgid), int(t.tid), syscall.Signal(platform.SignalInterrupt))
+}
+
+// switchToApp is called from the main SwitchToApp entrypoint.
+//
+// This function returns true on a system call, false on a signal.
+func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
+ // Lock the thread for ptrace operations.
+ runtime.LockOSThread()
+ defer runtime.UnlockOSThread()
+
+ // Extract floating point state.
+ fpState := ac.FloatingPointData()
+ fpLen, _ := ac.FeatureSet().ExtendedStateSize()
+ useXsave := ac.FeatureSet().UseXsave()
+
+ // Grab our thread from the pool.
+ currentTID := int32(procid.Current())
+ t := s.sysemuThreads.lookupOrCreate(currentTID, s.newThread)
+
+ // Reset necessary registers.
+ regs := &ac.StateData().Regs
+ t.resetSysemuRegs(regs)
+
+ // Check for interrupts, and ensure that future interrupts will signal t.
+ if !c.interrupt.Enable(t) {
+ // Pending interrupt; simulate.
+ c.signalInfo = arch.SignalInfo{Signo: int32(platform.SignalInterrupt)}
+ return false
+ }
+ defer c.interrupt.Disable()
+
+ // Ensure that the CPU set is bound appropriately; this makes the
+ // emulation below several times faster, presumably by avoiding
+ // interprocessor wakeups and by simplifying the schedule.
+ t.bind()
+
+ // Set registers.
+ if err := t.setRegs(regs); err != nil {
+ panic(fmt.Sprintf("ptrace set regs (%+v) failed: %v", regs, err))
+ }
+ if err := t.setFPRegs(fpState, uint64(fpLen), useXsave); err != nil {
+ panic(fmt.Sprintf("ptrace set fpregs (%+v) failed: %v", fpState, err))
+ }
+
+ for {
+ // Start running until the next system call.
+ if isSingleStepping(regs) {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_PTRACE,
+ syscall.PTRACE_SYSEMU_SINGLESTEP,
+ uintptr(t.tid), 0); errno != 0 {
+ panic(fmt.Sprintf("ptrace sysemu failed: %v", errno))
+ }
+ } else {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_PTRACE,
+ syscall.PTRACE_SYSEMU,
+ uintptr(t.tid), 0); errno != 0 {
+ panic(fmt.Sprintf("ptrace sysemu failed: %v", errno))
+ }
+ }
+
+ // Wait for the syscall-enter stop.
+ sig := t.wait(stopped)
+
+ // Refresh all registers.
+ if err := t.getRegs(regs); err != nil {
+ panic(fmt.Sprintf("ptrace get regs failed: %v", err))
+ }
+ if err := t.getFPRegs(fpState, uint64(fpLen), useXsave); err != nil {
+ panic(fmt.Sprintf("ptrace get fpregs failed: %v", err))
+ }
+
+ // Is it a system call?
+ if sig == (syscallEvent | syscall.SIGTRAP) {
+ // Ensure registers are sane.
+ updateSyscallRegs(regs)
+ return true
+ } else if sig == syscall.SIGSTOP {
+ // SIGSTOP was delivered to another thread in the same thread
+ // group, which initiated another group stop. Just ignore it.
+ continue
+ }
+
+ // Grab signal information.
+ if err := t.getSignalInfo(&c.signalInfo); err != nil {
+ // Should never happen.
+ panic(fmt.Sprintf("ptrace get signal info failed: %v", err))
+ }
+
+ // We have a signal. We verify however, that the signal was
+ // either delivered from the kernel or from this process. We
+ // don't respect other signals.
+ if c.signalInfo.Code > 0 {
+ // The signal was generated by the kernel. We inspect
+ // the signal information, and may patch it in order to
+ // faciliate vsyscall emulation. See patchSignalInfo.
+ patchSignalInfo(regs, &c.signalInfo)
+ return false
+ } else if c.signalInfo.Code <= 0 && c.signalInfo.Pid() == int32(os.Getpid()) {
+ // The signal was generated by this process. That means
+ // that it was an interrupt or something else that we
+ // should bail for. Note that we ignore signals
+ // generated by other processes.
+ return false
+ }
+ }
+}
+
+// syscall executes the given system call without handling interruptions.
+func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) {
+ // Grab a thread.
+ runtime.LockOSThread()
+ defer runtime.UnlockOSThread()
+ currentTID := int32(procid.Current())
+ t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
+
+ return t.syscallIgnoreInterrupt(&t.initRegs, sysno, args...)
+}
+
+// MapFile implements platform.AddressSpace.MapFile.
+func (s *subprocess) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error {
+ var flags int
+ if precommit {
+ flags |= syscall.MAP_POPULATE
+ }
+ _, err := s.syscall(
+ syscall.SYS_MMAP,
+ arch.SyscallArgument{Value: uintptr(addr)},
+ arch.SyscallArgument{Value: uintptr(fr.Length())},
+ arch.SyscallArgument{Value: uintptr(at.Prot())},
+ arch.SyscallArgument{Value: uintptr(flags | syscall.MAP_SHARED | syscall.MAP_FIXED)},
+ arch.SyscallArgument{Value: uintptr(f.FD())},
+ arch.SyscallArgument{Value: uintptr(fr.Start)})
+ return err
+}
+
+// Unmap implements platform.AddressSpace.Unmap.
+func (s *subprocess) Unmap(addr usermem.Addr, length uint64) {
+ ar, ok := addr.ToRange(length)
+ if !ok {
+ panic(fmt.Sprintf("addr %#x + length %#x overflows", addr, length))
+ }
+ s.mu.Lock()
+ for c := range s.contexts {
+ c.mu.Lock()
+ if c.lastFaultSP == s && ar.Contains(c.lastFaultAddr) {
+ // Forget the last fault so that if c faults again, the fault isn't
+ // incorrectly reported as a write fault. If this is being called
+ // due to munmap() of the corresponding vma, handling of the second
+ // fault will fail anyway.
+ c.lastFaultSP = nil
+ delete(s.contexts, c)
+ }
+ c.mu.Unlock()
+ }
+ s.mu.Unlock()
+ _, err := s.syscall(
+ syscall.SYS_MUNMAP,
+ arch.SyscallArgument{Value: uintptr(addr)},
+ arch.SyscallArgument{Value: uintptr(length)})
+ if err != nil {
+ // We never expect this to happen.
+ panic(fmt.Sprintf("munmap(%x, %x)) failed: %v", addr, length, err))
+ }
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
new file mode 100644
index 000000000..77a0e908f
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -0,0 +1,104 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package ptrace
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+)
+
+const (
+ // maximumUserAddress is the largest possible user address.
+ maximumUserAddress = 0x7ffffffff000
+
+ // initRegsRipAdjustment is the size of the syscall instruction.
+ initRegsRipAdjustment = 2
+)
+
+// Linux kernel errnos which "should never be seen by user programs" but which
+// are revealed by ptrace syscall-exit tracing.
+//
+// These constants are used in subprocess.go.
+const (
+ ERESTARTSYS = syscall.Errno(512)
+ ERESTARTNOINTR = syscall.Errno(513)
+ ERESTARTNOHAND = syscall.Errno(514)
+)
+
+// resetSysemuRegs sets up emulation registers.
+//
+// This should be called prior to calling sysemu.
+func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) {
+ regs.Cs = t.initRegs.Cs
+ regs.Ss = t.initRegs.Ss
+ regs.Ds = t.initRegs.Ds
+ regs.Es = t.initRegs.Es
+ regs.Fs = t.initRegs.Fs
+ regs.Gs = t.initRegs.Gs
+}
+
+// createSyscallRegs sets up syscall registers.
+//
+// This should be called to generate registers for a system call.
+func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs {
+ // Copy initial registers.
+ regs := *initRegs
+
+ // Set our syscall number.
+ regs.Rax = uint64(sysno)
+ if len(args) >= 1 {
+ regs.Rdi = args[0].Uint64()
+ }
+ if len(args) >= 2 {
+ regs.Rsi = args[1].Uint64()
+ }
+ if len(args) >= 3 {
+ regs.Rdx = args[2].Uint64()
+ }
+ if len(args) >= 4 {
+ regs.R10 = args[3].Uint64()
+ }
+ if len(args) >= 5 {
+ regs.R8 = args[4].Uint64()
+ }
+ if len(args) >= 6 {
+ regs.R9 = args[5].Uint64()
+ }
+
+ return regs
+}
+
+// isSingleStepping determines if the registers indicate single-stepping.
+func isSingleStepping(regs *syscall.PtraceRegs) bool {
+ return (regs.Eflags & arch.X86TrapFlag) != 0
+}
+
+// updateSyscallRegs updates registers after finishing sysemu.
+func updateSyscallRegs(regs *syscall.PtraceRegs) {
+ // Ptrace puts -ENOSYS in rax on syscall-enter-stop.
+ regs.Rax = regs.Orig_rax
+}
+
+// syscallReturnValue extracts a sensible return from registers.
+func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) {
+ rval := int64(regs.Rax)
+ if rval < 0 {
+ return 0, syscall.Errno(-rval)
+ }
+ return uintptr(rval), nil
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
new file mode 100644
index 000000000..2c07b4ac3
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -0,0 +1,338 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package ptrace
+
+import (
+ "fmt"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/seccomp"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
+)
+
+const syscallEvent syscall.Signal = 0x80
+
+// probeSeccomp returns true iff seccomp is run after ptrace notifications,
+// which is generally the case for kernel version >= 4.8. This check is
+// dynamic because the behavior has been backported to some older kernels.
+//
+// See createStub for more information.
+//
+// Precondition: the runtime OS thread must be locked.
+func probeSeccomp() bool {
+ // Create a completely new, destroyable process.
+ t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO)
+ if err != nil {
+ panic(fmt.Sprintf("seccomp probe failed: %v", err))
+ }
+ defer t.destroy()
+
+ // Set registers to the yield system call. This call is not allowed
+ // by the filters specified in the attachedThread function.
+ regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
+ if err := t.setRegs(&regs); err != nil {
+ panic(fmt.Sprintf("ptrace set regs failed: %v", err))
+ }
+
+ for {
+ // Attempt an emulation.
+ if _, _, errno := syscall.RawSyscall(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0); errno != 0 {
+ panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
+ }
+
+ sig := t.wait(stopped)
+ if sig == (syscallEvent | syscall.SIGTRAP) {
+ // Did the seccomp errno hook already run? This would
+ // indicate that seccomp is first in line and we're
+ // less than 4.8.
+ if err := t.getRegs(&regs); err != nil {
+ panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
+ }
+ if _, err := syscallReturnValue(&regs); err == nil {
+ // The seccomp errno mode ran first, and reset
+ // the error in the registers.
+ return false
+ }
+ // The seccomp hook did not run yet, and therefore it
+ // is safe to use RET_KILL mode for dispatched calls.
+ return true
+ }
+ }
+}
+
+// patchSignalInfo patches the signal info to account for hitting the seccomp
+// filters from vsyscall emulation, specified below. We allow for SIGSYS as a
+// synchronous trap, but patch the structure to appear like a SIGSEGV with the
+// Rip as the faulting address.
+//
+// Note that this should only be called after verifying that the signalInfo has
+// been generated by the kernel.
+func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
+ if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
+ signalInfo.Signo = int32(linux.SIGSEGV)
+
+ // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered
+ // with the si_call_addr field pointing to the current RIP. This field
+ // aligns with the si_addr field for a SIGSEGV, so we don't need to touch
+ // anything there. We do need to unwind emulation however, so we set the
+ // instruction pointer to the faulting value, and "unpop" the stack.
+ regs.Rip = signalInfo.Addr()
+ regs.Rsp -= 8
+ }
+}
+
+// createStub creates a fresh stub process.
+//
+// Precondition: the runtime OS thread must be locked.
+func createStub() (*thread, error) {
+ // The exact interactions of ptrace and seccomp are complex, and
+ // changed in recent kernel versions. Before commit 93e35efb8de45, the
+ // seccomp check is done before the ptrace emulation check. This means
+ // that any calls not matching this list will trigger the seccomp
+ // default action instead of notifying ptrace.
+ //
+ // After commit 93e35efb8de45, the seccomp check is done after the
+ // ptrace emulation check. This simplifies using SYSEMU, since seccomp
+ // will never run for emulation. Seccomp will only run for injected
+ // system calls, and thus we can use RET_KILL as our violation action.
+ var defaultAction linux.BPFAction
+ if probeSeccomp() {
+ log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)")
+ defaultAction = linux.SECCOMP_RET_KILL_THREAD
+ } else {
+ // We must rely on SYSEMU behavior; tracing with SYSEMU is broken.
+ log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)")
+ defaultAction = linux.SECCOMP_RET_ALLOW
+ }
+
+ // When creating the new child process, we specify SIGKILL as the
+ // signal to deliver when the child exits. We never expect a subprocess
+ // to exit; they are pooled and reused. This is done to ensure that if
+ // a subprocess is OOM-killed, this process (and all other stubs,
+ // transitively) will be killed as well. It's simply not possible to
+ // safely handle a single stub getting killed: the exact state of
+ // execution is unknown and not recoverable.
+ return attachedThread(uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, defaultAction)
+}
+
+// attachedThread returns a new attached thread.
+//
+// Precondition: the runtime OS thread must be locked.
+func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) {
+ // Create a BPF program that allows only the system calls needed by the
+ // stub and all its children. This is used to create child stubs
+ // (below), so we must include the ability to fork, but otherwise lock
+ // down available calls only to what is needed.
+ rules := []seccomp.RuleSet{
+ // Rules for trapping vsyscall access.
+ seccomp.RuleSet{
+ Rules: seccomp.SyscallRules{
+ syscall.SYS_GETTIMEOFDAY: {},
+ syscall.SYS_TIME: {},
+ 309: {}, // SYS_GETCPU.
+ },
+ Action: linux.SECCOMP_RET_TRAP,
+ Vsyscall: true,
+ },
+ }
+ if defaultAction != linux.SECCOMP_RET_ALLOW {
+ rules = append(rules, seccomp.RuleSet{
+ Rules: seccomp.SyscallRules{
+ syscall.SYS_CLONE: []seccomp.Rule{
+ // Allow creation of new subprocesses (used by the master).
+ {seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)},
+ // Allow creation of new threads within a single address space (used by address spaces).
+ {seccomp.AllowValue(
+ syscall.CLONE_FILES |
+ syscall.CLONE_FS |
+ syscall.CLONE_SIGHAND |
+ syscall.CLONE_THREAD |
+ syscall.CLONE_PTRACE |
+ syscall.CLONE_VM)},
+ },
+
+ // For the initial process creation.
+ syscall.SYS_WAIT4: {},
+ syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+ {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+ },
+ syscall.SYS_EXIT: {},
+
+ // For the stub prctl dance (all).
+ syscall.SYS_PRCTL: []seccomp.Rule{
+ {seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)},
+ },
+ syscall.SYS_GETPPID: {},
+
+ // For the stub to stop itself (all).
+ syscall.SYS_GETPID: {},
+ syscall.SYS_KILL: []seccomp.Rule{
+ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)},
+ },
+
+ // Injected to support the address space operations.
+ syscall.SYS_MMAP: {},
+ syscall.SYS_MUNMAP: {},
+ },
+ Action: linux.SECCOMP_RET_ALLOW,
+ })
+ }
+ instrs, err := seccomp.BuildProgram(rules, defaultAction)
+ if err != nil {
+ return nil, err
+ }
+
+ // Declare all variables up front in order to ensure that there's no
+ // need for allocations between beforeFork & afterFork.
+ var (
+ pid uintptr
+ ppid uintptr
+ errno syscall.Errno
+ )
+
+ // Remember the current ppid for the pdeathsig race.
+ ppid, _, _ = syscall.RawSyscall(syscall.SYS_GETPID, 0, 0, 0)
+
+ // Among other things, beforeFork masks all signals.
+ beforeFork()
+
+ // Do the clone.
+ pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, flags, 0, 0, 0, 0, 0)
+ if errno != 0 {
+ afterFork()
+ return nil, errno
+ }
+
+ // Is this the parent?
+ if pid != 0 {
+ // Among other things, restore signal mask.
+ afterFork()
+
+ // Initialize the first thread.
+ t := &thread{
+ tgid: int32(pid),
+ tid: int32(pid),
+ cpu: ^uint32(0),
+ }
+ if sig := t.wait(stopped); sig != syscall.SIGSTOP {
+ return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
+ }
+ t.attach()
+
+ return t, nil
+ }
+
+ // Move the stub to a new session (and thus a new process group). This
+ // prevents the stub from getting PTY job control signals intended only
+ // for the sentry process. We must call this before restoring signal
+ // mask.
+ if _, _, errno := syscall.RawSyscall(syscall.SYS_SETSID, 0, 0, 0); errno != 0 {
+ syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
+ }
+
+ // afterForkInChild resets all signals to their default dispositions
+ // and restores the signal mask to its pre-fork state.
+ afterForkInChild()
+
+ // Explicitly unmask all signals to ensure that the tracer can see
+ // them.
+ if errno := unmaskAllSignals(); errno != 0 {
+ syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
+ }
+
+ // Set an aggressive BPF filter for the stub and all its children. See
+ // the description of the BPF program built above.
+ if errno := seccomp.SetFilter(instrs); errno != 0 {
+ syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
+ }
+
+ // Enable cpuid-faulting; this may fail on older kernels or hardware,
+ // so we just disregard the result. Host CPUID will be enabled.
+ syscall.RawSyscall(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0)
+
+ // Call the stub; should not return.
+ stubCall(stubStart, ppid)
+ panic("unreachable")
+}
+
+// createStub creates a stub process as a child of an existing subprocess.
+//
+// Precondition: the runtime OS thread must be locked.
+func (s *subprocess) createStub() (*thread, error) {
+ // There's no need to lock the runtime thread here, as this can only be
+ // called from a context that is already locked.
+ currentTID := int32(procid.Current())
+ t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
+
+ // Pass the expected PPID to the child via R15.
+ regs := t.initRegs
+ regs.R15 = uint64(t.tgid)
+
+ // Call fork in a subprocess.
+ //
+ // The new child must set up PDEATHSIG to ensure it dies if this
+ // process dies. Since this process could die at any time, this cannot
+ // be done via instrumentation from here.
+ //
+ // Instead, we create the child untraced, which will do the PDEATHSIG
+ // setup and then SIGSTOP itself for our attach below.
+ //
+ // See above re: SIGKILL.
+ pid, err := t.syscallIgnoreInterrupt(
+ &regs,
+ syscall.SYS_CLONE,
+ arch.SyscallArgument{Value: uintptr(syscall.SIGKILL | syscall.CLONE_FILES)},
+ arch.SyscallArgument{Value: 0},
+ arch.SyscallArgument{Value: 0},
+ arch.SyscallArgument{Value: 0},
+ arch.SyscallArgument{Value: 0},
+ arch.SyscallArgument{Value: 0})
+ if err != nil {
+ return nil, err
+ }
+
+ // Wait for child to enter group-stop, so we don't stop its
+ // bootstrapping work with t.attach below.
+ //
+ // We unfortunately don't have a handy part of memory to write the wait
+ // status. If the wait succeeds, we'll assume that it was the SIGSTOP.
+ // If the child actually exited, the attach below will fail.
+ _, err = t.syscallIgnoreInterrupt(
+ &t.initRegs,
+ syscall.SYS_WAIT4,
+ arch.SyscallArgument{Value: uintptr(pid)},
+ arch.SyscallArgument{Value: 0},
+ arch.SyscallArgument{Value: syscall.WALL | syscall.WUNTRACED},
+ arch.SyscallArgument{Value: 0},
+ arch.SyscallArgument{Value: 0},
+ arch.SyscallArgument{Value: 0})
+ if err != nil {
+ return nil, err
+ }
+
+ childT := &thread{
+ tgid: int32(pid),
+ tid: int32(pid),
+ cpu: ^uint32(0),
+ }
+ childT.attach()
+
+ return childT, nil
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go
new file mode 100644
index 000000000..1bf7eab28
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go
@@ -0,0 +1,109 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64,linux
+
+package ptrace
+
+import (
+ "sync"
+ "sync/atomic"
+ "syscall"
+ "unsafe"
+
+ "golang.org/x/sys/unix"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+)
+
+// maskPool contains reusable CPU masks for setting affinity. Unfortunately,
+// runtime.NumCPU doesn't actually record the number of CPUs on the system; it
+// just records the number of CPUs available in the scheduler affinity set at
+// startup. This may a) change over time and b) give a number far lower than
+// the maximum indexable CPU. To prevent lots of allocation in the hot path, we
+// use a pool to store large masks that we can reuse during bind.
+var maskPool = sync.Pool{
+ New: func() interface{} {
+ const maxCPUs = 1024 // Not a hard limit; see below.
+ return make([]uintptr, maxCPUs/64)
+ },
+}
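+
+// On amd64, where uintptr is 64 bits, each pooled mask is 1024/64 = 16 words
+// (128 bytes), enough to address CPUs 0-1023 without reallocation in setCPU.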
+
+// unmaskAllSignals unmasks all signals on the current thread.
+//
+//go:nosplit
+func unmaskAllSignals() syscall.Errno {
+ var set linux.SignalSet
+ _, _, errno := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_SETMASK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0)
+ return errno
+}
+
+// getCPU gets the current CPU.
+//
+// Precondition: the current runtime thread should be locked.
+func getCPU() (uint32, error) {
+ var cpu uintptr
+ if _, _, errno := syscall.RawSyscall(
+ unix.SYS_GETCPU,
+ uintptr(unsafe.Pointer(&cpu)),
+ 0, 0); errno != 0 {
+ return 0, errno
+ }
+ return uint32(cpu), nil
+}
+
+// setCPU sets the CPU affinity.
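+//
+// As a worked example of the indexing below: cpu 70 sets bit 70%64 = 6 in
+// word 70/64 = 1 of the mask.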
+func (t *thread) setCPU(cpu uint32) error {
+ mask := maskPool.Get().([]uintptr)
+ n := int(cpu / 64)
+ v := uintptr(1 << uintptr(cpu%64))
+ if n >= len(mask) {
+ // See maskPool note above. The CPU index exceeds the pooled
+ // mask's capacity, so grow the mask; the larger mask is
+ // returned to the pool below.
+ mask = make([]uintptr, n+1)
+ }
+ mask[n] |= v
+ if _, _, errno := syscall.RawSyscall(
+ unix.SYS_SCHED_SETAFFINITY,
+ uintptr(t.tid),
+ uintptr(len(mask)*8),
+ uintptr(unsafe.Pointer(&mask[0]))); errno != 0 {
+ return errno
+ }
+ mask[n] &^= v
+ maskPool.Put(mask)
+ return nil
+}
+
+// bind attempts to ensure that the thread is on the same CPU as the current
+// thread. This provides no guarantees as it is fundamentally a racy operation:
+// CPU sets may change and we may be rescheduled in the middle of this
+// operation. As a result, no failures are reported.
+//
+// Precondition: the current runtime thread should be locked.
+func (t *thread) bind() {
+ currentCPU, err := getCPU()
+ if err != nil {
+ return
+ }
+ if oldCPU := atomic.SwapUint32(&t.cpu, currentCPU); oldCPU != currentCPU {
+ // Set the affinity on the thread and save the CPU for next
+ // round; we don't expect CPUs to bounce around too frequently.
+ //
+ // (It's worth noting that we could move CPUs between this point
+ // and when the tracee finishes executing. But that would be
+ // roughly the status quo anyways -- we're just maximizing our
+ // chances of colocation, not guaranteeing it.)
+ t.setCPU(currentCPU)
+ }
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
new file mode 100644
index 000000000..b80a3604d
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/subprocess_unsafe.go
@@ -0,0 +1,33 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build go1.12
+// +build !go1.14
+
+// Check go:linkname function signatures when updating Go version.
+
+package ptrace
+
+import (
+ _ "unsafe" // required for go:linkname.
+)
+
+//go:linkname beforeFork syscall.runtime_BeforeFork
+func beforeFork()
+
+//go:linkname afterFork syscall.runtime_AfterFork
+func afterFork()
+
+//go:linkname afterForkInChild syscall.runtime_AfterForkInChild
+func afterForkInChild()
diff --git a/pkg/sentry/platform/ring0/defs_impl.go b/pkg/sentry/platform/ring0/defs_impl.go
new file mode 100755
index 000000000..582553bc7
--- /dev/null
+++ b/pkg/sentry/platform/ring0/defs_impl.go
@@ -0,0 +1,538 @@
+package ring0
+
+import (
+ "syscall"
+
+ "fmt"
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "io"
+ "reflect"
+)
+
+var (
+ // UserspaceSize is the total size of userspace.
+ UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1)
+
+ // MaximumUserAddress is the largest possible user address.
+ MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1)
+
+ // KernelStartAddress is the starting kernel address.
+ KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
+)
+
+// Kernel is a global kernel object.
+//
+// This contains global state, shared by multiple CPUs.
+type Kernel struct {
+ KernelArchState
+}
+
+// Hooks are hooks for kernel functions.
+type Hooks interface {
+ // KernelSyscall is called for kernel system calls.
+ //
+ // Return from this call will restore registers and return to the kernel: the
+ // registers must be modified directly.
+ //
+ // If this function is not provided, a kernel system call results in halt.
+ //
+ // This must be go:nosplit, as this will be on the interrupt stack.
+ // Closures are permitted, as the pointer to the closure frame is not
+ // passed on the stack.
+ KernelSyscall()
+
+ // KernelException handles an exception during kernel execution.
+ //
+ // Return from this call will restore registers and return to the kernel: the
+ // registers must be modified directly.
+ //
+ // If this function is not provided, a kernel exception results in halt.
+ //
+ // This must be go:nosplit, as this will be on the interrupt stack.
+ // Closures are permitted, as the pointer to the closure frame is not
+ // passed on the stack.
+ KernelException(Vector)
+}
+
+// CPU is the per-CPU struct.
+type CPU struct {
+ // self is a self reference.
+ //
+ // This is always guaranteed to be at offset zero.
+ self *CPU
+
+ // kernel is reference to the kernel that this CPU was initialized
+ // with. This reference is kept for garbage collection purposes: CPU
+ // registers may refer to objects within the Kernel object that cannot
+ // be safely freed.
+ kernel *Kernel
+
+ // CPUArchState is architecture-specific state.
+ CPUArchState
+
+ // registers is a set of registers; these may be used on kernel system
+ // calls and exceptions via the Registers function.
+ registers syscall.PtraceRegs
+
+ // hooks are kernel hooks.
+ hooks Hooks
+}
+
+// Registers returns a modifiable reference to the kernel registers.
+//
+// This is explicitly safe to call during KernelException and KernelSyscall.
+//
+//go:nosplit
+func (c *CPU) Registers() *syscall.PtraceRegs {
+ return &c.registers
+}
+
+// SwitchOpts are passed to the Switch function.
+type SwitchOpts struct {
+ // Registers are the user register state.
+ Registers *syscall.PtraceRegs
+
+ // FloatingPointState is a byte pointer where floating point state is
+ // saved and restored.
+ FloatingPointState *byte
+
+ // PageTables are the application page tables.
+ PageTables *pagetables.PageTables
+
+ // Flush indicates that a TLB flush should be forced on switch.
+ Flush bool
+
+ // FullRestore indicates that an iret-based restore should be used.
+ FullRestore bool
+
+ // SwitchArchOpts are architecture-specific options.
+ SwitchArchOpts
+}
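+
+// A minimal sketch of driving a switch (field values are placeholders; see
+// SwitchToUser in kernel_amd64.go):
+//
+//	vector := c.SwitchToUser(SwitchOpts{
+//		Registers:          regs,
+//		FloatingPointState: fpState,
+//		PageTables:         pt,
+//		Flush:              false,
+//		FullRestore:        false,
+//	})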
+
+// Segment indices and Selectors.
+const (
+ // Index into GDT array.
+ _ = iota // Null descriptor first.
+ _ // Reserved (Linux is kernel 32).
+ segKcode // Kernel code (64-bit).
+ segKdata // Kernel data.
+ segUcode32 // User code (32-bit).
+ segUdata // User data.
+ segUcode64 // User code (64-bit).
+ segTss // Task segment descriptor.
+ segTssHi // Upper bits for TSS.
+ segLast // Last segment (terminal, not included).
+)
+
+// Selectors.
+const (
+ Kcode Selector = segKcode << 3
+ Kdata Selector = segKdata << 3
+ Ucode32 Selector = (segUcode32 << 3) | 3
+ Udata Selector = (segUdata << 3) | 3
+ Ucode64 Selector = (segUcode64 << 3) | 3
+ Tss Selector = segTss << 3
+)
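+
+// As a worked example of the encoding above: Kcode is GDT index segKcode (2)
+// shifted left by 3, i.e. 0x10 with RPL 0, while Ucode64 is (6 << 3) | 3 =
+// 0x33, carrying RPL 3 in its low two bits.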
+
+// Standard segments.
+var (
+ UserCodeSegment32 SegmentDescriptor
+ UserDataSegment SegmentDescriptor
+ UserCodeSegment64 SegmentDescriptor
+ KernelCodeSegment SegmentDescriptor
+ KernelDataSegment SegmentDescriptor
+)
+
+// KernelOpts has initialization options for the kernel.
+type KernelOpts struct {
+ // PageTables are the kernel pagetables; this must be provided.
+ PageTables *pagetables.PageTables
+}
+
+// KernelArchState contains architecture-specific state.
+type KernelArchState struct {
+ KernelOpts
+
+ // globalIDT is our set of interrupt gates.
+ globalIDT idt64
+}
+
+// CPUArchState contains CPU-specific arch state.
+type CPUArchState struct {
+ // stack is the stack used for interrupts on this CPU.
+ stack [256]byte
+
+ // errorCode is the error code from the last exception.
+ errorCode uintptr
+
+ // errorType indicates the type of the error code here; it is always
+ // set along with the errorCode value above.
+ //
+ // It will be either 1, which indicates a user error, or 0, which
+ // indicates a kernel error. If ErrorCode (below) returns false (kernel
+ // error), then it cannot provide relevant information about the last
+ // exception.
+ errorType uintptr
+
+ // gdt is the CPU's descriptor table.
+ gdt descriptorTable
+
+ // tss is the CPU's task state.
+ tss TaskState64
+}
+
+// ErrorCode returns the last error code.
+//
+// The returned boolean indicates whether the error code corresponds to the
+// last user error or not. If it does not, then fault information must be
+// ignored. This is generally the result of a kernel fault while servicing a
+// user fault.
+//
+//go:nosplit
+func (c *CPU) ErrorCode() (value uintptr, user bool) {
+ return c.errorCode, c.errorType != 0
+}
+
+// ClearErrorCode resets the error code.
+//
+//go:nosplit
+func (c *CPU) ClearErrorCode() {
+ c.errorCode = 0
+ c.errorType = 1
+}
+
+// SwitchArchOpts are embedded in SwitchOpts.
+type SwitchArchOpts struct {
+ // UserPCID indicates the application PCID to be used on switch,
+ // assuming that PCIDs are supported.
+ //
+ // Per pagetables_x86.go, a zero PCID implies a flush.
+ UserPCID uint16
+
+ // KernelPCID indicates the kernel PCID to be used on return,
+ // assuming that PCIDs are supported.
+ //
+ // Per pagetables_x86.go, a zero PCID implies a flush.
+ KernelPCID uint16
+}
+
+func init() {
+ KernelCodeSegment.setCode64(0, 0, 0)
+ KernelDataSegment.setData(0, 0xffffffff, 0)
+ UserCodeSegment32.setCode64(0, 0, 3)
+ UserDataSegment.setData(0, 0xffffffff, 3)
+ UserCodeSegment64.setCode64(0, 0, 3)
+}
+
+// Emit prints architecture-specific offsets.
+func Emit(w io.Writer) {
+ fmt.Fprintf(w, "// Automatically generated, do not edit.\n")
+
+ c := &CPU{}
+ fmt.Fprintf(w, "\n// CPU offsets.\n")
+ fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack)))
+ fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer())
+
+ fmt.Fprintf(w, "\n// Bits.\n")
+ fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF)
+ fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet)
+
+ fmt.Fprintf(w, "\n// Vectors.\n")
+ fmt.Fprintf(w, "#define DivideByZero 0x%02x\n", DivideByZero)
+ fmt.Fprintf(w, "#define Debug 0x%02x\n", Debug)
+ fmt.Fprintf(w, "#define NMI 0x%02x\n", NMI)
+ fmt.Fprintf(w, "#define Breakpoint 0x%02x\n", Breakpoint)
+ fmt.Fprintf(w, "#define Overflow 0x%02x\n", Overflow)
+ fmt.Fprintf(w, "#define BoundRangeExceeded 0x%02x\n", BoundRangeExceeded)
+ fmt.Fprintf(w, "#define InvalidOpcode 0x%02x\n", InvalidOpcode)
+ fmt.Fprintf(w, "#define DeviceNotAvailable 0x%02x\n", DeviceNotAvailable)
+ fmt.Fprintf(w, "#define DoubleFault 0x%02x\n", DoubleFault)
+ fmt.Fprintf(w, "#define CoprocessorSegmentOverrun 0x%02x\n", CoprocessorSegmentOverrun)
+ fmt.Fprintf(w, "#define InvalidTSS 0x%02x\n", InvalidTSS)
+ fmt.Fprintf(w, "#define SegmentNotPresent 0x%02x\n", SegmentNotPresent)
+ fmt.Fprintf(w, "#define StackSegmentFault 0x%02x\n", StackSegmentFault)
+ fmt.Fprintf(w, "#define GeneralProtectionFault 0x%02x\n", GeneralProtectionFault)
+ fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault)
+ fmt.Fprintf(w, "#define X87FloatingPointException 0x%02x\n", X87FloatingPointException)
+ fmt.Fprintf(w, "#define AlignmentCheck 0x%02x\n", AlignmentCheck)
+ fmt.Fprintf(w, "#define MachineCheck 0x%02x\n", MachineCheck)
+ fmt.Fprintf(w, "#define SIMDFloatingPointException 0x%02x\n", SIMDFloatingPointException)
+ fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException)
+ fmt.Fprintf(w, "#define SecurityException 0x%02x\n", SecurityException)
+ fmt.Fprintf(w, "#define SyscallInt80 0x%02x\n", SyscallInt80)
+ fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall)
+
+ p := &syscall.PtraceRegs{}
+ fmt.Fprintf(w, "\n// Ptrace registers.\n")
+ fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.R13).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.R12).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RBP 0x%02x\n", reflect.ValueOf(&p.Rbp).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RBX 0x%02x\n", reflect.ValueOf(&p.Rbx).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.R11).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.R10).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.R9).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.R8).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RAX 0x%02x\n", reflect.ValueOf(&p.Rax).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RCX 0x%02x\n", reflect.ValueOf(&p.Rcx).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RDX 0x%02x\n", reflect.ValueOf(&p.Rdx).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RSI 0x%02x\n", reflect.ValueOf(&p.Rsi).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RDI 0x%02x\n", reflect.ValueOf(&p.Rdi).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_ORIGRAX 0x%02x\n", reflect.ValueOf(&p.Orig_rax).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RIP 0x%02x\n", reflect.ValueOf(&p.Rip).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_CS 0x%02x\n", reflect.ValueOf(&p.Cs).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_FS 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_GS 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer())
+}
+
+// Useful bits.
+const (
+ _CR0_PE = 1 << 0
+ _CR0_ET = 1 << 4
+ _CR0_AM = 1 << 18
+ _CR0_PG = 1 << 31
+
+ _CR4_PSE = 1 << 4
+ _CR4_PAE = 1 << 5
+ _CR4_PGE = 1 << 7
+ _CR4_OSFXSR = 1 << 9
+ _CR4_OSXMMEXCPT = 1 << 10
+ _CR4_FSGSBASE = 1 << 16
+ _CR4_PCIDE = 1 << 17
+ _CR4_OSXSAVE = 1 << 18
+ _CR4_SMEP = 1 << 20
+
+ _RFLAGS_AC = 1 << 18
+ _RFLAGS_NT = 1 << 14
+ _RFLAGS_IOPL = 3 << 12
+ _RFLAGS_DF = 1 << 10
+ _RFLAGS_IF = 1 << 9
+ _RFLAGS_STEP = 1 << 8
+ _RFLAGS_RESERVED = 1 << 1
+
+ _EFER_SCE = 0x001
+ _EFER_LME = 0x100
+ _EFER_LMA = 0x400
+ _EFER_NX = 0x800
+
+ _MSR_STAR = 0xc0000081
+ _MSR_LSTAR = 0xc0000082
+ _MSR_CSTAR = 0xc0000083
+ _MSR_SYSCALL_MASK = 0xc0000084
+ _MSR_PLATFORM_INFO = 0xce
+ _MSR_MISC_FEATURES = 0x140
+
+ _PLATFORM_INFO_CPUID_FAULT = 1 << 31
+
+ _MISC_FEATURE_CPUID_TRAP = 0x1
+)
+
+const (
+ // KernelFlagsSet should always be set in the kernel.
+ KernelFlagsSet = _RFLAGS_RESERVED
+
+ // UserFlagsSet are always set in userspace.
+ UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF
+
+ // KernelFlagsClear should always be clear in the kernel.
+ KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT
+
+ // UserFlagsClear are always cleared in userspace.
+ UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL
+)
+
+// Vector is an exception vector.
+type Vector uintptr
+
+// Exception vectors.
+const (
+ DivideByZero Vector = iota
+ Debug
+ NMI
+ Breakpoint
+ Overflow
+ BoundRangeExceeded
+ InvalidOpcode
+ DeviceNotAvailable
+ DoubleFault
+ CoprocessorSegmentOverrun
+ InvalidTSS
+ SegmentNotPresent
+ StackSegmentFault
+ GeneralProtectionFault
+ PageFault
+ _
+ X87FloatingPointException
+ AlignmentCheck
+ MachineCheck
+ SIMDFloatingPointException
+ VirtualizationException
+ SecurityException = 0x1e
+ SyscallInt80 = 0x80
+ _NR_INTERRUPTS = SyscallInt80 + 1
+)
+
+// System call vectors.
+const (
+ Syscall Vector = _NR_INTERRUPTS
+)
+
+// VirtualAddressBits returns the number of bits available for virtual addresses.
+//
+// Note that sign-extension semantics apply to the highest order bit.
+//
+// FIXME(b/69382326): This should use the cpuid passed to Init.
+func VirtualAddressBits() uint32 {
+ ax, _, _, _ := cpuid.HostID(0x80000008, 0)
+ return (ax >> 8) & 0xff
+}
+
+// PhysicalAddressBits returns the number of bits available for physical addresses.
+//
+// FIXME(b/69382326): This should use the cpuid passed to Init.
+func PhysicalAddressBits() uint32 {
+ ax, _, _, _ := cpuid.HostID(0x80000008, 0)
+ return ax & 0xff
+}
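+
+// As a worked example: on a CPU reporting 48 virtual address bits,
+// UserspaceSize is 1 << 47, and MaximumUserAddress is (1<<47 - 1) rounded
+// down to a page boundary, i.e. 0x7ffffffff000.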
+
+// Selector is a segment Selector.
+type Selector uint16
+
+// SegmentDescriptor is a segment descriptor.
+type SegmentDescriptor struct {
+ bits [2]uint32
+}
+
+// descriptorTable is a collection of descriptors.
+type descriptorTable [32]SegmentDescriptor
+
+// SegmentDescriptorFlags are typed flags within a descriptor.
+type SegmentDescriptorFlags uint32
+
+// SegmentDescriptorFlag declarations.
+const (
+ SegmentDescriptorAccess SegmentDescriptorFlags = 1 << 8 // Access bit (always set).
+ SegmentDescriptorWrite = 1 << 9 // Write permission.
+ SegmentDescriptorExpandDown = 1 << 10 // Grows down, not used.
+ SegmentDescriptorExecute = 1 << 11 // Execute permission.
+ SegmentDescriptorSystem = 1 << 12 // 0 => system, 1 => user code/data.
+ SegmentDescriptorPresent = 1 << 15 // Present.
+ SegmentDescriptorAVL = 1 << 20 // Available.
+ SegmentDescriptorLong = 1 << 21 // Long mode.
+ SegmentDescriptorDB = 1 << 22 // 16 or 32-bit.
+ SegmentDescriptorG = 1 << 23 // Granularity: page or byte.
+)
+
+// Base returns the descriptor's base linear address.
+func (d *SegmentDescriptor) Base() uint32 {
+ return d.bits[1]&0xFF000000 | (d.bits[1]&0x000000FF)<<16 | d.bits[0]>>16
+}
+
+// Limit returns the descriptor size.
+func (d *SegmentDescriptor) Limit() uint32 {
+ l := d.bits[0]&0xFFFF | d.bits[1]&0xF0000
+ if d.bits[1]&uint32(SegmentDescriptorG) != 0 {
+ l <<= 12
+ l |= 0xFFF
+ }
+ return l
+}
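+
+// For example, a stored limit of 0xFFFFF with SegmentDescriptorG set decodes
+// to (0xFFFFF << 12) | 0xFFF = 0xFFFFFFFF, i.e. a 4GiB segment.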
+
+// Flags returns descriptor flags.
+func (d *SegmentDescriptor) Flags() SegmentDescriptorFlags {
+ return SegmentDescriptorFlags(d.bits[1] & 0x00F09F00)
+}
+
+// DPL returns the descriptor privilege level.
+func (d *SegmentDescriptor) DPL() int {
+ return int((d.bits[1] >> 13) & 3)
+}
+
+func (d *SegmentDescriptor) setNull() {
+ d.bits[0] = 0
+ d.bits[1] = 0
+}
+
+func (d *SegmentDescriptor) set(base, limit uint32, dpl int, flags SegmentDescriptorFlags) {
+ flags |= SegmentDescriptorPresent
+ if limit>>12 != 0 {
+ limit >>= 12
+ flags |= SegmentDescriptorG
+ }
+ d.bits[0] = base<<16 | limit&0xFFFF
+ d.bits[1] = base&0xFF000000 | (base>>16)&0xFF | limit&0x000F0000 | uint32(flags) | uint32(dpl)<<13
+}
+
+func (d *SegmentDescriptor) setCode32(base, limit uint32, dpl int) {
+ d.set(base, limit, dpl,
+ SegmentDescriptorDB|
+ SegmentDescriptorExecute|
+ SegmentDescriptorSystem)
+}
+
+func (d *SegmentDescriptor) setCode64(base, limit uint32, dpl int) {
+ d.set(base, limit, dpl,
+ SegmentDescriptorG|
+ SegmentDescriptorLong|
+ SegmentDescriptorExecute|
+ SegmentDescriptorSystem)
+}
+
+func (d *SegmentDescriptor) setData(base, limit uint32, dpl int) {
+ d.set(base, limit, dpl,
+ SegmentDescriptorWrite|
+ SegmentDescriptorSystem)
+}
+
+// setHi is only used for the TSS segment, which is magically 64-bits.
+func (d *SegmentDescriptor) setHi(base uint32) {
+ d.bits[0] = base
+ d.bits[1] = 0
+}
+
+// Gate64 is a 64-bit task, trap, or interrupt gate.
+type Gate64 struct {
+ bits [4]uint32
+}
+
+// idt64 is a 64-bit interrupt descriptor table.
+type idt64 [_NR_INTERRUPTS]Gate64
+
+func (g *Gate64) setInterrupt(cs Selector, rip uint64, dpl int, ist int) {
+ g.bits[0] = uint32(cs)<<16 | uint32(rip)&0xFFFF
+ g.bits[1] = uint32(rip)&0xFFFF0000 | SegmentDescriptorPresent | uint32(dpl)<<13 | 14<<8 | uint32(ist)&0x7
+ g.bits[2] = uint32(rip >> 32)
+}
+
+func (g *Gate64) setTrap(cs Selector, rip uint64, dpl int, ist int) {
+ g.setInterrupt(cs, rip, dpl, ist)
+ g.bits[1] |= 1 << 8
+}
+
+// TaskState64 is a 64-bit task state structure.
+type TaskState64 struct {
+ _ uint32
+ rsp0Lo, rsp0Hi uint32
+ rsp1Lo, rsp1Hi uint32
+ rsp2Lo, rsp2Hi uint32
+ _ [2]uint32
+ ist1Lo, ist1Hi uint32
+ ist2Lo, ist2Hi uint32
+ ist3Lo, ist3Hi uint32
+ ist4Lo, ist4Hi uint32
+ ist5Lo, ist5Hi uint32
+ ist6Lo, ist6Hi uint32
+ ist7Lo, ist7Hi uint32
+ _ [2]uint32
+ _ uint16
+ ioPerm uint16
+}
diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go
new file mode 100644
index 000000000..a5ce67885
--- /dev/null
+++ b/pkg/sentry/platform/ring0/entry_amd64.go
@@ -0,0 +1,128 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package ring0
+
+import (
+ "syscall"
+)
+
+// This is an assembly function.
+//
+// The sysenter function is invoked in two situations:
+//
+// (1) The guest kernel has executed a system call.
+// (2) The guest application has executed a system call.
+//
+// The interrupt flag is examined to determine whether the system call was
+// executed from kernel mode or not, and the appropriate stub is called.
+func sysenter()
+
+// swapgs swaps the current GS value.
+//
+// This must be called prior to sysret/iret.
+func swapgs()
+
+// sysret returns to userspace from a system call.
+//
+// The return code is the vector that interrupted execution.
+//
+// See stubs.go for a note regarding the frame size of this function.
+func sysret(*CPU, *syscall.PtraceRegs) Vector
+
+// "iret is the cadillac of CPL switching."
+//
+// -- Neel Natu
+//
+// iret is nearly identical to sysret, except an iret is used to fully restore
+// all user state. This must be called in cases where all registers need to be
+// restored.
+func iret(*CPU, *syscall.PtraceRegs) Vector
+
+// exception is the generic exception entry.
+//
+// This is called by the individual stub definitions.
+func exception()
+
+// resume is a stub that restores the CPU kernel registers.
+//
+// This is used when processing kernel exceptions and syscalls.
+func resume()
+
+// Start is the CPU entrypoint.
+//
+// The following start conditions must be satisfied:
+//
+// * AX should contain the CPU pointer.
+// * c.GDT() should be loaded as the GDT.
+// * c.IDT() should be loaded as the IDT.
+// * c.CR0() should be the current CR0 value.
+// * c.CR3() should be set to the kernel PageTables.
+// * c.CR4() should be the current CR4 value.
+// * c.EFER() should be the current EFER value.
+//
+// The CPU state will be set to c.Registers().
+func Start()
+
+// Exception stubs.
+func divideByZero()
+func debug()
+func nmi()
+func breakpoint()
+func overflow()
+func boundRangeExceeded()
+func invalidOpcode()
+func deviceNotAvailable()
+func doubleFault()
+func coprocessorSegmentOverrun()
+func invalidTSS()
+func segmentNotPresent()
+func stackSegmentFault()
+func generalProtectionFault()
+func pageFault()
+func x87FloatingPointException()
+func alignmentCheck()
+func machineCheck()
+func simdFloatingPointException()
+func virtualizationException()
+func securityException()
+func syscallInt80()
+
+// Exception handler index.
+var handlers = map[Vector]func(){
+ DivideByZero: divideByZero,
+ Debug: debug,
+ NMI: nmi,
+ Breakpoint: breakpoint,
+ Overflow: overflow,
+ BoundRangeExceeded: boundRangeExceeded,
+ InvalidOpcode: invalidOpcode,
+ DeviceNotAvailable: deviceNotAvailable,
+ DoubleFault: doubleFault,
+ CoprocessorSegmentOverrun: coprocessorSegmentOverrun,
+ InvalidTSS: invalidTSS,
+ SegmentNotPresent: segmentNotPresent,
+ StackSegmentFault: stackSegmentFault,
+ GeneralProtectionFault: generalProtectionFault,
+ PageFault: pageFault,
+ X87FloatingPointException: x87FloatingPointException,
+ AlignmentCheck: alignmentCheck,
+ MachineCheck: machineCheck,
+ SIMDFloatingPointException: simdFloatingPointException,
+ VirtualizationException: virtualizationException,
+ SecurityException: securityException,
+ SyscallInt80: syscallInt80,
+}
diff --git a/pkg/sentry/platform/ring0/entry_impl_amd64.s b/pkg/sentry/platform/ring0/entry_impl_amd64.s
new file mode 100755
index 000000000..d082d06a9
--- /dev/null
+++ b/pkg/sentry/platform/ring0/entry_impl_amd64.s
@@ -0,0 +1,383 @@
+// +build amd64
+
+// Automatically generated, do not edit.
+
+// CPU offsets.
+#define CPU_SELF 0x00
+#define CPU_REGISTERS 0x288
+#define CPU_STACK_TOP 0x110
+#define CPU_ERROR_CODE 0x110
+#define CPU_ERROR_TYPE 0x118
+
+// Bits.
+#define _RFLAGS_IF 0x200
+#define _KERNEL_FLAGS 0x02
+
+// Vectors.
+#define DivideByZero 0x00
+#define Debug 0x01
+#define NMI 0x02
+#define Breakpoint 0x03
+#define Overflow 0x04
+#define BoundRangeExceeded 0x05
+#define InvalidOpcode 0x06
+#define DeviceNotAvailable 0x07
+#define DoubleFault 0x08
+#define CoprocessorSegmentOverrun 0x09
+#define InvalidTSS 0x0a
+#define SegmentNotPresent 0x0b
+#define StackSegmentFault 0x0c
+#define GeneralProtectionFault 0x0d
+#define PageFault 0x0e
+#define X87FloatingPointException 0x10
+#define AlignmentCheck 0x11
+#define MachineCheck 0x12
+#define SIMDFloatingPointException 0x13
+#define VirtualizationException 0x14
+#define SecurityException 0x1e
+#define SyscallInt80 0x80
+#define Syscall 0x81
+
+// Ptrace registers.
+#define PTRACE_R15 0x00
+#define PTRACE_R14 0x08
+#define PTRACE_R13 0x10
+#define PTRACE_R12 0x18
+#define PTRACE_RBP 0x20
+#define PTRACE_RBX 0x28
+#define PTRACE_R11 0x30
+#define PTRACE_R10 0x38
+#define PTRACE_R9 0x40
+#define PTRACE_R8 0x48
+#define PTRACE_RAX 0x50
+#define PTRACE_RCX 0x58
+#define PTRACE_RDX 0x60
+#define PTRACE_RSI 0x68
+#define PTRACE_RDI 0x70
+#define PTRACE_ORIGRAX 0x78
+#define PTRACE_RIP 0x80
+#define PTRACE_CS 0x88
+#define PTRACE_FLAGS 0x90
+#define PTRACE_RSP 0x98
+#define PTRACE_SS 0xa0
+#define PTRACE_FS 0xa8
+#define PTRACE_GS 0xb0
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+// NB: Offsets are programmatically generated (see BUILD).
+//
+// This file is concatenated with the definitions.
+
+// Saves a register set.
+//
+// This is a macro because it may need to be executed in contexts where a stack is
+// not available for calls.
+//
+// The following registers are not saved: AX, SP, IP, FLAGS, all segments.
+#define REGISTERS_SAVE(reg, offset) \
+ MOVQ R15, offset+PTRACE_R15(reg); \
+ MOVQ R14, offset+PTRACE_R14(reg); \
+ MOVQ R13, offset+PTRACE_R13(reg); \
+ MOVQ R12, offset+PTRACE_R12(reg); \
+ MOVQ BP, offset+PTRACE_RBP(reg); \
+ MOVQ BX, offset+PTRACE_RBX(reg); \
+ MOVQ CX, offset+PTRACE_RCX(reg); \
+ MOVQ DX, offset+PTRACE_RDX(reg); \
+ MOVQ R11, offset+PTRACE_R11(reg); \
+ MOVQ R10, offset+PTRACE_R10(reg); \
+ MOVQ R9, offset+PTRACE_R9(reg); \
+ MOVQ R8, offset+PTRACE_R8(reg); \
+ MOVQ SI, offset+PTRACE_RSI(reg); \
+ MOVQ DI, offset+PTRACE_RDI(reg);
+
+// Loads a register set.
+//
+// This is a macro because it may need to be executed in contexts where a stack is
+// not available for calls.
+//
+// The following registers are not loaded: AX, SP, IP, FLAGS, all segments.
+#define REGISTERS_LOAD(reg, offset) \
+ MOVQ offset+PTRACE_R15(reg), R15; \
+ MOVQ offset+PTRACE_R14(reg), R14; \
+ MOVQ offset+PTRACE_R13(reg), R13; \
+ MOVQ offset+PTRACE_R12(reg), R12; \
+ MOVQ offset+PTRACE_RBP(reg), BP; \
+ MOVQ offset+PTRACE_RBX(reg), BX; \
+ MOVQ offset+PTRACE_RCX(reg), CX; \
+ MOVQ offset+PTRACE_RDX(reg), DX; \
+ MOVQ offset+PTRACE_R11(reg), R11; \
+ MOVQ offset+PTRACE_R10(reg), R10; \
+ MOVQ offset+PTRACE_R9(reg), R9; \
+ MOVQ offset+PTRACE_R8(reg), R8; \
+ MOVQ offset+PTRACE_RSI(reg), SI; \
+ MOVQ offset+PTRACE_RDI(reg), DI;
+
+// SWAP_GS swaps the kernel GS (CPU).
+#define SWAP_GS() \
+ BYTE $0x0F; BYTE $0x01; BYTE $0xf8;
+
+// IRET returns from an interrupt frame.
+#define IRET() \
+ BYTE $0x48; BYTE $0xcf;
+
+// SYSRET64 executes the sysret instruction.
+#define SYSRET64() \
+ BYTE $0x48; BYTE $0x0f; BYTE $0x07;
+
+// LOAD_KERNEL_ADDRESS loads a kernel address.
+#define LOAD_KERNEL_ADDRESS(from, to) \
+ MOVQ from, to; \
+ ORQ ·KernelStartAddress(SB), to;
+
+// LOAD_KERNEL_STACK loads the kernel stack.
+#define LOAD_KERNEL_STACK(from) \
+ LOAD_KERNEL_ADDRESS(CPU_SELF(from), SP); \
+ LEAQ CPU_STACK_TOP(SP), SP;
+
+// See kernel.go.
+TEXT ·Halt(SB),NOSPLIT,$0
+ HLT
+ RET
+
+// See entry_amd64.go.
+TEXT ·swapgs(SB),NOSPLIT,$0
+ SWAP_GS()
+ RET
+
+// See entry_amd64.go.
+TEXT ·sysret(SB),NOSPLIT,$0-24
+ // Save original state.
+ LOAD_KERNEL_ADDRESS(cpu+0(FP), BX)
+ LOAD_KERNEL_ADDRESS(regs+8(FP), AX)
+ MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
+ MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
+ MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
+
+ // Restore user register state.
+ REGISTERS_LOAD(AX, 0)
+ MOVQ PTRACE_RIP(AX), CX // Needed for SYSRET.
+ MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET.
+ MOVQ PTRACE_RSP(AX), SP // Restore the stack directly.
+ MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch).
+ SYSRET64()
+
+// See entry_amd64.go.
+TEXT ·iret(SB),NOSPLIT,$0-24
+ // Save original state.
+ LOAD_KERNEL_ADDRESS(cpu+0(FP), BX)
+ LOAD_KERNEL_ADDRESS(regs+8(FP), AX)
+ MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
+ MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
+ MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
+
+ // Build an IRET frame & restore state.
+ LOAD_KERNEL_STACK(BX)
+ MOVQ PTRACE_SS(AX), BX; PUSHQ BX
+ MOVQ PTRACE_RSP(AX), CX; PUSHQ CX
+ MOVQ PTRACE_FLAGS(AX), DX; PUSHQ DX
+ MOVQ PTRACE_CS(AX), DI; PUSHQ DI
+ MOVQ PTRACE_RIP(AX), SI; PUSHQ SI
+ REGISTERS_LOAD(AX, 0) // Restore most registers.
+ MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch).
+ IRET()
+
+// See entry_amd64.go.
+TEXT ·resume(SB),NOSPLIT,$0
+ // See iret, above.
+ MOVQ CPU_REGISTERS+PTRACE_SS(GS), BX; PUSHQ BX
+ MOVQ CPU_REGISTERS+PTRACE_RSP(GS), CX; PUSHQ CX
+ MOVQ CPU_REGISTERS+PTRACE_FLAGS(GS), DX; PUSHQ DX
+ MOVQ CPU_REGISTERS+PTRACE_CS(GS), DI; PUSHQ DI
+ MOVQ CPU_REGISTERS+PTRACE_RIP(GS), SI; PUSHQ SI
+ REGISTERS_LOAD(GS, CPU_REGISTERS)
+ MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX
+ IRET()
+
+// See entry_amd64.go.
+TEXT ·Start(SB),NOSPLIT,$0
+ LOAD_KERNEL_STACK(AX) // Set the stack.
+ PUSHQ $0x0 // Previous frame pointer.
+ MOVQ SP, BP // Set frame pointer.
+ PUSHQ AX // First argument (CPU).
+ CALL ·start(SB) // Call Go hook.
+ JMP ·resume(SB) // Restore to registers.
+
+// See entry_amd64.go.
+TEXT ·sysenter(SB),NOSPLIT,$0
+ // Interrupts are always disabled while we're executing in kernel mode
+ // and always enabled while executing in user mode. Therefore, we can
+ // reliably look at the flags in R11 to determine where this syscall
+ // was from.
+ TESTL $_RFLAGS_IF, R11
+ JZ kernel
+
+user:
+ SWAP_GS()
+ XCHGQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Swap stacks.
+ XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs).
+ REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX.
+ MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value.
+ MOVQ BX, PTRACE_RAX(AX) // Save everything else.
+ MOVQ BX, PTRACE_ORIGRAX(AX)
+ MOVQ CX, PTRACE_RIP(AX)
+ MOVQ R11, PTRACE_FLAGS(AX)
+ MOVQ CPU_REGISTERS+PTRACE_RSP(GS), BX; MOVQ BX, PTRACE_RSP(AX)
+ MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code.
+ MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user.
+
+ // Return to the kernel, where the frame is:
+ //
+ // vector (sp+24)
+ // regs (sp+16)
+ // cpu (sp+8)
+ // vcpu.Switch (sp+0)
+ //
+ MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer.
+ MOVQ $Syscall, 24(SP) // Output vector.
+ RET
+
+kernel:
+ // We can't restore the original stack, but we can access the registers
+ // in the CPU state directly. No need for temporary juggling.
+ MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS)
+ MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS)
+ REGISTERS_SAVE(GS, CPU_REGISTERS)
+ MOVQ CX, CPU_REGISTERS+PTRACE_RIP(GS)
+ MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(GS)
+ MOVQ SP, CPU_REGISTERS+PTRACE_RSP(GS)
+ MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code.
+ MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel.
+
+ // Call the syscall trampoline.
+ LOAD_KERNEL_STACK(GS)
+ MOVQ CPU_SELF(GS), AX // Load vCPU.
+ PUSHQ AX // First argument (vCPU).
+ CALL ·kernelSyscall(SB) // Call the trampoline.
+ POPQ AX // Pop vCPU.
+ JMP ·resume(SB)
+
+// exception is a generic exception handler.
+//
+// There are two cases handled:
+//
+// 1) An exception in kernel mode: this results in saving the state at the time
+// of the exception and calling the defined hook.
+//
+// 2) An exception in guest mode: the original kernel frame is restored, and
+// the vector & error codes are pushed as return values.
+//
+// See below for the stubs that call exception.
+TEXT ·exception(SB),NOSPLIT,$0
+ // Determine whether the exception occurred in kernel mode or user
+ // mode, based on the flags. We expect the following stack:
+ //
+ // SS (sp+48)
+ // SP (sp+40)
+ // FLAGS (sp+32)
+ // CS (sp+24)
+ // IP (sp+16)
+ // ERROR_CODE (sp+8)
+ // VECTOR (sp+0)
+ //
+ TESTL $_RFLAGS_IF, 32(SP)
+ JZ kernel
+
+user:
+ SWAP_GS()
+ ADDQ $-8, SP // Adjust for flags.
+ MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ).
+ XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for user regs.
+ REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX.
+ MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Restore original AX.
+ MOVQ BX, PTRACE_RAX(AX) // Save it.
+ MOVQ BX, PTRACE_ORIGRAX(AX)
+ MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX)
+ MOVQ 24(SP), CX; MOVQ CX, PTRACE_CS(AX)
+ MOVQ 32(SP), DX; MOVQ DX, PTRACE_FLAGS(AX)
+ MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX)
+ MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX)
+
+ // Copy out and return.
+ MOVQ 0(SP), BX // Load vector.
+ MOVQ 8(SP), CX // Load error code.
+ MOVQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Original stack (kernel version).
+ MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer.
+ MOVQ CX, CPU_ERROR_CODE(GS) // Set error code.
+ MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user.
+ MOVQ BX, 24(SP) // Output vector.
+ RET
+
+kernel:
+ // As per above, we can save directly.
+ MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS)
+ MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS)
+ REGISTERS_SAVE(GS, CPU_REGISTERS)
+ MOVQ 16(SP), AX; MOVQ AX, CPU_REGISTERS+PTRACE_RIP(GS)
+ MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(GS)
+ MOVQ 40(SP), CX; MOVQ CX, CPU_REGISTERS+PTRACE_RSP(GS)
+
+ // Set the error code and adjust the stack.
+ MOVQ 8(SP), AX // Load the error code.
+ MOVQ AX, CPU_ERROR_CODE(GS) // Copy out to the CPU.
+ MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel.
+ MOVQ 0(SP), BX // BX contains the vector.
+ ADDQ $48, SP // Drop the exception frame.
+
+ // Call the exception trampoline.
+ LOAD_KERNEL_STACK(GS)
+ MOVQ CPU_SELF(GS), AX // Load vCPU.
+ PUSHQ BX // Second argument (vector).
+ PUSHQ AX // First argument (vCPU).
+ CALL ·kernelException(SB) // Call the trampoline.
+ POPQ BX // Pop vector.
+ POPQ AX // Pop vCPU.
+ JMP ·resume(SB)
+
+#define EXCEPTION_WITH_ERROR(value, symbol) \
+TEXT symbol,NOSPLIT,$0; \
+ PUSHQ $value; \
+ JMP ·exception(SB);
+
+#define EXCEPTION_WITHOUT_ERROR(value, symbol) \
+TEXT symbol,NOSPLIT,$0; \
+ PUSHQ $0x0; \
+ PUSHQ $value; \
+ JMP ·exception(SB);
+
+EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB))
+EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB))
+EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB))
+EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB))
+EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB))
+EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB))
+EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB))
+EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB))
+EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB))
+EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB))
+EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB))
+EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB))
+EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB))
+EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB))
+EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB))
+EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB))
+EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB))
+EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB))
+EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB))
+EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB))
+EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB))
+EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB))
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go
new file mode 100644
index 000000000..900c0bba7
--- /dev/null
+++ b/pkg/sentry/platform/ring0/kernel.go
@@ -0,0 +1,66 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ring0
+
+// Init initializes a new kernel.
+//
+// N.B. that constraints on KernelOpts must be satisfied.
+//
+//go:nosplit
+func (k *Kernel) Init(opts KernelOpts) {
+ k.init(opts)
+}
+
+// Halt halts execution.
+func Halt()
+
+// defaultHooks implements hooks.
+type defaultHooks struct{}
+
+// KernelSyscall implements Hooks.KernelSyscall.
+//
+//go:nosplit
+func (defaultHooks) KernelSyscall() { Halt() }
+
+// KernelException implements Hooks.KernelException.
+//
+//go:nosplit
+func (defaultHooks) KernelException(Vector) { Halt() }
+
+// kernelSyscall is a trampoline.
+//
+//go:nosplit
+func kernelSyscall(c *CPU) { c.hooks.KernelSyscall() }
+
+// kernelException is a trampoline.
+//
+//go:nosplit
+func kernelException(c *CPU, vector Vector) { c.hooks.KernelException(vector) }
+
+// Init initializes a new CPU.
+//
+// Init allows embedding in other objects.
+func (c *CPU) Init(k *Kernel, hooks Hooks) {
+ c.self = c // Set self reference.
+ c.kernel = k // Set kernel reference.
+ c.init() // Perform architectural init.
+
+ // Install hooks, falling back to the defaults when nil.
+ if hooks != nil {
+ c.hooks = hooks
+ } else {
+ c.hooks = defaultHooks{}
+ }
+}
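+
+// A minimal sketch of bringing up a kernel and CPU (assuming page tables pt
+// were built elsewhere; passing nil hooks selects defaultHooks):
+//
+//	var k Kernel
+//	k.Init(KernelOpts{PageTables: pt})
+//	c := &CPU{}
+//	c.Init(&k, nil)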
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
new file mode 100644
index 000000000..3577b5127
--- /dev/null
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -0,0 +1,271 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package ring0
+
+import (
+ "encoding/binary"
+)
+
+// init initializes architecture-specific state.
+func (k *Kernel) init(opts KernelOpts) {
+ // Save the root page tables.
+ k.PageTables = opts.PageTables
+
+ // Setup the IDT, which is uniform.
+ for v, handler := range handlers {
+ // Allow Breakpoint and Overflow to be called from all
+ // privilege levels.
+ dpl := 0
+ if v == Breakpoint || v == Overflow {
+ dpl = 3
+ }
+ // Note that we set all traps to use the interrupt stack; this
+ // is configured below when setting up the TSS.
+ k.globalIDT[v].setInterrupt(Kcode, uint64(kernelFunc(handler)), dpl, 1 /* ist */)
+ }
+}
+
+// init initializes architecture-specific state.
+func (c *CPU) init() {
+ // Null segment.
+ c.gdt[0].setNull()
+
+ // Kernel & user segments.
+ c.gdt[segKcode] = KernelCodeSegment
+ c.gdt[segKdata] = KernelDataSegment
+ c.gdt[segUcode32] = UserCodeSegment32
+ c.gdt[segUdata] = UserDataSegment
+ c.gdt[segUcode64] = UserCodeSegment64
+
+ // The task segment; this spans two entries.
+ tssBase, tssLimit, _ := c.TSS()
+ c.gdt[segTss].set(
+ uint32(tssBase),
+ uint32(tssLimit),
+ 0, // Privilege level zero.
+ SegmentDescriptorPresent|
+ SegmentDescriptorAccess|
+ SegmentDescriptorWrite|
+ SegmentDescriptorExecute)
+ c.gdt[segTssHi].setHi(uint32((tssBase) >> 32))
+
+ // Set the kernel stack pointer in the TSS (virtual address).
+ stackAddr := c.StackTop()
+ c.tss.rsp0Lo = uint32(stackAddr)
+ c.tss.rsp0Hi = uint32(stackAddr >> 32)
+ c.tss.ist1Lo = uint32(stackAddr)
+ c.tss.ist1Hi = uint32(stackAddr >> 32)
+
+ // Permanently set the kernel segments.
+ c.registers.Cs = uint64(Kcode)
+ c.registers.Ds = uint64(Kdata)
+ c.registers.Es = uint64(Kdata)
+ c.registers.Ss = uint64(Kdata)
+ c.registers.Fs = uint64(Kdata)
+ c.registers.Gs = uint64(Kdata)
+
+ // Set mandatory flags.
+ c.registers.Eflags = KernelFlagsSet
+}
+
+// StackTop returns the kernel's stack address.
+//
+//go:nosplit
+func (c *CPU) StackTop() uint64 {
+ return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack))
+}
+
+// IDT returns the CPU's IDT base and limit.
+//
+//go:nosplit
+func (c *CPU) IDT() (uint64, uint16) {
+ return uint64(kernelAddr(&c.kernel.globalIDT[0])), uint16(binary.Size(&c.kernel.globalIDT) - 1)
+}
+
+// GDT returns the CPU's GDT base and limit.
+//
+//go:nosplit
+func (c *CPU) GDT() (uint64, uint16) {
+ return uint64(kernelAddr(&c.gdt[0])), uint16(8*segLast - 1)
+}
+
+// TSS returns the CPU's TSS base, limit and value.
+//
+//go:nosplit
+func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) {
+ return uint64(kernelAddr(&c.tss)), uint16(binary.Size(&c.tss) - 1), &c.gdt[segTss]
+}
+
+// CR0 returns the CPU's CR0 value.
+//
+//go:nosplit
+func (c *CPU) CR0() uint64 {
+ return _CR0_PE | _CR0_PG | _CR0_AM | _CR0_ET
+}
+
+// CR4 returns the CPU's CR4 value.
+//
+//go:nosplit
+func (c *CPU) CR4() uint64 {
+ cr4 := uint64(_CR4_PAE | _CR4_PSE | _CR4_OSFXSR | _CR4_OSXMMEXCPT)
+ if hasPCID {
+ cr4 |= _CR4_PCIDE
+ }
+ if hasXSAVE {
+ cr4 |= _CR4_OSXSAVE
+ }
+ if hasSMEP {
+ cr4 |= _CR4_SMEP
+ }
+ if hasFSGSBASE {
+ cr4 |= _CR4_FSGSBASE
+ }
+ return cr4
+}
+
+// EFER returns the CPU's EFER value.
+//
+//go:nosplit
+func (c *CPU) EFER() uint64 {
+ return _EFER_LME | _EFER_LMA | _EFER_SCE | _EFER_NX
+}
+
+// IsCanonical indicates whether addr is canonical per the amd64 spec.
+//
+//go:nosplit
+func IsCanonical(addr uint64) bool {
+ return addr <= 0x00007fffffffffff || addr > 0xffff800000000000
+}
+
+// SwitchToUser performs either a sysret or an iret.
+//
+// The return value is the vector that interrupted execution.
+//
+// This function will not split the stack. Callers will probably want to call
+// runtime.entersyscall (and pair with a call to runtime.exitsyscall) prior to
+// calling this function.
+//
+// Once that is done, this region is quite sensitive to things like system
+// calls: after calling entersyscall, any memory used must already have been
+// allocated and no function calls without go:nosplit are permitted. Any calls
+// made here are protected appropriately (e.g. IsCanonical and CR3).
+//
+// Also note that this function transitively depends on the compiler generating
+// code that uses IP-relative addressing inside of absolute addresses. That's
+// the case for amd64, but may not be the case for other architectures.
+//
+// Precondition: the Rip, Rsp, Fs and Gs registers must be canonical.
+//
+//go:nosplit
+func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
+ userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
+ kernelCR3 := c.kernel.PageTables.CR3(true, switchOpts.KernelPCID)
+
+ // Sanitize registers.
+ regs := switchOpts.Registers
+ regs.Eflags &= ^uint64(UserFlagsClear)
+ regs.Eflags |= UserFlagsSet
+ regs.Cs = uint64(Ucode64) // Required for iret.
+ regs.Ss = uint64(Udata) // Ditto.
+
+ // Perform the switch.
+ swapgs() // GS will be swapped on return.
+ WriteFS(uintptr(regs.Fs_base)) // Set application FS.
+ WriteGS(uintptr(regs.Gs_base)) // Set application GS.
+ LoadFloatingPoint(switchOpts.FloatingPointState) // Copy in floating point.
+ jumpToKernel() // Switch to upper half.
+ writeCR3(uintptr(userCR3)) // Change to user address space.
+ if switchOpts.FullRestore {
+ vector = iret(c, regs)
+ } else {
+ vector = sysret(c, regs)
+ }
+ writeCR3(uintptr(kernelCR3)) // Return to kernel address space.
+ jumpToUser() // Return to lower half.
+ SaveFloatingPoint(switchOpts.FloatingPointState) // Copy out floating point.
+ WriteFS(uintptr(c.registers.Fs_base)) // Restore kernel FS.
+ return
+}
+
+// start is the CPU entrypoint.
+//
+// This is called from the Start asm stub (see entry_amd64.go); on return the
+// registers in c.registers will be restored (not segments).
+//
+//go:nosplit
+func start(c *CPU) {
+ // Save per-cpu & FS segment.
+ WriteGS(kernelAddr(c))
+ WriteFS(uintptr(c.registers.Fs_base))
+
+ // Initialize floating point.
+ //
+ // Note that on skylake, the valid XCR0 mask reported seems to be 0xff.
+ // This breaks down as:
+ //
+ // bit0 - x87
+ // bit1 - SSE
+ // bit2 - AVX
+ // bit3-4 - MPX
+ // bit5-7 - AVX512
+ //
+ // For some reason, enabling MPX & AVX512 on platforms that report them
+ // seems to cause a general protection fault. (Maybe there are some
+ // virtualization issues and these aren't exported to the guest cpuid.)
+ // This needs further investigation, but we can limit the floating
+ // point operations to x87, SSE & AVX for now.
+ fninit()
+ xsetbv(0, validXCR0Mask&0x7)
+
+ // Set the syscall target.
+ wrmsr(_MSR_LSTAR, kernelFunc(sysenter))
+ wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF)
+
+ // NOTE: This depends on having the 64-bit segments immediately
+ // following the 32-bit user segments. This is simply the way the
+ // sysret instruction is designed to work (it assumes they follow).
+ wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48))
+ wrmsr(_MSR_CSTAR, kernelFunc(sysenter))
+}
+
+// SetCPUIDFaulting sets CPUID faulting per the boolean value.
+//
+// True is returned if faulting could be set.
+//
+//go:nosplit
+func SetCPUIDFaulting(on bool) bool {
+ // Per the SDM (Vol 3, Table 2-43), PLATFORM_INFO bit 31 denotes support
+ // for CPUID faulting, and we enable and disable via the MISC_FEATURES MSR.
+ if rdmsr(_MSR_PLATFORM_INFO)&_PLATFORM_INFO_CPUID_FAULT != 0 {
+ features := rdmsr(_MSR_MISC_FEATURES)
+ if on {
+ features |= _MISC_FEATURE_CPUID_TRAP
+ } else {
+ features &^= _MISC_FEATURE_CPUID_TRAP
+ }
+ wrmsr(_MSR_MISC_FEATURES, features)
+ return true // Setting successful.
+ }
+ return false
+}
+
+// ReadCR2 reads the current CR2 value.
+//
+//go:nosplit
+func ReadCR2() uintptr {
+ return readCR2()
+}
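
The comment on SwitchToUser above implies a particular calling convention. A
hedged sketch of that convention (field values are illustrative; real callers
live in the platform context code):

    opts := ring0.SwitchOpts{
        Registers:          regs,    // Sanitized as described above.
        FloatingPointState: fpState, // *byte, as used by Load/SaveFloatingPoint.
        PageTables:         userPT,
        FullRestore:        false,   // sysret fast path; iret when true.
    }
    vector := c.SwitchToUser(opts)
    switch vector {
    case ring0.Syscall:
        // regs now describes the system call to emulate.
    case ring0.PageFault:
        // The faulting address is available via ring0.ReadCR2().
    }
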
diff --git a/pkg/sentry/platform/ring0/kernel_unsafe.go b/pkg/sentry/platform/ring0/kernel_unsafe.go
new file mode 100644
index 000000000..16955ad91
--- /dev/null
+++ b/pkg/sentry/platform/ring0/kernel_unsafe.go
@@ -0,0 +1,41 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ring0
+
+import (
+ "unsafe"
+)
+
+// eface mirrors runtime.eface.
+type eface struct {
+ typ uintptr
+ data unsafe.Pointer
+}
+
+// kernelAddr returns the kernel virtual address for the given object.
+//
+//go:nosplit
+func kernelAddr(obj interface{}) uintptr {
+ e := (*eface)(unsafe.Pointer(&obj))
+ return KernelStartAddress | uintptr(e.data)
+}
+
+// kernelFunc returns the address of the given function.
+//
+//go:nosplit
+func kernelFunc(fn func()) uintptr {
+ fnptr := (**uintptr)(unsafe.Pointer(&fn))
+ return KernelStartAddress | **fnptr
+}
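
The eface mirror works because a Go interface value is a (type, data) pair;
taking the data pointer directly avoids both reflection and allocation in
nosplit paths. For illustration, an equivalent (but allocating, and therefore
unusable here) formulation would be:

    // Illustration only; not suitable for nosplit code.
    func kernelAddrReflect(obj interface{}) uintptr {
        return KernelStartAddress | reflect.ValueOf(obj).Pointer()
    }
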
diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go
new file mode 100644
index 000000000..9c5f26962
--- /dev/null
+++ b/pkg/sentry/platform/ring0/lib_amd64.go
@@ -0,0 +1,131 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package ring0
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+)
+
+// LoadFloatingPoint loads floating point state by the most efficient mechanism
+// available (set by Init).
+var LoadFloatingPoint func(*byte)
+
+// SaveFloatingPoint saves floating point state by the most efficient mechanism
+// available (set by Init).
+var SaveFloatingPoint func(*byte)
+
+// fxrstor uses fxrstor64 to load floating point state.
+func fxrstor(*byte)
+
+// xrstor uses xrstor to load floating point state.
+func xrstor(*byte)
+
+// fxsave uses fxsave64 to save floating point state.
+func fxsave(*byte)
+
+// xsave uses xsave to save floating point state.
+func xsave(*byte)
+
+// xsaveopt uses xsaveopt to save floating point state.
+func xsaveopt(*byte)
+
+// WriteFS sets the FS address (set by Init).
+var WriteFS func(addr uintptr)
+
+// wrfsbase writes to the FS base address.
+func wrfsbase(addr uintptr)
+
+// wrfsmsr writes to the FS_BASE MSR.
+func wrfsmsr(addr uintptr)
+
+// WriteGS sets the GS address (set by Init).
+var WriteGS func(addr uintptr)
+
+// wrgsbase writes to the GS base address.
+func wrgsbase(addr uintptr)
+
+// wrgsmsr writes to the GS_BASE MSR.
+func wrgsmsr(addr uintptr)
+
+// writeCR3 writes the CR3 value.
+func writeCR3(phys uintptr)
+
+// readCR3 reads the current CR3 value.
+func readCR3() uintptr
+
+// readCR2 reads the current CR2 value.
+func readCR2() uintptr
+
+// jumpToKernel jumps to the kernel version of the current RIP.
+func jumpToKernel()
+
+// jumpToUser jumps to the user version of the current RIP.
+func jumpToUser()
+
+// fninit initializes the floating point unit.
+func fninit()
+
+// xsetbv writes to an extended control register.
+func xsetbv(reg, value uintptr)
+
+// xgetbv reads an extended control register.
+func xgetbv(reg uintptr) uintptr
+
+// wrmsr writes to the given MSR.
+func wrmsr(reg, value uintptr)
+
+// rdmsr reads the given MSR.
+func rdmsr(reg uintptr) uintptr
+
+// Mostly-constants set by Init.
+var (
+ hasSMEP bool
+ hasPCID bool
+ hasXSAVEOPT bool
+ hasXSAVE bool
+ hasFSGSBASE bool
+ validXCR0Mask uintptr
+)
+
+// Init sets function pointers based on architectural features.
+//
+// This must be called prior to using ring0.
+func Init(featureSet *cpuid.FeatureSet) {
+ hasSMEP = featureSet.HasFeature(cpuid.X86FeatureSMEP)
+ hasPCID = featureSet.HasFeature(cpuid.X86FeaturePCID)
+ hasXSAVEOPT = featureSet.UseXsaveopt()
+ hasXSAVE = featureSet.UseXsave()
+ hasFSGSBASE = featureSet.HasFeature(cpuid.X86FeatureFSGSBase)
+ validXCR0Mask = uintptr(featureSet.ValidXCR0Mask())
+ if hasXSAVEOPT {
+ SaveFloatingPoint = xsaveopt
+ LoadFloatingPoint = xrstor
+ } else if hasXSAVE {
+ SaveFloatingPoint = xsave
+ LoadFloatingPoint = xrstor
+ } else {
+ SaveFloatingPoint = fxsave
+ LoadFloatingPoint = fxrstor
+ }
+ if hasFSGSBASE {
+ WriteFS = wrfsbase
+ WriteGS = wrgsbase
+ } else {
+ WriteFS = wrfsmsr
+ WriteGS = wrgsmsr
+ }
+}
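
Init is the required entry point before any of the wrappers above are used. A
sketch of the expected boot-time call (cpuid.HostFeatureSet reads the host's
CPUID):

    ring0.Init(cpuid.HostFeatureSet())
    // From here on, Save/LoadFloatingPoint resolve to xsaveopt/xsave/fxsave
    // variants and WriteFS/WriteGS to wr*base or wr*msr, per CPU features.
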
diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s
new file mode 100644
index 000000000..75d742750
--- /dev/null
+++ b/pkg/sentry/platform/ring0/lib_amd64.s
@@ -0,0 +1,247 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+// fxrstor loads floating point state.
+//
+// The code corresponds to:
+//
+// fxrstor64 (%rbx)
+//
+TEXT ·fxrstor(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), BX
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x0b;
+ RET
+
+// xrstor loads floating point state.
+//
+// The code corresponds to:
+//
+// xrstor (%rdi)
+//
+TEXT ·xrstor(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), DI
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f;
+ RET
+
+// fxsave saves floating point state.
+//
+// The code corresponds to:
+//
+// fxsave64 (%rbx)
+//
+TEXT ·fxsave(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), BX
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x03;
+ RET
+
+// xsave saves floating point state.
+//
+// The code corresponds to:
+//
+// xsave (%rdi)
+//
+TEXT ·xsave(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), DI
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27;
+ RET
+
+// xsaveopt saves floating point state.
+//
+// The code corresponds to:
+//
+// xsaveopt (%rdi)
+//
+TEXT ·xsaveopt(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), DI
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37;
+ RET
+
+// wrfsbase writes to the FS base.
+//
+// The code corresponds to:
+//
+// wrfsbase %rax
+//
+TEXT ·wrfsbase(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), AX
+ BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd0;
+ RET
+
+// wrfsmsr writes to the FSBASE MSR.
+//
+// The code corresponds to:
+//
+// wrmsr (writes EDX:EAX to the MSR in ECX)
+//
+TEXT ·wrfsmsr(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), AX
+ MOVQ AX, DX
+ SHRQ $32, DX
+ MOVQ $0xc0000100, CX // MSR_FS_BASE
+ BYTE $0x0f; BYTE $0x30;
+ RET
+
+// wrgsbase writes to the GS base.
+//
+// The code corresponds to:
+//
+// wrgsbase %rax
+//
+TEXT ·wrgsbase(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), AX
+ BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd8;
+ RET
+
+// wrgsmsr writes to the GSBASE MSR.
+//
+// See wrfsmsr.
+TEXT ·wrgsmsr(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), AX
+ MOVQ AX, DX
+ SHRQ $32, DX
+ MOVQ $0xc0000101, CX // MSR_GS_BASE
+ BYTE $0x0f; BYTE $0x30; // WRMSR
+ RET
+
+// jumpToUser changes execution to the user address space.
+//
+// This works by changing the return value to the user version.
+TEXT ·jumpToUser(SB),NOSPLIT,$0
+ MOVQ 0(SP), AX
+ MOVQ ·KernelStartAddress(SB), BX
+ NOTQ BX
+ ANDQ BX, SP // Switch the stack.
+ ANDQ BX, BP // Switch the frame pointer.
+ ANDQ BX, AX // Future return value.
+ MOVQ AX, 0(SP)
+ RET
+
+// jumpToKernel changes execution to the kernel address space.
+//
+// This works by changing the return value to the kernel version.
+TEXT ·jumpToKernel(SB),NOSPLIT,$0
+ MOVQ 0(SP), AX
+ MOVQ ·KernelStartAddress(SB), BX
+ ORQ BX, SP // Switch the stack.
+ ORQ BX, BP // Switch the frame pointer.
+ ORQ BX, AX // Future return value.
+ MOVQ AX, 0(SP)
+ RET
+
+// writeCR3 writes the given CR3 value.
+//
+// The code corresponds to:
+//
+// mov %rax, %cr3
+//
+TEXT ·writeCR3(SB),NOSPLIT,$0-8
+ MOVQ cr3+0(FP), AX
+ BYTE $0x0f; BYTE $0x22; BYTE $0xd8;
+ RET
+
+// readCR3 reads the current CR3 value.
+//
+// The code corresponds to:
+//
+// mov %cr3, %rax
+//
+TEXT ·readCR3(SB),NOSPLIT,$0-8
+ BYTE $0x0f; BYTE $0x20; BYTE $0xd8;
+ MOVQ AX, ret+0(FP)
+ RET
+
+// readCR2 reads the current CR2 value.
+//
+// The code corresponds to:
+//
+// mov %cr2, %rax
+//
+TEXT ·readCR2(SB),NOSPLIT,$0-8
+ BYTE $0x0f; BYTE $0x20; BYTE $0xd0;
+ MOVQ AX, ret+0(FP)
+ RET
+
+// fninit initializes the floating point unit.
+//
+// The code corresponds to:
+//
+// fninit
+TEXT ·fninit(SB),NOSPLIT,$0
+ BYTE $0xdb; BYTE $0xe3;
+ RET
+
+// xsetbv writes to an extended control register.
+//
+// The code corresponds to:
+//
+// xsetbv
+//
+TEXT ·xsetbv(SB),NOSPLIT,$0-16
+ MOVL reg+0(FP), CX
+ MOVL value+8(FP), AX
+ MOVL value+12(FP), DX
+ BYTE $0x0f; BYTE $0x01; BYTE $0xd1;
+ RET
+
+// xgetbv reads an extended control register.
+//
+// The code corresponds to:
+//
+// xgetbv
+//
+TEXT ·xgetbv(SB),NOSPLIT,$0-16
+ MOVL reg+0(FP), CX
+ BYTE $0x0f; BYTE $0x01; BYTE $0xd0;
+ MOVL AX, ret+8(FP)
+ MOVL DX, ret+12(FP)
+ RET
+
+// wrmsr writes to a model-specific register.
+//
+// The code corresponds to:
+//
+// wrmsr
+//
+TEXT ·wrmsr(SB),NOSPLIT,$0-16
+ MOVL reg+0(FP), CX
+ MOVL value+8(FP), AX
+ MOVL value+12(FP), DX
+ BYTE $0x0f; BYTE $0x30;
+ RET
+
+// rdmsr reads a model-specific register.
+//
+// The code corresponds to:
+//
+// rdmsr
+//
+TEXT ·rdmsr(SB),NOSPLIT,$0-16
+ MOVL reg+0(FP), CX
+ BYTE $0x0f; BYTE $0x32;
+ MOVL AX, ret+8(FP)
+ MOVL DX, ret+12(FP)
+ RET
diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go
new file mode 100644
index 000000000..23fd5c352
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/allocator.go
@@ -0,0 +1,122 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+// Allocator is used to allocate and map PTEs.
+//
+// Note that allocators may be called concurrently.
+type Allocator interface {
+ // NewPTEs returns a new set of PTEs and their physical address.
+ NewPTEs() *PTEs
+
+ // PhysicalFor gives the physical address for a set of PTEs.
+ PhysicalFor(ptes *PTEs) uintptr
+
+ // LookupPTEs looks up PTEs by physical address.
+ LookupPTEs(physical uintptr) *PTEs
+
+ // FreePTEs marks a set of PTEs as freed; they may not be available
+ // for use again until Recycle is called, below.
+ FreePTEs(ptes *PTEs)
+
+ // Recycle makes freed PTEs available for use again.
+ Recycle()
+}
+
+// RuntimeAllocator is a trivial allocator.
+type RuntimeAllocator struct {
+ // used is the set of PTEs that have been allocated. This includes any
+ // PTEs that may be in the pool below. PTEs are only freed from this
+ // map by the Drain call.
+ //
+ // This exists to prevent accidental garbage collection.
+ used map[*PTEs]struct{}
+
+ // pool is the set of free-to-use PTEs.
+ pool []*PTEs
+
+ // freed is the set of recently-freed PTEs.
+ freed []*PTEs
+}
+
+// NewRuntimeAllocator returns an allocator that uses runtime allocation.
+func NewRuntimeAllocator() *RuntimeAllocator {
+ return &RuntimeAllocator{
+ used: make(map[*PTEs]struct{}),
+ }
+}
+
+// Recycle returns freed pages to the pool.
+func (r *RuntimeAllocator) Recycle() {
+ r.pool = append(r.pool, r.freed...)
+ r.freed = r.freed[:0]
+}
+
+// Drain empties the pool.
+func (r *RuntimeAllocator) Drain() {
+ r.Recycle()
+ for i, ptes := range r.pool {
+ // Zap the entry in the underlying array to ensure that it can
+ // be properly garbage collected.
+ r.pool[i] = nil
+ // Similarly, free the reference held by the used map (these
+ // also apply for the pool entries).
+ delete(r.used, ptes)
+ }
+ r.pool = r.pool[:0]
+}
+
+// NewPTEs implements Allocator.NewPTEs.
+//
+// Note that the "physical" address here is actually the virtual address of the
+// PTEs structure. The entries are tracked only to avoid garbage collection.
+//
+// This is guaranteed not to split as long as the pool is sufficiently full.
+//
+//go:nosplit
+func (r *RuntimeAllocator) NewPTEs() *PTEs {
+ // Pull from the pool if we can.
+ if len(r.pool) > 0 {
+ ptes := r.pool[len(r.pool)-1]
+ r.pool = r.pool[:len(r.pool)-1]
+ return ptes
+ }
+
+ // Allocate a new entry.
+ ptes := newAlignedPTEs()
+ r.used[ptes] = struct{}{}
+ return ptes
+}
+
+// PhysicalFor returns the physical address for the given PTEs.
+//
+//go:nosplit
+func (r *RuntimeAllocator) PhysicalFor(ptes *PTEs) uintptr {
+ return physicalFor(ptes)
+}
+
+// LookupPTEs implements Allocator.LookupPTEs.
+//
+//go:nosplit
+func (r *RuntimeAllocator) LookupPTEs(physical uintptr) *PTEs {
+ return fromPhysical(physical)
+}
+
+// FreePTEs implements Allocator.FreePTEs.
+//
+//go:nosplit
+func (r *RuntimeAllocator) FreePTEs(ptes *PTEs) {
+ r.freed = append(r.freed, ptes)
+}
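
The free/recycle split above exists because FreePTEs may run in nosplit
contexts, while reuse is only safe at a quiescent point. A short usage sketch:

    a := pagetables.NewRuntimeAllocator()
    ptes := a.NewPTEs()         // From the pool, or freshly allocated.
    phys := a.PhysicalFor(ptes) // "Physical" is really a virtual address here.
    _ = a.LookupPTEs(phys)      // Inverse of PhysicalFor.
    a.FreePTEs(ptes)            // Marked freed; not yet reusable.
    a.Recycle()                 // Returns freed entries to the pool.
    a.Drain()                   // Drops everything for garbage collection.
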
diff --git a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go
new file mode 100644
index 000000000..1b996b4e2
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go
@@ -0,0 +1,53 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// newAlignedPTEs returns a set of aligned PTEs.
+func newAlignedPTEs() *PTEs {
+ ptes := new(PTEs)
+ offset := physicalFor(ptes) & (usermem.PageSize - 1)
+ if offset == 0 {
+ // Already aligned.
+ return ptes
+ }
+
+ // Need to force an aligned allocation.
+ unaligned := make([]byte, (2*usermem.PageSize)-1)
+ offset = uintptr(unsafe.Pointer(&unaligned[0])) & (usermem.PageSize - 1)
+ if offset != 0 {
+ offset = usermem.PageSize - offset
+ }
+ return (*PTEs)(unsafe.Pointer(&unaligned[offset]))
+}
+
+// physicalFor returns the "physical" address for PTEs.
+//
+//go:nosplit
+func physicalFor(ptes *PTEs) uintptr {
+ return uintptr(unsafe.Pointer(ptes))
+}
+
+// fromPhysical returns the PTEs from the "physical" address.
+//
+//go:nosplit
+func fromPhysical(physical uintptr) *PTEs {
+ return (*PTEs)(unsafe.Pointer(physical))
+}
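
The over-allocation in newAlignedPTEs is the standard trick for forcing
alignment without allocator support: a buffer of 2*PageSize-1 bytes always
contains a page-aligned run of PageSize bytes. Worked through with
PageSize = 0x1000 and a base address of 0x1234 (illustrative numbers):

    // off    = 0x1234 & 0xfff = 0x234
    // offset = 0x1000 - 0x234 = 0xdcc
    // start  = 0x1234 + 0xdcc = 0x2000 (page-aligned)
    // end    = 0x2000 + 0xfff = 0x2fff <= 0x1234 + 0x1ffe (last byte)
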
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
new file mode 100644
index 000000000..e5dcaada7
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -0,0 +1,221 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pagetables provides a generic implementation of pagetables.
+//
+// The core functions must be safe to call from a nosplit context. Furthermore,
+// this pagetables implementation goes to lengths to ensure that all functions
+// are free from runtime allocation. Calls to NewPTEs/FreePTEs may be made
+// during walks, but these can be cached elsewhere if required.
+package pagetables
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// PageTables is a set of page tables.
+type PageTables struct {
+ // Allocator is used to allocate nodes.
+ Allocator Allocator
+
+ // root is the pagetable root.
+ root *PTEs
+
+ // rootPhysical is the cached physical address of the root.
+ //
+ // This is saved only to prevent constant translation.
+ rootPhysical uintptr
+
+ // archPageTables includes architecture-specific features.
+ archPageTables
+}
+
+// New returns new PageTables.
+func New(a Allocator) *PageTables {
+ p := new(PageTables)
+ p.Init(a)
+ return p
+}
+
+// Init initializes a set of PageTables.
+//
+//go:nosplit
+func (p *PageTables) Init(allocator Allocator) {
+ p.Allocator = allocator
+ p.root = p.Allocator.NewPTEs()
+ p.rootPhysical = p.Allocator.PhysicalFor(p.root)
+}
+
+// mapVisitor is used for map.
+type mapVisitor struct {
+ target uintptr // Input.
+ physical uintptr // Input.
+ opts MapOpts // Input.
+ prev bool // Output.
+}
+
+// visit is used for map.
+//
+//go:nosplit
+func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+ p := v.physical + (start - uintptr(v.target))
+ if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) {
+ v.prev = true
+ }
+ if p&align != 0 {
+ // We will install entries at a smaller granularity if we don't
+ // install a valid entry here; however, we must zap any existing
+ // entry to ensure this happens.
+ pte.Clear()
+ return
+ }
+ pte.Set(p, v.opts)
+}
+
+//go:nosplit
+func (*mapVisitor) requiresAlloc() bool { return true }
+
+//go:nosplit
+func (*mapVisitor) requiresSplit() bool { return true }
+
+// Map installs a mapping with the given physical address.
+//
+// True is returned iff there was a previous mapping in the range.
+//
+// Precondition: addr & length must be page-aligned, and their sum must not overflow.
+//
+//go:nosplit
+func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool {
+ if !opts.AccessType.Any() {
+ return p.Unmap(addr, length)
+ }
+ w := mapWalker{
+ pageTables: p,
+ visitor: mapVisitor{
+ target: uintptr(addr),
+ physical: physical,
+ opts: opts,
+ },
+ }
+ w.iterateRange(uintptr(addr), uintptr(addr)+length)
+ return w.visitor.prev
+}
+
+// unmapVisitor is used for unmap.
+type unmapVisitor struct {
+ count int
+}
+
+//go:nosplit
+func (*unmapVisitor) requiresAlloc() bool { return false }
+
+//go:nosplit
+func (*unmapVisitor) requiresSplit() bool { return true }
+
+// visit unmaps the given entry.
+//
+//go:nosplit
+func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+ pte.Clear()
+ v.count++
+}
+
+// Unmap unmaps the given range.
+//
+// True is returned iff there was a previous mapping in the range.
+//
+// Precondition: addr & length must be page-aligned.
+//
+//go:nosplit
+func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
+ w := unmapWalker{
+ pageTables: p,
+ visitor: unmapVisitor{
+ count: 0,
+ },
+ }
+ w.iterateRange(uintptr(addr), uintptr(addr)+length)
+ return w.visitor.count > 0
+}
+
+// emptyVisitor is used for emptiness checks.
+type emptyVisitor struct {
+ count int
+}
+
+//go:nosplit
+func (*emptyVisitor) requiresAlloc() bool { return false }
+
+//go:nosplit
+func (*emptyVisitor) requiresSplit() bool { return false }
+
+// visit unmaps the given entry.
+//
+//go:nosplit
+func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+ v.count++
+}
+
+// IsEmpty checks if the given range is empty.
+//
+// Precondition: addr & length must be page-aligned.
+//
+//go:nosplit
+func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool {
+ w := emptyWalker{
+ pageTables: p,
+ }
+ w.iterateRange(uintptr(addr), uintptr(addr)+length)
+ return w.visitor.count == 0
+}
+
+// lookupVisitor is used for lookup.
+type lookupVisitor struct {
+ target uintptr // Input.
+ physical uintptr // Output.
+ opts MapOpts // Output.
+}
+
+// visit matches the given address.
+//
+//go:nosplit
+func (v *lookupVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+ if !pte.Valid() {
+ return
+ }
+ v.physical = pte.Address() + (start - uintptr(v.target))
+ v.opts = pte.Opts()
+}
+
+//go:nosplit
+func (*lookupVisitor) requiresAlloc() bool { return false }
+
+//go:nosplit
+func (*lookupVisitor) requiresSplit() bool { return false }
+
+// Lookup returns the physical address for the given virtual address.
+//
+//go:nosplit
+func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) {
+ mask := uintptr(usermem.PageSize - 1)
+ offset := uintptr(addr) & mask
+ w := lookupWalker{
+ pageTables: p,
+ visitor: lookupVisitor{
+ target: uintptr(addr &^ usermem.Addr(mask)),
+ },
+ }
+ w.iterateRange(uintptr(addr), uintptr(addr)+1)
+ return w.visitor.physical + offset, w.visitor.opts
+}
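
Putting the public API together, a minimal round-trip (a sketch; addresses are
arbitrary, and the reported AccessType reflects the PTE bits, so Execute is
derived from the NX bit):

    pt := pagetables.New(pagetables.NewRuntimeAllocator())

    opts := pagetables.MapOpts{AccessType: usermem.ReadWrite, User: true}
    pt.Map(usermem.Addr(0x400000), usermem.PageSize, opts, 0x123000)

    phys, got := pt.Lookup(usermem.Addr(0x400000))
    // phys == 0x123000; got.AccessType.Read && got.AccessType.Write.

    pt.Unmap(usermem.Addr(0x400000), usermem.PageSize)
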
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
new file mode 100644
index 000000000..7aa6c524e
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
@@ -0,0 +1,45 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+// Address constraints.
+//
+// The lowerTop and upperBottom currently apply to four-level pagetables;
+// additional refactoring would be necessary to support five-level pagetables.
+const (
+ lowerTop = 0x00007fffffffffff
+ upperBottom = 0xffff800000000000
+
+ pteShift = 12
+ pmdShift = 21
+ pudShift = 30
+ pgdShift = 39
+
+ pteMask = 0x1ff << pteShift
+ pmdMask = 0x1ff << pmdShift
+ pudMask = 0x1ff << pudShift
+ pgdMask = 0x1ff << pgdShift
+
+ pteSize = 1 << pteShift
+ pmdSize = 1 << pmdShift
+ pudSize = 1 << pudShift
+ pgdSize = 1 << pgdShift
+
+ executeDisable = 1 << 63
+ entriesPerPage = 512
+)
+
+// PTEs is a collection of entries.
+type PTEs [entriesPerPage]PTE
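
These shifts and masks decompose a 48-bit virtual address into four 9-bit
table indices. For example, for addr = 0x00007f1234567000 (illustrative):

    // pgd index: (addr & pgdMask) >> pgdShift = 0x0fe (bits 47-39)
    // pud index: (addr & pudMask) >> pudShift = 0x048 (bits 38-30)
    // pmd index: (addr & pmdMask) >> pmdShift = 0x1a2 (bits 29-21)
    // pte index: (addr & pteMask) >> pteShift = 0x167 (bits 20-12)
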
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go b/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go
new file mode 100755
index 000000000..ac1ccf3d3
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package pagetables
+
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
new file mode 100644
index 000000000..ff427fbe9
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
@@ -0,0 +1,180 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+import (
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// archPageTables is architecture-specific data.
+type archPageTables struct {
+ // pcid is the value assigned by PCIDs.Assign.
+ //
+ // Note that zero is a valid PCID.
+ pcid uint16
+}
+
+// CR3 returns the CR3 value for these tables.
+//
+// This may be called in interrupt contexts. A PCID of zero always implies a
+// flush and should be passed when PCIDs are not enabled. See pcids_x86.go for
+// more information.
+//
+//go:nosplit
+func (p *PageTables) CR3(noFlush bool, pcid uint16) uint64 {
+ // Bit 63 is set to avoid flushing the PCID (per SDM 4.10.4.1).
+ const noFlushBit uint64 = 0x8000000000000000
+ if noFlush && pcid != 0 {
+ return noFlushBit | uint64(p.rootPhysical) | uint64(pcid)
+ }
+ return uint64(p.rootPhysical) | uint64(pcid)
+}
+
+// Bits in page table entries.
+const (
+ present = 0x001
+ writable = 0x002
+ user = 0x004
+ writeThrough = 0x008
+ cacheDisable = 0x010
+ accessed = 0x020
+ dirty = 0x040
+ super = 0x080
+ global = 0x100
+ optionMask = executeDisable | 0xfff
+)
+
+// MapOpts are x86 options.
+type MapOpts struct {
+ // AccessType defines permissions.
+ AccessType usermem.AccessType
+
+ // Global indicates the page is globally accessible.
+ Global bool
+
+ // User indicates the page is a user page.
+ User bool
+}
+
+// PTE is a page table entry.
+type PTE uintptr
+
+// Clear clears this PTE, including super page information.
+//
+//go:nosplit
+func (p *PTE) Clear() {
+ atomic.StoreUintptr((*uintptr)(p), 0)
+}
+
+// Valid returns true iff this entry is valid.
+//
+//go:nosplit
+func (p *PTE) Valid() bool {
+ return atomic.LoadUintptr((*uintptr)(p))&present != 0
+}
+
+// Opts returns the PTE options.
+//
+// These are all options except Valid and Super.
+//
+//go:nosplit
+func (p *PTE) Opts() MapOpts {
+ v := atomic.LoadUintptr((*uintptr)(p))
+ return MapOpts{
+ AccessType: usermem.AccessType{
+ Read: v&present != 0,
+ Write: v&writable != 0,
+ Execute: v&executeDisable == 0,
+ },
+ Global: v&global != 0,
+ User: v&user != 0,
+ }
+}
+
+// SetSuper sets this page as a super page.
+//
+// The page must not be valid or a panic will result.
+//
+//go:nosplit
+func (p *PTE) SetSuper() {
+ if p.Valid() {
+ // This is not allowed.
+ panic("SetSuper called on valid page!")
+ }
+ atomic.StoreUintptr((*uintptr)(p), super)
+}
+
+// IsSuper returns true iff this page is a super page.
+//
+//go:nosplit
+func (p *PTE) IsSuper() bool {
+ return atomic.LoadUintptr((*uintptr)(p))&super != 0
+}
+
+// Set sets this PTE value.
+//
+// This does not change the super page property.
+//
+//go:nosplit
+func (p *PTE) Set(addr uintptr, opts MapOpts) {
+ if !opts.AccessType.Any() {
+ p.Clear()
+ return
+ }
+ v := (addr &^ optionMask) | present | accessed
+ if opts.User {
+ v |= user
+ }
+ if opts.Global {
+ v |= global
+ }
+ if !opts.AccessType.Execute {
+ v |= executeDisable
+ }
+ if opts.AccessType.Write {
+ v |= writable | dirty
+ }
+ if p.IsSuper() {
+ // Note that this is inherited from the previous instance. Set
+ // does not change the value of Super. See above.
+ v |= super
+ }
+ atomic.StoreUintptr((*uintptr)(p), v)
+}
+
+// setPageTable sets this PTE value and forces the write bit and super bit to
+// be cleared. This is used explicitly for breaking super pages.
+//
+//go:nosplit
+func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) {
+ addr := pt.Allocator.PhysicalFor(ptes)
+ if addr&^optionMask != addr {
+ // This should never happen.
+ panic("unaligned physical address!")
+ }
+ v := addr | present | user | writable | accessed | dirty
+ atomic.StoreUintptr((*uintptr)(p), v)
+}
+
+// Address extracts the address. This should only be used if Valid returns true.
+//
+//go:nosplit
+func (p *PTE) Address() uintptr {
+ return atomic.LoadUintptr((*uintptr)(p)) &^ optionMask
+}
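
The PTE encoding round-trips through Set and Opts. A sketch for a
user-writable, non-executable page (illustrative address):

    var pte pagetables.PTE
    pte.Set(0x123000, pagetables.MapOpts{
        AccessType: usermem.ReadWrite,
        User:       true,
    })
    // Stored: 0x123000 | present | accessed | user | writable | dirty |
    // executeDisable.
    opts := pte.Opts()    // AccessType{Read: true, Write: true}, User: true.
    addr := pte.Address() // 0x123000
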
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
new file mode 100644
index 000000000..0f029f25d
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
@@ -0,0 +1,109 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+import (
+ "sync"
+)
+
+// limitPCID is the number of valid PCIDs.
+const limitPCID = 4096
+
+// PCIDs is a simple PCID database.
+//
+// Although an internal mutex serializes Assign and Drop, each PCIDs instance
+// is intended to serve a single CPU at a time.
+type PCIDs struct {
+ // mu protects below.
+ mu sync.Mutex
+
+ // cache are the assigned page tables.
+ cache map[*PageTables]uint16
+
+ // avail are available PCIDs.
+ avail []uint16
+}
+
+// NewPCIDs returns a new PCID database.
+//
+// start is the first index to assign. Typically this will be one, as the zero
+// pcid will always be flushed on transition (see pagetables_x86.go). This may
+// be more than one if specific PCIDs are reserved.
+//
+// Nil is returned iff start+size exceeds the range of valid PCIDs.
+func NewPCIDs(start, size uint16) *PCIDs {
+ if start+size >= limitPCID {
+ return nil // See comment.
+ }
+ p := &PCIDs{
+ cache: make(map[*PageTables]uint16),
+ }
+ for pcid := start; pcid < start+size; pcid++ {
+ p.avail = append(p.avail, pcid)
+ }
+ return p
+}
+
+// Assign assigns a PCID to the given PageTables.
+//
+// This may overwrite any previous assignment. If this is the case,
+// true is returned to indicate that the PCID should be flushed.
+func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) {
+ p.mu.Lock()
+ if pcid, ok := p.cache[pt]; ok {
+ p.mu.Unlock()
+ return pcid, false // No flush.
+ }
+
+ // Is there something available?
+ if len(p.avail) > 0 {
+ pcid := p.avail[len(p.avail)-1]
+ p.avail = p.avail[:len(p.avail)-1]
+ p.cache[pt] = pcid
+
+ // We need to flush because while this is in the available
+ // pool, it may have been used previously.
+ p.mu.Unlock()
+ return pcid, true
+ }
+
+ // Evict an existing table.
+ for old, pcid := range p.cache {
+ delete(p.cache, old)
+ p.cache[pt] = pcid
+
+ // A flush is definitely required in this case; these page
+ // tables may still be active. (They will just be assigned some
+ // other PCID if and when they hit the given CPU again.)
+ p.mu.Unlock()
+ return pcid, true
+ }
+
+ // No PCID.
+ p.mu.Unlock()
+ return 0, false
+}
+
+// Drop drops references to a set of page tables.
+func (p *PCIDs) Drop(pt *PageTables) {
+ p.mu.Lock()
+ if pcid, ok := p.cache[pt]; ok {
+ delete(p.cache, pt)
+ p.avail = append(p.avail, pcid)
+ }
+ p.mu.Unlock()
+}
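
The PCID database feeds directly into CR3 generation (see pagetables_x86.go).
A hedged sketch of the per-CPU flow:

    pcids := pagetables.NewPCIDs(1, 512) // PCID 0 is reserved: always flushed.
    pcid, flush := pcids.Assign(pt)
    cr3 := pt.CR3(!flush, pcid) // Skip the flush only for a clean assignment.
    // cr3 is then loaded by the platform (e.g. via the vCPU state).
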
diff --git a/pkg/sentry/platform/ring0/pagetables/walker_empty.go b/pkg/sentry/platform/ring0/pagetables/walker_empty.go
new file mode 100755
index 000000000..417784e17
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/walker_empty.go
@@ -0,0 +1,255 @@
+package pagetables
+
+// emptyWalker walks page tables.
+type emptyWalker struct {
+ // pageTables are the tables to walk.
+ pageTables *PageTables
+
+ // visitor is the visitor to call on each relevant entry.
+ visitor emptyVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+//
+// Precondition: start must be less than end.
+//
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *emptyWalker) iterateRange(start, end uintptr) {
+ if start%pteSize != 0 {
+ panic("unaligned start")
+ }
+ if end < start {
+ panic("start > end")
+ }
+ if start < lowerTop {
+ if end <= lowerTop {
+ w.iterateRangeCanonical(start, end)
+ } else if end > lowerTop && end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else if start < upperBottom {
+ if end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else {
+ w.iterateRangeCanonical(start, end)
+ }
+}
+
+// next returns the next address quantized by the given size.
+//
+//go:nosplit
+func emptynext(start uintptr, size uintptr) uintptr {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
+
+// iterateRangeCanonical walks a canonical range.
+//
+//go:nosplit
+func (w *emptyWalker) iterateRangeCanonical(start, end uintptr) {
+ for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ var (
+ pgdEntry = &w.pageTables.root[pgdIndex]
+ pudEntries *PTEs
+ )
+ if !pgdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ start = emptynext(start, pgdSize)
+ continue
+ }
+
+ pudEntries = w.pageTables.Allocator.NewPTEs()
+ pgdEntry.setPageTable(w.pageTables, pudEntries)
+ } else {
+ pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+ }
+
+ clearPUDEntries := uint16(0)
+
+ for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ var (
+ pudEntry = &pudEntries[pudIndex]
+ pmdEntries *PTEs
+ )
+ if !pudEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPUDEntries++
+ start = emptynext(start, pudSize)
+ continue
+ }
+
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSuper()
+ w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+ if pudEntry.Valid() {
+ start = emptynext(start, pudSize)
+ continue
+ }
+ }
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+ } else if pudEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < emptynext(start, pudSize)) {
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pmdEntries[index].SetSuper()
+ pmdEntries[index].Set(
+ pudEntry.Address()+(pmdSize*uintptr(index)),
+ pudEntry.Opts())
+ }
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+ } else {
+
+ w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ start = emptynext(start, pudSize)
+ continue
+ }
+ } else {
+ pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+ }
+
+ clearPMDEntries := uint16(0)
+
+ for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ var (
+ pmdEntry = &pmdEntries[pmdIndex]
+ pteEntries *PTEs
+ )
+ if !pmdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPMDEntries++
+ start = emptynext(start, pmdSize)
+ continue
+ }
+
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSuper()
+ w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+ if pmdEntry.Valid() {
+ start = emptynext(start, pmdSize)
+ continue
+ }
+ }
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+ } else if pmdEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < emptynext(start, pmdSize)) {
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pteEntries[index].Set(
+ pmdEntry.Address()+(pteSize*uintptr(index)),
+ pmdEntry.Opts())
+ }
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+ } else {
+
+ w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ start = emptynext(start, pmdSize)
+ continue
+ }
+ } else {
+ pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+ }
+
+ clearPTEEntries := uint16(0)
+
+ for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ var (
+ pteEntry = &pteEntries[pteIndex]
+ )
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ w.visitor.visit(uintptr(start), pteEntry, pteSize-1)
+ if !pteEntry.Valid() {
+ if w.visitor.requiresAlloc() {
+ panic("PTE not set after iteration with requiresAlloc!")
+ }
+ clearPTEEntries++
+ }
+
+ start += pteSize
+ continue
+ }
+
+ if clearPTEEntries == entriesPerPage {
+ pmdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pteEntries)
+ clearPMDEntries++
+ }
+ }
+
+ if clearPMDEntries == entriesPerPage {
+ pudEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pmdEntries)
+ clearPUDEntries++
+ }
+ }
+
+ if clearPUDEntries == entriesPerPage {
+ pgdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pudEntries)
+ }
+ }
+}
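
The canonical-range dispatch at the top of iterateRange is easiest to see with
a concrete split. For a non-allocating walk that spans the non-canonical hole
(illustration only):

    // iterateRange(0x00007ffffffff000, 0xffff800000001000) dispatches as:
    //   iterateRangeCanonical(0x00007ffffffff000, lowerTop)
    //   iterateRangeCanonical(upperBottom, 0xffff800000001000)
    // The hole between lowerTop and upperBottom is skipped; a visitor with
    // requiresAlloc() == true panics instead.
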
diff --git a/pkg/sentry/platform/ring0/pagetables/walker_lookup.go b/pkg/sentry/platform/ring0/pagetables/walker_lookup.go
new file mode 100755
index 000000000..906c9c50f
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/walker_lookup.go
@@ -0,0 +1,255 @@
+package pagetables
+
+// lookupWalker walks page tables.
+type lookupWalker struct {
+ // pageTables are the tables to walk.
+ pageTables *PageTables
+
+ // visitor is the visitor to call on each relevant entry.
+ visitor lookupVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+//
+// Precondition: start must be less than end.
+//
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *lookupWalker) iterateRange(start, end uintptr) {
+ if start%pteSize != 0 {
+ panic("unaligned start")
+ }
+ if end < start {
+ panic("start > end")
+ }
+ if start < lowerTop {
+ if end <= lowerTop {
+ w.iterateRangeCanonical(start, end)
+ } else if end > lowerTop && end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else if start < upperBottom {
+ if end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else {
+ w.iterateRangeCanonical(start, end)
+ }
+}
+
+// next returns the next address quantized by the given size.
+//
+//go:nosplit
+func lookupnext(start uintptr, size uintptr) uintptr {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
+
+// iterateRangeCanonical walks a canonical range.
+//
+//go:nosplit
+func (w *lookupWalker) iterateRangeCanonical(start, end uintptr) {
+ for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ var (
+ pgdEntry = &w.pageTables.root[pgdIndex]
+ pudEntries *PTEs
+ )
+ if !pgdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ start = lookupnext(start, pgdSize)
+ continue
+ }
+
+ pudEntries = w.pageTables.Allocator.NewPTEs()
+ pgdEntry.setPageTable(w.pageTables, pudEntries)
+ } else {
+ pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+ }
+
+ clearPUDEntries := uint16(0)
+
+ for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ var (
+ pudEntry = &pudEntries[pudIndex]
+ pmdEntries *PTEs
+ )
+ if !pudEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPUDEntries++
+ start = lookupnext(start, pudSize)
+ continue
+ }
+
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSuper()
+ w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+ if pudEntry.Valid() {
+ start = lookupnext(start, pudSize)
+ continue
+ }
+ }
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+ } else if pudEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < lookupnext(start, pudSize)) {
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pmdEntries[index].SetSuper()
+ pmdEntries[index].Set(
+ pudEntry.Address()+(pmdSize*uintptr(index)),
+ pudEntry.Opts())
+ }
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+ } else {
+
+ w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ start = lookupnext(start, pudSize)
+ continue
+ }
+ } else {
+ pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+ }
+
+ clearPMDEntries := uint16(0)
+
+ for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ var (
+ pmdEntry = &pmdEntries[pmdIndex]
+ pteEntries *PTEs
+ )
+ if !pmdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPMDEntries++
+ start = lookupnext(start, pmdSize)
+ continue
+ }
+
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSuper()
+ w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+ if pmdEntry.Valid() {
+ start = lookupnext(start, pmdSize)
+ continue
+ }
+ }
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+ } else if pmdEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < lookupnext(start, pmdSize)) {
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pteEntries[index].Set(
+ pmdEntry.Address()+(pteSize*uintptr(index)),
+ pmdEntry.Opts())
+ }
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+ } else {
+
+ w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ start = lookupnext(start, pmdSize)
+ continue
+ }
+ } else {
+ pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+ }
+
+ clearPTEEntries := uint16(0)
+
+ for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ var (
+ pteEntry = &pteEntries[pteIndex]
+ )
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ w.visitor.visit(uintptr(start), pteEntry, pteSize-1)
+ if !pteEntry.Valid() {
+ if w.visitor.requiresAlloc() {
+ panic("PTE not set after iteration with requiresAlloc!")
+ }
+ clearPTEEntries++
+ }
+
+ start += pteSize
+ continue
+ }
+
+ if clearPTEEntries == entriesPerPage {
+ pmdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pteEntries)
+ clearPMDEntries++
+ }
+ }
+
+ if clearPMDEntries == entriesPerPage {
+ pudEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pmdEntries)
+ clearPUDEntries++
+ }
+ }
+
+ if clearPUDEntries == entriesPerPage {
+ pgdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pudEntries)
+ }
+ }
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/walker_map.go b/pkg/sentry/platform/ring0/pagetables/walker_map.go
new file mode 100755
index 000000000..61ee3c825
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/walker_map.go
@@ -0,0 +1,255 @@
+package pagetables
+
+// mapWalker walks page tables.
+type mapWalker struct {
+ // pageTables are the tables to walk.
+ pageTables *PageTables
+
+ // visitor is the visitor to call on each relevant entry.
+ visitor mapVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+//
+// Precondition: start must be less than end.
+//
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *mapWalker) iterateRange(start, end uintptr) {
+ if start%pteSize != 0 {
+ panic("unaligned start")
+ }
+ if end < start {
+ panic("start > end")
+ }
+ if start < lowerTop {
+ if end <= lowerTop {
+ w.iterateRangeCanonical(start, end)
+ } else if end > lowerTop && end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else if start < upperBottom {
+ if end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else {
+ w.iterateRangeCanonical(start, end)
+ }
+}
+
+// next returns the next address quantized by the given size.
+//
+//go:nosplit
+func mapnext(start uintptr, size uintptr) uintptr {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
+
+// iterateRangeCanonical walks a canonical range.
+//
+//go:nosplit
+func (w *mapWalker) iterateRangeCanonical(start, end uintptr) {
+ for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ var (
+ pgdEntry = &w.pageTables.root[pgdIndex]
+ pudEntries *PTEs
+ )
+ if !pgdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ start = mapnext(start, pgdSize)
+ continue
+ }
+
+ pudEntries = w.pageTables.Allocator.NewPTEs()
+ pgdEntry.setPageTable(w.pageTables, pudEntries)
+ } else {
+ pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+ }
+
+ clearPUDEntries := uint16(0)
+
+ for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ var (
+ pudEntry = &pudEntries[pudIndex]
+ pmdEntries *PTEs
+ )
+ if !pudEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPUDEntries++
+ start = mapnext(start, pudSize)
+ continue
+ }
+
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSuper()
+ w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+ if pudEntry.Valid() {
+ start = mapnext(start, pudSize)
+ continue
+ }
+ }
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+ } else if pudEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < mapnext(start, pudSize)) {
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pmdEntries[index].SetSuper()
+ pmdEntries[index].Set(
+ pudEntry.Address()+(pmdSize*uintptr(index)),
+ pudEntry.Opts())
+ }
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+ } else {
+
+ w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ start = mapnext(start, pudSize)
+ continue
+ }
+ } else {
+ pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+ }
+
+ clearPMDEntries := uint16(0)
+
+ for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ var (
+ pmdEntry = &pmdEntries[pmdIndex]
+ pteEntries *PTEs
+ )
+ if !pmdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPMDEntries++
+ start = mapnext(start, pmdSize)
+ continue
+ }
+
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSuper()
+ w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+ if pmdEntry.Valid() {
+ start = mapnext(start, pmdSize)
+ continue
+ }
+ }
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+ } else if pmdEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < mapnext(start, pmdSize)) {
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pteEntries[index].Set(
+ pmdEntry.Address()+(pteSize*uintptr(index)),
+ pmdEntry.Opts())
+ }
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+ } else {
+
+ w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ start = mapnext(start, pmdSize)
+ continue
+ }
+ } else {
+ pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+ }
+
+ clearPTEEntries := uint16(0)
+
+ for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ var (
+ pteEntry = &pteEntries[pteIndex]
+ )
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ w.visitor.visit(uintptr(start), pteEntry, pteSize-1)
+ if !pteEntry.Valid() {
+ if w.visitor.requiresAlloc() {
+ panic("PTE not set after iteration with requiresAlloc!")
+ }
+ clearPTEEntries++
+ }
+
+ start += pteSize
+ continue
+ }
+
+ if clearPTEEntries == entriesPerPage {
+ pmdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pteEntries)
+ clearPMDEntries++
+ }
+ }
+
+ if clearPMDEntries == entriesPerPage {
+ pudEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pmdEntries)
+ clearPUDEntries++
+ }
+ }
+
+ if clearPUDEntries == entriesPerPage {
+ pgdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pudEntries)
+ }
+ }
+}
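+
+// The sketch below is illustrative only (it is not part of the generated
+// walker) and shows the visitor contract that iterateRange assumes: a
+// hypothetical read-only visitor that counts valid mappings, neither
+// allocating intermediate tables nor splitting super pages.
+//
+//	type countVisitor struct {
+//		count int
+//	}
+//
+//	func (*countVisitor) requiresAlloc() bool { return false }
+//	func (*countVisitor) requiresSplit() bool { return false }
+//
+//	// visit receives each entry with its alignment mask (size-1), so a
+//	// super page entry is reported once for the whole super page.
+//	func (v *countVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+//		if pte.Valid() {
+//			v.count++
+//		}
+//	}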
diff --git a/pkg/sentry/platform/ring0/pagetables/walker_unmap.go b/pkg/sentry/platform/ring0/pagetables/walker_unmap.go
new file mode 100755
index 000000000..be2aa0ce4
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/walker_unmap.go
@@ -0,0 +1,255 @@
+package pagetables
+
+// unmapWalker walks page tables.
+type unmapWalker struct {
+ // pageTables are the tables to walk.
+ pageTables *PageTables
+
+ // visitor is the set of arguments.
+ visitor unmapVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if requiresAlloc is false, then the iteration will likely be full of gaps.
+//
+// Note that Map, Unmap, etc. should generally be preferred over calling this
+// function directly.
+//
+// Precondition: start must be page-aligned.
+//
+// Precondition: start must be less than end.
+//
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *unmapWalker) iterateRange(start, end uintptr) {
+ if start%pteSize != 0 {
+ panic("unaligned start")
+ }
+ if end < start {
+ panic("start > end")
+ }
+ if start < lowerTop {
+ if end <= lowerTop {
+ w.iterateRangeCanonical(start, end)
+ } else if end > lowerTop && end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else if start < upperBottom {
+ if end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else {
+ w.iterateRangeCanonical(start, end)
+ }
+}
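+
+// To illustrate the split above (hypothetical x86-64 addresses): a walk
+// with requiresAlloc false over [lowerTop-pteSize, upperBottom+pteSize)
+// is performed as two canonical walks, [start, lowerTop) and
+// [upperBottom, end), silently skipping the non-canonical hole between
+// them.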
+
+// unmapnext returns the next address quantized by the given size.
+//
+//go:nosplit
+func unmapnext(start uintptr, size uintptr) uintptr {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
+
+// iterateRangeCanonical walks a canonical range.
+//
+//go:nosplit
+func (w *unmapWalker) iterateRangeCanonical(start, end uintptr) {
+ for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ var (
+ pgdEntry = &w.pageTables.root[pgdIndex]
+ pudEntries *PTEs
+ )
+ if !pgdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ start = unmapnext(start, pgdSize)
+ continue
+ }
+
+ pudEntries = w.pageTables.Allocator.NewPTEs()
+ pgdEntry.setPageTable(w.pageTables, pudEntries)
+ } else {
+ pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+ }
+
+ clearPUDEntries := uint16(0)
+
+ for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ var (
+ pudEntry = &pudEntries[pudIndex]
+ pmdEntries *PTEs
+ )
+ if !pudEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPUDEntries++
+ start = unmapnext(start, pudSize)
+ continue
+ }
+
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSuper()
+ w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+ if pudEntry.Valid() {
+ start = unmapnext(start, pudSize)
+ continue
+ }
+ }
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+ } else if pudEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < unmapnext(start, pudSize)) {
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pmdEntries[index].SetSuper()
+ pmdEntries[index].Set(
+ pudEntry.Address()+(pmdSize*uintptr(index)),
+ pudEntry.Opts())
+ }
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+ } else {
+
+ w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ start = unmapnext(start, pudSize)
+ continue
+ }
+ } else {
+ pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+ }
+
+ clearPMDEntries := uint16(0)
+
+ for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ var (
+ pmdEntry = &pmdEntries[pmdIndex]
+ pteEntries *PTEs
+ )
+ if !pmdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPMDEntries++
+ start = unmapnext(start, pmdSize)
+ continue
+ }
+
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSuper()
+ w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+ if pmdEntry.Valid() {
+ start = unmapnext(start, pmdSize)
+ continue
+ }
+ }
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+ } else if pmdEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < unmapnext(start, pmdSize)) {
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pteEntries[index].Set(
+ pmdEntry.Address()+(pteSize*uintptr(index)),
+ pmdEntry.Opts())
+ }
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+ } else {
+
+ w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ start = unmapnext(start, pmdSize)
+ continue
+ }
+ } else {
+ pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+ }
+
+ clearPTEEntries := uint16(0)
+
+ for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ var (
+ pteEntry = &pteEntries[pteIndex]
+ )
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ w.visitor.visit(uintptr(start), pteEntry, pteSize-1)
+ if !pteEntry.Valid() {
+ if w.visitor.requiresAlloc() {
+ panic("PTE not set after iteration with requiresAlloc!")
+ }
+ clearPTEEntries++
+ }
+
+ start += pteSize
+ continue
+ }
+
+ if clearPTEEntries == entriesPerPage {
+ pmdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pteEntries)
+ clearPMDEntries++
+ }
+ }
+
+ if clearPMDEntries == entriesPerPage {
+ pudEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pmdEntries)
+ clearPUDEntries++
+ }
+ }
+
+ if clearPUDEntries == entriesPerPage {
+ pgdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pudEntries)
+ }
+ }
+}
diff --git a/pkg/sentry/platform/ring0/ring0.go b/pkg/sentry/platform/ring0/ring0.go
new file mode 100644
index 000000000..cdeb1b43a
--- /dev/null
+++ b/pkg/sentry/platform/ring0/ring0.go
@@ -0,0 +1,16 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ring0 provides basic operating system-level stubs.
+package ring0
diff --git a/pkg/sentry/platform/ring0/ring0_state_autogen.go b/pkg/sentry/platform/ring0/ring0_state_autogen.go
new file mode 100755
index 000000000..462f9a446
--- /dev/null
+++ b/pkg/sentry/platform/ring0/ring0_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package ring0
+
diff --git a/pkg/sentry/platform/safecopy/atomic_amd64.s b/pkg/sentry/platform/safecopy/atomic_amd64.s
new file mode 100644
index 000000000..a0cd78f33
--- /dev/null
+++ b/pkg/sentry/platform/safecopy/atomic_amd64.s
@@ -0,0 +1,136 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// handleSwapUint32Fault returns the value stored in DI. Control is transferred
+// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in DI.
+//
+// It must have the same frame configuration as swapUint32 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24
+ MOVL DI, sig+20(FP)
+ RET
+
+// swapUint32 atomically stores new into *addr and returns (the previous *addr
+// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
+// value of old is unspecified, and sig is the number of the signal that was
+// received.
+//
+// Preconditions: addr must be aligned to a 4-byte boundary.
+//
+//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32)
+TEXT ·swapUint32(SB), NOSPLIT, $0-24
+ // Store 0 as the returned signal number. If we run to completion,
+ // this is the value the caller will see; if a signal is received,
+ // handleSwapUint32Fault will store a different value in this address.
+ MOVL $0, sig+20(FP)
+
+ MOVQ addr+0(FP), DI
+ MOVL new+8(FP), AX
+ XCHGL AX, 0(DI)
+ MOVL AX, old+16(FP)
+ RET
+
+// handleSwapUint64Fault returns the value stored in DI. Control is transferred
+// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in DI.
+//
+// It must have the same frame configuration as swapUint64 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28
+ MOVL DI, sig+24(FP)
+ RET
+
+// swapUint64 atomically stores new into *addr and returns (the previous *addr
+// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
+// value of old is unspecified, and sig is the number of the signal that was
+// received.
+//
+// Preconditions: addr must be aligned to an 8-byte boundary.
+//
+//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32)
+TEXT ·swapUint64(SB), NOSPLIT, $0-28
+ // Store 0 as the returned signal number. If we run to completion,
+ // this is the value the caller will see; if a signal is received,
+ // handleSwapUint64Fault will store a different value in this address.
+ MOVL $0, sig+24(FP)
+
+ MOVQ addr+0(FP), DI
+ MOVQ new+8(FP), AX
+ XCHGQ AX, 0(DI)
+ MOVQ AX, old+16(FP)
+ RET
+
+// handleCompareAndSwapUint32Fault returns the value stored in DI. Control is
+// transferred to it when compareAndSwapUint32 below receives SIGSEGV or
+// SIGBUS, with the signal number stored in DI.
+//
+// It must have the same frame configuration as compareAndSwapUint32 so that it
+// can undo any potential call frame set up by the assembler.
+TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24
+ MOVL DI, sig+20(FP)
+ RET
+
+// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
+// (the value previously stored at addr, 0). If a SIGSEGV or SIGBUS signal is
+// received during the operation, the value of prev is unspecified, and sig is
+// the number of the signal that was received.
+//
+// Preconditions: addr must be aligned to a 4-byte boundary.
+//
+//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32)
+TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24
+ // Store 0 as the returned signal number. If we run to completion, this is
+ // the value the caller will see; if a signal is received,
+ // handleCompareAndSwapUint32Fault will store a different value in this
+ // address.
+ MOVL $0, sig+20(FP)
+
+ MOVQ addr+0(FP), DI
+ MOVL old+8(FP), AX
+ MOVL new+12(FP), DX
+ LOCK
+ CMPXCHGL DX, 0(DI)
+ MOVL AX, prev+16(FP)
+ RET
+
+// handleLoadUint32Fault returns the value stored in DI. Control is transferred
+// to it when loadUint32 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in DI.
+//
+// It must have the same frame configuration as loadUint32 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16
+ MOVL DI, sig+12(FP)
+ RET
+
+// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS
+// signal is received, the value returned is unspecified, and sig is the number
+// of the signal that was received.
+//
+// Preconditions: addr must be aligned to a 4-byte boundary.
+//
+//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
+TEXT ·loadUint32(SB), NOSPLIT, $0-16
+ // Store 0 as the returned signal number. If we run to completion,
+ // this is the value the caller will see; if a signal is received,
+ // handleLoadUint32Fault will store a different value in this address.
+ MOVL $0, sig+12(FP)
+
+ MOVQ addr+0(FP), AX
+ MOVL (AX), BX
+ MOVL BX, val+8(FP)
+ RET
diff --git a/pkg/sentry/platform/safecopy/atomic_arm64.s b/pkg/sentry/platform/safecopy/atomic_arm64.s
new file mode 100644
index 000000000..d58ed71f7
--- /dev/null
+++ b/pkg/sentry/platform/safecopy/atomic_arm64.s
@@ -0,0 +1,126 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// handleSwapUint32Fault returns the value stored in R1. Control is transferred
+// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in R1.
+//
+// It must have the same frame configuration as swapUint32 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24
+ MOVW R1, sig+20(FP)
+ RET
+
+// See the corresponding doc in safecopy_unsafe.go
+//
+// The code is derived from Go source runtime/internal/atomic.Xchg.
+//
+//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32)
+TEXT ·swapUint32(SB), NOSPLIT, $0-24
+ // Store 0 as the returned signal number. If we run to completion,
+ // this is the value the caller will see; if a signal is received,
+ // handleSwapUint32Fault will store a different value in this address.
+ MOVW $0, sig+20(FP)
+again:
+ MOVD addr+0(FP), R0
+ MOVW new+8(FP), R1
+ LDAXRW (R0), R2
+ STLXRW R1, (R0), R3
+ CBNZ R3, again
+ MOVW R2, old+16(FP)
+ RET
+
+// handleSwapUint64Fault returns the value stored in R1. Control is transferred
+// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in R1.
+//
+// It must have the same frame configuration as swapUint64 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28
+ MOVW R1, sig+24(FP)
+ RET
+
+// See the corresponding doc in safecopy_unsafe.go
+//
+// The code is derived from Go source runtime/internal/atomic.Xchg64.
+//
+//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32)
+TEXT ·swapUint64(SB), NOSPLIT, $0-28
+ // Store 0 as the returned signal number. If we run to completion,
+ // this is the value the caller will see; if a signal is received,
+ // handleSwapUint64Fault will store a different value in this address.
+ MOVW $0, sig+24(FP)
+again:
+ MOVD addr+0(FP), R0
+ MOVD new+8(FP), R1
+ LDAXR (R0), R2
+ STLXR R1, (R0), R3
+ CBNZ R3, again
+ MOVD R2, old+16(FP)
+ RET
+
+// handleCompareAndSwapUint32Fault returns the value stored in R1. Control is
+// transferred to it when compareAndSwapUint32 below receives SIGSEGV or SIGBUS,
+// with the signal number stored in R1.
+//
+// It must have the same frame configuration as compareAndSwapUint32 so that it
+// can undo any potential call frame set up by the assembler.
+TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24
+ MOVW R1, sig+20(FP)
+ RET
+
+// See the corresponding doc in safecopy_unsafe.go
+//
+// The code is derived from Go source runtime/internal/atomic.Cas.
+//
+//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32)
+TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24
+ // Store 0 as the returned signal number. If we run to completion, this is
+ // the value the caller will see; if a signal is received,
+ // handleCompareAndSwapUint32Fault will store a different value in this
+ // address.
+ MOVW $0, sig+20(FP)
+
+ MOVD addr+0(FP), R0
+ MOVW old+8(FP), R1
+ MOVW new+12(FP), R2
+again:
+ LDAXRW (R0), R3
+ CMPW R1, R3
+ BNE done
+ STLXRW R2, (R0), R4
+ CBNZ R4, again
+done:
+ MOVW R3, prev+16(FP)
+ RET
+
+// handleLoadUint32Fault returns the value stored in R1. Control is transferred
+// to it when loadUint32 below receives SIGSEGV or SIGBUS, with the signal
+// number stored in R1.
+//
+// It must have the same frame configuration as loadUint32 so that it can undo
+// any potential call frame set up by the assembler.
+TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16
+ MOVW R1, sig+12(FP)
+ RET
+
+// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS
+// signal is received, the value returned is unspecified, and sig is the number
+// of the signal that was received.
+//
+// Preconditions: addr must be aligned to a 4-byte boundary.
+//
+//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
+TEXT ·loadUint32(SB), NOSPLIT, $0-16
+ // Store 0 as the returned signal number. If we run to completion,
+ // this is the value the caller will see; if a signal is received,
+ // handleLoadUint32Fault will store a different value in this address.
+ MOVW $0, sig+12(FP)
+
+ MOVD addr+0(FP), R0
+ LDARW (R0), R1
+ MOVW R1, val+8(FP)
+ RET
diff --git a/pkg/sentry/platform/safecopy/memclr_amd64.s b/pkg/sentry/platform/safecopy/memclr_amd64.s
new file mode 100644
index 000000000..64cf32f05
--- /dev/null
+++ b/pkg/sentry/platform/safecopy/memclr_amd64.s
@@ -0,0 +1,147 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// handleMemclrFault returns (the value stored in AX, the value stored in DI).
+// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS,
+// with the faulting address stored in AX and the signal number stored in DI.
+//
+// It must have the same frame configuration as memclr so that it can undo any
+// potential call frame set up by the assembler.
+TEXT handleMemclrFault(SB), NOSPLIT, $0-28
+ MOVQ AX, addr+16(FP)
+ MOVL DI, sig+24(FP)
+ RET
+
+// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS
+// signal is received during the write, it returns the address that caused the
+// fault and the number of the signal that was received. Otherwise, it returns
+// an unspecified address and a signal number of 0.
+//
+// Data is written in order, such that if a fault happens at address p, it is
+// safe to assume that all data before p-maxRegisterSize has already been
+// successfully written.
+//
+// The code is derived from runtime.memclrNoHeapPointers.
+//
+// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+TEXT ·memclr(SB), NOSPLIT, $0-28
+ // Store 0 as the returned signal number. If we run to completion,
+ // this is the value the caller will see; if a signal is received,
+ // handleMemclrFault will store a different value in this address.
+ MOVL $0, sig+24(FP)
+
+ MOVQ ptr+0(FP), DI
+ MOVQ n+8(FP), BX
+ XORQ AX, AX
+
+ // MOVOU seems to always be faster than REP STOSQ.
+tail:
+ TESTQ BX, BX
+ JEQ _0
+ CMPQ BX, $2
+ JBE _1or2
+ CMPQ BX, $4
+ JBE _3or4
+ CMPQ BX, $8
+ JB _5through7
+ JE _8
+ CMPQ BX, $16
+ JBE _9through16
+ PXOR X0, X0
+ CMPQ BX, $32
+ JBE _17through32
+ CMPQ BX, $64
+ JBE _33through64
+ CMPQ BX, $128
+ JBE _65through128
+ CMPQ BX, $256
+ JBE _129through256
+ // TODO: use branch table and BSR to make this just a single dispatch
+ // TODO: for really big clears, use MOVNTDQ, even without AVX2.
+
+loop:
+ MOVOU X0, 0(DI)
+ MOVOU X0, 16(DI)
+ MOVOU X0, 32(DI)
+ MOVOU X0, 48(DI)
+ MOVOU X0, 64(DI)
+ MOVOU X0, 80(DI)
+ MOVOU X0, 96(DI)
+ MOVOU X0, 112(DI)
+ MOVOU X0, 128(DI)
+ MOVOU X0, 144(DI)
+ MOVOU X0, 160(DI)
+ MOVOU X0, 176(DI)
+ MOVOU X0, 192(DI)
+ MOVOU X0, 208(DI)
+ MOVOU X0, 224(DI)
+ MOVOU X0, 240(DI)
+ SUBQ $256, BX
+ ADDQ $256, DI
+ CMPQ BX, $256
+ JAE loop
+ JMP tail
+
+_1or2:
+ MOVB AX, (DI)
+ MOVB AX, -1(DI)(BX*1)
+ RET
+_0:
+ RET
+_3or4:
+ MOVW AX, (DI)
+ MOVW AX, -2(DI)(BX*1)
+ RET
+_5through7:
+ MOVL AX, (DI)
+ MOVL AX, -4(DI)(BX*1)
+ RET
+_8:
+ // We need a separate case for 8 to make sure we clear pointers atomically.
+ MOVQ AX, (DI)
+ RET
+_9through16:
+ MOVQ AX, (DI)
+ MOVQ AX, -8(DI)(BX*1)
+ RET
+_17through32:
+ MOVOU X0, (DI)
+ MOVOU X0, -16(DI)(BX*1)
+ RET
+_33through64:
+ MOVOU X0, (DI)
+ MOVOU X0, 16(DI)
+ MOVOU X0, -32(DI)(BX*1)
+ MOVOU X0, -16(DI)(BX*1)
+ RET
+_65through128:
+ MOVOU X0, (DI)
+ MOVOU X0, 16(DI)
+ MOVOU X0, 32(DI)
+ MOVOU X0, 48(DI)
+ MOVOU X0, -64(DI)(BX*1)
+ MOVOU X0, -48(DI)(BX*1)
+ MOVOU X0, -32(DI)(BX*1)
+ MOVOU X0, -16(DI)(BX*1)
+ RET
+_129through256:
+ MOVOU X0, (DI)
+ MOVOU X0, 16(DI)
+ MOVOU X0, 32(DI)
+ MOVOU X0, 48(DI)
+ MOVOU X0, 64(DI)
+ MOVOU X0, 80(DI)
+ MOVOU X0, 96(DI)
+ MOVOU X0, 112(DI)
+ MOVOU X0, -128(DI)(BX*1)
+ MOVOU X0, -112(DI)(BX*1)
+ MOVOU X0, -96(DI)(BX*1)
+ MOVOU X0, -80(DI)(BX*1)
+ MOVOU X0, -64(DI)(BX*1)
+ MOVOU X0, -48(DI)(BX*1)
+ MOVOU X0, -32(DI)(BX*1)
+ MOVOU X0, -16(DI)(BX*1)
+ RET
diff --git a/pkg/sentry/platform/safecopy/memclr_arm64.s b/pkg/sentry/platform/safecopy/memclr_arm64.s
new file mode 100644
index 000000000..7361b9067
--- /dev/null
+++ b/pkg/sentry/platform/safecopy/memclr_arm64.s
@@ -0,0 +1,74 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// handleMemclrFault returns (the value stored in R0, the value stored in R1).
+// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS,
+// with the faulting address stored in R0 and the signal number stored in R1.
+//
+// It must have the same frame configuration as memclr so that it can undo any
+// potential call frame set up by the assembler.
+TEXT handleMemclrFault(SB), NOSPLIT, $0-28
+ MOVD R0, addr+16(FP)
+ MOVW R1, sig+24(FP)
+ RET
+
+// See the corresponding doc in safecopy_unsafe.go
+//
+// The code is derived from runtime.memclrNoHeapPointers.
+//
+// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+TEXT ·memclr(SB), NOSPLIT, $0-28
+ // Store 0 as the returned signal number. If we run to completion,
+ // this is the value the caller will see; if a signal is received,
+ // handleMemclrFault will store a different value in this address.
+ MOVW $0, sig+24(FP)
+ MOVD ptr+0(FP), R0
+ MOVD n+8(FP), R1
+
+ // If size is less than 16 bytes, use tail_zero to zero what remains
+ CMP $16, R1
+ BLT tail_zero
+ // Get buffer offset into 16 byte aligned address for better performance
+ ANDS $15, R0, ZR
+ BNE unaligned_to_16
+aligned_to_16:
+ LSR $4, R1, R2
+zero_by_16:
+ STP.P (ZR, ZR), 16(R0) // Store pair with post index.
+ SUBS $1, R2, R2
+ BNE zero_by_16
+ ANDS $15, R1, R1
+ BEQ end
+
+ // Zero buffer with size=R1 < 16
+tail_zero:
+ TBZ $3, R1, tail_zero_4
+ MOVD.P ZR, 8(R0)
+tail_zero_4:
+ TBZ $2, R1, tail_zero_2
+ MOVW.P ZR, 4(R0)
+tail_zero_2:
+ TBZ $1, R1, tail_zero_1
+ MOVH.P ZR, 2(R0)
+tail_zero_1:
+ TBZ $0, R1, end
+ MOVB ZR, (R0)
+end:
+ RET
+
+unaligned_to_16:
+ MOVD R0, R2
+head_loop:
+ MOVBU.P ZR, 1(R0)
+ ANDS $15, R0, ZR
+ BNE head_loop
+ // Adjust length for what remains
+ SUB R2, R0, R3
+ SUB R3, R1
+ // If size is less than 16 bytes, use tail_zero to zero what remains
+ CMP $16, R1
+ BLT tail_zero
+ B aligned_to_16
diff --git a/pkg/sentry/platform/safecopy/memcpy_amd64.s b/pkg/sentry/platform/safecopy/memcpy_amd64.s
new file mode 100644
index 000000000..129691d68
--- /dev/null
+++ b/pkg/sentry/platform/safecopy/memcpy_amd64.s
@@ -0,0 +1,250 @@
+// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
+// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
+// Portions Copyright 2009 The Go Authors. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "textflag.h"
+
+// handleMemcpyFault returns (the value stored in AX, the value stored in DI).
+// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS,
+// with the faulting address stored in AX and the signal number stored in DI.
+//
+// It must have the same frame configuration as memcpy so that it can undo any
+// potential call frame set up by the assembler.
+TEXT handleMemcpyFault(SB), NOSPLIT, $0-36
+ MOVQ AX, addr+24(FP)
+ MOVL DI, sig+32(FP)
+ RET
+
+// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received
+// during the copy, it returns the address that caused the fault and the number
+// of the signal that was received. Otherwise, it returns an unspecified address
+// and a signal number of 0.
+//
+// Data is copied in order, such that if a fault happens at address p, it is
+// safe to assume that all data before p-maxRegisterSize has already been
+// successfully copied.
+//
+// The code is derived from the forward copying part of runtime.memmove.
+//
+// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+TEXT ·memcpy(SB), NOSPLIT, $0-36
+ // Store 0 as the returned signal number. If we run to completion,
+ // this is the value the caller will see; if a signal is received,
+ // handleMemcpyFault will store a different value in this address.
+ MOVL $0, sig+32(FP)
+
+ MOVQ to+0(FP), DI
+ MOVQ from+8(FP), SI
+ MOVQ n+16(FP), BX
+
+ // REP instructions have a high startup cost, so we handle small sizes
+ // with some straightline code. The REP MOVSQ instruction is really fast
+ // for large sizes. The cutover is approximately 2K.
+tail:
+ // move_129through256 or smaller work whether or not the source and the
+ // destination memory regions overlap because they load all data into
+ // registers before writing it back. move_256through2048 on the other
+ // hand can be used only when the memory regions don't overlap or the copy
+ // direction is forward.
+ TESTQ BX, BX
+ JEQ move_0
+ CMPQ BX, $2
+ JBE move_1or2
+ CMPQ BX, $4
+ JBE move_3or4
+ CMPQ BX, $8
+ JB move_5through7
+ JE move_8
+ CMPQ BX, $16
+ JBE move_9through16
+ CMPQ BX, $32
+ JBE move_17through32
+ CMPQ BX, $64
+ JBE move_33through64
+ CMPQ BX, $128
+ JBE move_65through128
+ CMPQ BX, $256
+ JBE move_129through256
+ // TODO: use branch table and BSR to make this just a single dispatch
+
+/*
+ * forward copy loop
+ */
+ CMPQ BX, $2048
+ JLS move_256through2048
+
+ // Check alignment
+ MOVL SI, AX
+ ORL DI, AX
+ TESTL $7, AX
+ JEQ fwdBy8
+
+ // Do 1 byte at a time
+ MOVQ BX, CX
+ REP; MOVSB
+ RET
+
+fwdBy8:
+ // Do 8 bytes at a time
+ MOVQ BX, CX
+ SHRQ $3, CX
+ ANDQ $7, BX
+ REP; MOVSQ
+ JMP tail
+
+move_1or2:
+ MOVB (SI), AX
+ MOVB AX, (DI)
+ MOVB -1(SI)(BX*1), CX
+ MOVB CX, -1(DI)(BX*1)
+ RET
+move_0:
+ RET
+move_3or4:
+ MOVW (SI), AX
+ MOVW AX, (DI)
+ MOVW -2(SI)(BX*1), CX
+ MOVW CX, -2(DI)(BX*1)
+ RET
+move_5through7:
+ MOVL (SI), AX
+ MOVL AX, (DI)
+ MOVL -4(SI)(BX*1), CX
+ MOVL CX, -4(DI)(BX*1)
+ RET
+move_8:
+ // We need a separate case for 8 to make sure we write pointers atomically.
+ MOVQ (SI), AX
+ MOVQ AX, (DI)
+ RET
+move_9through16:
+ MOVQ (SI), AX
+ MOVQ AX, (DI)
+ MOVQ -8(SI)(BX*1), CX
+ MOVQ CX, -8(DI)(BX*1)
+ RET
+move_17through32:
+ MOVOU (SI), X0
+ MOVOU X0, (DI)
+ MOVOU -16(SI)(BX*1), X1
+ MOVOU X1, -16(DI)(BX*1)
+ RET
+move_33through64:
+ MOVOU (SI), X0
+ MOVOU X0, (DI)
+ MOVOU 16(SI), X1
+ MOVOU X1, 16(DI)
+ MOVOU -32(SI)(BX*1), X2
+ MOVOU X2, -32(DI)(BX*1)
+ MOVOU -16(SI)(BX*1), X3
+ MOVOU X3, -16(DI)(BX*1)
+ RET
+move_65through128:
+ MOVOU (SI), X0
+ MOVOU X0, (DI)
+ MOVOU 16(SI), X1
+ MOVOU X1, 16(DI)
+ MOVOU 32(SI), X2
+ MOVOU X2, 32(DI)
+ MOVOU 48(SI), X3
+ MOVOU X3, 48(DI)
+ MOVOU -64(SI)(BX*1), X4
+ MOVOU X4, -64(DI)(BX*1)
+ MOVOU -48(SI)(BX*1), X5
+ MOVOU X5, -48(DI)(BX*1)
+ MOVOU -32(SI)(BX*1), X6
+ MOVOU X6, -32(DI)(BX*1)
+ MOVOU -16(SI)(BX*1), X7
+ MOVOU X7, -16(DI)(BX*1)
+ RET
+move_129through256:
+ MOVOU (SI), X0
+ MOVOU X0, (DI)
+ MOVOU 16(SI), X1
+ MOVOU X1, 16(DI)
+ MOVOU 32(SI), X2
+ MOVOU X2, 32(DI)
+ MOVOU 48(SI), X3
+ MOVOU X3, 48(DI)
+ MOVOU 64(SI), X4
+ MOVOU X4, 64(DI)
+ MOVOU 80(SI), X5
+ MOVOU X5, 80(DI)
+ MOVOU 96(SI), X6
+ MOVOU X6, 96(DI)
+ MOVOU 112(SI), X7
+ MOVOU X7, 112(DI)
+ MOVOU -128(SI)(BX*1), X8
+ MOVOU X8, -128(DI)(BX*1)
+ MOVOU -112(SI)(BX*1), X9
+ MOVOU X9, -112(DI)(BX*1)
+ MOVOU -96(SI)(BX*1), X10
+ MOVOU X10, -96(DI)(BX*1)
+ MOVOU -80(SI)(BX*1), X11
+ MOVOU X11, -80(DI)(BX*1)
+ MOVOU -64(SI)(BX*1), X12
+ MOVOU X12, -64(DI)(BX*1)
+ MOVOU -48(SI)(BX*1), X13
+ MOVOU X13, -48(DI)(BX*1)
+ MOVOU -32(SI)(BX*1), X14
+ MOVOU X14, -32(DI)(BX*1)
+ MOVOU -16(SI)(BX*1), X15
+ MOVOU X15, -16(DI)(BX*1)
+ RET
+move_256through2048:
+ SUBQ $256, BX
+ MOVOU (SI), X0
+ MOVOU X0, (DI)
+ MOVOU 16(SI), X1
+ MOVOU X1, 16(DI)
+ MOVOU 32(SI), X2
+ MOVOU X2, 32(DI)
+ MOVOU 48(SI), X3
+ MOVOU X3, 48(DI)
+ MOVOU 64(SI), X4
+ MOVOU X4, 64(DI)
+ MOVOU 80(SI), X5
+ MOVOU X5, 80(DI)
+ MOVOU 96(SI), X6
+ MOVOU X6, 96(DI)
+ MOVOU 112(SI), X7
+ MOVOU X7, 112(DI)
+ MOVOU 128(SI), X8
+ MOVOU X8, 128(DI)
+ MOVOU 144(SI), X9
+ MOVOU X9, 144(DI)
+ MOVOU 160(SI), X10
+ MOVOU X10, 160(DI)
+ MOVOU 176(SI), X11
+ MOVOU X11, 176(DI)
+ MOVOU 192(SI), X12
+ MOVOU X12, 192(DI)
+ MOVOU 208(SI), X13
+ MOVOU X13, 208(DI)
+ MOVOU 224(SI), X14
+ MOVOU X14, 224(DI)
+ MOVOU 240(SI), X15
+ MOVOU X15, 240(DI)
+ CMPQ BX, $256
+ LEAQ 256(SI), SI
+ LEAQ 256(DI), DI
+ JGE move_256through2048
+ JMP tail
diff --git a/pkg/sentry/platform/safecopy/memcpy_arm64.s b/pkg/sentry/platform/safecopy/memcpy_arm64.s
new file mode 100644
index 000000000..e7e541565
--- /dev/null
+++ b/pkg/sentry/platform/safecopy/memcpy_arm64.s
@@ -0,0 +1,78 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// handleMemcpyFault returns (the value stored in R0, the value stored in R1).
+// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS,
+// with the faulting address stored in R0 and the signal number stored in R1.
+//
+// It must have the same frame configuration as memcpy so that it can undo any
+// potential call frame set up by the assembler.
+TEXT handleMemcpyFault(SB), NOSPLIT, $0-36
+ MOVD R0, addr+24(FP)
+ MOVW R1, sig+32(FP)
+ RET
+
+// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received
+// during the copy, it returns the address that caused the fault and the number
+// of the signal that was received. Otherwise, it returns an unspecified address
+// and a signal number of 0.
+//
+// Data is copied in order, such that if a fault happens at address p, it is
+// safe to assume that all data before p-maxRegisterSize has already been
+// successfully copied.
+//
+// The code is derived from the Go source runtime.memmove.
+//
+// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+TEXT ·memcpy(SB), NOSPLIT, $-8-36
+ // Store 0 as the returned signal number. If we run to completion,
+ // this is the value the caller will see; if a signal is received,
+ // handleMemcpyFault will store a different value in this address.
+ MOVW $0, sig+32(FP)
+
+ MOVD to+0(FP), R3
+ MOVD from+8(FP), R4
+ MOVD n+16(FP), R5
+ CMP $0, R5
+ BNE check
+ RET
+
+check:
+ AND $~7, R5, R7 // R7 is N&~7.
+ SUB R7, R5, R6 // R6 is N&7.
+
+ // Copying forward proceeds by copying R7/8 words then copying R6 bytes.
+ // R3 and R4 are advanced as we copy.
+
+ // (There may be implementations of armv8 where copying by bytes until
+ // at least one of source or dest is word aligned is a worthwhile
+ // optimization, but on the one tested so far (xgene) it did not
+ // make a significant difference.)
+
+ CMP $0, R7 // Do we need to do any word-by-word copying?
+ BEQ noforwardlarge
+ ADD R3, R7, R9 // R9 points just past where we copy by word.
+
+forwardlargeloop:
+ MOVD.P 8(R4), R8 // R8 is just a scratch register.
+ MOVD.P R8, 8(R3)
+ CMP R3, R9
+ BNE forwardlargeloop
+
+noforwardlarge:
+ CMP $0, R6 // Do we need to do any byte-by-byte copying?
+ BNE forwardtail
+ RET
+
+forwardtail:
+ ADD R3, R6, R9 // R9 points just past the destination memory.
+
+forwardtailloop:
+ MOVBU.P 1(R4), R8
+ MOVBU.P R8, 1(R3)
+ CMP R3, R9
+ BNE forwardtailloop
+ RET
diff --git a/pkg/sentry/platform/safecopy/safecopy.go b/pkg/sentry/platform/safecopy/safecopy.go
new file mode 100644
index 000000000..5126871eb
--- /dev/null
+++ b/pkg/sentry/platform/safecopy/safecopy.go
@@ -0,0 +1,144 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package safecopy provides an efficient implementation of functions to access
+// memory that may result in SIGSEGV or SIGBUS being sent to the accessor.
+package safecopy
+
+import (
+ "fmt"
+ "reflect"
+ "runtime"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// SegvError is returned when a safecopy function receives SIGSEGV.
+type SegvError struct {
+ // Addr is the address at which the SIGSEGV occurred.
+ Addr uintptr
+}
+
+// Error implements error.Error.
+func (e SegvError) Error() string {
+ return fmt.Sprintf("SIGSEGV at %#x", e.Addr)
+}
+
+// BusError is returned when a safecopy function receives SIGBUS.
+type BusError struct {
+ // Addr is the address at which the SIGBUS occurred.
+ Addr uintptr
+}
+
+// Error implements error.Error.
+func (e BusError) Error() string {
+ return fmt.Sprintf("SIGBUS at %#x", e.Addr)
+}
+
+// AlignmentError is returned when a safecopy function is passed an address
+// that does not meet alignment requirements.
+type AlignmentError struct {
+ // Addr is the invalid address.
+ Addr uintptr
+
+ // Alignment is the required alignment.
+ Alignment uintptr
+}
+
+// Error implements error.Error.
+func (e AlignmentError) Error() string {
+ return fmt.Sprintf("address %#x is not aligned to a %d-byte boundary", e.Addr, e.Alignment)
+}
+
+var (
+ // The begin and end addresses below are for the functions that are
+ // checked by the signal handler.
+ memcpyBegin uintptr
+ memcpyEnd uintptr
+ memclrBegin uintptr
+ memclrEnd uintptr
+ swapUint32Begin uintptr
+ swapUint32End uintptr
+ swapUint64Begin uintptr
+ swapUint64End uintptr
+ compareAndSwapUint32Begin uintptr
+ compareAndSwapUint32End uintptr
+ loadUint32Begin uintptr
+ loadUint32End uintptr
+
+ // savedSigSegVHandler is a pointer to the SIGSEGV handler that was
+ // configured before we replaced it with our own. We still call into it
+ // when we get a SIGSEGV that is not interesting to us.
+ savedSigSegVHandler uintptr
+
+ // Same as above, but for SIGBUS signals.
+ savedSigBusHandler uintptr
+)
+
+// signalHandler is our replacement signal handler for SIGSEGV and SIGBUS
+// signals.
+func signalHandler()
+
+// FindEndAddress returns the end address (one byte beyond the last) of the
+// function that contains the specified address (begin).
+func FindEndAddress(begin uintptr) uintptr {
+ f := runtime.FuncForPC(begin)
+ if f != nil {
+ for p := begin; ; p++ {
+ g := runtime.FuncForPC(p)
+ if f != g {
+ return p
+ }
+ }
+ }
+ return begin
+}
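+
+// For example (illustrative): if memcpy occupies program counters [p, q),
+// FindEndAddress(p) returns q, the first PC that runtime.FuncForPC
+// attributes to a different function; the signal handler then treats
+// [p, q) as a protected range.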
+
+// initializeAddresses initializes the addresses used by the signal handler.
+func initializeAddresses() {
+ // The following functions are written in assembly language, so they won't
+ // be inlined by the existing compiler/linker. Tests will fail if this
+ // assumption is violated.
+ memcpyBegin = reflect.ValueOf(memcpy).Pointer()
+ memcpyEnd = FindEndAddress(memcpyBegin)
+ memclrBegin = reflect.ValueOf(memclr).Pointer()
+ memclrEnd = FindEndAddress(memclrBegin)
+ swapUint32Begin = reflect.ValueOf(swapUint32).Pointer()
+ swapUint32End = FindEndAddress(swapUint32Begin)
+ swapUint64Begin = reflect.ValueOf(swapUint64).Pointer()
+ swapUint64End = FindEndAddress(swapUint64Begin)
+ compareAndSwapUint32Begin = reflect.ValueOf(compareAndSwapUint32).Pointer()
+ compareAndSwapUint32End = FindEndAddress(compareAndSwapUint32Begin)
+ loadUint32Begin = reflect.ValueOf(loadUint32).Pointer()
+ loadUint32End = FindEndAddress(loadUint32Begin)
+}
+
+func init() {
+ initializeAddresses()
+ if err := ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(signalHandler).Pointer(), &savedSigSegVHandler); err != nil {
+ panic(fmt.Sprintf("Unable to set handler for SIGSEGV: %v", err))
+ }
+ if err := ReplaceSignalHandler(syscall.SIGBUS, reflect.ValueOf(signalHandler).Pointer(), &savedSigBusHandler); err != nil {
+ panic(fmt.Sprintf("Unable to set handler for SIGBUS: %v", err))
+ }
+ syserror.AddErrorUnwrapper(func(e error) (syscall.Errno, bool) {
+ switch e.(type) {
+ case SegvError, BusError, AlignmentError:
+ return syscall.EFAULT, true
+ default:
+ return 0, false
+ }
+ })
+}
diff --git a/pkg/sentry/platform/safecopy/safecopy_state_autogen.go b/pkg/sentry/platform/safecopy/safecopy_state_autogen.go
new file mode 100755
index 000000000..58fd8fbd0
--- /dev/null
+++ b/pkg/sentry/platform/safecopy/safecopy_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package safecopy
+
diff --git a/pkg/sentry/platform/safecopy/safecopy_unsafe.go b/pkg/sentry/platform/safecopy/safecopy_unsafe.go
new file mode 100644
index 000000000..eef028e68
--- /dev/null
+++ b/pkg/sentry/platform/safecopy/safecopy_unsafe.go
@@ -0,0 +1,335 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package safecopy
+
+import (
+ "fmt"
+ "syscall"
+ "unsafe"
+)
+
+// maxRegisterSize is the maximum register size used in memcpy and memclr. It
+// is used to decide by how much to rewind the copy (for memcpy) or zeroing
+// (for memclr) before proceeding.
+const maxRegisterSize = 16
+
+// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received
+// during the copy, it returns the address that caused the fault and the number
+// of the signal that was received. Otherwise, it returns an unspecified address
+// and a signal number of 0.
+//
+// Data is copied in order, such that if a fault happens at address p, it is
+// safe to assume that all data before p-maxRegisterSize has already been
+// successfully copied.
+//
+//go:noescape
+func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+
+// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS
+// signal is received during the write, it returns the address that caused the
+// fault and the number of the signal that was received. Otherwise, it returns
+// an unspecified address and a signal number of 0.
+//
+// Data is written in order, such that if a fault happens at address p, it is
+// safe to assume that all data before p-maxRegisterSize has already been
+// successfully written.
+//
+//go:noescape
+func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32)
+
+// swapUint32 atomically stores new into *ptr and returns (the previous *ptr
+// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
+// value of old is unspecified, and sig is the number of the signal that was
+// received.
+//
+// Preconditions: ptr must be aligned to a 4-byte boundary.
+//
+//go:noescape
+func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32)
+
+// swapUint64 atomically stores new into *ptr and returns (the previous *ptr
+// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
+// value of old is unspecified, and sig is the number of the signal that was
+// received.
+//
+// Preconditions: ptr must be aligned to an 8-byte boundary.
+//
+//go:noescape
+func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32)
+
+// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
+// (the value previously stored at ptr, 0). If a SIGSEGV or SIGBUS signal is
+// received during the operation, the value of prev is unspecified, and sig is
+// the number of the signal that was received.
+//
+// Preconditions: ptr must be aligned to a 4-byte boundary.
+//
+//go:noescape
+func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32)
+
+// loadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It
+// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr.
+//
+// Preconditions: ptr must be aligned to a 4-byte boundary.
+//
+//go:noescape
+func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)
+
+// CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes
+// copied and an error if SIGSEGV or SIGBUS is received while reading from src.
+func CopyIn(dst []byte, src unsafe.Pointer) (int, error) {
+ toCopy := uintptr(len(dst))
+ if len(dst) == 0 {
+ return 0, nil
+ }
+
+ fault, sig := memcpy(unsafe.Pointer(&dst[0]), src, toCopy)
+ if sig == 0 {
+ return len(dst), nil
+ }
+
+ faultN, srcN := uintptr(fault), uintptr(src)
+ if faultN < srcN || faultN >= srcN+toCopy {
+ panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, faultN, srcN, srcN+toCopy))
+ }
+
+ // memcpy might have ended the copy up to maxRegisterSize bytes before
+ // fault, if an instruction caused a memory access that straddled two
+ // pages, and the second one faulted. Try to copy up to the fault.
+ var done int
+ if faultN-srcN > maxRegisterSize {
+ done = int(faultN - srcN - maxRegisterSize)
+ }
+ n, err := CopyIn(dst[done:int(faultN-srcN)], unsafe.Pointer(srcN+uintptr(done)))
+ done += n
+ if err != nil {
+ return done, err
+ }
+ return done, errorFromFaultSignal(fault, sig)
+}
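+
+// A worked example of the rewind above (hypothetical numbers): copying
+// 8192 bytes where the second page faults at src+4096 yields
+// faultN-srcN = 4096, so done starts at 4096-maxRegisterSize = 4080;
+// the recursive CopyIn then re-copies bytes [4080, 4096) before
+// returning the SegvError for the second page.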
+
+// CopyOut copies len(src) bytes from src to dst. It returns the number of
+// bytes copied and an error if SIGSEGV or SIGBUS is received while writing to
+// dst.
+func CopyOut(dst unsafe.Pointer, src []byte) (int, error) {
+ toCopy := uintptr(len(src))
+ if toCopy == 0 {
+ return 0, nil
+ }
+
+ fault, sig := memcpy(dst, unsafe.Pointer(&src[0]), toCopy)
+ if sig == 0 {
+ return len(src), nil
+ }
+
+ faultN, dstN := uintptr(fault), uintptr(dst)
+ if faultN < dstN || faultN >= dstN+toCopy {
+ panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toCopy))
+ }
+
+ // memcpy might have ended the copy up to maxRegisterSize bytes before
+ // fault, if an instruction caused a memory access that straddled two
+ // pages, and the second one faulted. Try to copy up to the fault.
+ var done int
+ if faultN-dstN > maxRegisterSize {
+ done = int(faultN - dstN - maxRegisterSize)
+ }
+ n, err := CopyOut(unsafe.Pointer(dstN+uintptr(done)), src[done:int(faultN-dstN)])
+ done += n
+ if err != nil {
+ return done, err
+ }
+ return done, errorFromFaultSignal(fault, sig)
+}
+
+// Copy copies toCopy bytes from src to dst. It returns the number of bytes
+// copied and an error if SIGSEGV or SIGBUS is received while reading from src
+// or writing to dst.
+//
+// Data is copied in order; if [src, src+toCopy) and [dst, dst+toCopy) overlap,
+// the resulting contents of dst are unspecified.
+func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) {
+ if toCopy == 0 {
+ return 0, nil
+ }
+
+ fault, sig := memcpy(dst, src, toCopy)
+ if sig == 0 {
+ return toCopy, nil
+ }
+
+ // Did the fault occur while reading from src or writing to dst?
+ faultN, srcN, dstN := uintptr(fault), uintptr(src), uintptr(dst)
+ faultAfterSrc := ^uintptr(0)
+ if faultN >= srcN {
+ faultAfterSrc = faultN - srcN
+ }
+ faultAfterDst := ^uintptr(0)
+ if faultN >= dstN {
+ faultAfterDst = faultN - dstN
+ }
+ if faultAfterSrc >= toCopy && faultAfterDst >= toCopy {
+ panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, faultN, srcN, srcN+toCopy, dstN, dstN+toCopy))
+ }
+ faultedAfter := faultAfterSrc
+ if faultedAfter > faultAfterDst {
+ faultedAfter = faultAfterDst
+ }
+
+ // memcpy might have ended the copy up to maxRegisterSize bytes before
+ // fault, if an instruction caused a memory access that straddled two
+ // pages, and the second one faulted. Try to copy up to the fault.
+ var done uintptr
+ if faultedAfter > maxRegisterSize {
+ done = faultedAfter - maxRegisterSize
+ }
+ n, err := Copy(unsafe.Pointer(dstN+done), unsafe.Pointer(srcN+done), faultedAfter-done)
+ done += n
+ if err != nil {
+ return done, err
+ }
+ return done, errorFromFaultSignal(fault, sig)
+}
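+
+// Note on the sentinels above: ^uintptr(0) is the maximum uintptr, so a
+// fault below src (or dst) can never be attributed to that region; the
+// smaller of faultAfterSrc and faultAfterDst is the conservative bound
+// on how far the copy progressed before faulting.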
+
+// ZeroOut writes toZero zero bytes to dst. It returns the number of bytes
+// written and an error if SIGSEGV or SIGBUS is received while writing to dst.
+func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) {
+ if toZero == 0 {
+ return 0, nil
+ }
+
+ fault, sig := memclr(dst, toZero)
+ if sig == 0 {
+ return toZero, nil
+ }
+
+ faultN, dstN := uintptr(fault), uintptr(dst)
+ if faultN < dstN || faultN >= dstN+toZero {
+ panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toZero))
+ }
+
+ // memclr might have ended the write up to maxRegisterSize bytes before
+ // fault, if an instruction caused a memory access that straddled two
+ // pages, and the second one faulted. Try to write up to the fault.
+ var done uintptr
+ if faultN-dstN > maxRegisterSize {
+ done = faultN - dstN - maxRegisterSize
+ }
+ n, err := ZeroOut(unsafe.Pointer(dstN+done), faultN-dstN-done)
+ done += n
+ if err != nil {
+ return done, err
+ }
+ return done, errorFromFaultSignal(fault, sig)
+}
+
+// SwapUint32 is equivalent to sync/atomic.SwapUint32, except that it returns
+// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is
+// not aligned to a 4-byte boundary.
+func SwapUint32(ptr unsafe.Pointer, new uint32) (uint32, error) {
+ if addr := uintptr(ptr); addr&3 != 0 {
+ return 0, AlignmentError{addr, 4}
+ }
+ old, sig := swapUint32(ptr, new)
+ return old, errorFromFaultSignal(ptr, sig)
+}
+
+// SwapUint64 is equivalent to sync/atomic.SwapUint64, except that it returns
+// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is
+// not aligned to an 8-byte boundary.
+func SwapUint64(ptr unsafe.Pointer, new uint64) (uint64, error) {
+ if addr := uintptr(ptr); addr&7 != 0 {
+ return 0, AlignmentError{addr, 8}
+ }
+ old, sig := swapUint64(ptr, new)
+ return old, errorFromFaultSignal(ptr, sig)
+}
+
+// CompareAndSwapUint32 is equivalent to atomicbitops.CompareAndSwapUint32,
+// except that it returns an error if SIGSEGV or SIGBUS is received while
+// accessing ptr, or if ptr is not aligned to a 4-byte boundary.
+func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) {
+ if addr := uintptr(ptr); addr&3 != 0 {
+ return 0, AlignmentError{addr, 4}
+ }
+ prev, sig := compareAndSwapUint32(ptr, old, new)
+ return prev, errorFromFaultSignal(ptr, sig)
+}
+
+// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It
+// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr.
+//
+// Preconditions: ptr must be aligned to a 4-byte boundary.
+func LoadUint32(ptr unsafe.Pointer) (uint32, error) {
+ if addr := uintptr(ptr); addr&3 != 0 {
+ return 0, AlignmentError{addr, 4}
+ }
+ val, sig := loadUint32(ptr)
+ return val, errorFromFaultSignal(ptr, sig)
+}
+
+func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error {
+ switch sig {
+ case 0:
+ return nil
+ case int32(syscall.SIGSEGV):
+ return SegvError{uintptr(addr)}
+ case int32(syscall.SIGBUS):
+ return BusError{uintptr(addr)}
+ default:
+ panic(fmt.Sprintf("safecopy got unexpected signal %d at address %#x", sig, addr))
+ }
+}
+
+// ReplaceSignalHandler replaces the existing signal handler for the provided
+// signal with the one that handles faults in safecopy-protected functions.
+//
+// It stores the value of the previously set handler in previous.
+//
+// This function will be called on initialization in order to install safecopy
+// handlers for the appropriate signals. These handlers will still call the
+// previous handler; if this function is used externally, the same courtesy is
+// expected.
+func ReplaceSignalHandler(sig syscall.Signal, handler uintptr, previous *uintptr) error {
+ var sa struct {
+ handler uintptr
+ flags uint64
+ restorer uintptr
+ mask uint64
+ }
+ const maskLen = 8
+
+ // Get the existing signal handler information, and save the current
+ // handler. Once we replace it, we will use this pointer to fall back to
+ // it when we receive other signals.
+ if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), 0, uintptr(unsafe.Pointer(&sa)), maskLen, 0, 0); e != 0 {
+ return e
+ }
+
+ // Fail if there isn't a previous handler.
+ if sa.handler == 0 {
+ return fmt.Errorf("previous handler for signal %x isn't set", sig)
+ }
+
+ *previous = sa.handler
+
+ // Install our own handler.
+ sa.handler = handler
+ if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 {
+ return e
+ }
+
+ return nil
+}
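+
+// An illustrative sketch of external use (handlerPC is a hypothetical
+// assembly handler's entry point):
+//
+//	var saved uintptr
+//	if err := ReplaceSignalHandler(syscall.SIGSEGV, handlerPC, &saved); err != nil {
+//		panic(err)
+//	}
+//	// handlerPC is expected to chain to the handler recorded in saved for
+//	// faults it does not recognize, just as this package's handler does.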
diff --git a/pkg/sentry/platform/safecopy/sighandler_amd64.s b/pkg/sentry/platform/safecopy/sighandler_amd64.s
new file mode 100644
index 000000000..475ae48e9
--- /dev/null
+++ b/pkg/sentry/platform/safecopy/sighandler_amd64.s
@@ -0,0 +1,133 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// The signals handled by signalHandler.
+#define SIGBUS 7
+#define SIGSEGV 11
+
+// Offsets to the registers in context->uc_mcontext.gregs[].
+#define REG_RDI 0x68
+#define REG_RAX 0x90
+#define REG_IP 0xa8
+
+// Offset to the si_addr field of siginfo.
+#define SI_CODE 0x08
+#define SI_ADDR 0x10
+
+// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must
+// not be set up as a handler to any other signals.
+//
+// If the instruction causing the signal is within a safecopy-protected
+// function, the signal is handled such that execution resumes in the
+// appropriate fault handling stub with AX containing the faulting address and
+// DI containing the signal number. Otherwise control is transferred to the
+// previously configured signal handler (savedSigSegVHandler or
+// savedSigBusHandler).
+//
+// This function cannot be written in Go because it runs whenever a signal is
+// received by the thread (preempting whatever was running), which includes
+// times when the garbage collector has stopped the world or isn't expecting
+// any interactions (like barriers).
+//
+// The arguments are the following:
+// DI - The signal number.
+// SI - Pointer to siginfo_t structure.
+// DX - Pointer to ucontext structure.
+TEXT ·signalHandler(SB),NOSPLIT,$0
+ // Check if the signal is from the kernel.
+ MOVQ $0x0, CX
+ CMPL CX, SI_CODE(SI)
+ JGE original_handler
+
+ // Check if RIP is within the area we care about.
+ MOVQ REG_IP(DX), CX
+ CMPQ CX, ·memcpyBegin(SB)
+ JB not_memcpy
+ CMPQ CX, ·memcpyEnd(SB)
+ JAE not_memcpy
+
+ // Modify the context such that execution will resume in the fault
+ // handler.
+ LEAQ handleMemcpyFault(SB), CX
+ JMP handle_fault
+
+not_memcpy:
+ CMPQ CX, ·memclrBegin(SB)
+ JB not_memclr
+ CMPQ CX, ·memclrEnd(SB)
+ JAE not_memclr
+
+ LEAQ handleMemclrFault(SB), CX
+ JMP handle_fault
+
+not_memclr:
+ CMPQ CX, ·swapUint32Begin(SB)
+ JB not_swapuint32
+ CMPQ CX, ·swapUint32End(SB)
+ JAE not_swapuint32
+
+ LEAQ handleSwapUint32Fault(SB), CX
+ JMP handle_fault
+
+not_swapuint32:
+ CMPQ CX, ·swapUint64Begin(SB)
+ JB not_swapuint64
+ CMPQ CX, ·swapUint64End(SB)
+ JAE not_swapuint64
+
+ LEAQ handleSwapUint64Fault(SB), CX
+ JMP handle_fault
+
+not_swapuint64:
+ CMPQ CX, ·compareAndSwapUint32Begin(SB)
+ JB not_casuint32
+ CMPQ CX, ·compareAndSwapUint32End(SB)
+ JAE not_casuint32
+
+ LEAQ handleCompareAndSwapUint32Fault(SB), CX
+ JMP handle_fault
+
+not_casuint32:
+ CMPQ CX, ·loadUint32Begin(SB)
+ JB not_loaduint32
+ CMPQ CX, ·loadUint32End(SB)
+ JAE not_loaduint32
+
+ LEAQ handleLoadUint32Fault(SB), CX
+ JMP handle_fault
+
+not_loaduint32:
+original_handler:
+	// Jump to the previous signal handler, which is likely the Go runtime's.
+ XORQ CX, CX
+ MOVQ ·savedSigBusHandler(SB), AX
+ CMPL DI, $SIGSEGV
+ CMOVQEQ ·savedSigSegVHandler(SB), AX
+ JMP AX
+
+handle_fault:
+	// Entered with the address of the fault handler in RCX; store it in
+	// the context's RIP so execution resumes there.
+	MOVQ CX, REG_IP(DX)
+
+	// Store the faulting address in the context's RAX.
+	MOVQ SI_ADDR(SI), CX
+	MOVQ CX, REG_RAX(DX)
+
+	// Store the signal number in the context's EDI.
+	MOVL DI, REG_RDI(DX)
+
+ RET
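
The REG_* values above are raw byte offsets into glibc's x86-64 ucontext_t: uc_flags (8 bytes), uc_link (8) and uc_stack (24) precede uc_mcontext, whose gregs[] array orders the registers R8..R15, RDI, RSI, RBP, RBX, RDX, RAX, RCX, RSP, RIP. A small, self-contained sanity check of that arithmetic (illustrative code, not part of this change):

    package main

    import "fmt"

    // In glibc's x86-64 ucontext_t, uc_mcontext.gregs[] starts after
    // uc_flags (8 bytes), uc_link (8) and uc_stack (24), i.e. at byte
    // offset 40. The gregs order is R8..R15, RDI, RSI, RBP, RBX, RDX,
    // RAX, RCX, RSP, RIP.
    const gregsBase = 8 + 8 + 24

    // gregOffset returns the byte offset of gregs[n] within ucontext_t.
    func gregOffset(n int) int { return gregsBase + 8*n }

    func main() {
        fmt.Printf("REG_RDI = %#x\n", gregOffset(8))  // 0x68
        fmt.Printf("REG_RAX = %#x\n", gregOffset(13)) // 0x90
        fmt.Printf("REG_IP  = %#x\n", gregOffset(16)) // 0xa8
    }
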
diff --git a/pkg/sentry/platform/safecopy/sighandler_arm64.s b/pkg/sentry/platform/safecopy/sighandler_arm64.s
new file mode 100644
index 000000000..53e4ac2c1
--- /dev/null
+++ b/pkg/sentry/platform/safecopy/sighandler_arm64.s
@@ -0,0 +1,143 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// The signals handled by signalHandler.
+#define SIGBUS 7
+#define SIGSEGV 11
+
+// Offsets to the registers in context->uc_mcontext (struct sigcontext).
+#define REG_R0 0xB8
+#define REG_R1 0xC0
+#define REG_PC 0x1B8
+
+// Offsets to the si_code and si_addr fields of siginfo.
+#define SI_CODE 0x08
+#define SI_ADDR 0x10
+
+// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must
+// not be set up as a handler for any other signal.
+//
+// If the instruction causing the signal is within a safecopy-protected
+// function, the signal is handled such that execution resumes in the
+// appropriate fault handling stub with R0 containing the faulting address and
+// R1 containing the signal number. Otherwise control is transferred to the
+// previously configured signal handler (savedSigSegVHandler or
+// savedSigBusHandler).
+//
+// This function cannot be written in Go because it runs whenever a signal is
+// received by the thread (preempting whatever was running), which includes
+// times when the garbage collector has stopped the world or isn't expecting
+// any interactions (like write barriers).
+//
+// The arguments are as follows:
+// R0 - The signal number.
+// R1 - Pointer to siginfo_t structure.
+// R2 - Pointer to ucontext structure.
+TEXT ·signalHandler(SB),NOSPLIT,$0
+	// Check if the signal is from the kernel; si_code > 0 means a kernel signal.
+ MOVD SI_CODE(R1), R7
+ CMPW $0x0, R7
+ BLE original_handler
+
+ // Check if PC is within the area we care about.
+ MOVD REG_PC(R2), R7
+ MOVD ·memcpyBegin(SB), R8
+ CMP R8, R7
+ BLO not_memcpy
+ MOVD ·memcpyEnd(SB), R8
+ CMP R8, R7
+ BHS not_memcpy
+
+ // Modify the context such that execution will resume in the fault handler.
+ MOVD $handleMemcpyFault(SB), R7
+ B handle_fault
+
+not_memcpy:
+ MOVD ·memclrBegin(SB), R8
+ CMP R8, R7
+ BLO not_memclr
+ MOVD ·memclrEnd(SB), R8
+ CMP R8, R7
+ BHS not_memclr
+
+ MOVD $handleMemclrFault(SB), R7
+ B handle_fault
+
+not_memclr:
+ MOVD ·swapUint32Begin(SB), R8
+ CMP R8, R7
+ BLO not_swapuint32
+ MOVD ·swapUint32End(SB), R8
+ CMP R8, R7
+ BHS not_swapuint32
+
+ MOVD $handleSwapUint32Fault(SB), R7
+ B handle_fault
+
+not_swapuint32:
+ MOVD ·swapUint64Begin(SB), R8
+ CMP R8, R7
+ BLO not_swapuint64
+ MOVD ·swapUint64End(SB), R8
+ CMP R8, R7
+ BHS not_swapuint64
+
+ MOVD $handleSwapUint64Fault(SB), R7
+ B handle_fault
+
+not_swapuint64:
+ MOVD ·compareAndSwapUint32Begin(SB), R8
+ CMP R8, R7
+ BLO not_casuint32
+ MOVD ·compareAndSwapUint32End(SB), R8
+ CMP R8, R7
+ BHS not_casuint32
+
+ MOVD $handleCompareAndSwapUint32Fault(SB), R7
+ B handle_fault
+
+not_casuint32:
+ MOVD ·loadUint32Begin(SB), R8
+ CMP R8, R7
+ BLO not_loaduint32
+ MOVD ·loadUint32End(SB), R8
+ CMP R8, R7
+ BHS not_loaduint32
+
+ MOVD $handleLoadUint32Fault(SB), R7
+ B handle_fault
+
+not_loaduint32:
+original_handler:
+	// Jump to the previous signal handler, which is likely the Go runtime's.
+ MOVD ·savedSigBusHandler(SB), R7
+ MOVD ·savedSigSegVHandler(SB), R8
+ CMPW $SIGSEGV, R0
+ CSEL EQ, R8, R7, R7
+ B (R7)
+
+handle_fault:
+	// Entered with the address of the fault handler in R7; store it in the
+	// context's PC so execution resumes there.
+	MOVD R7, REG_PC(R2)
+
+	// Store the faulting address in the context's R0.
+	MOVD SI_ADDR(R1), R7
+	MOVD R7, REG_R0(R2)
+
+	// Store the signal number in the context's R1.
+	MOVW R0, REG_R1(R2)
+
+ RET
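
The arm64 REG_* values are less familiar than the glibc amd64 ones, so here is a self-contained sanity check. The mirror structs below are illustrative assumptions derived from the kernel's arm64 ucontext/sigcontext layout (the kernel sigset_t is 8 bytes but ucontext reserves 128 bytes for it, and uc_mcontext is 16-byte aligned); they are not types from this package:

    package main

    import (
        "fmt"
        "unsafe"
    )

    // stackT mirrors the kernel's stack_t (24 bytes on arm64).
    type stackT struct {
        ssSp    uintptr // void *ss_sp
        ssFlags int32   // int ss_flags
        _       [4]byte // padding
        ssSize  uintptr // size_t ss_size
    }

    // sigcontext mirrors the start of the arm64 struct sigcontext.
    type sigcontext struct {
        faultAddress uint64
        regs         [31]uint64 // x0..x30
        sp           uint64
        pc           uint64
        pstate       uint64
        // 4096 bytes of __reserved follow in the real struct.
    }

    // ucontext mirrors the kernel's arm64 struct ucontext.
    type ucontext struct {
        ucFlags    uint64
        ucLink     uintptr
        ucStack    stackT
        ucSigmask  uint64    // kernel sigset_t (8 bytes)
        _          [120]byte // rest of the reserved 128-byte sigmask area
        _          [8]byte   // align uc_mcontext to 16 bytes
        ucMcontext sigcontext
    }

    func main() {
        var uc ucontext
        base := unsafe.Offsetof(uc.ucMcontext) // 0xB0
        fmt.Printf("REG_R0 = %#x\n", base+unsafe.Offsetof(uc.ucMcontext.regs))   // 0xB8
        fmt.Printf("REG_R1 = %#x\n", base+unsafe.Offsetof(uc.ucMcontext.regs)+8) // 0xC0
        fmt.Printf("REG_PC = %#x\n", base+unsafe.Offsetof(uc.ucMcontext.pc))     // 0x1B8
    }
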