summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/platform/ring0
diff options
context:
space:
mode:
authorgVisor bot <gvisor-bot@google.com>2019-06-02 06:44:55 +0000
committergVisor bot <gvisor-bot@google.com>2019-06-02 06:44:55 +0000
commitceb0d792f328d1fc0692197d8856a43c3936a571 (patch)
tree83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/platform/ring0
parentdeb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff)
parent216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff)
Merge 216da0b7 (automated)
Diffstat (limited to 'pkg/sentry/platform/ring0')
-rwxr-xr-xpkg/sentry/platform/ring0/defs_impl.go538
-rw-r--r--pkg/sentry/platform/ring0/entry_amd64.go128
-rwxr-xr-xpkg/sentry/platform/ring0/entry_impl_amd64.s383
-rw-r--r--pkg/sentry/platform/ring0/kernel.go66
-rw-r--r--pkg/sentry/platform/ring0/kernel_amd64.go271
-rw-r--r--pkg/sentry/platform/ring0/kernel_unsafe.go41
-rw-r--r--pkg/sentry/platform/ring0/lib_amd64.go131
-rw-r--r--pkg/sentry/platform/ring0/lib_amd64.s247
-rw-r--r--pkg/sentry/platform/ring0/pagetables/allocator.go122
-rw-r--r--pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go53
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables.go221
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go45
-rwxr-xr-xpkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go4
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_x86.go180
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pcids_x86.go109
-rwxr-xr-xpkg/sentry/platform/ring0/pagetables/walker_empty.go255
-rwxr-xr-xpkg/sentry/platform/ring0/pagetables/walker_lookup.go255
-rwxr-xr-xpkg/sentry/platform/ring0/pagetables/walker_map.go255
-rwxr-xr-xpkg/sentry/platform/ring0/pagetables/walker_unmap.go255
-rw-r--r--pkg/sentry/platform/ring0/ring0.go16
-rwxr-xr-xpkg/sentry/platform/ring0/ring0_state_autogen.go4
21 files changed, 3579 insertions, 0 deletions
diff --git a/pkg/sentry/platform/ring0/defs_impl.go b/pkg/sentry/platform/ring0/defs_impl.go
new file mode 100755
index 000000000..582553bc7
--- /dev/null
+++ b/pkg/sentry/platform/ring0/defs_impl.go
@@ -0,0 +1,538 @@
+package ring0
+
+import (
+ "syscall"
+
+ "fmt"
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "io"
+ "reflect"
+)
+
+var (
+ // UserspaceSize is the total size of userspace (the lower half of the
+ // virtual address space, given sign-extension semantics).
+ UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1)
+
+ // MaximumUserAddress is the largest possible user address.
+ MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1)
+
+ // KernelStartAddress is the starting kernel address.
+ KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
+)
+
+// Kernel is a global kernel object.
+//
+// This contains global state, shared by multiple CPUs.
+type Kernel struct {
+ KernelArchState
+}
+
+// Hooks are hooks for kernel functions.
+type Hooks interface {
+ // KernelSyscall is called for kernel system calls.
+ //
+ // Return from this call will restore registers and return to the kernel: the
+ // registers must be modified directly.
+ //
+ // If this function is not provided, a kernel system call results in halt.
+ //
+ // This must be go:nosplit, as this will be on the interrupt stack.
+ // Closures are permitted, as the pointer to the closure frame is not
+ // passed on the stack.
+ KernelSyscall()
+
+ // KernelException handles an exception during kernel execution.
+ //
+ // Return from this call will restore registers and return to the kernel: the
+ // registers must be modified directly.
+ //
+ // If this function is not provided, a kernel exception results in halt.
+ //
+ // This must be go:nosplit, as this will be on the interrupt stack.
+ // Closures are permitted, as the pointer to the closure frame is not
+ // passed on the stack.
+ KernelException(Vector)
+}
+
+// CPU is the per-CPU struct.
+type CPU struct {
+ // self is a self reference.
+ //
+ // This is always guaranteed to be at offset zero.
+ self *CPU
+
+ // kernel is a reference to the kernel that this CPU was initialized
+ // with. This reference is kept for garbage collection purposes: CPU
+ // registers may refer to objects within the Kernel object that cannot
+ // be safely freed.
+ kernel *Kernel
+
+ // CPUArchState is architecture-specific state.
+ CPUArchState
+
+ // registers is a set of registers; these may be used on kernel system
+ // calls and exceptions via the Registers function.
+ registers syscall.PtraceRegs
+
+ // hooks are kernel hooks.
+ hooks Hooks
+}
+
+// Registers returns a pointer to the kernel registers; the caller may
+// modify the registers directly through the returned pointer.
+//
+// This is explicitly safe to call during KernelException and KernelSyscall.
+//
+//go:nosplit
+func (c *CPU) Registers() *syscall.PtraceRegs {
+ return &c.registers
+}
+
+// SwitchOpts are passed to the Switch function.
+type SwitchOpts struct {
+ // Registers are the user register state.
+ Registers *syscall.PtraceRegs
+
+ // FloatingPointState is a byte pointer where floating point state is
+ // saved and restored.
+ FloatingPointState *byte
+
+ // PageTables are the application page tables.
+ PageTables *pagetables.PageTables
+
+ // Flush indicates that a TLB flush should be forced on switch.
+ Flush bool
+
+ // FullRestore indicates that an iret-based restore should be used.
+ FullRestore bool
+
+ // SwitchArchOpts are architecture-specific options.
+ SwitchArchOpts
+}
+
+// Segment indices and Selectors.
+const (
+ // Index into GDT array.
+ _ = iota // Null descriptor first.
+ _ // Reserved (Linux is kernel 32).
+ segKcode // Kernel code (64-bit).
+ segKdata // Kernel data.
+ segUcode32 // User code (32-bit).
+ segUdata // User data.
+ segUcode64 // User code (64-bit).
+ segTss // Task segment descriptor.
+ segTssHi // Upper bits for TSS.
+ segLast // Last segment (terminal, not included).
+)
+
+// Selectors. The index occupies the high 13 bits; user selectors carry a
+// requested privilege level (RPL) of 3 in the low two bits.
+const (
+ Kcode Selector = segKcode << 3
+ Kdata Selector = segKdata << 3
+ Ucode32 Selector = (segUcode32 << 3) | 3
+ Udata Selector = (segUdata << 3) | 3
+ Ucode64 Selector = (segUcode64 << 3) | 3
+ Tss Selector = segTss << 3
+)
+
+// Standard segments, initialized in init below.
+var (
+ UserCodeSegment32 SegmentDescriptor
+ UserDataSegment SegmentDescriptor
+ UserCodeSegment64 SegmentDescriptor
+ KernelCodeSegment SegmentDescriptor
+ KernelDataSegment SegmentDescriptor
+)
+
+// KernelOpts has initialization options for the kernel.
+type KernelOpts struct {
+ // PageTables are the kernel pagetables; this must be provided.
+ PageTables *pagetables.PageTables
+}
+
+// KernelArchState contains architecture-specific state.
+type KernelArchState struct {
+ KernelOpts
+
+ // globalIDT is our set of interrupt gates.
+ globalIDT idt64
+}
+
+// CPUArchState contains CPU-specific arch state.
+type CPUArchState struct {
+ // stack is the stack used for interrupts on this CPU.
+ stack [256]byte
+
+ // errorCode is the error code from the last exception.
+ errorCode uintptr
+
+ // errorType indicates the type of error code here, it is always set
+ // along with the errorCode value above.
+ //
+ // It will either be 1, which indicates a user error, or 0 indicating a
+ // kernel error. If the error type is 0 (kernel error), then the
+ // errorCode above cannot provide relevant information about the last
+ // exception.
+ errorType uintptr
+
+ // gdt is the CPU's descriptor table.
+ gdt descriptorTable
+
+ // tss is the CPU's task state.
+ tss TaskState64
+}
+
+// ErrorCode returns the last error code.
+//
+// The returned boolean indicates whether the error code corresponds to the
+// last user error or not. If it does not, then fault information must be
+// ignored. This is generally the result of a kernel fault while servicing a
+// user fault.
+//
+//go:nosplit
+func (c *CPU) ErrorCode() (value uintptr, user bool) {
+ return c.errorCode, c.errorType != 0
+}
+
+// ClearErrorCode resets the error code.
+//
+// Note that this also resets the error type to 1 (user), so a subsequent
+// call to ErrorCode will report user == true.
+//
+//go:nosplit
+func (c *CPU) ClearErrorCode() {
+ c.errorCode = 0
+ c.errorType = 1
+}
+
+// SwitchArchOpts are embedded in SwitchOpts.
+type SwitchArchOpts struct {
+ // UserPCID indicates the application PCID to be used on switch,
+ // assuming that PCIDs are supported.
+ //
+ // Per pagetables_x86.go, a zero PCID implies a flush.
+ UserPCID uint16
+
+ // KernelPCID indicates the kernel PCID to be used on return,
+ // assuming that PCIDs are supported.
+ //
+ // Per pagetables_x86.go, a zero PCID implies a flush.
+ KernelPCID uint16
+}
+
+// init populates the standard segment descriptors declared above.
+//
+// NOTE(review): UserCodeSegment32 is initialized via setCode64 (long-mode
+// code segment) rather than setCode32 — confirm this is intentional.
+func init() {
+ KernelCodeSegment.setCode64(0, 0, 0)
+ KernelDataSegment.setData(0, 0xffffffff, 0)
+ UserCodeSegment32.setCode64(0, 0, 3)
+ UserDataSegment.setData(0, 0xffffffff, 3)
+ UserCodeSegment64.setCode64(0, 0, 3)
+}
+
+// Emit prints architecture-specific offsets.
+//
+// The emitted #define values are consumed by the assembly in
+// entry_impl_amd64.s; the two must stay in sync.
+func Emit(w io.Writer) {
+ fmt.Fprintf(w, "// Automatically generated, do not edit.\n")
+
+ c := &CPU{}
+ fmt.Fprintf(w, "\n// CPU offsets.\n")
+ fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack)))
+ fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer())
+
+ fmt.Fprintf(w, "\n// Bits.\n")
+ fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF)
+ fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet)
+
+ fmt.Fprintf(w, "\n// Vectors.\n")
+ fmt.Fprintf(w, "#define DivideByZero 0x%02x\n", DivideByZero)
+ fmt.Fprintf(w, "#define Debug 0x%02x\n", Debug)
+ fmt.Fprintf(w, "#define NMI 0x%02x\n", NMI)
+ fmt.Fprintf(w, "#define Breakpoint 0x%02x\n", Breakpoint)
+ fmt.Fprintf(w, "#define Overflow 0x%02x\n", Overflow)
+ fmt.Fprintf(w, "#define BoundRangeExceeded 0x%02x\n", BoundRangeExceeded)
+ fmt.Fprintf(w, "#define InvalidOpcode 0x%02x\n", InvalidOpcode)
+ fmt.Fprintf(w, "#define DeviceNotAvailable 0x%02x\n", DeviceNotAvailable)
+ fmt.Fprintf(w, "#define DoubleFault 0x%02x\n", DoubleFault)
+ fmt.Fprintf(w, "#define CoprocessorSegmentOverrun 0x%02x\n", CoprocessorSegmentOverrun)
+ fmt.Fprintf(w, "#define InvalidTSS 0x%02x\n", InvalidTSS)
+ fmt.Fprintf(w, "#define SegmentNotPresent 0x%02x\n", SegmentNotPresent)
+ fmt.Fprintf(w, "#define StackSegmentFault 0x%02x\n", StackSegmentFault)
+ fmt.Fprintf(w, "#define GeneralProtectionFault 0x%02x\n", GeneralProtectionFault)
+ fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault)
+ fmt.Fprintf(w, "#define X87FloatingPointException 0x%02x\n", X87FloatingPointException)
+ fmt.Fprintf(w, "#define AlignmentCheck 0x%02x\n", AlignmentCheck)
+ fmt.Fprintf(w, "#define MachineCheck 0x%02x\n", MachineCheck)
+ fmt.Fprintf(w, "#define SIMDFloatingPointException 0x%02x\n", SIMDFloatingPointException)
+ fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException)
+ fmt.Fprintf(w, "#define SecurityException 0x%02x\n", SecurityException)
+ fmt.Fprintf(w, "#define SyscallInt80 0x%02x\n", SyscallInt80)
+ fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall)
+
+ p := &syscall.PtraceRegs{}
+ fmt.Fprintf(w, "\n// Ptrace registers.\n")
+ fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.R13).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.R12).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RBP 0x%02x\n", reflect.ValueOf(&p.Rbp).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RBX 0x%02x\n", reflect.ValueOf(&p.Rbx).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.R11).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.R10).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.R9).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.R8).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RAX 0x%02x\n", reflect.ValueOf(&p.Rax).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RCX 0x%02x\n", reflect.ValueOf(&p.Rcx).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RDX 0x%02x\n", reflect.ValueOf(&p.Rdx).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RSI 0x%02x\n", reflect.ValueOf(&p.Rsi).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RDI 0x%02x\n", reflect.ValueOf(&p.Rdi).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_ORIGRAX 0x%02x\n", reflect.ValueOf(&p.Orig_rax).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RIP 0x%02x\n", reflect.ValueOf(&p.Rip).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_CS 0x%02x\n", reflect.ValueOf(&p.Cs).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_FS 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_GS 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer())
+}
+
+// Useful bits in CR0/CR4 control registers, RFLAGS, EFER, and MSR numbers.
+const (
+ _CR0_PE = 1 << 0
+ _CR0_ET = 1 << 4
+ _CR0_AM = 1 << 18
+ _CR0_PG = 1 << 31
+
+ _CR4_PSE = 1 << 4
+ _CR4_PAE = 1 << 5
+ _CR4_PGE = 1 << 7
+ _CR4_OSFXSR = 1 << 9
+ _CR4_OSXMMEXCPT = 1 << 10
+ _CR4_FSGSBASE = 1 << 16
+ _CR4_PCIDE = 1 << 17
+ _CR4_OSXSAVE = 1 << 18
+ _CR4_SMEP = 1 << 20
+
+ _RFLAGS_AC = 1 << 18
+ _RFLAGS_NT = 1 << 14
+ _RFLAGS_IOPL = 3 << 12
+ _RFLAGS_DF = 1 << 10
+ _RFLAGS_IF = 1 << 9
+ _RFLAGS_STEP = 1 << 8
+ _RFLAGS_RESERVED = 1 << 1
+
+ _EFER_SCE = 0x001
+ _EFER_LME = 0x100
+ _EFER_LMA = 0x400
+ _EFER_NX = 0x800
+
+ _MSR_STAR = 0xc0000081
+ _MSR_LSTAR = 0xc0000082
+ _MSR_CSTAR = 0xc0000083
+ _MSR_SYSCALL_MASK = 0xc0000084
+ _MSR_PLATFORM_INFO = 0xce
+ _MSR_MISC_FEATURES = 0x140
+
+ _PLATFORM_INFO_CPUID_FAULT = 1 << 31
+
+ _MISC_FEATURE_CPUID_TRAP = 0x1
+)
+
+// RFLAGS invariants maintained by the kernel/user mode switch code.
+const (
+ // KernelFlagsSet should always be set in the kernel.
+ KernelFlagsSet = _RFLAGS_RESERVED
+
+ // UserFlagsSet are always set in userspace.
+ UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF
+
+ // KernelFlagsClear should always be clear in the kernel.
+ KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT
+
+ // UserFlagsClear are always cleared in userspace.
+ UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL
+)
+
+// Vector is an exception vector.
+type Vector uintptr
+
+// Exception vectors.
+const (
+ DivideByZero Vector = iota
+ Debug
+ NMI
+ Breakpoint
+ Overflow
+ BoundRangeExceeded
+ InvalidOpcode
+ DeviceNotAvailable
+ DoubleFault
+ CoprocessorSegmentOverrun
+ InvalidTSS
+ SegmentNotPresent
+ StackSegmentFault
+ GeneralProtectionFault
+ PageFault
+ _ // Vector 15 is skipped (reserved).
+ X87FloatingPointException
+ AlignmentCheck
+ MachineCheck
+ SIMDFloatingPointException
+ VirtualizationException
+ SecurityException = 0x1e
+ SyscallInt80 = 0x80
+ _NR_INTERRUPTS = SyscallInt80 + 1
+)
+
+// System call vectors.
+const (
+ // Syscall is a synthetic vector (one past the real interrupt range)
+ // used to report SYSCALL-instruction entries.
+ Syscall Vector = _NR_INTERRUPTS
+)
+
+// VirtualAddressBits returns the number bits available for virtual addresses.
+//
+// This reads CPUID leaf 0x80000008: EAX bits [15:8] hold the virtual
+// address width.
+//
+// Note that sign-extension semantics apply to the highest order bit.
+//
+// FIXME(b/69382326): This should use the cpuid passed to Init.
+func VirtualAddressBits() uint32 {
+ ax, _, _, _ := cpuid.HostID(0x80000008, 0)
+ return (ax >> 8) & 0xff
+}
+
+// PhysicalAddressBits returns the number of bits available for physical addresses.
+//
+// This reads CPUID leaf 0x80000008: EAX bits [7:0] hold the physical
+// address width.
+//
+// FIXME(b/69382326): This should use the cpuid passed to Init.
+func PhysicalAddressBits() uint32 {
+ ax, _, _, _ := cpuid.HostID(0x80000008, 0)
+ return ax & 0xff
+}
+
+// Selector is a segment Selector.
+type Selector uint16
+
+// SegmentDescriptor is a segment descriptor (two 32-bit words, as laid
+// out in the hardware GDT format).
+type SegmentDescriptor struct {
+ bits [2]uint32
+}
+
+// descriptorTable is a collection of descriptors; used as the CPU's GDT
+// (see CPUArchState.gdt).
+type descriptorTable [32]SegmentDescriptor
+
+// SegmentDescriptorFlags are typed flags within a descriptor.
+type SegmentDescriptorFlags uint32
+
+// SegmentDescriptorFlag declarations.
+const (
+ SegmentDescriptorAccess SegmentDescriptorFlags = 1 << 8 // Access bit (always set).
+ SegmentDescriptorWrite = 1 << 9 // Write permission.
+ SegmentDescriptorExpandDown = 1 << 10 // Grows down, not used.
+ SegmentDescriptorExecute = 1 << 11 // Execute permission.
+ SegmentDescriptorSystem = 1 << 12 // Zero => system, 1 => user code/data.
+ SegmentDescriptorPresent = 1 << 15 // Present.
+ SegmentDescriptorAVL = 1 << 20 // Available.
+ SegmentDescriptorLong = 1 << 21 // Long mode.
+ SegmentDescriptorDB = 1 << 22 // 16 or 32-bit.
+ SegmentDescriptorG = 1 << 23 // Granularity: page or byte.
+)
+
+// Base returns the descriptor's base linear address, reassembled from the
+// three base fields split across both descriptor words.
+func (d *SegmentDescriptor) Base() uint32 {
+ return d.bits[1]&0xFF000000 | (d.bits[1]&0x000000FF)<<16 | d.bits[0]>>16
+}
+
+// Limit returns the descriptor size.
+//
+// If the granularity bit is set, the stored 20-bit limit is in 4 KiB
+// pages, so it is scaled up here.
+func (d *SegmentDescriptor) Limit() uint32 {
+ l := d.bits[0]&0xFFFF | d.bits[1]&0xF0000
+ if d.bits[1]&uint32(SegmentDescriptorG) != 0 {
+ l <<= 12
+ l |= 0xFFF
+ }
+ return l
+}
+
+// Flags returns descriptor flags.
+func (d *SegmentDescriptor) Flags() SegmentDescriptorFlags {
+ return SegmentDescriptorFlags(d.bits[1] & 0x00F09F00)
+}
+
+// DPL returns the descriptor privilege level.
+func (d *SegmentDescriptor) DPL() int {
+ return int((d.bits[1] >> 13) & 3)
+}
+
+// setNull clears the descriptor (the null descriptor).
+func (d *SegmentDescriptor) setNull() {
+ d.bits[0] = 0
+ d.bits[1] = 0
+}
+
+// set writes a present descriptor with the given base, limit, privilege
+// level and flags. A limit that does not fit in 20 bits is converted to
+// page granularity (the G flag is set).
+func (d *SegmentDescriptor) set(base, limit uint32, dpl int, flags SegmentDescriptorFlags) {
+ flags |= SegmentDescriptorPresent
+ if limit>>12 != 0 {
+ limit >>= 12
+ flags |= SegmentDescriptorG
+ }
+ d.bits[0] = base<<16 | limit&0xFFFF
+ d.bits[1] = base&0xFF000000 | (base>>16)&0xFF | limit&0x000F0000 | uint32(flags) | uint32(dpl)<<13
+}
+
+// setCode32 configures a 32-bit code segment.
+func (d *SegmentDescriptor) setCode32(base, limit uint32, dpl int) {
+ d.set(base, limit, dpl,
+ SegmentDescriptorDB|
+ SegmentDescriptorExecute|
+ SegmentDescriptorSystem)
+}
+
+// setCode64 configures a 64-bit (long mode) code segment.
+func (d *SegmentDescriptor) setCode64(base, limit uint32, dpl int) {
+ d.set(base, limit, dpl,
+ SegmentDescriptorG|
+ SegmentDescriptorLong|
+ SegmentDescriptorExecute|
+ SegmentDescriptorSystem)
+}
+
+// setData configures a writable data segment.
+func (d *SegmentDescriptor) setData(base, limit uint32, dpl int) {
+ d.set(base, limit, dpl,
+ SegmentDescriptorWrite|
+ SegmentDescriptorSystem)
+}
+
+// setHi is only used for the TSS segment, which is magically 64-bits.
+func (d *SegmentDescriptor) setHi(base uint32) {
+ d.bits[0] = base
+ d.bits[1] = 0
+}
+
+// Gate64 is a 64-bit task, trap, or interrupt gate.
+type Gate64 struct {
+ bits [4]uint32
+}
+
+// idt64 is a 64-bit interrupt descriptor table.
+type idt64 [_NR_INTERRUPTS]Gate64
+
+// setInterrupt configures the gate as a present interrupt gate (type 0xE,
+// encoded by 14<<8) targeting cs:rip with the given DPL and IST index.
+func (g *Gate64) setInterrupt(cs Selector, rip uint64, dpl int, ist int) {
+ g.bits[0] = uint32(cs)<<16 | uint32(rip)&0xFFFF
+ g.bits[1] = uint32(rip)&0xFFFF0000 | SegmentDescriptorPresent | uint32(dpl)<<13 | 14<<8 | uint32(ist)&0x7
+ g.bits[2] = uint32(rip >> 32)
+}
+
+// setTrap is setInterrupt, with the extra type bit set to turn the gate
+// type from interrupt (0xE) into trap (0xF).
+func (g *Gate64) setTrap(cs Selector, rip uint64, dpl int, ist int) {
+ g.setInterrupt(cs, rip, dpl, ist)
+ g.bits[1] |= 1 << 8
+}
+
+// TaskState64 is a 64-bit task state structure, mirroring the hardware
+// TSS layout (each 64-bit field split into Lo/Hi 32-bit halves).
+type TaskState64 struct {
+ _ uint32
+ rsp0Lo, rsp0Hi uint32
+ rsp1Lo, rsp1Hi uint32
+ rsp2Lo, rsp2Hi uint32
+ _ [2]uint32
+ ist1Lo, ist1Hi uint32
+ ist2Lo, ist2Hi uint32
+ ist3Lo, ist3Hi uint32
+ ist4Lo, ist4Hi uint32
+ ist5Lo, ist5Hi uint32
+ ist6Lo, ist6Hi uint32
+ ist7Lo, ist7Hi uint32
+ _ [2]uint32
+ _ uint16
+ ioPerm uint16
+}
diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go
new file mode 100644
index 000000000..a5ce67885
--- /dev/null
+++ b/pkg/sentry/platform/ring0/entry_amd64.go
@@ -0,0 +1,128 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package ring0
+
+import (
+ "syscall"
+)
+
+// This is an assembly function, implemented in entry_impl_amd64.s.
+//
+// The sysenter function is invoked in two situations:
+//
+// (1) The guest kernel has executed a system call.
+// (2) The guest application has executed a system call.
+//
+// The interrupt flag is examined to determine whether the system call was
+// executed from kernel mode or not and the appropriate stub is called.
+func sysenter()
+
+// swapgs swaps the current GS value.
+//
+// This must be called prior to sysret/iret.
+func swapgs()
+
+// sysret returns to userspace from a system call.
+//
+// The return code is the vector that interrupted execution.
+//
+// See stubs.go for a note regarding the frame size of this function.
+func sysret(*CPU, *syscall.PtraceRegs) Vector
+
+// "iret is the cadillac of CPL switching."
+//
+// -- Neel Natu
+//
+// iret is nearly identical to sysret, except an iret is used to fully restore
+// all user state. This must be called in cases where all registers need to be
+// restored.
+func iret(*CPU, *syscall.PtraceRegs) Vector
+
+// exception is the generic exception entry.
+//
+// This is called by the individual stub definitions.
+func exception()
+
+// resume is a stub that restores the CPU kernel registers.
+//
+// This is used when processing kernel exceptions and syscalls.
+func resume()
+
+// Start is the CPU entrypoint.
+//
+// The following start conditions must be satisfied:
+//
+// * AX should contain the CPU pointer.
+// * c.GDT() should be loaded as the GDT.
+// * c.IDT() should be loaded as the IDT.
+// * c.CR0() should be the current CR0 value.
+// * c.CR3() should be set to the kernel PageTables.
+// * c.CR4() should be the current CR4 value.
+// * c.EFER() should be the current EFER value.
+//
+// The CPU state will be set to c.Registers().
+func Start()
+
+// Exception stubs, implemented in assembly; each calls exception with its
+// vector.
+func divideByZero()
+func debug()
+func nmi()
+func breakpoint()
+func overflow()
+func boundRangeExceeded()
+func invalidOpcode()
+func deviceNotAvailable()
+func doubleFault()
+func coprocessorSegmentOverrun()
+func invalidTSS()
+func segmentNotPresent()
+func stackSegmentFault()
+func generalProtectionFault()
+func pageFault()
+func x87FloatingPointException()
+func alignmentCheck()
+func machineCheck()
+func simdFloatingPointException()
+func virtualizationException()
+func securityException()
+func syscallInt80()
+
+// handlers maps each exception vector to its assembly stub above.
+var handlers = map[Vector]func(){
+ DivideByZero: divideByZero,
+ Debug: debug,
+ NMI: nmi,
+ Breakpoint: breakpoint,
+ Overflow: overflow,
+ BoundRangeExceeded: boundRangeExceeded,
+ InvalidOpcode: invalidOpcode,
+ DeviceNotAvailable: deviceNotAvailable,
+ DoubleFault: doubleFault,
+ CoprocessorSegmentOverrun: coprocessorSegmentOverrun,
+ InvalidTSS: invalidTSS,
+ SegmentNotPresent: segmentNotPresent,
+ StackSegmentFault: stackSegmentFault,
+ GeneralProtectionFault: generalProtectionFault,
+ PageFault: pageFault,
+ X87FloatingPointException: x87FloatingPointException,
+ AlignmentCheck: alignmentCheck,
+ MachineCheck: machineCheck,
+ SIMDFloatingPointException: simdFloatingPointException,
+ VirtualizationException: virtualizationException,
+ SecurityException: securityException,
+ SyscallInt80: syscallInt80,
+}
diff --git a/pkg/sentry/platform/ring0/entry_impl_amd64.s b/pkg/sentry/platform/ring0/entry_impl_amd64.s
new file mode 100755
index 000000000..d082d06a9
--- /dev/null
+++ b/pkg/sentry/platform/ring0/entry_impl_amd64.s
@@ -0,0 +1,383 @@
+// +build amd64
+
+// Automatically generated, do not edit.
+//
+// These values must match the offsets and constants emitted by Emit in
+// defs_impl.go for the same build.
+
+// CPU offsets.
+#define CPU_SELF 0x00
+#define CPU_REGISTERS 0x288
+#define CPU_STACK_TOP 0x110
+#define CPU_ERROR_CODE 0x110
+#define CPU_ERROR_TYPE 0x118
+
+// Bits.
+#define _RFLAGS_IF 0x200
+#define _KERNEL_FLAGS 0x02
+
+// Vectors.
+#define DivideByZero 0x00
+#define Debug 0x01
+#define NMI 0x02
+#define Breakpoint 0x03
+#define Overflow 0x04
+#define BoundRangeExceeded 0x05
+#define InvalidOpcode 0x06
+#define DeviceNotAvailable 0x07
+#define DoubleFault 0x08
+#define CoprocessorSegmentOverrun 0x09
+#define InvalidTSS 0x0a
+#define SegmentNotPresent 0x0b
+#define StackSegmentFault 0x0c
+#define GeneralProtectionFault 0x0d
+#define PageFault 0x0e
+#define X87FloatingPointException 0x10
+#define AlignmentCheck 0x11
+#define MachineCheck 0x12
+#define SIMDFloatingPointException 0x13
+#define VirtualizationException 0x14
+#define SecurityException 0x1e
+#define SyscallInt80 0x80
+#define Syscall 0x81
+
+// Ptrace registers.
+#define PTRACE_R15 0x00
+#define PTRACE_R14 0x08
+#define PTRACE_R13 0x10
+#define PTRACE_R12 0x18
+#define PTRACE_RBP 0x20
+#define PTRACE_RBX 0x28
+#define PTRACE_R11 0x30
+#define PTRACE_R10 0x38
+#define PTRACE_R9 0x40
+#define PTRACE_R8 0x48
+#define PTRACE_RAX 0x50
+#define PTRACE_RCX 0x58
+#define PTRACE_RDX 0x60
+#define PTRACE_RSI 0x68
+#define PTRACE_RDI 0x70
+#define PTRACE_ORIGRAX 0x78
+#define PTRACE_RIP 0x80
+#define PTRACE_CS 0x88
+#define PTRACE_FLAGS 0x90
+#define PTRACE_RSP 0x98
+#define PTRACE_SS 0xa0
+#define PTRACE_FS 0xa8
+#define PTRACE_GS 0xb0
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+// NB: Offsets are programmatically generated (see BUILD).
+//
+// This file is concatenated with the definitions.
+
+// Saves a register set.
+//
+// This is a macro because it may need to be executed in contexts where a
+// stack is not available for calls.
+//
+// The following registers are not saved: AX, SP, IP, FLAGS, all segments.
+#define REGISTERS_SAVE(reg, offset) \
+ MOVQ R15, offset+PTRACE_R15(reg); \
+ MOVQ R14, offset+PTRACE_R14(reg); \
+ MOVQ R13, offset+PTRACE_R13(reg); \
+ MOVQ R12, offset+PTRACE_R12(reg); \
+ MOVQ BP, offset+PTRACE_RBP(reg); \
+ MOVQ BX, offset+PTRACE_RBX(reg); \
+ MOVQ CX, offset+PTRACE_RCX(reg); \
+ MOVQ DX, offset+PTRACE_RDX(reg); \
+ MOVQ R11, offset+PTRACE_R11(reg); \
+ MOVQ R10, offset+PTRACE_R10(reg); \
+ MOVQ R9, offset+PTRACE_R9(reg); \
+ MOVQ R8, offset+PTRACE_R8(reg); \
+ MOVQ SI, offset+PTRACE_RSI(reg); \
+ MOVQ DI, offset+PTRACE_RDI(reg);
+
+// Loads a register set.
+//
+// This is a macro because it may need to be executed in contexts where a
+// stack is not available for calls.
+//
+// The following registers are not loaded: AX, SP, IP, FLAGS, all segments.
+#define REGISTERS_LOAD(reg, offset) \
+ MOVQ offset+PTRACE_R15(reg), R15; \
+ MOVQ offset+PTRACE_R14(reg), R14; \
+ MOVQ offset+PTRACE_R13(reg), R13; \
+ MOVQ offset+PTRACE_R12(reg), R12; \
+ MOVQ offset+PTRACE_RBP(reg), BP; \
+ MOVQ offset+PTRACE_RBX(reg), BX; \
+ MOVQ offset+PTRACE_RCX(reg), CX; \
+ MOVQ offset+PTRACE_RDX(reg), DX; \
+ MOVQ offset+PTRACE_R11(reg), R11; \
+ MOVQ offset+PTRACE_R10(reg), R10; \
+ MOVQ offset+PTRACE_R9(reg), R9; \
+ MOVQ offset+PTRACE_R8(reg), R8; \
+ MOVQ offset+PTRACE_RSI(reg), SI; \
+ MOVQ offset+PTRACE_RDI(reg), DI;
+
+// SWAP_GS swaps the kernel GS (CPU).
+// (Bytes 0F 01 F8 encode the SWAPGS instruction, which the Go assembler
+// does not support directly.)
+#define SWAP_GS() \
+ BYTE $0x0F; BYTE $0x01; BYTE $0xf8;
+
+// IRET returns from an interrupt frame.
+// (Bytes 48 CF encode REX.W IRETQ.)
+#define IRET() \
+ BYTE $0x48; BYTE $0xcf;
+
+// SYSRET64 executes the sysret instruction.
+// (Bytes 48 0F 07 encode REX.W SYSRET, the 64-bit form.)
+#define SYSRET64() \
+ BYTE $0x48; BYTE $0x0f; BYTE $0x07;
+
+// LOAD_KERNEL_ADDRESS loads a kernel address, by ORing in the kernel
+// address-space prefix (KernelStartAddress).
+#define LOAD_KERNEL_ADDRESS(from, to) \
+ MOVQ from, to; \
+ ORQ ·KernelStartAddress(SB), to;
+
+// LOAD_KERNEL_STACK loads the kernel stack, pointing SP at the top of the
+// per-CPU interrupt stack.
+#define LOAD_KERNEL_STACK(from) \
+ LOAD_KERNEL_ADDRESS(CPU_SELF(from), SP); \
+ LEAQ CPU_STACK_TOP(SP), SP;
+
+// See kernel.go.
+TEXT ·Halt(SB),NOSPLIT,$0
+ HLT
+ RET
+
+// See entry_amd64.go.
+TEXT ·swapgs(SB),NOSPLIT,$0
+ SWAP_GS()
+ RET
+
+// See entry_amd64.go.
+//
+// Does not RET: control transfers to userspace via SYSRET64, and re-enters
+// the kernel through sysenter or exception.
+TEXT ·sysret(SB),NOSPLIT,$0-24
+ // Save original state.
+ LOAD_KERNEL_ADDRESS(cpu+0(FP), BX)
+ LOAD_KERNEL_ADDRESS(regs+8(FP), AX)
+ MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
+ MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
+ MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
+
+ // Restore user register state.
+ REGISTERS_LOAD(AX, 0)
+ MOVQ PTRACE_RIP(AX), CX // Needed for SYSRET.
+ MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET.
+ MOVQ PTRACE_RSP(AX), SP // Restore the stack directly.
+ MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch).
+ SYSRET64()
+
+// See entry_amd64.go.
+//
+// Like sysret, but builds a full interrupt frame so that all user state
+// (including CS/SS/FLAGS) is restored via IRET.
+TEXT ·iret(SB),NOSPLIT,$0-24
+ // Save original state.
+ LOAD_KERNEL_ADDRESS(cpu+0(FP), BX)
+ LOAD_KERNEL_ADDRESS(regs+8(FP), AX)
+ MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
+ MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
+ MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
+
+ // Build an IRET frame & restore state.
+ LOAD_KERNEL_STACK(BX)
+ MOVQ PTRACE_SS(AX), BX; PUSHQ BX
+ MOVQ PTRACE_RSP(AX), CX; PUSHQ CX
+ MOVQ PTRACE_FLAGS(AX), DX; PUSHQ DX
+ MOVQ PTRACE_CS(AX), DI; PUSHQ DI
+ MOVQ PTRACE_RIP(AX), SI; PUSHQ SI
+ REGISTERS_LOAD(AX, 0) // Restore most registers.
+ MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch).
+ IRET()
+
+// See entry_amd64.go.
+//
+// Restores the saved kernel register state from the per-CPU struct (via
+// GS) and returns to the kernel context with IRET.
+TEXT ·resume(SB),NOSPLIT,$0
+ // See iret, above.
+ MOVQ CPU_REGISTERS+PTRACE_SS(GS), BX; PUSHQ BX
+ MOVQ CPU_REGISTERS+PTRACE_RSP(GS), CX; PUSHQ CX
+ MOVQ CPU_REGISTERS+PTRACE_FLAGS(GS), DX; PUSHQ DX
+ MOVQ CPU_REGISTERS+PTRACE_CS(GS), DI; PUSHQ DI
+ MOVQ CPU_REGISTERS+PTRACE_RIP(GS), SI; PUSHQ SI
+ REGISTERS_LOAD(GS, CPU_REGISTERS)
+ MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX
+ IRET()
+
+// See entry_amd64.go.
+TEXT ·Start(SB),NOSPLIT,$0
+ LOAD_KERNEL_STACK(AX) // Set the stack.
+ PUSHQ $0x0 // Previous frame pointer.
+ MOVQ SP, BP // Set frame pointer.
+ PUSHQ AX // First argument (CPU).
+ CALL ·start(SB) // Call Go hook.
+ JMP ·resume(SB) // Restore to registers.
+
+// See entry_amd64.go.
+//
+// On SYSCALL entry, CX holds the user RIP and R11 the user RFLAGS (per the
+// SYSCALL instruction convention); both are saved below.
+TEXT ·sysenter(SB),NOSPLIT,$0
+ // Interrupts are always disabled while we're executing in kernel mode
+ // and always enabled while executing in user mode. Therefore, we can
+ // reliably look at the flags in R11 to determine where this syscall
+ // was from.
+ TESTL $_RFLAGS_IF, R11
+ JZ kernel
+
+user:
+ SWAP_GS()
+ XCHGQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Swap stacks.
+ XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs).
+ REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX.
+ MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value.
+ MOVQ BX, PTRACE_RAX(AX) // Save everything else.
+ MOVQ BX, PTRACE_ORIGRAX(AX)
+ MOVQ CX, PTRACE_RIP(AX)
+ MOVQ R11, PTRACE_FLAGS(AX)
+ MOVQ CPU_REGISTERS+PTRACE_RSP(GS), BX; MOVQ BX, PTRACE_RSP(AX)
+ MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code.
+ MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user.
+
+ // Return to the kernel, where the frame is:
+ //
+ // vector (sp+24)
+ // regs (sp+16)
+ // cpu (sp+8)
+ // vcpu.Switch (sp+0)
+ //
+ MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer.
+ MOVQ $Syscall, 24(SP) // Output vector.
+ RET
+
+kernel:
+ // We can't restore the original stack, but we can access the registers
+ // in the CPU state directly. No need for temporary juggling.
+ MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS)
+ MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS)
+ REGISTERS_SAVE(GS, CPU_REGISTERS)
+ MOVQ CX, CPU_REGISTERS+PTRACE_RIP(GS)
+ MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(GS)
+ MOVQ SP, CPU_REGISTERS+PTRACE_RSP(GS)
+ MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code.
+ MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel.
+
+ // Call the syscall trampoline.
+ LOAD_KERNEL_STACK(GS)
+ MOVQ CPU_SELF(GS), AX // Load vCPU.
+ PUSHQ AX // First argument (vCPU).
+ CALL ·kernelSyscall(SB) // Call the trampoline.
+ POPQ AX // Pop vCPU.
+ JMP ·resume(SB)
+
// exception is a generic exception handler.
//
// There are two cases handled:
//
// 1) An exception in kernel mode: this results in saving the state at the time
// of the exception and calling the defined hook.
//
// 2) An exception in guest mode: the original kernel frame is restored, and
// the vector & error codes are pushed as return values.
//
// See below for the stubs that call exception.
TEXT ·exception(SB),NOSPLIT,$0
	// Determine whether the exception occurred in kernel mode or user
	// mode, based on the flags. We expect the following stack:
	//
	//	SS         (sp+48)
	//	SP         (sp+40)
	//	FLAGS      (sp+32)
	//	CS         (sp+24)
	//	IP         (sp+16)
	//	ERROR_CODE (sp+8)
	//	VECTOR     (sp+0)
	//
	TESTL $_RFLAGS_IF, 32(SP)
	JZ kernel

user:
	// Exception raised from user mode: reset flags, then capture the full
	// user register state (general registers plus the pushed frame).
	SWAP_GS()
	ADDQ $-8, SP // Adjust for flags.
	MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ).
	XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for user regs.
	REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX.
	MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Restore original AX.
	MOVQ BX, PTRACE_RAX(AX) // Save it.
	MOVQ BX, PTRACE_ORIGRAX(AX)
	MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX)
	MOVQ 24(SP), CX; MOVQ CX, PTRACE_CS(AX)
	MOVQ 32(SP), DX; MOVQ DX, PTRACE_FLAGS(AX)
	MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX)
	MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX)

	// Copy out and return.
	MOVQ 0(SP), BX // Load vector.
	MOVQ 8(SP), CX // Load error code.
	MOVQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Original stack (kernel version).
	MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer.
	MOVQ CX, CPU_ERROR_CODE(GS) // Set error code.
	MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user.
	MOVQ BX, 24(SP) // Output vector.
	RET

kernel:
	// As per above, we can save directly.
	MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS)
	MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS)
	REGISTERS_SAVE(GS, CPU_REGISTERS)
	MOVQ 16(SP), AX; MOVQ AX, CPU_REGISTERS+PTRACE_RIP(GS)
	MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(GS)
	MOVQ 40(SP), CX; MOVQ CX, CPU_REGISTERS+PTRACE_RSP(GS)

	// Set the error code and adjust the stack.
	MOVQ 8(SP), AX // Load the error code.
	MOVQ AX, CPU_ERROR_CODE(GS) // Copy out to the CPU.
	MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel.
	MOVQ 0(SP), BX // BX contains the vector.
	ADDQ $48, SP // Drop the exception frame.

	// Call the exception trampoline.
	LOAD_KERNEL_STACK(GS)
	MOVQ CPU_SELF(GS), AX // Load vCPU.
	PUSHQ BX // Second argument (vector).
	PUSHQ AX // First argument (vCPU).
	CALL ·kernelException(SB) // Call the trampoline.
	POPQ BX // Pop vector.
	POPQ AX // Pop vCPU.
	JMP ·resume(SB)
+
// EXCEPTION_WITH_ERROR defines a stub for a vector whose frame already
// carries an error code; only the vector number is pushed here.
#define EXCEPTION_WITH_ERROR(value, symbol) \
TEXT symbol,NOSPLIT,$0; \
	PUSHQ $value; \
	JMP ·exception(SB);

// EXCEPTION_WITHOUT_ERROR defines a stub for a vector with no error code;
// a zero placeholder is pushed so the frame layout matches the case above
// (see the stack diagram in exception).
#define EXCEPTION_WITHOUT_ERROR(value, symbol) \
TEXT symbol,NOSPLIT,$0; \
	PUSHQ $0x0; \
	PUSHQ $value; \
	JMP ·exception(SB);

EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB))
EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB))
EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB))
EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB))
EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB))
EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB))
EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB))
EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB))
EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB))
EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB))
EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB))
EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB))
EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB))
EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB))
EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB))
EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB))
EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB))
EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB))
EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB))
EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB))
EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB))
EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB))
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go
new file mode 100644
index 000000000..900c0bba7
--- /dev/null
+++ b/pkg/sentry/platform/ring0/kernel.go
@@ -0,0 +1,66 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ring0
+
// Init initializes a new kernel.
//
// N.B. that constraints on KernelOpts must be satisfied.
//
//go:nosplit
func (k *Kernel) Init(opts KernelOpts) {
	k.init(opts) // Architecture-specific initialization.
}

// Halt halts execution.
func Halt()

// defaultHooks implements hooks.
//
// These are installed when a CPU is initialized without explicit hooks;
// both implementations simply halt.
type defaultHooks struct{}

// KernelSyscall implements Hooks.KernelSyscall.
//
//go:nosplit
func (defaultHooks) KernelSyscall() { Halt() }

// KernelException implements Hooks.KernelException.
//
//go:nosplit
func (defaultHooks) KernelException(Vector) { Halt() }

// kernelSyscall is a trampoline.
//
// Called from the sysenter assembly stub when a syscall occurs while in
// kernel mode; dispatches to the CPU's installed hooks.
//
//go:nosplit
func kernelSyscall(c *CPU) { c.hooks.KernelSyscall() }

// kernelException is a trampoline.
//
// Called from the exception assembly stub for kernel-mode exceptions.
//
//go:nosplit
func kernelException(c *CPU, vector Vector) { c.hooks.KernelException(vector) }
+
+// Init initializes a new CPU.
+//
+// Init allows embedding in other objects.
+func (c *CPU) Init(k *Kernel, hooks Hooks) {
+ c.self = c // Set self reference.
+ c.kernel = k // Set kernel reference.
+ c.init() // Perform architectural init.
+
+ // Require hooks.
+ if hooks != nil {
+ c.hooks = hooks
+ } else {
+ c.hooks = defaultHooks{}
+ }
+}
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
new file mode 100644
index 000000000..3577b5127
--- /dev/null
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -0,0 +1,271 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package ring0
+
+import (
+ "encoding/binary"
+)
+
+// init initializes architecture-specific state.
+func (k *Kernel) init(opts KernelOpts) {
+ // Save the root page tables.
+ k.PageTables = opts.PageTables
+
+ // Setup the IDT, which is uniform.
+ for v, handler := range handlers {
+ // Allow Breakpoint and Overflow to be called from all
+ // privilege levels.
+ dpl := 0
+ if v == Breakpoint || v == Overflow {
+ dpl = 3
+ }
+ // Note that we set all traps to use the interrupt stack, this
+ // is defined below when setting up the TSS.
+ k.globalIDT[v].setInterrupt(Kcode, uint64(kernelFunc(handler)), dpl, 1 /* ist */)
+ }
+}
+
// init initializes architecture-specific state.
//
// This populates the GDT (null, kernel, user and TSS segments), points the
// TSS stack pointers at the per-CPU kernel stack, and seeds the kernel's
// segment registers and flags.
func (c *CPU) init() {
	// Null segment.
	c.gdt[0].setNull()

	// Kernel & user segments.
	c.gdt[segKcode] = KernelCodeSegment
	c.gdt[segKdata] = KernelDataSegment
	c.gdt[segUcode32] = UserCodeSegment32
	c.gdt[segUdata] = UserDataSegment
	c.gdt[segUcode64] = UserCodeSegment64

	// The task segment, this spans two entries.
	tssBase, tssLimit, _ := c.TSS()
	c.gdt[segTss].set(
		uint32(tssBase),
		uint32(tssLimit),
		0, // Privilege level zero.
		SegmentDescriptorPresent|
			SegmentDescriptorAccess|
			SegmentDescriptorWrite|
			SegmentDescriptorExecute)
	c.gdt[segTssHi].setHi(uint32((tssBase) >> 32))

	// Set the kernel stack pointer in the TSS (virtual address). Both
	// rsp0 and ist1 point at the same stack; the IDT entries installed in
	// (*Kernel).init use IST 1 for every vector.
	stackAddr := c.StackTop()
	c.tss.rsp0Lo = uint32(stackAddr)
	c.tss.rsp0Hi = uint32(stackAddr >> 32)
	c.tss.ist1Lo = uint32(stackAddr)
	c.tss.ist1Hi = uint32(stackAddr >> 32)

	// Permanently set the kernel segments.
	c.registers.Cs = uint64(Kcode)
	c.registers.Ds = uint64(Kdata)
	c.registers.Es = uint64(Kdata)
	c.registers.Ss = uint64(Kdata)
	c.registers.Fs = uint64(Kdata)
	c.registers.Gs = uint64(Kdata)

	// Set mandatory flags.
	c.registers.Eflags = KernelFlagsSet
}
+
// StackTop returns the kernel's stack address.
//
// This is the address one past the end of the per-CPU stack array.
//
//go:nosplit
func (c *CPU) StackTop() uint64 {
	return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack))
}

// IDT returns the CPU's IDT base and limit.
//
// The limit is the table size minus one, per descriptor-table convention.
//
//go:nosplit
func (c *CPU) IDT() (uint64, uint16) {
	return uint64(kernelAddr(&c.kernel.globalIDT[0])), uint16(binary.Size(&c.kernel.globalIDT) - 1)
}

// GDT returns the CPU's GDT base and limit.
//
// The limit is the table size minus one (eight bytes per descriptor).
//
//go:nosplit
func (c *CPU) GDT() (uint64, uint16) {
	return uint64(kernelAddr(&c.gdt[0])), uint16(8*segLast - 1)
}

// TSS returns the CPU's TSS base, limit and value.
//
//go:nosplit
func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) {
	return uint64(kernelAddr(&c.tss)), uint16(binary.Size(&c.tss) - 1), &c.gdt[segTss]
}

// CR0 returns the CPU's CR0 value.
//
// Protected mode (PE), paging (PG), alignment mask (AM) and extension
// type (ET) bits are set.
//
//go:nosplit
func (c *CPU) CR0() uint64 {
	return _CR0_PE | _CR0_PG | _CR0_AM | _CR0_ET
}

// CR4 returns the CPU's CR4 value.
//
// Optional bits (PCID, XSAVE, SMEP, FSGSBASE) are included only when the
// corresponding feature was detected by Init.
//
//go:nosplit
func (c *CPU) CR4() uint64 {
	cr4 := uint64(_CR4_PAE | _CR4_PSE | _CR4_OSFXSR | _CR4_OSXMMEXCPT)
	if hasPCID {
		cr4 |= _CR4_PCIDE
	}
	if hasXSAVE {
		cr4 |= _CR4_OSXSAVE
	}
	if hasSMEP {
		cr4 |= _CR4_SMEP
	}
	if hasFSGSBASE {
		cr4 |= _CR4_FSGSBASE
	}
	return cr4
}

// EFER returns the CPU's EFER value.
//
// Long mode (LME/LMA), syscall (SCE) and no-execute (NX) are enabled.
//
//go:nosplit
func (c *CPU) EFER() uint64 {
	return _EFER_LME | _EFER_LMA | _EFER_SCE | _EFER_NX
}
+
// IsCanonical indicates whether addr is canonical per the amd64 spec.
//
// With 48-bit virtual addresses, an address is canonical iff bits 63:47
// are all equal: either the lower half [0, 0x00007fffffffffff] or the
// upper half [0xffff800000000000, 0xffffffffffffffff].
//
//go:nosplit
func IsCanonical(addr uint64) bool {
	// Note: the comparison must include 0xffff800000000000 itself, which
	// is the first (canonical) address of the upper half; a strict ">"
	// against that constant would wrongly reject it.
	return addr <= 0x00007fffffffffff || addr >= 0xffff800000000000
}
+
// SwitchToUser performs either a sysret or an iret.
//
// The return value is the vector that interrupted execution.
//
// This function will not split the stack. Callers will probably want to call
// runtime.entersyscall (and pair with a call to runtime.exitsyscall) prior to
// calling this function.
//
// When this is done, this region is quite sensitive to things like system
// calls. After calling entersyscall, any memory used must have been allocated
// and no function calls without go:nosplit are permitted. Any calls made here
// are protected appropriately (e.g. IsCanonical and CR3).
//
// Also note that this function transitively depends on the compiler generating
// code that uses IP-relative addressing inside of absolute addresses. That's
// the case for amd64, but may not be the case for other architectures.
//
// Precondition: the Rip, Rsp, Fs and Gs registers must be canonical.
//
//go:nosplit
func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
	// Compute the CR3 values for the user and kernel address spaces up
	// front, before any state is switched.
	userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID)
	kernelCR3 := c.kernel.PageTables.CR3(true, switchOpts.KernelPCID)

	// Sanitize registers: mask/force flag bits and force valid user code
	// and stack segment selectors.
	regs := switchOpts.Registers
	regs.Eflags &= ^uint64(UserFlagsClear)
	regs.Eflags |= UserFlagsSet
	regs.Cs = uint64(Ucode64) // Required for iret.
	regs.Ss = uint64(Udata)   // Ditto.

	// Perform the switch. The statement order below is the contract:
	// segment bases and floating point are loaded before the address
	// space changes, and restored in reverse after returning.
	swapgs()                                         // GS will be swapped on return.
	WriteFS(uintptr(regs.Fs_base))                   // Set application FS.
	WriteGS(uintptr(regs.Gs_base))                   // Set application GS.
	LoadFloatingPoint(switchOpts.FloatingPointState) // Copy in floating point.
	jumpToKernel()                                   // Switch to upper half.
	writeCR3(uintptr(userCR3))                       // Change to user address space.
	if switchOpts.FullRestore {
		vector = iret(c, regs)
	} else {
		vector = sysret(c, regs)
	}
	writeCR3(uintptr(kernelCR3))                     // Return to kernel address space.
	jumpToUser()                                     // Return to lower half.
	SaveFloatingPoint(switchOpts.FloatingPointState) // Copy out floating point.
	WriteFS(uintptr(c.registers.Fs_base))            // Restore kernel FS.
	return
}
+
// start is the CPU entrypoint.
//
// This is called from the Start asm stub (see entry_amd64.go); on return the
// registers in c.registers will be restored (not segments).
//
// It establishes the per-CPU GS/FS bases, initializes floating point, and
// programs the syscall-related MSRs.
//
//go:nosplit
func start(c *CPU) {
	// Save per-cpu & FS segment.
	WriteGS(kernelAddr(c))
	WriteFS(uintptr(c.registers.Fs_base))

	// Initialize floating point.
	//
	// Note that on skylake, the valid XCR0 mask reported seems to be 0xff.
	// This breaks down as:
	//
	//	bit0   - x87
	//	bit1   - SSE
	//	bit2   - AVX
	//	bit3-4 - MPX
	//	bit5-7 - AVX512
	//
	// For some reason, enabled MPX & AVX512 on platforms that report them
	// seems to be cause a general protection fault. (Maybe there are some
	// virtualization issues and these aren't exported to the guest cpuid.)
	// This needs further investigation, but we can limit the floating
	// point operations to x87, SSE & AVX for now.
	fninit()
	xsetbv(0, validXCR0Mask&0x7) // Mask to x87|SSE|AVX only.

	// Set the syscall target.
	wrmsr(_MSR_LSTAR, kernelFunc(sysenter))
	wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF)

	// NOTE: This depends on having the 64-bit segments immediately
	// following the 32-bit user segments. This is simply the way the
	// sysret instruction is designed to work (it assumes they follow).
	wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48))
	wrmsr(_MSR_CSTAR, kernelFunc(sysenter)) // Compatibility-mode syscalls use the same entry.
}
+
+// SetCPUIDFaulting sets CPUID faulting per the boolean value.
+//
+// True is returned if faulting could be set.
+//
+//go:nosplit
+func SetCPUIDFaulting(on bool) bool {
+ // Per the SDM (Vol 3, Table 2-43), PLATFORM_INFO bit 31 denotes support
+ // for CPUID faulting, and we enable and disable via the MISC_FEATURES MSR.
+ if rdmsr(_MSR_PLATFORM_INFO)&_PLATFORM_INFO_CPUID_FAULT != 0 {
+ features := rdmsr(_MSR_MISC_FEATURES)
+ if on {
+ features |= _MISC_FEATURE_CPUID_TRAP
+ } else {
+ features &^= _MISC_FEATURE_CPUID_TRAP
+ }
+ wrmsr(_MSR_MISC_FEATURES, features)
+ return true // Setting successful.
+ }
+ return false
+}
+
// ReadCR2 reads the current CR2 value.
//
// CR2 holds the faulting virtual address after a page fault; this simply
// exposes the assembly stub readCR2.
//
//go:nosplit
func ReadCR2() uintptr {
	return readCR2()
}
diff --git a/pkg/sentry/platform/ring0/kernel_unsafe.go b/pkg/sentry/platform/ring0/kernel_unsafe.go
new file mode 100644
index 000000000..16955ad91
--- /dev/null
+++ b/pkg/sentry/platform/ring0/kernel_unsafe.go
@@ -0,0 +1,41 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ring0
+
+import (
+ "unsafe"
+)
+
// eface mirrors runtime.eface.
type eface struct {
	typ  uintptr        // Type pointer (unused here).
	data unsafe.Pointer // Pointer to the underlying object.
}

// kernelAddr returns the kernel virtual address for the given object.
//
// It extracts the data pointer from the interface header (via the eface
// mirror above) and ORs in KernelStartAddress to produce the upper-half
// alias of the object.
//
//go:nosplit
func kernelAddr(obj interface{}) uintptr {
	e := (*eface)(unsafe.Pointer(&obj))
	return KernelStartAddress | uintptr(e.data)
}

// kernelFunc returns the address of the given function.
//
// A Go func value points to a closure object whose first word is the code
// pointer; the double dereference below extracts that code pointer, which
// is then aliased into the kernel's upper half.
//
//go:nosplit
func kernelFunc(fn func()) uintptr {
	fnptr := (**uintptr)(unsafe.Pointer(&fn))
	return KernelStartAddress | **fnptr
}
diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go
new file mode 100644
index 000000000..9c5f26962
--- /dev/null
+++ b/pkg/sentry/platform/ring0/lib_amd64.go
@@ -0,0 +1,131 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package ring0
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+)
+
// LoadFloatingPoint loads floating point state by the most efficient mechanism
// available (set by Init).
var LoadFloatingPoint func(*byte)

// SaveFloatingPoint saves floating point state by the most efficient mechanism
// available (set by Init).
var SaveFloatingPoint func(*byte)

// fxrstor uses fxrstor64 to load floating point state.
func fxrstor(*byte)

// xrstor uses xrstor to load floating point state.
func xrstor(*byte)

// fxsave uses fxsave64 to save floating point state.
func fxsave(*byte)

// xsave uses xsave to save floating point state.
func xsave(*byte)

// xsaveopt uses xsaveopt to save floating point state.
func xsaveopt(*byte)

// WriteFS sets the FS base address (set by Init).
var WriteFS func(addr uintptr)

// wrfsbase writes to the FS base address.
func wrfsbase(addr uintptr)

// wrfsmsr writes to the FS_BASE MSR.
func wrfsmsr(addr uintptr)

// WriteGS sets the GS base address (set by Init).
var WriteGS func(addr uintptr)

// wrgsbase writes to the GS base address.
func wrgsbase(addr uintptr)

// wrgsmsr writes to the GS_BASE MSR.
func wrgsmsr(addr uintptr)

// writeCR3 writes the CR3 value.
func writeCR3(phys uintptr)

// readCR3 reads the current CR3 value.
func readCR3() uintptr

// readCR2 reads the current CR2 value.
func readCR2() uintptr

// jumpToKernel jumps to the kernel version of the current RIP.
func jumpToKernel()

// jumpToUser jumps to the user version of the current RIP.
func jumpToUser()

// fninit initializes the floating point unit.
func fninit()

// xsetbv writes to an extended control register.
func xsetbv(reg, value uintptr)

// xgetbv reads an extended control register.
func xgetbv(reg uintptr) uintptr

// wrmsr writes the given value to the given MSR.
func wrmsr(reg, value uintptr)

// rdmsr reads the given MSR.
func rdmsr(reg uintptr) uintptr

// Mostly-constants set by Init.
var (
	hasSMEP       bool
	hasPCID       bool
	hasXSAVEOPT   bool
	hasXSAVE      bool
	hasFSGSBASE   bool
	validXCR0Mask uintptr
)
+
+// Init sets function pointers based on architectural features.
+//
+// This must be called prior to using ring0.
+func Init(featureSet *cpuid.FeatureSet) {
+ hasSMEP = featureSet.HasFeature(cpuid.X86FeatureSMEP)
+ hasPCID = featureSet.HasFeature(cpuid.X86FeaturePCID)
+ hasXSAVEOPT = featureSet.UseXsaveopt()
+ hasXSAVE = featureSet.UseXsave()
+ hasFSGSBASE = featureSet.HasFeature(cpuid.X86FeatureFSGSBase)
+ validXCR0Mask = uintptr(featureSet.ValidXCR0Mask())
+ if hasXSAVEOPT {
+ SaveFloatingPoint = xsaveopt
+ LoadFloatingPoint = xrstor
+ } else if hasXSAVE {
+ SaveFloatingPoint = xsave
+ LoadFloatingPoint = xrstor
+ } else {
+ SaveFloatingPoint = fxsave
+ LoadFloatingPoint = fxrstor
+ }
+ if hasFSGSBASE {
+ WriteFS = wrfsbase
+ WriteGS = wrgsbase
+ } else {
+ WriteFS = wrfsmsr
+ WriteGS = wrgsmsr
+ }
+}
diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s
new file mode 100644
index 000000000..75d742750
--- /dev/null
+++ b/pkg/sentry/platform/ring0/lib_amd64.s
@@ -0,0 +1,247 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+// fxrstor loads floating point state.
+//
+// The code corresponds to:
+//
+// fxrstor64 (%rbx)
+//
+TEXT ·fxrstor(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), BX
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x0b;
+ RET
+
+// xrstor loads floating point state.
+//
+// The code corresponds to:
+//
+// xrstor (%rdi)
+//
+TEXT ·xrstor(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), DI
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f;
+ RET
+
+// fxsave saves floating point state.
+//
+// The code corresponds to:
+//
+// fxsave64 (%rbx)
+//
+TEXT ·fxsave(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), BX
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x03;
+ RET
+
+// xsave saves floating point state.
+//
+// The code corresponds to:
+//
+// xsave (%rdi)
+//
+TEXT ·xsave(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), DI
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27;
+ RET
+
+// xsaveopt saves floating point state.
+//
+// The code corresponds to:
+//
+// xsaveopt (%rdi)
+//
+TEXT ·xsaveopt(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), DI
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37;
+ RET
+
+// wrfsbase writes to the FS base.
+//
+// The code corresponds to:
+//
+// wrfsbase %rax
+//
+TEXT ·wrfsbase(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), AX
+ BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd0;
+ RET
+
+// wrfsmsr writes to the FSBASE MSR.
+//
+// The code corresponds to:
+//
+// wrmsr (writes EDX:EAX to the MSR in ECX)
+//
+TEXT ·wrfsmsr(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), AX
+ MOVQ AX, DX
+ SHRQ $32, DX
+ MOVQ $0xc0000100, CX // MSR_FS_BASE
+ BYTE $0x0f; BYTE $0x30;
+ RET
+
+// wrgsbase writes to the GS base.
+//
+// The code corresponds to:
+//
+// wrgsbase %rax
+//
+TEXT ·wrgsbase(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), AX
+ BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd8;
+ RET
+
+// wrgsmsr writes to the GSBASE MSR.
+//
+// See wrfsmsr.
+TEXT ·wrgsmsr(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), AX
+ MOVQ AX, DX
+ SHRQ $32, DX
+ MOVQ $0xc0000101, CX // MSR_GS_BASE
+ BYTE $0x0f; BYTE $0x30; // WRMSR
+ RET
+
// jumpToUser changes execution to the user address.
//
// This works by changing the return value to the user version: the
// KernelStartAddress bits are masked out of the return address, stack
// pointer and frame pointer, so the RET lands in the lower-half alias.
TEXT ·jumpToUser(SB),NOSPLIT,$0
	MOVQ 0(SP), AX
	MOVQ ·KernelStartAddress(SB), BX
	NOTQ BX
	ANDQ BX, SP // Switch the stack.
	ANDQ BX, BP // Switch the frame pointer.
	ANDQ BX, AX // Future return value.
	MOVQ AX, 0(SP)
	RET

// jumpToKernel changes execution to the kernel address space.
//
// This works by changing the return value to the kernel version: the
// KernelStartAddress bits are ORed into the return address, stack pointer
// and frame pointer, so the RET lands in the upper-half alias.
TEXT ·jumpToKernel(SB),NOSPLIT,$0
	MOVQ 0(SP), AX
	MOVQ ·KernelStartAddress(SB), BX
	ORQ BX, SP // Switch the stack.
	ORQ BX, BP // Switch the frame pointer.
	ORQ BX, AX // Future return value.
	MOVQ AX, 0(SP)
	RET
+
+// writeCR3 writes the given CR3 value.
+//
+// The code corresponds to:
+//
+// mov %rax, %cr3
+//
+TEXT ·writeCR3(SB),NOSPLIT,$0-8
+ MOVQ cr3+0(FP), AX
+ BYTE $0x0f; BYTE $0x22; BYTE $0xd8;
+ RET
+
+// readCR3 reads the current CR3 value.
+//
+// The code corresponds to:
+//
+// mov %cr3, %rax
+//
+TEXT ·readCR3(SB),NOSPLIT,$0-8
+ BYTE $0x0f; BYTE $0x20; BYTE $0xd8;
+ MOVQ AX, ret+0(FP)
+ RET
+
+// readCR2 reads the current CR2 value.
+//
+// The code corresponds to:
+//
+// mov %cr2, %rax
+//
+TEXT ·readCR2(SB),NOSPLIT,$0-8
+ BYTE $0x0f; BYTE $0x20; BYTE $0xd0;
+ MOVQ AX, ret+0(FP)
+ RET
+
+// fninit initializes the floating point unit.
+//
+// The code corresponds to:
+//
+// fninit
+TEXT ·fninit(SB),NOSPLIT,$0
+ BYTE $0xdb; BYTE $0xe3;
+ RET
+
+// xsetbv writes to an extended control register.
+//
+// The code corresponds to:
+//
+// xsetbv
+//
+TEXT ·xsetbv(SB),NOSPLIT,$0-16
+ MOVL reg+0(FP), CX
+ MOVL value+8(FP), AX
+ MOVL value+12(FP), DX
+ BYTE $0x0f; BYTE $0x01; BYTE $0xd1;
+ RET
+
+// xgetbv reads an extended control register.
+//
+// The code corresponds to:
+//
+// xgetbv
+//
+TEXT ·xgetbv(SB),NOSPLIT,$0-16
+ MOVL reg+0(FP), CX
+ BYTE $0x0f; BYTE $0x01; BYTE $0xd0;
+ MOVL AX, ret+8(FP)
+ MOVL DX, ret+12(FP)
+ RET
+
+// wrmsr writes to a control register.
+//
+// The code corresponds to:
+//
+// wrmsr
+//
+TEXT ·wrmsr(SB),NOSPLIT,$0-16
+ MOVL reg+0(FP), CX
+ MOVL value+8(FP), AX
+ MOVL value+12(FP), DX
+ BYTE $0x0f; BYTE $0x30;
+ RET
+
+// rdmsr reads a control register.
+//
+// The code corresponds to:
+//
+// rdmsr
+//
+TEXT ·rdmsr(SB),NOSPLIT,$0-16
+ MOVL reg+0(FP), CX
+ BYTE $0x0f; BYTE $0x32;
+ MOVL AX, ret+8(FP)
+ MOVL DX, ret+12(FP)
+ RET
diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go
new file mode 100644
index 000000000..23fd5c352
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/allocator.go
@@ -0,0 +1,122 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+// Allocator is used to allocate and map PTEs.
+//
+// Note that allocators may be called concurrently.
+type Allocator interface {
+ // NewPTEs returns a new set of PTEs and their physical address.
+ NewPTEs() *PTEs
+
+ // PhysicalFor gives the physical address for a set of PTEs.
+ PhysicalFor(ptes *PTEs) uintptr
+
+ // LookupPTEs looks up PTEs by physical address.
+ LookupPTEs(physical uintptr) *PTEs
+
+ // FreePTEs marks a set of PTEs a freed, although they may not be available
+ // for use again until Recycle is called, below.
+ FreePTEs(ptes *PTEs)
+
+ // Recycle makes freed PTEs available for use again.
+ Recycle()
+}
+
+// RuntimeAllocator is a trivial allocator.
+type RuntimeAllocator struct {
+ // used is the set of PTEs that have been allocated. This includes any
+ // PTEs that may be in the pool below. PTEs are only freed from this
+ // map by the Drain call.
+ //
+ // This exists to prevent accidental garbage collection.
+ used map[*PTEs]struct{}
+
+ // pool is the set of free-to-use PTEs.
+ pool []*PTEs
+
+ // freed is the set of recently-freed PTEs.
+ freed []*PTEs
+}
+
+// NewRuntimeAllocator returns an allocator that uses runtime allocation.
+func NewRuntimeAllocator() *RuntimeAllocator {
+ return &RuntimeAllocator{
+ used: make(map[*PTEs]struct{}),
+ }
+}
+
+// Recycle returns freed pages to the pool.
+func (r *RuntimeAllocator) Recycle() {
+ r.pool = append(r.pool, r.freed...)
+ r.freed = r.freed[:0]
+}
+
+// Drain empties the pool.
+func (r *RuntimeAllocator) Drain() {
+ r.Recycle()
+ for i, ptes := range r.pool {
+ // Zap the entry in the underlying array to ensure that it can
+ // be properly garbage collected.
+ r.pool[i] = nil
+ // Similarly, free the reference held by the used map (these
+ // also apply for the pool entries).
+ delete(r.used, ptes)
+ }
+ r.pool = r.pool[:0]
+}
+
+// NewPTEs implements Allocator.NewPTEs.
+//
+// Note that the "physical" address here is actually the virtual address of the
+// PTEs structure. The entries are tracked only to avoid garbage collection.
+//
+// This is guaranteed not to split as long as the pool is sufficiently full.
+//
+//go:nosplit
+func (r *RuntimeAllocator) NewPTEs() *PTEs {
+ // Pull from the pool if we can.
+ if len(r.pool) > 0 {
+ ptes := r.pool[len(r.pool)-1]
+ r.pool = r.pool[:len(r.pool)-1]
+ return ptes
+ }
+
+ // Allocate a new entry.
+ ptes := newAlignedPTEs()
+ r.used[ptes] = struct{}{}
+ return ptes
+}
+
// PhysicalFor returns the physical address for the given PTEs.
//
// For this allocator the "physical" address is simply the virtual address
// of the PTEs structure itself (see NewPTEs).
//
//go:nosplit
func (r *RuntimeAllocator) PhysicalFor(ptes *PTEs) uintptr {
	return physicalFor(ptes)
}

// LookupPTEs implements Allocator.LookupPTEs.
//
//go:nosplit
func (r *RuntimeAllocator) LookupPTEs(physical uintptr) *PTEs {
	return fromPhysical(physical)
}

// FreePTEs implements Allocator.FreePTEs.
//
// Freed PTEs are not immediately reusable; they move back to the pool only
// on the next Recycle call.
//
//go:nosplit
func (r *RuntimeAllocator) FreePTEs(ptes *PTEs) {
	r.freed = append(r.freed, ptes)
}
diff --git a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go
new file mode 100644
index 000000000..1b996b4e2
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go
@@ -0,0 +1,53 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
// newAlignedPTEs returns a set of aligned PTEs.
//
// The Go allocator does not guarantee page alignment, so when the direct
// allocation happens to be misaligned, an aligned PTEs is carved out of an
// oversized byte slice instead.
func newAlignedPTEs() *PTEs {
	ptes := new(PTEs)
	offset := physicalFor(ptes) & (usermem.PageSize - 1)
	if offset == 0 {
		// Already aligned.
		return ptes
	}

	// Need to force an aligned allocation: the slice is one byte short of
	// two pages, which always contains a page-aligned region large enough
	// for the PTEs.
	unaligned := make([]byte, (2*usermem.PageSize)-1)
	offset = uintptr(unsafe.Pointer(&unaligned[0])) & (usermem.PageSize - 1)
	if offset != 0 {
		// Round up to the next page boundary within the slice.
		offset = usermem.PageSize - offset
	}
	return (*PTEs)(unsafe.Pointer(&unaligned[offset]))
}
+
+// physicalFor returns the "physical" address for PTEs.
+//
+//go:nosplit
+func physicalFor(ptes *PTEs) uintptr {
+ return uintptr(unsafe.Pointer(ptes))
+}
+
+// fromPhysical returns the PTEs from the "physical" address.
+//
+//go:nosplit
+func fromPhysical(physical uintptr) *PTEs {
+ return (*PTEs)(unsafe.Pointer(physical))
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
new file mode 100644
index 000000000..e5dcaada7
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -0,0 +1,221 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pagetables provides a generic implementation of pagetables.
+//
+// The core functions must be safe to call from a nosplit context. Furthermore,
+// this pagetables implementation goes to lengths to ensure that all functions
+// are free from runtime allocation. Calls to NewPTEs/FreePTEs may be made
+// during walks, but these can be cached elsewhere if required.
+package pagetables
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// PageTables is a set of page tables.
+//
+// Init (or New) must be called before use.
+type PageTables struct {
+	// Allocator is used to allocate nodes.
+	Allocator Allocator
+
+	// root is the pagetable root.
+	root *PTEs
+
+	// rootPhysical is the cached physical address of the root.
+	//
+	// This is saved only to prevent constant translation.
+	rootPhysical uintptr
+
+	// archPageTables includes architecture-specific features.
+	archPageTables
+}
+
+// New returns new PageTables, initialized with the given Allocator.
+func New(a Allocator) *PageTables {
+	pt := &PageTables{}
+	pt.Init(a)
+	return pt
+}
+
+// Init initializes a set of PageTables.
+//
+// The root table is allocated eagerly and its physical address is cached so
+// that CR3 values can be computed without further translation.
+//
+//go:nosplit
+func (p *PageTables) Init(allocator Allocator) {
+	p.Allocator = allocator
+	p.root = p.Allocator.NewPTEs()
+	p.rootPhysical = p.Allocator.PhysicalFor(p.root)
+}
+
+// mapVisitor is used for map.
+type mapVisitor struct {
+	target   uintptr // Input: virtual address being mapped.
+	physical uintptr // Input: physical base of the mapping.
+	opts     MapOpts // Input: mapping options.
+	prev     bool    // Output: set iff a differing mapping already existed.
+}
+
+// visit is used for map.
+//
+// It installs a mapping for the entry covering start, translating from the
+// visitor's target virtual address to the corresponding physical address.
+//
+//go:nosplit
+func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+	// v.target is already a uintptr; no conversion needed.
+	p := v.physical + (start - v.target)
+	if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) {
+		// A differing mapping existed here.
+		v.prev = true
+	}
+	if p&align != 0 {
+		// We will install entries at a smaller granularity if we don't
+		// install a valid entry here, however we must zap any existing
+		// entry to ensure this happens.
+		pte.Clear()
+		return
+	}
+	pte.Set(p, v.opts)
+}
+
+// requiresAlloc is true: intermediate tables must be allocated so that
+// every PTE in the range can be set.
+//
+//go:nosplit
+func (*mapVisitor) requiresAlloc() bool { return true }
+
+// requiresSplit is true: super pages partially covered by the range must
+// be split into smaller entries.
+//
+//go:nosplit
+func (*mapVisitor) requiresSplit() bool { return true }
+
+// Map installs a mapping with the given physical address.
+//
+// True is returned iff there was a previous mapping in the range.
+//
+// Precondition: addr & length must be page-aligned, their sum must not overflow.
+//
+//go:nosplit
+func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool {
+	// Mapping with no access whatsoever is equivalent to an unmap.
+	if !opts.AccessType.Any() {
+		return p.Unmap(addr, length)
+	}
+	start := uintptr(addr)
+	walker := mapWalker{
+		pageTables: p,
+		visitor: mapVisitor{
+			target:   start,
+			physical: physical,
+			opts:     opts,
+		},
+	}
+	walker.iterateRange(start, start+length)
+	return walker.visitor.prev
+}
+
+// unmapVisitor is used for unmap.
+type unmapVisitor struct {
+	// count is the number of entries that were cleared.
+	count int
+}
+
+// requiresAlloc is false: unmapping never needs new tables.
+//
+//go:nosplit
+func (*unmapVisitor) requiresAlloc() bool { return false }
+
+// requiresSplit is true: a super page only partially covered by the range
+// must be split so that just the requested portion is unmapped.
+//
+//go:nosplit
+func (*unmapVisitor) requiresSplit() bool { return true }
+
+// visit unmaps the given entry.
+//
+//go:nosplit
+func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+	pte.Clear()
+	v.count++
+}
+
+// Unmap unmaps the given range.
+//
+// True is returned iff there was a previous mapping in the range.
+//
+// Precondition: addr & length must be page-aligned.
+//
+//go:nosplit
+func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
+	start := uintptr(addr)
+	walker := unmapWalker{
+		pageTables: p,
+		visitor:    unmapVisitor{},
+	}
+	walker.iterateRange(start, start+length)
+	return walker.visitor.count > 0
+}
+
+// emptyVisitor is used for emptiness checks.
+type emptyVisitor struct {
+	// count is the number of entries encountered.
+	count int
+}
+
+// requiresAlloc is false: an emptiness check allocates nothing.
+//
+//go:nosplit
+func (*emptyVisitor) requiresAlloc() bool { return false }
+
+// requiresSplit is false: entries are only counted, never modified.
+//
+//go:nosplit
+func (*emptyVisitor) requiresSplit() bool { return false }
+
+// visit counts the given entry (it does not modify it).
+//
+//go:nosplit
+func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+	v.count++
+}
+
+// IsEmpty checks if the given range is empty.
+//
+// Precondition: addr & length must be page-aligned.
+//
+//go:nosplit
+func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool {
+	start := uintptr(addr)
+	walker := emptyWalker{
+		pageTables: p,
+	}
+	walker.iterateRange(start, start+length)
+	return walker.visitor.count == 0
+}
+
+// lookupVisitor is used for lookup.
+type lookupVisitor struct {
+	target   uintptr // Input: page-aligned virtual address to look up.
+	physical uintptr // Output: physical address, zero if no valid entry.
+	opts     MapOpts // Output: options of the entry found.
+}
+
+// visit matches the given address.
+//
+// For a valid entry, physical is the entry's address adjusted by the offset
+// of start within the region covered by the entry.
+//
+//go:nosplit
+func (v *lookupVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+	if !pte.Valid() {
+		return
+	}
+	// v.target is already a uintptr; no conversion needed.
+	v.physical = pte.Address() + (start - v.target)
+	v.opts = pte.Opts()
+}
+
+// requiresAlloc is false: lookup never allocates.
+//
+//go:nosplit
+func (*lookupVisitor) requiresAlloc() bool { return false }
+
+// requiresSplit is false: lookup never modifies entries.
+//
+//go:nosplit
+func (*lookupVisitor) requiresSplit() bool { return false }
+
+// Lookup returns the physical address for the given virtual address.
+//
+// The walk is performed on the page-aligned address and the in-page offset
+// is added back to the result. For an unmapped address, physical is just the
+// offset and opts is the zero value.
+//
+//go:nosplit
+func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) {
+	mask := uintptr(usermem.PageSize - 1)
+	offset := uintptr(addr) & mask
+	start := uintptr(addr) &^ mask
+	w := lookupWalker{
+		pageTables: p,
+		visitor: lookupVisitor{
+			target: start,
+		},
+	}
+	// Walk from the aligned address: iterateRange panics on unaligned
+	// starts, so passing addr directly would panic for unaligned lookups
+	// (and would double-count the in-page offset).
+	w.iterateRange(start, start+1)
+	return w.visitor.physical + offset, w.visitor.opts
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
new file mode 100644
index 000000000..7aa6c524e
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
@@ -0,0 +1,45 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+// Address constraints.
+//
+// The lowerTop and upperBottom currently apply to four-level pagetables;
+// additional refactoring would be necessary to support five-level pagetables.
+const (
+	// Canonical address boundaries: the lower half ends at lowerTop, the
+	// upper half begins at upperBottom.
+	lowerTop    = 0x00007fffffffffff
+	upperBottom = 0xffff800000000000
+
+	// Bit position of each level's index within a virtual address.
+	pteShift = 12
+	pmdShift = 21
+	pudShift = 30
+	pgdShift = 39
+
+	// Masks extracting each level's 9-bit table index.
+	pteMask = 0x1ff << pteShift
+	pmdMask = 0x1ff << pmdShift
+	pudMask = 0x1ff << pudShift
+	pgdMask = 0x1ff << pgdShift
+
+	// Size of the region mapped by a single entry at each level.
+	pteSize = 1 << pteShift // 4K
+	pmdSize = 1 << pmdShift // 2M
+	pudSize = 1 << pudShift // 1G
+	pgdSize = 1 << pgdShift // 512G
+
+	executeDisable = 1 << 63 // NX bit.
+	entriesPerPage = 512     // Entries in one table page.
+)
+
+// PTEs is a collection of entries.
+//
+// With 512 8-byte entries, this is exactly one 4K page.
+type PTEs [entriesPerPage]PTE
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go b/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go
new file mode 100755
index 000000000..ac1ccf3d3
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package pagetables
+
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
new file mode 100644
index 000000000..ff427fbe9
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
@@ -0,0 +1,180 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+import (
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// archPageTables is architecture-specific data.
+//
+// It is embedded in PageTables.
+type archPageTables struct {
+	// pcid is the value assigned by PCIDs.Assign.
+	//
+	// Note that zero is a valid PCID.
+	pcid uint16
+}
+
+// CR3 returns the CR3 value for these tables.
+//
+// This may be called in interrupt contexts. A PCID of zero always implies a
+// flush and should be passed when PCIDs are not enabled. See pcids_x86.go for
+// more information.
+//
+// The result is the page-aligned root physical address with the PCID in the
+// low bits and, when no flush is requested, bit 63 set.
+//
+//go:nosplit
+func (p *PageTables) CR3(noFlush bool, pcid uint16) uint64 {
+	// Bit 63 is set to avoid flushing the PCID (per SDM 4.10.4.1).
+	const noFlushBit uint64 = 0x8000000000000000
+	if noFlush && pcid != 0 {
+		return noFlushBit | uint64(p.rootPhysical) | uint64(pcid)
+	}
+	return uint64(p.rootPhysical) | uint64(pcid)
+}
+
+// Bits in page table entries.
+const (
+	present      = 0x001 // Entry is valid.
+	writable     = 0x002 // Writes permitted.
+	user         = 0x004 // User-mode access permitted.
+	writeThrough = 0x008 // Write-through caching.
+	cacheDisable = 0x010 // Caching disabled.
+	accessed     = 0x020 // Set by hardware on access.
+	dirty        = 0x040 // Set by hardware on write.
+	super        = 0x080 // Super (huge) page.
+	global       = 0x100 // Not flushed on CR3 load.
+	// optionMask covers all option bits: NX plus the low 12 flag bits.
+	optionMask = executeDisable | 0xfff
+)
+
+// MapOpts are x86 options.
+//
+// They are encoded into PTE bits by Set and decoded by Opts.
+type MapOpts struct {
+	// AccessType defines permissions.
+	AccessType usermem.AccessType
+
+	// Global indicates the page is globally accessible.
+	Global bool
+
+	// User indicates the page is a user page.
+	User bool
+}
+
+// PTE is a page table entry.
+//
+// All methods access the entry with atomic loads and stores.
+type PTE uintptr
+
+// Clear clears this PTE, including super page information.
+//
+// The store is atomic: a concurrent reader sees either the old value or
+// zero, never a torn value.
+//
+//go:nosplit
+func (p *PTE) Clear() {
+	atomic.StoreUintptr((*uintptr)(p), 0)
+}
+
+// Valid returns true iff this entry is valid.
+//
+// An entry is valid iff its present bit is set.
+//
+//go:nosplit
+func (p *PTE) Valid() bool {
+	return atomic.LoadUintptr((*uintptr)(p))&present != 0
+}
+
+// Opts returns the PTE options.
+//
+// These are all options except Valid and Super.
+//
+// Note that Read is synthesized from the present bit: any present entry is
+// considered readable.
+//
+//go:nosplit
+func (p *PTE) Opts() MapOpts {
+	v := atomic.LoadUintptr((*uintptr)(p))
+	return MapOpts{
+		AccessType: usermem.AccessType{
+			Read:    v&present != 0,
+			Write:   v&writable != 0,
+			Execute: v&executeDisable == 0,
+		},
+		Global: v&global != 0,
+		User:   v&user != 0,
+	}
+}
+
+// SetSuper sets this page as a super page.
+//
+// The page must not be valid or a panic will result. The entry is left
+// invalid, with only the super bit set; a subsequent Set installs the
+// mapping (Set preserves the super bit).
+//
+//go:nosplit
+func (p *PTE) SetSuper() {
+	if p.Valid() {
+		// This is not allowed.
+		panic("SetSuper called on valid page!")
+	}
+	atomic.StoreUintptr((*uintptr)(p), super)
+}
+
+// IsSuper returns true iff this page is a super page.
+//
+// This may be true even for an entry that is not yet Valid (see SetSuper).
+//
+//go:nosplit
+func (p *PTE) IsSuper() bool {
+	return atomic.LoadUintptr((*uintptr)(p))&super != 0
+}
+
+// Set sets this PTE value.
+//
+// This does not change the super page property.
+//
+// Entries are always marked accessed; writable entries are also marked
+// dirty.
+//
+//go:nosplit
+func (p *PTE) Set(addr uintptr, opts MapOpts) {
+	if !opts.AccessType.Any() {
+		// No access at all: clear rather than install.
+		p.Clear()
+		return
+	}
+	v := (addr &^ optionMask) | present | accessed
+	if opts.User {
+		v |= user
+	}
+	if opts.Global {
+		v |= global
+	}
+	if !opts.AccessType.Execute {
+		v |= executeDisable
+	}
+	if opts.AccessType.Write {
+		v |= writable | dirty
+	}
+	if p.IsSuper() {
+		// Note that this is inherited from the previous instance. Set
+		// does not change the value of Super. See above.
+		v |= super
+	}
+	atomic.StoreUintptr((*uintptr)(p), v)
+}
+
+// setPageTable points this entry at a next-level page table.
+//
+// The entry is installed with present, user, writable, accessed and dirty
+// set, and with the super bit clear. (The previous comment claimed the
+// write bit was cleared; the code has always set it.) This is used
+// explicitly for breaking super pages.
+//
+//go:nosplit
+func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) {
+	addr := pt.Allocator.PhysicalFor(ptes)
+	if addr&^optionMask != addr {
+		// This should never happen.
+		panic("unaligned physical address!")
+	}
+	v := addr | present | user | writable | accessed | dirty
+	atomic.StoreUintptr((*uintptr)(p), v)
+}
+
+// Address extracts the address. This should only be used if Valid returns true.
+//
+// All option bits (including NX) are masked off.
+//
+//go:nosplit
+func (p *PTE) Address() uintptr {
+	return atomic.LoadUintptr((*uintptr)(p)) &^ optionMask
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
new file mode 100644
index 000000000..0f029f25d
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
@@ -0,0 +1,109 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+import (
+ "sync"
+)
+
+// limitPCID is the number of valid PCIDs.
+//
+// The PCID field of CR3 is 12 bits wide, giving 4096 possible values.
+const limitPCID = 4096
+
+// PCIDs is a simple PCID database.
+//
+// Access is serialized by mu, but a PCIDs instance tracks assignments for
+// only a single CPU at a time, so a separate database should be used per
+// CPU.
+type PCIDs struct {
+	// mu protects below.
+	mu sync.Mutex
+
+	// cache are the assigned page tables.
+	cache map[*PageTables]uint16
+
+	// avail are available PCIDs.
+	avail []uint16
+}
+
+// NewPCIDs returns a new PCID database.
+//
+// start is the first index to assign. Typically this will be one, as the zero
+// pcid will always be flushed on transition (see pagetables_x86.go). This may
+// be more than one if specific PCIDs are reserved.
+//
+// Nil is returned iff the start and size are out of range.
+func NewPCIDs(start, size uint16) *PCIDs {
+	// Compare in int to avoid uint16 wrap-around (e.g. start=65535,
+	// size=2 sums to 1 as a uint16 and would incorrectly pass). A range
+	// ending exactly at limitPCID is valid, since the PCIDs assigned are
+	// start..start+size-1.
+	if int(start)+int(size) > limitPCID {
+		return nil // See comment.
+	}
+	p := &PCIDs{
+		cache: make(map[*PageTables]uint16),
+		avail: make([]uint16, 0, size),
+	}
+	for pcid := start; pcid < start+size; pcid++ {
+		p.avail = append(p.avail, pcid)
+	}
+	return p
+}
+
+// Assign assigns a PCID to the given PageTables.
+//
+// This may overwrite a previous assignment. If this is the case, true is
+// returned to indicate that the PCID should be flushed.
+func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	// Already assigned?
+	if pcid, ok := p.cache[pt]; ok {
+		return pcid, false // No flush.
+	}
+
+	// Take a PCID from the free pool, if any remain.
+	if n := len(p.avail); n > 0 {
+		pcid := p.avail[n-1]
+		p.avail = p.avail[:n-1]
+		p.cache[pt] = pcid
+		// We need to flush because while this was in the available
+		// pool, it may have been used previously.
+		return pcid, true
+	}
+
+	// Evict an arbitrary existing assignment and reuse its PCID.
+	for victim, pcid := range p.cache {
+		delete(p.cache, victim)
+		p.cache[pt] = pcid
+		// A flush is definitely required in this case, these page
+		// tables may still be active. (They will just be assigned some
+		// other PCID if and when they hit the given CPU again.)
+		return pcid, true
+	}
+
+	// No PCID available at all.
+	return 0, false
+}
+
+// Drop drops references to a set of page tables.
+//
+// Any PCID assigned to them is returned to the available pool.
+func (p *PCIDs) Drop(pt *PageTables) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	pcid, ok := p.cache[pt]
+	if !ok {
+		return
+	}
+	delete(p.cache, pt)
+	p.avail = append(p.avail, pcid)
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/walker_empty.go b/pkg/sentry/platform/ring0/pagetables/walker_empty.go
new file mode 100755
index 000000000..417784e17
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/walker_empty.go
@@ -0,0 +1,255 @@
+package pagetables
+
+// emptyWalker walks page tables, applying an emptyVisitor at each level.
+type emptyWalker struct {
+	// pageTables are the tables to walk.
+	pageTables *PageTables
+
+	// visitor is the set of arguments.
+	visitor emptyVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+//
+// Precondition: start must be less than end.
+//
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *emptyWalker) iterateRange(start, end uintptr) {
+	if start%pteSize != 0 {
+		panic("unaligned start")
+	}
+	if end < start {
+		panic("start > end")
+	}
+	if start < lowerTop {
+		if end <= lowerTop {
+			// Entirely within the lower canonical half.
+			w.iterateRangeCanonical(start, end)
+		} else if end > lowerTop && end <= upperBottom {
+			// Ends inside the non-canonical hole: clamp to lowerTop.
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(start, lowerTop)
+		} else {
+			// Spans the hole: walk both canonical halves.
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(start, lowerTop)
+			w.iterateRangeCanonical(upperBottom, end)
+		}
+	} else if start < upperBottom {
+		if end <= upperBottom {
+			// Entirely within the hole: nothing to walk.
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+		} else {
+			// Starts in the hole: begin at upperBottom.
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(upperBottom, end)
+		}
+	} else {
+		// Entirely within the upper canonical half.
+		w.iterateRangeCanonical(start, end)
+	}
+}
+
+// emptynext returns the address of the boundary that follows start, where
+// boundaries occur every size bytes (size must be a power of two).
+//
+//go:nosplit
+func emptynext(start uintptr, size uintptr) uintptr {
+	aligned := start &^ (size - 1)
+	return aligned + size
+}
+
+// iterateRangeCanonical walks a canonical range.
+//
+// The walk proceeds level by level (pgd -> pud -> pmd -> pte), visiting
+// super pages where present and freeing intermediate tables that end up
+// completely clear.
+//
+//go:nosplit
+func (w *emptyWalker) iterateRangeCanonical(start, end uintptr) {
+	for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+		var (
+			pgdEntry   = &w.pageTables.root[pgdIndex]
+			pudEntries *PTEs
+		)
+		if !pgdEntry.Valid() {
+			if !w.visitor.requiresAlloc() {
+				// Skip over this non-present region.
+				start = emptynext(start, pgdSize)
+				continue
+			}
+			// Allocate a new pud.
+			pudEntries = w.pageTables.Allocator.NewPTEs()
+			pgdEntry.setPageTable(w.pageTables, pudEntries)
+		} else {
+			pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+		}
+
+		clearPUDEntries := uint16(0)
+
+		for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+			var (
+				pudEntry   = &pudEntries[pudIndex]
+				pmdEntries *PTEs
+			)
+			if !pudEntry.Valid() {
+				if !w.visitor.requiresAlloc() {
+					// Skip over this non-present region.
+					clearPUDEntries++
+					start = emptynext(start, pudSize)
+					continue
+				}
+				// Install a super page if aligned and large enough.
+				if start&(pudSize-1) == 0 && end-start >= pudSize {
+					pudEntry.SetSuper()
+					w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+					if pudEntry.Valid() {
+						start = emptynext(start, pudSize)
+						continue
+					}
+				}
+				// Allocate a new pmd.
+				pmdEntries = w.pageTables.Allocator.NewPTEs()
+				pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+			} else if pudEntry.IsSuper() {
+				// Does this super page need to be split?
+				if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < emptynext(start, pudSize)) {
+					// Install the lower-level entries, inheriting opts.
+					pmdEntries = w.pageTables.Allocator.NewPTEs()
+					for index := uint16(0); index < entriesPerPage; index++ {
+						pmdEntries[index].SetSuper()
+						pmdEntries[index].Set(
+							pudEntry.Address()+(pmdSize*uintptr(index)),
+							pudEntry.Opts())
+					}
+					pudEntry.setPageTable(w.pageTables, pmdEntries)
+				} else {
+					// A super page visited directly.
+					w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+
+					if !pudEntry.Valid() {
+						clearPUDEntries++
+					}
+
+					start = emptynext(start, pudSize)
+					continue
+				}
+			} else {
+				pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+			}
+
+			clearPMDEntries := uint16(0)
+
+			for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+				var (
+					pmdEntry   = &pmdEntries[pmdIndex]
+					pteEntries *PTEs
+				)
+				if !pmdEntry.Valid() {
+					if !w.visitor.requiresAlloc() {
+						// Skip over this non-present region.
+						clearPMDEntries++
+						start = emptynext(start, pmdSize)
+						continue
+					}
+					// Install a super page if aligned and large enough.
+					if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+						pmdEntry.SetSuper()
+						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+						if pmdEntry.Valid() {
+							start = emptynext(start, pmdSize)
+							continue
+						}
+					}
+					// Allocate a new pte page.
+					pteEntries = w.pageTables.Allocator.NewPTEs()
+					pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+				} else if pmdEntry.IsSuper() {
+					// Does this super page need to be split?
+					if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < emptynext(start, pmdSize)) {
+						// Install the lower-level entries, inheriting opts.
+						pteEntries = w.pageTables.Allocator.NewPTEs()
+						for index := uint16(0); index < entriesPerPage; index++ {
+							pteEntries[index].Set(
+								pmdEntry.Address()+(pteSize*uintptr(index)),
+								pmdEntry.Opts())
+						}
+						pmdEntry.setPageTable(w.pageTables, pteEntries)
+					} else {
+						// A super page visited directly.
+						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+
+						if !pmdEntry.Valid() {
+							clearPMDEntries++
+						}
+
+						start = emptynext(start, pmdSize)
+						continue
+					}
+				} else {
+					pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+				}
+
+				clearPTEEntries := uint16(0)
+
+				for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+					var (
+						pteEntry = &pteEntries[pteIndex]
+					)
+					if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+						clearPTEEntries++
+						start += pteSize
+						continue
+					}
+
+					w.visitor.visit(uintptr(start), pteEntry, pteSize-1)
+					if !pteEntry.Valid() {
+						if w.visitor.requiresAlloc() {
+							panic("PTE not set after iteration with requiresAlloc!")
+						}
+						clearPTEEntries++
+					}
+
+					start += pteSize
+					continue
+				}
+
+				// Free the pte page if it is now entirely clear.
+				if clearPTEEntries == entriesPerPage {
+					pmdEntry.Clear()
+					w.pageTables.Allocator.FreePTEs(pteEntries)
+					clearPMDEntries++
+				}
+			}
+
+			// Free the pmd page if it is now entirely clear.
+			if clearPMDEntries == entriesPerPage {
+				pudEntry.Clear()
+				w.pageTables.Allocator.FreePTEs(pmdEntries)
+				clearPUDEntries++
+			}
+		}
+
+		// Free the pud page if it is now entirely clear.
+		if clearPUDEntries == entriesPerPage {
+			pgdEntry.Clear()
+			w.pageTables.Allocator.FreePTEs(pudEntries)
+		}
+	}
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/walker_lookup.go b/pkg/sentry/platform/ring0/pagetables/walker_lookup.go
new file mode 100755
index 000000000..906c9c50f
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/walker_lookup.go
@@ -0,0 +1,255 @@
+package pagetables
+
+// Walker walks page tables.
+type lookupWalker struct {
+ // pageTables are the tables to walk.
+ pageTables *PageTables
+
+ // Visitor is the set of arguments.
+ visitor lookupVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+//
+// Precondition: start must be less than end.
+//
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *lookupWalker) iterateRange(start, end uintptr) {
+	if start%pteSize != 0 {
+		panic("unaligned start")
+	}
+	if end < start {
+		panic("start > end")
+	}
+	if start < lowerTop {
+		if end <= lowerTop {
+			// Entirely within the lower canonical half.
+			w.iterateRangeCanonical(start, end)
+		} else if end > lowerTop && end <= upperBottom {
+			// Ends inside the non-canonical hole: clamp to lowerTop.
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(start, lowerTop)
+		} else {
+			// Spans the hole: walk both canonical halves.
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(start, lowerTop)
+			w.iterateRangeCanonical(upperBottom, end)
+		}
+	} else if start < upperBottom {
+		if end <= upperBottom {
+			// Entirely within the hole: nothing to walk.
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+		} else {
+			// Starts in the hole: begin at upperBottom.
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(upperBottom, end)
+		}
+	} else {
+		// Entirely within the upper canonical half.
+		w.iterateRangeCanonical(start, end)
+	}
+}
+
+// lookupnext returns the address of the boundary that follows start, where
+// boundaries occur every size bytes (size must be a power of two).
+//
+//go:nosplit
+func lookupnext(start uintptr, size uintptr) uintptr {
+	aligned := start &^ (size - 1)
+	return aligned + size
+}
+
+// iterateRangeCanonical walks a canonical range.
+//
+// The walk proceeds level by level (pgd -> pud -> pmd -> pte), visiting
+// super pages where present and freeing intermediate tables that end up
+// completely clear.
+//
+//go:nosplit
+func (w *lookupWalker) iterateRangeCanonical(start, end uintptr) {
+	for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+		var (
+			pgdEntry   = &w.pageTables.root[pgdIndex]
+			pudEntries *PTEs
+		)
+		if !pgdEntry.Valid() {
+			if !w.visitor.requiresAlloc() {
+				// Skip over this non-present region.
+				start = lookupnext(start, pgdSize)
+				continue
+			}
+			// Allocate a new pud.
+			pudEntries = w.pageTables.Allocator.NewPTEs()
+			pgdEntry.setPageTable(w.pageTables, pudEntries)
+		} else {
+			pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+		}
+
+		clearPUDEntries := uint16(0)
+
+		for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+			var (
+				pudEntry   = &pudEntries[pudIndex]
+				pmdEntries *PTEs
+			)
+			if !pudEntry.Valid() {
+				if !w.visitor.requiresAlloc() {
+					// Skip over this non-present region.
+					clearPUDEntries++
+					start = lookupnext(start, pudSize)
+					continue
+				}
+				// Install a super page if aligned and large enough.
+				if start&(pudSize-1) == 0 && end-start >= pudSize {
+					pudEntry.SetSuper()
+					w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+					if pudEntry.Valid() {
+						start = lookupnext(start, pudSize)
+						continue
+					}
+				}
+				// Allocate a new pmd.
+				pmdEntries = w.pageTables.Allocator.NewPTEs()
+				pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+			} else if pudEntry.IsSuper() {
+				// Does this super page need to be split?
+				if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < lookupnext(start, pudSize)) {
+					// Install the lower-level entries, inheriting opts.
+					pmdEntries = w.pageTables.Allocator.NewPTEs()
+					for index := uint16(0); index < entriesPerPage; index++ {
+						pmdEntries[index].SetSuper()
+						pmdEntries[index].Set(
+							pudEntry.Address()+(pmdSize*uintptr(index)),
+							pudEntry.Opts())
+					}
+					pudEntry.setPageTable(w.pageTables, pmdEntries)
+				} else {
+					// A super page visited directly.
+					w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+
+					if !pudEntry.Valid() {
+						clearPUDEntries++
+					}
+
+					start = lookupnext(start, pudSize)
+					continue
+				}
+			} else {
+				pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+			}
+
+			clearPMDEntries := uint16(0)
+
+			for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+				var (
+					pmdEntry   = &pmdEntries[pmdIndex]
+					pteEntries *PTEs
+				)
+				if !pmdEntry.Valid() {
+					if !w.visitor.requiresAlloc() {
+						// Skip over this non-present region.
+						clearPMDEntries++
+						start = lookupnext(start, pmdSize)
+						continue
+					}
+					// Install a super page if aligned and large enough.
+					if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+						pmdEntry.SetSuper()
+						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+						if pmdEntry.Valid() {
+							start = lookupnext(start, pmdSize)
+							continue
+						}
+					}
+					// Allocate a new pte page.
+					pteEntries = w.pageTables.Allocator.NewPTEs()
+					pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+				} else if pmdEntry.IsSuper() {
+					// Does this super page need to be split?
+					if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < lookupnext(start, pmdSize)) {
+						// Install the lower-level entries, inheriting opts.
+						pteEntries = w.pageTables.Allocator.NewPTEs()
+						for index := uint16(0); index < entriesPerPage; index++ {
+							pteEntries[index].Set(
+								pmdEntry.Address()+(pteSize*uintptr(index)),
+								pmdEntry.Opts())
+						}
+						pmdEntry.setPageTable(w.pageTables, pteEntries)
+					} else {
+						// A super page visited directly.
+						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+
+						if !pmdEntry.Valid() {
+							clearPMDEntries++
+						}
+
+						start = lookupnext(start, pmdSize)
+						continue
+					}
+				} else {
+					pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+				}
+
+				clearPTEEntries := uint16(0)
+
+				for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+					var (
+						pteEntry = &pteEntries[pteIndex]
+					)
+					if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+						clearPTEEntries++
+						start += pteSize
+						continue
+					}
+
+					w.visitor.visit(uintptr(start), pteEntry, pteSize-1)
+					if !pteEntry.Valid() {
+						if w.visitor.requiresAlloc() {
+							panic("PTE not set after iteration with requiresAlloc!")
+						}
+						clearPTEEntries++
+					}
+
+					start += pteSize
+					continue
+				}
+
+				// Free the pte page if it is now entirely clear.
+				if clearPTEEntries == entriesPerPage {
+					pmdEntry.Clear()
+					w.pageTables.Allocator.FreePTEs(pteEntries)
+					clearPMDEntries++
+				}
+			}
+
+			// Free the pmd page if it is now entirely clear.
+			if clearPMDEntries == entriesPerPage {
+				pudEntry.Clear()
+				w.pageTables.Allocator.FreePTEs(pmdEntries)
+				clearPUDEntries++
+			}
+		}
+
+		// Free the pud page if it is now entirely clear.
+		if clearPUDEntries == entriesPerPage {
+			pgdEntry.Clear()
+			w.pageTables.Allocator.FreePTEs(pudEntries)
+		}
+	}
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/walker_map.go b/pkg/sentry/platform/ring0/pagetables/walker_map.go
new file mode 100755
index 000000000..61ee3c825
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/walker_map.go
@@ -0,0 +1,255 @@
+package pagetables
+
+// mapWalker walks page tables, applying a mapVisitor to each visited entry.
+type mapWalker struct {
+	// pageTables are the tables to walk.
+	pageTables *PageTables
+
+	// visitor is the mapVisitor whose visit/requiresAlloc/requiresSplit hooks drive the walk.
+	visitor mapVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+//
+// Precondition: start must be less than or equal to end (end < start panics).
+//
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *mapWalker) iterateRange(start, end uintptr) {
+	if start%pteSize != 0 {
+		panic("unaligned start")
+	}
+	if end < start {
+		panic("start > end")
+	}
+	if start < lowerTop { // Range begins in the lower canonical half.
+		if end <= lowerTop {
+			w.iterateRangeCanonical(start, end)
+		} else if end > lowerTop && end <= upperBottom {
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(start, lowerTop)
+		} else {
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(start, lowerTop)
+			w.iterateRangeCanonical(upperBottom, end)
+		}
+	} else if start < upperBottom { // Range begins in the non-canonical gap.
+		if end <= upperBottom { // Ends in the gap too: nothing canonical to walk.
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+		} else {
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(upperBottom, end)
+		}
+	} else { // Range lies entirely in the upper canonical half.
+		w.iterateRangeCanonical(start, end)
+	}
+}
+
+// mapnext quantizes start down to a multiple of size, then advances by size
+// (size must be a power of two for the mask below to be valid).
+//go:nosplit
+func mapnext(start uintptr, size uintptr) uintptr {
+	start &= ^(size - 1)
+	start += size
+	return start
+}
+
+// iterateRangeCanonical walks a canonical range, descending pgd -> pud -> pmd -> pte
+// and visiting entries at the largest possible level (super pages when available).
+//go:nosplit
+func (w *mapWalker) iterateRangeCanonical(start, end uintptr) {
+	for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+		var (
+			pgdEntry = &w.pageTables.root[pgdIndex]
+			pudEntries *PTEs
+		)
+		if !pgdEntry.Valid() {
+			if !w.visitor.requiresAlloc() {
+				// Nothing mapped here and no allocation requested: skip the whole PGD region.
+				start = mapnext(start, pgdSize)
+				continue
+			}
+			// Allocate a new PUD table for this PGD slot.
+			pudEntries = w.pageTables.Allocator.NewPTEs()
+			pgdEntry.setPageTable(w.pageTables, pudEntries)
+		} else {
+			pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+		}
+
+		clearPUDEntries := uint16(0) // PUD slots left invalid; table is freed when all are clear.
+
+		for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+			var (
+				pudEntry = &pudEntries[pudIndex]
+				pmdEntries *PTEs
+			)
+			if !pudEntry.Valid() {
+				if !w.visitor.requiresAlloc() {
+					// Skip the unmapped PUD region.
+					clearPUDEntries++
+					start = mapnext(start, pudSize)
+					continue
+				}
+				// Attempt a PUD-level super page if the remaining range is aligned and large enough.
+				if start&(pudSize-1) == 0 && end-start >= pudSize {
+					pudEntry.SetSuper()
+					w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+					if pudEntry.Valid() {
+						start = mapnext(start, pudSize)
+						continue
+					}
+				}
+				// Super page not installed: fall back to a new PMD table.
+				pmdEntries = w.pageTables.Allocator.NewPTEs()
+				pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+			} else if pudEntry.IsSuper() {
+				// Split an existing super page only when the visitor demands it for a partial range.
+				if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < mapnext(start, pudSize)) {
+					// Shatter the PUD super page into PMD-level super entries covering the same frames.
+					pmdEntries = w.pageTables.Allocator.NewPTEs()
+					for index := uint16(0); index < entriesPerPage; index++ {
+						pmdEntries[index].SetSuper()
+						pmdEntries[index].Set(
+							pudEntry.Address()+(pmdSize*uintptr(index)),
+							pudEntry.Opts())
+					}
+					pudEntry.setPageTable(w.pageTables, pmdEntries)
+				} else {
+					// Visit the whole super page as a single entry.
+					w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+
+					if !pudEntry.Valid() {
+						clearPUDEntries++
+					}
+
+					start = mapnext(start, pudSize)
+					continue
+				}
+			} else {
+				pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+			}
+
+			clearPMDEntries := uint16(0) // PMD slots left invalid; table is freed when all are clear.
+
+			for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+				var (
+					pmdEntry = &pmdEntries[pmdIndex]
+					pteEntries *PTEs
+				)
+				if !pmdEntry.Valid() {
+					if !w.visitor.requiresAlloc() {
+						// Skip the unmapped PMD region.
+						clearPMDEntries++
+						start = mapnext(start, pmdSize)
+						continue
+					}
+					// Attempt a PMD-level super page for an aligned, large-enough remainder.
+					if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+						pmdEntry.SetSuper()
+						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+						if pmdEntry.Valid() {
+							start = mapnext(start, pmdSize)
+							continue
+						}
+					}
+					// Super page not installed: fall back to a new PTE table.
+					pteEntries = w.pageTables.Allocator.NewPTEs()
+					pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+				} else if pmdEntry.IsSuper() {
+					// Split an existing super page only when the visitor demands it for a partial range.
+					if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < mapnext(start, pmdSize)) {
+						// Shatter the PMD super page into individual PTEs covering the same frames.
+						pteEntries = w.pageTables.Allocator.NewPTEs()
+						for index := uint16(0); index < entriesPerPage; index++ {
+							pteEntries[index].Set(
+								pmdEntry.Address()+(pteSize*uintptr(index)),
+								pmdEntry.Opts())
+						}
+						pmdEntry.setPageTable(w.pageTables, pteEntries)
+					} else {
+						// Visit the whole super page as a single entry.
+						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+
+						if !pmdEntry.Valid() {
+							clearPMDEntries++
+						}
+
+						start = mapnext(start, pmdSize)
+						continue
+					}
+				} else {
+					pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+				}
+
+				clearPTEEntries := uint16(0) // PTE slots left invalid; table is freed when all are clear.
+
+				for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+					var (
+						pteEntry = &pteEntries[pteIndex]
+					)
+					if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+						clearPTEEntries++
+						start += pteSize
+						continue
+					}
+					// Visit the individual PTE.
+					w.visitor.visit(uintptr(start), pteEntry, pteSize-1)
+					if !pteEntry.Valid() {
+						if w.visitor.requiresAlloc() {
+							panic("PTE not set after iteration with requiresAlloc!")
+						}
+						clearPTEEntries++
+					}
+
+					start += pteSize
+					continue
+				}
+				// Reclaim the PTE table if every entry ended up invalid.
+				if clearPTEEntries == entriesPerPage {
+					pmdEntry.Clear()
+					w.pageTables.Allocator.FreePTEs(pteEntries)
+					clearPMDEntries++
+				}
+			}
+			// Reclaim the PMD table if every entry ended up invalid.
+			if clearPMDEntries == entriesPerPage {
+				pudEntry.Clear()
+				w.pageTables.Allocator.FreePTEs(pmdEntries)
+				clearPUDEntries++
+			}
+		}
+		// Reclaim the PUD table if every entry ended up invalid.
+		if clearPUDEntries == entriesPerPage {
+			pgdEntry.Clear()
+			w.pageTables.Allocator.FreePTEs(pudEntries)
+		}
+	}
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/walker_unmap.go b/pkg/sentry/platform/ring0/pagetables/walker_unmap.go
new file mode 100755
index 000000000..be2aa0ce4
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/walker_unmap.go
@@ -0,0 +1,255 @@
+package pagetables
+
+// unmapWalker walks page tables, applying an unmapVisitor to each visited entry.
+type unmapWalker struct {
+	// pageTables are the tables to walk.
+	pageTables *PageTables
+
+	// visitor is the unmapVisitor whose visit/requiresAlloc/requiresSplit hooks drive the walk.
+	visitor unmapVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+//
+// Precondition: start must be less than or equal to end (end < start panics).
+//
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *unmapWalker) iterateRange(start, end uintptr) {
+	if start%pteSize != 0 {
+		panic("unaligned start")
+	}
+	if end < start {
+		panic("start > end")
+	}
+	if start < lowerTop { // Range begins in the lower canonical half.
+		if end <= lowerTop {
+			w.iterateRangeCanonical(start, end)
+		} else if end > lowerTop && end <= upperBottom {
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(start, lowerTop)
+		} else {
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(start, lowerTop)
+			w.iterateRangeCanonical(upperBottom, end)
+		}
+	} else if start < upperBottom { // Range begins in the non-canonical gap.
+		if end <= upperBottom { // Ends in the gap too: nothing canonical to walk.
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+		} else {
+			if w.visitor.requiresAlloc() {
+				panic("alloc spans non-canonical range")
+			}
+			w.iterateRangeCanonical(upperBottom, end)
+		}
+	} else { // Range lies entirely in the upper canonical half.
+		w.iterateRangeCanonical(start, end)
+	}
+}
+
+// unmapnext quantizes start down to a multiple of size, then advances by size
+// (size must be a power of two for the mask below to be valid).
+//go:nosplit
+func unmapnext(start uintptr, size uintptr) uintptr {
+	start &= ^(size - 1)
+	start += size
+	return start
+}
+
+// iterateRangeCanonical walks a canonical range, descending pgd -> pud -> pmd -> pte
+// and visiting entries at the largest possible level (super pages when available).
+//go:nosplit
+func (w *unmapWalker) iterateRangeCanonical(start, end uintptr) {
+	for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+		var (
+			pgdEntry = &w.pageTables.root[pgdIndex]
+			pudEntries *PTEs
+		)
+		if !pgdEntry.Valid() {
+			if !w.visitor.requiresAlloc() {
+				// Nothing mapped here and no allocation requested: skip the whole PGD region.
+				start = unmapnext(start, pgdSize)
+				continue
+			}
+			// Allocate a new PUD table for this PGD slot.
+			pudEntries = w.pageTables.Allocator.NewPTEs()
+			pgdEntry.setPageTable(w.pageTables, pudEntries)
+		} else {
+			pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+		}
+
+		clearPUDEntries := uint16(0) // PUD slots left invalid; table is freed when all are clear.
+
+		for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+			var (
+				pudEntry = &pudEntries[pudIndex]
+				pmdEntries *PTEs
+			)
+			if !pudEntry.Valid() {
+				if !w.visitor.requiresAlloc() {
+					// Skip the unmapped PUD region.
+					clearPUDEntries++
+					start = unmapnext(start, pudSize)
+					continue
+				}
+				// Attempt a PUD-level super page if the remaining range is aligned and large enough.
+				if start&(pudSize-1) == 0 && end-start >= pudSize {
+					pudEntry.SetSuper()
+					w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+					if pudEntry.Valid() {
+						start = unmapnext(start, pudSize)
+						continue
+					}
+				}
+				// Super page not installed: fall back to a new PMD table.
+				pmdEntries = w.pageTables.Allocator.NewPTEs()
+				pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+			} else if pudEntry.IsSuper() {
+				// Split an existing super page only when the visitor demands it for a partial range.
+				if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < unmapnext(start, pudSize)) {
+					// Shatter the PUD super page into PMD-level super entries covering the same frames.
+					pmdEntries = w.pageTables.Allocator.NewPTEs()
+					for index := uint16(0); index < entriesPerPage; index++ {
+						pmdEntries[index].SetSuper()
+						pmdEntries[index].Set(
+							pudEntry.Address()+(pmdSize*uintptr(index)),
+							pudEntry.Opts())
+					}
+					pudEntry.setPageTable(w.pageTables, pmdEntries)
+				} else {
+					// Visit the whole super page as a single entry.
+					w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+
+					if !pudEntry.Valid() {
+						clearPUDEntries++
+					}
+
+					start = unmapnext(start, pudSize)
+					continue
+				}
+			} else {
+				pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+			}
+
+			clearPMDEntries := uint16(0) // PMD slots left invalid; table is freed when all are clear.
+
+			for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+				var (
+					pmdEntry = &pmdEntries[pmdIndex]
+					pteEntries *PTEs
+				)
+				if !pmdEntry.Valid() {
+					if !w.visitor.requiresAlloc() {
+						// Skip the unmapped PMD region.
+						clearPMDEntries++
+						start = unmapnext(start, pmdSize)
+						continue
+					}
+					// Attempt a PMD-level super page for an aligned, large-enough remainder.
+					if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+						pmdEntry.SetSuper()
+						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+						if pmdEntry.Valid() {
+							start = unmapnext(start, pmdSize)
+							continue
+						}
+					}
+					// Super page not installed: fall back to a new PTE table.
+					pteEntries = w.pageTables.Allocator.NewPTEs()
+					pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+				} else if pmdEntry.IsSuper() {
+					// Split an existing super page only when the visitor demands it for a partial range.
+					if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < unmapnext(start, pmdSize)) {
+						// Shatter the PMD super page into individual PTEs covering the same frames.
+						pteEntries = w.pageTables.Allocator.NewPTEs()
+						for index := uint16(0); index < entriesPerPage; index++ {
+							pteEntries[index].Set(
+								pmdEntry.Address()+(pteSize*uintptr(index)),
+								pmdEntry.Opts())
+						}
+						pmdEntry.setPageTable(w.pageTables, pteEntries)
+					} else {
+						// Visit the whole super page as a single entry.
+						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+
+						if !pmdEntry.Valid() {
+							clearPMDEntries++
+						}
+
+						start = unmapnext(start, pmdSize)
+						continue
+					}
+				} else {
+					pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+				}
+
+				clearPTEEntries := uint16(0) // PTE slots left invalid; table is freed when all are clear.
+
+				for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+					var (
+						pteEntry = &pteEntries[pteIndex]
+					)
+					if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+						clearPTEEntries++
+						start += pteSize
+						continue
+					}
+					// Visit the individual PTE.
+					w.visitor.visit(uintptr(start), pteEntry, pteSize-1)
+					if !pteEntry.Valid() {
+						if w.visitor.requiresAlloc() {
+							panic("PTE not set after iteration with requiresAlloc!")
+						}
+						clearPTEEntries++
+					}
+
+					start += pteSize
+					continue
+				}
+				// Reclaim the PTE table if every entry ended up invalid.
+				if clearPTEEntries == entriesPerPage {
+					pmdEntry.Clear()
+					w.pageTables.Allocator.FreePTEs(pteEntries)
+					clearPMDEntries++
+				}
+			}
+			// Reclaim the PMD table if every entry ended up invalid.
+			if clearPMDEntries == entriesPerPage {
+				pudEntry.Clear()
+				w.pageTables.Allocator.FreePTEs(pmdEntries)
+				clearPUDEntries++
+			}
+		}
+		// Reclaim the PUD table if every entry ended up invalid.
+		if clearPUDEntries == entriesPerPage {
+			pgdEntry.Clear()
+			w.pageTables.Allocator.FreePTEs(pudEntries)
+		}
+	}
+}
diff --git a/pkg/sentry/platform/ring0/ring0.go b/pkg/sentry/platform/ring0/ring0.go
new file mode 100644
index 000000000..cdeb1b43a
--- /dev/null
+++ b/pkg/sentry/platform/ring0/ring0.go
@@ -0,0 +1,16 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ring0 provides basic operating system-level stubs.
+package ring0
diff --git a/pkg/sentry/platform/ring0/ring0_state_autogen.go b/pkg/sentry/platform/ring0/ring0_state_autogen.go
new file mode 100755
index 000000000..462f9a446
--- /dev/null
+++ b/pkg/sentry/platform/ring0/ring0_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package ring0
+