summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/platform/ring0
diff options
context:
space:
mode:
authorGoogler <noreply@google.com>2018-04-27 10:37:02 -0700
committerAdin Scannell <ascannell@google.com>2018-04-28 01:44:26 -0400
commitd02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/platform/ring0
parentf70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
Check in gVisor.
PiperOrigin-RevId: 194583126 Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/sentry/platform/ring0')
-rw-r--r--pkg/sentry/platform/ring0/BUILD52
-rw-r--r--pkg/sentry/platform/ring0/defs.go93
-rw-r--r--pkg/sentry/platform/ring0/defs_amd64.go113
-rw-r--r--pkg/sentry/platform/ring0/entry_amd64.go128
-rw-r--r--pkg/sentry/platform/ring0/entry_amd64.s334
-rw-r--r--pkg/sentry/platform/ring0/gen_offsets/BUILD25
-rw-r--r--pkg/sentry/platform/ring0/gen_offsets/main.go24
-rw-r--r--pkg/sentry/platform/ring0/kernel.go71
-rw-r--r--pkg/sentry/platform/ring0/kernel_amd64.go280
-rw-r--r--pkg/sentry/platform/ring0/kernel_unsafe.go41
-rw-r--r--pkg/sentry/platform/ring0/lib_amd64.go128
-rw-r--r--pkg/sentry/platform/ring0/lib_amd64.s247
-rw-r--r--pkg/sentry/platform/ring0/offsets_amd64.go93
-rw-r--r--pkg/sentry/platform/ring0/pagetables/BUILD32
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables.go193
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go397
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_test.go161
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go31
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_x86.go79
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go79
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pcids_x86.go74
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go65
-rw-r--r--pkg/sentry/platform/ring0/ring0.go16
-rw-r--r--pkg/sentry/platform/ring0/x86.go242
24 files changed, 2998 insertions, 0 deletions
diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD
new file mode 100644
index 000000000..2df232a64
--- /dev/null
+++ b/pkg/sentry/platform/ring0/BUILD
@@ -0,0 +1,52 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
+
+go_template(
+ name = "defs",
+ srcs = [
+ "defs.go",
+ "defs_amd64.go",
+ "offsets_amd64.go",
+ "x86.go",
+ ],
+ visibility = [":__subpackages__"],
+)
+
+go_template_instance(
+ name = "defs_impl",
+ out = "defs_impl.go",
+ package = "ring0",
+ template = ":defs",
+)
+
+genrule(
+ name = "entry_impl_amd64",
+ srcs = ["entry_amd64.s"],
+ outs = ["entry_impl_amd64.s"],
+ cmd = "(echo -e '// build +amd64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@",
+ tools = ["//pkg/sentry/platform/ring0/gen_offsets"],
+)
+
+go_library(
+ name = "ring0",
+ srcs = [
+ "defs_impl.go",
+ "entry_amd64.go",
+ "entry_impl_amd64.s",
+ "kernel.go",
+ "kernel_amd64.go",
+ "kernel_unsafe.go",
+ "lib_amd64.go",
+ "lib_amd64.s",
+ "ring0.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/cpuid",
+ "//pkg/sentry/platform/ring0/pagetables",
+ "//pkg/sentry/usermem",
+ ],
+)
diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go
new file mode 100644
index 000000000..9d947b73d
--- /dev/null
+++ b/pkg/sentry/platform/ring0/defs.go
@@ -0,0 +1,93 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ring0
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+var (
+ // UserspaceSize is the total size of userspace.
+ UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1)
+
+ // MaximumUserAddress is the largest possible user address.
+ MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1)
+
+ // KernelStartAddress is the starting kernel address.
+ KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
+)
+
+// Kernel is a global kernel object.
+//
+// This contains global state, shared by multiple CPUs.
+type Kernel struct {
+ KernelArchState
+}
+
+// CPU is the per-CPU struct.
+type CPU struct {
+ // self is a self reference.
+ //
+ // This is always guaranteed to be at offset zero.
+ self *CPU
+
+ // kernel is reference to the kernel that this CPU was initialized
+ // with. This reference is kept for garbage collection purposes: CPU
+ // registers may refer to objects within the Kernel object that cannot
+ // be safely freed.
+ kernel *Kernel
+
+ // CPUArchState is architecture-specific state.
+ CPUArchState
+
+ // registers is a set of registers; these may be used on kernel system
+ // calls and exceptions via the Registers function.
+ registers syscall.PtraceRegs
+
+ // KernelException handles an exception during kernel execution.
+ //
+ // Return from this call will restore registers and return to the kernel: the
+ // registers must be modified directly.
+ //
+ // If this function is not provided, a kernel exception results in halt.
+ //
+ // This must be go:nosplit, as this will be on the interrupt stack.
+ // Closures are permitted, as the pointer to the closure frame is not
+ // passed on the stack.
+ KernelException func(Vector)
+
+ // KernelSyscall is called for kernel system calls.
+ //
+ // Return from this call will restore registers and return to the kernel: the
+ // registers must be modified directly.
+ //
+ // If this function is not provided, a kernel exception results in halt.
+ //
+ // This must be go:nosplit, as this will be on the interrupt stack.
+ // Closures are permitted, as the pointer to the closure frame is not
+ // passed on the stack.
+ KernelSyscall func()
+}
+
+// Registers returns a modifiable-copy of the kernel registers.
+//
+// This is explicitly safe to call during KernelException and KernelSyscall.
+//
+//go:nosplit
+func (c *CPU) Registers() *syscall.PtraceRegs {
+ return &c.registers
+}
diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go
new file mode 100644
index 000000000..bb3420125
--- /dev/null
+++ b/pkg/sentry/platform/ring0/defs_amd64.go
@@ -0,0 +1,113 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package ring0
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+)
+
+// Segment indices and Selectors.
+const (
+ // Index into GDT array.
+ _ = iota // Null descriptor first.
+ _ // Reserved (Linux is kernel 32).
+ segKcode // Kernel code (64-bit).
+ segKdata // Kernel data.
+ segUcode32 // User code (32-bit).
+ segUdata // User data.
+ segUcode64 // User code (64-bit).
+ segTss // Task segment descriptor.
+ segTssHi // Upper bits for TSS.
+ segLast // Last segment (terminal, not included).
+)
+
+// Selectors.
+const (
+ Kcode Selector = segKcode << 3
+ Kdata Selector = segKdata << 3
+ Ucode32 Selector = (segUcode32 << 3) | 3
+ Udata Selector = (segUdata << 3) | 3
+ Ucode64 Selector = (segUcode64 << 3) | 3
+ Tss Selector = segTss << 3
+)
+
+// Standard segments.
+var (
+ UserCodeSegment32 SegmentDescriptor
+ UserDataSegment SegmentDescriptor
+ UserCodeSegment64 SegmentDescriptor
+ KernelCodeSegment SegmentDescriptor
+ KernelDataSegment SegmentDescriptor
+)
+
+// KernelOpts has initialization options for the kernel.
+type KernelOpts struct {
+ // PageTables are the kernel pagetables; this must be provided.
+ PageTables *pagetables.PageTables
+}
+
+// KernelArchState contains architecture-specific state.
+type KernelArchState struct {
+ KernelOpts
+
+ // globalIDT is our set of interrupt gates.
+ globalIDT idt64
+}
+
+// CPUArchState contains CPU-specific arch state.
+type CPUArchState struct {
+ // stack is the stack used for interrupts on this CPU.
+ stack [256]byte
+
+ // errorCode is the error code from the last exception.
+ errorCode uintptr
+
+ // errorType indicates the type of error code here, it is always set
+ // along with the errorCode value above.
+ //
+ // It will either by 1, which indicates a user error, or 0 indicating a
+ // kernel error. If the error code below returns false (kernel error),
+ // then it cannot provide relevant information about the last
+ // exception.
+ errorType uintptr
+
+ // gdt is the CPU's descriptor table.
+ gdt descriptorTable
+
+ // tss is the CPU's task state.
+ tss TaskState64
+}
+
+// ErrorCode returns the last error code.
+//
+// The returned boolean indicates whether the error code corresponds to the
+// last user error or not. If it does not, then fault information must be
+// ignored. This is generally the result of a kernel fault while servicing a
+// user fault.
+//
+//go:nosplit
+func (c *CPU) ErrorCode() (value uintptr, user bool) {
+ return c.errorCode, c.errorType != 0
+}
+
+func init() {
+ KernelCodeSegment.setCode64(0, 0, 0)
+ KernelDataSegment.setData(0, 0xffffffff, 0)
+ UserCodeSegment32.setCode64(0, 0, 3)
+ UserDataSegment.setData(0, 0xffffffff, 3)
+ UserCodeSegment64.setCode64(0, 0, 3)
+}
diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go
new file mode 100644
index 000000000..a3e992e0d
--- /dev/null
+++ b/pkg/sentry/platform/ring0/entry_amd64.go
@@ -0,0 +1,128 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package ring0
+
+import (
+ "syscall"
+)
+
+// This is an assembly function.
+//
+// The sysenter function is invoked in two situations:
+//
+// (1) The guest kernel has executed a system call.
+// (2) The guest application has executed a system call.
+//
+// The interrupt flag is examined to determine whether the system call was
+// executed from kernel mode or not and the appropriate stub is called.
+func sysenter()
+
+// swapgs swaps the current GS value.
+//
+// This must be called prior to sysret/iret.
+func swapgs()
+
+// sysret returns to userspace from a system call.
+//
+// The return code is the vector that interrupted execution.
+//
+// See stubs.go for a note regarding the frame size of this function.
+func sysret(*CPU, *syscall.PtraceRegs) Vector
+
+// "iret is the cadillac of CPL switching."
+//
+// -- Neel Natu
+//
+// iret is nearly identical to sysret, except an iret is used to fully restore
+// all user state. This must be called in cases where all registers need to be
+// restored.
+func iret(*CPU, *syscall.PtraceRegs) Vector
+
+// exception is the generic exception entry.
+//
+// This is called by the individual stub definitions.
+func exception()
+
+// resume is a stub that restores the CPU kernel registers.
+//
+// This is used when processing kernel exceptions and syscalls.
+func resume()
+
+// Start is the CPU entrypoint.
+//
+// The following start conditions must be satisfied:
+//
+// * AX should contain the CPU pointer.
+// * c.GDT() should be loaded as the GDT.
+// * c.IDT() should be loaded as the IDT.
+// * c.CR0() should be the current CR0 value.
+// * c.CR3() should be set to the kernel PageTables.
+// * c.CR4() should be the current CR4 value.
+// * c.EFER() should be the current EFER value.
+//
+// The CPU state will be set to c.Registers().
+func Start()
+
+// Exception stubs.
+func divideByZero()
+func debug()
+func nmi()
+func breakpoint()
+func overflow()
+func boundRangeExceeded()
+func invalidOpcode()
+func deviceNotAvailable()
+func doubleFault()
+func coprocessorSegmentOverrun()
+func invalidTSS()
+func segmentNotPresent()
+func stackSegmentFault()
+func generalProtectionFault()
+func pageFault()
+func x87FloatingPointException()
+func alignmentCheck()
+func machineCheck()
+func simdFloatingPointException()
+func virtualizationException()
+func securityException()
+func syscallInt80()
+
+// Exception handler index.
+var handlers = map[Vector]func(){
+ DivideByZero: divideByZero,
+ Debug: debug,
+ NMI: nmi,
+ Breakpoint: breakpoint,
+ Overflow: overflow,
+ BoundRangeExceeded: boundRangeExceeded,
+ InvalidOpcode: invalidOpcode,
+ DeviceNotAvailable: deviceNotAvailable,
+ DoubleFault: doubleFault,
+ CoprocessorSegmentOverrun: coprocessorSegmentOverrun,
+ InvalidTSS: invalidTSS,
+ SegmentNotPresent: segmentNotPresent,
+ StackSegmentFault: stackSegmentFault,
+ GeneralProtectionFault: generalProtectionFault,
+ PageFault: pageFault,
+ X87FloatingPointException: x87FloatingPointException,
+ AlignmentCheck: alignmentCheck,
+ MachineCheck: machineCheck,
+ SIMDFloatingPointException: simdFloatingPointException,
+ VirtualizationException: virtualizationException,
+ SecurityException: securityException,
+ SyscallInt80: syscallInt80,
+}
diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s
new file mode 100644
index 000000000..e8638133b
--- /dev/null
+++ b/pkg/sentry/platform/ring0/entry_amd64.s
@@ -0,0 +1,334 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+// NB: Offsets are programatically generated (see BUILD).
+//
+// This file is concatenated with the definitions.
+
+// Saves a register set.
+//
+// This is a macro because it may need to executed in contents where a stack is
+// not available for calls.
+//
+// The following registers are not saved: AX, SP, IP, FLAGS, all segments.
+#define REGISTERS_SAVE(reg, offset) \
+ MOVQ R15, offset+PTRACE_R15(reg); \
+ MOVQ R14, offset+PTRACE_R14(reg); \
+ MOVQ R13, offset+PTRACE_R13(reg); \
+ MOVQ R12, offset+PTRACE_R12(reg); \
+ MOVQ BP, offset+PTRACE_RBP(reg); \
+ MOVQ BX, offset+PTRACE_RBX(reg); \
+ MOVQ CX, offset+PTRACE_RCX(reg); \
+ MOVQ DX, offset+PTRACE_RDX(reg); \
+ MOVQ R11, offset+PTRACE_R11(reg); \
+ MOVQ R10, offset+PTRACE_R10(reg); \
+ MOVQ R9, offset+PTRACE_R9(reg); \
+ MOVQ R8, offset+PTRACE_R8(reg); \
+ MOVQ SI, offset+PTRACE_RSI(reg); \
+ MOVQ DI, offset+PTRACE_RDI(reg);
+
+// Loads a register set.
+//
+// This is a macro because it may need to executed in contents where a stack is
+// not available for calls.
+//
+// The following registers are not loaded: AX, SP, IP, FLAGS, all segments.
+#define REGISTERS_LOAD(reg, offset) \
+ MOVQ offset+PTRACE_R15(reg), R15; \
+ MOVQ offset+PTRACE_R14(reg), R14; \
+ MOVQ offset+PTRACE_R13(reg), R13; \
+ MOVQ offset+PTRACE_R12(reg), R12; \
+ MOVQ offset+PTRACE_RBP(reg), BP; \
+ MOVQ offset+PTRACE_RBX(reg), BX; \
+ MOVQ offset+PTRACE_RCX(reg), CX; \
+ MOVQ offset+PTRACE_RDX(reg), DX; \
+ MOVQ offset+PTRACE_R11(reg), R11; \
+ MOVQ offset+PTRACE_R10(reg), R10; \
+ MOVQ offset+PTRACE_R9(reg), R9; \
+ MOVQ offset+PTRACE_R8(reg), R8; \
+ MOVQ offset+PTRACE_RSI(reg), SI; \
+ MOVQ offset+PTRACE_RDI(reg), DI;
+
+// SWAP_GS swaps the kernel GS (CPU).
+#define SWAP_GS() \
+ BYTE $0x0F; BYTE $0x01; BYTE $0xf8;
+
+// IRET returns from an interrupt frame.
+#define IRET() \
+ BYTE $0x48; BYTE $0xcf;
+
+// SYSRET64 executes the sysret instruction.
+#define SYSRET64() \
+ BYTE $0x48; BYTE $0x0f; BYTE $0x07;
+
+// LOAD_KERNEL_ADDRESS loads a kernel address.
+#define LOAD_KERNEL_ADDRESS(from, to) \
+ MOVQ from, to; \
+ ORQ ·KernelStartAddress(SB), to;
+
+// LOAD_KERNEL_STACK loads the kernel stack.
+#define LOAD_KERNEL_STACK(from) \
+ LOAD_KERNEL_ADDRESS(CPU_SELF(from), SP); \
+ LEAQ CPU_STACK_TOP(SP), SP;
+
+// See kernel.go.
+TEXT ·Halt(SB),NOSPLIT,$0
+ HLT
+ RET
+
+// See kernel.go.
+TEXT ·Current(SB),NOSPLIT,$0-8
+ MOVQ CPU_SELF(GS), AX
+ MOVQ AX, ret+0(FP)
+ RET
+
+// See entry_amd64.go.
+TEXT ·swapgs(SB),NOSPLIT,$0
+ SWAP_GS()
+ RET
+
+// See entry_amd64.go.
+TEXT ·sysret(SB),NOSPLIT,$0-24
+ // Save original state.
+ LOAD_KERNEL_ADDRESS(cpu+0(FP), BX)
+ LOAD_KERNEL_ADDRESS(regs+8(FP), AX)
+ MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
+ MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
+ MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
+
+ // Restore user register state.
+ REGISTERS_LOAD(AX, 0)
+ MOVQ PTRACE_RIP(AX), CX // Needed for SYSRET.
+ MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET.
+ MOVQ PTRACE_RSP(AX), SP // Restore the stack directly.
+ MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch).
+ SYSRET64()
+
+// See entry_amd64.go.
+TEXT ·iret(SB),NOSPLIT,$0-24
+ // Save original state.
+ LOAD_KERNEL_ADDRESS(cpu+0(FP), BX)
+ LOAD_KERNEL_ADDRESS(regs+8(FP), AX)
+ MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
+ MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
+ MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
+
+ // Build an IRET frame & restore state.
+ LOAD_KERNEL_STACK(BX)
+ MOVQ PTRACE_SS(AX), BX; PUSHQ BX
+ MOVQ PTRACE_RSP(AX), CX; PUSHQ CX
+ MOVQ PTRACE_FLAGS(AX), DX; PUSHQ DX
+ MOVQ PTRACE_CS(AX), DI; PUSHQ DI
+ MOVQ PTRACE_RIP(AX), SI; PUSHQ SI
+ REGISTERS_LOAD(AX, 0) // Restore most registers.
+ MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch).
+ IRET()
+
+// See entry_amd64.go.
+TEXT ·resume(SB),NOSPLIT,$0
+ // See iret, above.
+ MOVQ CPU_REGISTERS+PTRACE_SS(GS), BX; PUSHQ BX
+ MOVQ CPU_REGISTERS+PTRACE_RSP(GS), CX; PUSHQ CX
+ MOVQ CPU_REGISTERS+PTRACE_FLAGS(GS), DX; PUSHQ DX
+ MOVQ CPU_REGISTERS+PTRACE_CS(GS), DI; PUSHQ DI
+ MOVQ CPU_REGISTERS+PTRACE_RIP(GS), SI; PUSHQ SI
+ REGISTERS_LOAD(GS, CPU_REGISTERS)
+ MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX
+ IRET()
+
+// See entry_amd64.go.
+TEXT ·Start(SB),NOSPLIT,$0
+ LOAD_KERNEL_STACK(AX) // Set the stack.
+ PUSHQ $0x0 // Previous frame pointer.
+ MOVQ SP, BP // Set frame pointer.
+ PUSHQ AX // First argument (CPU).
+ CALL ·start(SB) // Call Go hook.
+ JMP ·resume(SB) // Restore to registers.
+
+// See entry_amd64.go.
+TEXT ·sysenter(SB),NOSPLIT,$0
+ // Interrupts are always disabled while we're executing in kernel mode
+ // and always enabled while executing in user mode. Therefore, we can
+ // reliably look at the flags in R11 to determine where this syscall
+ // was from.
+ TESTL $_RFLAGS_IF, R11
+ JZ kernel
+
+user:
+ SWAP_GS()
+ XCHGQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Swap stacks.
+ XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs).
+ REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX.
+ MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value.
+ MOVQ BX, PTRACE_RAX(AX) // Save everything else.
+ MOVQ BX, PTRACE_ORIGRAX(AX)
+ MOVQ CX, PTRACE_RIP(AX)
+ MOVQ R11, PTRACE_FLAGS(AX)
+ MOVQ CPU_REGISTERS+PTRACE_RSP(GS), BX; MOVQ BX, PTRACE_RSP(AX)
+ MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code.
+ MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user.
+
+ // Return to the kernel, where the frame is:
+ //
+ // vector (sp+24)
+ // regs (sp+16)
+ // cpu (sp+8)
+ // vcpu.Switch (sp+0)
+ //
+ MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer.
+ MOVQ $Syscall, 24(SP) // Output vector.
+ RET
+
+kernel:
+ // We can't restore the original stack, but we can access the registers
+ // in the CPU state directly. No need for temporary juggling.
+ MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS)
+ MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS)
+ REGISTERS_SAVE(GS, CPU_REGISTERS)
+ MOVQ CX, CPU_REGISTERS+PTRACE_RIP(GS)
+ MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(GS)
+ MOVQ SP, CPU_REGISTERS+PTRACE_RSP(GS)
+ MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code.
+ MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel.
+
+ // Load the function stored in KernelSyscall.
+ //
+ // Note that this function needs to be executed on the stack in case
+ // the runtime decides to make use of the redzone (grumble). This also
+ // protects against any functions that might not be go:nosplit, since
+ // this will cause a failure immediately.
+ LOAD_KERNEL_STACK(GS)
+ MOVQ CPU_KERNEL_SYSCALL(GS), DX // Function data.
+ MOVQ 0(DX), AX // Function pointer.
+ PUSHQ BP // Push the frame pointer.
+ MOVQ SP, BP // Set frame pointer value.
+ CALL *AX // Call the function.
+ POPQ BP // Restore the frame pointer.
+ JMP ·resume(SB)
+
+// exception is a generic exception handler.
+//
+// There are two cases handled:
+//
+// 1) An exception in kernel mode: this results in saving the state at the time
+// of the exception and calling the defined hook.
+//
+// 2) An exception in guest mode: the original kernel frame is restored, and
+// the vector & error codes are pushed as return values.
+//
+// See below for the stubs that call exception.
+TEXT ·exception(SB),NOSPLIT,$0
+ // Determine whether the exception occurred in kernel mode or user
+ // mode, based on the flags. We expect the following stack:
+ //
+ // SS (sp+48)
+ // SP (sp+40)
+ // FLAGS (sp+32)
+ // CS (sp+24)
+ // IP (sp+16)
+ // ERROR_CODE (sp+8)
+ // VECTOR (sp+0)
+ //
+ TESTL $_RFLAGS_IF, 32(SP)
+ JZ kernel
+
+user:
+ SWAP_GS()
+ XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs).
+ REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX.
+ MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value.
+ MOVQ BX, PTRACE_RAX(AX) // Save everything else.
+ MOVQ BX, PTRACE_ORIGRAX(AX)
+ MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX)
+ MOVQ 24(SP), CX; MOVQ CX, PTRACE_CS(AX)
+ MOVQ 32(SP), DX; MOVQ DX, PTRACE_FLAGS(AX)
+ MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX)
+ MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX)
+
+ // Copy out and return.
+ MOVQ 0(SP), BX // Load vector.
+ MOVQ 8(SP), CX // Load error code.
+ MOVQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Original stack (kernel version).
+ MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer.
+ MOVQ CX, CPU_ERROR_CODE(GS) // Set error code.
+ MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user.
+ MOVQ BX, 24(SP) // Output vector.
+ RET
+
+kernel:
+ // As per above, we can save directly.
+ MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS)
+ MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS)
+ REGISTERS_SAVE(GS, CPU_REGISTERS)
+ MOVQ 16(SP), AX; MOVQ AX, CPU_REGISTERS+PTRACE_RIP(GS)
+ MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(GS)
+ MOVQ 40(SP), CX; MOVQ CX, CPU_REGISTERS+PTRACE_RSP(GS)
+
+ // Set the error code and adjust the stack.
+ MOVQ 8(SP), AX // Load the error code.
+ MOVQ AX, CPU_ERROR_CODE(GS) // Copy out to the CPU.
+ MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel.
+ MOVQ 0(SP), BX // BX contains the vector.
+ ADDQ $48, SP // Drop the exception frame.
+
+ // Load the function stored in KernelException.
+ //
+ // See note above re: the kernel stack.
+ LOAD_KERNEL_STACK(GS)
+ MOVQ CPU_KERNEL_EXCEPTION(GS), DX // Function data.
+ MOVQ 0(DX), AX // Function pointer.
+ PUSHQ BP // Push the frame pointer.
+ MOVQ SP, BP // Set frame pointer value.
+ PUSHQ BX // First argument (vector).
+ CALL *AX // Call the function.
+ POPQ BX // Discard the argument.
+ POPQ BP // Restore the frame pointer.
+ JMP ·resume(SB)
+
+#define EXCEPTION_WITH_ERROR(value, symbol) \
+TEXT symbol,NOSPLIT,$0; \
+ PUSHQ $value; \
+ JMP ·exception(SB);
+
+#define EXCEPTION_WITHOUT_ERROR(value, symbol) \
+TEXT symbol,NOSPLIT,$0; \
+ PUSHQ $0x0; \
+ PUSHQ $value; \
+ JMP ·exception(SB);
+
+EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB))
+EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB))
+EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB))
+EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB))
+EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB))
+EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB))
+EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB))
+EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB))
+EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB))
+EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB))
+EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB))
+EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB))
+EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB))
+EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB))
+EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB))
+EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB))
+EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB))
+EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB))
+EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB))
+EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB))
+EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB))
+EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB))
diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD
new file mode 100644
index 000000000..3bce56985
--- /dev/null
+++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD
@@ -0,0 +1,25 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+go_template_instance(
+ name = "defs_impl",
+ out = "defs_impl.go",
+ package = "main",
+ template = "//pkg/sentry/platform/ring0:defs",
+)
+
+go_binary(
+ name = "gen_offsets",
+ srcs = [
+ "defs_impl.go",
+ "main.go",
+ ],
+ visibility = ["//pkg/sentry/platform/ring0:__pkg__"],
+ deps = [
+ "//pkg/cpuid",
+ "//pkg/sentry/platform/ring0/pagetables",
+ "//pkg/sentry/usermem",
+ ],
+)
diff --git a/pkg/sentry/platform/ring0/gen_offsets/main.go b/pkg/sentry/platform/ring0/gen_offsets/main.go
new file mode 100644
index 000000000..ffa7eaf77
--- /dev/null
+++ b/pkg/sentry/platform/ring0/gen_offsets/main.go
@@ -0,0 +1,24 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary gen_offsets is a helper for generating offset headers.
+package main
+
+import (
+ "os"
+)
+
+func main() {
+ Emit(os.Stdout)
+}
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go
new file mode 100644
index 000000000..b0471ab9a
--- /dev/null
+++ b/pkg/sentry/platform/ring0/kernel.go
@@ -0,0 +1,71 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ring0
+
+// New creates a new kernel.
+//
+// N.B. that constraints on KernelOpts must be satisfied.
+//
+// Init must have been called.
+func New(opts KernelOpts) *Kernel {
+ k := new(Kernel)
+ k.init(opts)
+ return k
+}
+
+// NewCPU creates a new CPU associated with this Kernel.
+//
+// Note that execution of the new CPU must begin at Start, with constraints as
+// documented. Initialization is not completed by this method alone.
+//
+// See also Init.
+func (k *Kernel) NewCPU() *CPU {
+ c := new(CPU)
+ c.Init(k)
+ return c
+}
+
+// Halt halts execution.
+func Halt()
+
+// Current returns the current CPU.
+//
+// Its use is only legal in the KernelSyscall and KernelException contexts,
+// which must all be guarded go:nosplit.
+func Current() *CPU
+
+// defaultSyscall is the default syscall hook.
+//
+//go:nosplit
+func defaultSyscall() { Halt() }
+
+// defaultException is the default exception hook.
+//
+//go:nosplit
+func defaultException(Vector) { Halt() }
+
+// Init allows the initialization of a CPU from a kernel without allocation.
+// The same constraints as NewCPU apply.
+//
+// Init allows embedding in other objects.
+func (c *CPU) Init(k *Kernel) {
+ c.self = c // Set self reference.
+ c.kernel = k // Set kernel reference.
+ c.init() // Perform architectural init.
+
+ // Defaults.
+ c.KernelSyscall = defaultSyscall
+ c.KernelException = defaultException
+}
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
new file mode 100644
index 000000000..c82613a9c
--- /dev/null
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -0,0 +1,280 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package ring0
+
+import (
+ "encoding/binary"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+)
+
+const (
+ // KernelFlagsSet should always be set in the kernel.
+ KernelFlagsSet = _RFLAGS_RESERVED
+
+ // UserFlagsSet are always set in userspace.
+ UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF
+
+ // KernelFlagsClear should always be clear in the kernel.
+ KernelFlagsClear = _RFLAGS_IF | _RFLAGS_NT | _RFLAGS_IOPL
+
+ // UserFlagsClear are always cleared in userspace.
+ UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL
+)
+
+// init initializes architecture-specific state.
+func (k *Kernel) init(opts KernelOpts) {
+ // Save the root page tables.
+ k.PageTables = opts.PageTables
+
+ // Setup the IDT, which is uniform.
+ for v, handler := range handlers {
+ // Note that we set all traps to use the interrupt stack, this
+ // is defined below when setting up the TSS.
+ k.globalIDT[v].setInterrupt(Kcode, uint64(kernelFunc(handler)), 0 /* dpl */, 1 /* ist */)
+ }
+}
+
+// init initializes architecture-specific state.
+func (c *CPU) init() {
+ // Null segment.
+ c.gdt[0].setNull()
+
+ // Kernel & user segments.
+ c.gdt[segKcode] = KernelCodeSegment
+ c.gdt[segKdata] = KernelDataSegment
+ c.gdt[segUcode32] = UserCodeSegment32
+ c.gdt[segUdata] = UserDataSegment
+ c.gdt[segUcode64] = UserCodeSegment64
+
+ // The task segment, this spans two entries.
+ tssBase, tssLimit, _ := c.TSS()
+ c.gdt[segTss].set(
+ uint32(tssBase),
+ uint32(tssLimit),
+ 0, // Privilege level zero.
+ SegmentDescriptorPresent|
+ SegmentDescriptorAccess|
+ SegmentDescriptorWrite|
+ SegmentDescriptorExecute)
+ c.gdt[segTssHi].setHi(uint32((tssBase) >> 32))
+
+ // Set the kernel stack pointer in the TSS (virtual address).
+ stackAddr := c.StackTop()
+ c.tss.rsp0Lo = uint32(stackAddr)
+ c.tss.rsp0Hi = uint32(stackAddr >> 32)
+ c.tss.ist1Lo = uint32(stackAddr)
+ c.tss.ist1Hi = uint32(stackAddr >> 32)
+
+ // Permanently set the kernel segments.
+ c.registers.Cs = uint64(Kcode)
+ c.registers.Ds = uint64(Kdata)
+ c.registers.Es = uint64(Kdata)
+ c.registers.Ss = uint64(Kdata)
+ c.registers.Fs = uint64(Kdata)
+ c.registers.Gs = uint64(Kdata)
+}
+
+// StackTop returns the kernel's stack address.
+//
+//go:nosplit
+func (c *CPU) StackTop() uint64 {
+ return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack))
+}
+
+// IDT returns the CPU's IDT base and limit.
+//
+//go:nosplit
+func (c *CPU) IDT() (uint64, uint16) {
+ return uint64(kernelAddr(&c.kernel.globalIDT[0])), uint16(binary.Size(&c.kernel.globalIDT) - 1)
+}
+
+// GDT returns the CPU's GDT base and limit.
+//
+//go:nosplit
+func (c *CPU) GDT() (uint64, uint16) {
+ return uint64(kernelAddr(&c.gdt[0])), uint16(8*segLast - 1)
+}
+
+// TSS returns the CPU's TSS base, limit and value.
+//
+//go:nosplit
+func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) {
+ return uint64(kernelAddr(&c.tss)), uint16(binary.Size(&c.tss) - 1), &c.gdt[segTss]
+}
+
+// CR0 returns the CPU's CR0 value.
+//
+//go:nosplit
+func (c *CPU) CR0() uint64 {
+ return _CR0_PE | _CR0_PG | _CR0_ET
+}
+
+// CR4 returns the CPU's CR4 value.
+//
+//go:nosplit
+func (c *CPU) CR4() uint64 {
+ cr4 := uint64(_CR4_PAE | _CR4_PSE | _CR4_OSFXSR | _CR4_OSXMMEXCPT)
+ if hasPCID {
+ cr4 |= _CR4_PCIDE
+ }
+ if hasXSAVE {
+ cr4 |= _CR4_OSXSAVE
+ }
+ if hasSMEP {
+ cr4 |= _CR4_SMEP
+ }
+ if hasFSGSBASE {
+ cr4 |= _CR4_FSGSBASE
+ }
+ return cr4
+}
+
+// EFER returns the CPU's EFER value.
+//
+//go:nosplit
+func (c *CPU) EFER() uint64 {
+ return _EFER_LME | _EFER_SCE | _EFER_NX
+}
+
+// IsCanonical indicates whether addr is canonical per the amd64 spec.
+//
+//go:nosplit
+func IsCanonical(addr uint64) bool {
+ return addr <= 0x00007fffffffffff || addr > 0xffff800000000000
+}
+
+// Flags contains flags related to switch.
+type Flags uintptr
+
+const (
+ // FlagFull indicates that a full restore should be not, not a fast
+ // restore (on the syscall return path.)
+ FlagFull = 1 << iota
+
+ // FlagFlush indicates that a full TLB flush is required.
+ FlagFlush
+)
+
+// SwitchToUser performs either a sysret or an iret.
+//
+// The return value is the vector that interrupted execution.
+//
+// This function will not split the stack. Callers will probably want to call
+// runtime.entersyscall (and pair with a call to runtime.exitsyscall) prior to
+// calling this function.
+//
+// When this is done, this region is quite sensitive to things like system
+// calls. After calling entersyscall, any memory used must have been allocated
+// and no function calls without go:nosplit are permitted. Any calls made here
+// are protected appropriately (e.g. IsCanonical and CR3).
+//
+// Also note that this function transitively depends on the compiler generating
+// code that uses IP-relative addressing inside of absolute addresses. That's
+// the case for amd64, but may not be the case for other architectures.
+//
+//go:nosplit
+func (c *CPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetables.PageTables, flags Flags) (vector Vector) {
+ // Check for canonical addresses.
+ if !IsCanonical(regs.Rip) || !IsCanonical(regs.Rsp) || !IsCanonical(regs.Fs_base) || !IsCanonical(regs.Gs_base) {
+ return GeneralProtectionFault
+ }
+
+ var (
+ userCR3 uint64
+ kernelCR3 uint64
+ )
+
+ // Sanitize registers.
+ if flags&FlagFlush != 0 {
+ userCR3 = pt.FlushCR3()
+ } else {
+ userCR3 = pt.CR3()
+ }
+ regs.Eflags &= ^uint64(UserFlagsClear)
+ regs.Eflags |= UserFlagsSet
+ regs.Cs = uint64(Ucode64) // Required for iret.
+ regs.Ss = uint64(Udata) // Ditto.
+ kernelCR3 = c.kernel.PageTables.CR3()
+
+ // Perform the switch.
+ swapgs() // GS will be swapped on return.
+ wrfs(uintptr(regs.Fs_base)) // Set application FS.
+ wrgs(uintptr(regs.Gs_base)) // Set application GS.
+ LoadFloatingPoint(fpState) // Copy in floating point.
+ jumpToKernel() // Switch to upper half.
+ writeCR3(uintptr(userCR3)) // Change to user address space.
+ if flags&FlagFull != 0 {
+ vector = iret(c, regs)
+ } else {
+ vector = sysret(c, regs)
+ }
+ writeCR3(uintptr(kernelCR3)) // Return to kernel address space.
+ jumpToUser() // Return to lower half.
+ SaveFloatingPoint(fpState) // Copy out floating point.
+ wrfs(uintptr(c.registers.Fs_base)) // Restore kernel FS.
+ return
+}
+
+// start is the CPU entrypoint.
+//
+// This is called from the Start asm stub (see entry_amd64.go); on return the
+// registers in c.registers will be restored (not segments).
+//
+//go:nosplit
+func start(c *CPU) {
+ // Save per-cpu & FS segment.
+ wrgs(kernelAddr(c))
+ wrfs(uintptr(c.Registers().Fs_base))
+
+ // Initialize floating point.
+ //
+ // Note that on skylake, the valid XCR0 mask reported seems to be 0xff.
+ // This breaks down as:
+ //
+ // bit0 - x87
+ // bit1 - SSE
+ // bit2 - AVX
+ // bit3-4 - MPX
+ // bit5-7 - AVX512
+ //
+ // For some reason, enabled MPX & AVX512 on platforms that report them
+ // seems to be cause a general protection fault. (Maybe there are some
+ // virtualization issues and these aren't exported to the guest cpuid.)
+ // This needs further investigation, but we can limit the floating
+ // point operations to x87, SSE & AVX for now.
+ fninit()
+ xsetbv(0, validXCR0Mask&0x7)
+
+ // Set the syscall target.
+ wrmsr(_MSR_LSTAR, kernelFunc(sysenter))
+ wrmsr(_MSR_SYSCALL_MASK, _RFLAGS_STEP|_RFLAGS_IF|_RFLAGS_DF|_RFLAGS_IOPL|_RFLAGS_AC|_RFLAGS_NT)
+
+ // NOTE: This depends on having the 64-bit segments immediately
+ // following the 32-bit user segments. This is simply the way the
+ // sysret instruction is designed to work (it assumes they follow).
+ wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48))
+ wrmsr(_MSR_CSTAR, kernelFunc(sysenter))
+}
+
+// ReadCR2 reads the current CR2 value.
+//
+//go:nosplit
+func ReadCR2() uintptr {
+ return readCR2()
+}
diff --git a/pkg/sentry/platform/ring0/kernel_unsafe.go b/pkg/sentry/platform/ring0/kernel_unsafe.go
new file mode 100644
index 000000000..cfb3ad853
--- /dev/null
+++ b/pkg/sentry/platform/ring0/kernel_unsafe.go
@@ -0,0 +1,41 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ring0
+
+import (
+ "unsafe"
+)
+
+// eface mirrors runtime.eface.
+type eface struct {
+ typ uintptr
+ data unsafe.Pointer
+}
+
+// kernelAddr returns the kernel virtual address for the given object.
+//
+//go:nosplit
+func kernelAddr(obj interface{}) uintptr {
+ e := (*eface)(unsafe.Pointer(&obj))
+ return KernelStartAddress | uintptr(e.data)
+}
+
+// kernelFunc returns the address of the given function.
+//
+//go:nosplit
+func kernelFunc(fn func()) uintptr {
+ fnptr := (**uintptr)(unsafe.Pointer(&fn))
+ return KernelStartAddress | **fnptr
+}
diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go
new file mode 100644
index 000000000..f1ed5bfb4
--- /dev/null
+++ b/pkg/sentry/platform/ring0/lib_amd64.go
@@ -0,0 +1,128 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package ring0
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+)
+
+// LoadFloatingPoint loads floating point state by the most efficient mechanism
+// available (set by Init).
+var LoadFloatingPoint func(*byte)
+
+// SaveFloatingPoint saves floating point state by the most efficient mechanism
+// available (set by Init).
+var SaveFloatingPoint func(*byte)
+
+// fxrstor uses fxrstor64 to load floating point state.
+func fxrstor(*byte)
+
+// xrstor uses xrstor to load floating point state.
+func xrstor(*byte)
+
+// fxsave uses fxsave64 to save floating point state.
+func fxsave(*byte)
+
+// xsave uses xsave to save floating point state.
+func xsave(*byte)
+
+// xsaveopt uses xsaveopt to save floating point state.
+func xsaveopt(*byte)
+
+// wrfs sets the GS address (set by init).
+var wrfs func(addr uintptr)
+
+// wrfsbase writes to the GS base address.
+func wrfsbase(addr uintptr)
+
+// wrfsmsr writes to the GS_BASE MSR.
+func wrfsmsr(addr uintptr)
+
+// wrgs sets the GS address (set by init).
+var wrgs func(addr uintptr)
+
+// wrgsbase writes to the GS base address.
+func wrgsbase(addr uintptr)
+
+// wrgsmsr writes to the GS_BASE MSR.
+func wrgsmsr(addr uintptr)
+
+// writeCR3 writes the CR3 value.
+func writeCR3(phys uintptr)
+
+// readCR2 reads the current CR2 value.
+func readCR2() uintptr
+
+// jumpToKernel jumps to the kernel version of the current RIP.
+func jumpToKernel()
+
+// jumpToUser jumps to the user version of the current RIP.
+func jumpToUser()
+
+// fninit initializes the floating point unit.
+func fninit()
+
+// xsetbv writes to an extended control register.
+func xsetbv(reg, value uintptr)
+
+// xgetbv reads an extended control register.
+func xgetbv(reg uintptr) uintptr
+
+// wrmsr reads to the given MSR.
+func wrmsr(reg, value uintptr)
+
+// rdmsr reads the given MSR.
+func rdmsr(reg uintptr) uintptr
+
+// Mostly-constants set by Init.
+var (
+ hasSMEP bool
+ hasPCID bool
+ hasXSAVEOPT bool
+ hasXSAVE bool
+ hasFSGSBASE bool
+ validXCR0Mask uintptr
+)
+
+// Init sets function pointers based on architectural features.
+//
+// This must be called prior to using ring0.
+func Init(featureSet *cpuid.FeatureSet) {
+ hasSMEP = featureSet.HasFeature(cpuid.X86FeatureSMEP)
+ hasPCID = featureSet.HasFeature(cpuid.X86FeaturePCID)
+ hasXSAVEOPT = featureSet.UseXsaveopt()
+ hasXSAVE = featureSet.UseXsave()
+ hasFSGSBASE = featureSet.HasFeature(cpuid.X86FeatureFSGSBase)
+ validXCR0Mask = uintptr(featureSet.ValidXCR0Mask())
+ if hasXSAVEOPT {
+ SaveFloatingPoint = xsaveopt
+ LoadFloatingPoint = xrstor
+ } else if hasXSAVE {
+ SaveFloatingPoint = xsave
+ LoadFloatingPoint = xrstor
+ } else {
+ SaveFloatingPoint = fxsave
+ LoadFloatingPoint = fxrstor
+ }
+ if hasFSGSBASE {
+ wrfs = wrfsbase
+ wrgs = wrgsbase
+ } else {
+ wrfs = wrfsmsr
+ wrgs = wrgsmsr
+ }
+}
diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s
new file mode 100644
index 000000000..6f143ea5a
--- /dev/null
+++ b/pkg/sentry/platform/ring0/lib_amd64.s
@@ -0,0 +1,247 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+// fxrstor loads floating point state.
+//
+// The code corresponds to:
+//
+// fxrstor64 (%rbx)
+//
+TEXT ·fxrstor(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), BX
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x0b;
+ RET
+
+// xrstor loads floating point state.
+//
+// The code corresponds to:
+//
+// xrstor (%rdi)
+//
+TEXT ·xrstor(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), DI
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f;
+ RET
+
+// fxsave saves floating point state.
+//
+// The code corresponds to:
+//
+// fxsave64 (%rbx)
+//
+TEXT ·fxsave(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), BX
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x03;
+ RET
+
+// xsave saves floating point state.
+//
+// The code corresponds to:
+//
+// xsave (%rdi)
+//
+TEXT ·xsave(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), DI
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27;
+ RET
+
+// xsaveopt saves floating point state.
+//
+// The code corresponds to:
+//
+// xsaveopt (%rdi)
+//
+TEXT ·xsaveopt(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), DI
+ MOVL $0xffffffff, AX
+ MOVL $0xffffffff, DX
+ BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37;
+ RET
+
+// wrfsbase writes to the FS base.
+//
+// The code corresponds to:
+//
+// wrfsbase %rax
+//
+TEXT ·wrfsbase(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), AX
+ BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd0;
+ RET
+
+// wrfsmsr writes to the FSBASE MSR.
+//
+// The code corresponds to:
+//
+// wrmsr (writes EDX:EAX to the MSR in ECX)
+//
+TEXT ·wrfsmsr(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), AX
+ MOVQ AX, DX
+ SHRQ $32, DX
+ MOVQ $0xc0000100, CX // MSR_FS_BASE
+ BYTE $0x0f; BYTE $0x30;
+ RET
+
+// wrgsbase writes to the GS base.
+//
+// The code corresponds to:
+//
+// wrgsbase %rax
+//
+TEXT ·wrgsbase(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), AX
+ BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd8;
+ RET
+
+// wrgsmsr writes to the GSBASE MSR.
+//
+// See wrfsmsr.
+TEXT ·wrgsmsr(SB),NOSPLIT,$0-8
+ MOVQ addr+0(FP), AX
+ MOVQ AX, DX
+ SHRQ $32, DX
+ MOVQ $0xc0000101, CX // MSR_GS_BASE
+ BYTE $0x0f; BYTE $0x30; // WRMSR
+ RET
+
+// jumpToUser changes execution to the user address.
+//
+// This works by changing the return value to the user version.
+TEXT ·jumpToUser(SB),NOSPLIT,$0
+ MOVQ 0(SP), AX
+ MOVQ ·KernelStartAddress(SB), BX
+ NOTQ BX
+ ANDQ BX, SP // Switch the stack.
+ ANDQ BX, BP // Switch the frame pointer.
+ ANDQ BX, AX // Future return value.
+ MOVQ AX, 0(SP)
+ RET
+
+// jumpToKernel changes execution to the kernel address space.
+//
+// This works by changing the return value to the kernel version.
+TEXT ·jumpToKernel(SB),NOSPLIT,$0
+ MOVQ 0(SP), AX
+ MOVQ ·KernelStartAddress(SB), BX
+ ORQ BX, SP // Switch the stack.
+ ORQ BX, BP // Switch the frame pointer.
+ ORQ BX, AX // Future return value.
+ MOVQ AX, 0(SP)
+ RET
+
+// writeCR3 writes the given CR3 value.
+//
+// The code corresponds to:
+//
+// mov %rax, %cr3
+//
+TEXT ·writeCR3(SB),NOSPLIT,$0-8
+ MOVQ cr3+0(FP), AX
+ BYTE $0x0f; BYTE $0x22; BYTE $0xd8;
+ RET
+
+// readCR3 reads the current CR3 value.
+//
+// The code corresponds to:
+//
+// mov %cr3, %rax
+//
+TEXT ·readCR3(SB),NOSPLIT,$0-8
+ BYTE $0x0f; BYTE $0x20; BYTE $0xd8;
+ MOVQ AX, ret+0(FP)
+ RET
+
+// readCR2 reads the current CR2 value.
+//
+// The code corresponds to:
+//
+// mov %cr2, %rax
+//
+TEXT ·readCR2(SB),NOSPLIT,$0-8
+ BYTE $0x0f; BYTE $0x20; BYTE $0xd0;
+ MOVQ AX, ret+0(FP)
+ RET
+
+// fninit initializes the floating point unit.
+//
+// The code corresponds to:
+//
+// fninit
+TEXT ·fninit(SB),NOSPLIT,$0
+ BYTE $0xdb; BYTE $0xe3;
+ RET
+
+// xsetbv writes to an extended control register.
+//
+// The code corresponds to:
+//
+// xsetbv
+//
+TEXT ·xsetbv(SB),NOSPLIT,$0-16
+ MOVL reg+0(FP), CX
+ MOVL value+8(FP), AX
+ MOVL value+12(FP), DX
+ BYTE $0x0f; BYTE $0x01; BYTE $0xd1;
+ RET
+
+// xgetbv reads an extended control register.
+//
+// The code corresponds to:
+//
+// xgetbv
+//
+TEXT ·xgetbv(SB),NOSPLIT,$0-16
+ MOVL reg+0(FP), CX
+ BYTE $0x0f; BYTE $0x01; BYTE $0xd0;
+ MOVL AX, ret+8(FP)
+ MOVL DX, ret+12(FP)
+ RET
+
+// wrmsr writes to a control register.
+//
+// The code corresponds to:
+//
+// wrmsr
+//
+TEXT ·wrmsr(SB),NOSPLIT,$0-16
+ MOVL reg+0(FP), CX
+ MOVL value+8(FP), AX
+ MOVL value+12(FP), DX
+ BYTE $0x0f; BYTE $0x30;
+ RET
+
+// rdmsr reads a control register.
+//
+// The code corresponds to:
+//
+// rdmsr
+//
+TEXT ·rdmsr(SB),NOSPLIT,$0-16
+ MOVL reg+0(FP), CX
+ BYTE $0x0f; BYTE $0x32;
+ MOVL AX, ret+8(FP)
+ MOVL DX, ret+12(FP)
+ RET
diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go
new file mode 100644
index 000000000..9acd442ba
--- /dev/null
+++ b/pkg/sentry/platform/ring0/offsets_amd64.go
@@ -0,0 +1,93 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package ring0
+
+import (
+ "fmt"
+ "io"
+ "reflect"
+ "syscall"
+)
+
+// Emit prints architecture-specific offsets.
+func Emit(w io.Writer) {
+ fmt.Fprintf(w, "// Automatically generated, do not edit.\n")
+
+ c := &CPU{}
+ fmt.Fprintf(w, "\n// CPU offsets.\n")
+ fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack)))
+ fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_KERNEL_EXCEPTION 0x%02x\n", reflect.ValueOf(&c.KernelException).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_KERNEL_SYSCALL 0x%02x\n", reflect.ValueOf(&c.KernelSyscall).Pointer()-reflect.ValueOf(c).Pointer())
+
+ fmt.Fprintf(w, "\n// Bits.\n")
+ fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF)
+
+ fmt.Fprintf(w, "\n// Vectors.\n")
+ fmt.Fprintf(w, "#define DivideByZero 0x%02x\n", DivideByZero)
+ fmt.Fprintf(w, "#define Debug 0x%02x\n", Debug)
+ fmt.Fprintf(w, "#define NMI 0x%02x\n", NMI)
+ fmt.Fprintf(w, "#define Breakpoint 0x%02x\n", Breakpoint)
+ fmt.Fprintf(w, "#define Overflow 0x%02x\n", Overflow)
+ fmt.Fprintf(w, "#define BoundRangeExceeded 0x%02x\n", BoundRangeExceeded)
+ fmt.Fprintf(w, "#define InvalidOpcode 0x%02x\n", InvalidOpcode)
+ fmt.Fprintf(w, "#define DeviceNotAvailable 0x%02x\n", DeviceNotAvailable)
+ fmt.Fprintf(w, "#define DoubleFault 0x%02x\n", DoubleFault)
+ fmt.Fprintf(w, "#define CoprocessorSegmentOverrun 0x%02x\n", CoprocessorSegmentOverrun)
+ fmt.Fprintf(w, "#define InvalidTSS 0x%02x\n", InvalidTSS)
+ fmt.Fprintf(w, "#define SegmentNotPresent 0x%02x\n", SegmentNotPresent)
+ fmt.Fprintf(w, "#define StackSegmentFault 0x%02x\n", StackSegmentFault)
+ fmt.Fprintf(w, "#define GeneralProtectionFault 0x%02x\n", GeneralProtectionFault)
+ fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault)
+ fmt.Fprintf(w, "#define X87FloatingPointException 0x%02x\n", X87FloatingPointException)
+ fmt.Fprintf(w, "#define AlignmentCheck 0x%02x\n", AlignmentCheck)
+ fmt.Fprintf(w, "#define MachineCheck 0x%02x\n", MachineCheck)
+ fmt.Fprintf(w, "#define SIMDFloatingPointException 0x%02x\n", SIMDFloatingPointException)
+ fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException)
+ fmt.Fprintf(w, "#define SecurityException 0x%02x\n", SecurityException)
+ fmt.Fprintf(w, "#define SyscallInt80 0x%02x\n", SyscallInt80)
+ fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall)
+
+ p := &syscall.PtraceRegs{}
+ fmt.Fprintf(w, "\n// Ptrace registers.\n")
+ fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.R13).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.R12).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RBP 0x%02x\n", reflect.ValueOf(&p.Rbp).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RBX 0x%02x\n", reflect.ValueOf(&p.Rbx).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.R11).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.R10).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.R9).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.R8).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RAX 0x%02x\n", reflect.ValueOf(&p.Rax).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RCX 0x%02x\n", reflect.ValueOf(&p.Rcx).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RDX 0x%02x\n", reflect.ValueOf(&p.Rdx).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RSI 0x%02x\n", reflect.ValueOf(&p.Rsi).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RDI 0x%02x\n", reflect.ValueOf(&p.Rdi).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_ORIGRAX 0x%02x\n", reflect.ValueOf(&p.Orig_rax).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RIP 0x%02x\n", reflect.ValueOf(&p.Rip).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_CS 0x%02x\n", reflect.ValueOf(&p.Cs).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_FS 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_GS 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer())
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
new file mode 100644
index 000000000..c0c481ab3
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -0,0 +1,32 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+ name = "pagetables",
+ srcs = [
+ "pagetables.go",
+ "pagetables_amd64.go",
+ "pagetables_unsafe.go",
+ "pagetables_x86.go",
+ "pcids_x86.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables",
+ visibility = [
+ "//pkg/sentry/platform/kvm:__subpackages__",
+ "//pkg/sentry/platform/ring0:__subpackages__",
+ ],
+ deps = ["//pkg/sentry/usermem"],
+)
+
+go_test(
+ name = "pagetables_test",
+ size = "small",
+ srcs = [
+ "pagetables_test.go",
+ "pagetables_x86_test.go",
+ "pcids_x86_test.go",
+ ],
+ embed = [":pagetables"],
+ deps = ["//pkg/sentry/usermem"],
+)
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
new file mode 100644
index 000000000..3cbf0bfa5
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -0,0 +1,193 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pagetables provides a generic implementation of pagetables.
+package pagetables
+
+import (
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// Node is a single node within a set of page tables.
+type Node struct {
+ // unalignedData has unaligned data. Unfortunately, we can't really
+ // rely on the allocator to give us what we want here. So we just throw
+ // it at the wall and use the portion that matches. Gross. This may be
+ // changed in the future to use a different allocation mechanism.
+ //
+ // Access must happen via functions found in pagetables_unsafe.go.
+ unalignedData [(2 * usermem.PageSize) - 1]byte
+
+ // physical is the translated address of these entries.
+ //
+ // This is filled in at creation time.
+ physical uintptr
+}
+
+// PageTables is a set of page tables.
+type PageTables struct {
+ mu sync.Mutex
+
+ // root is the pagetable root.
+ root *Node
+
+ // translater is the translater passed at creation.
+ translater Translater
+
+ // archPageTables includes architecture-specific features.
+ archPageTables
+
+ // allNodes is a set of nodes indexed by translater address.
+ allNodes map[uintptr]*Node
+}
+
+// Translater translates to guest physical addresses.
+type Translater interface {
+ // TranslateToPhysical translates the given pointer object into a
+ // "physical" address. We do not require that it translates back, the
+ // reverse mapping is maintained internally.
+ TranslateToPhysical(*PTEs) uintptr
+}
+
+// New returns new PageTables.
+func New(t Translater, opts Opts) *PageTables {
+ p := &PageTables{
+ translater: t,
+ allNodes: make(map[uintptr]*Node),
+ }
+ p.root = p.allocNode()
+ p.init(opts)
+ return p
+}
+
+// New returns a new set of PageTables derived from the given one.
+//
+// This function should always be preferred to New if there are existing
+// pagetables, as this function preserves architectural constraints relevant to
+// managing multiple sets of pagetables.
+func (p *PageTables) New() *PageTables {
+ np := &PageTables{
+ translater: p.translater,
+ allNodes: make(map[uintptr]*Node),
+ }
+ np.root = np.allocNode()
+ np.initFrom(&p.archPageTables)
+ return np
+}
+
+// setPageTable sets the given index as a page table.
+func (p *PageTables) setPageTable(n *Node, index int, child *Node) {
+ phys := p.translater.TranslateToPhysical(child.PTEs())
+ p.allNodes[phys] = child
+ pte := &n.PTEs()[index]
+ pte.setPageTable(phys)
+}
+
+// clearPageTable clears the given entry.
+func (p *PageTables) clearPageTable(n *Node, index int) {
+ pte := &n.PTEs()[index]
+ physical := pte.Address()
+ pte.Clear()
+ delete(p.allNodes, physical)
+}
+
+// getPageTable returns the page table entry.
+func (p *PageTables) getPageTable(n *Node, index int) *Node {
+ pte := &n.PTEs()[index]
+ physical := pte.Address()
+ child := p.allNodes[physical]
+ return child
+}
+
+// Map installs a mapping with the given physical address.
+//
+// True is returned iff there was a previous mapping in the range.
+//
+// Precondition: addr & length must be aligned, their sum must not overflow.
+func (p *PageTables) Map(addr usermem.Addr, length uintptr, user bool, at usermem.AccessType, physical uintptr) bool {
+ if at == usermem.NoAccess {
+ return p.Unmap(addr, length)
+ }
+ prev := false
+ p.mu.Lock()
+ end, ok := addr.AddLength(uint64(length))
+ if !ok {
+ panic("pagetables.Map: overflow")
+ }
+ p.iterateRange(uintptr(addr), uintptr(end), true, func(s, e uintptr, pte *PTE, align uintptr) {
+ p := physical + (s - uintptr(addr))
+ prev = prev || (pte.Valid() && (p != pte.Address() || at.Write != pte.Writeable() || at.Execute != pte.Executable()))
+ if p&align != 0 {
+ // We will install entries at a smaller granulaity if
+ // we don't install a valid entry here, however we must
+ // zap any existing entry to ensure this happens.
+ pte.Clear()
+ return
+ }
+ pte.Set(p, at.Write, at.Execute, user)
+ })
+ p.mu.Unlock()
+ return prev
+}
+
+// Unmap unmaps the given range.
+//
+// True is returned iff there was a previous mapping in the range.
+func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
+ p.mu.Lock()
+ count := 0
+ p.iterateRange(uintptr(addr), uintptr(addr)+length, false, func(s, e uintptr, pte *PTE, align uintptr) {
+ pte.Clear()
+ count++
+ })
+ p.mu.Unlock()
+ return count > 0
+}
+
+// Release releases this address space.
+//
+// This must be called to release the PCID.
+func (p *PageTables) Release() {
+ // Clear all pages.
+ p.Unmap(0, ^uintptr(0))
+ p.release()
+}
+
+// Lookup returns the physical address for the given virtual address.
+func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, accessType usermem.AccessType) {
+ mask := uintptr(usermem.PageSize - 1)
+ off := uintptr(addr) & mask
+ addr = addr &^ usermem.Addr(mask)
+ p.iterateRange(uintptr(addr), uintptr(addr+usermem.PageSize), false, func(s, e uintptr, pte *PTE, align uintptr) {
+ if !pte.Valid() {
+ return
+ }
+ physical = pte.Address() + (s - uintptr(addr)) + off
+ accessType = usermem.AccessType{
+ Read: true,
+ Write: pte.Writeable(),
+ Execute: pte.Executable(),
+ }
+ })
+ return physical, accessType
+}
+
+// allocNode allocates a new page.
+func (p *PageTables) allocNode() *Node {
+ n := new(Node)
+ n.physical = p.translater.TranslateToPhysical(n.PTEs())
+ return n
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
new file mode 100644
index 000000000..b89665c96
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
@@ -0,0 +1,397 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package pagetables
+
+import (
+ "fmt"
+ "sync/atomic"
+)
+
+// Address constraints.
+//
+// The lowerTop and upperBottom currently apply to four-level pagetables;
+// additional refactoring would be necessary to support five-level pagetables.
+const (
+ lowerTop = 0x00007fffffffffff
+ upperBottom = 0xffff800000000000
+
+ pteShift = 12
+ pmdShift = 21
+ pudShift = 30
+ pgdShift = 39
+
+ pteMask = 0x1ff << pteShift
+ pmdMask = 0x1ff << pmdShift
+ pudMask = 0x1ff << pudShift
+ pgdMask = 0x1ff << pgdShift
+
+ pteSize = 1 << pteShift
+ pmdSize = 1 << pmdShift
+ pudSize = 1 << pudShift
+ pgdSize = 1 << pgdShift
+)
+
+// Bits in page table entries.
+const (
+ present = 0x001
+ writable = 0x002
+ user = 0x004
+ writeThrough = 0x008
+ cacheDisable = 0x010
+ accessed = 0x020
+ dirty = 0x040
+ super = 0x080
+ executeDisable = 1 << 63
+)
+
+// PTE is a page table entry.
+type PTE uint64
+
+// Clear clears this PTE, including super page information.
+func (p *PTE) Clear() {
+ atomic.StoreUint64((*uint64)(p), 0)
+}
+
+// Valid returns true iff this entry is valid.
+func (p *PTE) Valid() bool {
+ return atomic.LoadUint64((*uint64)(p))&present != 0
+}
+
+// Writeable returns true iff the page is writable.
+func (p *PTE) Writeable() bool {
+ return atomic.LoadUint64((*uint64)(p))&writable != 0
+}
+
+// User returns true iff the page is user-accessible.
+func (p *PTE) User() bool {
+ return atomic.LoadUint64((*uint64)(p))&user != 0
+}
+
+// Executable returns true iff the page is executable.
+func (p *PTE) Executable() bool {
+ return atomic.LoadUint64((*uint64)(p))&executeDisable == 0
+}
+
+// SetSuper sets this page as a super page.
+//
+// The page must not be valid or a panic will result.
+func (p *PTE) SetSuper() {
+ if p.Valid() {
+ // This is not allowed.
+ panic("SetSuper called on valid page!")
+ }
+ atomic.StoreUint64((*uint64)(p), super)
+}
+
+// IsSuper returns true iff this page is a super page.
+func (p *PTE) IsSuper() bool {
+ return atomic.LoadUint64((*uint64)(p))&super != 0
+}
+
+// Set sets this PTE value.
+func (p *PTE) Set(addr uintptr, write, execute bool, userAccessible bool) {
+ v := uint64(addr)&^uint64(0xfff) | present | accessed
+ if userAccessible {
+ v |= user
+ }
+ if !execute {
+ v |= executeDisable
+ }
+ if write {
+ v |= writable | dirty
+ }
+ if p.IsSuper() {
+ v |= super
+ }
+ atomic.StoreUint64((*uint64)(p), v)
+}
+
+// setPageTable sets this PTE value and forces the write bit and super bit to
+// be cleared. This is used explicitly for breaking super pages.
+func (p *PTE) setPageTable(addr uintptr) {
+ v := uint64(addr)&^uint64(0xfff) | present | user | writable | accessed | dirty
+ atomic.StoreUint64((*uint64)(p), v)
+}
+
+// Address extracts the address. This should only be used if Valid returns true.
+func (p *PTE) Address() uintptr {
+ return uintptr(atomic.LoadUint64((*uint64)(p)) & ^uint64(executeDisable|0xfff))
+}
+
+// entriesPerPage is the number of PTEs per page.
+const entriesPerPage = 512
+
+// PTEs is a collection of entries.
+type PTEs [entriesPerPage]PTE
+
+// next returns the next address quantized by the given size.
+func next(start uint64, size uint64) uint64 {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If alloc is set, then Set _must_ be called on all given PTEs. The exception
+// is super pages. If a valid super page cannot be installed, then the walk
+// will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if alloc set, then no gaps will be present. However, if alloc is
+// not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: startAddr and endAddr must be page-aligned.
+//
+// Precondition: startStart must be less than endAddr.
+//
+// Precondition: If alloc is set, then startAddr and endAddr should not span
+// non-canonical ranges. If they do, a panic will result.
+func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn func(s, e uintptr, pte *PTE, align uintptr)) {
+ start := uint64(startAddr)
+ end := uint64(endAddr)
+ if start%pteSize != 0 {
+ panic(fmt.Sprintf("unaligned start: %v", start))
+ }
+ if start > end {
+ panic(fmt.Sprintf("start > end (%v > %v))", start, end))
+ }
+
+ // Deal with cases where we traverse the "gap".
+ //
+ // These are all explicitly disallowed if alloc is set, and we must
+ // traverse an entry for each address explicitly.
+ switch {
+ case start < lowerTop && end > lowerTop && end < upperBottom:
+ if alloc {
+ panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+ }
+ p.iterateRange(startAddr, lowerTop, false, fn)
+ return
+ case start < lowerTop && end > lowerTop:
+ if alloc {
+ panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+ }
+ p.iterateRange(startAddr, lowerTop, false, fn)
+ p.iterateRange(upperBottom, endAddr, false, fn)
+ return
+ case start > lowerTop && end < upperBottom:
+ if alloc {
+ panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+ }
+ return
+ case start > lowerTop && start < upperBottom && end > upperBottom:
+ if alloc {
+ panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+ }
+ p.iterateRange(upperBottom, endAddr, false, fn)
+ return
+ }
+
+ for pgdIndex := int((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ pgdEntry := &p.root.PTEs()[pgdIndex]
+ if !pgdEntry.Valid() {
+ if !alloc {
+ // Skip over this entry.
+ start = next(start, pgdSize)
+ continue
+ }
+
+ // Allocate a new pgd.
+ p.setPageTable(p.root, pgdIndex, p.allocNode())
+ }
+
+ // Map the next level.
+ pudNode := p.getPageTable(p.root, pgdIndex)
+ clearPUDEntries := 0
+
+ for pudIndex := int((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ pudEntry := &(pudNode.PTEs()[pudIndex])
+ if !pudEntry.Valid() {
+ if !alloc {
+ // Skip over this entry.
+ clearPUDEntries++
+ start = next(start, pudSize)
+ continue
+ }
+
+ // This level has 1-GB super pages. Is this
+ // entire region contained in a single PUD
+ // entry? If so, we can skip allocating a new
+ // page for the pmd.
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSuper()
+ fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1)
+ if pudEntry.Valid() {
+ start = next(start, pudSize)
+ continue
+ }
+ }
+
+ // Allocate a new pud.
+ p.setPageTable(pudNode, pudIndex, p.allocNode())
+
+ } else if pudEntry.IsSuper() {
+ // Does this page need to be split?
+ if start&(pudSize-1) != 0 || end < next(start, pudSize) {
+ currentAddr := uint64(pudEntry.Address())
+ writeable := pudEntry.Writeable()
+ executable := pudEntry.Executable()
+ user := pudEntry.User()
+
+ // Install the relevant entries.
+ pmdNode := p.allocNode()
+ pmdEntries := pmdNode.PTEs()
+ for index := 0; index < entriesPerPage; index++ {
+ pmdEntry := &pmdEntries[index]
+ pmdEntry.SetSuper()
+ pmdEntry.Set(uintptr(currentAddr), writeable, executable, user)
+ currentAddr += pmdSize
+ }
+
+ // Reset to point to the new page.
+ p.setPageTable(pudNode, pudIndex, pmdNode)
+ } else {
+ // A super page to be checked directly.
+ fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1)
+
+ // Might have been cleared.
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ // Note that the super page was changed.
+ start = next(start, pudSize)
+ continue
+ }
+ }
+
+ // Map the next level, since this is valid.
+ pmdNode := p.getPageTable(pudNode, pudIndex)
+ clearPMDEntries := 0
+
+ for pmdIndex := int((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ pmdEntry := &pmdNode.PTEs()[pmdIndex]
+ if !pmdEntry.Valid() {
+ if !alloc {
+ // Skip over this entry.
+ clearPMDEntries++
+ start = next(start, pmdSize)
+ continue
+ }
+
+ // This level has 2-MB huge pages. If this
+ // region is contined in a single PMD entry?
+ // As above, we can skip allocating a new page.
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSuper()
+ fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1)
+ if pmdEntry.Valid() {
+ start = next(start, pmdSize)
+ continue
+ }
+ }
+
+ // Allocate a new pmd.
+ p.setPageTable(pmdNode, pmdIndex, p.allocNode())
+
+ } else if pmdEntry.IsSuper() {
+ // Does this page need to be split?
+ if start&(pmdSize-1) != 0 || end < next(start, pmdSize) {
+ currentAddr := uint64(pmdEntry.Address())
+ writeable := pmdEntry.Writeable()
+ executable := pmdEntry.Executable()
+ user := pmdEntry.User()
+
+ // Install the relevant entries.
+ pteNode := p.allocNode()
+ pteEntries := pteNode.PTEs()
+ for index := 0; index < entriesPerPage; index++ {
+ pteEntry := &pteEntries[index]
+ pteEntry.Set(uintptr(currentAddr), writeable, executable, user)
+ currentAddr += pteSize
+ }
+
+ // Reset to point to the new page.
+ p.setPageTable(pmdNode, pmdIndex, pteNode)
+ } else {
+ // A huge page to be checked directly.
+ fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1)
+
+ // Might have been cleared.
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ // Note that the huge page was changed.
+ start = next(start, pmdSize)
+ continue
+ }
+ }
+
+ // Map the next level, since this is valid.
+ pteNode := p.getPageTable(pmdNode, pmdIndex)
+ clearPTEEntries := 0
+
+ for pteIndex := int((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ pteEntry := &pteNode.PTEs()[pteIndex]
+ if !pteEntry.Valid() && !alloc {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ // At this point, we are guaranteed that start%pteSize == 0.
+ fn(uintptr(start), uintptr(start+pteSize), pteEntry, pteSize-1)
+ if !pteEntry.Valid() {
+ if alloc {
+ panic("PTE not set after iteration with alloc=true!")
+ }
+ clearPTEEntries++
+ }
+
+ // Note that the pte was changed.
+ start += pteSize
+ continue
+ }
+
+ // Check if we no longer need this page.
+ if clearPTEEntries == entriesPerPage {
+ p.clearPageTable(pmdNode, pmdIndex)
+ clearPMDEntries++
+ }
+ }
+
+ // Check if we no longer need this page.
+ if clearPMDEntries == entriesPerPage {
+ p.clearPageTable(pudNode, pudIndex)
+ clearPUDEntries++
+ }
+ }
+
+ // Check if we no longer need this page.
+ if clearPUDEntries == entriesPerPage {
+ p.clearPageTable(p.root, pgdIndex)
+ }
+ }
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
new file mode 100644
index 000000000..9cbc0e3b0
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
@@ -0,0 +1,161 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+ "reflect"
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+type reflectTranslater struct{}
+
+func (r reflectTranslater) TranslateToPhysical(ptes *PTEs) uintptr {
+ return reflect.ValueOf(ptes).Pointer()
+}
+
+type mapping struct {
+ start uintptr
+ length uintptr
+ addr uintptr
+ writeable bool
+}
+
+func checkMappings(t *testing.T, pt *PageTables, m []mapping) {
+ var (
+ current int
+ found []mapping
+ failed string
+ )
+
+ // Iterate over all the mappings.
+ pt.iterateRange(0, ^uintptr(0), false, func(s, e uintptr, pte *PTE, align uintptr) {
+ found = append(found, mapping{
+ start: s,
+ length: e - s,
+ addr: pte.Address(),
+ writeable: pte.Writeable(),
+ })
+ if failed != "" {
+ // Don't keep looking for errors.
+ return
+ }
+
+ if current >= len(m) {
+ failed = "more mappings than expected"
+ } else if m[current].start != s {
+ failed = "start didn't match expected"
+ } else if m[current].length != (e - s) {
+ failed = "end didn't match expected"
+ } else if m[current].addr != pte.Address() {
+ failed = "address didn't match expected"
+ } else if m[current].writeable != pte.Writeable() {
+ failed = "writeable didn't match"
+ }
+ current++
+ })
+
+ // Were we expected additional mappings?
+ if failed == "" && current != len(m) {
+ failed = "insufficient mappings found"
+ }
+
+ // Emit a meaningful error message on failure.
+ if failed != "" {
+ t.Errorf("%s; got %#v, wanted %#v", failed, found, m)
+ }
+}
+
+func TestAllocFree(t *testing.T) {
+ pt := New(reflectTranslater{}, Opts{})
+ pt.Release()
+}
+
+func TestUnmap(t *testing.T) {
+ pt := New(reflectTranslater{}, Opts{})
+
+ // Map and unmap one entry.
+ pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
+ pt.Unmap(0x400000, pteSize)
+
+ checkMappings(t, pt, nil)
+ pt.Release()
+}
+
+func TestReadOnly(t *testing.T) {
+ pt := New(reflectTranslater{}, Opts{})
+
+ // Map one entry.
+ pt.Map(0x400000, pteSize, true, usermem.Read, pteSize*42)
+
+ checkMappings(t, pt, []mapping{
+ {0x400000, pteSize, pteSize * 42, false},
+ })
+ pt.Release()
+}
+
+func TestReadWrite(t *testing.T) {
+ pt := New(reflectTranslater{}, Opts{})
+
+ // Map one entry.
+ pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
+
+ checkMappings(t, pt, []mapping{
+ {0x400000, pteSize, pteSize * 42, true},
+ })
+ pt.Release()
+}
+
+func TestSerialEntries(t *testing.T) {
+ pt := New(reflectTranslater{}, Opts{})
+
+ // Map two sequential entries.
+ pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
+ pt.Map(0x401000, pteSize, true, usermem.ReadWrite, pteSize*47)
+
+ checkMappings(t, pt, []mapping{
+ {0x400000, pteSize, pteSize * 42, true},
+ {0x401000, pteSize, pteSize * 47, true},
+ })
+ pt.Release()
+}
+
+func TestSpanningEntries(t *testing.T) {
+ pt := New(reflectTranslater{}, Opts{})
+
+ // Span a pgd with two pages.
+ pt.Map(0x00007efffffff000, 2*pteSize, true, usermem.Read, pteSize*42)
+
+ checkMappings(t, pt, []mapping{
+ {0x00007efffffff000, pteSize, pteSize * 42, false},
+ {0x00007f0000000000, pteSize, pteSize * 43, false},
+ })
+ pt.Release()
+}
+
+func TestSparseEntries(t *testing.T) {
+ pt := New(reflectTranslater{}, Opts{})
+
+ // Map two entries in different pgds.
+ pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
+ pt.Map(0x00007f0000000000, pteSize, true, usermem.Read, pteSize*47)
+
+ checkMappings(t, pt, []mapping{
+ {0x400000, pteSize, pteSize * 42, true},
+ {0x00007f0000000000, pteSize, pteSize * 47, false},
+ })
+ pt.Release()
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go b/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go
new file mode 100644
index 000000000..a2b44fb79
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go
@@ -0,0 +1,31 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// PTEs returns aligned PTE entries.
+func (n *Node) PTEs() *PTEs {
+ addr := uintptr(unsafe.Pointer(&n.unalignedData[0]))
+ offset := addr & (usermem.PageSize - 1)
+ if offset != 0 {
+ offset = usermem.PageSize - offset
+ }
+ return (*PTEs)(unsafe.Pointer(addr + offset))
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
new file mode 100644
index 000000000..dac66373f
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
@@ -0,0 +1,79 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+// Opts are pagetable options.
+type Opts struct {
+ EnablePCID bool
+}
+
+// archPageTables has x86-specific features.
+type archPageTables struct {
+ // pcids is the PCID database.
+ pcids *PCIDs
+
+ // pcid is the globally unique identifier, or zero if none were
+ // available or pcids is nil.
+ pcid uint16
+}
+
+// init initializes arch-specific features.
+func (a *archPageTables) init(opts Opts) {
+ if opts.EnablePCID {
+ a.pcids = NewPCIDs()
+ a.pcid = a.pcids.allocate()
+ }
+}
+
+// initFrom initializes arch-specific features from an existing entry.'
+func (a *archPageTables) initFrom(other *archPageTables) {
+ a.pcids = other.pcids // Refer to the same PCID database.
+ if a.pcids != nil {
+ a.pcid = a.pcids.allocate()
+ }
+}
+
+// release is called from Release.
+func (a *archPageTables) release() {
+ // Return the PCID.
+ if a.pcids != nil {
+ a.pcids.free(a.pcid)
+ }
+}
+
+// CR3 returns the CR3 value for these tables.
+//
+// This may be called in interrupt contexts.
+//
+//go:nosplit
+func (p *PageTables) CR3() uint64 {
+ // Bit 63 is set to avoid flushing the PCID (per SDM 4.10.4.1).
+ const noFlushBit uint64 = 0x8000000000000000
+ if p.pcid != 0 {
+ return noFlushBit | uint64(p.root.physical) | uint64(p.pcid)
+ }
+ return uint64(p.root.physical)
+}
+
+// FlushCR3 returns the CR3 value that flushes the TLB.
+//
+// This may be called in interrupt contexts.
+//
+//go:nosplit
+func (p *PageTables) FlushCR3() uint64 {
+ return uint64(p.root.physical) | uint64(p.pcid)
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go
new file mode 100644
index 000000000..1fc403c48
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go
@@ -0,0 +1,79 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+import (
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+func Test2MAnd4K(t *testing.T) {
+ pt := New(reflectTranslater{}, Opts{})
+
+ // Map a small page and a huge page.
+ pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
+ pt.Map(0x00007f0000000000, 1<<21, true, usermem.Read, pmdSize*47)
+
+ checkMappings(t, pt, []mapping{
+ {0x400000, pteSize, pteSize * 42, true},
+ {0x00007f0000000000, pmdSize, pmdSize * 47, false},
+ })
+ pt.Release()
+}
+
+func Test1GAnd4K(t *testing.T) {
+ pt := New(reflectTranslater{}, Opts{})
+
+ // Map a small page and a super page.
+ pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
+ pt.Map(0x00007f0000000000, pudSize, true, usermem.Read, pudSize*47)
+
+ checkMappings(t, pt, []mapping{
+ {0x400000, pteSize, pteSize * 42, true},
+ {0x00007f0000000000, pudSize, pudSize * 47, false},
+ })
+ pt.Release()
+}
+
+func TestSplit1GPage(t *testing.T) {
+ pt := New(reflectTranslater{}, Opts{})
+
+ // Map a super page and knock out the middle.
+ pt.Map(0x00007f0000000000, pudSize, true, usermem.Read, pudSize*42)
+ pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pudSize-(2*pteSize))
+
+ checkMappings(t, pt, []mapping{
+ {0x00007f0000000000, pteSize, pudSize * 42, false},
+ {0x00007f0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, false},
+ })
+ pt.Release()
+}
+
+func TestSplit2MPage(t *testing.T) {
+ pt := New(reflectTranslater{}, Opts{})
+
+ // Map a huge page and knock out the middle.
+ pt.Map(0x00007f0000000000, pmdSize, true, usermem.Read, pmdSize*42)
+ pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pmdSize-(2*pteSize))
+
+ checkMappings(t, pt, []mapping{
+ {0x00007f0000000000, pteSize, pmdSize * 42, false},
+ {0x00007f0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, false},
+ })
+ pt.Release()
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
new file mode 100644
index 000000000..509e8c0d9
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
@@ -0,0 +1,74 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+import (
+ "sync"
+)
+
+// maxPCID is the maximum allowed PCID.
+const maxPCID = 4095
+
+// PCIDs is a simple PCID database.
+type PCIDs struct {
+ mu sync.Mutex
+
+ // last is the last fresh PCID given out (not including the available
+ // pool). If last >= maxPCID, then the only PCIDs available in the
+ // available pool below.
+ last uint16
+
+ // available are PCIDs that have been freed.
+ available map[uint16]struct{}
+}
+
+// NewPCIDs returns a new PCID set.
+func NewPCIDs() *PCIDs {
+ return &PCIDs{
+ available: make(map[uint16]struct{}),
+ }
+}
+
+// allocate returns an unused PCID, or zero if all are taken.
+func (p *PCIDs) allocate() uint16 {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ if len(p.available) > 0 {
+ for id := range p.available {
+ delete(p.available, id)
+ return id
+ }
+ }
+ if id := p.last + 1; id <= maxPCID {
+ p.last = id
+ return id
+ }
+ // Nothing available.
+ return 0
+}
+
+// free returns a PCID to the pool.
+//
+// It is safe to call free with a zero pcid. That is, you may always call free
+// with anything returned by allocate.
+func (p *PCIDs) free(id uint16) {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+ if id != 0 {
+ p.available[id] = struct{}{}
+ }
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go
new file mode 100644
index 000000000..0b555cd76
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go
@@ -0,0 +1,65 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+import (
+ "testing"
+)
+
+func TestMaxPCID(t *testing.T) {
+ p := NewPCIDs()
+ for i := 0; i < maxPCID; i++ {
+ if id := p.allocate(); id != uint16(i+1) {
+ t.Errorf("got %d, expected %d", id, i+1)
+ }
+ }
+ if id := p.allocate(); id != 0 {
+ if id != 0 {
+ t.Errorf("got %d, expected 0", id)
+ }
+ }
+}
+
+func TestFirstPCID(t *testing.T) {
+ p := NewPCIDs()
+ if id := p.allocate(); id != 1 {
+ t.Errorf("got %d, expected 1", id)
+ }
+}
+
+func TestFreePCID(t *testing.T) {
+ p := NewPCIDs()
+ p.free(0)
+ if id := p.allocate(); id != 1 {
+ t.Errorf("got %d, expected 1 (not zero)", id)
+ }
+}
+
+func TestReusePCID(t *testing.T) {
+ p := NewPCIDs()
+ id := p.allocate()
+ if id != 1 {
+ t.Errorf("got %d, expected 1", id)
+ }
+ p.free(id)
+ if id := p.allocate(); id != 1 {
+ t.Errorf("got %d, expected 1", id)
+ }
+ if id := p.allocate(); id != 2 {
+ t.Errorf("got %d, expected 2", id)
+ }
+}
diff --git a/pkg/sentry/platform/ring0/ring0.go b/pkg/sentry/platform/ring0/ring0.go
new file mode 100644
index 000000000..4991031c5
--- /dev/null
+++ b/pkg/sentry/platform/ring0/ring0.go
@@ -0,0 +1,16 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ring0 provides basic operating system-level stubs.
+package ring0
diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go
new file mode 100644
index 000000000..e16f6c599
--- /dev/null
+++ b/pkg/sentry/platform/ring0/x86.go
@@ -0,0 +1,242 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package ring0
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+)
+
+// Useful bits.
+const (
+ _CR0_PE = 1 << 0
+ _CR0_ET = 1 << 4
+ _CR0_PG = 1 << 31
+
+ _CR4_PSE = 1 << 4
+ _CR4_PAE = 1 << 5
+ _CR4_PGE = 1 << 7
+ _CR4_OSFXSR = 1 << 9
+ _CR4_OSXMMEXCPT = 1 << 10
+ _CR4_FSGSBASE = 1 << 16
+ _CR4_PCIDE = 1 << 17
+ _CR4_OSXSAVE = 1 << 18
+ _CR4_SMEP = 1 << 20
+
+ _RFLAGS_AC = 1 << 18
+ _RFLAGS_NT = 1 << 14
+ _RFLAGS_IOPL = 3 << 12
+ _RFLAGS_DF = 1 << 10
+ _RFLAGS_IF = 1 << 9
+ _RFLAGS_STEP = 1 << 8
+ _RFLAGS_RESERVED = 1 << 1
+
+ _EFER_SCE = 0x001
+ _EFER_LME = 0x100
+ _EFER_NX = 0x800
+
+ _MSR_STAR = 0xc0000081
+ _MSR_LSTAR = 0xc0000082
+ _MSR_CSTAR = 0xc0000083
+ _MSR_SYSCALL_MASK = 0xc0000084
+)
+
+// Vector is an exception vector.
+type Vector uintptr
+
+// Exception vectors.
+const (
+ DivideByZero Vector = iota
+ Debug
+ NMI
+ Breakpoint
+ Overflow
+ BoundRangeExceeded
+ InvalidOpcode
+ DeviceNotAvailable
+ DoubleFault
+ CoprocessorSegmentOverrun
+ InvalidTSS
+ SegmentNotPresent
+ StackSegmentFault
+ GeneralProtectionFault
+ PageFault
+ _
+ X87FloatingPointException
+ AlignmentCheck
+ MachineCheck
+ SIMDFloatingPointException
+ VirtualizationException
+ SecurityException = 0x1e
+ SyscallInt80 = 0x80
+ _NR_INTERRUPTS = SyscallInt80 + 1
+)
+
+// System call vectors.
+const (
+ Syscall Vector = _NR_INTERRUPTS
+)
+
+// VirtualAddressBits returns the number bits available for virtual addresses.
+//
+// Note that sign-extension semantics apply to the highest order bit.
+//
+// FIXME: This should use the cpuid passed to Init.
+func VirtualAddressBits() uint32 {
+ ax, _, _, _ := cpuid.HostID(0x80000008, 0)
+ return (ax >> 8) & 0xff
+}
+
+// PhysicalAddressBits returns the number of bits available for physical addresses.
+//
+// FIXME: This should use the cpuid passed to Init.
+func PhysicalAddressBits() uint32 {
+ ax, _, _, _ := cpuid.HostID(0x80000008, 0)
+ return ax & 0xff
+}
+
+// Selector is a segment Selector.
+type Selector uint16
+
+// SegmentDescriptor is a segment descriptor.
+type SegmentDescriptor struct {
+ bits [2]uint32
+}
+
+// descriptorTable is a collection of descriptors.
+type descriptorTable [32]SegmentDescriptor
+
+// SegmentDescriptorFlags are typed flags within a descriptor.
+type SegmentDescriptorFlags uint32
+
+// SegmentDescriptorFlag declarations.
+const (
+ SegmentDescriptorAccess SegmentDescriptorFlags = 1 << 8 // Access bit (always set).
+ SegmentDescriptorWrite = 1 << 9 // Write permission.
+ SegmentDescriptorExpandDown = 1 << 10 // Grows down, not used.
+ SegmentDescriptorExecute = 1 << 11 // Execute permission.
+ SegmentDescriptorSystem = 1 << 12 // Zero => system, 1 => user code/data.
+ SegmentDescriptorPresent = 1 << 15 // Present.
+ SegmentDescriptorAVL = 1 << 20 // Available.
+ SegmentDescriptorLong = 1 << 21 // Long mode.
+ SegmentDescriptorDB = 1 << 22 // 16 or 32-bit.
+ SegmentDescriptorG = 1 << 23 // Granularity: page or byte.
+)
+
+// Base returns the descriptor's base linear address.
+func (d *SegmentDescriptor) Base() uint32 {
+ return d.bits[1]&0xFF000000 | (d.bits[1]&0x000000FF)<<16 | d.bits[0]>>16
+}
+
+// Limit returns the descriptor size.
+func (d *SegmentDescriptor) Limit() uint32 {
+ l := d.bits[0]&0xFFFF | d.bits[1]&0xF0000
+ if d.bits[1]&uint32(SegmentDescriptorG) != 0 {
+ l <<= 12
+ l |= 0xFFF
+ }
+ return l
+}
+
+// Flags returns descriptor flags.
+func (d *SegmentDescriptor) Flags() SegmentDescriptorFlags {
+ return SegmentDescriptorFlags(d.bits[1] & 0x00F09F00)
+}
+
+// DPL returns the descriptor privilege level.
+func (d *SegmentDescriptor) DPL() int {
+ return int((d.bits[1] >> 13) & 3)
+}
+
+func (d *SegmentDescriptor) setNull() {
+ d.bits[0] = 0
+ d.bits[1] = 0
+}
+
+func (d *SegmentDescriptor) set(base, limit uint32, dpl int, flags SegmentDescriptorFlags) {
+ flags |= SegmentDescriptorPresent
+ if limit>>12 != 0 {
+ limit >>= 12
+ flags |= SegmentDescriptorG
+ }
+ d.bits[0] = base<<16 | limit&0xFFFF
+ d.bits[1] = base&0xFF000000 | (base>>16)&0xFF | limit&0x000F0000 | uint32(flags) | uint32(dpl)<<13
+}
+
+func (d *SegmentDescriptor) setCode32(base, limit uint32, dpl int) {
+ d.set(base, limit, dpl,
+ SegmentDescriptorDB|
+ SegmentDescriptorExecute|
+ SegmentDescriptorSystem)
+}
+
+func (d *SegmentDescriptor) setCode64(base, limit uint32, dpl int) {
+ d.set(base, limit, dpl,
+ SegmentDescriptorG|
+ SegmentDescriptorLong|
+ SegmentDescriptorExecute|
+ SegmentDescriptorSystem)
+}
+
+func (d *SegmentDescriptor) setData(base, limit uint32, dpl int) {
+ d.set(base, limit, dpl,
+ SegmentDescriptorWrite|
+ SegmentDescriptorSystem)
+}
+
+// setHi is only used for the TSS segment, which is magically 64-bits.
+func (d *SegmentDescriptor) setHi(base uint32) {
+ d.bits[0] = base
+ d.bits[1] = 0
+}
+
+// Gate64 is a 64-bit task, trap, or interrupt gate.
+type Gate64 struct {
+ bits [4]uint32
+}
+
+// idt64 is a 64-bit interrupt descriptor table.
+type idt64 [_NR_INTERRUPTS]Gate64
+
+func (g *Gate64) setInterrupt(cs Selector, rip uint64, dpl int, ist int) {
+ g.bits[0] = uint32(cs)<<16 | uint32(rip)&0xFFFF
+ g.bits[1] = uint32(rip)&0xFFFF0000 | SegmentDescriptorPresent | uint32(dpl)<<13 | 14<<8 | uint32(ist)&0x7
+ g.bits[2] = uint32(rip >> 32)
+}
+
+func (g *Gate64) setTrap(cs Selector, rip uint64, dpl int, ist int) {
+ g.setInterrupt(cs, rip, dpl, ist)
+ g.bits[1] |= 1 << 8
+}
+
+// TaskState64 is a 64-bit task state structure.
+type TaskState64 struct {
+ _ uint32
+ rsp0Lo, rsp0Hi uint32
+ rsp1Lo, rsp1Hi uint32
+ rsp2Lo, rsp2Hi uint32
+ _ [2]uint32
+ ist1Lo, ist1Hi uint32
+ ist2Lo, ist2Hi uint32
+ ist3Lo, ist3Hi uint32
+ ist4Lo, ist4Hi uint32
+ ist5Lo, ist5Hi uint32
+ ist6Lo, ist6Hi uint32
+ ist7Lo, ist7Hi uint32
+ _ [2]uint32
+ _ uint16
+ ioPerm uint16
+}