diff options
author | Googler <noreply@google.com> | 2018-04-27 10:37:02 -0700 |
---|---|---|
committer | Adin Scannell <ascannell@google.com> | 2018-04-28 01:44:26 -0400 |
commit | d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch) | |
tree | 54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/platform/ring0 | |
parent | f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff) |
Check in gVisor.
PiperOrigin-RevId: 194583126
Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/sentry/platform/ring0')
24 files changed, 2998 insertions, 0 deletions
diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD new file mode 100644 index 000000000..2df232a64 --- /dev/null +++ b/pkg/sentry/platform/ring0/BUILD @@ -0,0 +1,52 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") + +go_template( + name = "defs", + srcs = [ + "defs.go", + "defs_amd64.go", + "offsets_amd64.go", + "x86.go", + ], + visibility = [":__subpackages__"], +) + +go_template_instance( + name = "defs_impl", + out = "defs_impl.go", + package = "ring0", + template = ":defs", +) + +genrule( + name = "entry_impl_amd64", + srcs = ["entry_amd64.s"], + outs = ["entry_impl_amd64.s"], + cmd = "(echo -e '// build +amd64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@", + tools = ["//pkg/sentry/platform/ring0/gen_offsets"], +) + +go_library( + name = "ring0", + srcs = [ + "defs_impl.go", + "entry_amd64.go", + "entry_impl_amd64.s", + "kernel.go", + "kernel_amd64.go", + "kernel_unsafe.go", + "lib_amd64.go", + "lib_amd64.s", + "ring0.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/cpuid", + "//pkg/sentry/platform/ring0/pagetables", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go new file mode 100644 index 000000000..9d947b73d --- /dev/null +++ b/pkg/sentry/platform/ring0/defs.go @@ -0,0 +1,93 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ring0 + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +var ( + // UserspaceSize is the total size of userspace. + UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1) + + // MaximumUserAddress is the largest possible user address. + MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1) + + // KernelStartAddress is the starting kernel address. + KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) +) + +// Kernel is a global kernel object. +// +// This contains global state, shared by multiple CPUs. +type Kernel struct { + KernelArchState +} + +// CPU is the per-CPU struct. +type CPU struct { + // self is a self reference. + // + // This is always guaranteed to be at offset zero. + self *CPU + + // kernel is reference to the kernel that this CPU was initialized + // with. This reference is kept for garbage collection purposes: CPU + // registers may refer to objects within the Kernel object that cannot + // be safely freed. + kernel *Kernel + + // CPUArchState is architecture-specific state. + CPUArchState + + // registers is a set of registers; these may be used on kernel system + // calls and exceptions via the Registers function. + registers syscall.PtraceRegs + + // KernelException handles an exception during kernel execution. + // + // Return from this call will restore registers and return to the kernel: the + // registers must be modified directly. + // + // If this function is not provided, a kernel exception results in halt. + // + // This must be go:nosplit, as this will be on the interrupt stack. + // Closures are permitted, as the pointer to the closure frame is not + // passed on the stack. + KernelException func(Vector) + + // KernelSyscall is called for kernel system calls. + // + // Return from this call will restore registers and return to the kernel: the + // registers must be modified directly. + // + // If this function is not provided, a kernel exception results in halt. + // + // This must be go:nosplit, as this will be on the interrupt stack. + // Closures are permitted, as the pointer to the closure frame is not + // passed on the stack. + KernelSyscall func() +} + +// Registers returns a modifiable-copy of the kernel registers. +// +// This is explicitly safe to call during KernelException and KernelSyscall. +// +//go:nosplit +func (c *CPU) Registers() *syscall.PtraceRegs { + return &c.registers +} diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go new file mode 100644 index 000000000..bb3420125 --- /dev/null +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -0,0 +1,113 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package ring0 + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" +) + +// Segment indices and Selectors. +const ( + // Index into GDT array. + _ = iota // Null descriptor first. + _ // Reserved (Linux is kernel 32). + segKcode // Kernel code (64-bit). + segKdata // Kernel data. + segUcode32 // User code (32-bit). + segUdata // User data. + segUcode64 // User code (64-bit). + segTss // Task segment descriptor. + segTssHi // Upper bits for TSS. + segLast // Last segment (terminal, not included). +) + +// Selectors. +const ( + Kcode Selector = segKcode << 3 + Kdata Selector = segKdata << 3 + Ucode32 Selector = (segUcode32 << 3) | 3 + Udata Selector = (segUdata << 3) | 3 + Ucode64 Selector = (segUcode64 << 3) | 3 + Tss Selector = segTss << 3 +) + +// Standard segments. +var ( + UserCodeSegment32 SegmentDescriptor + UserDataSegment SegmentDescriptor + UserCodeSegment64 SegmentDescriptor + KernelCodeSegment SegmentDescriptor + KernelDataSegment SegmentDescriptor +) + +// KernelOpts has initialization options for the kernel. +type KernelOpts struct { + // PageTables are the kernel pagetables; this must be provided. + PageTables *pagetables.PageTables +} + +// KernelArchState contains architecture-specific state. +type KernelArchState struct { + KernelOpts + + // globalIDT is our set of interrupt gates. + globalIDT idt64 +} + +// CPUArchState contains CPU-specific arch state. +type CPUArchState struct { + // stack is the stack used for interrupts on this CPU. + stack [256]byte + + // errorCode is the error code from the last exception. + errorCode uintptr + + // errorType indicates the type of error code here, it is always set + // along with the errorCode value above. + // + // It will either by 1, which indicates a user error, or 0 indicating a + // kernel error. If the error code below returns false (kernel error), + // then it cannot provide relevant information about the last + // exception. + errorType uintptr + + // gdt is the CPU's descriptor table. + gdt descriptorTable + + // tss is the CPU's task state. + tss TaskState64 +} + +// ErrorCode returns the last error code. +// +// The returned boolean indicates whether the error code corresponds to the +// last user error or not. If it does not, then fault information must be +// ignored. This is generally the result of a kernel fault while servicing a +// user fault. +// +//go:nosplit +func (c *CPU) ErrorCode() (value uintptr, user bool) { + return c.errorCode, c.errorType != 0 +} + +func init() { + KernelCodeSegment.setCode64(0, 0, 0) + KernelDataSegment.setData(0, 0xffffffff, 0) + UserCodeSegment32.setCode64(0, 0, 3) + UserDataSegment.setData(0, 0xffffffff, 3) + UserCodeSegment64.setCode64(0, 0, 3) +} diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go new file mode 100644 index 000000000..a3e992e0d --- /dev/null +++ b/pkg/sentry/platform/ring0/entry_amd64.go @@ -0,0 +1,128 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package ring0 + +import ( + "syscall" +) + +// This is an assembly function. +// +// The sysenter function is invoked in two situations: +// +// (1) The guest kernel has executed a system call. +// (2) The guest application has executed a system call. +// +// The interrupt flag is examined to determine whether the system call was +// executed from kernel mode or not and the appropriate stub is called. +func sysenter() + +// swapgs swaps the current GS value. +// +// This must be called prior to sysret/iret. +func swapgs() + +// sysret returns to userspace from a system call. +// +// The return code is the vector that interrupted execution. +// +// See stubs.go for a note regarding the frame size of this function. +func sysret(*CPU, *syscall.PtraceRegs) Vector + +// "iret is the cadillac of CPL switching." +// +// -- Neel Natu +// +// iret is nearly identical to sysret, except an iret is used to fully restore +// all user state. This must be called in cases where all registers need to be +// restored. +func iret(*CPU, *syscall.PtraceRegs) Vector + +// exception is the generic exception entry. +// +// This is called by the individual stub definitions. +func exception() + +// resume is a stub that restores the CPU kernel registers. +// +// This is used when processing kernel exceptions and syscalls. +func resume() + +// Start is the CPU entrypoint. +// +// The following start conditions must be satisfied: +// +// * AX should contain the CPU pointer. +// * c.GDT() should be loaded as the GDT. +// * c.IDT() should be loaded as the IDT. +// * c.CR0() should be the current CR0 value. +// * c.CR3() should be set to the kernel PageTables. +// * c.CR4() should be the current CR4 value. +// * c.EFER() should be the current EFER value. +// +// The CPU state will be set to c.Registers(). +func Start() + +// Exception stubs. +func divideByZero() +func debug() +func nmi() +func breakpoint() +func overflow() +func boundRangeExceeded() +func invalidOpcode() +func deviceNotAvailable() +func doubleFault() +func coprocessorSegmentOverrun() +func invalidTSS() +func segmentNotPresent() +func stackSegmentFault() +func generalProtectionFault() +func pageFault() +func x87FloatingPointException() +func alignmentCheck() +func machineCheck() +func simdFloatingPointException() +func virtualizationException() +func securityException() +func syscallInt80() + +// Exception handler index. +var handlers = map[Vector]func(){ + DivideByZero: divideByZero, + Debug: debug, + NMI: nmi, + Breakpoint: breakpoint, + Overflow: overflow, + BoundRangeExceeded: boundRangeExceeded, + InvalidOpcode: invalidOpcode, + DeviceNotAvailable: deviceNotAvailable, + DoubleFault: doubleFault, + CoprocessorSegmentOverrun: coprocessorSegmentOverrun, + InvalidTSS: invalidTSS, + SegmentNotPresent: segmentNotPresent, + StackSegmentFault: stackSegmentFault, + GeneralProtectionFault: generalProtectionFault, + PageFault: pageFault, + X87FloatingPointException: x87FloatingPointException, + AlignmentCheck: alignmentCheck, + MachineCheck: machineCheck, + SIMDFloatingPointException: simdFloatingPointException, + VirtualizationException: virtualizationException, + SecurityException: securityException, + SyscallInt80: syscallInt80, +} diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s new file mode 100644 index 000000000..e8638133b --- /dev/null +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -0,0 +1,334 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "funcdata.h" +#include "textflag.h" + +// NB: Offsets are programatically generated (see BUILD). +// +// This file is concatenated with the definitions. + +// Saves a register set. +// +// This is a macro because it may need to executed in contents where a stack is +// not available for calls. +// +// The following registers are not saved: AX, SP, IP, FLAGS, all segments. +#define REGISTERS_SAVE(reg, offset) \ + MOVQ R15, offset+PTRACE_R15(reg); \ + MOVQ R14, offset+PTRACE_R14(reg); \ + MOVQ R13, offset+PTRACE_R13(reg); \ + MOVQ R12, offset+PTRACE_R12(reg); \ + MOVQ BP, offset+PTRACE_RBP(reg); \ + MOVQ BX, offset+PTRACE_RBX(reg); \ + MOVQ CX, offset+PTRACE_RCX(reg); \ + MOVQ DX, offset+PTRACE_RDX(reg); \ + MOVQ R11, offset+PTRACE_R11(reg); \ + MOVQ R10, offset+PTRACE_R10(reg); \ + MOVQ R9, offset+PTRACE_R9(reg); \ + MOVQ R8, offset+PTRACE_R8(reg); \ + MOVQ SI, offset+PTRACE_RSI(reg); \ + MOVQ DI, offset+PTRACE_RDI(reg); + +// Loads a register set. +// +// This is a macro because it may need to executed in contents where a stack is +// not available for calls. +// +// The following registers are not loaded: AX, SP, IP, FLAGS, all segments. +#define REGISTERS_LOAD(reg, offset) \ + MOVQ offset+PTRACE_R15(reg), R15; \ + MOVQ offset+PTRACE_R14(reg), R14; \ + MOVQ offset+PTRACE_R13(reg), R13; \ + MOVQ offset+PTRACE_R12(reg), R12; \ + MOVQ offset+PTRACE_RBP(reg), BP; \ + MOVQ offset+PTRACE_RBX(reg), BX; \ + MOVQ offset+PTRACE_RCX(reg), CX; \ + MOVQ offset+PTRACE_RDX(reg), DX; \ + MOVQ offset+PTRACE_R11(reg), R11; \ + MOVQ offset+PTRACE_R10(reg), R10; \ + MOVQ offset+PTRACE_R9(reg), R9; \ + MOVQ offset+PTRACE_R8(reg), R8; \ + MOVQ offset+PTRACE_RSI(reg), SI; \ + MOVQ offset+PTRACE_RDI(reg), DI; + +// SWAP_GS swaps the kernel GS (CPU). +#define SWAP_GS() \ + BYTE $0x0F; BYTE $0x01; BYTE $0xf8; + +// IRET returns from an interrupt frame. +#define IRET() \ + BYTE $0x48; BYTE $0xcf; + +// SYSRET64 executes the sysret instruction. +#define SYSRET64() \ + BYTE $0x48; BYTE $0x0f; BYTE $0x07; + +// LOAD_KERNEL_ADDRESS loads a kernel address. +#define LOAD_KERNEL_ADDRESS(from, to) \ + MOVQ from, to; \ + ORQ ·KernelStartAddress(SB), to; + +// LOAD_KERNEL_STACK loads the kernel stack. +#define LOAD_KERNEL_STACK(from) \ + LOAD_KERNEL_ADDRESS(CPU_SELF(from), SP); \ + LEAQ CPU_STACK_TOP(SP), SP; + +// See kernel.go. +TEXT ·Halt(SB),NOSPLIT,$0 + HLT + RET + +// See kernel.go. +TEXT ·Current(SB),NOSPLIT,$0-8 + MOVQ CPU_SELF(GS), AX + MOVQ AX, ret+0(FP) + RET + +// See entry_amd64.go. +TEXT ·swapgs(SB),NOSPLIT,$0 + SWAP_GS() + RET + +// See entry_amd64.go. +TEXT ·sysret(SB),NOSPLIT,$0-24 + // Save original state. + LOAD_KERNEL_ADDRESS(cpu+0(FP), BX) + LOAD_KERNEL_ADDRESS(regs+8(FP), AX) + MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) + MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) + MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) + + // Restore user register state. + REGISTERS_LOAD(AX, 0) + MOVQ PTRACE_RIP(AX), CX // Needed for SYSRET. + MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET. + MOVQ PTRACE_RSP(AX), SP // Restore the stack directly. + MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch). + SYSRET64() + +// See entry_amd64.go. +TEXT ·iret(SB),NOSPLIT,$0-24 + // Save original state. + LOAD_KERNEL_ADDRESS(cpu+0(FP), BX) + LOAD_KERNEL_ADDRESS(regs+8(FP), AX) + MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) + MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) + MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) + + // Build an IRET frame & restore state. + LOAD_KERNEL_STACK(BX) + MOVQ PTRACE_SS(AX), BX; PUSHQ BX + MOVQ PTRACE_RSP(AX), CX; PUSHQ CX + MOVQ PTRACE_FLAGS(AX), DX; PUSHQ DX + MOVQ PTRACE_CS(AX), DI; PUSHQ DI + MOVQ PTRACE_RIP(AX), SI; PUSHQ SI + REGISTERS_LOAD(AX, 0) // Restore most registers. + MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch). + IRET() + +// See entry_amd64.go. +TEXT ·resume(SB),NOSPLIT,$0 + // See iret, above. + MOVQ CPU_REGISTERS+PTRACE_SS(GS), BX; PUSHQ BX + MOVQ CPU_REGISTERS+PTRACE_RSP(GS), CX; PUSHQ CX + MOVQ CPU_REGISTERS+PTRACE_FLAGS(GS), DX; PUSHQ DX + MOVQ CPU_REGISTERS+PTRACE_CS(GS), DI; PUSHQ DI + MOVQ CPU_REGISTERS+PTRACE_RIP(GS), SI; PUSHQ SI + REGISTERS_LOAD(GS, CPU_REGISTERS) + MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX + IRET() + +// See entry_amd64.go. +TEXT ·Start(SB),NOSPLIT,$0 + LOAD_KERNEL_STACK(AX) // Set the stack. + PUSHQ $0x0 // Previous frame pointer. + MOVQ SP, BP // Set frame pointer. + PUSHQ AX // First argument (CPU). + CALL ·start(SB) // Call Go hook. + JMP ·resume(SB) // Restore to registers. + +// See entry_amd64.go. +TEXT ·sysenter(SB),NOSPLIT,$0 + // Interrupts are always disabled while we're executing in kernel mode + // and always enabled while executing in user mode. Therefore, we can + // reliably look at the flags in R11 to determine where this syscall + // was from. + TESTL $_RFLAGS_IF, R11 + JZ kernel + +user: + SWAP_GS() + XCHGQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Swap stacks. + XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs). + REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. + MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value. + MOVQ BX, PTRACE_RAX(AX) // Save everything else. + MOVQ BX, PTRACE_ORIGRAX(AX) + MOVQ CX, PTRACE_RIP(AX) + MOVQ R11, PTRACE_FLAGS(AX) + MOVQ CPU_REGISTERS+PTRACE_RSP(GS), BX; MOVQ BX, PTRACE_RSP(AX) + MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code. + MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user. + + // Return to the kernel, where the frame is: + // + // vector (sp+24) + // regs (sp+16) + // cpu (sp+8) + // vcpu.Switch (sp+0) + // + MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer. + MOVQ $Syscall, 24(SP) // Output vector. + RET + +kernel: + // We can't restore the original stack, but we can access the registers + // in the CPU state directly. No need for temporary juggling. + MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS) + MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS) + REGISTERS_SAVE(GS, CPU_REGISTERS) + MOVQ CX, CPU_REGISTERS+PTRACE_RIP(GS) + MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(GS) + MOVQ SP, CPU_REGISTERS+PTRACE_RSP(GS) + MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code. + MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel. + + // Load the function stored in KernelSyscall. + // + // Note that this function needs to be executed on the stack in case + // the runtime decides to make use of the redzone (grumble). This also + // protects against any functions that might not be go:nosplit, since + // this will cause a failure immediately. + LOAD_KERNEL_STACK(GS) + MOVQ CPU_KERNEL_SYSCALL(GS), DX // Function data. + MOVQ 0(DX), AX // Function pointer. + PUSHQ BP // Push the frame pointer. + MOVQ SP, BP // Set frame pointer value. + CALL *AX // Call the function. + POPQ BP // Restore the frame pointer. + JMP ·resume(SB) + +// exception is a generic exception handler. +// +// There are two cases handled: +// +// 1) An exception in kernel mode: this results in saving the state at the time +// of the exception and calling the defined hook. +// +// 2) An exception in guest mode: the original kernel frame is restored, and +// the vector & error codes are pushed as return values. +// +// See below for the stubs that call exception. +TEXT ·exception(SB),NOSPLIT,$0 + // Determine whether the exception occurred in kernel mode or user + // mode, based on the flags. We expect the following stack: + // + // SS (sp+48) + // SP (sp+40) + // FLAGS (sp+32) + // CS (sp+24) + // IP (sp+16) + // ERROR_CODE (sp+8) + // VECTOR (sp+0) + // + TESTL $_RFLAGS_IF, 32(SP) + JZ kernel + +user: + SWAP_GS() + XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs). + REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. + MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value. + MOVQ BX, PTRACE_RAX(AX) // Save everything else. + MOVQ BX, PTRACE_ORIGRAX(AX) + MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX) + MOVQ 24(SP), CX; MOVQ CX, PTRACE_CS(AX) + MOVQ 32(SP), DX; MOVQ DX, PTRACE_FLAGS(AX) + MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX) + MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX) + + // Copy out and return. + MOVQ 0(SP), BX // Load vector. + MOVQ 8(SP), CX // Load error code. + MOVQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Original stack (kernel version). + MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer. + MOVQ CX, CPU_ERROR_CODE(GS) // Set error code. + MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user. + MOVQ BX, 24(SP) // Output vector. + RET + +kernel: + // As per above, we can save directly. + MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS) + MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS) + REGISTERS_SAVE(GS, CPU_REGISTERS) + MOVQ 16(SP), AX; MOVQ AX, CPU_REGISTERS+PTRACE_RIP(GS) + MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(GS) + MOVQ 40(SP), CX; MOVQ CX, CPU_REGISTERS+PTRACE_RSP(GS) + + // Set the error code and adjust the stack. + MOVQ 8(SP), AX // Load the error code. + MOVQ AX, CPU_ERROR_CODE(GS) // Copy out to the CPU. + MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel. + MOVQ 0(SP), BX // BX contains the vector. + ADDQ $48, SP // Drop the exception frame. + + // Load the function stored in KernelException. + // + // See note above re: the kernel stack. + LOAD_KERNEL_STACK(GS) + MOVQ CPU_KERNEL_EXCEPTION(GS), DX // Function data. + MOVQ 0(DX), AX // Function pointer. + PUSHQ BP // Push the frame pointer. + MOVQ SP, BP // Set frame pointer value. + PUSHQ BX // First argument (vector). + CALL *AX // Call the function. + POPQ BX // Discard the argument. + POPQ BP // Restore the frame pointer. + JMP ·resume(SB) + +#define EXCEPTION_WITH_ERROR(value, symbol) \ +TEXT symbol,NOSPLIT,$0; \ + PUSHQ $value; \ + JMP ·exception(SB); + +#define EXCEPTION_WITHOUT_ERROR(value, symbol) \ +TEXT symbol,NOSPLIT,$0; \ + PUSHQ $0x0; \ + PUSHQ $value; \ + JMP ·exception(SB); + +EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB)) +EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB)) +EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB)) +EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB)) +EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB)) +EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB)) +EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB)) +EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB)) +EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB)) +EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB)) +EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB)) +EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB)) +EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB)) +EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB)) +EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB)) +EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB)) +EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB)) +EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB)) +EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB)) +EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB)) +EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB)) +EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB)) diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD new file mode 100644 index 000000000..3bce56985 --- /dev/null +++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -0,0 +1,25 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +go_template_instance( + name = "defs_impl", + out = "defs_impl.go", + package = "main", + template = "//pkg/sentry/platform/ring0:defs", +) + +go_binary( + name = "gen_offsets", + srcs = [ + "defs_impl.go", + "main.go", + ], + visibility = ["//pkg/sentry/platform/ring0:__pkg__"], + deps = [ + "//pkg/cpuid", + "//pkg/sentry/platform/ring0/pagetables", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/platform/ring0/gen_offsets/main.go b/pkg/sentry/platform/ring0/gen_offsets/main.go new file mode 100644 index 000000000..ffa7eaf77 --- /dev/null +++ b/pkg/sentry/platform/ring0/gen_offsets/main.go @@ -0,0 +1,24 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary gen_offsets is a helper for generating offset headers. +package main + +import ( + "os" +) + +func main() { + Emit(os.Stdout) +} diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go new file mode 100644 index 000000000..b0471ab9a --- /dev/null +++ b/pkg/sentry/platform/ring0/kernel.go @@ -0,0 +1,71 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ring0 + +// New creates a new kernel. +// +// N.B. that constraints on KernelOpts must be satisfied. +// +// Init must have been called. +func New(opts KernelOpts) *Kernel { + k := new(Kernel) + k.init(opts) + return k +} + +// NewCPU creates a new CPU associated with this Kernel. +// +// Note that execution of the new CPU must begin at Start, with constraints as +// documented. Initialization is not completed by this method alone. +// +// See also Init. +func (k *Kernel) NewCPU() *CPU { + c := new(CPU) + c.Init(k) + return c +} + +// Halt halts execution. +func Halt() + +// Current returns the current CPU. +// +// Its use is only legal in the KernelSyscall and KernelException contexts, +// which must all be guarded go:nosplit. +func Current() *CPU + +// defaultSyscall is the default syscall hook. +// +//go:nosplit +func defaultSyscall() { Halt() } + +// defaultException is the default exception hook. +// +//go:nosplit +func defaultException(Vector) { Halt() } + +// Init allows the initialization of a CPU from a kernel without allocation. +// The same constraints as NewCPU apply. +// +// Init allows embedding in other objects. +func (c *CPU) Init(k *Kernel) { + c.self = c // Set self reference. + c.kernel = k // Set kernel reference. + c.init() // Perform architectural init. + + // Defaults. + c.KernelSyscall = defaultSyscall + c.KernelException = defaultException +} diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go new file mode 100644 index 000000000..c82613a9c --- /dev/null +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -0,0 +1,280 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package ring0 + +import ( + "encoding/binary" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" +) + +const ( + // KernelFlagsSet should always be set in the kernel. + KernelFlagsSet = _RFLAGS_RESERVED + + // UserFlagsSet are always set in userspace. + UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF + + // KernelFlagsClear should always be clear in the kernel. + KernelFlagsClear = _RFLAGS_IF | _RFLAGS_NT | _RFLAGS_IOPL + + // UserFlagsClear are always cleared in userspace. + UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL +) + +// init initializes architecture-specific state. +func (k *Kernel) init(opts KernelOpts) { + // Save the root page tables. + k.PageTables = opts.PageTables + + // Setup the IDT, which is uniform. + for v, handler := range handlers { + // Note that we set all traps to use the interrupt stack, this + // is defined below when setting up the TSS. + k.globalIDT[v].setInterrupt(Kcode, uint64(kernelFunc(handler)), 0 /* dpl */, 1 /* ist */) + } +} + +// init initializes architecture-specific state. +func (c *CPU) init() { + // Null segment. + c.gdt[0].setNull() + + // Kernel & user segments. + c.gdt[segKcode] = KernelCodeSegment + c.gdt[segKdata] = KernelDataSegment + c.gdt[segUcode32] = UserCodeSegment32 + c.gdt[segUdata] = UserDataSegment + c.gdt[segUcode64] = UserCodeSegment64 + + // The task segment, this spans two entries. + tssBase, tssLimit, _ := c.TSS() + c.gdt[segTss].set( + uint32(tssBase), + uint32(tssLimit), + 0, // Privilege level zero. + SegmentDescriptorPresent| + SegmentDescriptorAccess| + SegmentDescriptorWrite| + SegmentDescriptorExecute) + c.gdt[segTssHi].setHi(uint32((tssBase) >> 32)) + + // Set the kernel stack pointer in the TSS (virtual address). + stackAddr := c.StackTop() + c.tss.rsp0Lo = uint32(stackAddr) + c.tss.rsp0Hi = uint32(stackAddr >> 32) + c.tss.ist1Lo = uint32(stackAddr) + c.tss.ist1Hi = uint32(stackAddr >> 32) + + // Permanently set the kernel segments. + c.registers.Cs = uint64(Kcode) + c.registers.Ds = uint64(Kdata) + c.registers.Es = uint64(Kdata) + c.registers.Ss = uint64(Kdata) + c.registers.Fs = uint64(Kdata) + c.registers.Gs = uint64(Kdata) +} + +// StackTop returns the kernel's stack address. +// +//go:nosplit +func (c *CPU) StackTop() uint64 { + return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack)) +} + +// IDT returns the CPU's IDT base and limit. +// +//go:nosplit +func (c *CPU) IDT() (uint64, uint16) { + return uint64(kernelAddr(&c.kernel.globalIDT[0])), uint16(binary.Size(&c.kernel.globalIDT) - 1) +} + +// GDT returns the CPU's GDT base and limit. +// +//go:nosplit +func (c *CPU) GDT() (uint64, uint16) { + return uint64(kernelAddr(&c.gdt[0])), uint16(8*segLast - 1) +} + +// TSS returns the CPU's TSS base, limit and value. +// +//go:nosplit +func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) { + return uint64(kernelAddr(&c.tss)), uint16(binary.Size(&c.tss) - 1), &c.gdt[segTss] +} + +// CR0 returns the CPU's CR0 value. +// +//go:nosplit +func (c *CPU) CR0() uint64 { + return _CR0_PE | _CR0_PG | _CR0_ET +} + +// CR4 returns the CPU's CR4 value. +// +//go:nosplit +func (c *CPU) CR4() uint64 { + cr4 := uint64(_CR4_PAE | _CR4_PSE | _CR4_OSFXSR | _CR4_OSXMMEXCPT) + if hasPCID { + cr4 |= _CR4_PCIDE + } + if hasXSAVE { + cr4 |= _CR4_OSXSAVE + } + if hasSMEP { + cr4 |= _CR4_SMEP + } + if hasFSGSBASE { + cr4 |= _CR4_FSGSBASE + } + return cr4 +} + +// EFER returns the CPU's EFER value. +// +//go:nosplit +func (c *CPU) EFER() uint64 { + return _EFER_LME | _EFER_SCE | _EFER_NX +} + +// IsCanonical indicates whether addr is canonical per the amd64 spec. +// +//go:nosplit +func IsCanonical(addr uint64) bool { + return addr <= 0x00007fffffffffff || addr > 0xffff800000000000 +} + +// Flags contains flags related to switch. +type Flags uintptr + +const ( + // FlagFull indicates that a full restore should be not, not a fast + // restore (on the syscall return path.) + FlagFull = 1 << iota + + // FlagFlush indicates that a full TLB flush is required. + FlagFlush +) + +// SwitchToUser performs either a sysret or an iret. +// +// The return value is the vector that interrupted execution. +// +// This function will not split the stack. Callers will probably want to call +// runtime.entersyscall (and pair with a call to runtime.exitsyscall) prior to +// calling this function. +// +// When this is done, this region is quite sensitive to things like system +// calls. After calling entersyscall, any memory used must have been allocated +// and no function calls without go:nosplit are permitted. Any calls made here +// are protected appropriately (e.g. IsCanonical and CR3). +// +// Also note that this function transitively depends on the compiler generating +// code that uses IP-relative addressing inside of absolute addresses. That's +// the case for amd64, but may not be the case for other architectures. +// +//go:nosplit +func (c *CPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetables.PageTables, flags Flags) (vector Vector) { + // Check for canonical addresses. + if !IsCanonical(regs.Rip) || !IsCanonical(regs.Rsp) || !IsCanonical(regs.Fs_base) || !IsCanonical(regs.Gs_base) { + return GeneralProtectionFault + } + + var ( + userCR3 uint64 + kernelCR3 uint64 + ) + + // Sanitize registers. + if flags&FlagFlush != 0 { + userCR3 = pt.FlushCR3() + } else { + userCR3 = pt.CR3() + } + regs.Eflags &= ^uint64(UserFlagsClear) + regs.Eflags |= UserFlagsSet + regs.Cs = uint64(Ucode64) // Required for iret. + regs.Ss = uint64(Udata) // Ditto. + kernelCR3 = c.kernel.PageTables.CR3() + + // Perform the switch. + swapgs() // GS will be swapped on return. + wrfs(uintptr(regs.Fs_base)) // Set application FS. + wrgs(uintptr(regs.Gs_base)) // Set application GS. + LoadFloatingPoint(fpState) // Copy in floating point. + jumpToKernel() // Switch to upper half. + writeCR3(uintptr(userCR3)) // Change to user address space. + if flags&FlagFull != 0 { + vector = iret(c, regs) + } else { + vector = sysret(c, regs) + } + writeCR3(uintptr(kernelCR3)) // Return to kernel address space. + jumpToUser() // Return to lower half. + SaveFloatingPoint(fpState) // Copy out floating point. + wrfs(uintptr(c.registers.Fs_base)) // Restore kernel FS. + return +} + +// start is the CPU entrypoint. +// +// This is called from the Start asm stub (see entry_amd64.go); on return the +// registers in c.registers will be restored (not segments). +// +//go:nosplit +func start(c *CPU) { + // Save per-cpu & FS segment. + wrgs(kernelAddr(c)) + wrfs(uintptr(c.Registers().Fs_base)) + + // Initialize floating point. + // + // Note that on skylake, the valid XCR0 mask reported seems to be 0xff. + // This breaks down as: + // + // bit0 - x87 + // bit1 - SSE + // bit2 - AVX + // bit3-4 - MPX + // bit5-7 - AVX512 + // + // For some reason, enabled MPX & AVX512 on platforms that report them + // seems to be cause a general protection fault. (Maybe there are some + // virtualization issues and these aren't exported to the guest cpuid.) + // This needs further investigation, but we can limit the floating + // point operations to x87, SSE & AVX for now. + fninit() + xsetbv(0, validXCR0Mask&0x7) + + // Set the syscall target. + wrmsr(_MSR_LSTAR, kernelFunc(sysenter)) + wrmsr(_MSR_SYSCALL_MASK, _RFLAGS_STEP|_RFLAGS_IF|_RFLAGS_DF|_RFLAGS_IOPL|_RFLAGS_AC|_RFLAGS_NT) + + // NOTE: This depends on having the 64-bit segments immediately + // following the 32-bit user segments. This is simply the way the + // sysret instruction is designed to work (it assumes they follow). + wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48)) + wrmsr(_MSR_CSTAR, kernelFunc(sysenter)) +} + +// ReadCR2 reads the current CR2 value. +// +//go:nosplit +func ReadCR2() uintptr { + return readCR2() +} diff --git a/pkg/sentry/platform/ring0/kernel_unsafe.go b/pkg/sentry/platform/ring0/kernel_unsafe.go new file mode 100644 index 000000000..cfb3ad853 --- /dev/null +++ b/pkg/sentry/platform/ring0/kernel_unsafe.go @@ -0,0 +1,41 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ring0 + +import ( + "unsafe" +) + +// eface mirrors runtime.eface. +type eface struct { + typ uintptr + data unsafe.Pointer +} + +// kernelAddr returns the kernel virtual address for the given object. +// +//go:nosplit +func kernelAddr(obj interface{}) uintptr { + e := (*eface)(unsafe.Pointer(&obj)) + return KernelStartAddress | uintptr(e.data) +} + +// kernelFunc returns the address of the given function. +// +//go:nosplit +func kernelFunc(fn func()) uintptr { + fnptr := (**uintptr)(unsafe.Pointer(&fn)) + return KernelStartAddress | **fnptr +} diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go new file mode 100644 index 000000000..f1ed5bfb4 --- /dev/null +++ b/pkg/sentry/platform/ring0/lib_amd64.go @@ -0,0 +1,128 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package ring0 + +import ( + "gvisor.googlesource.com/gvisor/pkg/cpuid" +) + +// LoadFloatingPoint loads floating point state by the most efficient mechanism +// available (set by Init). +var LoadFloatingPoint func(*byte) + +// SaveFloatingPoint saves floating point state by the most efficient mechanism +// available (set by Init). +var SaveFloatingPoint func(*byte) + +// fxrstor uses fxrstor64 to load floating point state. +func fxrstor(*byte) + +// xrstor uses xrstor to load floating point state. +func xrstor(*byte) + +// fxsave uses fxsave64 to save floating point state. +func fxsave(*byte) + +// xsave uses xsave to save floating point state. +func xsave(*byte) + +// xsaveopt uses xsaveopt to save floating point state. +func xsaveopt(*byte) + +// wrfs sets the GS address (set by init). +var wrfs func(addr uintptr) + +// wrfsbase writes to the GS base address. +func wrfsbase(addr uintptr) + +// wrfsmsr writes to the GS_BASE MSR. +func wrfsmsr(addr uintptr) + +// wrgs sets the GS address (set by init). +var wrgs func(addr uintptr) + +// wrgsbase writes to the GS base address. +func wrgsbase(addr uintptr) + +// wrgsmsr writes to the GS_BASE MSR. +func wrgsmsr(addr uintptr) + +// writeCR3 writes the CR3 value. +func writeCR3(phys uintptr) + +// readCR2 reads the current CR2 value. +func readCR2() uintptr + +// jumpToKernel jumps to the kernel version of the current RIP. +func jumpToKernel() + +// jumpToUser jumps to the user version of the current RIP. +func jumpToUser() + +// fninit initializes the floating point unit. +func fninit() + +// xsetbv writes to an extended control register. +func xsetbv(reg, value uintptr) + +// xgetbv reads an extended control register. +func xgetbv(reg uintptr) uintptr + +// wrmsr reads to the given MSR. +func wrmsr(reg, value uintptr) + +// rdmsr reads the given MSR. +func rdmsr(reg uintptr) uintptr + +// Mostly-constants set by Init. +var ( + hasSMEP bool + hasPCID bool + hasXSAVEOPT bool + hasXSAVE bool + hasFSGSBASE bool + validXCR0Mask uintptr +) + +// Init sets function pointers based on architectural features. +// +// This must be called prior to using ring0. +func Init(featureSet *cpuid.FeatureSet) { + hasSMEP = featureSet.HasFeature(cpuid.X86FeatureSMEP) + hasPCID = featureSet.HasFeature(cpuid.X86FeaturePCID) + hasXSAVEOPT = featureSet.UseXsaveopt() + hasXSAVE = featureSet.UseXsave() + hasFSGSBASE = featureSet.HasFeature(cpuid.X86FeatureFSGSBase) + validXCR0Mask = uintptr(featureSet.ValidXCR0Mask()) + if hasXSAVEOPT { + SaveFloatingPoint = xsaveopt + LoadFloatingPoint = xrstor + } else if hasXSAVE { + SaveFloatingPoint = xsave + LoadFloatingPoint = xrstor + } else { + SaveFloatingPoint = fxsave + LoadFloatingPoint = fxrstor + } + if hasFSGSBASE { + wrfs = wrfsbase + wrgs = wrgsbase + } else { + wrfs = wrfsmsr + wrgs = wrgsmsr + } +} diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s new file mode 100644 index 000000000..6f143ea5a --- /dev/null +++ b/pkg/sentry/platform/ring0/lib_amd64.s @@ -0,0 +1,247 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "funcdata.h" +#include "textflag.h" + +// fxrstor loads floating point state. +// +// The code corresponds to: +// +// fxrstor64 (%rbx) +// +TEXT ·fxrstor(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), BX + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x0b; + RET + +// xrstor loads floating point state. +// +// The code corresponds to: +// +// xrstor (%rdi) +// +TEXT ·xrstor(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), DI + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f; + RET + +// fxsave saves floating point state. +// +// The code corresponds to: +// +// fxsave64 (%rbx) +// +TEXT ·fxsave(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), BX + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x03; + RET + +// xsave saves floating point state. +// +// The code corresponds to: +// +// xsave (%rdi) +// +TEXT ·xsave(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), DI + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; + RET + +// xsaveopt saves floating point state. +// +// The code corresponds to: +// +// xsaveopt (%rdi) +// +TEXT ·xsaveopt(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), DI + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; + RET + +// wrfsbase writes to the FS base. +// +// The code corresponds to: +// +// wrfsbase %rax +// +TEXT ·wrfsbase(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd0; + RET + +// wrfsmsr writes to the FSBASE MSR. +// +// The code corresponds to: +// +// wrmsr (writes EDX:EAX to the MSR in ECX) +// +TEXT ·wrfsmsr(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + MOVQ AX, DX + SHRQ $32, DX + MOVQ $0xc0000100, CX // MSR_FS_BASE + BYTE $0x0f; BYTE $0x30; + RET + +// wrgsbase writes to the GS base. +// +// The code corresponds to: +// +// wrgsbase %rax +// +TEXT ·wrgsbase(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd8; + RET + +// wrgsmsr writes to the GSBASE MSR. +// +// See wrfsmsr. +TEXT ·wrgsmsr(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + MOVQ AX, DX + SHRQ $32, DX + MOVQ $0xc0000101, CX // MSR_GS_BASE + BYTE $0x0f; BYTE $0x30; // WRMSR + RET + +// jumpToUser changes execution to the user address. +// +// This works by changing the return value to the user version. +TEXT ·jumpToUser(SB),NOSPLIT,$0 + MOVQ 0(SP), AX + MOVQ ·KernelStartAddress(SB), BX + NOTQ BX + ANDQ BX, SP // Switch the stack. + ANDQ BX, BP // Switch the frame pointer. + ANDQ BX, AX // Future return value. + MOVQ AX, 0(SP) + RET + +// jumpToKernel changes execution to the kernel address space. +// +// This works by changing the return value to the kernel version. +TEXT ·jumpToKernel(SB),NOSPLIT,$0 + MOVQ 0(SP), AX + MOVQ ·KernelStartAddress(SB), BX + ORQ BX, SP // Switch the stack. + ORQ BX, BP // Switch the frame pointer. + ORQ BX, AX // Future return value. + MOVQ AX, 0(SP) + RET + +// writeCR3 writes the given CR3 value. +// +// The code corresponds to: +// +// mov %rax, %cr3 +// +TEXT ·writeCR3(SB),NOSPLIT,$0-8 + MOVQ cr3+0(FP), AX + BYTE $0x0f; BYTE $0x22; BYTE $0xd8; + RET + +// readCR3 reads the current CR3 value. +// +// The code corresponds to: +// +// mov %cr3, %rax +// +TEXT ·readCR3(SB),NOSPLIT,$0-8 + BYTE $0x0f; BYTE $0x20; BYTE $0xd8; + MOVQ AX, ret+0(FP) + RET + +// readCR2 reads the current CR2 value. +// +// The code corresponds to: +// +// mov %cr2, %rax +// +TEXT ·readCR2(SB),NOSPLIT,$0-8 + BYTE $0x0f; BYTE $0x20; BYTE $0xd0; + MOVQ AX, ret+0(FP) + RET + +// fninit initializes the floating point unit. +// +// The code corresponds to: +// +// fninit +TEXT ·fninit(SB),NOSPLIT,$0 + BYTE $0xdb; BYTE $0xe3; + RET + +// xsetbv writes to an extended control register. +// +// The code corresponds to: +// +// xsetbv +// +TEXT ·xsetbv(SB),NOSPLIT,$0-16 + MOVL reg+0(FP), CX + MOVL value+8(FP), AX + MOVL value+12(FP), DX + BYTE $0x0f; BYTE $0x01; BYTE $0xd1; + RET + +// xgetbv reads an extended control register. +// +// The code corresponds to: +// +// xgetbv +// +TEXT ·xgetbv(SB),NOSPLIT,$0-16 + MOVL reg+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0; + MOVL AX, ret+8(FP) + MOVL DX, ret+12(FP) + RET + +// wrmsr writes to a control register. +// +// The code corresponds to: +// +// wrmsr +// +TEXT ·wrmsr(SB),NOSPLIT,$0-16 + MOVL reg+0(FP), CX + MOVL value+8(FP), AX + MOVL value+12(FP), DX + BYTE $0x0f; BYTE $0x30; + RET + +// rdmsr reads a control register. +// +// The code corresponds to: +// +// rdmsr +// +TEXT ·rdmsr(SB),NOSPLIT,$0-16 + MOVL reg+0(FP), CX + BYTE $0x0f; BYTE $0x32; + MOVL AX, ret+8(FP) + MOVL DX, ret+12(FP) + RET diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go new file mode 100644 index 000000000..9acd442ba --- /dev/null +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -0,0 +1,93 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package ring0 + +import ( + "fmt" + "io" + "reflect" + "syscall" +) + +// Emit prints architecture-specific offsets. +func Emit(w io.Writer) { + fmt.Fprintf(w, "// Automatically generated, do not edit.\n") + + c := &CPU{} + fmt.Fprintf(w, "\n// CPU offsets.\n") + fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack))) + fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_KERNEL_EXCEPTION 0x%02x\n", reflect.ValueOf(&c.KernelException).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_KERNEL_SYSCALL 0x%02x\n", reflect.ValueOf(&c.KernelSyscall).Pointer()-reflect.ValueOf(c).Pointer()) + + fmt.Fprintf(w, "\n// Bits.\n") + fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF) + + fmt.Fprintf(w, "\n// Vectors.\n") + fmt.Fprintf(w, "#define DivideByZero 0x%02x\n", DivideByZero) + fmt.Fprintf(w, "#define Debug 0x%02x\n", Debug) + fmt.Fprintf(w, "#define NMI 0x%02x\n", NMI) + fmt.Fprintf(w, "#define Breakpoint 0x%02x\n", Breakpoint) + fmt.Fprintf(w, "#define Overflow 0x%02x\n", Overflow) + fmt.Fprintf(w, "#define BoundRangeExceeded 0x%02x\n", BoundRangeExceeded) + fmt.Fprintf(w, "#define InvalidOpcode 0x%02x\n", InvalidOpcode) + fmt.Fprintf(w, "#define DeviceNotAvailable 0x%02x\n", DeviceNotAvailable) + fmt.Fprintf(w, "#define DoubleFault 0x%02x\n", DoubleFault) + fmt.Fprintf(w, "#define CoprocessorSegmentOverrun 0x%02x\n", CoprocessorSegmentOverrun) + fmt.Fprintf(w, "#define InvalidTSS 0x%02x\n", InvalidTSS) + fmt.Fprintf(w, "#define SegmentNotPresent 0x%02x\n", SegmentNotPresent) + fmt.Fprintf(w, "#define StackSegmentFault 0x%02x\n", StackSegmentFault) + fmt.Fprintf(w, "#define GeneralProtectionFault 0x%02x\n", GeneralProtectionFault) + fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault) + fmt.Fprintf(w, "#define X87FloatingPointException 0x%02x\n", X87FloatingPointException) + fmt.Fprintf(w, "#define AlignmentCheck 0x%02x\n", AlignmentCheck) + fmt.Fprintf(w, "#define MachineCheck 0x%02x\n", MachineCheck) + fmt.Fprintf(w, "#define SIMDFloatingPointException 0x%02x\n", SIMDFloatingPointException) + fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException) + fmt.Fprintf(w, "#define SecurityException 0x%02x\n", SecurityException) + fmt.Fprintf(w, "#define SyscallInt80 0x%02x\n", SyscallInt80) + fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) + + p := &syscall.PtraceRegs{} + fmt.Fprintf(w, "\n// Ptrace registers.\n") + fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.R13).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.R12).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RBP 0x%02x\n", reflect.ValueOf(&p.Rbp).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RBX 0x%02x\n", reflect.ValueOf(&p.Rbx).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.R11).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.R10).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.R9).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.R8).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RAX 0x%02x\n", reflect.ValueOf(&p.Rax).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RCX 0x%02x\n", reflect.ValueOf(&p.Rcx).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RDX 0x%02x\n", reflect.ValueOf(&p.Rdx).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RSI 0x%02x\n", reflect.ValueOf(&p.Rsi).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RDI 0x%02x\n", reflect.ValueOf(&p.Rdi).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_ORIGRAX 0x%02x\n", reflect.ValueOf(&p.Orig_rax).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RIP 0x%02x\n", reflect.ValueOf(&p.Rip).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_CS 0x%02x\n", reflect.ValueOf(&p.Cs).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_FS 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_GS 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer()) +} diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD new file mode 100644 index 000000000..c0c481ab3 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -0,0 +1,32 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "pagetables", + srcs = [ + "pagetables.go", + "pagetables_amd64.go", + "pagetables_unsafe.go", + "pagetables_x86.go", + "pcids_x86.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables", + visibility = [ + "//pkg/sentry/platform/kvm:__subpackages__", + "//pkg/sentry/platform/ring0:__subpackages__", + ], + deps = ["//pkg/sentry/usermem"], +) + +go_test( + name = "pagetables_test", + size = "small", + srcs = [ + "pagetables_test.go", + "pagetables_x86_test.go", + "pcids_x86_test.go", + ], + embed = [":pagetables"], + deps = ["//pkg/sentry/usermem"], +) diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go new file mode 100644 index 000000000..3cbf0bfa5 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -0,0 +1,193 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pagetables provides a generic implementation of pagetables. +package pagetables + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// Node is a single node within a set of page tables. +type Node struct { + // unalignedData has unaligned data. Unfortunately, we can't really + // rely on the allocator to give us what we want here. So we just throw + // it at the wall and use the portion that matches. Gross. This may be + // changed in the future to use a different allocation mechanism. + // + // Access must happen via functions found in pagetables_unsafe.go. + unalignedData [(2 * usermem.PageSize) - 1]byte + + // physical is the translated address of these entries. + // + // This is filled in at creation time. + physical uintptr +} + +// PageTables is a set of page tables. +type PageTables struct { + mu sync.Mutex + + // root is the pagetable root. + root *Node + + // translater is the translater passed at creation. + translater Translater + + // archPageTables includes architecture-specific features. + archPageTables + + // allNodes is a set of nodes indexed by translater address. + allNodes map[uintptr]*Node +} + +// Translater translates to guest physical addresses. +type Translater interface { + // TranslateToPhysical translates the given pointer object into a + // "physical" address. We do not require that it translates back, the + // reverse mapping is maintained internally. + TranslateToPhysical(*PTEs) uintptr +} + +// New returns new PageTables. +func New(t Translater, opts Opts) *PageTables { + p := &PageTables{ + translater: t, + allNodes: make(map[uintptr]*Node), + } + p.root = p.allocNode() + p.init(opts) + return p +} + +// New returns a new set of PageTables derived from the given one. +// +// This function should always be preferred to New if there are existing +// pagetables, as this function preserves architectural constraints relevant to +// managing multiple sets of pagetables. +func (p *PageTables) New() *PageTables { + np := &PageTables{ + translater: p.translater, + allNodes: make(map[uintptr]*Node), + } + np.root = np.allocNode() + np.initFrom(&p.archPageTables) + return np +} + +// setPageTable sets the given index as a page table. +func (p *PageTables) setPageTable(n *Node, index int, child *Node) { + phys := p.translater.TranslateToPhysical(child.PTEs()) + p.allNodes[phys] = child + pte := &n.PTEs()[index] + pte.setPageTable(phys) +} + +// clearPageTable clears the given entry. +func (p *PageTables) clearPageTable(n *Node, index int) { + pte := &n.PTEs()[index] + physical := pte.Address() + pte.Clear() + delete(p.allNodes, physical) +} + +// getPageTable returns the page table entry. +func (p *PageTables) getPageTable(n *Node, index int) *Node { + pte := &n.PTEs()[index] + physical := pte.Address() + child := p.allNodes[physical] + return child +} + +// Map installs a mapping with the given physical address. +// +// True is returned iff there was a previous mapping in the range. +// +// Precondition: addr & length must be aligned, their sum must not overflow. +func (p *PageTables) Map(addr usermem.Addr, length uintptr, user bool, at usermem.AccessType, physical uintptr) bool { + if at == usermem.NoAccess { + return p.Unmap(addr, length) + } + prev := false + p.mu.Lock() + end, ok := addr.AddLength(uint64(length)) + if !ok { + panic("pagetables.Map: overflow") + } + p.iterateRange(uintptr(addr), uintptr(end), true, func(s, e uintptr, pte *PTE, align uintptr) { + p := physical + (s - uintptr(addr)) + prev = prev || (pte.Valid() && (p != pte.Address() || at.Write != pte.Writeable() || at.Execute != pte.Executable())) + if p&align != 0 { + // We will install entries at a smaller granulaity if + // we don't install a valid entry here, however we must + // zap any existing entry to ensure this happens. + pte.Clear() + return + } + pte.Set(p, at.Write, at.Execute, user) + }) + p.mu.Unlock() + return prev +} + +// Unmap unmaps the given range. +// +// True is returned iff there was a previous mapping in the range. +func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool { + p.mu.Lock() + count := 0 + p.iterateRange(uintptr(addr), uintptr(addr)+length, false, func(s, e uintptr, pte *PTE, align uintptr) { + pte.Clear() + count++ + }) + p.mu.Unlock() + return count > 0 +} + +// Release releases this address space. +// +// This must be called to release the PCID. +func (p *PageTables) Release() { + // Clear all pages. + p.Unmap(0, ^uintptr(0)) + p.release() +} + +// Lookup returns the physical address for the given virtual address. +func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, accessType usermem.AccessType) { + mask := uintptr(usermem.PageSize - 1) + off := uintptr(addr) & mask + addr = addr &^ usermem.Addr(mask) + p.iterateRange(uintptr(addr), uintptr(addr+usermem.PageSize), false, func(s, e uintptr, pte *PTE, align uintptr) { + if !pte.Valid() { + return + } + physical = pte.Address() + (s - uintptr(addr)) + off + accessType = usermem.AccessType{ + Read: true, + Write: pte.Writeable(), + Execute: pte.Executable(), + } + }) + return physical, accessType +} + +// allocNode allocates a new page. +func (p *PageTables) allocNode() *Node { + n := new(Node) + n.physical = p.translater.TranslateToPhysical(n.PTEs()) + return n +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go new file mode 100644 index 000000000..b89665c96 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -0,0 +1,397 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package pagetables + +import ( + "fmt" + "sync/atomic" +) + +// Address constraints. +// +// The lowerTop and upperBottom currently apply to four-level pagetables; +// additional refactoring would be necessary to support five-level pagetables. +const ( + lowerTop = 0x00007fffffffffff + upperBottom = 0xffff800000000000 + + pteShift = 12 + pmdShift = 21 + pudShift = 30 + pgdShift = 39 + + pteMask = 0x1ff << pteShift + pmdMask = 0x1ff << pmdShift + pudMask = 0x1ff << pudShift + pgdMask = 0x1ff << pgdShift + + pteSize = 1 << pteShift + pmdSize = 1 << pmdShift + pudSize = 1 << pudShift + pgdSize = 1 << pgdShift +) + +// Bits in page table entries. +const ( + present = 0x001 + writable = 0x002 + user = 0x004 + writeThrough = 0x008 + cacheDisable = 0x010 + accessed = 0x020 + dirty = 0x040 + super = 0x080 + executeDisable = 1 << 63 +) + +// PTE is a page table entry. +type PTE uint64 + +// Clear clears this PTE, including super page information. +func (p *PTE) Clear() { + atomic.StoreUint64((*uint64)(p), 0) +} + +// Valid returns true iff this entry is valid. +func (p *PTE) Valid() bool { + return atomic.LoadUint64((*uint64)(p))&present != 0 +} + +// Writeable returns true iff the page is writable. +func (p *PTE) Writeable() bool { + return atomic.LoadUint64((*uint64)(p))&writable != 0 +} + +// User returns true iff the page is user-accessible. +func (p *PTE) User() bool { + return atomic.LoadUint64((*uint64)(p))&user != 0 +} + +// Executable returns true iff the page is executable. +func (p *PTE) Executable() bool { + return atomic.LoadUint64((*uint64)(p))&executeDisable == 0 +} + +// SetSuper sets this page as a super page. +// +// The page must not be valid or a panic will result. +func (p *PTE) SetSuper() { + if p.Valid() { + // This is not allowed. + panic("SetSuper called on valid page!") + } + atomic.StoreUint64((*uint64)(p), super) +} + +// IsSuper returns true iff this page is a super page. +func (p *PTE) IsSuper() bool { + return atomic.LoadUint64((*uint64)(p))&super != 0 +} + +// Set sets this PTE value. +func (p *PTE) Set(addr uintptr, write, execute bool, userAccessible bool) { + v := uint64(addr)&^uint64(0xfff) | present | accessed + if userAccessible { + v |= user + } + if !execute { + v |= executeDisable + } + if write { + v |= writable | dirty + } + if p.IsSuper() { + v |= super + } + atomic.StoreUint64((*uint64)(p), v) +} + +// setPageTable sets this PTE value and forces the write bit and super bit to +// be cleared. This is used explicitly for breaking super pages. +func (p *PTE) setPageTable(addr uintptr) { + v := uint64(addr)&^uint64(0xfff) | present | user | writable | accessed | dirty + atomic.StoreUint64((*uint64)(p), v) +} + +// Address extracts the address. This should only be used if Valid returns true. +func (p *PTE) Address() uintptr { + return uintptr(atomic.LoadUint64((*uint64)(p)) & ^uint64(executeDisable|0xfff)) +} + +// entriesPerPage is the number of PTEs per page. +const entriesPerPage = 512 + +// PTEs is a collection of entries. +type PTEs [entriesPerPage]PTE + +// next returns the next address quantized by the given size. +func next(start uint64, size uint64) uint64 { + start &= ^(size - 1) + start += size + return start +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If alloc is set, then Set _must_ be called on all given PTEs. The exception +// is super pages. If a valid super page cannot be installed, then the walk +// will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if alloc set, then no gaps will be present. However, if alloc is +// not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: startAddr and endAddr must be page-aligned. +// +// Precondition: startStart must be less than endAddr. +// +// Precondition: If alloc is set, then startAddr and endAddr should not span +// non-canonical ranges. If they do, a panic will result. +func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn func(s, e uintptr, pte *PTE, align uintptr)) { + start := uint64(startAddr) + end := uint64(endAddr) + if start%pteSize != 0 { + panic(fmt.Sprintf("unaligned start: %v", start)) + } + if start > end { + panic(fmt.Sprintf("start > end (%v > %v))", start, end)) + } + + // Deal with cases where we traverse the "gap". + // + // These are all explicitly disallowed if alloc is set, and we must + // traverse an entry for each address explicitly. + switch { + case start < lowerTop && end > lowerTop && end < upperBottom: + if alloc { + panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) + } + p.iterateRange(startAddr, lowerTop, false, fn) + return + case start < lowerTop && end > lowerTop: + if alloc { + panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) + } + p.iterateRange(startAddr, lowerTop, false, fn) + p.iterateRange(upperBottom, endAddr, false, fn) + return + case start > lowerTop && end < upperBottom: + if alloc { + panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) + } + return + case start > lowerTop && start < upperBottom && end > upperBottom: + if alloc { + panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) + } + p.iterateRange(upperBottom, endAddr, false, fn) + return + } + + for pgdIndex := int((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + pgdEntry := &p.root.PTEs()[pgdIndex] + if !pgdEntry.Valid() { + if !alloc { + // Skip over this entry. + start = next(start, pgdSize) + continue + } + + // Allocate a new pgd. + p.setPageTable(p.root, pgdIndex, p.allocNode()) + } + + // Map the next level. + pudNode := p.getPageTable(p.root, pgdIndex) + clearPUDEntries := 0 + + for pudIndex := int((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + pudEntry := &(pudNode.PTEs()[pudIndex]) + if !pudEntry.Valid() { + if !alloc { + // Skip over this entry. + clearPUDEntries++ + start = next(start, pudSize) + continue + } + + // This level has 1-GB super pages. Is this + // entire region contained in a single PUD + // entry? If so, we can skip allocating a new + // page for the pmd. + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = next(start, pudSize) + continue + } + } + + // Allocate a new pud. + p.setPageTable(pudNode, pudIndex, p.allocNode()) + + } else if pudEntry.IsSuper() { + // Does this page need to be split? + if start&(pudSize-1) != 0 || end < next(start, pudSize) { + currentAddr := uint64(pudEntry.Address()) + writeable := pudEntry.Writeable() + executable := pudEntry.Executable() + user := pudEntry.User() + + // Install the relevant entries. + pmdNode := p.allocNode() + pmdEntries := pmdNode.PTEs() + for index := 0; index < entriesPerPage; index++ { + pmdEntry := &pmdEntries[index] + pmdEntry.SetSuper() + pmdEntry.Set(uintptr(currentAddr), writeable, executable, user) + currentAddr += pmdSize + } + + // Reset to point to the new page. + p.setPageTable(pudNode, pudIndex, pmdNode) + } else { + // A super page to be checked directly. + fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1) + + // Might have been cleared. + if !pudEntry.Valid() { + clearPUDEntries++ + } + + // Note that the super page was changed. + start = next(start, pudSize) + continue + } + } + + // Map the next level, since this is valid. + pmdNode := p.getPageTable(pudNode, pudIndex) + clearPMDEntries := 0 + + for pmdIndex := int((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + pmdEntry := &pmdNode.PTEs()[pmdIndex] + if !pmdEntry.Valid() { + if !alloc { + // Skip over this entry. + clearPMDEntries++ + start = next(start, pmdSize) + continue + } + + // This level has 2-MB huge pages. If this + // region is contined in a single PMD entry? + // As above, we can skip allocating a new page. + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = next(start, pmdSize) + continue + } + } + + // Allocate a new pmd. + p.setPageTable(pmdNode, pmdIndex, p.allocNode()) + + } else if pmdEntry.IsSuper() { + // Does this page need to be split? + if start&(pmdSize-1) != 0 || end < next(start, pmdSize) { + currentAddr := uint64(pmdEntry.Address()) + writeable := pmdEntry.Writeable() + executable := pmdEntry.Executable() + user := pmdEntry.User() + + // Install the relevant entries. + pteNode := p.allocNode() + pteEntries := pteNode.PTEs() + for index := 0; index < entriesPerPage; index++ { + pteEntry := &pteEntries[index] + pteEntry.Set(uintptr(currentAddr), writeable, executable, user) + currentAddr += pteSize + } + + // Reset to point to the new page. + p.setPageTable(pmdNode, pmdIndex, pteNode) + } else { + // A huge page to be checked directly. + fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1) + + // Might have been cleared. + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + // Note that the huge page was changed. + start = next(start, pmdSize) + continue + } + } + + // Map the next level, since this is valid. + pteNode := p.getPageTable(pmdNode, pmdIndex) + clearPTEEntries := 0 + + for pteIndex := int((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + pteEntry := &pteNode.PTEs()[pteIndex] + if !pteEntry.Valid() && !alloc { + clearPTEEntries++ + start += pteSize + continue + } + + // At this point, we are guaranteed that start%pteSize == 0. + fn(uintptr(start), uintptr(start+pteSize), pteEntry, pteSize-1) + if !pteEntry.Valid() { + if alloc { + panic("PTE not set after iteration with alloc=true!") + } + clearPTEEntries++ + } + + // Note that the pte was changed. + start += pteSize + continue + } + + // Check if we no longer need this page. + if clearPTEEntries == entriesPerPage { + p.clearPageTable(pmdNode, pmdIndex) + clearPMDEntries++ + } + } + + // Check if we no longer need this page. + if clearPMDEntries == entriesPerPage { + p.clearPageTable(pudNode, pudIndex) + clearPUDEntries++ + } + } + + // Check if we no longer need this page. + if clearPUDEntries == entriesPerPage { + p.clearPageTable(p.root, pgdIndex) + } + } +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go new file mode 100644 index 000000000..9cbc0e3b0 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go @@ -0,0 +1,161 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pagetables + +import ( + "reflect" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type reflectTranslater struct{} + +func (r reflectTranslater) TranslateToPhysical(ptes *PTEs) uintptr { + return reflect.ValueOf(ptes).Pointer() +} + +type mapping struct { + start uintptr + length uintptr + addr uintptr + writeable bool +} + +func checkMappings(t *testing.T, pt *PageTables, m []mapping) { + var ( + current int + found []mapping + failed string + ) + + // Iterate over all the mappings. + pt.iterateRange(0, ^uintptr(0), false, func(s, e uintptr, pte *PTE, align uintptr) { + found = append(found, mapping{ + start: s, + length: e - s, + addr: pte.Address(), + writeable: pte.Writeable(), + }) + if failed != "" { + // Don't keep looking for errors. + return + } + + if current >= len(m) { + failed = "more mappings than expected" + } else if m[current].start != s { + failed = "start didn't match expected" + } else if m[current].length != (e - s) { + failed = "end didn't match expected" + } else if m[current].addr != pte.Address() { + failed = "address didn't match expected" + } else if m[current].writeable != pte.Writeable() { + failed = "writeable didn't match" + } + current++ + }) + + // Were we expected additional mappings? + if failed == "" && current != len(m) { + failed = "insufficient mappings found" + } + + // Emit a meaningful error message on failure. + if failed != "" { + t.Errorf("%s; got %#v, wanted %#v", failed, found, m) + } +} + +func TestAllocFree(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + pt.Release() +} + +func TestUnmap(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map and unmap one entry. + pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) + pt.Unmap(0x400000, pteSize) + + checkMappings(t, pt, nil) + pt.Release() +} + +func TestReadOnly(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map one entry. + pt.Map(0x400000, pteSize, true, usermem.Read, pteSize*42) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, false}, + }) + pt.Release() +} + +func TestReadWrite(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map one entry. + pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, true}, + }) + pt.Release() +} + +func TestSerialEntries(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map two sequential entries. + pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) + pt.Map(0x401000, pteSize, true, usermem.ReadWrite, pteSize*47) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, true}, + {0x401000, pteSize, pteSize * 47, true}, + }) + pt.Release() +} + +func TestSpanningEntries(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Span a pgd with two pages. + pt.Map(0x00007efffffff000, 2*pteSize, true, usermem.Read, pteSize*42) + + checkMappings(t, pt, []mapping{ + {0x00007efffffff000, pteSize, pteSize * 42, false}, + {0x00007f0000000000, pteSize, pteSize * 43, false}, + }) + pt.Release() +} + +func TestSparseEntries(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map two entries in different pgds. + pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) + pt.Map(0x00007f0000000000, pteSize, true, usermem.Read, pteSize*47) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, true}, + {0x00007f0000000000, pteSize, pteSize * 47, false}, + }) + pt.Release() +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go b/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go new file mode 100644 index 000000000..a2b44fb79 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go @@ -0,0 +1,31 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pagetables + +import ( + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// PTEs returns aligned PTE entries. +func (n *Node) PTEs() *PTEs { + addr := uintptr(unsafe.Pointer(&n.unalignedData[0])) + offset := addr & (usermem.PageSize - 1) + if offset != 0 { + offset = usermem.PageSize - offset + } + return (*PTEs)(unsafe.Pointer(addr + offset)) +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go new file mode 100644 index 000000000..dac66373f --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go @@ -0,0 +1,79 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build i386 amd64 + +package pagetables + +// Opts are pagetable options. +type Opts struct { + EnablePCID bool +} + +// archPageTables has x86-specific features. +type archPageTables struct { + // pcids is the PCID database. + pcids *PCIDs + + // pcid is the globally unique identifier, or zero if none were + // available or pcids is nil. + pcid uint16 +} + +// init initializes arch-specific features. +func (a *archPageTables) init(opts Opts) { + if opts.EnablePCID { + a.pcids = NewPCIDs() + a.pcid = a.pcids.allocate() + } +} + +// initFrom initializes arch-specific features from an existing entry.' +func (a *archPageTables) initFrom(other *archPageTables) { + a.pcids = other.pcids // Refer to the same PCID database. + if a.pcids != nil { + a.pcid = a.pcids.allocate() + } +} + +// release is called from Release. +func (a *archPageTables) release() { + // Return the PCID. + if a.pcids != nil { + a.pcids.free(a.pcid) + } +} + +// CR3 returns the CR3 value for these tables. +// +// This may be called in interrupt contexts. +// +//go:nosplit +func (p *PageTables) CR3() uint64 { + // Bit 63 is set to avoid flushing the PCID (per SDM 4.10.4.1). + const noFlushBit uint64 = 0x8000000000000000 + if p.pcid != 0 { + return noFlushBit | uint64(p.root.physical) | uint64(p.pcid) + } + return uint64(p.root.physical) +} + +// FlushCR3 returns the CR3 value that flushes the TLB. +// +// This may be called in interrupt contexts. +// +//go:nosplit +func (p *PageTables) FlushCR3() uint64 { + return uint64(p.root.physical) | uint64(p.pcid) +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go new file mode 100644 index 000000000..1fc403c48 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go @@ -0,0 +1,79 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build i386 amd64 + +package pagetables + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +func Test2MAnd4K(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map a small page and a huge page. + pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) + pt.Map(0x00007f0000000000, 1<<21, true, usermem.Read, pmdSize*47) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, true}, + {0x00007f0000000000, pmdSize, pmdSize * 47, false}, + }) + pt.Release() +} + +func Test1GAnd4K(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map a small page and a super page. + pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42) + pt.Map(0x00007f0000000000, pudSize, true, usermem.Read, pudSize*47) + + checkMappings(t, pt, []mapping{ + {0x400000, pteSize, pteSize * 42, true}, + {0x00007f0000000000, pudSize, pudSize * 47, false}, + }) + pt.Release() +} + +func TestSplit1GPage(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map a super page and knock out the middle. + pt.Map(0x00007f0000000000, pudSize, true, usermem.Read, pudSize*42) + pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pudSize-(2*pteSize)) + + checkMappings(t, pt, []mapping{ + {0x00007f0000000000, pteSize, pudSize * 42, false}, + {0x00007f0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, false}, + }) + pt.Release() +} + +func TestSplit2MPage(t *testing.T) { + pt := New(reflectTranslater{}, Opts{}) + + // Map a huge page and knock out the middle. + pt.Map(0x00007f0000000000, pmdSize, true, usermem.Read, pmdSize*42) + pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pmdSize-(2*pteSize)) + + checkMappings(t, pt, []mapping{ + {0x00007f0000000000, pteSize, pmdSize * 42, false}, + {0x00007f0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, false}, + }) + pt.Release() +} diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go new file mode 100644 index 000000000..509e8c0d9 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go @@ -0,0 +1,74 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build i386 amd64 + +package pagetables + +import ( + "sync" +) + +// maxPCID is the maximum allowed PCID. +const maxPCID = 4095 + +// PCIDs is a simple PCID database. +type PCIDs struct { + mu sync.Mutex + + // last is the last fresh PCID given out (not including the available + // pool). If last >= maxPCID, then the only PCIDs available in the + // available pool below. + last uint16 + + // available are PCIDs that have been freed. + available map[uint16]struct{} +} + +// NewPCIDs returns a new PCID set. +func NewPCIDs() *PCIDs { + return &PCIDs{ + available: make(map[uint16]struct{}), + } +} + +// allocate returns an unused PCID, or zero if all are taken. +func (p *PCIDs) allocate() uint16 { + p.mu.Lock() + defer p.mu.Unlock() + if len(p.available) > 0 { + for id := range p.available { + delete(p.available, id) + return id + } + } + if id := p.last + 1; id <= maxPCID { + p.last = id + return id + } + // Nothing available. + return 0 +} + +// free returns a PCID to the pool. +// +// It is safe to call free with a zero pcid. That is, you may always call free +// with anything returned by allocate. +func (p *PCIDs) free(id uint16) { + p.mu.Lock() + defer p.mu.Unlock() + if id != 0 { + p.available[id] = struct{}{} + } +} diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go new file mode 100644 index 000000000..0b555cd76 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go @@ -0,0 +1,65 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build i386 amd64 + +package pagetables + +import ( + "testing" +) + +func TestMaxPCID(t *testing.T) { + p := NewPCIDs() + for i := 0; i < maxPCID; i++ { + if id := p.allocate(); id != uint16(i+1) { + t.Errorf("got %d, expected %d", id, i+1) + } + } + if id := p.allocate(); id != 0 { + if id != 0 { + t.Errorf("got %d, expected 0", id) + } + } +} + +func TestFirstPCID(t *testing.T) { + p := NewPCIDs() + if id := p.allocate(); id != 1 { + t.Errorf("got %d, expected 1", id) + } +} + +func TestFreePCID(t *testing.T) { + p := NewPCIDs() + p.free(0) + if id := p.allocate(); id != 1 { + t.Errorf("got %d, expected 1 (not zero)", id) + } +} + +func TestReusePCID(t *testing.T) { + p := NewPCIDs() + id := p.allocate() + if id != 1 { + t.Errorf("got %d, expected 1", id) + } + p.free(id) + if id := p.allocate(); id != 1 { + t.Errorf("got %d, expected 1", id) + } + if id := p.allocate(); id != 2 { + t.Errorf("got %d, expected 2", id) + } +} diff --git a/pkg/sentry/platform/ring0/ring0.go b/pkg/sentry/platform/ring0/ring0.go new file mode 100644 index 000000000..4991031c5 --- /dev/null +++ b/pkg/sentry/platform/ring0/ring0.go @@ -0,0 +1,16 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ring0 provides basic operating system-level stubs. +package ring0 diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go new file mode 100644 index 000000000..e16f6c599 --- /dev/null +++ b/pkg/sentry/platform/ring0/x86.go @@ -0,0 +1,242 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build i386 amd64 + +package ring0 + +import ( + "gvisor.googlesource.com/gvisor/pkg/cpuid" +) + +// Useful bits. +const ( + _CR0_PE = 1 << 0 + _CR0_ET = 1 << 4 + _CR0_PG = 1 << 31 + + _CR4_PSE = 1 << 4 + _CR4_PAE = 1 << 5 + _CR4_PGE = 1 << 7 + _CR4_OSFXSR = 1 << 9 + _CR4_OSXMMEXCPT = 1 << 10 + _CR4_FSGSBASE = 1 << 16 + _CR4_PCIDE = 1 << 17 + _CR4_OSXSAVE = 1 << 18 + _CR4_SMEP = 1 << 20 + + _RFLAGS_AC = 1 << 18 + _RFLAGS_NT = 1 << 14 + _RFLAGS_IOPL = 3 << 12 + _RFLAGS_DF = 1 << 10 + _RFLAGS_IF = 1 << 9 + _RFLAGS_STEP = 1 << 8 + _RFLAGS_RESERVED = 1 << 1 + + _EFER_SCE = 0x001 + _EFER_LME = 0x100 + _EFER_NX = 0x800 + + _MSR_STAR = 0xc0000081 + _MSR_LSTAR = 0xc0000082 + _MSR_CSTAR = 0xc0000083 + _MSR_SYSCALL_MASK = 0xc0000084 +) + +// Vector is an exception vector. +type Vector uintptr + +// Exception vectors. +const ( + DivideByZero Vector = iota + Debug + NMI + Breakpoint + Overflow + BoundRangeExceeded + InvalidOpcode + DeviceNotAvailable + DoubleFault + CoprocessorSegmentOverrun + InvalidTSS + SegmentNotPresent + StackSegmentFault + GeneralProtectionFault + PageFault + _ + X87FloatingPointException + AlignmentCheck + MachineCheck + SIMDFloatingPointException + VirtualizationException + SecurityException = 0x1e + SyscallInt80 = 0x80 + _NR_INTERRUPTS = SyscallInt80 + 1 +) + +// System call vectors. +const ( + Syscall Vector = _NR_INTERRUPTS +) + +// VirtualAddressBits returns the number bits available for virtual addresses. +// +// Note that sign-extension semantics apply to the highest order bit. +// +// FIXME: This should use the cpuid passed to Init. +func VirtualAddressBits() uint32 { + ax, _, _, _ := cpuid.HostID(0x80000008, 0) + return (ax >> 8) & 0xff +} + +// PhysicalAddressBits returns the number of bits available for physical addresses. +// +// FIXME: This should use the cpuid passed to Init. +func PhysicalAddressBits() uint32 { + ax, _, _, _ := cpuid.HostID(0x80000008, 0) + return ax & 0xff +} + +// Selector is a segment Selector. +type Selector uint16 + +// SegmentDescriptor is a segment descriptor. +type SegmentDescriptor struct { + bits [2]uint32 +} + +// descriptorTable is a collection of descriptors. +type descriptorTable [32]SegmentDescriptor + +// SegmentDescriptorFlags are typed flags within a descriptor. +type SegmentDescriptorFlags uint32 + +// SegmentDescriptorFlag declarations. +const ( + SegmentDescriptorAccess SegmentDescriptorFlags = 1 << 8 // Access bit (always set). + SegmentDescriptorWrite = 1 << 9 // Write permission. + SegmentDescriptorExpandDown = 1 << 10 // Grows down, not used. + SegmentDescriptorExecute = 1 << 11 // Execute permission. + SegmentDescriptorSystem = 1 << 12 // Zero => system, 1 => user code/data. + SegmentDescriptorPresent = 1 << 15 // Present. + SegmentDescriptorAVL = 1 << 20 // Available. + SegmentDescriptorLong = 1 << 21 // Long mode. + SegmentDescriptorDB = 1 << 22 // 16 or 32-bit. + SegmentDescriptorG = 1 << 23 // Granularity: page or byte. +) + +// Base returns the descriptor's base linear address. +func (d *SegmentDescriptor) Base() uint32 { + return d.bits[1]&0xFF000000 | (d.bits[1]&0x000000FF)<<16 | d.bits[0]>>16 +} + +// Limit returns the descriptor size. +func (d *SegmentDescriptor) Limit() uint32 { + l := d.bits[0]&0xFFFF | d.bits[1]&0xF0000 + if d.bits[1]&uint32(SegmentDescriptorG) != 0 { + l <<= 12 + l |= 0xFFF + } + return l +} + +// Flags returns descriptor flags. +func (d *SegmentDescriptor) Flags() SegmentDescriptorFlags { + return SegmentDescriptorFlags(d.bits[1] & 0x00F09F00) +} + +// DPL returns the descriptor privilege level. +func (d *SegmentDescriptor) DPL() int { + return int((d.bits[1] >> 13) & 3) +} + +func (d *SegmentDescriptor) setNull() { + d.bits[0] = 0 + d.bits[1] = 0 +} + +func (d *SegmentDescriptor) set(base, limit uint32, dpl int, flags SegmentDescriptorFlags) { + flags |= SegmentDescriptorPresent + if limit>>12 != 0 { + limit >>= 12 + flags |= SegmentDescriptorG + } + d.bits[0] = base<<16 | limit&0xFFFF + d.bits[1] = base&0xFF000000 | (base>>16)&0xFF | limit&0x000F0000 | uint32(flags) | uint32(dpl)<<13 +} + +func (d *SegmentDescriptor) setCode32(base, limit uint32, dpl int) { + d.set(base, limit, dpl, + SegmentDescriptorDB| + SegmentDescriptorExecute| + SegmentDescriptorSystem) +} + +func (d *SegmentDescriptor) setCode64(base, limit uint32, dpl int) { + d.set(base, limit, dpl, + SegmentDescriptorG| + SegmentDescriptorLong| + SegmentDescriptorExecute| + SegmentDescriptorSystem) +} + +func (d *SegmentDescriptor) setData(base, limit uint32, dpl int) { + d.set(base, limit, dpl, + SegmentDescriptorWrite| + SegmentDescriptorSystem) +} + +// setHi is only used for the TSS segment, which is magically 64-bits. +func (d *SegmentDescriptor) setHi(base uint32) { + d.bits[0] = base + d.bits[1] = 0 +} + +// Gate64 is a 64-bit task, trap, or interrupt gate. +type Gate64 struct { + bits [4]uint32 +} + +// idt64 is a 64-bit interrupt descriptor table. +type idt64 [_NR_INTERRUPTS]Gate64 + +func (g *Gate64) setInterrupt(cs Selector, rip uint64, dpl int, ist int) { + g.bits[0] = uint32(cs)<<16 | uint32(rip)&0xFFFF + g.bits[1] = uint32(rip)&0xFFFF0000 | SegmentDescriptorPresent | uint32(dpl)<<13 | 14<<8 | uint32(ist)&0x7 + g.bits[2] = uint32(rip >> 32) +} + +func (g *Gate64) setTrap(cs Selector, rip uint64, dpl int, ist int) { + g.setInterrupt(cs, rip, dpl, ist) + g.bits[1] |= 1 << 8 +} + +// TaskState64 is a 64-bit task state structure. +type TaskState64 struct { + _ uint32 + rsp0Lo, rsp0Hi uint32 + rsp1Lo, rsp1Hi uint32 + rsp2Lo, rsp2Hi uint32 + _ [2]uint32 + ist1Lo, ist1Hi uint32 + ist2Lo, ist2Hi uint32 + ist3Lo, ist3Hi uint32 + ist4Lo, ist4Hi uint32 + ist5Lo, ist5Hi uint32 + ist6Lo, ist6Hi uint32 + ist7Lo, ist7Hi uint32 + _ [2]uint32 + _ uint16 + ioPerm uint16 +} |