| field | value | date |
|---|---|---|
| author | gVisor bot <gvisor-bot@google.com> | 2019-06-02 06:44:55 +0000 |
| committer | gVisor bot <gvisor-bot@google.com> | 2019-06-02 06:44:55 +0000 |
| commit | ceb0d792f328d1fc0692197d8856a43c3936a571 (patch) | |
| tree | 83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/platform/ring0 | |
| parent | deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff) | |
| parent | 216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff) | |
Merge 216da0b7 (automated)
Diffstat (limited to 'pkg/sentry/platform/ring0')
21 files changed, 3579 insertions, 0 deletions
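For orientation before the diff itself, here is a minimal sketch of how the APIs introduced below fit together. This is hypothetical usage, not part of the commit: `pagetables.New` and `cpuid.HostFeatureSet` are assumed from the surrounding packages, and in reality only the KVM platform can drive `SwitchToUser`, since it must execute in guest ring 0.

```go
// Hypothetical orientation sketch (not part of this commit).
package main

import (
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/cpuid"
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
)

func example() {
	// Select feature-dependent helpers (XSAVE vs. FXSAVE, FSGSBASE, ...).
	ring0.Init(cpuid.HostFeatureSet())

	// A kernel needs page tables; the allocator hands out aligned PTE nodes.
	pt := pagetables.New(pagetables.NewRuntimeAllocator())
	k := &ring0.Kernel{}
	k.Init(ring0.KernelOpts{PageTables: pt})

	// Each vCPU carries per-CPU state (GDT, TSS, interrupt stack). Nil hooks
	// install defaultHooks, which halt on kernel syscalls and exceptions.
	c := &ring0.CPU{}
	c.Init(k, nil)

	// Switching to user mode returns the vector that interrupted execution
	// (ring0.Syscall, ring0.PageFault, ...). Only meaningful in guest ring 0.
	var regs syscall.PtraceRegs
	fpState := make([]byte, 4096) // Floating point save area (assumed size).
	vector := c.SwitchToUser(ring0.SwitchOpts{
		Registers:          &regs,
		FloatingPointState: &fpState[0],
		PageTables:         pt, // Application page tables.
	})
	_ = vector
}
```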
diff --git a/pkg/sentry/platform/ring0/defs_impl.go b/pkg/sentry/platform/ring0/defs_impl.go
new file mode 100755
index 000000000..582553bc7
--- /dev/null
+++ b/pkg/sentry/platform/ring0/defs_impl.go
@@ -0,0 +1,538 @@
+package ring0
+
+import (
+	"syscall"
+
+	"fmt"
+	"gvisor.googlesource.com/gvisor/pkg/cpuid"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"io"
+	"reflect"
+)
+
+var (
+	// UserspaceSize is the total size of userspace.
+	UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1)
+
+	// MaximumUserAddress is the largest possible user address.
+	MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1)
+
+	// KernelStartAddress is the starting kernel address.
+	KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
+)
+
+// Kernel is a global kernel object.
+//
+// This contains global state, shared by multiple CPUs.
+type Kernel struct {
+	KernelArchState
+}
+
+// Hooks are hooks for kernel functions.
+type Hooks interface {
+	// KernelSyscall is called for kernel system calls.
+	//
+	// Return from this call will restore registers and return to the kernel: the
+	// registers must be modified directly.
+	//
+	// If this function is not provided, a kernel system call results in halt.
+	//
+	// This must be go:nosplit, as this will be on the interrupt stack.
+	// Closures are permitted, as the pointer to the closure frame is not
+	// passed on the stack.
+	KernelSyscall()
+
+	// KernelException handles an exception during kernel execution.
+	//
+	// Return from this call will restore registers and return to the kernel: the
+	// registers must be modified directly.
+	//
+	// If this function is not provided, a kernel exception results in halt.
+	//
+	// This must be go:nosplit, as this will be on the interrupt stack.
+	// Closures are permitted, as the pointer to the closure frame is not
+	// passed on the stack.
+	KernelException(Vector)
+}
+
+// CPU is the per-CPU struct.
+type CPU struct {
+	// self is a self reference.
+	//
+	// This is always guaranteed to be at offset zero.
+	self *CPU
+
+	// kernel is a reference to the kernel that this CPU was initialized
+	// with. This reference is kept for garbage collection purposes: CPU
+	// registers may refer to objects within the Kernel object that cannot
+	// be safely freed.
+	kernel *Kernel
+
+	// CPUArchState is architecture-specific state.
+	CPUArchState
+
+	// registers is a set of registers; these may be used on kernel system
+	// calls and exceptions via the Registers function.
+	registers syscall.PtraceRegs
+
+	// hooks are kernel hooks.
+	hooks Hooks
+}
+
+// Registers returns a modifiable reference to the kernel registers.
+//
+// This is explicitly safe to call during KernelException and KernelSyscall.
+//
+//go:nosplit
+func (c *CPU) Registers() *syscall.PtraceRegs {
+	return &c.registers
+}
+
+// SwitchOpts are passed to the Switch function.
+type SwitchOpts struct {
+	// Registers are the user register state.
+	Registers *syscall.PtraceRegs
+
+	// FloatingPointState is a byte pointer where floating point state is
+	// saved and restored.
+	FloatingPointState *byte
+
+	// PageTables are the application page tables.
+	PageTables *pagetables.PageTables
+
+	// Flush indicates that a TLB flush should be forced on switch.
+	Flush bool
+
+	// FullRestore indicates that an iret-based restore should be used.
+	FullRestore bool
+
+	// SwitchArchOpts are architecture-specific options.
+	SwitchArchOpts
+}
+
+// Segment indices and Selectors.
+const (
+	// Index into GDT array.
+	_          = iota // Null descriptor first.
+	_                 // Reserved (Linux is kernel 32).
+	segKcode          // Kernel code (64-bit).
+	segKdata          // Kernel data.
+	segUcode32        // User code (32-bit).
+	segUdata          // User data.
+	segUcode64        // User code (64-bit).
+	segTss            // Task segment descriptor.
+	segTssHi          // Upper bits for TSS.
+	segLast           // Last segment (terminal, not included).
+)
+
+// Selectors.
+const (
+	Kcode   Selector = segKcode << 3
+	Kdata   Selector = segKdata << 3
+	Ucode32 Selector = (segUcode32 << 3) | 3
+	Udata   Selector = (segUdata << 3) | 3
+	Ucode64 Selector = (segUcode64 << 3) | 3
+	Tss     Selector = segTss << 3
+)
+
+// Standard segments.
+var (
+	UserCodeSegment32 SegmentDescriptor
+	UserDataSegment   SegmentDescriptor
+	UserCodeSegment64 SegmentDescriptor
+	KernelCodeSegment SegmentDescriptor
+	KernelDataSegment SegmentDescriptor
+)
+
+// KernelOpts has initialization options for the kernel.
+type KernelOpts struct {
+	// PageTables are the kernel pagetables; this must be provided.
+	PageTables *pagetables.PageTables
+}
+
+// KernelArchState contains architecture-specific state.
+type KernelArchState struct {
+	KernelOpts
+
+	// globalIDT is our set of interrupt gates.
+	globalIDT idt64
+}
+
+// CPUArchState contains CPU-specific arch state.
+type CPUArchState struct {
+	// stack is the stack used for interrupts on this CPU.
+	stack [256]byte
+
+	// errorCode is the error code from the last exception.
+	errorCode uintptr
+
+	// errorType indicates the type of the error code above; it is always
+	// set along with the errorCode value.
+	//
+	// It will either be 1, which indicates a user error, or 0, indicating
+	// a kernel error. If ErrorCode (below) returns false (kernel error),
+	// then it cannot provide relevant information about the last
+	// exception.
+	errorType uintptr
+
+	// gdt is the CPU's descriptor table.
+	gdt descriptorTable
+
+	// tss is the CPU's task state.
+	tss TaskState64
+}
+
+// ErrorCode returns the last error code.
+//
+// The returned boolean indicates whether the error code corresponds to the
+// last user error or not. If it does not, then fault information must be
+// ignored. This is generally the result of a kernel fault while servicing a
+// user fault.
+//
+//go:nosplit
+func (c *CPU) ErrorCode() (value uintptr, user bool) {
+	return c.errorCode, c.errorType != 0
+}
+
+// ClearErrorCode resets the error code.
+//
+//go:nosplit
+func (c *CPU) ClearErrorCode() {
+	c.errorCode = 0
+	c.errorType = 1
+}
+
+// SwitchArchOpts are embedded in SwitchOpts.
+type SwitchArchOpts struct {
+	// UserPCID indicates the application PCID to be used on switch,
+	// assuming that PCIDs are supported.
+	//
+	// Per pagetables_x86.go, a zero PCID implies a flush.
+	UserPCID uint16
+
+	// KernelPCID indicates the kernel PCID to be used on return,
+	// assuming that PCIDs are supported.
+	//
+	// Per pagetables_x86.go, a zero PCID implies a flush.
+	KernelPCID uint16
+}
+
+func init() {
+	KernelCodeSegment.setCode64(0, 0, 0)
+	KernelDataSegment.setData(0, 0xffffffff, 0)
+	UserCodeSegment32.setCode64(0, 0, 3)
+	UserDataSegment.setData(0, 0xffffffff, 3)
+	UserCodeSegment64.setCode64(0, 0, 3)
+}
+
+// Emit prints architecture-specific offsets.
+func Emit(w io.Writer) { + fmt.Fprintf(w, "// Automatically generated, do not edit.\n") + + c := &CPU{} + fmt.Fprintf(w, "\n// CPU offsets.\n") + fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack))) + fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) + + fmt.Fprintf(w, "\n// Bits.\n") + fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF) + fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet) + + fmt.Fprintf(w, "\n// Vectors.\n") + fmt.Fprintf(w, "#define DivideByZero 0x%02x\n", DivideByZero) + fmt.Fprintf(w, "#define Debug 0x%02x\n", Debug) + fmt.Fprintf(w, "#define NMI 0x%02x\n", NMI) + fmt.Fprintf(w, "#define Breakpoint 0x%02x\n", Breakpoint) + fmt.Fprintf(w, "#define Overflow 0x%02x\n", Overflow) + fmt.Fprintf(w, "#define BoundRangeExceeded 0x%02x\n", BoundRangeExceeded) + fmt.Fprintf(w, "#define InvalidOpcode 0x%02x\n", InvalidOpcode) + fmt.Fprintf(w, "#define DeviceNotAvailable 0x%02x\n", DeviceNotAvailable) + fmt.Fprintf(w, "#define DoubleFault 0x%02x\n", DoubleFault) + fmt.Fprintf(w, "#define CoprocessorSegmentOverrun 0x%02x\n", CoprocessorSegmentOverrun) + fmt.Fprintf(w, "#define InvalidTSS 0x%02x\n", InvalidTSS) + fmt.Fprintf(w, "#define SegmentNotPresent 0x%02x\n", SegmentNotPresent) + fmt.Fprintf(w, "#define StackSegmentFault 0x%02x\n", StackSegmentFault) + fmt.Fprintf(w, "#define GeneralProtectionFault 0x%02x\n", GeneralProtectionFault) + fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault) + fmt.Fprintf(w, "#define X87FloatingPointException 0x%02x\n", X87FloatingPointException) + fmt.Fprintf(w, "#define AlignmentCheck 0x%02x\n", AlignmentCheck) + fmt.Fprintf(w, "#define MachineCheck 0x%02x\n", MachineCheck) + fmt.Fprintf(w, "#define SIMDFloatingPointException 0x%02x\n", SIMDFloatingPointException) + fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException) + fmt.Fprintf(w, "#define SecurityException 0x%02x\n", SecurityException) + fmt.Fprintf(w, "#define SyscallInt80 0x%02x\n", SyscallInt80) + fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) + + p := &syscall.PtraceRegs{} + fmt.Fprintf(w, "\n// Ptrace registers.\n") + fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.R13).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.R12).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RBP 0x%02x\n", reflect.ValueOf(&p.Rbp).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RBX 0x%02x\n", reflect.ValueOf(&p.Rbx).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.R11).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.R10).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define 
PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.R9).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.R8).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RAX 0x%02x\n", reflect.ValueOf(&p.Rax).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RCX 0x%02x\n", reflect.ValueOf(&p.Rcx).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RDX 0x%02x\n", reflect.ValueOf(&p.Rdx).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RSI 0x%02x\n", reflect.ValueOf(&p.Rsi).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RDI 0x%02x\n", reflect.ValueOf(&p.Rdi).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_ORIGRAX 0x%02x\n", reflect.ValueOf(&p.Orig_rax).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RIP 0x%02x\n", reflect.ValueOf(&p.Rip).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_CS 0x%02x\n", reflect.ValueOf(&p.Cs).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_FS 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_GS 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer()) +} + +// Useful bits. +const ( + _CR0_PE = 1 << 0 + _CR0_ET = 1 << 4 + _CR0_AM = 1 << 18 + _CR0_PG = 1 << 31 + + _CR4_PSE = 1 << 4 + _CR4_PAE = 1 << 5 + _CR4_PGE = 1 << 7 + _CR4_OSFXSR = 1 << 9 + _CR4_OSXMMEXCPT = 1 << 10 + _CR4_FSGSBASE = 1 << 16 + _CR4_PCIDE = 1 << 17 + _CR4_OSXSAVE = 1 << 18 + _CR4_SMEP = 1 << 20 + + _RFLAGS_AC = 1 << 18 + _RFLAGS_NT = 1 << 14 + _RFLAGS_IOPL = 3 << 12 + _RFLAGS_DF = 1 << 10 + _RFLAGS_IF = 1 << 9 + _RFLAGS_STEP = 1 << 8 + _RFLAGS_RESERVED = 1 << 1 + + _EFER_SCE = 0x001 + _EFER_LME = 0x100 + _EFER_LMA = 0x400 + _EFER_NX = 0x800 + + _MSR_STAR = 0xc0000081 + _MSR_LSTAR = 0xc0000082 + _MSR_CSTAR = 0xc0000083 + _MSR_SYSCALL_MASK = 0xc0000084 + _MSR_PLATFORM_INFO = 0xce + _MSR_MISC_FEATURES = 0x140 + + _PLATFORM_INFO_CPUID_FAULT = 1 << 31 + + _MISC_FEATURE_CPUID_TRAP = 0x1 +) + +const ( + // KernelFlagsSet should always be set in the kernel. + KernelFlagsSet = _RFLAGS_RESERVED + + // UserFlagsSet are always set in userspace. + UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF + + // KernelFlagsClear should always be clear in the kernel. + KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT + + // UserFlagsClear are always cleared in userspace. + UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL +) + +// Vector is an exception vector. +type Vector uintptr + +// Exception vectors. +const ( + DivideByZero Vector = iota + Debug + NMI + Breakpoint + Overflow + BoundRangeExceeded + InvalidOpcode + DeviceNotAvailable + DoubleFault + CoprocessorSegmentOverrun + InvalidTSS + SegmentNotPresent + StackSegmentFault + GeneralProtectionFault + PageFault + _ + X87FloatingPointException + AlignmentCheck + MachineCheck + SIMDFloatingPointException + VirtualizationException + SecurityException = 0x1e + SyscallInt80 = 0x80 + _NR_INTERRUPTS = SyscallInt80 + 1 +) + +// System call vectors. 
+const (
+	Syscall Vector = _NR_INTERRUPTS
+)
+
+// VirtualAddressBits returns the number of bits available for virtual addresses.
+//
+// Note that sign-extension semantics apply to the highest order bit.
+//
+// FIXME(b/69382326): This should use the cpuid passed to Init.
+func VirtualAddressBits() uint32 {
+	ax, _, _, _ := cpuid.HostID(0x80000008, 0)
+	return (ax >> 8) & 0xff
+}
+
+// PhysicalAddressBits returns the number of bits available for physical addresses.
+//
+// FIXME(b/69382326): This should use the cpuid passed to Init.
+func PhysicalAddressBits() uint32 {
+	ax, _, _, _ := cpuid.HostID(0x80000008, 0)
+	return ax & 0xff
+}
+
+// Selector is a segment selector.
+type Selector uint16
+
+// SegmentDescriptor is a segment descriptor.
+type SegmentDescriptor struct {
+	bits [2]uint32
+}
+
+// descriptorTable is a collection of descriptors.
+type descriptorTable [32]SegmentDescriptor
+
+// SegmentDescriptorFlags are typed flags within a descriptor.
+type SegmentDescriptorFlags uint32
+
+// SegmentDescriptorFlag declarations.
+const (
+	SegmentDescriptorAccess     SegmentDescriptorFlags = 1 << 8  // Access bit (always set).
+	SegmentDescriptorWrite                             = 1 << 9  // Write permission.
+	SegmentDescriptorExpandDown                        = 1 << 10 // Grows down, not used.
+	SegmentDescriptorExecute                           = 1 << 11 // Execute permission.
+	SegmentDescriptorSystem                            = 1 << 12 // Zero => system, 1 => user code/data.
+	SegmentDescriptorPresent                           = 1 << 15 // Present.
+	SegmentDescriptorAVL                               = 1 << 20 // Available.
+	SegmentDescriptorLong                              = 1 << 21 // Long mode.
+	SegmentDescriptorDB                                = 1 << 22 // 16 or 32-bit.
+	SegmentDescriptorG                                 = 1 << 23 // Granularity: page or byte.
+)
+
+// Base returns the descriptor's base linear address.
+func (d *SegmentDescriptor) Base() uint32 {
+	return d.bits[1]&0xFF000000 | (d.bits[1]&0x000000FF)<<16 | d.bits[0]>>16
+}
+
+// Limit returns the descriptor size.
+func (d *SegmentDescriptor) Limit() uint32 {
+	l := d.bits[0]&0xFFFF | d.bits[1]&0xF0000
+	if d.bits[1]&uint32(SegmentDescriptorG) != 0 {
+		l <<= 12
+		l |= 0xFFF
+	}
+	return l
+}
+
+// Flags returns descriptor flags.
+func (d *SegmentDescriptor) Flags() SegmentDescriptorFlags {
+	return SegmentDescriptorFlags(d.bits[1] & 0x00F09F00)
+}
+
+// DPL returns the descriptor privilege level.
+func (d *SegmentDescriptor) DPL() int {
+	return int((d.bits[1] >> 13) & 3)
+}
+
+func (d *SegmentDescriptor) setNull() {
+	d.bits[0] = 0
+	d.bits[1] = 0
+}
+
+func (d *SegmentDescriptor) set(base, limit uint32, dpl int, flags SegmentDescriptorFlags) {
+	flags |= SegmentDescriptorPresent
+	if limit>>12 != 0 {
+		limit >>= 12
+		flags |= SegmentDescriptorG
+	}
+	d.bits[0] = base<<16 | limit&0xFFFF
+	d.bits[1] = base&0xFF000000 | (base>>16)&0xFF | limit&0x000F0000 | uint32(flags) | uint32(dpl)<<13
+}
+
+func (d *SegmentDescriptor) setCode32(base, limit uint32, dpl int) {
+	d.set(base, limit, dpl,
+		SegmentDescriptorDB|
+			SegmentDescriptorExecute|
+			SegmentDescriptorSystem)
+}
+
+func (d *SegmentDescriptor) setCode64(base, limit uint32, dpl int) {
+	d.set(base, limit, dpl,
+		SegmentDescriptorG|
+			SegmentDescriptorLong|
+			SegmentDescriptorExecute|
+			SegmentDescriptorSystem)
+}
+
+func (d *SegmentDescriptor) setData(base, limit uint32, dpl int) {
+	d.set(base, limit, dpl,
+		SegmentDescriptorWrite|
+			SegmentDescriptorSystem)
+}
+
+// setHi is only used for the TSS segment, which is magically 64 bits.
+func (d *SegmentDescriptor) setHi(base uint32) {
+	d.bits[0] = base
+	d.bits[1] = 0
+}
+
+// Gate64 is a 64-bit task, trap, or interrupt gate.
+type Gate64 struct { + bits [4]uint32 +} + +// idt64 is a 64-bit interrupt descriptor table. +type idt64 [_NR_INTERRUPTS]Gate64 + +func (g *Gate64) setInterrupt(cs Selector, rip uint64, dpl int, ist int) { + g.bits[0] = uint32(cs)<<16 | uint32(rip)&0xFFFF + g.bits[1] = uint32(rip)&0xFFFF0000 | SegmentDescriptorPresent | uint32(dpl)<<13 | 14<<8 | uint32(ist)&0x7 + g.bits[2] = uint32(rip >> 32) +} + +func (g *Gate64) setTrap(cs Selector, rip uint64, dpl int, ist int) { + g.setInterrupt(cs, rip, dpl, ist) + g.bits[1] |= 1 << 8 +} + +// TaskState64 is a 64-bit task state structure. +type TaskState64 struct { + _ uint32 + rsp0Lo, rsp0Hi uint32 + rsp1Lo, rsp1Hi uint32 + rsp2Lo, rsp2Hi uint32 + _ [2]uint32 + ist1Lo, ist1Hi uint32 + ist2Lo, ist2Hi uint32 + ist3Lo, ist3Hi uint32 + ist4Lo, ist4Hi uint32 + ist5Lo, ist5Hi uint32 + ist6Lo, ist6Hi uint32 + ist7Lo, ist7Hi uint32 + _ [2]uint32 + _ uint16 + ioPerm uint16 +} diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go new file mode 100644 index 000000000..a5ce67885 --- /dev/null +++ b/pkg/sentry/platform/ring0/entry_amd64.go @@ -0,0 +1,128 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package ring0 + +import ( + "syscall" +) + +// This is an assembly function. +// +// The sysenter function is invoked in two situations: +// +// (1) The guest kernel has executed a system call. +// (2) The guest application has executed a system call. +// +// The interrupt flag is examined to determine whether the system call was +// executed from kernel mode or not and the appropriate stub is called. +func sysenter() + +// swapgs swaps the current GS value. +// +// This must be called prior to sysret/iret. +func swapgs() + +// sysret returns to userspace from a system call. +// +// The return code is the vector that interrupted execution. +// +// See stubs.go for a note regarding the frame size of this function. +func sysret(*CPU, *syscall.PtraceRegs) Vector + +// "iret is the cadillac of CPL switching." +// +// -- Neel Natu +// +// iret is nearly identical to sysret, except an iret is used to fully restore +// all user state. This must be called in cases where all registers need to be +// restored. +func iret(*CPU, *syscall.PtraceRegs) Vector + +// exception is the generic exception entry. +// +// This is called by the individual stub definitions. +func exception() + +// resume is a stub that restores the CPU kernel registers. +// +// This is used when processing kernel exceptions and syscalls. +func resume() + +// Start is the CPU entrypoint. +// +// The following start conditions must be satisfied: +// +// * AX should contain the CPU pointer. +// * c.GDT() should be loaded as the GDT. +// * c.IDT() should be loaded as the IDT. +// * c.CR0() should be the current CR0 value. +// * c.CR3() should be set to the kernel PageTables. +// * c.CR4() should be the current CR4 value. +// * c.EFER() should be the current EFER value. 
+// +// The CPU state will be set to c.Registers(). +func Start() + +// Exception stubs. +func divideByZero() +func debug() +func nmi() +func breakpoint() +func overflow() +func boundRangeExceeded() +func invalidOpcode() +func deviceNotAvailable() +func doubleFault() +func coprocessorSegmentOverrun() +func invalidTSS() +func segmentNotPresent() +func stackSegmentFault() +func generalProtectionFault() +func pageFault() +func x87FloatingPointException() +func alignmentCheck() +func machineCheck() +func simdFloatingPointException() +func virtualizationException() +func securityException() +func syscallInt80() + +// Exception handler index. +var handlers = map[Vector]func(){ + DivideByZero: divideByZero, + Debug: debug, + NMI: nmi, + Breakpoint: breakpoint, + Overflow: overflow, + BoundRangeExceeded: boundRangeExceeded, + InvalidOpcode: invalidOpcode, + DeviceNotAvailable: deviceNotAvailable, + DoubleFault: doubleFault, + CoprocessorSegmentOverrun: coprocessorSegmentOverrun, + InvalidTSS: invalidTSS, + SegmentNotPresent: segmentNotPresent, + StackSegmentFault: stackSegmentFault, + GeneralProtectionFault: generalProtectionFault, + PageFault: pageFault, + X87FloatingPointException: x87FloatingPointException, + AlignmentCheck: alignmentCheck, + MachineCheck: machineCheck, + SIMDFloatingPointException: simdFloatingPointException, + VirtualizationException: virtualizationException, + SecurityException: securityException, + SyscallInt80: syscallInt80, +} diff --git a/pkg/sentry/platform/ring0/entry_impl_amd64.s b/pkg/sentry/platform/ring0/entry_impl_amd64.s new file mode 100755 index 000000000..d082d06a9 --- /dev/null +++ b/pkg/sentry/platform/ring0/entry_impl_amd64.s @@ -0,0 +1,383 @@ +// build +amd64 + +// Automatically generated, do not edit. + +// CPU offsets. +#define CPU_SELF 0x00 +#define CPU_REGISTERS 0x288 +#define CPU_STACK_TOP 0x110 +#define CPU_ERROR_CODE 0x110 +#define CPU_ERROR_TYPE 0x118 + +// Bits. +#define _RFLAGS_IF 0x200 +#define _KERNEL_FLAGS 0x02 + +// Vectors. +#define DivideByZero 0x00 +#define Debug 0x01 +#define NMI 0x02 +#define Breakpoint 0x03 +#define Overflow 0x04 +#define BoundRangeExceeded 0x05 +#define InvalidOpcode 0x06 +#define DeviceNotAvailable 0x07 +#define DoubleFault 0x08 +#define CoprocessorSegmentOverrun 0x09 +#define InvalidTSS 0x0a +#define SegmentNotPresent 0x0b +#define StackSegmentFault 0x0c +#define GeneralProtectionFault 0x0d +#define PageFault 0x0e +#define X87FloatingPointException 0x10 +#define AlignmentCheck 0x11 +#define MachineCheck 0x12 +#define SIMDFloatingPointException 0x13 +#define VirtualizationException 0x14 +#define SecurityException 0x1e +#define SyscallInt80 0x80 +#define Syscall 0x81 + +// Ptrace registers. +#define PTRACE_R15 0x00 +#define PTRACE_R14 0x08 +#define PTRACE_R13 0x10 +#define PTRACE_R12 0x18 +#define PTRACE_RBP 0x20 +#define PTRACE_RBX 0x28 +#define PTRACE_R11 0x30 +#define PTRACE_R10 0x38 +#define PTRACE_R9 0x40 +#define PTRACE_R8 0x48 +#define PTRACE_RAX 0x50 +#define PTRACE_RCX 0x58 +#define PTRACE_RDX 0x60 +#define PTRACE_RSI 0x68 +#define PTRACE_RDI 0x70 +#define PTRACE_ORIGRAX 0x78 +#define PTRACE_RIP 0x80 +#define PTRACE_CS 0x88 +#define PTRACE_FLAGS 0x90 +#define PTRACE_RSP 0x98 +#define PTRACE_SS 0xa0 +#define PTRACE_FS 0xa8 +#define PTRACE_GS 0xb0 +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+// NB: Offsets are programmatically generated (see BUILD).
+//
+// This file is concatenated with the definitions.
+
+// Saves a register set.
+//
+// This is a macro because it may need to be executed in contexts where a
+// stack is not available for calls.
+//
+// The following registers are not saved: AX, SP, IP, FLAGS, all segments.
+#define REGISTERS_SAVE(reg, offset) \
+	MOVQ R15, offset+PTRACE_R15(reg); \
+	MOVQ R14, offset+PTRACE_R14(reg); \
+	MOVQ R13, offset+PTRACE_R13(reg); \
+	MOVQ R12, offset+PTRACE_R12(reg); \
+	MOVQ BP,  offset+PTRACE_RBP(reg); \
+	MOVQ BX,  offset+PTRACE_RBX(reg); \
+	MOVQ CX,  offset+PTRACE_RCX(reg); \
+	MOVQ DX,  offset+PTRACE_RDX(reg); \
+	MOVQ R11, offset+PTRACE_R11(reg); \
+	MOVQ R10, offset+PTRACE_R10(reg); \
+	MOVQ R9,  offset+PTRACE_R9(reg); \
+	MOVQ R8,  offset+PTRACE_R8(reg); \
+	MOVQ SI,  offset+PTRACE_RSI(reg); \
+	MOVQ DI,  offset+PTRACE_RDI(reg);
+
+// Loads a register set.
+//
+// This is a macro because it may need to be executed in contexts where a
+// stack is not available for calls.
+//
+// The following registers are not loaded: AX, SP, IP, FLAGS, all segments.
+#define REGISTERS_LOAD(reg, offset) \
+	MOVQ offset+PTRACE_R15(reg), R15; \
+	MOVQ offset+PTRACE_R14(reg), R14; \
+	MOVQ offset+PTRACE_R13(reg), R13; \
+	MOVQ offset+PTRACE_R12(reg), R12; \
+	MOVQ offset+PTRACE_RBP(reg), BP; \
+	MOVQ offset+PTRACE_RBX(reg), BX; \
+	MOVQ offset+PTRACE_RCX(reg), CX; \
+	MOVQ offset+PTRACE_RDX(reg), DX; \
+	MOVQ offset+PTRACE_R11(reg), R11; \
+	MOVQ offset+PTRACE_R10(reg), R10; \
+	MOVQ offset+PTRACE_R9(reg), R9; \
+	MOVQ offset+PTRACE_R8(reg), R8; \
+	MOVQ offset+PTRACE_RSI(reg), SI; \
+	MOVQ offset+PTRACE_RDI(reg), DI;
+
+// SWAP_GS swaps the kernel GS (CPU).
+#define SWAP_GS() \
+	BYTE $0x0F; BYTE $0x01; BYTE $0xf8;
+
+// IRET returns from an interrupt frame.
+#define IRET() \
+	BYTE $0x48; BYTE $0xcf;
+
+// SYSRET64 executes the sysret instruction.
+#define SYSRET64() \
+	BYTE $0x48; BYTE $0x0f; BYTE $0x07;
+
+// LOAD_KERNEL_ADDRESS loads a kernel address.
+#define LOAD_KERNEL_ADDRESS(from, to) \
+	MOVQ from, to; \
+	ORQ ·KernelStartAddress(SB), to;
+
+// LOAD_KERNEL_STACK loads the kernel stack.
+#define LOAD_KERNEL_STACK(from) \
+	LOAD_KERNEL_ADDRESS(CPU_SELF(from), SP); \
+	LEAQ CPU_STACK_TOP(SP), SP;
+
+// See kernel.go.
+TEXT ·Halt(SB),NOSPLIT,$0
+	HLT
+	RET
+
+// See entry_amd64.go.
+TEXT ·swapgs(SB),NOSPLIT,$0
+	SWAP_GS()
+	RET
+
+// See entry_amd64.go.
+TEXT ·sysret(SB),NOSPLIT,$0-24
+	// Save original state.
+	LOAD_KERNEL_ADDRESS(cpu+0(FP), BX)
+	LOAD_KERNEL_ADDRESS(regs+8(FP), AX)
+	MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX)
+	MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX)
+	MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX)
+
+	// Restore user register state.
+	REGISTERS_LOAD(AX, 0)
+	MOVQ PTRACE_RIP(AX), CX    // Needed for SYSRET.
+	MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET.
+	MOVQ PTRACE_RSP(AX), SP    // Restore the stack directly.
+	MOVQ PTRACE_RAX(AX), AX    // Restore AX (scratch).
+	SYSRET64()
+
+// See entry_amd64.go.
+TEXT ·iret(SB),NOSPLIT,$0-24
+	// Save original state.
+ LOAD_KERNEL_ADDRESS(cpu+0(FP), BX) + LOAD_KERNEL_ADDRESS(regs+8(FP), AX) + MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) + MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) + MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) + + // Build an IRET frame & restore state. + LOAD_KERNEL_STACK(BX) + MOVQ PTRACE_SS(AX), BX; PUSHQ BX + MOVQ PTRACE_RSP(AX), CX; PUSHQ CX + MOVQ PTRACE_FLAGS(AX), DX; PUSHQ DX + MOVQ PTRACE_CS(AX), DI; PUSHQ DI + MOVQ PTRACE_RIP(AX), SI; PUSHQ SI + REGISTERS_LOAD(AX, 0) // Restore most registers. + MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch). + IRET() + +// See entry_amd64.go. +TEXT ·resume(SB),NOSPLIT,$0 + // See iret, above. + MOVQ CPU_REGISTERS+PTRACE_SS(GS), BX; PUSHQ BX + MOVQ CPU_REGISTERS+PTRACE_RSP(GS), CX; PUSHQ CX + MOVQ CPU_REGISTERS+PTRACE_FLAGS(GS), DX; PUSHQ DX + MOVQ CPU_REGISTERS+PTRACE_CS(GS), DI; PUSHQ DI + MOVQ CPU_REGISTERS+PTRACE_RIP(GS), SI; PUSHQ SI + REGISTERS_LOAD(GS, CPU_REGISTERS) + MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX + IRET() + +// See entry_amd64.go. +TEXT ·Start(SB),NOSPLIT,$0 + LOAD_KERNEL_STACK(AX) // Set the stack. + PUSHQ $0x0 // Previous frame pointer. + MOVQ SP, BP // Set frame pointer. + PUSHQ AX // First argument (CPU). + CALL ·start(SB) // Call Go hook. + JMP ·resume(SB) // Restore to registers. + +// See entry_amd64.go. +TEXT ·sysenter(SB),NOSPLIT,$0 + // Interrupts are always disabled while we're executing in kernel mode + // and always enabled while executing in user mode. Therefore, we can + // reliably look at the flags in R11 to determine where this syscall + // was from. + TESTL $_RFLAGS_IF, R11 + JZ kernel + +user: + SWAP_GS() + XCHGQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Swap stacks. + XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs). + REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. + MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value. + MOVQ BX, PTRACE_RAX(AX) // Save everything else. + MOVQ BX, PTRACE_ORIGRAX(AX) + MOVQ CX, PTRACE_RIP(AX) + MOVQ R11, PTRACE_FLAGS(AX) + MOVQ CPU_REGISTERS+PTRACE_RSP(GS), BX; MOVQ BX, PTRACE_RSP(AX) + MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code. + MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user. + + // Return to the kernel, where the frame is: + // + // vector (sp+24) + // regs (sp+16) + // cpu (sp+8) + // vcpu.Switch (sp+0) + // + MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer. + MOVQ $Syscall, 24(SP) // Output vector. + RET + +kernel: + // We can't restore the original stack, but we can access the registers + // in the CPU state directly. No need for temporary juggling. + MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS) + MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS) + REGISTERS_SAVE(GS, CPU_REGISTERS) + MOVQ CX, CPU_REGISTERS+PTRACE_RIP(GS) + MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(GS) + MOVQ SP, CPU_REGISTERS+PTRACE_RSP(GS) + MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code. + MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel. + + // Call the syscall trampoline. + LOAD_KERNEL_STACK(GS) + MOVQ CPU_SELF(GS), AX // Load vCPU. + PUSHQ AX // First argument (vCPU). + CALL ·kernelSyscall(SB) // Call the trampoline. + POPQ AX // Pop vCPU. + JMP ·resume(SB) + +// exception is a generic exception handler. +// +// There are two cases handled: +// +// 1) An exception in kernel mode: this results in saving the state at the time +// of the exception and calling the defined hook. +// +// 2) An exception in guest mode: the original kernel frame is restored, and +// the vector & error codes are pushed as return values. +// +// See below for the stubs that call exception. 
+TEXT ·exception(SB),NOSPLIT,$0 + // Determine whether the exception occurred in kernel mode or user + // mode, based on the flags. We expect the following stack: + // + // SS (sp+48) + // SP (sp+40) + // FLAGS (sp+32) + // CS (sp+24) + // IP (sp+16) + // ERROR_CODE (sp+8) + // VECTOR (sp+0) + // + TESTL $_RFLAGS_IF, 32(SP) + JZ kernel + +user: + SWAP_GS() + ADDQ $-8, SP // Adjust for flags. + MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ). + XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for user regs. + REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. + MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Restore original AX. + MOVQ BX, PTRACE_RAX(AX) // Save it. + MOVQ BX, PTRACE_ORIGRAX(AX) + MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX) + MOVQ 24(SP), CX; MOVQ CX, PTRACE_CS(AX) + MOVQ 32(SP), DX; MOVQ DX, PTRACE_FLAGS(AX) + MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX) + MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX) + + // Copy out and return. + MOVQ 0(SP), BX // Load vector. + MOVQ 8(SP), CX // Load error code. + MOVQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Original stack (kernel version). + MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer. + MOVQ CX, CPU_ERROR_CODE(GS) // Set error code. + MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user. + MOVQ BX, 24(SP) // Output vector. + RET + +kernel: + // As per above, we can save directly. + MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS) + MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS) + REGISTERS_SAVE(GS, CPU_REGISTERS) + MOVQ 16(SP), AX; MOVQ AX, CPU_REGISTERS+PTRACE_RIP(GS) + MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(GS) + MOVQ 40(SP), CX; MOVQ CX, CPU_REGISTERS+PTRACE_RSP(GS) + + // Set the error code and adjust the stack. + MOVQ 8(SP), AX // Load the error code. + MOVQ AX, CPU_ERROR_CODE(GS) // Copy out to the CPU. + MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel. + MOVQ 0(SP), BX // BX contains the vector. + ADDQ $48, SP // Drop the exception frame. + + // Call the exception trampoline. + LOAD_KERNEL_STACK(GS) + MOVQ CPU_SELF(GS), AX // Load vCPU. + PUSHQ BX // Second argument (vector). + PUSHQ AX // First argument (vCPU). + CALL ·kernelException(SB) // Call the trampoline. + POPQ BX // Pop vector. + POPQ AX // Pop vCPU. 
+ JMP ·resume(SB) + +#define EXCEPTION_WITH_ERROR(value, symbol) \ +TEXT symbol,NOSPLIT,$0; \ + PUSHQ $value; \ + JMP ·exception(SB); + +#define EXCEPTION_WITHOUT_ERROR(value, symbol) \ +TEXT symbol,NOSPLIT,$0; \ + PUSHQ $0x0; \ + PUSHQ $value; \ + JMP ·exception(SB); + +EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB)) +EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB)) +EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB)) +EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB)) +EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB)) +EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB)) +EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB)) +EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB)) +EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB)) +EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB)) +EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB)) +EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB)) +EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB)) +EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB)) +EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB)) +EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB)) +EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB)) +EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB)) +EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB)) +EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB)) +EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB)) +EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB)) diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go new file mode 100644 index 000000000..900c0bba7 --- /dev/null +++ b/pkg/sentry/platform/ring0/kernel.go @@ -0,0 +1,66 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ring0 + +// Init initializes a new kernel. +// +// N.B. that constraints on KernelOpts must be satisfied. +// +//go:nosplit +func (k *Kernel) Init(opts KernelOpts) { + k.init(opts) +} + +// Halt halts execution. +func Halt() + +// defaultHooks implements hooks. +type defaultHooks struct{} + +// KernelSyscall implements Hooks.KernelSyscall. +// +//go:nosplit +func (defaultHooks) KernelSyscall() { Halt() } + +// KernelException implements Hooks.KernelException. +// +//go:nosplit +func (defaultHooks) KernelException(Vector) { Halt() } + +// kernelSyscall is a trampoline. +// +//go:nosplit +func kernelSyscall(c *CPU) { c.hooks.KernelSyscall() } + +// kernelException is a trampoline. +// +//go:nosplit +func kernelException(c *CPU, vector Vector) { c.hooks.KernelException(vector) } + +// Init initializes a new CPU. +// +// Init allows embedding in other objects. +func (c *CPU) Init(k *Kernel, hooks Hooks) { + c.self = c // Set self reference. + c.kernel = k // Set kernel reference. + c.init() // Perform architectural init. + + // Require hooks. 
+	if hooks != nil {
+		c.hooks = hooks
+	} else {
+		c.hooks = defaultHooks{}
+	}
+}
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
new file mode 100644
index 000000000..3577b5127
--- /dev/null
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -0,0 +1,271 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package ring0
+
+import (
+	"encoding/binary"
+)
+
+// init initializes architecture-specific state.
+func (k *Kernel) init(opts KernelOpts) {
+	// Save the root page tables.
+	k.PageTables = opts.PageTables
+
+	// Set up the IDT, which is uniform.
+	for v, handler := range handlers {
+		// Allow Breakpoint and Overflow to be called from all
+		// privilege levels.
+		dpl := 0
+		if v == Breakpoint || v == Overflow {
+			dpl = 3
+		}
+		// Note that we set all traps to use the interrupt stack; this
+		// is defined below when setting up the TSS.
+		k.globalIDT[v].setInterrupt(Kcode, uint64(kernelFunc(handler)), dpl, 1 /* ist */)
+	}
+}
+
+// init initializes architecture-specific state.
+func (c *CPU) init() {
+	// Null segment.
+	c.gdt[0].setNull()
+
+	// Kernel & user segments.
+	c.gdt[segKcode] = KernelCodeSegment
+	c.gdt[segKdata] = KernelDataSegment
+	c.gdt[segUcode32] = UserCodeSegment32
+	c.gdt[segUdata] = UserDataSegment
+	c.gdt[segUcode64] = UserCodeSegment64
+
+	// The task segment; this spans two entries.
+	tssBase, tssLimit, _ := c.TSS()
+	c.gdt[segTss].set(
+		uint32(tssBase),
+		uint32(tssLimit),
+		0, // Privilege level zero.
+		SegmentDescriptorPresent|
+			SegmentDescriptorAccess|
+			SegmentDescriptorWrite|
+			SegmentDescriptorExecute)
+	c.gdt[segTssHi].setHi(uint32((tssBase) >> 32))
+
+	// Set the kernel stack pointer in the TSS (virtual address).
+	stackAddr := c.StackTop()
+	c.tss.rsp0Lo = uint32(stackAddr)
+	c.tss.rsp0Hi = uint32(stackAddr >> 32)
+	c.tss.ist1Lo = uint32(stackAddr)
+	c.tss.ist1Hi = uint32(stackAddr >> 32)
+
+	// Permanently set the kernel segments.
+	c.registers.Cs = uint64(Kcode)
+	c.registers.Ds = uint64(Kdata)
+	c.registers.Es = uint64(Kdata)
+	c.registers.Ss = uint64(Kdata)
+	c.registers.Fs = uint64(Kdata)
+	c.registers.Gs = uint64(Kdata)
+
+	// Set mandatory flags.
+	c.registers.Eflags = KernelFlagsSet
+}
+
+// StackTop returns the kernel's stack address.
+//
+//go:nosplit
+func (c *CPU) StackTop() uint64 {
+	return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack))
+}
+
+// IDT returns the CPU's IDT base and limit.
+//
+//go:nosplit
+func (c *CPU) IDT() (uint64, uint16) {
+	return uint64(kernelAddr(&c.kernel.globalIDT[0])), uint16(binary.Size(&c.kernel.globalIDT) - 1)
+}
+
+// GDT returns the CPU's GDT base and limit.
+//
+//go:nosplit
+func (c *CPU) GDT() (uint64, uint16) {
+	return uint64(kernelAddr(&c.gdt[0])), uint16(8*segLast - 1)
+}
+
+// TSS returns the CPU's TSS base, limit and value.
+// +//go:nosplit +func (c *CPU) TSS() (uint64, uint16, *SegmentDescriptor) { + return uint64(kernelAddr(&c.tss)), uint16(binary.Size(&c.tss) - 1), &c.gdt[segTss] +} + +// CR0 returns the CPU's CR0 value. +// +//go:nosplit +func (c *CPU) CR0() uint64 { + return _CR0_PE | _CR0_PG | _CR0_AM | _CR0_ET +} + +// CR4 returns the CPU's CR4 value. +// +//go:nosplit +func (c *CPU) CR4() uint64 { + cr4 := uint64(_CR4_PAE | _CR4_PSE | _CR4_OSFXSR | _CR4_OSXMMEXCPT) + if hasPCID { + cr4 |= _CR4_PCIDE + } + if hasXSAVE { + cr4 |= _CR4_OSXSAVE + } + if hasSMEP { + cr4 |= _CR4_SMEP + } + if hasFSGSBASE { + cr4 |= _CR4_FSGSBASE + } + return cr4 +} + +// EFER returns the CPU's EFER value. +// +//go:nosplit +func (c *CPU) EFER() uint64 { + return _EFER_LME | _EFER_LMA | _EFER_SCE | _EFER_NX +} + +// IsCanonical indicates whether addr is canonical per the amd64 spec. +// +//go:nosplit +func IsCanonical(addr uint64) bool { + return addr <= 0x00007fffffffffff || addr > 0xffff800000000000 +} + +// SwitchToUser performs either a sysret or an iret. +// +// The return value is the vector that interrupted execution. +// +// This function will not split the stack. Callers will probably want to call +// runtime.entersyscall (and pair with a call to runtime.exitsyscall) prior to +// calling this function. +// +// When this is done, this region is quite sensitive to things like system +// calls. After calling entersyscall, any memory used must have been allocated +// and no function calls without go:nosplit are permitted. Any calls made here +// are protected appropriately (e.g. IsCanonical and CR3). +// +// Also note that this function transitively depends on the compiler generating +// code that uses IP-relative addressing inside of absolute addresses. That's +// the case for amd64, but may not be the case for other architectures. +// +// Precondition: the Rip, Rsp, Fs and Gs registers must be canonical. +// +//go:nosplit +func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { + userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID) + kernelCR3 := c.kernel.PageTables.CR3(true, switchOpts.KernelPCID) + + // Sanitize registers. + regs := switchOpts.Registers + regs.Eflags &= ^uint64(UserFlagsClear) + regs.Eflags |= UserFlagsSet + regs.Cs = uint64(Ucode64) // Required for iret. + regs.Ss = uint64(Udata) // Ditto. + + // Perform the switch. + swapgs() // GS will be swapped on return. + WriteFS(uintptr(regs.Fs_base)) // Set application FS. + WriteGS(uintptr(regs.Gs_base)) // Set application GS. + LoadFloatingPoint(switchOpts.FloatingPointState) // Copy in floating point. + jumpToKernel() // Switch to upper half. + writeCR3(uintptr(userCR3)) // Change to user address space. + if switchOpts.FullRestore { + vector = iret(c, regs) + } else { + vector = sysret(c, regs) + } + writeCR3(uintptr(kernelCR3)) // Return to kernel address space. + jumpToUser() // Return to lower half. + SaveFloatingPoint(switchOpts.FloatingPointState) // Copy out floating point. + WriteFS(uintptr(c.registers.Fs_base)) // Restore kernel FS. + return +} + +// start is the CPU entrypoint. +// +// This is called from the Start asm stub (see entry_amd64.go); on return the +// registers in c.registers will be restored (not segments). +// +//go:nosplit +func start(c *CPU) { + // Save per-cpu & FS segment. + WriteGS(kernelAddr(c)) + WriteFS(uintptr(c.registers.Fs_base)) + + // Initialize floating point. + // + // Note that on skylake, the valid XCR0 mask reported seems to be 0xff. 
+	// This breaks down as:
+	//
+	//	bit0   - x87
+	//	bit1   - SSE
+	//	bit2   - AVX
+	//	bit3-4 - MPX
+	//	bit5-7 - AVX512
+	//
+	// For some reason, enabling MPX & AVX512 on platforms that report them
+	// seems to cause a general protection fault. (Maybe there are some
+	// virtualization issues and these aren't exported to the guest cpuid.)
+	// This needs further investigation, but we can limit the floating
+	// point operations to x87, SSE & AVX for now.
+	fninit()
+	xsetbv(0, validXCR0Mask&0x7)
+
+	// Set the syscall target.
+	wrmsr(_MSR_LSTAR, kernelFunc(sysenter))
+	wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF)
+
+	// NOTE: This depends on having the 64-bit segments immediately
+	// following the 32-bit user segments. This is simply the way the
+	// sysret instruction is designed to work (it assumes they follow).
+	wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48))
+	wrmsr(_MSR_CSTAR, kernelFunc(sysenter))
+}
+
+// SetCPUIDFaulting sets CPUID faulting per the boolean value.
+//
+// True is returned if faulting could be set.
+//
+//go:nosplit
+func SetCPUIDFaulting(on bool) bool {
+	// Per the SDM (Vol 3, Table 2-43), PLATFORM_INFO bit 31 denotes support
+	// for CPUID faulting, and we enable and disable via the MISC_FEATURES MSR.
+	if rdmsr(_MSR_PLATFORM_INFO)&_PLATFORM_INFO_CPUID_FAULT != 0 {
+		features := rdmsr(_MSR_MISC_FEATURES)
+		if on {
+			features |= _MISC_FEATURE_CPUID_TRAP
+		} else {
+			features &^= _MISC_FEATURE_CPUID_TRAP
+		}
+		wrmsr(_MSR_MISC_FEATURES, features)
+		return true // Setting successful.
+	}
+	return false
+}
+
+// ReadCR2 reads the current CR2 value.
+//
+//go:nosplit
+func ReadCR2() uintptr {
+	return readCR2()
+}
diff --git a/pkg/sentry/platform/ring0/kernel_unsafe.go b/pkg/sentry/platform/ring0/kernel_unsafe.go
new file mode 100644
index 000000000..16955ad91
--- /dev/null
+++ b/pkg/sentry/platform/ring0/kernel_unsafe.go
@@ -0,0 +1,41 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ring0
+
+import (
+	"unsafe"
+)
+
+// eface mirrors runtime.eface.
+type eface struct {
+	typ  uintptr
+	data unsafe.Pointer
+}
+
+// kernelAddr returns the kernel virtual address for the given object.
+//
+//go:nosplit
+func kernelAddr(obj interface{}) uintptr {
+	e := (*eface)(unsafe.Pointer(&obj))
+	return KernelStartAddress | uintptr(e.data)
+}
+
+// kernelFunc returns the address of the given function.
+//
+//go:nosplit
+func kernelFunc(fn func()) uintptr {
+	fnptr := (**uintptr)(unsafe.Pointer(&fn))
+	return KernelStartAddress | **fnptr
+}
diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go
new file mode 100644
index 000000000..9c5f26962
--- /dev/null
+++ b/pkg/sentry/platform/ring0/lib_amd64.go
@@ -0,0 +1,131 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package ring0
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/cpuid"
+)
+
+// LoadFloatingPoint loads floating point state by the most efficient mechanism
+// available (set by Init).
+var LoadFloatingPoint func(*byte)
+
+// SaveFloatingPoint saves floating point state by the most efficient mechanism
+// available (set by Init).
+var SaveFloatingPoint func(*byte)
+
+// fxrstor uses fxrstor64 to load floating point state.
+func fxrstor(*byte)
+
+// xrstor uses xrstor to load floating point state.
+func xrstor(*byte)
+
+// fxsave uses fxsave64 to save floating point state.
+func fxsave(*byte)
+
+// xsave uses xsave to save floating point state.
+func xsave(*byte)
+
+// xsaveopt uses xsaveopt to save floating point state.
+func xsaveopt(*byte)
+
+// WriteFS sets the FS address (set by Init).
+var WriteFS func(addr uintptr)
+
+// wrfsbase writes to the FS base address.
+func wrfsbase(addr uintptr)
+
+// wrfsmsr writes to the FS_BASE MSR.
+func wrfsmsr(addr uintptr)
+
+// WriteGS sets the GS address (set by Init).
+var WriteGS func(addr uintptr)
+
+// wrgsbase writes to the GS base address.
+func wrgsbase(addr uintptr)
+
+// wrgsmsr writes to the GS_BASE MSR.
+func wrgsmsr(addr uintptr)
+
+// writeCR3 writes the CR3 value.
+func writeCR3(phys uintptr)
+
+// readCR3 reads the current CR3 value.
+func readCR3() uintptr
+
+// readCR2 reads the current CR2 value.
+func readCR2() uintptr
+
+// jumpToKernel jumps to the kernel version of the current RIP.
+func jumpToKernel()
+
+// jumpToUser jumps to the user version of the current RIP.
+func jumpToUser()
+
+// fninit initializes the floating point unit.
+func fninit()
+
+// xsetbv writes to an extended control register.
+func xsetbv(reg, value uintptr)
+
+// xgetbv reads an extended control register.
+func xgetbv(reg uintptr) uintptr
+
+// wrmsr writes to the given MSR.
+func wrmsr(reg, value uintptr)
+
+// rdmsr reads the given MSR.
+func rdmsr(reg uintptr) uintptr
+
+// Mostly-constants set by Init.
+var (
+	hasSMEP       bool
+	hasPCID       bool
+	hasXSAVEOPT   bool
+	hasXSAVE      bool
+	hasFSGSBASE   bool
+	validXCR0Mask uintptr
+)
+
+// Init sets function pointers based on architectural features.
+//
+// This must be called prior to using ring0.
+func Init(featureSet *cpuid.FeatureSet) { + hasSMEP = featureSet.HasFeature(cpuid.X86FeatureSMEP) + hasPCID = featureSet.HasFeature(cpuid.X86FeaturePCID) + hasXSAVEOPT = featureSet.UseXsaveopt() + hasXSAVE = featureSet.UseXsave() + hasFSGSBASE = featureSet.HasFeature(cpuid.X86FeatureFSGSBase) + validXCR0Mask = uintptr(featureSet.ValidXCR0Mask()) + if hasXSAVEOPT { + SaveFloatingPoint = xsaveopt + LoadFloatingPoint = xrstor + } else if hasXSAVE { + SaveFloatingPoint = xsave + LoadFloatingPoint = xrstor + } else { + SaveFloatingPoint = fxsave + LoadFloatingPoint = fxrstor + } + if hasFSGSBASE { + WriteFS = wrfsbase + WriteGS = wrgsbase + } else { + WriteFS = wrfsmsr + WriteGS = wrgsmsr + } +} diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s new file mode 100644 index 000000000..75d742750 --- /dev/null +++ b/pkg/sentry/platform/ring0/lib_amd64.s @@ -0,0 +1,247 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "funcdata.h" +#include "textflag.h" + +// fxrstor loads floating point state. +// +// The code corresponds to: +// +// fxrstor64 (%rbx) +// +TEXT ·fxrstor(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), BX + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x0b; + RET + +// xrstor loads floating point state. +// +// The code corresponds to: +// +// xrstor (%rdi) +// +TEXT ·xrstor(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), DI + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f; + RET + +// fxsave saves floating point state. +// +// The code corresponds to: +// +// fxsave64 (%rbx) +// +TEXT ·fxsave(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), BX + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x03; + RET + +// xsave saves floating point state. +// +// The code corresponds to: +// +// xsave (%rdi) +// +TEXT ·xsave(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), DI + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; + RET + +// xsaveopt saves floating point state. +// +// The code corresponds to: +// +// xsaveopt (%rdi) +// +TEXT ·xsaveopt(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), DI + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; + RET + +// wrfsbase writes to the FS base. +// +// The code corresponds to: +// +// wrfsbase %rax +// +TEXT ·wrfsbase(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd0; + RET + +// wrfsmsr writes to the FSBASE MSR. +// +// The code corresponds to: +// +// wrmsr (writes EDX:EAX to the MSR in ECX) +// +TEXT ·wrfsmsr(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + MOVQ AX, DX + SHRQ $32, DX + MOVQ $0xc0000100, CX // MSR_FS_BASE + BYTE $0x0f; BYTE $0x30; + RET + +// wrgsbase writes to the GS base. 
+// +// The code corresponds to: +// +// wrgsbase %rax +// +TEXT ·wrgsbase(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0xd8; + RET + +// wrgsmsr writes to the GSBASE MSR. +// +// See wrfsmsr. +TEXT ·wrgsmsr(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + MOVQ AX, DX + SHRQ $32, DX + MOVQ $0xc0000101, CX // MSR_GS_BASE + BYTE $0x0f; BYTE $0x30; // WRMSR + RET + +// jumpToUser changes execution to the user address. +// +// This works by changing the return value to the user version. +TEXT ·jumpToUser(SB),NOSPLIT,$0 + MOVQ 0(SP), AX + MOVQ ·KernelStartAddress(SB), BX + NOTQ BX + ANDQ BX, SP // Switch the stack. + ANDQ BX, BP // Switch the frame pointer. + ANDQ BX, AX // Future return value. + MOVQ AX, 0(SP) + RET + +// jumpToKernel changes execution to the kernel address space. +// +// This works by changing the return value to the kernel version. +TEXT ·jumpToKernel(SB),NOSPLIT,$0 + MOVQ 0(SP), AX + MOVQ ·KernelStartAddress(SB), BX + ORQ BX, SP // Switch the stack. + ORQ BX, BP // Switch the frame pointer. + ORQ BX, AX // Future return value. + MOVQ AX, 0(SP) + RET + +// writeCR3 writes the given CR3 value. +// +// The code corresponds to: +// +// mov %rax, %cr3 +// +TEXT ·writeCR3(SB),NOSPLIT,$0-8 + MOVQ cr3+0(FP), AX + BYTE $0x0f; BYTE $0x22; BYTE $0xd8; + RET + +// readCR3 reads the current CR3 value. +// +// The code corresponds to: +// +// mov %cr3, %rax +// +TEXT ·readCR3(SB),NOSPLIT,$0-8 + BYTE $0x0f; BYTE $0x20; BYTE $0xd8; + MOVQ AX, ret+0(FP) + RET + +// readCR2 reads the current CR2 value. +// +// The code corresponds to: +// +// mov %cr2, %rax +// +TEXT ·readCR2(SB),NOSPLIT,$0-8 + BYTE $0x0f; BYTE $0x20; BYTE $0xd0; + MOVQ AX, ret+0(FP) + RET + +// fninit initializes the floating point unit. +// +// The code corresponds to: +// +// fninit +TEXT ·fninit(SB),NOSPLIT,$0 + BYTE $0xdb; BYTE $0xe3; + RET + +// xsetbv writes to an extended control register. +// +// The code corresponds to: +// +// xsetbv +// +TEXT ·xsetbv(SB),NOSPLIT,$0-16 + MOVL reg+0(FP), CX + MOVL value+8(FP), AX + MOVL value+12(FP), DX + BYTE $0x0f; BYTE $0x01; BYTE $0xd1; + RET + +// xgetbv reads an extended control register. +// +// The code corresponds to: +// +// xgetbv +// +TEXT ·xgetbv(SB),NOSPLIT,$0-16 + MOVL reg+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0; + MOVL AX, ret+8(FP) + MOVL DX, ret+12(FP) + RET + +// wrmsr writes to a control register. +// +// The code corresponds to: +// +// wrmsr +// +TEXT ·wrmsr(SB),NOSPLIT,$0-16 + MOVL reg+0(FP), CX + MOVL value+8(FP), AX + MOVL value+12(FP), DX + BYTE $0x0f; BYTE $0x30; + RET + +// rdmsr reads a control register. +// +// The code corresponds to: +// +// rdmsr +// +TEXT ·rdmsr(SB),NOSPLIT,$0-16 + MOVL reg+0(FP), CX + BYTE $0x0f; BYTE $0x32; + MOVL AX, ret+8(FP) + MOVL DX, ret+12(FP) + RET diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go new file mode 100644 index 000000000..23fd5c352 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/allocator.go @@ -0,0 +1,122 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+// Allocator is used to allocate and map PTEs.
+//
+// Note that allocators may be called concurrently.
+type Allocator interface {
+	// NewPTEs returns a new set of PTEs and their physical address.
+	NewPTEs() *PTEs
+
+	// PhysicalFor gives the physical address for a set of PTEs.
+	PhysicalFor(ptes *PTEs) uintptr
+
+	// LookupPTEs looks up PTEs by physical address.
+	LookupPTEs(physical uintptr) *PTEs
+
+	// FreePTEs marks a set of PTEs as freed; they may not be available
+	// for use again until Recycle is called (see below).
+	FreePTEs(ptes *PTEs)
+
+	// Recycle makes freed PTEs available for use again.
+	Recycle()
+}
+
+// RuntimeAllocator is a trivial allocator.
+type RuntimeAllocator struct {
+	// used is the set of PTEs that have been allocated. This includes any
+	// PTEs that may be in the pool below. PTEs are only freed from this
+	// map by the Drain call.
+	//
+	// This exists to prevent accidental garbage collection.
+	used map[*PTEs]struct{}
+
+	// pool is the set of free-to-use PTEs.
+	pool []*PTEs
+
+	// freed is the set of recently-freed PTEs.
+	freed []*PTEs
+}
+
+// NewRuntimeAllocator returns an allocator that uses runtime allocation.
+func NewRuntimeAllocator() *RuntimeAllocator {
+	return &RuntimeAllocator{
+		used: make(map[*PTEs]struct{}),
+	}
+}
+
+// Recycle returns freed pages to the pool.
+func (r *RuntimeAllocator) Recycle() {
+	r.pool = append(r.pool, r.freed...)
+	r.freed = r.freed[:0]
+}
+
+// Drain empties the pool.
+func (r *RuntimeAllocator) Drain() {
+	r.Recycle()
+	for i, ptes := range r.pool {
+		// Zap the entry in the underlying array to ensure that it can
+		// be properly garbage collected.
+		r.pool[i] = nil
+		// Similarly, drop the reference held by the used map (this
+		// also applies to the pool entries).
+		delete(r.used, ptes)
+	}
+	r.pool = r.pool[:0]
+}
+
+// NewPTEs implements Allocator.NewPTEs.
+//
+// Note that the "physical" address here is actually the virtual address of the
+// PTEs structure. The entries are tracked only to avoid garbage collection.
+//
+// This is guaranteed not to split as long as the pool is sufficiently full.
+//
+//go:nosplit
+func (r *RuntimeAllocator) NewPTEs() *PTEs {
+	// Pull from the pool if we can.
+	if len(r.pool) > 0 {
+		ptes := r.pool[len(r.pool)-1]
+		r.pool = r.pool[:len(r.pool)-1]
+		return ptes
+	}
+
+	// Allocate a new entry.
+	ptes := newAlignedPTEs()
+	r.used[ptes] = struct{}{}
+	return ptes
+}
+
+// PhysicalFor returns the physical address for the given PTEs.
+//
+//go:nosplit
+func (r *RuntimeAllocator) PhysicalFor(ptes *PTEs) uintptr {
+	return physicalFor(ptes)
+}
+
+// LookupPTEs implements Allocator.LookupPTEs.
+//
+//go:nosplit
+func (r *RuntimeAllocator) LookupPTEs(physical uintptr) *PTEs {
+	return fromPhysical(physical)
+}
+
+// FreePTEs implements Allocator.FreePTEs.
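+//
+// Freed PTEs are queued on the freed list; they become allocatable again
+// only after the next Recycle, and are fully released by Drain.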
+// +//go:nosplit +func (r *RuntimeAllocator) FreePTEs(ptes *PTEs) { + r.freed = append(r.freed, ptes) +} diff --git a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go new file mode 100644 index 000000000..1b996b4e2 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go @@ -0,0 +1,53 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pagetables + +import ( + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// newAlignedPTEs returns a set of aligned PTEs. +func newAlignedPTEs() *PTEs { + ptes := new(PTEs) + offset := physicalFor(ptes) & (usermem.PageSize - 1) + if offset == 0 { + // Already aligned. + return ptes + } + + // Need to force an aligned allocation. + unaligned := make([]byte, (2*usermem.PageSize)-1) + offset = uintptr(unsafe.Pointer(&unaligned[0])) & (usermem.PageSize - 1) + if offset != 0 { + offset = usermem.PageSize - offset + } + return (*PTEs)(unsafe.Pointer(&unaligned[offset])) +} + +// physicalFor returns the "physical" address for PTEs. +// +//go:nosplit +func physicalFor(ptes *PTEs) uintptr { + return uintptr(unsafe.Pointer(ptes)) +} + +// fromPhysical returns the PTEs from the "physical" address. +// +//go:nosplit +func fromPhysical(physical uintptr) *PTEs { + return (*PTEs)(unsafe.Pointer(physical)) +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go new file mode 100644 index 000000000..e5dcaada7 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -0,0 +1,221 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pagetables provides a generic implementation of pagetables. +// +// The core functions must be safe to call from a nosplit context. Furthermore, +// this pagetables implementation goes to lengths to ensure that all functions +// are free from runtime allocation. Calls to NewPTEs/FreePTEs may be made +// during walks, but these can be cached elsewhere if required. +package pagetables + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// PageTables is a set of page tables. +type PageTables struct { + // Allocator is used to allocate nodes. + Allocator Allocator + + // root is the pagetable root. + root *PTEs + + // rootPhysical is the cached physical address of the root. 
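+	// (This is the value returned by Allocator.PhysicalFor for the root.)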
+	//
+	// This is saved only to prevent constant translation.
+	rootPhysical uintptr
+
+	// archPageTables includes architecture-specific features.
+	archPageTables
+}
+
+// New returns new PageTables.
+func New(a Allocator) *PageTables {
+	p := new(PageTables)
+	p.Init(a)
+	return p
+}
+
+// Init initializes a set of PageTables.
+//
+//go:nosplit
+func (p *PageTables) Init(allocator Allocator) {
+	p.Allocator = allocator
+	p.root = p.Allocator.NewPTEs()
+	p.rootPhysical = p.Allocator.PhysicalFor(p.root)
+}
+
+// mapVisitor is used for map.
+type mapVisitor struct {
+	target   uintptr // Input.
+	physical uintptr // Input.
+	opts     MapOpts // Input.
+	prev     bool    // Output.
+}
+
+// visit is used for map.
+//
+//go:nosplit
+func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+	p := v.physical + (start - uintptr(v.target))
+	if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) {
+		v.prev = true
+	}
+	if p&align != 0 {
+		// We will install entries at a smaller granularity if we don't
+		// install a valid entry here; however, we must zap any existing
+		// entry to ensure this happens.
+		pte.Clear()
+		return
+	}
+	pte.Set(p, v.opts)
+}
+
+//go:nosplit
+func (*mapVisitor) requiresAlloc() bool { return true }
+
+//go:nosplit
+func (*mapVisitor) requiresSplit() bool { return true }
+
+// Map installs a mapping with the given physical address.
+//
+// True is returned iff there was a previous mapping in the range.
+//
+// Precondition: addr and length must be page-aligned, and their sum must not
+// overflow.
+//
+//go:nosplit
+func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool {
+	if !opts.AccessType.Any() {
+		return p.Unmap(addr, length)
+	}
+	w := mapWalker{
+		pageTables: p,
+		visitor: mapVisitor{
+			target:   uintptr(addr),
+			physical: physical,
+			opts:     opts,
+		},
+	}
+	w.iterateRange(uintptr(addr), uintptr(addr)+length)
+	return w.visitor.prev
+}
+
+// unmapVisitor is used for unmap.
+type unmapVisitor struct {
+	count int
+}
+
+//go:nosplit
+func (*unmapVisitor) requiresAlloc() bool { return false }
+
+//go:nosplit
+func (*unmapVisitor) requiresSplit() bool { return true }
+
+// visit unmaps the given entry.
+//
+//go:nosplit
+func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+	pte.Clear()
+	v.count++
+}
+
+// Unmap unmaps the given range.
+//
+// True is returned iff there was a previous mapping in the range.
+//
+// Precondition: addr and length must be page-aligned.
+//
+//go:nosplit
+func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
+	w := unmapWalker{
+		pageTables: p,
+		visitor: unmapVisitor{
+			count: 0,
+		},
+	}
+	w.iterateRange(uintptr(addr), uintptr(addr)+length)
+	return w.visitor.count > 0
+}
+
+// emptyVisitor is used for emptiness checks.
+type emptyVisitor struct {
+	count int
+}
+
+//go:nosplit
+func (*emptyVisitor) requiresAlloc() bool { return false }
+
+//go:nosplit
+func (*emptyVisitor) requiresSplit() bool { return false }
+
+// visit counts the given entry.
+//
+//go:nosplit
+func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+	v.count++
+}
+
+// IsEmpty checks if the given range is empty.
+//
+// Precondition: addr and length must be page-aligned.
+//
+//go:nosplit
+func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool {
+	w := emptyWalker{
+		pageTables: p,
+	}
+	w.iterateRange(uintptr(addr), uintptr(addr)+length)
+	return w.visitor.count == 0
+}
+
+// lookupVisitor is used for lookup.
+type lookupVisitor struct {
+	target   uintptr // Input.
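+
+	// The following fields are set by visit when a valid entry is found.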
+	physical uintptr // Output.
+	opts     MapOpts // Output.
+}
+
+// visit records the physical address and options for a matched entry.
+//
+//go:nosplit
+func (v *lookupVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+	if !pte.Valid() {
+		return
+	}
+	v.physical = pte.Address() + (start - uintptr(v.target))
+	v.opts = pte.Opts()
+}
+
+//go:nosplit
+func (*lookupVisitor) requiresAlloc() bool { return false }
+
+//go:nosplit
+func (*lookupVisitor) requiresSplit() bool { return false }
+
+// Lookup returns the physical address and map options for the given virtual
+// address.
+//
+//go:nosplit
+func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) {
+	mask := uintptr(usermem.PageSize - 1)
+	offset := uintptr(addr) & mask
+	w := lookupWalker{
+		pageTables: p,
+		visitor: lookupVisitor{
+			target: uintptr(addr &^ usermem.Addr(mask)),
+		},
+	}
+	w.iterateRange(uintptr(addr), uintptr(addr)+1)
+	return w.visitor.physical + offset, w.visitor.opts
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
new file mode 100644
index 000000000..7aa6c524e
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
@@ -0,0 +1,45 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+// Address constraints.
+//
+// The lowerTop and upperBottom currently apply to four-level pagetables;
+// additional refactoring would be necessary to support five-level pagetables.
+const (
+	lowerTop    = 0x00007fffffffffff
+	upperBottom = 0xffff800000000000
+
+	pteShift = 12
+	pmdShift = 21
+	pudShift = 30
+	pgdShift = 39
+
+	pteMask = 0x1ff << pteShift
+	pmdMask = 0x1ff << pmdShift
+	pudMask = 0x1ff << pudShift
+	pgdMask = 0x1ff << pgdShift
+
+	pteSize = 1 << pteShift
+	pmdSize = 1 << pmdShift
+	pudSize = 1 << pudShift
+	pgdSize = 1 << pgdShift
+
+	executeDisable = 1 << 63
+	entriesPerPage = 512
+)
+
+// PTEs is a collection of entries.
+type PTEs [entriesPerPage]PTE
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go b/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go
new file mode 100755
index 000000000..ac1ccf3d3
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package pagetables
+
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
new file mode 100644
index 000000000..ff427fbe9
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
@@ -0,0 +1,180 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build i386 amd64 + +package pagetables + +import ( + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// archPageTables is architecture-specific data. +type archPageTables struct { + // pcid is the value assigned by PCIDs.Assign. + // + // Note that zero is a valid PCID. + pcid uint16 +} + +// CR3 returns the CR3 value for these tables. +// +// This may be called in interrupt contexts. A PCID of zero always implies a +// flush and should be passed when PCIDs are not enabled. See pcids_x86.go for +// more information. +// +//go:nosplit +func (p *PageTables) CR3(noFlush bool, pcid uint16) uint64 { + // Bit 63 is set to avoid flushing the PCID (per SDM 4.10.4.1). + const noFlushBit uint64 = 0x8000000000000000 + if noFlush && pcid != 0 { + return noFlushBit | uint64(p.rootPhysical) | uint64(pcid) + } + return uint64(p.rootPhysical) | uint64(pcid) +} + +// Bits in page table entries. +const ( + present = 0x001 + writable = 0x002 + user = 0x004 + writeThrough = 0x008 + cacheDisable = 0x010 + accessed = 0x020 + dirty = 0x040 + super = 0x080 + global = 0x100 + optionMask = executeDisable | 0xfff +) + +// MapOpts are x86 options. +type MapOpts struct { + // AccessType defines permissions. + AccessType usermem.AccessType + + // Global indicates the page is globally accessible. + Global bool + + // User indicates the page is a user page. + User bool +} + +// PTE is a page table entry. +type PTE uintptr + +// Clear clears this PTE, including super page information. +// +//go:nosplit +func (p *PTE) Clear() { + atomic.StoreUintptr((*uintptr)(p), 0) +} + +// Valid returns true iff this entry is valid. +// +//go:nosplit +func (p *PTE) Valid() bool { + return atomic.LoadUintptr((*uintptr)(p))&present != 0 +} + +// Opts returns the PTE options. +// +// These are all options except Valid and Super. +// +//go:nosplit +func (p *PTE) Opts() MapOpts { + v := atomic.LoadUintptr((*uintptr)(p)) + return MapOpts{ + AccessType: usermem.AccessType{ + Read: v&present != 0, + Write: v&writable != 0, + Execute: v&executeDisable == 0, + }, + Global: v&global != 0, + User: v&user != 0, + } +} + +// SetSuper sets this page as a super page. +// +// The page must not be valid or a panic will result. +// +//go:nosplit +func (p *PTE) SetSuper() { + if p.Valid() { + // This is not allowed. + panic("SetSuper called on valid page!") + } + atomic.StoreUintptr((*uintptr)(p), super) +} + +// IsSuper returns true iff this page is a super page. +// +//go:nosplit +func (p *PTE) IsSuper() bool { + return atomic.LoadUintptr((*uintptr)(p))&super != 0 +} + +// Set sets this PTE value. +// +// This does not change the super page property. +// +//go:nosplit +func (p *PTE) Set(addr uintptr, opts MapOpts) { + if !opts.AccessType.Any() { + p.Clear() + return + } + v := (addr &^ optionMask) | present | accessed + if opts.User { + v |= user + } + if opts.Global { + v |= global + } + if !opts.AccessType.Execute { + v |= executeDisable + } + if opts.AccessType.Write { + v |= writable | dirty + } + if p.IsSuper() { + // Note that this is inherited from the previous instance. 
Set
+		// does not change the value of Super. See above.
+		v |= super
+	}
+	atomic.StoreUintptr((*uintptr)(p), v)
+}
+
+// setPageTable sets this PTE value to point to the provided set of entries,
+// forcing the super bit to be cleared. This is used explicitly for breaking
+// super pages.
+//
+//go:nosplit
+func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) {
+	addr := pt.Allocator.PhysicalFor(ptes)
+	if addr&^optionMask != addr {
+		// This should never happen.
+		panic("unaligned physical address!")
+	}
+	v := addr | present | user | writable | accessed | dirty
+	atomic.StoreUintptr((*uintptr)(p), v)
+}
+
+// Address extracts the address. This should only be used if Valid returns
+// true.
+//
+//go:nosplit
+func (p *PTE) Address() uintptr {
+	return atomic.LoadUintptr((*uintptr)(p)) &^ optionMask
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
new file mode 100644
index 000000000..0f029f25d
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
@@ -0,0 +1,109 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+import (
+	"sync"
+)
+
+// limitPCID is the number of valid PCIDs.
+const limitPCID = 4096
+
+// PCIDs is a simple PCID database.
+//
+// Access is protected by an internal mutex; each instance is intended to
+// manage the PCID space of a single CPU at a time.
+type PCIDs struct {
+	// mu protects below.
+	mu sync.Mutex
+
+	// cache maps assigned page tables to their PCIDs.
+	cache map[*PageTables]uint16
+
+	// avail are available PCIDs.
+	avail []uint16
+}
+
+// NewPCIDs returns a new PCID database.
+//
+// start is the first index to assign. Typically this will be one, as the zero
+// pcid will always be flushed on transition (see pagetables_x86.go). This may
+// be more than one if specific PCIDs are reserved.
+//
+// Nil is returned iff start+size is out of range (>= limitPCID).
+func NewPCIDs(start, size uint16) *PCIDs {
+	if start+size >= limitPCID {
+		return nil // See comment.
+	}
+	p := &PCIDs{
+		cache: make(map[*PageTables]uint16),
+	}
+	for pcid := start; pcid < start+size; pcid++ {
+		p.avail = append(p.avail, pcid)
+	}
+	return p
+}
+
+// Assign assigns a PCID to the given PageTables.
+//
+// This may overwrite a previous assignment. If this is the case, true is
+// returned to indicate that the PCID should be flushed.
+func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) {
+	p.mu.Lock()
+	if pcid, ok := p.cache[pt]; ok {
+		p.mu.Unlock()
+		return pcid, false // No flush.
+	}
+
+	// Is there something available?
+	if len(p.avail) > 0 {
+		pcid := p.avail[len(p.avail)-1]
+		p.avail = p.avail[:len(p.avail)-1]
+		p.cache[pt] = pcid
+
+		// We need to flush because while this is in the available
+		// pool, it may have been used previously.
+		p.mu.Unlock()
+		return pcid, true
+	}
+
+	// Evict an existing table.
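+	// (The victim is arbitrary, since Go map iteration order is not
+	// specified.)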
+ for old, pcid := range p.cache { + delete(p.cache, old) + p.cache[pt] = pcid + + // A flush is definitely required in this case, these page + // tables may still be active. (They will just be assigned some + // other PCID if and when they hit the given CPU again.) + p.mu.Unlock() + return pcid, true + } + + // No PCID. + p.mu.Unlock() + return 0, false +} + +// Drop drops references to a set of page tables. +func (p *PCIDs) Drop(pt *PageTables) { + p.mu.Lock() + if pcid, ok := p.cache[pt]; ok { + delete(p.cache, pt) + p.avail = append(p.avail, pcid) + } + p.mu.Unlock() +} diff --git a/pkg/sentry/platform/ring0/pagetables/walker_empty.go b/pkg/sentry/platform/ring0/pagetables/walker_empty.go new file mode 100755 index 000000000..417784e17 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_empty.go @@ -0,0 +1,255 @@ +package pagetables + +// Walker walks page tables. +type emptyWalker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor emptyVisitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is super pages. If a valid super page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *emptyWalker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func emptynext(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. 
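+//
+// The walk descends the four levels (PGD, PUD, PMD, PTE), visiting entries
+// at the largest applicable granularity; intermediate tables that become
+// entirely empty are cleared and freed back to the allocator.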
+// +//go:nosplit +func (w *emptyWalker) iterateRangeCanonical(start, end uintptr) { + for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &w.pageTables.root[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + start = emptynext(start, pgdSize) + continue + } + + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPUDEntries++ + start = emptynext(start, pudSize) + continue + } + + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = emptynext(start, pudSize) + continue + } + } + + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < emptynext(start, pudSize)) { + + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSuper() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + if !pudEntry.Valid() { + clearPUDEntries++ + } + + start = emptynext(start, pudSize) + continue + } + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPMDEntries++ + start = emptynext(start, pmdSize) + continue + } + + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = emptynext(start, pmdSize) + continue + } + } + + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < emptynext(start, pmdSize)) { + + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + start = emptynext(start, pmdSize) + continue + } + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = &pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + w.visitor.visit(uintptr(start), pteEntry, 
pteSize-1) + if !pteEntry.Valid() { + if w.visitor.requiresAlloc() { + panic("PTE not set after iteration with requiresAlloc!") + } + clearPTEEntries++ + } + + start += pteSize + continue + } + + if clearPTEEntries == entriesPerPage { + pmdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pteEntries) + clearPMDEntries++ + } + } + + if clearPMDEntries == entriesPerPage { + pudEntry.Clear() + w.pageTables.Allocator.FreePTEs(pmdEntries) + clearPUDEntries++ + } + } + + if clearPUDEntries == entriesPerPage { + pgdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pudEntries) + } + } +} diff --git a/pkg/sentry/platform/ring0/pagetables/walker_lookup.go b/pkg/sentry/platform/ring0/pagetables/walker_lookup.go new file mode 100755 index 000000000..906c9c50f --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_lookup.go @@ -0,0 +1,255 @@ +package pagetables + +// Walker walks page tables. +type lookupWalker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor lookupVisitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is super pages. If a valid super page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *lookupWalker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func lookupnext(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. 
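+//
+// Since the lookup visitor requires neither allocation nor splitting, this
+// walk only reads existing entries and skips over invalid ranges.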
+// +//go:nosplit +func (w *lookupWalker) iterateRangeCanonical(start, end uintptr) { + for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &w.pageTables.root[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + start = lookupnext(start, pgdSize) + continue + } + + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPUDEntries++ + start = lookupnext(start, pudSize) + continue + } + + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = lookupnext(start, pudSize) + continue + } + } + + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < lookupnext(start, pudSize)) { + + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSuper() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + if !pudEntry.Valid() { + clearPUDEntries++ + } + + start = lookupnext(start, pudSize) + continue + } + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPMDEntries++ + start = lookupnext(start, pmdSize) + continue + } + + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = lookupnext(start, pmdSize) + continue + } + } + + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < lookupnext(start, pmdSize)) { + + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + start = lookupnext(start, pmdSize) + continue + } + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = &pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + w.visitor.visit(uintptr(start), 
pteEntry, pteSize-1) + if !pteEntry.Valid() { + if w.visitor.requiresAlloc() { + panic("PTE not set after iteration with requiresAlloc!") + } + clearPTEEntries++ + } + + start += pteSize + continue + } + + if clearPTEEntries == entriesPerPage { + pmdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pteEntries) + clearPMDEntries++ + } + } + + if clearPMDEntries == entriesPerPage { + pudEntry.Clear() + w.pageTables.Allocator.FreePTEs(pmdEntries) + clearPUDEntries++ + } + } + + if clearPUDEntries == entriesPerPage { + pgdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pudEntries) + } + } +} diff --git a/pkg/sentry/platform/ring0/pagetables/walker_map.go b/pkg/sentry/platform/ring0/pagetables/walker_map.go new file mode 100755 index 000000000..61ee3c825 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_map.go @@ -0,0 +1,255 @@ +package pagetables + +// Walker walks page tables. +type mapWalker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor mapVisitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is super pages. If a valid super page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *mapWalker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func mapnext(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. 
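+//
+// Since the map visitor requires allocation, missing intermediate tables are
+// allocated during the walk, and super pages are installed wherever the
+// alignment and remaining length permit.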
+// +//go:nosplit +func (w *mapWalker) iterateRangeCanonical(start, end uintptr) { + for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &w.pageTables.root[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + start = mapnext(start, pgdSize) + continue + } + + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPUDEntries++ + start = mapnext(start, pudSize) + continue + } + + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = mapnext(start, pudSize) + continue + } + } + + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < mapnext(start, pudSize)) { + + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSuper() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + if !pudEntry.Valid() { + clearPUDEntries++ + } + + start = mapnext(start, pudSize) + continue + } + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPMDEntries++ + start = mapnext(start, pmdSize) + continue + } + + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = mapnext(start, pmdSize) + continue + } + } + + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < mapnext(start, pmdSize)) { + + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + start = mapnext(start, pmdSize) + continue + } + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = &pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + w.visitor.visit(uintptr(start), pteEntry, pteSize-1) + if 
!pteEntry.Valid() { + if w.visitor.requiresAlloc() { + panic("PTE not set after iteration with requiresAlloc!") + } + clearPTEEntries++ + } + + start += pteSize + continue + } + + if clearPTEEntries == entriesPerPage { + pmdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pteEntries) + clearPMDEntries++ + } + } + + if clearPMDEntries == entriesPerPage { + pudEntry.Clear() + w.pageTables.Allocator.FreePTEs(pmdEntries) + clearPUDEntries++ + } + } + + if clearPUDEntries == entriesPerPage { + pgdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pudEntries) + } + } +} diff --git a/pkg/sentry/platform/ring0/pagetables/walker_unmap.go b/pkg/sentry/platform/ring0/pagetables/walker_unmap.go new file mode 100755 index 000000000..be2aa0ce4 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_unmap.go @@ -0,0 +1,255 @@ +package pagetables + +// Walker walks page tables. +type unmapWalker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor unmapVisitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is super pages. If a valid super page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *unmapWalker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func unmapnext(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. 
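+//
+// For unmap, super pages that only partially overlap the range are first
+// split into smaller entries; entries in range are cleared, and tables that
+// become empty are freed back to the allocator.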
+// +//go:nosplit +func (w *unmapWalker) iterateRangeCanonical(start, end uintptr) { + for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &w.pageTables.root[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + start = unmapnext(start, pgdSize) + continue + } + + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPUDEntries++ + start = unmapnext(start, pudSize) + continue + } + + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = unmapnext(start, pudSize) + continue + } + } + + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < unmapnext(start, pudSize)) { + + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSuper() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + if !pudEntry.Valid() { + clearPUDEntries++ + } + + start = unmapnext(start, pudSize) + continue + } + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPMDEntries++ + start = unmapnext(start, pmdSize) + continue + } + + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = unmapnext(start, pmdSize) + continue + } + } + + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < unmapnext(start, pmdSize)) { + + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + start = unmapnext(start, pmdSize) + continue + } + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = &pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + w.visitor.visit(uintptr(start), pteEntry, 
pteSize-1)
+				if !pteEntry.Valid() {
+					if w.visitor.requiresAlloc() {
+						panic("PTE not set after iteration with requiresAlloc!")
+					}
+					clearPTEEntries++
+				}
+
+				start += pteSize
+				continue
+			}
+
+			if clearPTEEntries == entriesPerPage {
+				pmdEntry.Clear()
+				w.pageTables.Allocator.FreePTEs(pteEntries)
+				clearPMDEntries++
+			}
+		}
+
+		if clearPMDEntries == entriesPerPage {
+			pudEntry.Clear()
+			w.pageTables.Allocator.FreePTEs(pmdEntries)
+			clearPUDEntries++
+		}
+	}
+
+	if clearPUDEntries == entriesPerPage {
+		pgdEntry.Clear()
+		w.pageTables.Allocator.FreePTEs(pudEntries)
+	}
+}
diff --git a/pkg/sentry/platform/ring0/ring0.go b/pkg/sentry/platform/ring0/ring0.go
new file mode 100644
index 000000000..cdeb1b43a
--- /dev/null
+++ b/pkg/sentry/platform/ring0/ring0.go
@@ -0,0 +1,16 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ring0 provides basic operating system-level stubs.
+package ring0
diff --git a/pkg/sentry/platform/ring0/ring0_state_autogen.go b/pkg/sentry/platform/ring0/ring0_state_autogen.go
new file mode 100755
index 000000000..462f9a446
--- /dev/null
+++ b/pkg/sentry/platform/ring0/ring0_state_autogen.go
@@ -0,0 +1,4 @@
+// automatically generated by stateify.
+
+package ring0
+
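A minimal usage sketch of the pagetables API added by this change, on amd64. This is hypothetical and not part of the diff: the virtual address and the "physical" value are illustrative only, and the sketch simply wires a RuntimeAllocator into a PageTables instance, maps a page, looks it up, and unmaps it.

package main

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

func main() {
	// A RuntimeAllocator backs the tables with ordinary Go allocations.
	a := pagetables.NewRuntimeAllocator()
	pt := pagetables.New(a)

	// Map one read/write user page at a page-aligned address. The
	// physical value is a placeholder for illustration.
	opts := pagetables.MapOpts{
		AccessType: usermem.AccessType{Read: true, Write: true},
		User:       true,
	}
	prev := pt.Map(usermem.Addr(0x400000), usermem.PageSize, opts, 0x123400000)
	fmt.Println("previous mapping existed:", prev) // false on fresh tables

	// Lookup returns the physical address and options for the mapping.
	phys, _ := pt.Lookup(usermem.Addr(0x400000))
	fmt.Printf("physical: %#x\n", phys)

	// Unmap the range, then make any freed intermediate tables reusable.
	pt.Unmap(usermem.Addr(0x400000), usermem.PageSize)
	a.Recycle()
}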