Diffstat (limited to 'pkg/sentry/platform/ring0')
34 files changed, 2690 insertions, 165 deletions
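The BUILD changes that follow split the ring0 defs template per architecture and add a genrule that synthesizes entry_impl_arm64.s at build time. As a rough sketch (not part of this change, offset values illustrative only), the generated file has this shape:

package main

import (
	"fmt"
	"strings"
)

// generatedEntryImpl mimics the genrule's shell pipeline: emit a build tag,
// then the #define lines printed by gen_offsets, then entry_arm64.s verbatim.
func generatedEntryImpl(offsets, entryAsm string) string {
	var b strings.Builder
	b.WriteString("// +build arm64\n\n") // The tag line echoed by the genrule cmd.
	b.WriteString(offsets)               // e.g. "#define CPU_SELF 0x00\n" ...
	b.WriteString(entryAsm)              // The raw assembly, concatenated last.
	return b.String()
}

func main() {
	fmt.Print(generatedEntryImpl("#define CPU_SELF 0x00\n", "TEXT ·Start(SB),NOSPLIT,$0\n"))
}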
diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD index 48b0ceaec..679b287c3 100644 --- a/pkg/sentry/platform/ring0/BUILD +++ b/pkg/sentry/platform/ring0/BUILD @@ -1,10 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") package(licenses = ["notice"]) go_template( - name = "defs", + name = "defs_amd64", srcs = [ "defs.go", "defs_amd64.go", @@ -14,11 +14,29 @@ go_template( visibility = [":__subpackages__"], ) +go_template( + name = "defs_arm64", + srcs = [ + "aarch64.go", + "defs.go", + "defs_arm64.go", + "offsets_arm64.go", + ], + visibility = [":__subpackages__"], +) + go_template_instance( - name = "defs_impl", - out = "defs_impl.go", + name = "defs_impl_amd64", + out = "defs_impl_amd64.go", package = "ring0", - template = ":defs", + template = ":defs_amd64", +) + +go_template_instance( + name = "defs_impl_arm64", + out = "defs_impl_arm64.go", + package = "ring0", + template = ":defs_arm64", ) genrule( @@ -29,24 +47,40 @@ genrule( tools = ["//pkg/sentry/platform/ring0/gen_offsets"], ) +genrule( + name = "entry_impl_arm64", + srcs = ["entry_arm64.s"], + outs = ["entry_impl_arm64.s"], + cmd = "(echo -e '// +build arm64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@", + tools = ["//pkg/sentry/platform/ring0/gen_offsets"], +) + go_library( name = "ring0", srcs = [ - "defs_impl.go", + "defs_impl_amd64.go", + "defs_impl_arm64.go", "entry_amd64.go", + "entry_arm64.go", "entry_impl_amd64.s", + "entry_impl_arm64.s", "kernel.go", "kernel_amd64.go", + "kernel_arm64.go", "kernel_unsafe.go", "lib_amd64.go", "lib_amd64.s", + "lib_arm64.go", + "lib_arm64.s", + "lib_arm64_unsafe.go", "ring0.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/cpuid", + "//pkg/safecopy", + "//pkg/sentry/arch", "//pkg/sentry/platform/ring0/pagetables", - "//pkg/sentry/usermem", + "//pkg/usermem", ], )
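The aarch64.go diff below defines the PSTATE flag sets applied on every world switch. A minimal sketch of how the masks compose, using the constants from the diff (the pstate update mirrors what SwitchToUser does to the user registers):

package main

import "fmt"

const (
	_PSR_MODE_EL0t = 0x00000000
	_PSR_MODE_MASK = 0x0000000f
	_PSR_DAIF_MASK = 0xf << 6

	PsrFlagsClear = _PSR_MODE_MASK | _PSR_DAIF_MASK // 0x3cf
	UserFlagsSet  = _PSR_MODE_EL0t
)

func main() {
	pstate := uint64(0x3c5)          // KernelFlagsSet: EL1h with DAIF masked.
	pstate &^= uint64(PsrFlagsClear) // Clear the mode and DAIF bits...
	pstate |= UserFlagsSet           // ...then select EL0t for userspace.
	fmt.Printf("user pstate: %#x\n", pstate) // 0x0
}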
diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go new file mode 100644 index 000000000..87a573cc4 --- /dev/null +++ b/pkg/sentry/platform/ring0/aarch64.go @@ -0,0 +1,111 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package ring0 + +// Useful bits. +const ( + _PGD_PGT_BASE = 0x1000 + _PGD_PGT_SIZE = 0x1000 + _PUD_PGT_BASE = 0x2000 + _PUD_PGT_SIZE = 0x1000 + _PMD_PGT_BASE = 0x3000 + _PMD_PGT_SIZE = 0x4000 + _PTE_PGT_BASE = 0x7000 + _PTE_PGT_SIZE = 0x1000 +) + +const ( + // DAIF bits: debug, SError, IRQ, FIQ. + _PSR_D_BIT = 0x00000200 + _PSR_A_BIT = 0x00000100 + _PSR_I_BIT = 0x00000080 + _PSR_F_BIT = 0x00000040 + _PSR_DAIF_SHIFT = 6 + _PSR_DAIF_MASK = 0xf << _PSR_DAIF_SHIFT + + // PSR bits. + _PSR_MODE_EL0t = 0x00000000 + _PSR_MODE_EL1t = 0x00000004 + _PSR_MODE_EL1h = 0x00000005 + _PSR_MODE_MASK = 0x0000000f + + PsrFlagsClear = _PSR_MODE_MASK | _PSR_DAIF_MASK + PsrModeMask = _PSR_MODE_MASK + + // KernelFlagsSet should always be set in the kernel. + KernelFlagsSet = _PSR_MODE_EL1h | _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT + + // UserFlagsSet are always set in userspace. + UserFlagsSet = _PSR_MODE_EL0t +) + +// Vector is an exception vector. +type Vector uintptr + +// Exception vectors. +const ( + El1SyncInvalid = iota + El1IrqInvalid + El1FiqInvalid + El1ErrorInvalid + El1Sync + El1Irq + El1Fiq + El1Error + El0Sync + El0Irq + El0Fiq + El0Error + El0Sync_invalid + El0Irq_invalid + El0Fiq_invalid + El0Error_invalid + El1Sync_da + El1Sync_ia + El1Sync_sp_pc + El1Sync_undef + El1Sync_dbg + El1Sync_inv + El0Sync_svc + El0Sync_da + El0Sync_ia + El0Sync_fpsimd_acc + El0Sync_sve_acc + El0Sync_sys + El0Sync_sp_pc + El0Sync_undef + El0Sync_dbg + El0Sync_inv + _NR_INTERRUPTS +) + +// System call vectors. +const ( + Syscall Vector = El0Sync_svc + PageFault Vector = El0Sync_da + VirtualizationException Vector = El0Error +) + +// VirtualAddressBits returns the number of bits available for virtual addresses. +func VirtualAddressBits() uint32 { + return 48 +} + +// PhysicalAddressBits returns the number of bits available for physical addresses. +func PhysicalAddressBits() uint32 { + return 40 +}
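With the 48-bit virtual address space above, the arm64 layout constants defined in defs_arm64.go further below work out as follows. A quick check, assuming the usual 4K page size and a 64-bit build:

package main

import "fmt"

func main() {
	const pageSize = 4096                                      // usermem.PageSize.
	userspaceSize := uintptr(1) << 48                          // The TTBR0 (user) half.
	maxUserAddr := (userspaceSize - 1) &^ uintptr(pageSize-1)  // 0x0000fffffffff000.
	kernelStart := ^uintptr(0) - (userspaceSize - 1)           // 0xffff000000000000, the TTBR1 half.
	fmt.Printf("%#x %#x %#x\n", userspaceSize, maxUserAddr, kernelStart)
}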
diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go index 076063f85..e6daf24df 100644 --- a/pkg/sentry/platform/ring0/defs.go +++ b/pkg/sentry/platform/ring0/defs.go @@ -15,20 +15,8 @@ package ring0 import ( - "syscall" - - "gvisor.dev/gvisor/pkg/sentry/usermem" -) - -var ( - // UserspaceSize is the total size of userspace. - UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1) - - // MaximumUserAddress is the largest possible user address. - MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1) - - // KernelStartAddress is the starting kernel address. - KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" ) // Kernel is a global kernel object. @@ -83,7 +71,7 @@ type CPU struct { // registers is a set of registers; these may be used on kernel system // calls and exceptions via the Registers function. - registers syscall.PtraceRegs + registers arch.Registers // hooks are kernel hooks. hooks Hooks @@ -94,14 +82,14 @@ type CPU struct { // This is explicitly safe to call during KernelException and KernelSyscall. // //go:nosplit -func (c *CPU) Registers() *syscall.PtraceRegs { +func (c *CPU) Registers() *arch.Registers { return &c.registers } // SwitchOpts are passed to the Switch function. type SwitchOpts struct { // Registers are the user register state. - Registers *syscall.PtraceRegs + Registers *arch.Registers // FloatingPointState is a byte pointer where floating point state is // saved and restored. diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 7206322b1..9c6c2cf5c 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -18,6 +18,18 @@ package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.dev/gvisor/pkg/usermem" +) + +var ( + // UserspaceSize is the total size of userspace. + UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1) + + // MaximumUserAddress is the largest possible user address. + MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1) + + // KernelStartAddress is the starting kernel address. + KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) ) // Segment indices and Selectors.
diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go new file mode 100644 index 000000000..0e2ab716c --- /dev/null +++ b/pkg/sentry/platform/ring0/defs_arm64.go @@ -0,0 +1,143 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package ring0 + +import ( + "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.dev/gvisor/pkg/usermem" +) + +var ( + // UserspaceSize is the total size of userspace. + UserspaceSize = uintptr(1) << (VirtualAddressBits()) + + // MaximumUserAddress is the largest possible user address. + MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1) + + // KernelStartAddress is the starting kernel address. + KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) +) + +// KernelOpts has initialization options for the kernel. +type KernelOpts struct { + // PageTables are the kernel pagetables; this must be provided. + PageTables *pagetables.PageTables +} + +// KernelArchState contains architecture-specific state. +type KernelArchState struct { + KernelOpts +} + +// CPUArchState contains CPU-specific arch state. +type CPUArchState struct { + // stack is the stack used for interrupts on this CPU. + stack [512]byte + + // errorCode is the error code from the last exception. + errorCode uintptr + + // errorType indicates the type of error code here; it is always set + // along with the errorCode value above. + // + // It will either be 1, which indicates a user error, or 0, indicating a + // kernel error. If the error code below returns false (kernel error), + // then it cannot provide relevant information about the last + // exception. + errorType uintptr + + // faultAddr is the value of far_el1. + faultAddr uintptr + + // ttbr0Kvm is the value of ttbr0_el1 for the sentry. + ttbr0Kvm uintptr + + // ttbr0App is the value of ttbr0_el1 for the application. + ttbr0App uintptr + + // vecCode is the exception vector. + vecCode Vector + + // appAddr is the application context pointer. + appAddr uintptr + + // lazyVFP is the value of cpacr_el1. + lazyVFP uintptr +} + +// ErrorCode returns the last error code. +// +// The returned boolean indicates whether the error code corresponds to the +// last user error or not. If it does not, then fault information must be +// ignored. This is generally the result of a kernel fault while servicing a +// user fault. +// +//go:nosplit +func (c *CPU) ErrorCode() (value uintptr, user bool) { + return c.errorCode, c.errorType != 0 +} + +// ClearErrorCode resets the error code. +// +//go:nosplit +func (c *CPU) ClearErrorCode() { + c.errorCode = 0 // No code. + c.errorType = 1 // User mode. +} + +//go:nosplit +func (c *CPU) GetFaultAddr() (value uintptr) { + return c.faultAddr +} + +//go:nosplit +func (c *CPU) SetTtbr0Kvm(value uintptr) { + c.ttbr0Kvm = value +} + +//go:nosplit +func (c *CPU) SetTtbr0App(value uintptr) { + c.ttbr0App = value +} + +//go:nosplit +func (c *CPU) GetVector() (value Vector) { + return c.vecCode +} + +//go:nosplit +func (c *CPU) SetAppAddr(value uintptr) { + c.appAddr = value +} + +// GetLazyVFP returns the value of cpacr_el1. +//go:nosplit +func (c *CPU) GetLazyVFP() (value uintptr) { + return c.lazyVFP +} + +// SwitchArchOpts are embedded in SwitchOpts. +type SwitchArchOpts struct { + // UserASID indicates the application ASID to be used on switch. + UserASID uint16 + + // KernelASID indicates the kernel ASID to be used on return. + KernelASID uint16 +} + +func init() { +} diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go index a5ce67885..7fa43c2f5 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.go +++ b/pkg/sentry/platform/ring0/entry_amd64.go @@ -17,7 +17,7 @@ package ring0 import ( - "syscall" + "gvisor.dev/gvisor/pkg/sentry/arch" ) // This is an assembly function. @@ -41,7 +41,7 @@ func swapgs() // The return code is the vector that interrupted execution. // // See stubs.go for a note regarding the frame size of this function. -func sysret(*CPU, *syscall.PtraceRegs) Vector +func sysret(*CPU, *arch.Registers) Vector // "iret is the cadillac of CPL switching." // @@ -50,7 +50,7 @@ func sysret(*CPU, *syscall.PtraceRegs) Vector // iret is nearly identical to sysret, except an iret is used to fully restore // all user state. This must be called in cases where all registers need to be // restored. -func iret(*CPU, *syscall.PtraceRegs) Vector +func iret(*CPU, *arch.Registers) Vector // exception is the generic exception entry. //
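The errorCode/errorType pair above is the arm64 fault-reporting protocol. A minimal sketch of how a platform caller might consume it after a switch returns PageFault; handleFault is a hypothetical helper, while ErrorCode and GetFaultAddr are the accessors added in this diff:

package myplatform

import "gvisor.dev/gvisor/pkg/sentry/platform/ring0"

// handleFault recovers the fault address, but only for a user fault.
func handleFault(c *ring0.CPU) (uintptr, bool) {
	if _, user := c.ErrorCode(); !user {
		// A kernel fault occurred while servicing a user fault; the
		// saved fault information is unreliable, so ignore it.
		return 0, false
	}
	return c.GetFaultAddr(), true // FAR_EL1, saved by the el0_da/el0_ia paths.
}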
diff --git a/pkg/sentry/platform/ring0/entry_arm64.go b/pkg/sentry/platform/ring0/entry_arm64.go new file mode 100644 index 000000000..62a93f3d6 --- /dev/null +++ b/pkg/sentry/platform/ring0/entry_arm64.go @@ -0,0 +1,60 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package ring0 + +// The functions below are implemented in assembly (see entry_arm64.s). They +// are the entry points for the EL0/EL1 exception vectors; the vector that +// fired determines which stub is invoked. + +func El1_sync_invalid() +func El1_irq_invalid() +func El1_fiq_invalid() +func El1_error_invalid() + +func El1_sync() +func El1_irq() +func El1_fiq() +func El1_error() + +func El0_sync() +func El0_irq() +func El0_fiq() +func El0_error() + +func El0_sync_invalid() +func El0_irq_invalid() +func El0_fiq_invalid() +func El0_error_invalid() + +func Vectors() + +// Start is the CPU entrypoint. +// +// The CPU state will be set to c.Registers(). +func Start() +func kernelExitToEl1() + +func kernelExitToEl0() + +// Shutdown stops execution. +func Shutdown()
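Once a switch returns, the platform dispatches on the returned Vector. A hypothetical dispatch loop in the same spirit (runApp and mapFault are illustrative names, not part of this change; a real platform such as KVM does considerably more bookkeeping):

package myplatform

import "gvisor.dev/gvisor/pkg/sentry/platform/ring0"

// mapFault is assumed here: it tries to resolve a fault and reports success.
func mapFault(addr uintptr) bool { return false }

// runApp re-enters the application until an event the sentry must handle.
func runApp(c *ring0.CPU, opts ring0.SwitchOpts) ring0.Vector {
	for {
		switch vector := c.SwitchToUser(opts); vector {
		case ring0.PageFault:
			if !mapFault(c.GetFaultAddr()) {
				return vector // Unhandled: deliver SIGSEGV.
			}
			// Fault resolved; re-enter the application.
		default:
			return vector // Syscall, El0Sync_undef, ...: sentry handles it.
		}
	}
}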
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s new file mode 100644 index 000000000..9d29b7168 --- /dev/null +++ b/pkg/sentry/platform/ring0/entry_arm64.s @@ -0,0 +1,786 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "funcdata.h" +#include "textflag.h" + +// NB: Offsets are programmatically generated (see BUILD). +// +// This file is concatenated with the definitions. + +// ERET returns using the ELR and SPSR for the current exception level. +#define ERET() \ + WORD $0xd69f03e0 + +// RSV_REG is a register that holds el1 information temporarily. +#define RSV_REG R18_PLATFORM + +// RSV_REG_APP is a register that holds el0 information temporarily. +#define RSV_REG_APP R9 + +#define FPEN_NOTRAP 0x3 +#define FPEN_SHIFT 20 + +#define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT) + +// sctlr_el1: system control register el1. +#define SCTLR_M 1 << 0 +#define SCTLR_C 1 << 2 +#define SCTLR_I 1 << 12 +#define SCTLR_UCT 1 << 15 + +#define SCTLR_EL1_DEFAULT (SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_UCT) + +// cntkctl_el1: counter-timer kernel control register el1. +#define CNTKCTL_EL0PCTEN 1 << 0 +#define CNTKCTL_EL0VCTEN 1 << 1 + +#define CNTKCTL_EL1_DEFAULT (CNTKCTL_EL0PCTEN | CNTKCTL_EL0VCTEN) + +// Saves a register set. +// +// This is a macro because it may need to be executed in contexts where a +// stack is not available for calls. +// +// The following registers are not saved: R9, R18. +#define REGISTERS_SAVE(reg, offset) \ + MOVD R0, offset+PTRACE_R0(reg); \ + MOVD R1, offset+PTRACE_R1(reg); \ + MOVD R2, offset+PTRACE_R2(reg); \ + MOVD R3, offset+PTRACE_R3(reg); \ + MOVD R4, offset+PTRACE_R4(reg); \ + MOVD R5, offset+PTRACE_R5(reg); \ + MOVD R6, offset+PTRACE_R6(reg); \ + MOVD R7, offset+PTRACE_R7(reg); \ + MOVD R8, offset+PTRACE_R8(reg); \ + MOVD R10, offset+PTRACE_R10(reg); \ + MOVD R11, offset+PTRACE_R11(reg); \ + MOVD R12, offset+PTRACE_R12(reg); \ + MOVD R13, offset+PTRACE_R13(reg); \ + MOVD R14, offset+PTRACE_R14(reg); \ + MOVD R15, offset+PTRACE_R15(reg); \ + MOVD R16, offset+PTRACE_R16(reg); \ + MOVD R17, offset+PTRACE_R17(reg); \ + MOVD R19, offset+PTRACE_R19(reg); \ + MOVD R20, offset+PTRACE_R20(reg); \ + MOVD R21, offset+PTRACE_R21(reg); \ + MOVD R22, offset+PTRACE_R22(reg); \ + MOVD R23, offset+PTRACE_R23(reg); \ + MOVD R24, offset+PTRACE_R24(reg); \ + MOVD R25, offset+PTRACE_R25(reg); \ + MOVD R26, offset+PTRACE_R26(reg); \ + MOVD R27, offset+PTRACE_R27(reg); \ + MOVD g, offset+PTRACE_R28(reg); \ + MOVD R29, offset+PTRACE_R29(reg); \ + MOVD R30, offset+PTRACE_R30(reg); + +// Loads a register set. +// +// This is a macro because it may need to be executed in contexts where a +// stack is not available for calls. +// +// The following registers are not loaded: R9, R18. +#define REGISTERS_LOAD(reg, offset) \ + MOVD offset+PTRACE_R0(reg), R0; \ + MOVD offset+PTRACE_R1(reg), R1; \ + MOVD offset+PTRACE_R2(reg), R2; \ + MOVD offset+PTRACE_R3(reg), R3; \ + MOVD offset+PTRACE_R4(reg), R4; \ + MOVD offset+PTRACE_R5(reg), R5; \ + MOVD offset+PTRACE_R6(reg), R6; \ + MOVD offset+PTRACE_R7(reg), R7; \ + MOVD offset+PTRACE_R8(reg), R8; \ + MOVD offset+PTRACE_R10(reg), R10; \ + MOVD offset+PTRACE_R11(reg), R11; \ + MOVD offset+PTRACE_R12(reg), R12; \ + MOVD offset+PTRACE_R13(reg), R13; \ + MOVD offset+PTRACE_R14(reg), R14; \ + MOVD offset+PTRACE_R15(reg), R15; \ + MOVD offset+PTRACE_R16(reg), R16; \ + MOVD offset+PTRACE_R17(reg), R17; \ + MOVD offset+PTRACE_R19(reg), R19; \ + MOVD offset+PTRACE_R20(reg), R20; \ + MOVD offset+PTRACE_R21(reg), R21; \ + MOVD offset+PTRACE_R22(reg), R22; \ + MOVD offset+PTRACE_R23(reg), R23; \ + MOVD offset+PTRACE_R24(reg), R24; \ + MOVD offset+PTRACE_R25(reg), R25; \ + MOVD offset+PTRACE_R26(reg), R26; \ + MOVD offset+PTRACE_R27(reg), R27; \ + MOVD offset+PTRACE_R28(reg), g; \ + MOVD offset+PTRACE_R29(reg), R29; \ + MOVD offset+PTRACE_R30(reg), R30; + +// NOP-s +#define nop31Instructions() \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; \ + WORD $0xd503201f; + +#define ESR_ELx_EC_UNKNOWN (0x00) +#define ESR_ELx_EC_WFx (0x01) +/* Unallocated EC: 0x02 */ +#define ESR_ELx_EC_CP15_32 (0x03) +#define ESR_ELx_EC_CP15_64 (0x04) +#define ESR_ELx_EC_CP14_MR (0x05) +#define ESR_ELx_EC_CP14_LS (0x06) +#define ESR_ELx_EC_FP_ASIMD (0x07) +#define ESR_ELx_EC_CP10_ID (0x08) /* EL2 only */ +#define ESR_ELx_EC_PAC (0x09) /* EL2 and above */ +/* Unallocated EC: 0x0A - 0x0B */ +#define ESR_ELx_EC_CP14_64 (0x0C) +/* Unallocated EC: 0x0d */ +#define ESR_ELx_EC_ILL (0x0E) +/* Unallocated EC: 0x0F - 0x10 */ +#define ESR_ELx_EC_SVC32 (0x11) +#define ESR_ELx_EC_HVC32 (0x12) /* EL2 only */ +#define ESR_ELx_EC_SMC32 (0x13) /* EL2 and above */ +/* Unallocated EC: 0x14 */ +#define ESR_ELx_EC_SVC64 (0x15) +#define ESR_ELx_EC_HVC64 (0x16) /* EL2 and above */ +#define ESR_ELx_EC_SMC64 (0x17) /* EL2 and above */ +#define ESR_ELx_EC_SYS64 (0x18) +#define ESR_ELx_EC_SVE (0x19) +/* Unallocated EC: 0x1A - 0x1E */ +#define ESR_ELx_EC_IMP_DEF (0x1f) /* EL3 only */ +#define ESR_ELx_EC_IABT_LOW (0x20) +#define ESR_ELx_EC_IABT_CUR (0x21) +#define ESR_ELx_EC_PC_ALIGN (0x22) +/* Unallocated EC: 0x23 */ +#define ESR_ELx_EC_DABT_LOW (0x24) +#define ESR_ELx_EC_DABT_CUR (0x25) +#define ESR_ELx_EC_SP_ALIGN (0x26) +/* Unallocated EC: 0x27 */ +#define ESR_ELx_EC_FP_EXC32 (0x28) +/* Unallocated EC: 0x29 - 0x2B */ +#define ESR_ELx_EC_FP_EXC64 (0x2C) +/* Unallocated EC: 0x2D - 0x2E */ +#define ESR_ELx_EC_SERROR (0x2F) +#define ESR_ELx_EC_BREAKPT_LOW (0x30) +#define ESR_ELx_EC_BREAKPT_CUR (0x31) +#define ESR_ELx_EC_SOFTSTP_LOW (0x32) +#define ESR_ELx_EC_SOFTSTP_CUR (0x33) +#define ESR_ELx_EC_WATCHPT_LOW (0x34) +#define ESR_ELx_EC_WATCHPT_CUR (0x35) +/* Unallocated EC: 0x36 - 0x37 */ +#define ESR_ELx_EC_BKPT32 (0x38) +/* Unallocated EC: 0x39 */ +#define ESR_ELx_EC_VECTOR32 (0x3A) /* EL2 only */ +/* Unallocated EC: 0x3B */ +#define ESR_ELx_EC_BRK64 (0x3C) +/* Unallocated EC: 0x3D - 0x3F */ +#define ESR_ELx_EC_MAX (0x3F) + +#define ESR_ELx_EC_SHIFT (26) +#define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT) +#define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) + +#define ESR_ELx_IL_SHIFT (25) +#define ESR_ELx_IL (UL(1) << ESR_ELx_IL_SHIFT) +#define ESR_ELx_ISS_MASK (ESR_ELx_IL - 1) + +/* ISS field definitions shared by different classes */ +#define ESR_ELx_WNR_SHIFT (6) +#define ESR_ELx_WNR (UL(1) << ESR_ELx_WNR_SHIFT) + +/* Asynchronous Error Type */ +#define ESR_ELx_IDS_SHIFT (24) +#define ESR_ELx_IDS (UL(1) << ESR_ELx_IDS_SHIFT) +#define ESR_ELx_AET_SHIFT (10) +#define ESR_ELx_AET (UL(0x7) << ESR_ELx_AET_SHIFT) + +#define ESR_ELx_AET_UC (UL(0) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_UEU (UL(1) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_UEO (UL(2) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_UER (UL(3) << ESR_ELx_AET_SHIFT) +#define ESR_ELx_AET_CE (UL(6) << ESR_ELx_AET_SHIFT) + +/* Shared ISS field definitions for Data/Instruction aborts */ +#define ESR_ELx_SET_SHIFT (11) +#define ESR_ELx_SET_MASK (UL(3) << ESR_ELx_SET_SHIFT) +#define ESR_ELx_FnV_SHIFT (10) +#define ESR_ELx_FnV (UL(1) << ESR_ELx_FnV_SHIFT) +#define ESR_ELx_EA_SHIFT (9) +#define ESR_ELx_EA (UL(1) << ESR_ELx_EA_SHIFT) +#define ESR_ELx_S1PTW_SHIFT (7) +#define ESR_ELx_S1PTW (UL(1) << ESR_ELx_S1PTW_SHIFT) + +/* Shared ISS fault status code(IFSC/DFSC) for Data/Instruction aborts */ +#define ESR_ELx_FSC (0x3F) +#define ESR_ELx_FSC_TYPE (0x3C) +#define ESR_ELx_FSC_EXTABT (0x10) +#define ESR_ELx_FSC_SERROR (0x11) +#define ESR_ELx_FSC_ACCESS (0x08) +#define ESR_ELx_FSC_FAULT (0x04) +#define ESR_ELx_FSC_PERM (0x0C) + +/* ISS field definitions for Data Aborts */ +#define ESR_ELx_ISV_SHIFT (24) +#define ESR_ELx_ISV (UL(1) << ESR_ELx_ISV_SHIFT) +#define ESR_ELx_SAS_SHIFT (22) +#define ESR_ELx_SAS (UL(3) << ESR_ELx_SAS_SHIFT) +#define ESR_ELx_SSE_SHIFT (21) +#define ESR_ELx_SSE (UL(1) << ESR_ELx_SSE_SHIFT) +#define ESR_ELx_SRT_SHIFT (16) +#define ESR_ELx_SRT_MASK (UL(0x1F) << ESR_ELx_SRT_SHIFT) +#define ESR_ELx_SF_SHIFT (15) +#define ESR_ELx_SF (UL(1) << ESR_ELx_SF_SHIFT) +#define ESR_ELx_AR_SHIFT (14) +#define ESR_ELx_AR (UL(1) << ESR_ELx_AR_SHIFT) +#define ESR_ELx_CM_SHIFT (8) +#define ESR_ELx_CM (UL(1) << ESR_ELx_CM_SHIFT) + +/* ISS field definitions for exceptions taken in to Hyp */ +#define ESR_ELx_CV (UL(1) << 24) +#define ESR_ELx_COND_SHIFT (20) +#define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) +#define ESR_ELx_WFx_ISS_TI (UL(1) << 0) +#define ESR_ELx_WFx_ISS_WFI (UL(0) << 0) +#define ESR_ELx_WFx_ISS_WFE (UL(1) << 0) +#define ESR_ELx_xVC_IMM_MASK ((1UL << 16) - 1)
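The handlers further below classify exceptions by the ESR_EL1 exception class, exactly as these macros encode it. The same decode in Go, with the shift and mask values from above:

package main

import "fmt"

const (
	esrELxECShift = 26          // ESR_ELx_EC_SHIFT.
	esrELxECMask  = 0x3f << 26  // ESR_ELx_EC_MASK.
	ecSVC64       = 0x15        // ESR_ELx_EC_SVC64.
	ecDABTLow     = 0x24        // ESR_ELx_EC_DABT_LOW.
)

func main() {
	esr := uint64(0x56000000) // Example ESR for an SVC from AArch64.
	ec := (esr & esrELxECMask) >> esrELxECShift
	fmt.Println(ec == ecSVC64, ec == ecDABTLow) // true false
}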
+// LOAD_KERNEL_ADDRESS loads a kernel address. +#define LOAD_KERNEL_ADDRESS(from, to) \ + MOVD from, to; \ + ORR $0xffff000000000000, to, to; + +// LOAD_KERNEL_STACK loads the kernel temporary stack. +#define LOAD_KERNEL_STACK(from) \ + LOAD_KERNEL_ADDRESS(CPU_SELF(from), RSV_REG); \ + MOVD $CPU_STACK_TOP(RSV_REG), RSV_REG; \ + MOVD RSV_REG, RSP; \ + WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 + ISB $15; \ + DSB $15; + +// SWITCH_TO_APP_PAGETABLE sets a new pagetable for a container application. +#define SWITCH_TO_APP_PAGETABLE(from) \ + MOVD CPU_TTBR0_APP(from), RSV_REG; \ + WORD $0xd5182012; \ // MSR R18, TTBR0_EL1 + ISB $15; \ + DSB $15; + +// SWITCH_TO_KVM_PAGETABLE sets the kvm pagetable. +#define SWITCH_TO_KVM_PAGETABLE(from) \ + MOVD CPU_TTBR0_KVM(from), RSV_REG; \ + WORD $0xd5182012; \ // MSR R18, TTBR0_EL1 + ISB $15; \ + DSB $15; + +#define VFP_ENABLE \ + MOVD $FPEN_ENABLE, R0; \ + WORD $0xd5181040; \ //MSR R0, CPACR_EL1 + ISB $15; + +#define VFP_DISABLE \ + MOVD $0x0, R0; \ + WORD $0xd5181040; \ //MSR R0, CPACR_EL1 + ISB $15; + +// KERNEL_ENTRY_FROM_EL0 is the entry code of the vcpu from el0 to el1. +#define KERNEL_ENTRY_FROM_EL0 \ + SUB $16, RSP, RSP; \ // step1, save r18, r9 into kernel temporary stack. + STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \ + WORD $0xd538d092; \ //MRS TPIDR_EL1, R18, step2, switch user pagetable. + SWITCH_TO_KVM_PAGETABLE(RSV_REG); \ + WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 + MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP; \ // step3, load app context pointer. + REGISTERS_SAVE(RSV_REG_APP, 0); \ // step4, save app context. + MOVD RSV_REG_APP, R20; \ + LDP 16*0(RSP), (RSV_REG, RSV_REG_APP); \ + ADD $16, RSP, RSP; \ + MOVD RSV_REG, PTRACE_R18(R20); \ + MOVD RSV_REG_APP, PTRACE_R9(R20); \ + MOVD R20, RSV_REG_APP; \ + WORD $0xd5384003; \ // MRS SPSR_EL1, R3 + MOVD R3, PTRACE_PSTATE(RSV_REG_APP); \ + MRS ELR_EL1, R3; \ + MOVD R3, PTRACE_PC(RSV_REG_APP); \ + WORD $0xd5384103; \ // MRS SP_EL0, R3 + MOVD R3, PTRACE_SP(RSV_REG_APP); + +// KERNEL_ENTRY_FROM_EL1 is the entry code of the vcpu from el1 to el1. +#define KERNEL_ENTRY_FROM_EL1 \ + WORD $0xd538d092; \ //MRS TPIDR_EL1, R18 + REGISTERS_SAVE(RSV_REG, CPU_REGISTERS); \ // Save sentry context. + MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG); \ + WORD $0xd5384004; \ // MRS SPSR_EL1, R4 + MOVD R4, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG); \ + MRS ELR_EL1, R4; \ + MOVD R4, CPU_REGISTERS+PTRACE_PC(RSV_REG); \ + MOVD RSP, R4; \ + MOVD R4, CPU_REGISTERS+PTRACE_SP(RSV_REG); \ + LOAD_KERNEL_STACK(RSV_REG); // Load the temporary stack.
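The entry and exit paths above swap TTBR0_EL1 between the two values stashed on the CPU. A sketch of the host-side wiring using the setters added in defs_arm64.go; how the two ttbr0 values are computed is platform-specific and assumed here:

package myplatform

import "gvisor.dev/gvisor/pkg/sentry/platform/ring0"

// configureCPU stores the two page table roots the assembly will install.
func configureCPU(c *ring0.CPU, kernelTTBR0, appTTBR0 uintptr) {
	c.SetTtbr0Kvm(kernelTTBR0) // Installed by SWITCH_TO_KVM_PAGETABLE on kernel entry.
	c.SetTtbr0App(appTTBR0)    // Installed by SWITCH_TO_APP_PAGETABLE on exit to EL0.
}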
+// Halt halts execution. +TEXT ·Halt(SB),NOSPLIT,$0 + // Clear bluepill. + WORD $0xd538d092 //MRS TPIDR_EL1, R18 + CMP RSV_REG, R9 + BNE mmio_exit + MOVD $0, CPU_REGISTERS+PTRACE_R9(RSV_REG) + + // Flush dcache. + WORD $0xd5087e52 // DC CISW +mmio_exit: + // Disable fpsimd. + WORD $0xd5381041 // MRS CPACR_EL1, R1 + MOVD R1, CPU_LAZY_VFP(RSV_REG) + VFP_DISABLE + + // Trigger MMIO_EXIT/_KVM_HYPERCALL_VMEXIT. + // + // To keep it simple, I used the address of the exception table as the + // MMIO base address, so that I can trigger a MMIO-EXIT by forcibly writing + // to a read-only space. + // Also, the length is enough to match a sufficient number of hypercall IDs. + // Then, in host user space, I can calculate this address to find out + // which hypercall it was. + MRS VBAR_EL1, R9 + MOVD R0, 0x0(R9) + + // Flush dcache. + WORD $0xd5087e52 // DC CISW + + RET + +// HaltAndResume halts execution and then resumes at the resume function. +TEXT ·HaltAndResume(SB),NOSPLIT,$0 + BL ·Halt(SB) + B ·kernelExitToEl1(SB) // Resume. + +// HaltEl1SvcAndResume calls Hooks.KernelSyscall and resumes. +TEXT ·HaltEl1SvcAndResume(SB),NOSPLIT,$0 + WORD $0xd538d092 // MRS TPIDR_EL1, R18 + MOVD CPU_SELF(RSV_REG), R3 // Load vCPU. + MOVD R3, 8(RSP) // First argument (vCPU). + CALL ·kernelSyscall(SB) // Call the trampoline. + B ·kernelExitToEl1(SB) // Resume. + +// Shutdown stops the guest. +TEXT ·Shutdown(SB),NOSPLIT,$0 + // PSCI EVENT. + MOVD $0x84000009, R0 + HVC $0
+// See kernel.go. +TEXT ·Current(SB),NOSPLIT,$0-8 + MOVD CPU_SELF(RSV_REG), R8 + MOVD R8, ret+0(FP) + RET + +#define STACK_FRAME_SIZE 16 + +// kernelExitToEl0 is the entrypoint for the application in guest_el0. +// It prepares the vcpu environment for the container application. +TEXT ·kernelExitToEl0(SB),NOSPLIT,$0 + // Step1, save the sentry context into memory. + MRS TPIDR_EL1, RSV_REG + REGISTERS_SAVE(RSV_REG, CPU_REGISTERS) + MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG) + + WORD $0xd5384003 // MRS SPSR_EL1, R3 + MOVD R3, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG) + MOVD R30, CPU_REGISTERS+PTRACE_PC(RSV_REG) + MOVD RSP, R3 + MOVD R3, CPU_REGISTERS+PTRACE_SP(RSV_REG) + + MOVD CPU_REGISTERS+PTRACE_R3(RSV_REG), R3 + + // Step2, switch to the temporary stack. + LOAD_KERNEL_STACK(RSV_REG) + + // Step3, load the app context pointer. + MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP + + // Step4, prepare the environment for the container application. + // Set sp_el0. + MOVD PTRACE_SP(RSV_REG_APP), R1 + WORD $0xd5184101 //MSR R1, SP_EL0 + // Set pc. + MOVD PTRACE_PC(RSV_REG_APP), R1 + MSR R1, ELR_EL1 + // Set pstate. + MOVD PTRACE_PSTATE(RSV_REG_APP), R1 + WORD $0xd5184001 //MSR R1, SPSR_EL1 + + // RSV_REG & RSV_REG_APP will be loaded at the end. + REGISTERS_LOAD(RSV_REG_APP, 0) + + // Switch to the user pagetable. + MOVD PTRACE_R18(RSV_REG_APP), RSV_REG + MOVD PTRACE_R9(RSV_REG_APP), RSV_REG_APP + + SUB $STACK_FRAME_SIZE, RSP, RSP + STP (RSV_REG, RSV_REG_APP), 16*0(RSP) + + WORD $0xd538d092 //MRS TPIDR_EL1, R18 + + SWITCH_TO_APP_PAGETABLE(RSV_REG) + + LDP 16*0(RSP), (RSV_REG, RSV_REG_APP) + ADD $STACK_FRAME_SIZE, RSP, RSP + + ISB $15 + ERET() + +// kernelExitToEl1 is the entrypoint for the sentry in guest_el1. +// It prepares the vcpu environment for the sentry. +TEXT ·kernelExitToEl1(SB),NOSPLIT,$0 + WORD $0xd538d092 //MRS TPIDR_EL1, R18 + MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R1 + WORD $0xd5184001 //MSR R1, SPSR_EL1 + + MOVD CPU_REGISTERS+PTRACE_PC(RSV_REG), R1 + MSR R1, ELR_EL1 + + MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R1 + MOVD R1, RSP + + REGISTERS_LOAD(RSV_REG, CPU_REGISTERS) + MOVD CPU_REGISTERS+PTRACE_R9(RSV_REG), RSV_REG_APP + + ERET() + +// Start is the CPU entrypoint. +TEXT ·Start(SB),NOSPLIT,$0 + // Flush dcache. + WORD $0xd5087e52 // DC CISW + // Init. + MOVD $SCTLR_EL1_DEFAULT, R1 + MSR R1, SCTLR_EL1 + + MOVD $CNTKCTL_EL1_DEFAULT, R1 + MSR R1, CNTKCTL_EL1 + + MOVD R8, RSV_REG + ORR $0xffff000000000000, RSV_REG, RSV_REG + WORD $0xd518d092 //MSR R18, TPIDR_EL1 + + B ·kernelExitToEl1(SB) + +// El1_sync_invalid is the handler for an invalid EL1_sync. +TEXT ·El1_sync_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +// El1_irq_invalid is the handler for an invalid El1_irq. +TEXT ·El1_irq_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +// El1_fiq_invalid is the handler for an invalid El1_fiq. +TEXT ·El1_fiq_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +// El1_error_invalid is the handler for an invalid El1_error. +TEXT ·El1_error_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +// El1_sync is the handler for El1_sync.
+TEXT ·El1_sync(SB),NOSPLIT,$0 + KERNEL_ENTRY_FROM_EL1 + WORD $0xd5385219 // MRS ESR_EL1, R25 + LSR $ESR_ELx_EC_SHIFT, R25, R24 + CMP $ESR_ELx_EC_DABT_CUR, R24 + BEQ el1_da + CMP $ESR_ELx_EC_IABT_CUR, R24 + BEQ el1_ia + CMP $ESR_ELx_EC_SYS64, R24 + BEQ el1_undef + CMP $ESR_ELx_EC_SP_ALIGN, R24 + BEQ el1_sp_pc + CMP $ESR_ELx_EC_PC_ALIGN, R24 + BEQ el1_sp_pc + CMP $ESR_ELx_EC_UNKNOWN, R24 + BEQ el1_undef + CMP $ESR_ELx_EC_SVC64, R24 + BEQ el1_svc + CMP $ESR_ELx_EC_BREAKPT_CUR, R24 + BGE el1_dbg + CMP $ESR_ELx_EC_FP_ASIMD, R24 + BEQ el1_fpsimd_acc + B el1_invalid + +el1_da: +el1_ia: + WORD $0xd538d092 //MRS TPIDR_EL1, R18 + WORD $0xd538601a //MRS FAR_EL1, R26 + + MOVD R26, CPU_FAULT_ADDR(RSV_REG) + + MOVD $0, CPU_ERROR_TYPE(RSV_REG) + + MOVD $PageFault, R3 + MOVD R3, CPU_VECTOR_CODE(RSV_REG) + + B ·HaltAndResume(SB) + +el1_sp_pc: + B ·Shutdown(SB) + +el1_undef: + B ·Shutdown(SB) + +el1_svc: + MOVD $0, CPU_ERROR_CODE(RSV_REG) + MOVD $0, CPU_ERROR_TYPE(RSV_REG) + B ·HaltEl1SvcAndResume(SB) + +el1_dbg: + B ·Shutdown(SB) + +el1_fpsimd_acc: + VFP_ENABLE + B ·kernelExitToEl1(SB) // Resume. + +el1_invalid: + B ·Shutdown(SB) + +// El1_irq is the handler for El1_irq. +TEXT ·El1_irq(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +// El1_fiq is the handler for El1_fiq. +TEXT ·El1_fiq(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +// El1_error is the handler for El1_error. +TEXT ·El1_error(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +// El0_sync is the handler for El0_sync. +TEXT ·El0_sync(SB),NOSPLIT,$0 + KERNEL_ENTRY_FROM_EL0 + WORD $0xd5385219 // MRS ESR_EL1, R25 + LSR $ESR_ELx_EC_SHIFT, R25, R24 + CMP $ESR_ELx_EC_SVC64, R24 + BEQ el0_svc + CMP $ESR_ELx_EC_DABT_LOW, R24 + BEQ el0_da + CMP $ESR_ELx_EC_IABT_LOW, R24 + BEQ el0_ia + CMP $ESR_ELx_EC_FP_ASIMD, R24 + BEQ el0_fpsimd_acc + CMP $ESR_ELx_EC_SVE, R24 + BEQ el0_sve_acc + CMP $ESR_ELx_EC_FP_EXC64, R24 + BEQ el0_fpsimd_exc + CMP $ESR_ELx_EC_SP_ALIGN, R24 + BEQ el0_sp_pc + CMP $ESR_ELx_EC_PC_ALIGN, R24 + BEQ el0_sp_pc + CMP $ESR_ELx_EC_UNKNOWN, R24 + BEQ el0_undef + CMP $ESR_ELx_EC_BREAKPT_LOW, R24 + BGE el0_dbg + B el0_invalid + +el0_svc: + WORD $0xd538d092 //MRS TPIDR_EL1, R18 + + MOVD $0, CPU_ERROR_CODE(RSV_REG) // Clear error code. + + MOVD $1, R3 + MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user. + + MOVD $Syscall, R3 + MOVD R3, CPU_VECTOR_CODE(RSV_REG) + + B ·kernelExitToEl1(SB) + +el0_da: +el0_ia: + WORD $0xd538d092 //MRS TPIDR_EL1, R18 + WORD $0xd538601a //MRS FAR_EL1, R26 + + MOVD R26, CPU_FAULT_ADDR(RSV_REG) + + MOVD $1, R3 + MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user. + + MOVD $PageFault, R3 + MOVD R3, CPU_VECTOR_CODE(RSV_REG) + + MRS ESR_EL1, R3 + MOVD R3, CPU_ERROR_CODE(RSV_REG) + + B ·kernelExitToEl1(SB) + +el0_fpsimd_acc: + B ·Shutdown(SB) + +el0_sve_acc: + B ·Shutdown(SB) + +el0_fpsimd_exc: + B ·Shutdown(SB) + +el0_sp_pc: + B ·Shutdown(SB) + +el0_undef: + MOVD $El0Sync_undef, R3 + MOVD R3, CPU_VECTOR_CODE(RSV_REG) + + B ·kernelExitToEl1(SB) + +el0_dbg: + B ·Shutdown(SB) + +el0_invalid: + B ·Shutdown(SB) + +TEXT ·El0_irq(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El0_fiq(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El0_error(SB),NOSPLIT,$0 + KERNEL_ENTRY_FROM_EL0 + WORD $0xd538d092 //MRS TPIDR_EL1, R18 + WORD $0xd538601a //MRS FAR_EL1, R26 + + MOVD R26, CPU_FAULT_ADDR(RSV_REG) + + MOVD $1, R3 + MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user. 
+ + MOVD $VirtualizationException, R3 + MOVD R3, CPU_VECTOR_CODE(RSV_REG) + + B ·HaltAndResume(SB) + +TEXT ·El0_sync_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El0_irq_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El0_fiq_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +TEXT ·El0_error_invalid(SB),NOSPLIT,$0 + B ·Shutdown(SB) + +// Vectors implements the exception vector table. +TEXT ·Vectors(SB),NOSPLIT,$0 + B ·El1_sync_invalid(SB) + nop31Instructions() + B ·El1_irq_invalid(SB) + nop31Instructions() + B ·El1_fiq_invalid(SB) + nop31Instructions() + B ·El1_error_invalid(SB) + nop31Instructions() + + B ·El1_sync(SB) + nop31Instructions() + B ·El1_irq(SB) + nop31Instructions() + B ·El1_fiq(SB) + nop31Instructions() + B ·El1_error(SB) + nop31Instructions() + + B ·El0_sync(SB) + nop31Instructions() + B ·El0_irq(SB) + nop31Instructions() + B ·El0_fiq(SB) + nop31Instructions() + B ·El0_error(SB) + nop31Instructions() + + B ·El0_sync_invalid(SB) + nop31Instructions() + B ·El0_irq_invalid(SB) + nop31Instructions() + B ·El0_fiq_invalid(SB) + nop31Instructions() + B ·El0_error_invalid(SB) + nop31Instructions() + + // The exception vector table must be 11-bit (2KB) aligned. + // Please see the Linux source code as reference: arch/arm64/kernel/entry.S. + // For gVisor, I defined it as 4K in length and filled the 2nd 2K part with NOPs, + // so that I can safely move the 1st 2K part to an address with 11-bit alignment. + WORD $0xd503201f //nop + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() + WORD $0xd503201f + nop31Instructions() diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD index 780bf9a66..549f3d228 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -1,25 +1,34 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) go_template_instance( - name = "defs_impl", - out = "defs_impl.go", + name = "defs_impl_arm64", + out = "defs_impl_arm64.go", package = "main", - template = "//pkg/sentry/platform/ring0:defs", + template = "//pkg/sentry/platform/ring0:defs_arm64", +) + +go_template_instance( + name = "defs_impl_amd64", + out = "defs_impl_amd64.go", + package = "main", + template = "//pkg/sentry/platform/ring0:defs_amd64", ) go_binary( name = "gen_offsets", srcs = [ - "defs_impl.go", + "defs_impl_amd64.go", + "defs_impl_arm64.go", "main.go", ], visibility = ["//pkg/sentry/platform/ring0:__pkg__"], deps = [ "//pkg/cpuid", + "//pkg/sentry/arch", "//pkg/sentry/platform/ring0/pagetables", - "//pkg/sentry/usermem", + "//pkg/usermem", ], )
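gen_offsets links the instantiated defs with main.go and prints the #define lines consumed by the assembly above. The core technique, reduced to a standalone example:

package main

import (
	"fmt"
	"reflect"
)

// cpu stands in for the real ring0.CPU; only the technique matters here.
type cpu struct {
	self      uintptr
	errorCode uintptr
}

func main() {
	c := &cpu{}
	// Offset of a field = address of the field - address of the struct.
	off := reflect.ValueOf(&c.errorCode).Pointer() - reflect.ValueOf(c).Pointer()
	fmt.Printf("#define CPU_ERROR_CODE 0x%02x\n", off) // 0x08 on a 64-bit build.
}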
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go index 900c0bba7..021693791 100644 --- a/pkg/sentry/platform/ring0/kernel.go +++ b/pkg/sentry/platform/ring0/kernel.go @@ -31,23 +31,39 @@ type defaultHooks struct{} // KernelSyscall implements Hooks.KernelSyscall. // +// +checkescape:all +// //go:nosplit -func (defaultHooks) KernelSyscall() { Halt() } +func (defaultHooks) KernelSyscall() { + Halt() +} // KernelException implements Hooks.KernelException. // +// +checkescape:all +// //go:nosplit -func (defaultHooks) KernelException(Vector) { Halt() } +func (defaultHooks) KernelException(Vector) { + Halt() +} // kernelSyscall is a trampoline. // +// +checkescape:hard,stack +// //go:nosplit -func kernelSyscall(c *CPU) { c.hooks.KernelSyscall() } +func kernelSyscall(c *CPU) { + c.hooks.KernelSyscall() +} // kernelException is a trampoline. // +// +checkescape:hard,stack +// //go:nosplit -func kernelException(c *CPU, vector Vector) { c.hooks.KernelException(vector) } +func kernelException(c *CPU, vector Vector) { + c.hooks.KernelException(vector) +} // Init initializes a new CPU. // diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 0feff8778..d37981dbf 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -178,6 +178,8 @@ func IsCanonical(addr uint64) bool { // // Precondition: the Rip, Rsp, Fs and Gs registers must be canonical. // +// +checkescape:all +// //go:nosplit func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID) @@ -192,9 +194,9 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { // Perform the switch. swapgs() // GS will be swapped on return. - WriteFS(uintptr(regs.Fs_base)) // Set application FS. - WriteGS(uintptr(regs.Gs_base)) // Set application GS. - LoadFloatingPoint(switchOpts.FloatingPointState) // Copy in floating point. + WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS. + WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS. + LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point. jumpToKernel() // Switch to upper half. writeCR3(uintptr(userCR3)) // Change to user address space. if switchOpts.FullRestore { @@ -204,8 +206,8 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { } writeCR3(uintptr(kernelCR3)) // Return to kernel address space. jumpToUser() // Return to lower half. - SaveFloatingPoint(switchOpts.FloatingPointState) // Copy out floating point. - WriteFS(uintptr(c.registers.Fs_base)) // Restore kernel FS. + SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point. + WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS. return }
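The trampolines above land in a Hooks implementation; defaultHooks simply halts. A sketch of a custom hook set in the same shape (the KVM platform's real hooks do more; the nosplit annotation matters because these run on the tiny per-CPU kernel stack):

package myplatform

import "gvisor.dev/gvisor/pkg/sentry/platform/ring0"

type hooks struct{}

// KernelSyscall implements ring0.Hooks.KernelSyscall.
//
//go:nosplit
func (hooks) KernelSyscall() {
	ring0.Halt() // Bounce to the host, as defaultHooks does.
}

// KernelException implements ring0.Hooks.KernelException.
//
//go:nosplit
func (hooks) KernelException(ring0.Vector) {
	ring0.Halt()
}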
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go new file mode 100644 index 000000000..d0afa1aaa --- /dev/null +++ b/pkg/sentry/platform/ring0/kernel_arm64.go @@ -0,0 +1,72 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package ring0 + +// HaltAndResume halts execution and then resumes at the resume function. +//go:nosplit +func HaltAndResume() + +// HaltEl1SvcAndResume calls Hooks.KernelSyscall and resumes. +//go:nosplit +func HaltEl1SvcAndResume() + +// init initializes architecture-specific state. +func (k *Kernel) init(opts KernelOpts) { + // Save the root page tables. + k.PageTables = opts.PageTables +} + +// init initializes architecture-specific state. +func (c *CPU) init() { + // Set the kernel stack pointer (virtual address). + c.registers.Sp = uint64(c.StackTop()) + +} + +// StackTop returns the kernel's stack address. +// +//go:nosplit +func (c *CPU) StackTop() uint64 { + return uint64(kernelAddr(&c.stack[0])) + uint64(len(c.stack)) +} + +// IsCanonical indicates whether addr is canonical per the arm64 spec. +// +//go:nosplit +func IsCanonical(addr uint64) bool { + return addr <= 0x0000ffffffffffff || addr > 0xffff000000000000 +} + +// SwitchToUser performs a switch to the user address space. +// +//go:nosplit +func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { + regs := switchOpts.Registers + + regs.Pstate &= ^uint64(PsrFlagsClear) + regs.Pstate |= UserFlagsSet + + LoadFloatingPoint(switchOpts.FloatingPointState) + SetTLS(regs.TPIDR_EL0) + + kernelExitToEl0() + + regs.TPIDR_EL0 = GetTLS() + SaveFloatingPoint(switchOpts.FloatingPointState) + + vector = c.vecCode + + return +} diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go new file mode 100644 index 000000000..00e52c8af --- /dev/null +++ b/pkg/sentry/platform/ring0/lib_arm64.go @@ -0,0 +1,58 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package ring0 + +// CPACREL1 returns the value of the CPACR_EL1 register. +func CPACREL1() (value uintptr) + +// GetFPCR returns the value of the FPCR register. +func GetFPCR() (value uintptr) + +// SetFPCR writes the FPCR value. +func SetFPCR(value uintptr) + +// GetFPSR returns the value of the FPSR register. +func GetFPSR() (value uintptr) + +// SetFPSR writes the FPSR value. +func SetFPSR(value uintptr) + +// SaveVRegs saves V0-V31 registers. +// V0-V31: 32 128-bit registers for floating point and simd. +func SaveVRegs(*byte) + +// LoadVRegs loads V0-V31 registers. +func LoadVRegs(*byte) + +// LoadFloatingPoint loads floating point state. +func LoadFloatingPoint(*byte) + +// SaveFloatingPoint saves floating point state. +func SaveFloatingPoint(*byte) + +// GetTLS returns the value of TPIDR_EL0 register. +func GetTLS() (value uint64) + +// SetTLS writes the TPIDR_EL0 value. +func SetTLS(value uint64) + +// Init sets function pointers based on architectural features. +// +// This must be called prior to using ring0. +func Init() { + rewriteVectors() +} diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s new file mode 100644 index 000000000..86bfbe46f --- /dev/null +++ b/pkg/sentry/platform/ring0/lib_arm64.s @@ -0,0 +1,217 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "funcdata.h" +#include "textflag.h" + +TEXT ·GetTLS(SB),NOSPLIT,$0-8 + MRS TPIDR_EL0, R1 + MOVD R1, ret+0(FP) + RET + +TEXT ·SetTLS(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R1 + MSR R1, TPIDR_EL0 + RET + +TEXT ·CPACREL1(SB),NOSPLIT,$0-8 + WORD $0xd5381041 // MRS CPACR_EL1, R1 + MOVD R1, ret+0(FP) + RET + +TEXT ·GetFPCR(SB),NOSPLIT,$0-8 + WORD $0xd53b4201 // MRS NZCV, R1 + MOVD R1, ret+0(FP) + RET + +TEXT ·GetFPSR(SB),NOSPLIT,$0-8 + WORD $0xd53b4421 // MRS FPSR, R1 + MOVD R1, ret+0(FP) + RET + +TEXT ·SetFPCR(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R1 + WORD $0xd51b4201 // MSR R1, NZCV + RET + +TEXT ·SetFPSR(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R1 + WORD $0xd51b4421 // MSR R1, FPSR + RET + +TEXT ·SaveVRegs(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R0 + + // Skip aarch64_ctx, fpsr, fpcr. + FMOVD F0, 16*1(R0) + FMOVD F1, 16*2(R0) + FMOVD F2, 16*3(R0) + FMOVD F3, 16*4(R0) + FMOVD F4, 16*5(R0) + FMOVD F5, 16*6(R0) + FMOVD F6, 16*7(R0) + FMOVD F7, 16*8(R0) + FMOVD F8, 16*9(R0) + FMOVD F9, 16*10(R0) + FMOVD F10, 16*11(R0) + FMOVD F11, 16*12(R0) + FMOVD F12, 16*13(R0) + FMOVD F13, 16*14(R0) + FMOVD F14, 16*15(R0) + FMOVD F15, 16*16(R0) + FMOVD F16, 16*17(R0) + FMOVD F17, 16*18(R0) + FMOVD F18, 16*19(R0) + FMOVD F19, 16*20(R0) + FMOVD F20, 16*21(R0) + FMOVD F21, 16*22(R0) + FMOVD F22, 16*23(R0) + FMOVD F23, 16*24(R0) + FMOVD F24, 16*25(R0) + FMOVD F25, 16*26(R0) + FMOVD F26, 16*27(R0) + FMOVD F27, 16*28(R0) + FMOVD F28, 16*29(R0) + FMOVD F29, 16*30(R0) + FMOVD F30, 16*31(R0) + FMOVD F31, 16*32(R0) + ISB $15 + + RET + +TEXT ·LoadVRegs(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R0 + + // Skip aarch64_ctx, fpsr, fpcr. 
+ FMOVD 16*1(R0), F0 + FMOVD 16*2(R0), F1 + FMOVD 16*3(R0), F2 + FMOVD 16*4(R0), F3 + FMOVD 16*5(R0), F4 + FMOVD 16*6(R0), F5 + FMOVD 16*7(R0), F6 + FMOVD 16*8(R0), F7 + FMOVD 16*9(R0), F8 + FMOVD 16*10(R0), F9 + FMOVD 16*11(R0), F10 + FMOVD 16*12(R0), F11 + FMOVD 16*13(R0), F12 + FMOVD 16*14(R0), F13 + FMOVD 16*15(R0), F14 + FMOVD 16*16(R0), F15 + FMOVD 16*17(R0), F16 + FMOVD 16*18(R0), F17 + FMOVD 16*19(R0), F18 + FMOVD 16*20(R0), F19 + FMOVD 16*21(R0), F20 + FMOVD 16*22(R0), F21 + FMOVD 16*23(R0), F22 + FMOVD 16*24(R0), F23 + FMOVD 16*25(R0), F24 + FMOVD 16*26(R0), F25 + FMOVD 16*27(R0), F26 + FMOVD 16*28(R0), F27 + FMOVD 16*29(R0), F28 + FMOVD 16*30(R0), F29 + FMOVD 16*31(R0), F30 + FMOVD 16*32(R0), F31 + ISB $15 + + RET + +TEXT ·LoadFloatingPoint(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R0 + + MOVD 0(R0), R1 + MOVD R1, FPSR + MOVD 8(R0), R1 + MOVD R1, NZCV + + FMOVD 16*1(R0), F0 + FMOVD 16*2(R0), F1 + FMOVD 16*3(R0), F2 + FMOVD 16*4(R0), F3 + FMOVD 16*5(R0), F4 + FMOVD 16*6(R0), F5 + FMOVD 16*7(R0), F6 + FMOVD 16*8(R0), F7 + FMOVD 16*9(R0), F8 + FMOVD 16*10(R0), F9 + FMOVD 16*11(R0), F10 + FMOVD 16*12(R0), F11 + FMOVD 16*13(R0), F12 + FMOVD 16*14(R0), F13 + FMOVD 16*15(R0), F14 + FMOVD 16*16(R0), F15 + FMOVD 16*17(R0), F16 + FMOVD 16*18(R0), F17 + FMOVD 16*19(R0), F18 + FMOVD 16*20(R0), F19 + FMOVD 16*21(R0), F20 + FMOVD 16*22(R0), F21 + FMOVD 16*23(R0), F22 + FMOVD 16*24(R0), F23 + FMOVD 16*25(R0), F24 + FMOVD 16*26(R0), F25 + FMOVD 16*27(R0), F26 + FMOVD 16*28(R0), F27 + FMOVD 16*29(R0), F28 + FMOVD 16*30(R0), F29 + FMOVD 16*31(R0), F30 + FMOVD 16*32(R0), F31 + + RET + +TEXT ·SaveFloatingPoint(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R0 + + MOVD FPSR, R1 + MOVD R1, 0(R0) + MOVD NZCV, R1 + MOVD R1, 8(R0) + + FMOVD F0, 16*1(R0) + FMOVD F1, 16*2(R0) + FMOVD F2, 16*3(R0) + FMOVD F3, 16*4(R0) + FMOVD F4, 16*5(R0) + FMOVD F5, 16*6(R0) + FMOVD F6, 16*7(R0) + FMOVD F7, 16*8(R0) + FMOVD F8, 16*9(R0) + FMOVD F9, 16*10(R0) + FMOVD F10, 16*11(R0) + FMOVD F11, 16*12(R0) + FMOVD F12, 16*13(R0) + FMOVD F13, 16*14(R0) + FMOVD F14, 16*15(R0) + FMOVD F15, 16*16(R0) + FMOVD F16, 16*17(R0) + FMOVD F17, 16*18(R0) + FMOVD F18, 16*19(R0) + FMOVD F19, 16*20(R0) + FMOVD F20, 16*21(R0) + FMOVD F21, 16*22(R0) + FMOVD F22, 16*23(R0) + FMOVD F23, 16*24(R0) + FMOVD F24, 16*25(R0) + FMOVD F25, 16*26(R0) + FMOVD F26, 16*27(R0) + FMOVD F27, 16*28(R0) + FMOVD F28, 16*29(R0) + FMOVD F29, 16*30(R0) + FMOVD F30, 16*31(R0) + FMOVD F31, 16*32(R0) + + RET diff --git a/pkg/sentry/platform/ring0/lib_arm64_unsafe.go b/pkg/sentry/platform/ring0/lib_arm64_unsafe.go new file mode 100644 index 000000000..c05166fea --- /dev/null +++ b/pkg/sentry/platform/ring0/lib_arm64_unsafe.go @@ -0,0 +1,108 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// +build arm64 + +package ring0 + +import ( + "reflect" + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/safecopy" + "gvisor.dev/gvisor/pkg/usermem" +) + +const ( + nopInstruction = 0xd503201f + instSize = unsafe.Sizeof(uint32(0)) + vectorsRawLen = 0x800 +) + +func unsafeSlice(addr uintptr, length int) (slice []uint32) { + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) + hdr.Data = addr + hdr.Len = length / int(instSize) + hdr.Cap = length / int(instSize) + return slice +} + +// Workaround: move ring0.Vectors() to an address with 11-bit alignment. +// +// According to the ARM64 architecture documentation, +// the start address of the exception vector table must be 11-bit aligned. +// Please see the code in the Linux kernel as reference: arch/arm64/kernel/entry.S. +// But we can't align a function's start address to a specific value in Go. +// We have raised this question in the Go community: +// https://groups.google.com/forum/m/#!topic/golang-dev/RPj90l5x86I +// This function will be removed when Go supports this feature. +// +// This function does two jobs: +// 1. Move the start address of the exception vector table to an address with the required alignment. +// 2. Adjust the offset of each instruction accordingly. +func rewriteVectors() { + vectorsBegin := reflect.ValueOf(Vectors).Pointer() + + // The exception vector table is required to be 11-bit aligned, + // and its size is 0x800. + // Please see the documentation as reference: + // https://developer.arm.com/docs/100933/0100/aarch64-exception-vector-table + // + // But Go does not allow placing a function at a specific address. + // So, for gVisor, I defined the size of the exception vector table as 4K + // and filled the 2nd 2K part with NOPs, + // so that I can safely move the 1st 2K part to an address with 11-bit alignment. + // + // So, the prerequisite for this function to work correctly is: + // vectorsSafeLen >= 0x1000 + // vectorsRawLen = 0x800 + vectorsSafeLen := int(safecopy.FindEndAddress(vectorsBegin) - vectorsBegin) + if vectorsSafeLen < 2*vectorsRawLen { + panic("Can't update vectors") + } + + vectorsSafeTable := unsafeSlice(vectorsBegin, vectorsSafeLen) // Now a []uint32 + vectorsRawLen32 := vectorsRawLen / int(instSize) + + offset := vectorsBegin & (1<<11 - 1) + if offset != 0 { + offset = 1<<11 - offset + } + + pageBegin := (vectorsBegin + offset) & ^uintptr(usermem.PageSize-1) + + _, _, errno := syscall.Syscall(syscall.SYS_MPROTECT, uintptr(pageBegin), uintptr(usermem.PageSize), uintptr(syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC)) + if errno != 0 { + panic(errno.Error()) + } + + offset = offset / instSize // By index, not bytes. + // Move the exception vector table to the target address; this should use memmove. + for i := 1; i <= vectorsRawLen32; i++ { + vectorsSafeTable[int(offset)+vectorsRawLen32-i] = vectorsSafeTable[vectorsRawLen32-i] + } + + // Adjust branches, since the instructions were moved forward.
+ for i := 0; i < vectorsRawLen32; i++ { + if vectorsSafeTable[int(offset)+i] != nopInstruction { + vectorsSafeTable[int(offset)+i] -= uint32(offset) + } + } + + _, _, errno = syscall.Syscall(syscall.SYS_MPROTECT, uintptr(pageBegin), uintptr(usermem.PageSize), uintptr(syscall.PROT_READ|syscall.PROT_EXEC)) + if errno != 0 { + panic(errno.Error()) + } +} diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go index 85cc3fdad..b8ab120a0 100644 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -20,7 +20,8 @@ import ( "fmt" "io" "reflect" - "syscall" + + "gvisor.dev/gvisor/pkg/sentry/arch" ) // Emit prints architecture-specific offsets. @@ -64,7 +65,7 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define SyscallInt80 0x%02x\n", SyscallInt80) fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) - p := &syscall.PtraceRegs{} + p := &arch.Registers{} fmt.Fprintf(w, "\n// Ptrace registers.\n") fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer()) fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer()) diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go new file mode 100644 index 000000000..f3de962f0 --- /dev/null +++ b/pkg/sentry/platform/ring0/offsets_arm64.go @@ -0,0 +1,127 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package ring0 + +import ( + "fmt" + "io" + "reflect" + + "gvisor.dev/gvisor/pkg/sentry/arch" +) + +// Emit prints architecture-specific offsets. 
+func Emit(w io.Writer) { + fmt.Fprintf(w, "// Automatically generated, do not edit.\n") + + c := &CPU{} + fmt.Fprintf(w, "\n// CPU offsets.\n") + fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack))) + fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_FAULT_ADDR 0x%02x\n", reflect.ValueOf(&c.faultAddr).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_TTBR0_KVM 0x%02x\n", reflect.ValueOf(&c.ttbr0Kvm).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_TTBR0_APP 0x%02x\n", reflect.ValueOf(&c.ttbr0App).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_VECTOR_CODE 0x%02x\n", reflect.ValueOf(&c.vecCode).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_APP_ADDR 0x%02x\n", reflect.ValueOf(&c.appAddr).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_LAZY_VFP 0x%02x\n", reflect.ValueOf(&c.lazyVFP).Pointer()-reflect.ValueOf(c).Pointer()) + + fmt.Fprintf(w, "\n// Bits.\n") + fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet) + + fmt.Fprintf(w, "\n// Vectors.\n") + fmt.Fprintf(w, "#define El1SyncInvalid 0x%02x\n", El1SyncInvalid) + fmt.Fprintf(w, "#define El1IrqInvalid 0x%02x\n", El1IrqInvalid) + fmt.Fprintf(w, "#define El1FiqInvalid 0x%02x\n", El1FiqInvalid) + fmt.Fprintf(w, "#define El1ErrorInvalid 0x%02x\n", El1ErrorInvalid) + + fmt.Fprintf(w, "#define El1Sync 0x%02x\n", El1Sync) + fmt.Fprintf(w, "#define El1Irq 0x%02x\n", El1Irq) + fmt.Fprintf(w, "#define El1Fiq 0x%02x\n", El1Fiq) + fmt.Fprintf(w, "#define El1Error 0x%02x\n", El1Error) + + fmt.Fprintf(w, "#define El0Sync 0x%02x\n", El0Sync) + fmt.Fprintf(w, "#define El0Irq 0x%02x\n", El0Irq) + fmt.Fprintf(w, "#define El0Fiq 0x%02x\n", El0Fiq) + fmt.Fprintf(w, "#define El0Error 0x%02x\n", El0Error) + + fmt.Fprintf(w, "#define El0Sync_invalid 0x%02x\n", El0Sync_invalid) + fmt.Fprintf(w, "#define El0Irq_invalid 0x%02x\n", El0Irq_invalid) + fmt.Fprintf(w, "#define El0Fiq_invalid 0x%02x\n", El0Fiq_invalid) + fmt.Fprintf(w, "#define El0Error_invalid 0x%02x\n", El0Error_invalid) + + fmt.Fprintf(w, "#define El1Sync_da 0x%02x\n", El1Sync_da) + fmt.Fprintf(w, "#define El1Sync_ia 0x%02x\n", El1Sync_ia) + fmt.Fprintf(w, "#define El1Sync_sp_pc 0x%02x\n", El1Sync_sp_pc) + fmt.Fprintf(w, "#define El1Sync_undef 0x%02x\n", El1Sync_undef) + fmt.Fprintf(w, "#define El1Sync_dbg 0x%02x\n", El1Sync_dbg) + fmt.Fprintf(w, "#define El1Sync_inv 0x%02x\n", El1Sync_inv) + + fmt.Fprintf(w, "#define El0Sync_svc 0x%02x\n", El0Sync_svc) + fmt.Fprintf(w, "#define El0Sync_da 0x%02x\n", El0Sync_da) + fmt.Fprintf(w, "#define El0Sync_ia 0x%02x\n", El0Sync_ia) + fmt.Fprintf(w, "#define El0Sync_fpsimd_acc 0x%02x\n", El0Sync_fpsimd_acc) + fmt.Fprintf(w, "#define El0Sync_sve_acc 0x%02x\n", El0Sync_sve_acc) + fmt.Fprintf(w, "#define El0Sync_sys 0x%02x\n", El0Sync_sys) + fmt.Fprintf(w, "#define El0Sync_sp_pc 0x%02x\n", El0Sync_sp_pc) + fmt.Fprintf(w, "#define El0Sync_undef 0x%02x\n", El0Sync_undef) + fmt.Fprintf(w, "#define El0Sync_dbg 0x%02x\n", El0Sync_dbg) + fmt.Fprintf(w, "#define El0Sync_inv 0x%02x\n", El0Sync_inv) + + fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault) + fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) + fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException) + + p := &arch.Registers{} + fmt.Fprintf(w, "\n// Ptrace registers.\n") + fmt.Fprintf(w, "#define PTRACE_R0 0x%02x\n", reflect.ValueOf(&p.Regs[0]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R1 0x%02x\n", reflect.ValueOf(&p.Regs[1]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R2 0x%02x\n", reflect.ValueOf(&p.Regs[2]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R3 0x%02x\n", reflect.ValueOf(&p.Regs[3]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R4 0x%02x\n", reflect.ValueOf(&p.Regs[4]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R5 0x%02x\n", reflect.ValueOf(&p.Regs[5]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R6 0x%02x\n", reflect.ValueOf(&p.Regs[6]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R7 0x%02x\n", reflect.ValueOf(&p.Regs[7]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.Regs[8]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.Regs[9]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.Regs[10]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.Regs[11]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.Regs[12]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.Regs[13]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.Regs[14]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.Regs[15]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R16 0x%02x\n", reflect.ValueOf(&p.Regs[16]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R17 0x%02x\n", reflect.ValueOf(&p.Regs[17]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R18 0x%02x\n", reflect.ValueOf(&p.Regs[18]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R19 0x%02x\n", reflect.ValueOf(&p.Regs[19]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R20 0x%02x\n", reflect.ValueOf(&p.Regs[20]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R21 0x%02x\n", reflect.ValueOf(&p.Regs[21]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R22 0x%02x\n", reflect.ValueOf(&p.Regs[22]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R23 0x%02x\n", reflect.ValueOf(&p.Regs[23]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R24 0x%02x\n", reflect.ValueOf(&p.Regs[24]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R25 0x%02x\n", reflect.ValueOf(&p.Regs[25]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R26 0x%02x\n", reflect.ValueOf(&p.Regs[26]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R27 0x%02x\n", reflect.ValueOf(&p.Regs[27]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R28 0x%02x\n", reflect.ValueOf(&p.Regs[28]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R29 0x%02x\n", reflect.ValueOf(&p.Regs[29]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R30 0x%02x\n", reflect.ValueOf(&p.Regs[30]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_SP 0x%02x\n", reflect.ValueOf(&p.Sp).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_PC 0x%02x\n", reflect.ValueOf(&p.Pc).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_PSTATE 0x%02x\n", reflect.ValueOf(&p.Pstate).Pointer()-reflect.ValueOf(p).Pointer()) +}
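On arm64, arch.Registers follows the Linux user_pt_regs layout: 31 general-purpose registers, then Sp, Pc and Pstate, all 8 bytes wide. So the emitted constants are simply multiples of eight. A quick cross-check against a mirror struct (regs here is a local stand-in, not the real type):

package main

import (
	"fmt"
	"unsafe"
)

// regs mirrors Linux's user_pt_regs (and arch.Registers on arm64).
type regs struct {
	Regs   [31]uint64
	Sp     uint64
	Pc     uint64
	Pstate uint64
}

func main() {
	var r regs
	fmt.Printf("#define PTRACE_R9 0x%02x\n", 9*8)                  // 0x48
	fmt.Printf("#define PTRACE_SP 0x%02x\n", unsafe.Offsetof(r.Sp)) // 0xf8
	fmt.Printf("#define PTRACE_PC 0x%02x\n", unsafe.Offsetof(r.Pc)) // 0x100
}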
El0Sync_dbg) + fmt.Fprintf(w, "#define El0Sync_inv 0x%02x\n", El0Sync_inv) + + fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault) + fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) + fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException) + + p := &arch.Registers{} + fmt.Fprintf(w, "\n// Ptrace registers.\n") + fmt.Fprintf(w, "#define PTRACE_R0 0x%02x\n", reflect.ValueOf(&p.Regs[0]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R1 0x%02x\n", reflect.ValueOf(&p.Regs[1]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R2 0x%02x\n", reflect.ValueOf(&p.Regs[2]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R3 0x%02x\n", reflect.ValueOf(&p.Regs[3]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R4 0x%02x\n", reflect.ValueOf(&p.Regs[4]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R5 0x%02x\n", reflect.ValueOf(&p.Regs[5]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R6 0x%02x\n", reflect.ValueOf(&p.Regs[6]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R7 0x%02x\n", reflect.ValueOf(&p.Regs[7]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.Regs[8]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.Regs[9]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.Regs[10]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.Regs[11]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.Regs[12]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.Regs[13]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.Regs[14]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.Regs[15]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R16 0x%02x\n", reflect.ValueOf(&p.Regs[16]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R17 0x%02x\n", reflect.ValueOf(&p.Regs[17]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R18 0x%02x\n", reflect.ValueOf(&p.Regs[18]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R19 0x%02x\n", reflect.ValueOf(&p.Regs[19]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R20 0x%02x\n", reflect.ValueOf(&p.Regs[20]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R21 0x%02x\n", reflect.ValueOf(&p.Regs[21]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R22 0x%02x\n", reflect.ValueOf(&p.Regs[22]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R23 0x%02x\n", reflect.ValueOf(&p.Regs[23]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R24 0x%02x\n", reflect.ValueOf(&p.Regs[24]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R25 0x%02x\n", reflect.ValueOf(&p.Regs[25]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R26 0x%02x\n", reflect.ValueOf(&p.Regs[26]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R27 0x%02x\n", 
reflect.ValueOf(&p.Regs[27]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R28 0x%02x\n", reflect.ValueOf(&p.Regs[28]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R29 0x%02x\n", reflect.ValueOf(&p.Regs[29]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R30 0x%02x\n", reflect.ValueOf(&p.Regs[30]).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_SP 0x%02x\n", reflect.ValueOf(&p.Sp).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_PC 0x%02x\n", reflect.ValueOf(&p.Pc).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_PSTATE 0x%02x\n", reflect.ValueOf(&p.Pstate).Pointer()-reflect.ValueOf(p).Pointer()) +} diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index 934a90378..16d5f478b 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -1,14 +1,14 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test", "select_arch") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") package(licenses = ["notice"]) go_template( name = "generic_walker", - srcs = [ - "walker_amd64.go", - ], + srcs = select_arch( + amd64 = ["walker_amd64.go"], + arm64 = ["walker_arm64.go"], + ), opt_types = [ "Visitor", ], @@ -76,20 +76,29 @@ go_library( "allocator.go", "allocator_unsafe.go", "pagetables.go", + "pagetables_aarch64.go", "pagetables_amd64.go", + "pagetables_arm64.go", "pagetables_x86.go", + "pcids.go", + "pcids_aarch64.go", + "pcids_aarch64.s", "pcids_x86.go", + "walker_amd64.go", + "walker_arm64.go", "walker_empty.go", "walker_lookup.go", "walker_map.go", "walker_unmap.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables", visibility = [ "//pkg/sentry/platform/kvm:__subpackages__", "//pkg/sentry/platform/ring0:__subpackages__", ], - deps = ["//pkg/sentry/usermem"], + deps = [ + "//pkg/sync", + "//pkg/usermem", + ], ) go_test( @@ -97,9 +106,10 @@ go_test( size = "small", srcs = [ "pagetables_amd64_test.go", + "pagetables_arm64_test.go", "pagetables_test.go", "walker_check.go", ], - embed = [":pagetables"], - deps = ["//pkg/sentry/usermem"], + library = ":pagetables", + deps = ["//pkg/usermem"], ) diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go index 23fd5c352..8d75b7599 100644 --- a/pkg/sentry/platform/ring0/pagetables/allocator.go +++ b/pkg/sentry/platform/ring0/pagetables/allocator.go @@ -53,9 +53,14 @@ type RuntimeAllocator struct { // NewRuntimeAllocator returns an allocator that uses runtime allocation. func NewRuntimeAllocator() *RuntimeAllocator { - return &RuntimeAllocator{ - used: make(map[*PTEs]struct{}), - } + r := new(RuntimeAllocator) + r.Init() + return r +} + +// Init initializes a RuntimeAllocator. +func (r *RuntimeAllocator) Init() { + r.used = make(map[*PTEs]struct{}) } // Recycle returns freed pages to the pool. 
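The exported Init added above lets a RuntimeAllocator be embedded in a larger structure and initialized in place, rather than always heap-allocated through NewRuntimeAllocator. A minimal sketch of the embedding pattern (the vcpu type and newVCPU constructor are illustrative, not part of gVisor):

	package kvm // hypothetical consumer of the pagetables package

	import "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"

	// vcpu embeds its allocator, so constructing a vcpu performs no
	// separate heap allocation for the allocator itself.
	type vcpu struct {
		allocator pagetables.RuntimeAllocator
		pt        *pagetables.PageTables
	}

	func newVCPU() *vcpu {
		v := &vcpu{}
		v.allocator.Init() // Initialize in place.
		v.pt = pagetables.New(&v.allocator)
		return v
	}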
diff --git a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go index a90394a33..d08bfdeb3 100644 --- a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go +++ b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go @@ -17,7 +17,7 @@ package pagetables import ( "unsafe" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // newAlignedPTEs returns a set of aligned PTEs. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index 904f1a6de..7f18ac296 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -21,7 +21,7 @@ package pagetables import ( - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // PageTables is a set of page tables. @@ -48,15 +48,6 @@ func New(a Allocator) *PageTables { return p } -// Init initializes a set of PageTables. -// -//go:nosplit -func (p *PageTables) Init(allocator Allocator) { - p.Allocator = allocator - p.root = p.Allocator.NewPTEs() - p.rootPhysical = p.Allocator.PhysicalFor(p.root) -} - // mapVisitor is used for map. type mapVisitor struct { target uintptr // Input. @@ -95,6 +86,8 @@ func (*mapVisitor) requiresSplit() bool { return true } // // Precondition: addr & length must be page-aligned, their sum must not overflow. // +// +checkescape:hard,stack +// //go:nosplit func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool { if !opts.AccessType.Any() { @@ -137,6 +130,8 @@ func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) { // // Precondition: addr & length must be page-aligned. // +// +checkescape:hard,stack +// //go:nosplit func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool { w := unmapWalker{ @@ -171,6 +166,8 @@ func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) { // // Precondition: addr & length must be page-aligned. // +// +checkescape:hard,stack +// //go:nosplit func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool { w := emptyWalker{ @@ -206,6 +203,8 @@ func (*lookupVisitor) requiresSplit() bool { return false } // Lookup returns the physical address for the given virtual address. // +// +checkescape:hard,stack +// //go:nosplit func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) { mask := uintptr(usermem.PageSize - 1) diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go new file mode 100644 index 000000000..6409d1d91 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go @@ -0,0 +1,215 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package pagetables + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/usermem" +) + +// archPageTables is architecture-specific data. 
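+//
+// On arm64 the user and kernel halves of the address space are translated
+// through separate roots (TTBR0_EL1 and TTBR1_EL1 below), so a second,
+// kernel-space root is kept here in addition to PageTables.root.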
+type archPageTables struct {
+	// root is the pagetable root for kernel space.
+	root *PTEs
+
+	// rootPhysical is the cached physical address of the root.
+	//
+	// This is saved only to prevent constant translation.
+	rootPhysical uintptr
+
+	asid uint16
+}
+
+// TTBR0_EL1 returns the translation table base register 0.
+//
+//go:nosplit
+func (p *PageTables) TTBR0_EL1(noFlush bool, asid uint16) uint64 {
+	return uint64(p.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset
+}
+
+// TTBR1_EL1 returns the translation table base register 1.
+//
+//go:nosplit
+func (p *PageTables) TTBR1_EL1(noFlush bool, asid uint16) uint64 {
+	return uint64(p.archPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset
+}
+
+// Bits in page table entries.
+const (
+	typeTable   = 0x3 << 0
+	typeSect    = 0x1 << 0
+	typePage    = 0x3 << 0
+	pteValid    = 0x1 << 0
+	pteTableBit = 0x1 << 1
+	pteTypeMask = 0x3 << 0
+	present     = pteValid | pteTableBit
+	user        = 0x1 << 6 /* AP[1] */
+	readOnly    = 0x1 << 7 /* AP[2] */
+	accessed    = 0x1 << 10
+	dbm         = 0x1 << 51
+	writable    = dbm
+	cont        = 0x1 << 52
+	pxn         = 0x1 << 53
+	xn          = 0x1 << 54
+	dirty       = 0x1 << 55
+	nG          = 0x1 << 11
+	shared      = 0x3 << 8
+)
+
+const (
+	mtDevicenGnRE = 0x1 << 2
+	mtNormal      = 0x4 << 2
+)
+
+const (
+	executeDisable = xn
+	optionMask     = 0xfff | 0xfff<<48
+	protDefault    = accessed | shared
+)
+
+// MapOpts are arm64 options.
+type MapOpts struct {
+	// AccessType defines permissions.
+	AccessType usermem.AccessType
+
+	// Global indicates the page is globally accessible.
+	Global bool
+
+	// User indicates the page is a user page.
+	User bool
+}
+
+// PTE is a page table entry.
+type PTE uintptr
+
+// Clear clears this PTE, including sect page information.
+//
+//go:nosplit
+func (p *PTE) Clear() {
+	atomic.StoreUintptr((*uintptr)(p), 0)
+}
+
+// Valid returns true iff this entry is valid.
+//
+//go:nosplit
+func (p *PTE) Valid() bool {
+	return atomic.LoadUintptr((*uintptr)(p))&present != 0
+}
+
+// Opts returns the PTE options.
+//
+// These are all options except Valid and Sect.
+//
+//go:nosplit
+func (p *PTE) Opts() MapOpts {
+	v := atomic.LoadUintptr((*uintptr)(p))
+
+	return MapOpts{
+		AccessType: usermem.AccessType{
+			Read:    true,
+			Write:   v&readOnly == 0,
+			Execute: v&xn == 0,
+		},
+		Global: v&nG == 0,
+		User:   v&user != 0,
+	}
+}
+
+// SetSect sets this page as a sect page.
+//
+// The page must not be valid or a panic will result.
+//
+//go:nosplit
+func (p *PTE) SetSect() {
+	if p.Valid() {
+		// This is not allowed.
+		panic("SetSect called on valid page!")
+	}
+	atomic.StoreUintptr((*uintptr)(p), typeSect)
+}
+
+// IsSect returns true iff this page is a sect page.
+//
+//go:nosplit
+func (p *PTE) IsSect() bool {
+	return atomic.LoadUintptr((*uintptr)(p))&pteTypeMask == typeSect
+}
+
+// Set sets this PTE value.
+//
+// This does not change the sect page property.
+//
+//go:nosplit
+func (p *PTE) Set(addr uintptr, opts MapOpts) {
+	if !opts.AccessType.Any() {
+		p.Clear()
+		return
+	}
+	v := (addr &^ optionMask) | protDefault | nG | readOnly
+
+	if p.IsSect() {
+		// Note that this is inherited from the previous instance. Set
+		// does not change the value of Sect. See above.
+		v |= typeSect
+	} else {
+		v |= typePage
+	}
+
+	if opts.Global {
+		v = v &^ nG
+	}
+
+	if opts.AccessType.Execute {
+		v = v &^ executeDisable
+	} else {
+		v |= executeDisable
+	}
+	if opts.AccessType.Write {
+		v = v &^ readOnly
+	}
+
+	if opts.User {
+		v |= user
+		v |= mtNormal
+	} else {
+		v = v &^ user
+		v |= mtDevicenGnRE // Strongly-ordered device memory for kernel addresses (ring0.KernelStartAddress and above).
+	}
+	atomic.StoreUintptr((*uintptr)(p), v)
+}
+
+// setPageTable sets this PTE value and forces the write bit and sect bit to
+// be cleared. This is used explicitly for breaking sect pages.
+//
+//go:nosplit
+func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) {
+	addr := pt.Allocator.PhysicalFor(ptes)
+	if addr&^optionMask != addr {
+		// This should never happen.
+		panic("unaligned physical address!")
+	}
+	v := addr | typeTable | protDefault | mtNormal
+	atomic.StoreUintptr((*uintptr)(p), v)
+}
+
+// Address extracts the address. This should only be used if Valid returns true.
+//
+//go:nosplit
+func (p *PTE) Address() uintptr {
+	return atomic.LoadUintptr((*uintptr)(p)) &^ optionMask
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
index 7aa6c524e..0c153cf8c 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
@@ -41,5 +41,14 @@ const (
 	entriesPerPage = 512
 )
 
+// Init initializes a set of PageTables.
+//
+//go:nosplit
+func (p *PageTables) Init(allocator Allocator) {
+	p.Allocator = allocator
+	p.root = p.Allocator.NewPTEs()
+	p.rootPhysical = p.Allocator.PhysicalFor(p.root)
+}
+
 // PTEs is a collection of entries.
 type PTEs [entriesPerPage]PTE
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go
index 35e917526..54e8e554f 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go
@@ -19,7 +19,7 @@ package pagetables
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func Test2MAnd4K(t *testing.T) {
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
new file mode 100644
index 000000000..1a49f12a2
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
@@ -0,0 +1,57 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+// Address constraints.
+//
+// The lowerTop and upperBottom currently apply to four-level pagetables;
+// additional refactoring would be necessary to support five-level pagetables.
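+//
+// With 4K pages and 48-bit virtual addresses, each level of the walk
+// resolves nine bits of the address: PGD bits 47:39, PUD bits 38:30, PMD
+// bits 29:21 and PTE bits 20:12, with bits 11:0 left as the offset within
+// the page (see the shift and mask constants below).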
+const (
+	lowerTop    = 0x0000ffffffffffff
+	upperBottom = 0xffff000000000000
+	pteShift    = 12
+	pmdShift    = 21
+	pudShift    = 30
+	pgdShift    = 39
+
+	pteMask = 0x1ff << pteShift
+	pmdMask = 0x1ff << pmdShift
+	pudMask = 0x1ff << pudShift
+	pgdMask = 0x1ff << pgdShift
+
+	pteSize = 1 << pteShift
+	pmdSize = 1 << pmdShift
+	pudSize = 1 << pudShift
+	pgdSize = 1 << pgdShift
+
+	// The TTBRn_EL1 ASID field occupies bits [63:48].
+	ttbrASIDOffset = 48
+	ttbrASIDMask   = 0xff
+
+	entriesPerPage = 512
+)
+
+// Init initializes a set of PageTables.
+//
+//go:nosplit
+func (p *PageTables) Init(allocator Allocator) {
+	p.Allocator = allocator
+	p.root = p.Allocator.NewPTEs()
+	p.rootPhysical = p.Allocator.PhysicalFor(p.root)
+	p.archPageTables.root = p.Allocator.NewPTEs()
+	p.archPageTables.rootPhysical = p.Allocator.PhysicalFor(p.archPageTables.root)
+}
+
+// PTEs is a collection of entries.
+type PTEs [entriesPerPage]PTE
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go
new file mode 100644
index 000000000..2f73d424f
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go
@@ -0,0 +1,80 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package pagetables
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func Test2MAnd4K(t *testing.T) {
+	pt := New(NewRuntimeAllocator())
+
+	// Map a small page and a huge page in both the lower and upper
+	// halves of the address space.
+	pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: true}, pteSize*42)
+	pt.Map(0x0000ff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: true}, pmdSize*47)
+
+	pt.Map(0xffff000000400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: false}, pteSize*42)
+	pt.Map(0xffffff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: false}, pmdSize*47)
+
+	checkMappings(t, pt, []mapping{
+		{0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: true}},
+		{0x0000ff0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: usermem.Read, User: true}},
+		{0xffff000000400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: false}},
+		{0xffffff0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: usermem.Read, User: false}},
+	})
+}
+
+func Test1GAnd4K(t *testing.T) {
+	pt := New(NewRuntimeAllocator())
+
+	// Map a small page and a super page.
+	pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: true}, pteSize*42)
+	pt.Map(0x0000ff0000000000, pudSize, MapOpts{AccessType: usermem.Read, User: true}, pudSize*47)
+
+	checkMappings(t, pt, []mapping{
+		{0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: true}},
+		{0x0000ff0000000000, pudSize, pudSize * 47, MapOpts{AccessType: usermem.Read, User: true}},
+	})
+}
+
+func TestSplit1GPage(t *testing.T) {
+	pt := New(NewRuntimeAllocator())
+
+	// Map a super page and knock out the middle.
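+	// Unmapping the interior forces the walker to split the 1-GB sect
+	// page, so only the first and last 4K pages of the original mapping
+	// should survive.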
+	pt.Map(0x0000ff0000000000, pudSize, MapOpts{AccessType: usermem.Read, User: true}, pudSize*42)
+	pt.Unmap(usermem.Addr(0x0000ff0000000000+pteSize), pudSize-(2*pteSize))
+
+	checkMappings(t, pt, []mapping{
+		{0x0000ff0000000000, pteSize, pudSize * 42, MapOpts{AccessType: usermem.Read, User: true}},
+		{0x0000ff0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, MapOpts{AccessType: usermem.Read, User: true}},
+	})
+}
+
+func TestSplit2MPage(t *testing.T) {
+	pt := New(NewRuntimeAllocator())
+
+	// Map a huge page and knock out the middle.
+	pt.Map(0x0000ff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: true}, pmdSize*42)
+	pt.Unmap(usermem.Addr(0x0000ff0000000000+pteSize), pmdSize-(2*pteSize))
+
+	checkMappings(t, pt, []mapping{
+		{0x0000ff0000000000, pteSize, pmdSize * 42, MapOpts{AccessType: usermem.Read, User: true}},
+		{0x0000ff0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: usermem.Read, User: true}},
+	})
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
index 6e95ad2b9..5c88d087d 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
@@ -17,7 +17,7 @@ package pagetables
 import (
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type mapping struct {
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
index 3e2383c5e..157438d9b 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
+// +build 386 amd64
 
 package pagetables
 
 import (
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // archPageTables is architecture-specific data.
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids.go b/pkg/sentry/platform/ring0/pagetables/pcids.go
new file mode 100644
index 000000000..964496aac
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids.go
@@ -0,0 +1,104 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// PCIDs is a simple PCID database.
+//
+// PCIDs are a per-CPU resource, so each CPU should use its own database;
+// operations on a database are serialized by an internal mutex.
+type PCIDs struct {
+	// mu protects below.
+	mu sync.Mutex
+
+	// cache are the assigned page tables.
+	cache map[*PageTables]uint16
+
+	// avail are available PCIDs.
+	avail []uint16
+}
+
+// NewPCIDs returns a new PCID database.
+//
+// start is the first index to assign. Typically this will be one, as the zero
+// pcid will always be flushed on transition (see pagetables_x86.go). This may
+// be more than one if specific PCIDs are reserved.
+//
+// Nil is returned iff the start and size are out of range.
+func NewPCIDs(start, size uint16) *PCIDs {
+	if start+uint16(size) > limitPCID {
+		return nil // See comment.
+	}
+	p := &PCIDs{
+		cache: make(map[*PageTables]uint16),
+	}
+	for pcid := start; pcid < start+size; pcid++ {
+		p.avail = append(p.avail, pcid)
+	}
+	return p
+}
+
+// Assign assigns a PCID to the given PageTables.
+//
+// This may overwrite any previous assignment. If this is the case, true is
+// returned to indicate that the PCID should be flushed.
+func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) {
+	p.mu.Lock()
+	if pcid, ok := p.cache[pt]; ok {
+		p.mu.Unlock()
+		return pcid, false // No flush.
+	}
+
+	// Is there something available?
+	if len(p.avail) > 0 {
+		pcid := p.avail[len(p.avail)-1]
+		p.avail = p.avail[:len(p.avail)-1]
+		p.cache[pt] = pcid
+
+		// We need to flush because while this is in the available
+		// pool, it may have been used previously.
+		p.mu.Unlock()
+		return pcid, true
+	}
+
+	// Evict an existing table.
+	for old, pcid := range p.cache {
+		delete(p.cache, old)
+		p.cache[pt] = pcid
+
+		// A flush is definitely required in this case, these page
+		// tables may still be active. (They will just be assigned some
+		// other PCID if and when they hit the given CPU again.)
+		p.mu.Unlock()
+		return pcid, true
+	}
+
+	// No PCID.
+	p.mu.Unlock()
+	return 0, false
+}
+
+// Drop drops references to a set of page tables.
+func (p *PCIDs) Drop(pt *PageTables) {
+	p.mu.Lock()
+	if pcid, ok := p.cache[pt]; ok {
+		delete(p.cache, pt)
+		p.avail = append(p.avail, pcid)
+	}
+	p.mu.Unlock()
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.go
new file mode 100644
index 000000000..fbfd41d83
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.go
@@ -0,0 +1,32 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package pagetables
+
+// limitPCID is the maximum value of a valid PCID (ASID).
+//
+// In VMSAv8-64, the PCID (ASID) size is an IMPLEMENTATION DEFINED choice
+// of 8 bits or 16 bits, and ID_AA64MMFR0_EL1.ASIDBits identifies the
+// supported size. When an implementation supports a 16-bit ASID, TCR_ELx.AS
+// selects whether the top 8 bits of the ASID are used.
+var limitPCID uint16
+
+// GetASIDBits returns the system ASID width, either 8 or 16 bits.
+func GetASIDBits() uint8
+
+func init() {
+	limitPCID = uint16(1)<<GetASIDBits() - 1
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.s b/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.s
new file mode 100644
index 000000000..e9d62d768
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_aarch64.s
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+#include "funcdata.h"
+#include "textflag.h"
+
+#define ID_AA64MMFR0_ASIDBITS_SHIFT 4
+#define ID_AA64MMFR0_ASIDBITS_16 2
+#define TCR_EL1_AS_BIT 36
+
+// GetASIDBits returns the system ASID width, either 8 or 16 bits.
+//
+// func GetASIDBits() uint8
+TEXT ·GetASIDBits(SB),NOSPLIT,$0-1
+	// First, check whether 16-bit ASIDs are supported:
+	// ID_AA64MMFR0_EL1.ASIDBITS[7:4] == 0010.
+	WORD $0xd5380700 // MRS ID_AA64MMFR0_EL1, R0
+	UBFX $ID_AA64MMFR0_ASIDBITS_SHIFT, R0, $4, R0
+	CMPW $ID_AA64MMFR0_ASIDBITS_16, R0
+	BNE bits_8
+
+	// Second, check whether 16-bit ASIDs are enabled:
+	// TCR_EL1.AS[36] == 1.
+	WORD $0xd5382040 // MRS TCR_EL1, R0
+	TBZ $TCR_EL1_AS_BIT, R0, bits_8
+	MOVD $16, R0
+	B done
+bits_8:
+	MOVD $8, R0
+done:
+	MOVB R0, ret+0(FP)
+	RET
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
index 0f029f25d..91fc5e8dd 100644
--- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -16,94 +16,5 @@
 
 package pagetables
 
-import (
-	"sync"
-)
-
-// limitPCID is the number of valid PCIDs.
-const limitPCID = 4096
-
-// PCIDs is a simple PCID database.
-//
-// This is not protected by locks and is thus suitable for use only with a
-// single CPU at a time.
-type PCIDs struct {
-	// mu protects below.
-	mu sync.Mutex
-
-	// cache are the assigned page tables.
-	cache map[*PageTables]uint16
-
-	// avail are available PCIDs.
-	avail []uint16
-}
-
-// NewPCIDs returns a new PCID database.
-//
-// start is the first index to assign. Typically this will be one, as the zero
-// pcid will always be flushed on transition (see pagetables_x86.go). This may
-// be more than one if specific PCIDs are reserved.
-//
-// Nil is returned iff the start and size are out of range.
-func NewPCIDs(start, size uint16) *PCIDs {
-	if start+uint16(size) >= limitPCID {
-		return nil // See comment.
-	}
-	p := &PCIDs{
-		cache: make(map[*PageTables]uint16),
-	}
-	for pcid := start; pcid < start+size; pcid++ {
-		p.avail = append(p.avail, pcid)
-	}
-	return p
-}
-
-// Assign assigns a PCID to the given PageTables.
-//
-// This may overwrite any previous assignment provided. If this in the case,
-// true is returned to indicate that the PCID should be flushed.
-func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) {
-	p.mu.Lock()
-	if pcid, ok := p.cache[pt]; ok {
-		p.mu.Unlock()
-		return pcid, false // No flush.
-	}
-
-	// Is there something available?
-	if len(p.avail) > 0 {
-		pcid := p.avail[len(p.avail)-1]
-		p.avail = p.avail[:len(p.avail)-1]
-		p.cache[pt] = pcid
-
-		// We need to flush because while this is in the available
-		// pool, it may have been used previously.
-		p.mu.Unlock()
-		return pcid, true
-	}
-
-	// Evict an existing table.
- for old, pcid := range p.cache { - delete(p.cache, old) - p.cache[pt] = pcid - - // A flush is definitely required in this case, these page - // tables may still be active. (They will just be assigned some - // other PCID if and when they hit the given CPU again.) - p.mu.Unlock() - return pcid, true - } - - // No PCID. - p.mu.Unlock() - return 0, false -} - -// Drop drops references to a set of page tables. -func (p *PCIDs) Drop(pt *PageTables) { - p.mu.Lock() - if pcid, ok := p.cache[pt]; ok { - delete(p.cache, pt) - p.avail = append(p.avail, pcid) - } - p.mu.Unlock() -} +// limitPCID is the maximum value of valid PCIDs. +const limitPCID = 4095 diff --git a/pkg/sentry/platform/ring0/pagetables/walker_arm64.go b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go new file mode 100644 index 000000000..c261d393a --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go @@ -0,0 +1,314 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package pagetables + +// Visitor is a generic type. +type Visitor interface { + // visit is called on each PTE. + visit(start uintptr, pte *PTE, align uintptr) + + // requiresAlloc indicates that new entries should be allocated within + // the walked range. + requiresAlloc() bool + + // requiresSplit indicates that entries in the given range should be + // split if they are huge or jumbo pages. + requiresSplit() bool +} + +// Walker walks page tables. +type Walker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor Visitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is sect pages. If a valid sect page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of sect pages whenever +// possible. Whether a sect page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. 
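+//
+// For the four-level arm64 layout used here, the canonical ranges are
+// [0, lowerTop] and [upperBottom, 2^64); the hole between lowerTop and
+// upperBottom is the non-canonical region (see pagetables_arm64.go).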
+// +//go:nosplit +func (w *Walker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func next(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. +// +//go:nosplit +func (w *Walker) iterateRangeCanonical(start, end uintptr) { + pgdEntryIndex := w.pageTables.root + if start >= upperBottom { + pgdEntryIndex = w.pageTables.archPageTables.root + } + + for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &pgdEntryIndex[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + // Skip over this entry. + start = next(start, pgdSize) + continue + } + + // Allocate a new pgd. + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + // Map the next level. + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + // Skip over this entry. + clearPUDEntries++ + start = next(start, pudSize) + continue + } + + // This level has 1-GB sect pages. Is this + // entire region at least as large as a single + // PUD entry? If so, we can skip allocating a + // new page for the pmd. + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSect() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = next(start, pudSize) + continue + } + } + + // Allocate a new pud. + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSect() { + // Does this page need to be split? + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < next(start, pudSize)) { + // Install the relevant entries. + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSect() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + // A sect page to be checked directly. + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + // Might have been cleared. + if !pudEntry.Valid() { + clearPUDEntries++ + } + + // Note that the sect page was changed. 
+					start = next(start, pudSize)
+					continue
+				}
+
+			} else {
+				pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+			}
+
+			// Map the next level, since this is valid.
+			clearPMDEntries := uint16(0)
+
+			for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+				var (
+					pmdEntry   = &pmdEntries[pmdIndex]
+					pteEntries *PTEs
+				)
+				if !pmdEntry.Valid() {
+					if !w.visitor.requiresAlloc() {
+						// Skip over this entry.
+						clearPMDEntries++
+						start = next(start, pmdSize)
+						continue
+					}
+
+					// This level has 2-MB huge pages. Is this
+					// entire region contained in a single PMD
+					// entry? If so, as above, we can skip
+					// allocating a new page.
+					if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+						pmdEntry.SetSect()
+						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+						if pmdEntry.Valid() {
+							start = next(start, pmdSize)
+							continue
+						}
+					}
+
+					// Allocate a new pmd.
+					pteEntries = w.pageTables.Allocator.NewPTEs()
+					pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+				} else if pmdEntry.IsSect() {
+					// Does this page need to be split?
+					if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < next(start, pmdSize)) {
+						// Install the relevant entries.
+						pteEntries = w.pageTables.Allocator.NewPTEs()
+						for index := uint16(0); index < entriesPerPage; index++ {
+							pteEntries[index].Set(
+								pmdEntry.Address()+(pteSize*uintptr(index)),
+								pmdEntry.Opts())
+						}
+						pmdEntry.setPageTable(w.pageTables, pteEntries)
+					} else {
+						// A huge page to be checked directly.
+						w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+
+						// Might have been cleared.
+						if !pmdEntry.Valid() {
+							clearPMDEntries++
+						}
+
+						// Note that the huge page was changed.
+						start = next(start, pmdSize)
+						continue
+					}
+
+				} else {
+					pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+				}
+
+				// Map the next level, since this is valid.
+				clearPTEEntries := uint16(0)
+
+				for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+					var (
+						pteEntry = &pteEntries[pteIndex]
+					)
+					if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+						clearPTEEntries++
+						start += pteSize
+						continue
+					}
+
+					// At this point, we are guaranteed that start%pteSize == 0.
+					w.visitor.visit(uintptr(start), pteEntry, pteSize-1)
+					if !pteEntry.Valid() {
+						if w.visitor.requiresAlloc() {
+							panic("PTE not set after iteration with requiresAlloc!")
+						}
+						clearPTEEntries++
+					}
+
+					// Note that the pte was changed.
+					start += pteSize
+					continue
+				}
+
+				// Check if we no longer need this page.
+				if clearPTEEntries == entriesPerPage {
+					pmdEntry.Clear()
+					w.pageTables.Allocator.FreePTEs(pteEntries)
+					clearPMDEntries++
+				}
+			}
+
+			// Check if we no longer need this page.
+			if clearPMDEntries == entriesPerPage {
+				pudEntry.Clear()
+				w.pageTables.Allocator.FreePTEs(pmdEntries)
+				clearPUDEntries++
+			}
+		}
+
+		// Check if we no longer need this page.
+		if clearPUDEntries == entriesPerPage {
+			pgdEntry.Clear()
+			w.pageTables.Allocator.FreePTEs(pudEntries)
+		}
+	}
+}
diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go
index 5f80d64e8..9da0ea685 100644
--- a/pkg/sentry/platform/ring0/x86.go
+++ b/pkg/sentry/platform/ring0/x86.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// +build i386 amd64
+// +build 386 amd64
 
 package ring0
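For context, the walker above is never used directly: Map, Unmap, IsEmpty and Lookup (see pagetables.go) each wrap a walker instance, generated from the generic_walker template, around a small visitor type. A sketch of the visitor shape, assuming it lives in the pagetables package (countVisitor is illustrative and not part of the package):

	package pagetables

	// countVisitor counts valid entries in a walked range. Illustrative
	// only; the real visitors (mapVisitor, unmapVisitor, emptyVisitor,
	// lookupVisitor) follow the same shape.
	type countVisitor struct {
		count int // Output.
	}

	// visit is invoked once per entry, including sect (huge) pages.
	func (v *countVisitor) visit(start uintptr, pte *PTE, align uintptr) {
		if pte.Valid() {
			v.count++
		}
	}

	// requiresAlloc is false: counting never installs new entries, so
	// invalid ranges are simply skipped.
	func (*countVisitor) requiresAlloc() bool { return false }

	// requiresSplit is false: sect pages are visited whole rather than
	// broken into individual 4K entries.
	func (*countVisitor) requiresSplit() bool { return false }

A generated walker instance (say countWalker, produced the same way as emptyWalker or lookupWalker) would then be driven with iterateRange(start, end), exactly as the wrappers in pagetables.go do.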