summaryrefslogtreecommitdiffhomepage
path: root/pkg/ring0
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/ring0')
-rw-r--r--pkg/ring0/BUILD86
-rw-r--r--pkg/ring0/aarch64.go123
-rw-r--r--pkg/ring0/defs.go125
-rw-r--r--pkg/ring0/defs_amd64.go167
-rw-r--r--pkg/ring0/defs_arm64.go142
-rw-r--r--pkg/ring0/defs_impl_amd64.go620
-rw-r--r--pkg/ring0/defs_impl_arm64.go (renamed from pkg/ring0/offsets_arm64.go)351
-rw-r--r--pkg/ring0/entry_impl_amd64.s (renamed from pkg/ring0/entry_amd64.s)73
-rw-r--r--pkg/ring0/entry_impl_arm64.s (renamed from pkg/ring0/entry_arm64.s)90
-rw-r--r--pkg/ring0/gen_offsets/BUILD41
-rw-r--r--pkg/ring0/gen_offsets/main.go24
-rw-r--r--pkg/ring0/offsets_amd64.go104
-rw-r--r--pkg/ring0/pagetables/BUILD88
-rw-r--r--pkg/ring0/pagetables/pagetables_aarch64_state_autogen.go6
-rw-r--r--pkg/ring0/pagetables/pagetables_amd64_state_autogen.go6
-rw-r--r--pkg/ring0/pagetables/pagetables_amd64_test.go76
-rw-r--r--pkg/ring0/pagetables/pagetables_arm64_state_autogen.go6
-rw-r--r--pkg/ring0/pagetables/pagetables_arm64_test.go81
-rw-r--r--pkg/ring0/pagetables/pagetables_state_autogen.go3
-rw-r--r--pkg/ring0/pagetables/pagetables_test.go156
-rw-r--r--pkg/ring0/pagetables/pagetables_unsafe_state_autogen.go3
-rw-r--r--pkg/ring0/pagetables/pagetables_x86_state_autogen.go7
-rw-r--r--pkg/ring0/pagetables/walker_empty_amd64.go266
-rw-r--r--pkg/ring0/pagetables/walker_empty_arm64.go276
-rw-r--r--pkg/ring0/pagetables/walker_lookup_amd64.go266
-rw-r--r--pkg/ring0/pagetables/walker_lookup_arm64.go276
-rw-r--r--pkg/ring0/pagetables/walker_map_amd64.go266
-rw-r--r--pkg/ring0/pagetables/walker_map_arm64.go276
-rw-r--r--pkg/ring0/pagetables/walker_unmap_amd64.go266
-rw-r--r--pkg/ring0/pagetables/walker_unmap_arm64.go276
-rw-r--r--pkg/ring0/ring0_amd64_state_autogen.go6
-rw-r--r--pkg/ring0/ring0_arm64_state_autogen.go6
-rw-r--r--pkg/ring0/ring0_impl_amd64_state_autogen.go8
-rw-r--r--pkg/ring0/ring0_impl_arm64_state_autogen.go6
-rw-r--r--pkg/ring0/ring0_state_autogen.go3
-rw-r--r--pkg/ring0/ring0_unsafe_state_autogen.go3
-rw-r--r--pkg/ring0/x86.go298
37 files changed, 3348 insertions, 1528 deletions
diff --git a/pkg/ring0/BUILD b/pkg/ring0/BUILD
deleted file mode 100644
index fda6ba601..000000000
--- a/pkg/ring0/BUILD
+++ /dev/null
@@ -1,86 +0,0 @@
-load("//tools:defs.bzl", "arch_genrule", "go_library")
-load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
-
-package(licenses = ["notice"])
-
-go_template(
- name = "defs_amd64",
- srcs = [
- "defs.go",
- "defs_amd64.go",
- "offsets_amd64.go",
- "x86.go",
- ],
- visibility = [":__subpackages__"],
-)
-
-go_template(
- name = "defs_arm64",
- srcs = [
- "aarch64.go",
- "defs.go",
- "defs_arm64.go",
- "offsets_arm64.go",
- ],
- visibility = [":__subpackages__"],
-)
-
-go_template_instance(
- name = "defs_impl_amd64",
- out = "defs_impl_amd64.go",
- package = "ring0",
- template = ":defs_amd64",
-)
-
-go_template_instance(
- name = "defs_impl_arm64",
- out = "defs_impl_arm64.go",
- package = "ring0",
- template = ":defs_arm64",
-)
-
-arch_genrule(
- name = "entry_impl_amd64",
- srcs = ["entry_amd64.s"],
- outs = ["entry_impl_amd64.s"],
- cmd = "(echo -e '// build +amd64\\n' && QEMU $(location //pkg/ring0/gen_offsets) && cat $(location entry_amd64.s)) > $@",
- tools = ["//pkg/ring0/gen_offsets"],
-)
-
-arch_genrule(
- name = "entry_impl_arm64",
- srcs = ["entry_arm64.s"],
- outs = ["entry_impl_arm64.s"],
- cmd = "(echo -e '// build +arm64\\n' && QEMU $(location //pkg/ring0/gen_offsets) && cat $(location entry_arm64.s)) > $@",
- tools = ["//pkg/ring0/gen_offsets"],
-)
-
-go_library(
- name = "ring0",
- srcs = [
- "defs_impl_amd64.go",
- "defs_impl_arm64.go",
- "entry_amd64.go",
- "entry_arm64.go",
- "entry_impl_amd64.s",
- "entry_impl_arm64.s",
- "kernel.go",
- "kernel_amd64.go",
- "kernel_arm64.go",
- "kernel_unsafe.go",
- "lib_amd64.go",
- "lib_amd64.s",
- "lib_arm64.go",
- "lib_arm64.s",
- "ring0.go",
- ],
- visibility = ["//pkg/sentry:internal"],
- deps = [
- "//pkg/cpuid",
- "//pkg/hostarch",
- "//pkg/ring0/pagetables",
- "//pkg/safecopy",
- "//pkg/sentry/arch",
- "//pkg/sentry/arch/fpu",
- ],
-)
diff --git a/pkg/ring0/aarch64.go b/pkg/ring0/aarch64.go
deleted file mode 100644
index 96c884844..000000000
--- a/pkg/ring0/aarch64.go
+++ /dev/null
@@ -1,123 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//go:build arm64
-// +build arm64
-
-package ring0
-
-// Useful bits.
-const (
- _PGD_PGT_BASE = 0x1000
- _PGD_PGT_SIZE = 0x1000
- _PUD_PGT_BASE = 0x2000
- _PUD_PGT_SIZE = 0x1000
- _PMD_PGT_BASE = 0x3000
- _PMD_PGT_SIZE = 0x4000
- _PTE_PGT_BASE = 0x7000
- _PTE_PGT_SIZE = 0x1000
-)
-
-const (
- // DAIF bits:debug, sError, IRQ, FIQ.
- _PSR_D_BIT = 0x00000200
- _PSR_A_BIT = 0x00000100
- _PSR_I_BIT = 0x00000080
- _PSR_F_BIT = 0x00000040
- _PSR_DAIF_SHIFT = 6
- _PSR_DAIF_MASK = 0xf << _PSR_DAIF_SHIFT
-
- // PSR bits.
- _PSR_MODE_EL0t = 0x00000000
- _PSR_MODE_EL1t = 0x00000004
- _PSR_MODE_EL1h = 0x00000005
- _PSR_MODE_MASK = 0x0000000f
-
- PsrFlagsClear = _PSR_MODE_MASK | _PSR_DAIF_MASK
- PsrModeMask = _PSR_MODE_MASK
-
- // KernelFlagsSet should always be set in the kernel.
- KernelFlagsSet = _PSR_MODE_EL1h | _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT
-
- // UserFlagsSet are always set in userspace.
- UserFlagsSet = _PSR_MODE_EL0t
-)
-
-// Vector is an exception vector.
-type Vector uintptr
-
-// Exception vectors.
-const (
- El1InvSync = iota
- El1InvIrq
- El1InvFiq
- El1InvError
-
- El1Sync
- El1Irq
- El1Fiq
- El1Err
-
- El0Sync
- El0Irq
- El0Fiq
- El0Err
-
- El0InvSync
- El0InvIrq
- El0InvFiq
- El0InvErr
-
- El1SyncDa
- El1SyncIa
- El1SyncSpPc
- El1SyncUndef
- El1SyncDbg
- El1SyncInv
-
- El0SyncSVC
- El0SyncDa
- El0SyncIa
- El0SyncFpsimdAcc
- El0SyncSveAcc
- El0SyncFpsimdExc
- El0SyncSys
- El0SyncSpPc
- El0SyncUndef
- El0SyncDbg
- El0SyncWfx
- El0SyncInv
-
- El0ErrNMI
- El0ErrBounce
-
- _NR_INTERRUPTS
-)
-
-// System call vectors.
-const (
- Syscall Vector = El0SyncSVC
- PageFault Vector = El0SyncDa
- VirtualizationException Vector = El0ErrBounce
-)
-
-// VirtualAddressBits returns the number bits available for virtual addresses.
-func VirtualAddressBits() uint32 {
- return 48
-}
-
-// PhysicalAddressBits returns the number of bits available for physical addresses.
-func PhysicalAddressBits() uint32 {
- return 40
-}
diff --git a/pkg/ring0/defs.go b/pkg/ring0/defs.go
deleted file mode 100644
index 38ce9be1e..000000000
--- a/pkg/ring0/defs.go
+++ /dev/null
@@ -1,125 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ring0
-
-import (
- "gvisor.dev/gvisor/pkg/ring0/pagetables"
- "gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/arch/fpu"
-)
-
-// Kernel is a global kernel object.
-//
-// This contains global state, shared by multiple CPUs.
-type Kernel struct {
- // PageTables are the kernel pagetables; this must be provided.
- PageTables *pagetables.PageTables
-
- KernelArchState
-}
-
-// Hooks are hooks for kernel functions.
-type Hooks interface {
- // KernelSyscall is called for kernel system calls.
- //
- // Return from this call will restore registers and return to the kernel: the
- // registers must be modified directly.
- //
- // If this function is not provided, a kernel exception results in halt.
- //
- // This must be go:nosplit, as this will be on the interrupt stack.
- // Closures are permitted, as the pointer to the closure frame is not
- // passed on the stack.
- KernelSyscall()
-
- // KernelException handles an exception during kernel execution.
- //
- // Return from this call will restore registers and return to the kernel: the
- // registers must be modified directly.
- //
- // If this function is not provided, a kernel exception results in halt.
- //
- // This must be go:nosplit, as this will be on the interrupt stack.
- // Closures are permitted, as the pointer to the closure frame is not
- // passed on the stack.
- KernelException(Vector)
-}
-
-// CPU is the per-CPU struct.
-type CPU struct {
- // self is a self reference.
- //
- // This is always guaranteed to be at offset zero.
- self *CPU
-
- // kernel is reference to the kernel that this CPU was initialized
- // with. This reference is kept for garbage collection purposes: CPU
- // registers may refer to objects within the Kernel object that cannot
- // be safely freed.
- kernel *Kernel
-
- // CPUArchState is architecture-specific state.
- CPUArchState
-
- // registers is a set of registers; these may be used on kernel system
- // calls and exceptions via the Registers function.
- registers arch.Registers
-
- // floatingPointState holds floating point state.
- floatingPointState fpu.State
-
- // hooks are kernel hooks.
- hooks Hooks
-}
-
-// Registers returns a modifiable-copy of the kernel registers.
-//
-// This is explicitly safe to call during KernelException and KernelSyscall.
-//
-//go:nosplit
-func (c *CPU) Registers() *arch.Registers {
- return &c.registers
-}
-
-// FloatingPointState returns the kernel floating point state.
-//
-// This is explicitly safe to call during KernelException and KernelSyscall.
-//
-//go:nosplit
-func (c *CPU) FloatingPointState() *fpu.State {
- return &c.floatingPointState
-}
-
-// SwitchOpts are passed to the Switch function.
-type SwitchOpts struct {
- // Registers are the user register state.
- Registers *arch.Registers
-
- // FloatingPointState is a byte pointer where floating point state is
- // saved and restored.
- FloatingPointState *fpu.State
-
- // PageTables are the application page tables.
- PageTables *pagetables.PageTables
-
- // Flush indicates that a TLB flush should be forced on switch.
- Flush bool
-
- // FullRestore indicates that an iret-based restore should be used.
- FullRestore bool
-
- // SwitchArchOpts are architecture-specific options.
- SwitchArchOpts
-}
diff --git a/pkg/ring0/defs_amd64.go b/pkg/ring0/defs_amd64.go
deleted file mode 100644
index 81e90dbf7..000000000
--- a/pkg/ring0/defs_amd64.go
+++ /dev/null
@@ -1,167 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//go:build amd64
-// +build amd64
-
-package ring0
-
-import (
- "gvisor.dev/gvisor/pkg/hostarch"
-)
-
-var (
- // UserspaceSize is the total size of userspace.
- UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1)
-
- // MaximumUserAddress is the largest possible user address.
- MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(hostarch.PageSize-1)
-
- // KernelStartAddress is the starting kernel address.
- KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
-)
-
-// Segment indices and Selectors.
-const (
- // Index into GDT array.
- _ = iota // Null descriptor first.
- _ // Reserved (Linux is kernel 32).
- segKcode // Kernel code (64-bit).
- segKdata // Kernel data.
- segUcode32 // User code (32-bit).
- segUdata // User data.
- segUcode64 // User code (64-bit).
- segTss // Task segment descriptor.
- segTssHi // Upper bits for TSS.
- segLast // Last segment (terminal, not included).
-)
-
-// Selectors.
-const (
- Kcode Selector = segKcode << 3
- Kdata Selector = segKdata << 3
- Ucode32 Selector = (segUcode32 << 3) | 3
- Udata Selector = (segUdata << 3) | 3
- Ucode64 Selector = (segUcode64 << 3) | 3
- Tss Selector = segTss << 3
-)
-
-// Standard segments.
-var (
- UserCodeSegment32 SegmentDescriptor
- UserDataSegment SegmentDescriptor
- UserCodeSegment64 SegmentDescriptor
- KernelCodeSegment SegmentDescriptor
- KernelDataSegment SegmentDescriptor
-)
-
-// KernelArchState contains architecture-specific state.
-type KernelArchState struct {
- // cpuEntries is array of kernelEntry for all cpus.
- cpuEntries []kernelEntry
-
- // globalIDT is our set of interrupt gates.
- globalIDT *idt64
-}
-
-// kernelEntry contains minimal CPU-specific arch state
-// that can be mapped at the upper of the address space.
-// Malicious APP might steal info from it via CPU bugs.
-type kernelEntry struct {
- // stack is the stack used for interrupts on this CPU.
- stack [256]byte
-
- // scratch space for temporary usage.
- scratch0 uint64
-
- // stackTop is the top of the stack.
- stackTop uint64
-
- // cpuSelf is back reference to CPU.
- cpuSelf *CPU
-
- // kernelCR3 is the cr3 used for sentry kernel.
- kernelCR3 uintptr
-
- // gdt is the CPU's descriptor table.
- gdt descriptorTable
-
- // tss is the CPU's task state.
- tss TaskState64
-}
-
-// CPUArchState contains CPU-specific arch state.
-type CPUArchState struct {
- // errorCode is the error code from the last exception.
- errorCode uintptr
-
- // errorType indicates the type of error code here, it is always set
- // along with the errorCode value above.
- //
- // It will either by 1, which indicates a user error, or 0 indicating a
- // kernel error. If the error code below returns false (kernel error),
- // then it cannot provide relevant information about the last
- // exception.
- errorType uintptr
-
- *kernelEntry
-
- // Copies of global variables, stored in CPU so that they can be used by
- // syscall and exception handlers (in the upper address space).
- hasXSAVE bool
- hasXSAVEOPT bool
-}
-
-// ErrorCode returns the last error code.
-//
-// The returned boolean indicates whether the error code corresponds to the
-// last user error or not. If it does not, then fault information must be
-// ignored. This is generally the result of a kernel fault while servicing a
-// user fault.
-//
-//go:nosplit
-func (c *CPU) ErrorCode() (value uintptr, user bool) {
- return c.errorCode, c.errorType != 0
-}
-
-// ClearErrorCode resets the error code.
-//
-//go:nosplit
-func (c *CPU) ClearErrorCode() {
- c.errorCode = 0 // No code.
- c.errorType = 1 // User mode.
-}
-
-// SwitchArchOpts are embedded in SwitchOpts.
-type SwitchArchOpts struct {
- // UserPCID indicates that the application PCID to be used on switch,
- // assuming that PCIDs are supported.
- //
- // Per pagetables_x86.go, a zero PCID implies a flush.
- UserPCID uint16
-
- // KernelPCID indicates that the kernel PCID to be used on return,
- // assuming that PCIDs are supported.
- //
- // Per pagetables_x86.go, a zero PCID implies a flush.
- KernelPCID uint16
-}
-
-func init() {
- KernelCodeSegment.setCode64(0, 0, 0)
- KernelDataSegment.setData(0, 0xffffffff, 0)
- UserCodeSegment32.setCode64(0, 0, 3)
- UserDataSegment.setData(0, 0xffffffff, 3)
- UserCodeSegment64.setCode64(0, 0, 3)
-}
diff --git a/pkg/ring0/defs_arm64.go b/pkg/ring0/defs_arm64.go
deleted file mode 100644
index 3e212516f..000000000
--- a/pkg/ring0/defs_arm64.go
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//go:build arm64
-// +build arm64
-
-package ring0
-
-import (
- "gvisor.dev/gvisor/pkg/hostarch"
-)
-
-var (
- // UserspaceSize is the total size of userspace.
- UserspaceSize = uintptr(1) << (VirtualAddressBits())
-
- // MaximumUserAddress is the largest possible user address.
- MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(hostarch.PageSize-1)
-
- // KernelStartAddress is the starting kernel address.
- KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
-)
-
-// KernelArchState contains architecture-specific state.
-type KernelArchState struct {
-}
-
-// CPUArchState contains CPU-specific arch state.
-type CPUArchState struct {
- // stack is the stack used for interrupts on this CPU.
- stack [128]byte
-
- // errorCode is the error code from the last exception.
- errorCode uintptr
-
- // errorType indicates the type of error code here, it is always set
- // along with the errorCode value above.
- //
- // It will either by 1, which indicates a user error, or 0 indicating a
- // kernel error. If the error code below returns false (kernel error),
- // then it cannot provide relevant information about the last
- // exception.
- errorType uintptr
-
- // faultAddr is the value of far_el1.
- faultAddr uintptr
-
- // el0Fp is the address of application's fpstate.
- el0Fp uintptr
-
- // ttbr0Kvm is the value of ttbr0_el1 for sentry.
- ttbr0Kvm uintptr
-
- // ttbr0App is the value of ttbr0_el1 for applicaton.
- ttbr0App uintptr
-
- // exception vector.
- vecCode Vector
-
- // application context pointer.
- appAddr uintptr
-
- // lazyVFP is the value of cpacr_el1.
- lazyVFP uintptr
-
- // appASID is the asid value of guest application.
- appASID uintptr
-}
-
-// ErrorCode returns the last error code.
-//
-// The returned boolean indicates whether the error code corresponds to the
-// last user error or not. If it does not, then fault information must be
-// ignored. This is generally the result of a kernel fault while servicing a
-// user fault.
-//
-//go:nosplit
-func (c *CPU) ErrorCode() (value uintptr, user bool) {
- return c.errorCode, c.errorType != 0
-}
-
-// ClearErrorCode resets the error code.
-//
-//go:nosplit
-func (c *CPU) ClearErrorCode() {
- c.errorCode = 0 // No code.
- c.errorType = 1 // User mode.
-}
-
-//go:nosplit
-func (c *CPU) GetFaultAddr() (value uintptr) {
- return c.faultAddr
-}
-
-//go:nosplit
-func (c *CPU) SetTtbr0Kvm(value uintptr) {
- c.ttbr0Kvm = value
-}
-
-//go:nosplit
-func (c *CPU) SetTtbr0App(value uintptr) {
- c.ttbr0App = value
-}
-
-//go:nosplit
-func (c *CPU) GetVector() (value Vector) {
- return c.vecCode
-}
-
-//go:nosplit
-func (c *CPU) SetAppAddr(value uintptr) {
- c.appAddr = value
-}
-
-// GetLazyVFP returns the value of cpacr_el1.
-//go:nosplit
-func (c *CPU) GetLazyVFP() (value uintptr) {
- return c.lazyVFP
-}
-
-// SwitchArchOpts are embedded in SwitchOpts.
-type SwitchArchOpts struct {
- // UserASID indicates that the application ASID to be used on switch,
- UserASID uint16
-
- // KernelASID indicates that the kernel ASID to be used on return,
- KernelASID uint16
-}
-
-func init() {
-}
diff --git a/pkg/ring0/defs_impl_amd64.go b/pkg/ring0/defs_impl_amd64.go
new file mode 100644
index 000000000..df5b4462f
--- /dev/null
+++ b/pkg/ring0/defs_impl_amd64.go
@@ -0,0 +1,620 @@
+//go:build amd64 && amd64 && (386 || amd64)
+// +build amd64
+// +build amd64
+// +build 386 amd64
+
+package ring0
+
+import (
+ "fmt"
+ "gvisor.dev/gvisor/pkg/cpuid"
+ "gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/ring0/pagetables"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/arch/fpu"
+ "io"
+ "reflect"
+)
+
+// Kernel is a global kernel object.
+//
+// This contains global state, shared by multiple CPUs.
+type Kernel struct {
+ // PageTables are the kernel pagetables; this must be provided.
+ PageTables *pagetables.PageTables
+
+ KernelArchState
+}
+
+// Hooks are hooks for kernel functions.
+type Hooks interface {
+ // KernelSyscall is called for kernel system calls.
+ //
+ // Return from this call will restore registers and return to the kernel: the
+ // registers must be modified directly.
+ //
+ // If this function is not provided, a kernel exception results in halt.
+ //
+ // This must be go:nosplit, as this will be on the interrupt stack.
+ // Closures are permitted, as the pointer to the closure frame is not
+ // passed on the stack.
+ KernelSyscall()
+
+ // KernelException handles an exception during kernel execution.
+ //
+ // Return from this call will restore registers and return to the kernel: the
+ // registers must be modified directly.
+ //
+ // If this function is not provided, a kernel exception results in halt.
+ //
+ // This must be go:nosplit, as this will be on the interrupt stack.
+ // Closures are permitted, as the pointer to the closure frame is not
+ // passed on the stack.
+ KernelException(Vector)
+}
+
+// CPU is the per-CPU struct.
+type CPU struct {
+ // self is a self reference.
+ //
+ // This is always guaranteed to be at offset zero.
+ self *CPU
+
+ // kernel is reference to the kernel that this CPU was initialized
+ // with. This reference is kept for garbage collection purposes: CPU
+ // registers may refer to objects within the Kernel object that cannot
+ // be safely freed.
+ kernel *Kernel
+
+ // CPUArchState is architecture-specific state.
+ CPUArchState
+
+ // registers is a set of registers; these may be used on kernel system
+ // calls and exceptions via the Registers function.
+ registers arch.Registers
+
+ // floatingPointState holds floating point state.
+ floatingPointState fpu.State
+
+ // hooks are kernel hooks.
+ hooks Hooks
+}
+
+// Registers returns a modifiable-copy of the kernel registers.
+//
+// This is explicitly safe to call during KernelException and KernelSyscall.
+//
+//go:nosplit
+func (c *CPU) Registers() *arch.Registers {
+ return &c.registers
+}
+
+// FloatingPointState returns the kernel floating point state.
+//
+// This is explicitly safe to call during KernelException and KernelSyscall.
+//
+//go:nosplit
+func (c *CPU) FloatingPointState() *fpu.State {
+ return &c.floatingPointState
+}
+
+// SwitchOpts are passed to the Switch function.
+type SwitchOpts struct {
+ // Registers are the user register state.
+ Registers *arch.Registers
+
+ // FloatingPointState is a byte pointer where floating point state is
+ // saved and restored.
+ FloatingPointState *fpu.State
+
+ // PageTables are the application page tables.
+ PageTables *pagetables.PageTables
+
+ // Flush indicates that a TLB flush should be forced on switch.
+ Flush bool
+
+ // FullRestore indicates that an iret-based restore should be used.
+ FullRestore bool
+
+ // SwitchArchOpts are architecture-specific options.
+ SwitchArchOpts
+}
+
+var (
+ // UserspaceSize is the total size of userspace.
+ UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1)
+
+ // MaximumUserAddress is the largest possible user address.
+ MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(hostarch.PageSize-1)
+
+ // KernelStartAddress is the starting kernel address.
+ KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
+)
+
+// Segment indices and Selectors.
+const (
+ // Index into GDT array.
+ _ = iota // Null descriptor first.
+ _ // Reserved (Linux is kernel 32).
+ segKcode // Kernel code (64-bit).
+ segKdata // Kernel data.
+ segUcode32 // User code (32-bit).
+ segUdata // User data.
+ segUcode64 // User code (64-bit).
+ segTss // Task segment descriptor.
+ segTssHi // Upper bits for TSS.
+ segLast // Last segment (terminal, not included).
+)
+
+// Selectors.
+const (
+ Kcode Selector = segKcode << 3
+ Kdata Selector = segKdata << 3
+ Ucode32 Selector = (segUcode32 << 3) | 3
+ Udata Selector = (segUdata << 3) | 3
+ Ucode64 Selector = (segUcode64 << 3) | 3
+ Tss Selector = segTss << 3
+)
+
+// Standard segments.
+var (
+ UserCodeSegment32 SegmentDescriptor
+ UserDataSegment SegmentDescriptor
+ UserCodeSegment64 SegmentDescriptor
+ KernelCodeSegment SegmentDescriptor
+ KernelDataSegment SegmentDescriptor
+)
+
+// KernelArchState contains architecture-specific state.
+type KernelArchState struct {
+ // cpuEntries is array of kernelEntry for all cpus.
+ cpuEntries []kernelEntry
+
+ // globalIDT is our set of interrupt gates.
+ globalIDT *idt64
+}
+
+// kernelEntry contains minimal CPU-specific arch state
+// that can be mapped at the upper of the address space.
+// Malicious APP might steal info from it via CPU bugs.
+type kernelEntry struct {
+ // stack is the stack used for interrupts on this CPU.
+ stack [256]byte
+
+ // scratch space for temporary usage.
+ scratch0 uint64
+
+ // stackTop is the top of the stack.
+ stackTop uint64
+
+ // cpuSelf is back reference to CPU.
+ cpuSelf *CPU
+
+ // kernelCR3 is the cr3 used for sentry kernel.
+ kernelCR3 uintptr
+
+ // gdt is the CPU's descriptor table.
+ gdt descriptorTable
+
+ // tss is the CPU's task state.
+ tss TaskState64
+}
+
+// CPUArchState contains CPU-specific arch state.
+type CPUArchState struct {
+ // errorCode is the error code from the last exception.
+ errorCode uintptr
+
+ // errorType indicates the type of error code here, it is always set
+ // along with the errorCode value above.
+ //
+ // It will either by 1, which indicates a user error, or 0 indicating a
+ // kernel error. If the error code below returns false (kernel error),
+ // then it cannot provide relevant information about the last
+ // exception.
+ errorType uintptr
+
+ *kernelEntry
+
+ // Copies of global variables, stored in CPU so that they can be used by
+ // syscall and exception handlers (in the upper address space).
+ hasXSAVE bool
+ hasXSAVEOPT bool
+}
+
+// ErrorCode returns the last error code.
+//
+// The returned boolean indicates whether the error code corresponds to the
+// last user error or not. If it does not, then fault information must be
+// ignored. This is generally the result of a kernel fault while servicing a
+// user fault.
+//
+//go:nosplit
+func (c *CPU) ErrorCode() (value uintptr, user bool) {
+ return c.errorCode, c.errorType != 0
+}
+
+// ClearErrorCode resets the error code.
+//
+//go:nosplit
+func (c *CPU) ClearErrorCode() {
+ c.errorCode = 0
+ c.errorType = 1
+}
+
+// SwitchArchOpts are embedded in SwitchOpts.
+type SwitchArchOpts struct {
+ // UserPCID indicates that the application PCID to be used on switch,
+ // assuming that PCIDs are supported.
+ //
+ // Per pagetables_x86.go, a zero PCID implies a flush.
+ UserPCID uint16
+
+ // KernelPCID indicates that the kernel PCID to be used on return,
+ // assuming that PCIDs are supported.
+ //
+ // Per pagetables_x86.go, a zero PCID implies a flush.
+ KernelPCID uint16
+}
+
+func init() {
+ KernelCodeSegment.setCode64(0, 0, 0)
+ KernelDataSegment.setData(0, 0xffffffff, 0)
+ UserCodeSegment32.setCode64(0, 0, 3)
+ UserDataSegment.setData(0, 0xffffffff, 3)
+ UserCodeSegment64.setCode64(0, 0, 3)
+}
+
+// Emit prints architecture-specific offsets.
+func Emit(w io.Writer) {
+ fmt.Fprintf(w, "// Automatically generated, do not edit.\n")
+
+ c := &CPU{}
+ fmt.Fprintf(w, "\n// CPU offsets.\n")
+ fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_ENTRY 0x%02x\n", reflect.ValueOf(&c.kernelEntry).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_HAS_XSAVE 0x%02x\n", reflect.ValueOf(&c.hasXSAVE).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_HAS_XSAVEOPT 0x%02x\n", reflect.ValueOf(&c.hasXSAVEOPT).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_FPU_STATE 0x%02x\n", reflect.ValueOf(&c.floatingPointState).Pointer()-reflect.ValueOf(c).Pointer())
+
+ e := &kernelEntry{}
+ fmt.Fprintf(w, "\n// CPU entry offsets.\n")
+ fmt.Fprintf(w, "#define ENTRY_SCRATCH0 0x%02x\n", reflect.ValueOf(&e.scratch0).Pointer()-reflect.ValueOf(e).Pointer())
+ fmt.Fprintf(w, "#define ENTRY_STACK_TOP 0x%02x\n", reflect.ValueOf(&e.stackTop).Pointer()-reflect.ValueOf(e).Pointer())
+ fmt.Fprintf(w, "#define ENTRY_CPU_SELF 0x%02x\n", reflect.ValueOf(&e.cpuSelf).Pointer()-reflect.ValueOf(e).Pointer())
+ fmt.Fprintf(w, "#define ENTRY_KERNEL_CR3 0x%02x\n", reflect.ValueOf(&e.kernelCR3).Pointer()-reflect.ValueOf(e).Pointer())
+
+ fmt.Fprintf(w, "\n// Bits.\n")
+ fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF)
+ fmt.Fprintf(w, "#define _RFLAGS_IOPL0 0x%02x\n", _RFLAGS_IOPL0)
+ fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet)
+
+ fmt.Fprintf(w, "\n// Vectors.\n")
+ fmt.Fprintf(w, "#define DivideByZero 0x%02x\n", DivideByZero)
+ fmt.Fprintf(w, "#define Debug 0x%02x\n", Debug)
+ fmt.Fprintf(w, "#define NMI 0x%02x\n", NMI)
+ fmt.Fprintf(w, "#define Breakpoint 0x%02x\n", Breakpoint)
+ fmt.Fprintf(w, "#define Overflow 0x%02x\n", Overflow)
+ fmt.Fprintf(w, "#define BoundRangeExceeded 0x%02x\n", BoundRangeExceeded)
+ fmt.Fprintf(w, "#define InvalidOpcode 0x%02x\n", InvalidOpcode)
+ fmt.Fprintf(w, "#define DeviceNotAvailable 0x%02x\n", DeviceNotAvailable)
+ fmt.Fprintf(w, "#define DoubleFault 0x%02x\n", DoubleFault)
+ fmt.Fprintf(w, "#define CoprocessorSegmentOverrun 0x%02x\n", CoprocessorSegmentOverrun)
+ fmt.Fprintf(w, "#define InvalidTSS 0x%02x\n", InvalidTSS)
+ fmt.Fprintf(w, "#define SegmentNotPresent 0x%02x\n", SegmentNotPresent)
+ fmt.Fprintf(w, "#define StackSegmentFault 0x%02x\n", StackSegmentFault)
+ fmt.Fprintf(w, "#define GeneralProtectionFault 0x%02x\n", GeneralProtectionFault)
+ fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault)
+ fmt.Fprintf(w, "#define X87FloatingPointException 0x%02x\n", X87FloatingPointException)
+ fmt.Fprintf(w, "#define AlignmentCheck 0x%02x\n", AlignmentCheck)
+ fmt.Fprintf(w, "#define MachineCheck 0x%02x\n", MachineCheck)
+ fmt.Fprintf(w, "#define SIMDFloatingPointException 0x%02x\n", SIMDFloatingPointException)
+ fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException)
+ fmt.Fprintf(w, "#define SecurityException 0x%02x\n", SecurityException)
+ fmt.Fprintf(w, "#define SyscallInt80 0x%02x\n", SyscallInt80)
+ fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall)
+
+ p := &arch.Registers{}
+ fmt.Fprintf(w, "\n// Ptrace registers.\n")
+ fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.R13).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.R12).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RBP 0x%02x\n", reflect.ValueOf(&p.Rbp).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RBX 0x%02x\n", reflect.ValueOf(&p.Rbx).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.R11).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.R10).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.R9).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.R8).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RAX 0x%02x\n", reflect.ValueOf(&p.Rax).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RCX 0x%02x\n", reflect.ValueOf(&p.Rcx).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RDX 0x%02x\n", reflect.ValueOf(&p.Rdx).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RSI 0x%02x\n", reflect.ValueOf(&p.Rsi).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RDI 0x%02x\n", reflect.ValueOf(&p.Rdi).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_ORIGRAX 0x%02x\n", reflect.ValueOf(&p.Orig_rax).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RIP 0x%02x\n", reflect.ValueOf(&p.Rip).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_CS 0x%02x\n", reflect.ValueOf(&p.Cs).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_FS_BASE 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_GS_BASE 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer())
+}
+
+// Useful bits.
+const (
+ _CR0_PE = 1 << 0
+ _CR0_ET = 1 << 4
+ _CR0_NE = 1 << 5
+ _CR0_AM = 1 << 18
+ _CR0_PG = 1 << 31
+
+ _CR4_PSE = 1 << 4
+ _CR4_PAE = 1 << 5
+ _CR4_PGE = 1 << 7
+ _CR4_OSFXSR = 1 << 9
+ _CR4_OSXMMEXCPT = 1 << 10
+ _CR4_FSGSBASE = 1 << 16
+ _CR4_PCIDE = 1 << 17
+ _CR4_OSXSAVE = 1 << 18
+ _CR4_SMEP = 1 << 20
+
+ _RFLAGS_AC = 1 << 18
+ _RFLAGS_NT = 1 << 14
+ _RFLAGS_IOPL0 = 1 << 12
+ _RFLAGS_IOPL1 = 1 << 13
+ _RFLAGS_IOPL = _RFLAGS_IOPL0 | _RFLAGS_IOPL1
+ _RFLAGS_DF = 1 << 10
+ _RFLAGS_IF = 1 << 9
+ _RFLAGS_STEP = 1 << 8
+ _RFLAGS_RESERVED = 1 << 1
+
+ _EFER_SCE = 0x001
+ _EFER_LME = 0x100
+ _EFER_LMA = 0x400
+ _EFER_NX = 0x800
+
+ _MSR_STAR = 0xc0000081
+ _MSR_LSTAR = 0xc0000082
+ _MSR_CSTAR = 0xc0000083
+ _MSR_SYSCALL_MASK = 0xc0000084
+ _MSR_PLATFORM_INFO = 0xce
+ _MSR_MISC_FEATURES = 0x140
+
+ _PLATFORM_INFO_CPUID_FAULT = 1 << 31
+
+ _MISC_FEATURE_CPUID_TRAP = 0x1
+)
+
+const (
+ // KernelFlagsSet should always be set in the kernel.
+ KernelFlagsSet = _RFLAGS_RESERVED
+
+ // UserFlagsSet are always set in userspace.
+ //
+ // _RFLAGS_IOPL is a set of two bits and it shows the I/O privilege
+ // level. The Current Privilege Level (CPL) of the task must be less
+ // than or equal to the IOPL in order for the task or program to access
+ // I/O ports.
+ //
+ // Here, _RFLAGS_IOPL0 is used only to determine whether the task is
+ // running in the kernel or userspace mode. In the user mode, the CPL is
+ // always 3 and it doesn't matter what IOPL is set if it is bellow CPL.
+ //
+ // We need to have one bit which will be always different in user and
+ // kernel modes. And we have to remember that even though we have
+ // KernelFlagsClear, we still can see some of these flags in the kernel
+ // mode. This can happen when the goruntime switches on a goroutine
+ // which has been saved in the host mode. On restore, the popf
+ // instruction is used to restore flags and this means that all flags
+ // what the goroutine has in the host mode will be restored in the
+ // kernel mode.
+ //
+ // _RFLAGS_IOPL0 is never set in host and kernel modes and we always set
+ // it in the user mode. So if this flag is set, the task is running in
+ // the user mode and if it isn't set, the task is running in the kernel
+ // mode.
+ UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF | _RFLAGS_IOPL0
+
+ // KernelFlagsClear should always be clear in the kernel.
+ KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT
+
+ // UserFlagsClear are always cleared in userspace.
+ UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL1
+)
+
+// IsKernelFlags returns true if rflags coresponds to the kernel mode.
+//
+// go:nosplit
+func IsKernelFlags(rflags uint64) bool {
+ return rflags&_RFLAGS_IOPL0 == 0
+}
+
+// Vector is an exception vector.
+type Vector uintptr
+
+// Exception vectors.
+const (
+ DivideByZero Vector = iota
+ Debug
+ NMI
+ Breakpoint
+ Overflow
+ BoundRangeExceeded
+ InvalidOpcode
+ DeviceNotAvailable
+ DoubleFault
+ CoprocessorSegmentOverrun
+ InvalidTSS
+ SegmentNotPresent
+ StackSegmentFault
+ GeneralProtectionFault
+ PageFault
+ _
+ X87FloatingPointException
+ AlignmentCheck
+ MachineCheck
+ SIMDFloatingPointException
+ VirtualizationException
+ SecurityException = 0x1e
+ SyscallInt80 = 0x80
+ _NR_INTERRUPTS = 0x100
+)
+
+// System call vectors.
+const (
+ Syscall Vector = _NR_INTERRUPTS
+)
+
+// VirtualAddressBits returns the number bits available for virtual addresses.
+//
+// Note that sign-extension semantics apply to the highest order bit.
+//
+// FIXME(b/69382326): This should use the cpuid passed to Init.
+func VirtualAddressBits() uint32 {
+ ax, _, _, _ := cpuid.HostID(0x80000008, 0)
+ return (ax >> 8) & 0xff
+}
+
+// PhysicalAddressBits returns the number of bits available for physical addresses.
+//
+// FIXME(b/69382326): This should use the cpuid passed to Init.
+func PhysicalAddressBits() uint32 {
+ ax, _, _, _ := cpuid.HostID(0x80000008, 0)
+ return ax & 0xff
+}
+
+// Selector is a segment Selector.
+type Selector uint16
+
+// SegmentDescriptor is a segment descriptor.
+type SegmentDescriptor struct {
+ bits [2]uint32
+}
+
+// descriptorTable is a collection of descriptors.
+type descriptorTable [32]SegmentDescriptor
+
+// SegmentDescriptorFlags are typed flags within a descriptor.
+type SegmentDescriptorFlags uint32
+
+// SegmentDescriptorFlag declarations.
+const (
+ SegmentDescriptorAccess SegmentDescriptorFlags = 1 << 8 // Access bit (always set).
+ SegmentDescriptorWrite = 1 << 9 // Write permission.
+ SegmentDescriptorExpandDown = 1 << 10 // Grows down, not used.
+ SegmentDescriptorExecute = 1 << 11 // Execute permission.
+ SegmentDescriptorSystem = 1 << 12 // Zero => system, 1 => user code/data.
+ SegmentDescriptorPresent = 1 << 15 // Present.
+ SegmentDescriptorAVL = 1 << 20 // Available.
+ SegmentDescriptorLong = 1 << 21 // Long mode.
+ SegmentDescriptorDB = 1 << 22 // 16 or 32-bit.
+ SegmentDescriptorG = 1 << 23 // Granularity: page or byte.
+)
+
+// Base returns the descriptor's base linear address.
+func (d *SegmentDescriptor) Base() uint32 {
+ return d.bits[1]&0xFF000000 | (d.bits[1]&0x000000FF)<<16 | d.bits[0]>>16
+}
+
+// Limit returns the descriptor size.
+func (d *SegmentDescriptor) Limit() uint32 {
+ l := d.bits[0]&0xFFFF | d.bits[1]&0xF0000
+ if d.bits[1]&uint32(SegmentDescriptorG) != 0 {
+ l <<= 12
+ l |= 0xFFF
+ }
+ return l
+}
+
+// Flags returns descriptor flags.
+func (d *SegmentDescriptor) Flags() SegmentDescriptorFlags {
+ return SegmentDescriptorFlags(d.bits[1] & 0x00F09F00)
+}
+
+// DPL returns the descriptor privilege level.
+func (d *SegmentDescriptor) DPL() int {
+ return int((d.bits[1] >> 13) & 3)
+}
+
+func (d *SegmentDescriptor) setNull() {
+ d.bits[0] = 0
+ d.bits[1] = 0
+}
+
+func (d *SegmentDescriptor) set(base, limit uint32, dpl int, flags SegmentDescriptorFlags) {
+ flags |= SegmentDescriptorPresent
+ if limit>>12 != 0 {
+ limit >>= 12
+ flags |= SegmentDescriptorG
+ }
+ d.bits[0] = base<<16 | limit&0xFFFF
+ d.bits[1] = base&0xFF000000 | (base>>16)&0xFF | limit&0x000F0000 | uint32(flags) | uint32(dpl)<<13
+}
+
+func (d *SegmentDescriptor) setCode32(base, limit uint32, dpl int) {
+ d.set(base, limit, dpl,
+ SegmentDescriptorDB|
+ SegmentDescriptorExecute|
+ SegmentDescriptorSystem)
+}
+
+func (d *SegmentDescriptor) setCode64(base, limit uint32, dpl int) {
+ d.set(base, limit, dpl,
+ SegmentDescriptorG|
+ SegmentDescriptorLong|
+ SegmentDescriptorExecute|
+ SegmentDescriptorSystem)
+}
+
+func (d *SegmentDescriptor) setData(base, limit uint32, dpl int) {
+ d.set(base, limit, dpl,
+ SegmentDescriptorWrite|
+ SegmentDescriptorSystem)
+}
+
+// setHi is only used for the TSS segment, which is magically 64-bits.
+func (d *SegmentDescriptor) setHi(base uint32) {
+ d.bits[0] = base
+ d.bits[1] = 0
+}
+
+// Gate64 is a 64-bit task, trap, or interrupt gate.
+type Gate64 struct {
+ bits [4]uint32
+}
+
+// idt64 is a 64-bit interrupt descriptor table.
+type idt64 [_NR_INTERRUPTS]Gate64
+
+func (g *Gate64) setInterrupt(cs Selector, rip uint64, dpl int, ist int) {
+ g.bits[0] = uint32(cs)<<16 | uint32(rip)&0xFFFF
+ g.bits[1] = uint32(rip)&0xFFFF0000 | SegmentDescriptorPresent | uint32(dpl)<<13 | 14<<8 | uint32(ist)&0x7
+ g.bits[2] = uint32(rip >> 32)
+}
+
+func (g *Gate64) setTrap(cs Selector, rip uint64, dpl int, ist int) {
+ g.setInterrupt(cs, rip, dpl, ist)
+ g.bits[1] |= 1 << 8
+}
+
+// TaskState64 is a 64-bit task state structure.
+type TaskState64 struct {
+ _ uint32
+ rsp0Lo, rsp0Hi uint32
+ rsp1Lo, rsp1Hi uint32
+ rsp2Lo, rsp2Hi uint32
+ _ [2]uint32
+ ist1Lo, ist1Hi uint32
+ ist2Lo, ist2Hi uint32
+ ist3Lo, ist3Hi uint32
+ ist4Lo, ist4Hi uint32
+ ist5Lo, ist5Hi uint32
+ ist6Lo, ist6Hi uint32
+ ist7Lo, ist7Hi uint32
+ _ [2]uint32
+ _ uint16
+ ioPerm uint16
+}
diff --git a/pkg/ring0/offsets_arm64.go b/pkg/ring0/defs_impl_arm64.go
index 60b2c4074..0e73d2ea9 100644
--- a/pkg/ring0/offsets_arm64.go
+++ b/pkg/ring0/defs_impl_arm64.go
@@ -1,30 +1,347 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//go:build arm64
-// +build arm64
+//go:build arm64 && arm64 && arm64
+// +build arm64,arm64,arm64
package ring0
import (
"fmt"
+ "gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/ring0/pagetables"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/arch/fpu"
"io"
"reflect"
+)
- "gvisor.dev/gvisor/pkg/sentry/arch"
+// Useful bits.
+const (
+ _PGD_PGT_BASE = 0x1000
+ _PGD_PGT_SIZE = 0x1000
+ _PUD_PGT_BASE = 0x2000
+ _PUD_PGT_SIZE = 0x1000
+ _PMD_PGT_BASE = 0x3000
+ _PMD_PGT_SIZE = 0x4000
+ _PTE_PGT_BASE = 0x7000
+ _PTE_PGT_SIZE = 0x1000
+)
+
+const (
+ // DAIF bits:debug, sError, IRQ, FIQ.
+ _PSR_D_BIT = 0x00000200
+ _PSR_A_BIT = 0x00000100
+ _PSR_I_BIT = 0x00000080
+ _PSR_F_BIT = 0x00000040
+ _PSR_DAIF_SHIFT = 6
+ _PSR_DAIF_MASK = 0xf << _PSR_DAIF_SHIFT
+
+ // PSR bits.
+ _PSR_MODE_EL0t = 0x00000000
+ _PSR_MODE_EL1t = 0x00000004
+ _PSR_MODE_EL1h = 0x00000005
+ _PSR_MODE_MASK = 0x0000000f
+
+ PsrFlagsClear = _PSR_MODE_MASK | _PSR_DAIF_MASK
+ PsrModeMask = _PSR_MODE_MASK
+
+ // KernelFlagsSet should always be set in the kernel.
+ KernelFlagsSet = _PSR_MODE_EL1h | _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT
+
+ // UserFlagsSet are always set in userspace.
+ UserFlagsSet = _PSR_MODE_EL0t
+)
+
+// Vector is an exception vector.
+type Vector uintptr
+
+// Exception vectors.
+const (
+ El1InvSync = iota
+ El1InvIrq
+ El1InvFiq
+ El1InvError
+
+ El1Sync
+ El1Irq
+ El1Fiq
+ El1Err
+
+ El0Sync
+ El0Irq
+ El0Fiq
+ El0Err
+
+ El0InvSync
+ El0InvIrq
+ El0InvFiq
+ El0InvErr
+
+ El1SyncDa
+ El1SyncIa
+ El1SyncSpPc
+ El1SyncUndef
+ El1SyncDbg
+ El1SyncInv
+
+ El0SyncSVC
+ El0SyncDa
+ El0SyncIa
+ El0SyncFpsimdAcc
+ El0SyncSveAcc
+ El0SyncFpsimdExc
+ El0SyncSys
+ El0SyncSpPc
+ El0SyncUndef
+ El0SyncDbg
+ El0SyncWfx
+ El0SyncInv
+
+ El0ErrNMI
+ El0ErrBounce
+
+ _NR_INTERRUPTS
+)
+
+// System call vectors.
+const (
+ Syscall Vector = El0SyncSVC
+ PageFault Vector = El0SyncDa
+ VirtualizationException Vector = El0ErrBounce
+)
+
+// VirtualAddressBits returns the number bits available for virtual addresses.
+func VirtualAddressBits() uint32 {
+ return 48
+}
+
+// PhysicalAddressBits returns the number of bits available for physical addresses.
+func PhysicalAddressBits() uint32 {
+ return 40
+}
+
+// Kernel is a global kernel object.
+//
+// This contains global state, shared by multiple CPUs.
+type Kernel struct {
+ // PageTables are the kernel pagetables; this must be provided.
+ PageTables *pagetables.PageTables
+
+ KernelArchState
+}
+
+// Hooks are hooks for kernel functions.
+type Hooks interface {
+ // KernelSyscall is called for kernel system calls.
+ //
+ // Return from this call will restore registers and return to the kernel: the
+ // registers must be modified directly.
+ //
+ // If this function is not provided, a kernel exception results in halt.
+ //
+ // This must be go:nosplit, as this will be on the interrupt stack.
+ // Closures are permitted, as the pointer to the closure frame is not
+ // passed on the stack.
+ KernelSyscall()
+
+ // KernelException handles an exception during kernel execution.
+ //
+ // Return from this call will restore registers and return to the kernel: the
+ // registers must be modified directly.
+ //
+ // If this function is not provided, a kernel exception results in halt.
+ //
+ // This must be go:nosplit, as this will be on the interrupt stack.
+ // Closures are permitted, as the pointer to the closure frame is not
+ // passed on the stack.
+ KernelException(Vector)
+}
+
+// CPU is the per-CPU struct.
+type CPU struct {
+ // self is a self reference.
+ //
+ // This is always guaranteed to be at offset zero.
+ self *CPU
+
+ // kernel is reference to the kernel that this CPU was initialized
+ // with. This reference is kept for garbage collection purposes: CPU
+ // registers may refer to objects within the Kernel object that cannot
+ // be safely freed.
+ kernel *Kernel
+
+ // CPUArchState is architecture-specific state.
+ CPUArchState
+
+ // registers is a set of registers; these may be used on kernel system
+ // calls and exceptions via the Registers function.
+ registers arch.Registers
+
+ // floatingPointState holds floating point state.
+ floatingPointState fpu.State
+
+ // hooks are kernel hooks.
+ hooks Hooks
+}
+
+// Registers returns a modifiable-copy of the kernel registers.
+//
+// This is explicitly safe to call during KernelException and KernelSyscall.
+//
+//go:nosplit
+func (c *CPU) Registers() *arch.Registers {
+ return &c.registers
+}
+
+// FloatingPointState returns the kernel floating point state.
+//
+// This is explicitly safe to call during KernelException and KernelSyscall.
+//
+//go:nosplit
+func (c *CPU) FloatingPointState() *fpu.State {
+ return &c.floatingPointState
+}
+
+// SwitchOpts are passed to the Switch function.
+type SwitchOpts struct {
+ // Registers are the user register state.
+ Registers *arch.Registers
+
+ // FloatingPointState is a byte pointer where floating point state is
+ // saved and restored.
+ FloatingPointState *fpu.State
+
+ // PageTables are the application page tables.
+ PageTables *pagetables.PageTables
+
+ // Flush indicates that a TLB flush should be forced on switch.
+ Flush bool
+
+ // FullRestore indicates that an iret-based restore should be used.
+ FullRestore bool
+
+ // SwitchArchOpts are architecture-specific options.
+ SwitchArchOpts
+}
+
+var (
+ // UserspaceSize is the total size of userspace.
+ UserspaceSize = uintptr(1) << (VirtualAddressBits())
+
+ // MaximumUserAddress is the largest possible user address.
+ MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(hostarch.PageSize-1)
+
+ // KernelStartAddress is the starting kernel address.
+ KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
)
+// KernelArchState contains architecture-specific state.
+type KernelArchState struct {
+}
+
+// CPUArchState contains CPU-specific arch state.
+type CPUArchState struct {
+ // stack is the stack used for interrupts on this CPU.
+ stack [128]byte
+
+ // errorCode is the error code from the last exception.
+ errorCode uintptr
+
+ // errorType indicates the type of error code here, it is always set
+ // along with the errorCode value above.
+ //
+ // It will either by 1, which indicates a user error, or 0 indicating a
+ // kernel error. If the error code below returns false (kernel error),
+ // then it cannot provide relevant information about the last
+ // exception.
+ errorType uintptr
+
+ // faultAddr is the value of far_el1.
+ faultAddr uintptr
+
+ // el0Fp is the address of application's fpstate.
+ el0Fp uintptr
+
+ // ttbr0Kvm is the value of ttbr0_el1 for sentry.
+ ttbr0Kvm uintptr
+
+ // ttbr0App is the value of ttbr0_el1 for applicaton.
+ ttbr0App uintptr
+
+ // exception vector.
+ vecCode Vector
+
+ // application context pointer.
+ appAddr uintptr
+
+ // lazyVFP is the value of cpacr_el1.
+ lazyVFP uintptr
+
+ // appASID is the asid value of guest application.
+ appASID uintptr
+}
+
+// ErrorCode returns the last error code.
+//
+// The returned boolean indicates whether the error code corresponds to the
+// last user error or not. If it does not, then fault information must be
+// ignored. This is generally the result of a kernel fault while servicing a
+// user fault.
+//
+//go:nosplit
+func (c *CPU) ErrorCode() (value uintptr, user bool) {
+ return c.errorCode, c.errorType != 0
+}
+
+// ClearErrorCode resets the error code.
+//
+//go:nosplit
+func (c *CPU) ClearErrorCode() {
+ c.errorCode = 0
+ c.errorType = 1
+}
+
+//go:nosplit
+func (c *CPU) GetFaultAddr() (value uintptr) {
+ return c.faultAddr
+}
+
+//go:nosplit
+func (c *CPU) SetTtbr0Kvm(value uintptr) {
+ c.ttbr0Kvm = value
+}
+
+//go:nosplit
+func (c *CPU) SetTtbr0App(value uintptr) {
+ c.ttbr0App = value
+}
+
+//go:nosplit
+func (c *CPU) GetVector() (value Vector) {
+ return c.vecCode
+}
+
+//go:nosplit
+func (c *CPU) SetAppAddr(value uintptr) {
+ c.appAddr = value
+}
+
+// GetLazyVFP returns the value of cpacr_el1.
+//go:nosplit
+func (c *CPU) GetLazyVFP() (value uintptr) {
+ return c.lazyVFP
+}
+
+// SwitchArchOpts are embedded in SwitchOpts.
+type SwitchArchOpts struct {
+ // UserASID indicates that the application ASID to be used on switch,
+ UserASID uint16
+
+ // KernelASID indicates that the kernel ASID to be used on return,
+ KernelASID uint16
+}
+
+func init() {
+}
+
// Emit prints architecture-specific offsets.
func Emit(w io.Writer) {
fmt.Fprintf(w, "// Automatically generated, do not edit.\n")
diff --git a/pkg/ring0/entry_amd64.s b/pkg/ring0/entry_impl_amd64.s
index d2913f190..2bb80d8af 100644
--- a/pkg/ring0/entry_amd64.s
+++ b/pkg/ring0/entry_impl_amd64.s
@@ -1,3 +1,76 @@
+// build +amd64
+
+// Automatically generated, do not edit.
+
+// CPU offsets.
+#define CPU_REGISTERS 0x30
+#define CPU_ERROR_CODE 0x10
+#define CPU_ERROR_TYPE 0x18
+#define CPU_ENTRY 0x20
+#define CPU_HAS_XSAVE 0x28
+#define CPU_HAS_XSAVEOPT 0x29
+#define CPU_FPU_STATE 0x108
+
+// CPU entry offsets.
+#define ENTRY_SCRATCH0 0x100
+#define ENTRY_STACK_TOP 0x108
+#define ENTRY_CPU_SELF 0x110
+#define ENTRY_KERNEL_CR3 0x118
+
+// Bits.
+#define _RFLAGS_IF 0x200
+#define _RFLAGS_IOPL0 0x1000
+#define _KERNEL_FLAGS 0x02
+
+// Vectors.
+#define DivideByZero 0x00
+#define Debug 0x01
+#define NMI 0x02
+#define Breakpoint 0x03
+#define Overflow 0x04
+#define BoundRangeExceeded 0x05
+#define InvalidOpcode 0x06
+#define DeviceNotAvailable 0x07
+#define DoubleFault 0x08
+#define CoprocessorSegmentOverrun 0x09
+#define InvalidTSS 0x0a
+#define SegmentNotPresent 0x0b
+#define StackSegmentFault 0x0c
+#define GeneralProtectionFault 0x0d
+#define PageFault 0x0e
+#define X87FloatingPointException 0x10
+#define AlignmentCheck 0x11
+#define MachineCheck 0x12
+#define SIMDFloatingPointException 0x13
+#define VirtualizationException 0x14
+#define SecurityException 0x1e
+#define SyscallInt80 0x80
+#define Syscall 0x100
+
+// Ptrace registers.
+#define PTRACE_R15 0x00
+#define PTRACE_R14 0x08
+#define PTRACE_R13 0x10
+#define PTRACE_R12 0x18
+#define PTRACE_RBP 0x20
+#define PTRACE_RBX 0x28
+#define PTRACE_R11 0x30
+#define PTRACE_R10 0x38
+#define PTRACE_R9 0x40
+#define PTRACE_R8 0x48
+#define PTRACE_RAX 0x50
+#define PTRACE_RCX 0x58
+#define PTRACE_RDX 0x60
+#define PTRACE_RSI 0x68
+#define PTRACE_RDI 0x70
+#define PTRACE_ORIGRAX 0x78
+#define PTRACE_RIP 0x80
+#define PTRACE_CS 0x88
+#define PTRACE_FLAGS 0x90
+#define PTRACE_RSP 0x98
+#define PTRACE_SS 0xa0
+#define PTRACE_FS_BASE 0xa8
+#define PTRACE_GS_BASE 0xb0
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/pkg/ring0/entry_arm64.s b/pkg/ring0/entry_impl_arm64.s
index f801b8e11..9cc09524a 100644
--- a/pkg/ring0/entry_arm64.s
+++ b/pkg/ring0/entry_impl_arm64.s
@@ -1,3 +1,93 @@
+// build +arm64
+
+// Automatically generated, do not edit.
+
+// CPU offsets.
+#define CPU_SELF 0x00
+#define CPU_REGISTERS 0xe0
+#define CPU_STACK_TOP 0x90
+#define CPU_ERROR_CODE 0x90
+#define CPU_ERROR_TYPE 0x98
+#define CPU_FAULT_ADDR 0xa0
+#define CPU_FPSTATE_EL0 0xa8
+#define CPU_TTBR0_KVM 0xb0
+#define CPU_TTBR0_APP 0xb8
+#define CPU_VECTOR_CODE 0xc0
+#define CPU_APP_ADDR 0xc8
+#define CPU_LAZY_VFP 0xd0
+#define CPU_APP_ASID 0xd8
+
+// Bits.
+#define _KERNEL_FLAGS 0x3c5
+
+// Vectors.
+#define El1Sync 0x04
+#define El1Irq 0x05
+#define El1Fiq 0x06
+#define El1Err 0x07
+#define El0Sync 0x08
+#define El0Irq 0x09
+#define El0Fiq 0x0a
+#define El0Err 0x0b
+#define El1SyncDa 0x10
+#define El1SyncIa 0x11
+#define El1SyncSpPc 0x12
+#define El1SyncUndef 0x13
+#define El1SyncDbg 0x14
+#define El1SyncInv 0x15
+#define El0SyncSVC 0x16
+#define El0SyncDa 0x17
+#define El0SyncIa 0x18
+#define El0SyncFpsimdAcc 0x19
+#define El0SyncSveAcc 0x1a
+#define El0SyncFpsimdExc 0x1b
+#define El0SyncSys 0x1c
+#define El0SyncSpPc 0x1d
+#define El0SyncUndef 0x1e
+#define El0SyncDbg 0x1f
+#define El0SyncWfx 0x20
+#define El0SyncInv 0x21
+#define El0ErrNMI 0x22
+#define PageFault 0x17
+#define Syscall 0x16
+#define VirtualizationException 0x23
+
+// Ptrace registers.
+#define PTRACE_R0 0x00
+#define PTRACE_R1 0x08
+#define PTRACE_R2 0x10
+#define PTRACE_R3 0x18
+#define PTRACE_R4 0x20
+#define PTRACE_R5 0x28
+#define PTRACE_R6 0x30
+#define PTRACE_R7 0x38
+#define PTRACE_R8 0x40
+#define PTRACE_R9 0x48
+#define PTRACE_R10 0x50
+#define PTRACE_R11 0x58
+#define PTRACE_R12 0x60
+#define PTRACE_R13 0x68
+#define PTRACE_R14 0x70
+#define PTRACE_R15 0x78
+#define PTRACE_R16 0x80
+#define PTRACE_R17 0x88
+#define PTRACE_R18 0x90
+#define PTRACE_R19 0x98
+#define PTRACE_R20 0xa0
+#define PTRACE_R21 0xa8
+#define PTRACE_R22 0xb0
+#define PTRACE_R23 0xb8
+#define PTRACE_R24 0xc0
+#define PTRACE_R25 0xc8
+#define PTRACE_R26 0xd0
+#define PTRACE_R27 0xd8
+#define PTRACE_R28 0xe0
+#define PTRACE_R29 0xe8
+#define PTRACE_R30 0xf0
+#define PTRACE_SP 0xf8
+#define PTRACE_PC 0x100
+#define PTRACE_PSTATE 0x108
+#define PTRACE_TLS 0x110
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/pkg/ring0/gen_offsets/BUILD b/pkg/ring0/gen_offsets/BUILD
deleted file mode 100644
index 9ea8f9a4f..000000000
--- a/pkg/ring0/gen_offsets/BUILD
+++ /dev/null
@@ -1,41 +0,0 @@
-load("//tools:defs.bzl", "go_binary")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
-package(licenses = ["notice"])
-
-go_template_instance(
- name = "defs_impl_arm64",
- out = "defs_impl_arm64.go",
- package = "main",
- template = "//pkg/ring0:defs_arm64",
-)
-
-go_template_instance(
- name = "defs_impl_amd64",
- out = "defs_impl_amd64.go",
- package = "main",
- template = "//pkg/ring0:defs_amd64",
-)
-
-go_binary(
- name = "gen_offsets",
- srcs = [
- "defs_impl_amd64.go",
- "defs_impl_arm64.go",
- "main.go",
- ],
- # Use the libc malloc to avoid any extra dependencies. This is required to
- # pass the sentry deps test.
- system_malloc = True,
- visibility = [
- "//pkg/ring0:__pkg__",
- "//pkg/sentry/platform/kvm:__pkg__",
- ],
- deps = [
- "//pkg/cpuid",
- "//pkg/hostarch",
- "//pkg/ring0/pagetables",
- "//pkg/sentry/arch",
- "//pkg/sentry/arch/fpu",
- ],
-)
diff --git a/pkg/ring0/gen_offsets/main.go b/pkg/ring0/gen_offsets/main.go
deleted file mode 100644
index a4927da2f..000000000
--- a/pkg/ring0/gen_offsets/main.go
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Binary gen_offsets is a helper for generating offset headers.
-package main
-
-import (
- "os"
-)
-
-func main() {
- Emit(os.Stdout)
-}
diff --git a/pkg/ring0/offsets_amd64.go b/pkg/ring0/offsets_amd64.go
deleted file mode 100644
index 38fe27c35..000000000
--- a/pkg/ring0/offsets_amd64.go
+++ /dev/null
@@ -1,104 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//go:build amd64
-// +build amd64
-
-package ring0
-
-import (
- "fmt"
- "io"
- "reflect"
-
- "gvisor.dev/gvisor/pkg/sentry/arch"
-)
-
-// Emit prints architecture-specific offsets.
-func Emit(w io.Writer) {
- fmt.Fprintf(w, "// Automatically generated, do not edit.\n")
-
- c := &CPU{}
- fmt.Fprintf(w, "\n// CPU offsets.\n")
- fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer())
- fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer())
- fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer())
- fmt.Fprintf(w, "#define CPU_ENTRY 0x%02x\n", reflect.ValueOf(&c.kernelEntry).Pointer()-reflect.ValueOf(c).Pointer())
- fmt.Fprintf(w, "#define CPU_HAS_XSAVE 0x%02x\n", reflect.ValueOf(&c.hasXSAVE).Pointer()-reflect.ValueOf(c).Pointer())
- fmt.Fprintf(w, "#define CPU_HAS_XSAVEOPT 0x%02x\n", reflect.ValueOf(&c.hasXSAVEOPT).Pointer()-reflect.ValueOf(c).Pointer())
- fmt.Fprintf(w, "#define CPU_FPU_STATE 0x%02x\n", reflect.ValueOf(&c.floatingPointState).Pointer()-reflect.ValueOf(c).Pointer())
-
- e := &kernelEntry{}
- fmt.Fprintf(w, "\n// CPU entry offsets.\n")
- fmt.Fprintf(w, "#define ENTRY_SCRATCH0 0x%02x\n", reflect.ValueOf(&e.scratch0).Pointer()-reflect.ValueOf(e).Pointer())
- fmt.Fprintf(w, "#define ENTRY_STACK_TOP 0x%02x\n", reflect.ValueOf(&e.stackTop).Pointer()-reflect.ValueOf(e).Pointer())
- fmt.Fprintf(w, "#define ENTRY_CPU_SELF 0x%02x\n", reflect.ValueOf(&e.cpuSelf).Pointer()-reflect.ValueOf(e).Pointer())
- fmt.Fprintf(w, "#define ENTRY_KERNEL_CR3 0x%02x\n", reflect.ValueOf(&e.kernelCR3).Pointer()-reflect.ValueOf(e).Pointer())
-
- fmt.Fprintf(w, "\n// Bits.\n")
- fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF)
- fmt.Fprintf(w, "#define _RFLAGS_IOPL0 0x%02x\n", _RFLAGS_IOPL0)
- fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet)
-
- fmt.Fprintf(w, "\n// Vectors.\n")
- fmt.Fprintf(w, "#define DivideByZero 0x%02x\n", DivideByZero)
- fmt.Fprintf(w, "#define Debug 0x%02x\n", Debug)
- fmt.Fprintf(w, "#define NMI 0x%02x\n", NMI)
- fmt.Fprintf(w, "#define Breakpoint 0x%02x\n", Breakpoint)
- fmt.Fprintf(w, "#define Overflow 0x%02x\n", Overflow)
- fmt.Fprintf(w, "#define BoundRangeExceeded 0x%02x\n", BoundRangeExceeded)
- fmt.Fprintf(w, "#define InvalidOpcode 0x%02x\n", InvalidOpcode)
- fmt.Fprintf(w, "#define DeviceNotAvailable 0x%02x\n", DeviceNotAvailable)
- fmt.Fprintf(w, "#define DoubleFault 0x%02x\n", DoubleFault)
- fmt.Fprintf(w, "#define CoprocessorSegmentOverrun 0x%02x\n", CoprocessorSegmentOverrun)
- fmt.Fprintf(w, "#define InvalidTSS 0x%02x\n", InvalidTSS)
- fmt.Fprintf(w, "#define SegmentNotPresent 0x%02x\n", SegmentNotPresent)
- fmt.Fprintf(w, "#define StackSegmentFault 0x%02x\n", StackSegmentFault)
- fmt.Fprintf(w, "#define GeneralProtectionFault 0x%02x\n", GeneralProtectionFault)
- fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault)
- fmt.Fprintf(w, "#define X87FloatingPointException 0x%02x\n", X87FloatingPointException)
- fmt.Fprintf(w, "#define AlignmentCheck 0x%02x\n", AlignmentCheck)
- fmt.Fprintf(w, "#define MachineCheck 0x%02x\n", MachineCheck)
- fmt.Fprintf(w, "#define SIMDFloatingPointException 0x%02x\n", SIMDFloatingPointException)
- fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException)
- fmt.Fprintf(w, "#define SecurityException 0x%02x\n", SecurityException)
- fmt.Fprintf(w, "#define SyscallInt80 0x%02x\n", SyscallInt80)
- fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall)
-
- p := &arch.Registers{}
- fmt.Fprintf(w, "\n// Ptrace registers.\n")
- fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.R13).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.R12).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_RBP 0x%02x\n", reflect.ValueOf(&p.Rbp).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_RBX 0x%02x\n", reflect.ValueOf(&p.Rbx).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.R11).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.R10).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.R9).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.R8).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_RAX 0x%02x\n", reflect.ValueOf(&p.Rax).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_RCX 0x%02x\n", reflect.ValueOf(&p.Rcx).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_RDX 0x%02x\n", reflect.ValueOf(&p.Rdx).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_RSI 0x%02x\n", reflect.ValueOf(&p.Rsi).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_RDI 0x%02x\n", reflect.ValueOf(&p.Rdi).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_ORIGRAX 0x%02x\n", reflect.ValueOf(&p.Orig_rax).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_RIP 0x%02x\n", reflect.ValueOf(&p.Rip).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_CS 0x%02x\n", reflect.ValueOf(&p.Cs).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_FS_BASE 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_GS_BASE 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer())
-}
diff --git a/pkg/ring0/pagetables/BUILD b/pkg/ring0/pagetables/BUILD
deleted file mode 100644
index f855f4d42..000000000
--- a/pkg/ring0/pagetables/BUILD
+++ /dev/null
@@ -1,88 +0,0 @@
-load("//tools:defs.bzl", "go_library", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
-
-package(licenses = ["notice"])
-
-[
- # These files are tagged with relevant build architectures. We can always
- # build all the input files, which will be included only in the relevant
- # architecture builds.
- go_template(
- name = "generic_walker_%s" % arch,
- srcs = [
- "walker_generic.go",
- "walker_%s.go" % arch,
- ],
- opt_types = [
- "Visitor",
- ],
- visibility = [":__pkg__"],
- )
- for arch in ("amd64", "arm64")
-]
-
-[
- # See above.
- go_template_instance(
- name = "walker_%s_%s" % (op, arch),
- out = "walker_%s_%s.go" % (op, arch),
- package = "pagetables",
- prefix = op,
- template = ":generic_walker_%s" % arch,
- types = {
- "Visitor": "%sVisitor" % op,
- },
- )
- for op in ("map", "unmap", "lookup", "empty", "check")
- for arch in ("amd64", "arm64")
-]
-
-go_library(
- name = "pagetables",
- srcs = [
- "allocator.go",
- "allocator_unsafe.go",
- "pagetables.go",
- "pagetables_aarch64.go",
- "pagetables_amd64.go",
- "pagetables_arm64.go",
- "pagetables_x86.go",
- "pcids.go",
- "pcids_aarch64.go",
- "pcids_aarch64.s",
- "pcids_x86.go",
- "walker_amd64.go",
- "walker_arm64.go",
- "walker_generic.go",
- ":walker_empty_amd64",
- ":walker_empty_arm64",
- ":walker_lookup_amd64",
- ":walker_lookup_arm64",
- ":walker_map_amd64",
- ":walker_map_arm64",
- ":walker_unmap_amd64",
- ":walker_unmap_arm64",
- ],
- visibility = [
- "//pkg/ring0:__subpackages__",
- "//pkg/sentry/platform/kvm:__subpackages__",
- ],
- deps = [
- "//pkg/hostarch",
- "//pkg/sync",
- ],
-)
-
-go_test(
- name = "pagetables_test",
- size = "small",
- srcs = [
- "pagetables_amd64_test.go",
- "pagetables_arm64_test.go",
- "pagetables_test.go",
- ":walker_check_amd64",
- ":walker_check_arm64",
- ],
- library = ":pagetables",
- deps = ["//pkg/hostarch"],
-)
diff --git a/pkg/ring0/pagetables/pagetables_aarch64_state_autogen.go b/pkg/ring0/pagetables/pagetables_aarch64_state_autogen.go
new file mode 100644
index 000000000..e395f6e5e
--- /dev/null
+++ b/pkg/ring0/pagetables/pagetables_aarch64_state_autogen.go
@@ -0,0 +1,6 @@
+// automatically generated by stateify.
+
+//go:build arm64 && arm64
+// +build arm64,arm64
+
+package pagetables
diff --git a/pkg/ring0/pagetables/pagetables_amd64_state_autogen.go b/pkg/ring0/pagetables/pagetables_amd64_state_autogen.go
new file mode 100644
index 000000000..50f7f3fb1
--- /dev/null
+++ b/pkg/ring0/pagetables/pagetables_amd64_state_autogen.go
@@ -0,0 +1,6 @@
+// automatically generated by stateify.
+
+//go:build amd64
+// +build amd64
+
+package pagetables
diff --git a/pkg/ring0/pagetables/pagetables_amd64_test.go b/pkg/ring0/pagetables/pagetables_amd64_test.go
deleted file mode 100644
index c27b3b10a..000000000
--- a/pkg/ring0/pagetables/pagetables_amd64_test.go
+++ /dev/null
@@ -1,76 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//go:build amd64
-// +build amd64
-
-package pagetables
-
-import (
- "testing"
-
- "gvisor.dev/gvisor/pkg/hostarch"
-)
-
-func Test2MAnd4K(t *testing.T) {
- pt := New(NewRuntimeAllocator())
-
- // Map a small page and a huge page.
- pt.Map(0x400000, pteSize, MapOpts{AccessType: hostarch.ReadWrite}, pteSize*42)
- pt.Map(0x00007f0000000000, pmdSize, MapOpts{AccessType: hostarch.Read}, pmdSize*47)
-
- checkMappings(t, pt, []mapping{
- {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: hostarch.ReadWrite}},
- {0x00007f0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: hostarch.Read}},
- })
-}
-
-func Test1GAnd4K(t *testing.T) {
- pt := New(NewRuntimeAllocator())
-
- // Map a small page and a super page.
- pt.Map(0x400000, pteSize, MapOpts{AccessType: hostarch.ReadWrite}, pteSize*42)
- pt.Map(0x00007f0000000000, pudSize, MapOpts{AccessType: hostarch.Read}, pudSize*47)
-
- checkMappings(t, pt, []mapping{
- {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: hostarch.ReadWrite}},
- {0x00007f0000000000, pudSize, pudSize * 47, MapOpts{AccessType: hostarch.Read}},
- })
-}
-
-func TestSplit1GPage(t *testing.T) {
- pt := New(NewRuntimeAllocator())
-
- // Map a super page and knock out the middle.
- pt.Map(0x00007f0000000000, pudSize, MapOpts{AccessType: hostarch.Read}, pudSize*42)
- pt.Unmap(hostarch.Addr(0x00007f0000000000+pteSize), pudSize-(2*pteSize))
-
- checkMappings(t, pt, []mapping{
- {0x00007f0000000000, pteSize, pudSize * 42, MapOpts{AccessType: hostarch.Read}},
- {0x00007f0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, MapOpts{AccessType: hostarch.Read}},
- })
-}
-
-func TestSplit2MPage(t *testing.T) {
- pt := New(NewRuntimeAllocator())
-
- // Map a huge page and knock out the middle.
- pt.Map(0x00007f0000000000, pmdSize, MapOpts{AccessType: hostarch.Read}, pmdSize*42)
- pt.Unmap(hostarch.Addr(0x00007f0000000000+pteSize), pmdSize-(2*pteSize))
-
- checkMappings(t, pt, []mapping{
- {0x00007f0000000000, pteSize, pmdSize * 42, MapOpts{AccessType: hostarch.Read}},
- {0x00007f0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: hostarch.Read}},
- })
-}
diff --git a/pkg/ring0/pagetables/pagetables_arm64_state_autogen.go b/pkg/ring0/pagetables/pagetables_arm64_state_autogen.go
new file mode 100644
index 000000000..83bdb8f3c
--- /dev/null
+++ b/pkg/ring0/pagetables/pagetables_arm64_state_autogen.go
@@ -0,0 +1,6 @@
+// automatically generated by stateify.
+
+//go:build arm64
+// +build arm64
+
+package pagetables
diff --git a/pkg/ring0/pagetables/pagetables_arm64_test.go b/pkg/ring0/pagetables/pagetables_arm64_test.go
deleted file mode 100644
index 1c919ec7d..000000000
--- a/pkg/ring0/pagetables/pagetables_arm64_test.go
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//go:build arm64
-// +build arm64
-
-package pagetables
-
-import (
- "testing"
-
- "gvisor.dev/gvisor/pkg/hostarch"
-)
-
-func Test2MAnd4K(t *testing.T) {
- pt := New(NewRuntimeAllocator())
-
- // Map a small page and a huge page.
- pt.Map(0x400000, pteSize, MapOpts{AccessType: hostarch.ReadWrite, User: true}, pteSize*42)
- pt.Map(0x0000ff0000000000, pmdSize, MapOpts{AccessType: hostarch.Read, User: true}, pmdSize*47)
-
- pt.Map(0xffff000000400000, pteSize, MapOpts{AccessType: hostarch.ReadWrite, User: false}, pteSize*42)
- pt.Map(0xffffff0000000000, pmdSize, MapOpts{AccessType: hostarch.Read, User: false}, pmdSize*47)
-
- checkMappings(t, pt, []mapping{
- {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: hostarch.ReadWrite, User: true}},
- {0x0000ff0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: hostarch.Read, User: true}},
- {0xffff000000400000, pteSize, pteSize * 42, MapOpts{AccessType: hostarch.ReadWrite, User: false}},
- {0xffffff0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: hostarch.Read, User: false}},
- })
-}
-
-func Test1GAnd4K(t *testing.T) {
- pt := New(NewRuntimeAllocator())
-
- // Map a small page and a super page.
- pt.Map(0x400000, pteSize, MapOpts{AccessType: hostarch.ReadWrite, User: true}, pteSize*42)
- pt.Map(0x0000ff0000000000, pudSize, MapOpts{AccessType: hostarch.Read, User: true}, pudSize*47)
-
- checkMappings(t, pt, []mapping{
- {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: hostarch.ReadWrite, User: true}},
- {0x0000ff0000000000, pudSize, pudSize * 47, MapOpts{AccessType: hostarch.Read, User: true}},
- })
-}
-
-func TestSplit1GPage(t *testing.T) {
- pt := New(NewRuntimeAllocator())
-
- // Map a super page and knock out the middle.
- pt.Map(0x0000ff0000000000, pudSize, MapOpts{AccessType: hostarch.Read, User: true}, pudSize*42)
- pt.Unmap(hostarch.Addr(0x0000ff0000000000+pteSize), pudSize-(2*pteSize))
-
- checkMappings(t, pt, []mapping{
- {0x0000ff0000000000, pteSize, pudSize * 42, MapOpts{AccessType: hostarch.Read, User: true}},
- {0x0000ff0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, MapOpts{AccessType: hostarch.Read, User: true}},
- })
-}
-
-func TestSplit2MPage(t *testing.T) {
- pt := New(NewRuntimeAllocator())
-
- // Map a huge page and knock out the middle.
- pt.Map(0x0000ff0000000000, pmdSize, MapOpts{AccessType: hostarch.Read, User: true}, pmdSize*42)
- pt.Unmap(hostarch.Addr(0x0000ff0000000000+pteSize), pmdSize-(2*pteSize))
-
- checkMappings(t, pt, []mapping{
- {0x0000ff0000000000, pteSize, pmdSize * 42, MapOpts{AccessType: hostarch.Read, User: true}},
- {0x0000ff0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: hostarch.Read, User: true}},
- })
-}
diff --git a/pkg/ring0/pagetables/pagetables_state_autogen.go b/pkg/ring0/pagetables/pagetables_state_autogen.go
new file mode 100644
index 000000000..4c4540603
--- /dev/null
+++ b/pkg/ring0/pagetables/pagetables_state_autogen.go
@@ -0,0 +1,3 @@
+// automatically generated by stateify.
+
+package pagetables
diff --git a/pkg/ring0/pagetables/pagetables_test.go b/pkg/ring0/pagetables/pagetables_test.go
deleted file mode 100644
index df93dcb6a..000000000
--- a/pkg/ring0/pagetables/pagetables_test.go
+++ /dev/null
@@ -1,156 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pagetables
-
-import (
- "gvisor.dev/gvisor/pkg/hostarch"
- "testing"
-)
-
-type mapping struct {
- start uintptr
- length uintptr
- addr uintptr
- opts MapOpts
-}
-
-type checkVisitor struct {
- expected []mapping // Input.
- current int // Temporary.
- found []mapping // Output.
- failed string // Output.
-}
-
-func (v *checkVisitor) visit(start uintptr, pte *PTE, align uintptr) bool {
- v.found = append(v.found, mapping{
- start: start,
- length: align + 1,
- addr: pte.Address(),
- opts: pte.Opts(),
- })
- if v.failed != "" {
- // Don't keep looking for errors.
- return false
- }
-
- if v.current >= len(v.expected) {
- v.failed = "more mappings than expected"
- } else if v.expected[v.current].start != start {
- v.failed = "start didn't match expected"
- } else if v.expected[v.current].length != (align + 1) {
- v.failed = "end didn't match expected"
- } else if v.expected[v.current].addr != pte.Address() {
- v.failed = "address didn't match expected"
- } else if v.expected[v.current].opts != pte.Opts() {
- v.failed = "opts didn't match"
- }
- v.current++
- return true
-}
-
-func (*checkVisitor) requiresAlloc() bool { return false }
-
-func (*checkVisitor) requiresSplit() bool { return false }
-
-func checkMappings(t *testing.T, pt *PageTables, m []mapping) {
- // Iterate over all the mappings.
- w := checkWalker{
- pageTables: pt,
- visitor: checkVisitor{
- expected: m,
- },
- }
- w.iterateRange(0, ^uintptr(0))
-
- // Were we expected additional mappings?
- if w.visitor.failed == "" && w.visitor.current != len(w.visitor.expected) {
- w.visitor.failed = "insufficient mappings found"
- }
-
- // Emit a meaningful error message on failure.
- if w.visitor.failed != "" {
- t.Errorf("%s; got %#v, wanted %#v", w.visitor.failed, w.visitor.found, w.visitor.expected)
- }
-}
-
-func TestUnmap(t *testing.T) {
- pt := New(NewRuntimeAllocator())
-
- // Map and unmap one entry.
- pt.Map(0x400000, pteSize, MapOpts{AccessType: hostarch.ReadWrite}, pteSize*42)
- pt.Unmap(0x400000, pteSize)
-
- checkMappings(t, pt, nil)
-}
-
-func TestReadOnly(t *testing.T) {
- pt := New(NewRuntimeAllocator())
-
- // Map one entry.
- pt.Map(0x400000, pteSize, MapOpts{AccessType: hostarch.Read}, pteSize*42)
-
- checkMappings(t, pt, []mapping{
- {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: hostarch.Read}},
- })
-}
-
-func TestReadWrite(t *testing.T) {
- pt := New(NewRuntimeAllocator())
-
- // Map one entry.
- pt.Map(0x400000, pteSize, MapOpts{AccessType: hostarch.ReadWrite}, pteSize*42)
-
- checkMappings(t, pt, []mapping{
- {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: hostarch.ReadWrite}},
- })
-}
-
-func TestSerialEntries(t *testing.T) {
- pt := New(NewRuntimeAllocator())
-
- // Map two sequential entries.
- pt.Map(0x400000, pteSize, MapOpts{AccessType: hostarch.ReadWrite}, pteSize*42)
- pt.Map(0x401000, pteSize, MapOpts{AccessType: hostarch.ReadWrite}, pteSize*47)
-
- checkMappings(t, pt, []mapping{
- {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: hostarch.ReadWrite}},
- {0x401000, pteSize, pteSize * 47, MapOpts{AccessType: hostarch.ReadWrite}},
- })
-}
-
-func TestSpanningEntries(t *testing.T) {
- pt := New(NewRuntimeAllocator())
-
- // Span a pgd with two pages.
- pt.Map(0x00007efffffff000, 2*pteSize, MapOpts{AccessType: hostarch.Read}, pteSize*42)
-
- checkMappings(t, pt, []mapping{
- {0x00007efffffff000, pteSize, pteSize * 42, MapOpts{AccessType: hostarch.Read}},
- {0x00007f0000000000, pteSize, pteSize * 43, MapOpts{AccessType: hostarch.Read}},
- })
-}
-
-func TestSparseEntries(t *testing.T) {
- pt := New(NewRuntimeAllocator())
-
- // Map two entries in different pgds.
- pt.Map(0x400000, pteSize, MapOpts{AccessType: hostarch.ReadWrite}, pteSize*42)
- pt.Map(0x00007f0000000000, pteSize, MapOpts{AccessType: hostarch.Read}, pteSize*47)
-
- checkMappings(t, pt, []mapping{
- {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: hostarch.ReadWrite}},
- {0x00007f0000000000, pteSize, pteSize * 47, MapOpts{AccessType: hostarch.Read}},
- })
-}
diff --git a/pkg/ring0/pagetables/pagetables_unsafe_state_autogen.go b/pkg/ring0/pagetables/pagetables_unsafe_state_autogen.go
new file mode 100644
index 000000000..4c4540603
--- /dev/null
+++ b/pkg/ring0/pagetables/pagetables_unsafe_state_autogen.go
@@ -0,0 +1,3 @@
+// automatically generated by stateify.
+
+package pagetables
diff --git a/pkg/ring0/pagetables/pagetables_x86_state_autogen.go b/pkg/ring0/pagetables/pagetables_x86_state_autogen.go
new file mode 100644
index 000000000..36d1fad72
--- /dev/null
+++ b/pkg/ring0/pagetables/pagetables_x86_state_autogen.go
@@ -0,0 +1,7 @@
+// automatically generated by stateify.
+
+//go:build (386 || amd64) && (i386 || amd64)
+// +build 386 amd64
+// +build i386 amd64
+
+package pagetables
diff --git a/pkg/ring0/pagetables/walker_empty_amd64.go b/pkg/ring0/pagetables/walker_empty_amd64.go
new file mode 100644
index 000000000..5a6ac02eb
--- /dev/null
+++ b/pkg/ring0/pagetables/walker_empty_amd64.go
@@ -0,0 +1,266 @@
+//go:build amd64
+// +build amd64
+
+package pagetables
+
+// iterateRangeCanonical walks a canonical range.
+//
+//go:nosplit
+func (w *emptyWalker) iterateRangeCanonical(start, end uintptr) bool {
+ for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ var (
+ pgdEntry = &w.pageTables.root[pgdIndex]
+ pudEntries *PTEs
+ )
+ if !pgdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ start = emptynext(start, pgdSize)
+ continue
+ }
+
+ pudEntries = w.pageTables.Allocator.NewPTEs()
+ pgdEntry.setPageTable(w.pageTables, pudEntries)
+ } else {
+ pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+ }
+
+ clearPUDEntries := uint16(0)
+
+ for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ var (
+ pudEntry = &pudEntries[pudIndex]
+ pmdEntries *PTEs
+ )
+ if !pudEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPUDEntries++
+ start = emptynext(start, pudSize)
+ continue
+ }
+
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSuper()
+ if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) {
+ return false
+ }
+ if pudEntry.Valid() {
+ start = emptynext(start, pudSize)
+ continue
+ }
+ }
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+ } else if pudEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < emptynext(start, pudSize)) {
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pmdEntries[index].SetSuper()
+ pmdEntries[index].Set(
+ pudEntry.Address()+(pmdSize*uintptr(index)),
+ pudEntry.Opts())
+ }
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) {
+ return false
+ }
+
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ start = emptynext(start, pudSize)
+ continue
+ }
+ } else {
+ pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+ }
+
+ clearPMDEntries := uint16(0)
+
+ for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ var (
+ pmdEntry = &pmdEntries[pmdIndex]
+ pteEntries *PTEs
+ )
+ if !pmdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPMDEntries++
+ start = emptynext(start, pmdSize)
+ continue
+ }
+
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSuper()
+ if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) {
+ return false
+ }
+ if pmdEntry.Valid() {
+ start = emptynext(start, pmdSize)
+ continue
+ }
+ }
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+ } else if pmdEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < emptynext(start, pmdSize)) {
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pteEntries[index].Set(
+ pmdEntry.Address()+(pteSize*uintptr(index)),
+ pmdEntry.Opts())
+ }
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) {
+ return false
+ }
+
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ start = emptynext(start, pmdSize)
+ continue
+ }
+ } else {
+ pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+ }
+
+ clearPTEEntries := uint16(0)
+
+ for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ var (
+ pteEntry = &pteEntries[pteIndex]
+ )
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ if !w.visitor.visit(uintptr(start&^(pteSize-1)), pteEntry, pteSize-1) {
+ return false
+ }
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ }
+
+ start += pteSize
+ continue
+ }
+
+ if clearPTEEntries == entriesPerPage {
+ pmdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pteEntries)
+ clearPMDEntries++
+ }
+ }
+
+ if clearPMDEntries == entriesPerPage {
+ pudEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pmdEntries)
+ clearPUDEntries++
+ }
+ }
+
+ if clearPUDEntries == entriesPerPage {
+ pgdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pudEntries)
+ }
+ }
+ return true
+}
+
+// Walker walks page tables.
+type emptyWalker struct {
+ // pageTables are the tables to walk.
+ pageTables *PageTables
+
+ // Visitor is the set of arguments.
+ visitor emptyVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super/sect pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+// Precondition: start must be less than end.
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *emptyWalker) iterateRange(start, end uintptr) {
+ if start%pteSize != 0 {
+ panic("unaligned start")
+ }
+ if end < start {
+ panic("start > end")
+ }
+ if start < lowerTop {
+ if end <= lowerTop {
+ w.iterateRangeCanonical(start, end)
+ } else if end > lowerTop && end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ if !w.iterateRangeCanonical(start, lowerTop) {
+ return
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else if start < upperBottom {
+ if end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else {
+ w.iterateRangeCanonical(start, end)
+ }
+}
+
+// next returns the next address quantized by the given size.
+//
+//go:nosplit
+func emptynext(start uintptr, size uintptr) uintptr {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
diff --git a/pkg/ring0/pagetables/walker_empty_arm64.go b/pkg/ring0/pagetables/walker_empty_arm64.go
new file mode 100644
index 000000000..3ab92f0ec
--- /dev/null
+++ b/pkg/ring0/pagetables/walker_empty_arm64.go
@@ -0,0 +1,276 @@
+//go:build arm64
+// +build arm64
+
+package pagetables
+
+// iterateRangeCanonical walks a canonical range.
+//
+//go:nosplit
+func (w *emptyWalker) iterateRangeCanonical(start, end uintptr) bool {
+ pgdEntryIndex := w.pageTables.root
+ if start >= upperBottom {
+ pgdEntryIndex = w.pageTables.archPageTables.root
+ }
+
+ for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ var (
+ pgdEntry = &pgdEntryIndex[pgdIndex]
+ pudEntries *PTEs
+ )
+ if !pgdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ start = emptynext(start, pgdSize)
+ continue
+ }
+
+ pudEntries = w.pageTables.Allocator.NewPTEs()
+ pgdEntry.setPageTable(w.pageTables, pudEntries)
+ } else {
+ pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+ }
+
+ clearPUDEntries := uint16(0)
+
+ for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ var (
+ pudEntry = &pudEntries[pudIndex]
+ pmdEntries *PTEs
+ )
+ if !pudEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPUDEntries++
+ start = emptynext(start, pudSize)
+ continue
+ }
+
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSect()
+ if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) {
+ return false
+ }
+ if pudEntry.Valid() {
+ start = emptynext(start, pudSize)
+ continue
+ }
+ }
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+ } else if pudEntry.IsSect() {
+
+ if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < emptynext(start, pudSize)) {
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pmdEntries[index].SetSect()
+ pmdEntries[index].Set(
+ pudEntry.Address()+(pmdSize*uintptr(index)),
+ pudEntry.Opts())
+ }
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) {
+ return false
+ }
+
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ start = emptynext(start, pudSize)
+ continue
+ }
+
+ } else {
+ pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+ }
+
+ clearPMDEntries := uint16(0)
+
+ for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ var (
+ pmdEntry = &pmdEntries[pmdIndex]
+ pteEntries *PTEs
+ )
+ if !pmdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPMDEntries++
+ start = emptynext(start, pmdSize)
+ continue
+ }
+
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSect()
+ if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) {
+ return false
+ }
+ if pmdEntry.Valid() {
+ start = emptynext(start, pmdSize)
+ continue
+ }
+ }
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+ } else if pmdEntry.IsSect() {
+
+ if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < emptynext(start, pmdSize)) {
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pteEntries[index].Set(
+ pmdEntry.Address()+(pteSize*uintptr(index)),
+ pmdEntry.Opts())
+ }
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) {
+ return false
+ }
+
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ start = emptynext(start, pmdSize)
+ continue
+ }
+
+ } else {
+ pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+ }
+
+ clearPTEEntries := uint16(0)
+
+ for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ var (
+ pteEntry = &pteEntries[pteIndex]
+ )
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ if !w.visitor.visit(uintptr(start), pteEntry, pteSize-1) {
+ return false
+ }
+ if !pteEntry.Valid() {
+ if w.visitor.requiresAlloc() {
+ panic("PTE not set after iteration with requiresAlloc!")
+ }
+ clearPTEEntries++
+ }
+
+ start += pteSize
+ continue
+ }
+
+ if clearPTEEntries == entriesPerPage {
+ pmdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pteEntries)
+ clearPMDEntries++
+ }
+ }
+
+ if clearPMDEntries == entriesPerPage {
+ pudEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pmdEntries)
+ clearPUDEntries++
+ }
+ }
+
+ if clearPUDEntries == entriesPerPage {
+ pgdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pudEntries)
+ }
+ }
+ return true
+}
+
+// Walker walks page tables.
+type emptyWalker struct {
+ // pageTables are the tables to walk.
+ pageTables *PageTables
+
+ // Visitor is the set of arguments.
+ visitor emptyVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super/sect pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+// Precondition: start must be less than end.
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *emptyWalker) iterateRange(start, end uintptr) {
+ if start%pteSize != 0 {
+ panic("unaligned start")
+ }
+ if end < start {
+ panic("start > end")
+ }
+ if start < lowerTop {
+ if end <= lowerTop {
+ w.iterateRangeCanonical(start, end)
+ } else if end > lowerTop && end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ if !w.iterateRangeCanonical(start, lowerTop) {
+ return
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else if start < upperBottom {
+ if end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else {
+ w.iterateRangeCanonical(start, end)
+ }
+}
+
+// next returns the next address quantized by the given size.
+//
+//go:nosplit
+func emptynext(start uintptr, size uintptr) uintptr {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
diff --git a/pkg/ring0/pagetables/walker_lookup_amd64.go b/pkg/ring0/pagetables/walker_lookup_amd64.go
new file mode 100644
index 000000000..ad0cb8bb4
--- /dev/null
+++ b/pkg/ring0/pagetables/walker_lookup_amd64.go
@@ -0,0 +1,266 @@
+//go:build amd64
+// +build amd64
+
+package pagetables
+
+// iterateRangeCanonical walks a canonical range.
+//
+//go:nosplit
+func (w *lookupWalker) iterateRangeCanonical(start, end uintptr) bool {
+ for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ var (
+ pgdEntry = &w.pageTables.root[pgdIndex]
+ pudEntries *PTEs
+ )
+ if !pgdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ start = lookupnext(start, pgdSize)
+ continue
+ }
+
+ pudEntries = w.pageTables.Allocator.NewPTEs()
+ pgdEntry.setPageTable(w.pageTables, pudEntries)
+ } else {
+ pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+ }
+
+ clearPUDEntries := uint16(0)
+
+ for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ var (
+ pudEntry = &pudEntries[pudIndex]
+ pmdEntries *PTEs
+ )
+ if !pudEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPUDEntries++
+ start = lookupnext(start, pudSize)
+ continue
+ }
+
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSuper()
+ if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) {
+ return false
+ }
+ if pudEntry.Valid() {
+ start = lookupnext(start, pudSize)
+ continue
+ }
+ }
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+ } else if pudEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < lookupnext(start, pudSize)) {
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pmdEntries[index].SetSuper()
+ pmdEntries[index].Set(
+ pudEntry.Address()+(pmdSize*uintptr(index)),
+ pudEntry.Opts())
+ }
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) {
+ return false
+ }
+
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ start = lookupnext(start, pudSize)
+ continue
+ }
+ } else {
+ pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+ }
+
+ clearPMDEntries := uint16(0)
+
+ for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ var (
+ pmdEntry = &pmdEntries[pmdIndex]
+ pteEntries *PTEs
+ )
+ if !pmdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPMDEntries++
+ start = lookupnext(start, pmdSize)
+ continue
+ }
+
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSuper()
+ if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) {
+ return false
+ }
+ if pmdEntry.Valid() {
+ start = lookupnext(start, pmdSize)
+ continue
+ }
+ }
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+ } else if pmdEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < lookupnext(start, pmdSize)) {
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pteEntries[index].Set(
+ pmdEntry.Address()+(pteSize*uintptr(index)),
+ pmdEntry.Opts())
+ }
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) {
+ return false
+ }
+
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ start = lookupnext(start, pmdSize)
+ continue
+ }
+ } else {
+ pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+ }
+
+ clearPTEEntries := uint16(0)
+
+ for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ var (
+ pteEntry = &pteEntries[pteIndex]
+ )
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ if !w.visitor.visit(uintptr(start&^(pteSize-1)), pteEntry, pteSize-1) {
+ return false
+ }
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ }
+
+ start += pteSize
+ continue
+ }
+
+ if clearPTEEntries == entriesPerPage {
+ pmdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pteEntries)
+ clearPMDEntries++
+ }
+ }
+
+ if clearPMDEntries == entriesPerPage {
+ pudEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pmdEntries)
+ clearPUDEntries++
+ }
+ }
+
+ if clearPUDEntries == entriesPerPage {
+ pgdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pudEntries)
+ }
+ }
+ return true
+}
+
+// Walker walks page tables.
+type lookupWalker struct {
+ // pageTables are the tables to walk.
+ pageTables *PageTables
+
+ // Visitor is the set of arguments.
+ visitor lookupVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super/sect pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+// Precondition: start must be less than end.
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *lookupWalker) iterateRange(start, end uintptr) {
+ if start%pteSize != 0 {
+ panic("unaligned start")
+ }
+ if end < start {
+ panic("start > end")
+ }
+ if start < lowerTop {
+ if end <= lowerTop {
+ w.iterateRangeCanonical(start, end)
+ } else if end > lowerTop && end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ if !w.iterateRangeCanonical(start, lowerTop) {
+ return
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else if start < upperBottom {
+ if end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else {
+ w.iterateRangeCanonical(start, end)
+ }
+}
+
+// next returns the next address quantized by the given size.
+//
+//go:nosplit
+func lookupnext(start uintptr, size uintptr) uintptr {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
diff --git a/pkg/ring0/pagetables/walker_lookup_arm64.go b/pkg/ring0/pagetables/walker_lookup_arm64.go
new file mode 100644
index 000000000..2b989b3f1
--- /dev/null
+++ b/pkg/ring0/pagetables/walker_lookup_arm64.go
@@ -0,0 +1,276 @@
+//go:build arm64
+// +build arm64
+
+package pagetables
+
+// iterateRangeCanonical walks a canonical range.
+//
+//go:nosplit
+func (w *lookupWalker) iterateRangeCanonical(start, end uintptr) bool {
+ pgdEntryIndex := w.pageTables.root
+ if start >= upperBottom {
+ pgdEntryIndex = w.pageTables.archPageTables.root
+ }
+
+ for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ var (
+ pgdEntry = &pgdEntryIndex[pgdIndex]
+ pudEntries *PTEs
+ )
+ if !pgdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ start = lookupnext(start, pgdSize)
+ continue
+ }
+
+ pudEntries = w.pageTables.Allocator.NewPTEs()
+ pgdEntry.setPageTable(w.pageTables, pudEntries)
+ } else {
+ pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+ }
+
+ clearPUDEntries := uint16(0)
+
+ for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ var (
+ pudEntry = &pudEntries[pudIndex]
+ pmdEntries *PTEs
+ )
+ if !pudEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPUDEntries++
+ start = lookupnext(start, pudSize)
+ continue
+ }
+
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSect()
+ if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) {
+ return false
+ }
+ if pudEntry.Valid() {
+ start = lookupnext(start, pudSize)
+ continue
+ }
+ }
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+ } else if pudEntry.IsSect() {
+
+ if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < lookupnext(start, pudSize)) {
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pmdEntries[index].SetSect()
+ pmdEntries[index].Set(
+ pudEntry.Address()+(pmdSize*uintptr(index)),
+ pudEntry.Opts())
+ }
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) {
+ return false
+ }
+
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ start = lookupnext(start, pudSize)
+ continue
+ }
+
+ } else {
+ pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+ }
+
+ clearPMDEntries := uint16(0)
+
+ for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ var (
+ pmdEntry = &pmdEntries[pmdIndex]
+ pteEntries *PTEs
+ )
+ if !pmdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPMDEntries++
+ start = lookupnext(start, pmdSize)
+ continue
+ }
+
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSect()
+ if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) {
+ return false
+ }
+ if pmdEntry.Valid() {
+ start = lookupnext(start, pmdSize)
+ continue
+ }
+ }
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+ } else if pmdEntry.IsSect() {
+
+ if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < lookupnext(start, pmdSize)) {
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pteEntries[index].Set(
+ pmdEntry.Address()+(pteSize*uintptr(index)),
+ pmdEntry.Opts())
+ }
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) {
+ return false
+ }
+
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ start = lookupnext(start, pmdSize)
+ continue
+ }
+
+ } else {
+ pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+ }
+
+ clearPTEEntries := uint16(0)
+
+ for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ var (
+ pteEntry = &pteEntries[pteIndex]
+ )
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ if !w.visitor.visit(uintptr(start), pteEntry, pteSize-1) {
+ return false
+ }
+ if !pteEntry.Valid() {
+ if w.visitor.requiresAlloc() {
+ panic("PTE not set after iteration with requiresAlloc!")
+ }
+ clearPTEEntries++
+ }
+
+ start += pteSize
+ continue
+ }
+
+ if clearPTEEntries == entriesPerPage {
+ pmdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pteEntries)
+ clearPMDEntries++
+ }
+ }
+
+ if clearPMDEntries == entriesPerPage {
+ pudEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pmdEntries)
+ clearPUDEntries++
+ }
+ }
+
+ if clearPUDEntries == entriesPerPage {
+ pgdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pudEntries)
+ }
+ }
+ return true
+}
+
+// Walker walks page tables.
+type lookupWalker struct {
+ // pageTables are the tables to walk.
+ pageTables *PageTables
+
+ // Visitor is the set of arguments.
+ visitor lookupVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super/sect pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+// Precondition: start must be less than end.
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *lookupWalker) iterateRange(start, end uintptr) {
+ if start%pteSize != 0 {
+ panic("unaligned start")
+ }
+ if end < start {
+ panic("start > end")
+ }
+ if start < lowerTop {
+ if end <= lowerTop {
+ w.iterateRangeCanonical(start, end)
+ } else if end > lowerTop && end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ if !w.iterateRangeCanonical(start, lowerTop) {
+ return
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else if start < upperBottom {
+ if end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else {
+ w.iterateRangeCanonical(start, end)
+ }
+}
+
+// next returns the next address quantized by the given size.
+//
+//go:nosplit
+func lookupnext(start uintptr, size uintptr) uintptr {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
diff --git a/pkg/ring0/pagetables/walker_map_amd64.go b/pkg/ring0/pagetables/walker_map_amd64.go
new file mode 100644
index 000000000..4e0b959a5
--- /dev/null
+++ b/pkg/ring0/pagetables/walker_map_amd64.go
@@ -0,0 +1,266 @@
+//go:build amd64
+// +build amd64
+
+package pagetables
+
+// iterateRangeCanonical walks a canonical range.
+//
+//go:nosplit
+func (w *mapWalker) iterateRangeCanonical(start, end uintptr) bool {
+ for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ var (
+ pgdEntry = &w.pageTables.root[pgdIndex]
+ pudEntries *PTEs
+ )
+ if !pgdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ start = mapnext(start, pgdSize)
+ continue
+ }
+
+ pudEntries = w.pageTables.Allocator.NewPTEs()
+ pgdEntry.setPageTable(w.pageTables, pudEntries)
+ } else {
+ pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+ }
+
+ clearPUDEntries := uint16(0)
+
+ for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ var (
+ pudEntry = &pudEntries[pudIndex]
+ pmdEntries *PTEs
+ )
+ if !pudEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPUDEntries++
+ start = mapnext(start, pudSize)
+ continue
+ }
+
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSuper()
+ if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) {
+ return false
+ }
+ if pudEntry.Valid() {
+ start = mapnext(start, pudSize)
+ continue
+ }
+ }
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+ } else if pudEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < mapnext(start, pudSize)) {
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pmdEntries[index].SetSuper()
+ pmdEntries[index].Set(
+ pudEntry.Address()+(pmdSize*uintptr(index)),
+ pudEntry.Opts())
+ }
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) {
+ return false
+ }
+
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ start = mapnext(start, pudSize)
+ continue
+ }
+ } else {
+ pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+ }
+
+ clearPMDEntries := uint16(0)
+
+ for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ var (
+ pmdEntry = &pmdEntries[pmdIndex]
+ pteEntries *PTEs
+ )
+ if !pmdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPMDEntries++
+ start = mapnext(start, pmdSize)
+ continue
+ }
+
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSuper()
+ if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) {
+ return false
+ }
+ if pmdEntry.Valid() {
+ start = mapnext(start, pmdSize)
+ continue
+ }
+ }
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+ } else if pmdEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < mapnext(start, pmdSize)) {
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pteEntries[index].Set(
+ pmdEntry.Address()+(pteSize*uintptr(index)),
+ pmdEntry.Opts())
+ }
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) {
+ return false
+ }
+
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ start = mapnext(start, pmdSize)
+ continue
+ }
+ } else {
+ pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+ }
+
+ clearPTEEntries := uint16(0)
+
+ for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ var (
+ pteEntry = &pteEntries[pteIndex]
+ )
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ if !w.visitor.visit(uintptr(start&^(pteSize-1)), pteEntry, pteSize-1) {
+ return false
+ }
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ }
+
+ start += pteSize
+ continue
+ }
+
+ if clearPTEEntries == entriesPerPage {
+ pmdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pteEntries)
+ clearPMDEntries++
+ }
+ }
+
+ if clearPMDEntries == entriesPerPage {
+ pudEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pmdEntries)
+ clearPUDEntries++
+ }
+ }
+
+ if clearPUDEntries == entriesPerPage {
+ pgdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pudEntries)
+ }
+ }
+ return true
+}
+
+// Walker walks page tables.
+type mapWalker struct {
+ // pageTables are the tables to walk.
+ pageTables *PageTables
+
+ // Visitor is the set of arguments.
+ visitor mapVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super/sect pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+// Precondition: start must be less than end.
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *mapWalker) iterateRange(start, end uintptr) {
+ if start%pteSize != 0 {
+ panic("unaligned start")
+ }
+ if end < start {
+ panic("start > end")
+ }
+ if start < lowerTop {
+ if end <= lowerTop {
+ w.iterateRangeCanonical(start, end)
+ } else if end > lowerTop && end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ if !w.iterateRangeCanonical(start, lowerTop) {
+ return
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else if start < upperBottom {
+ if end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else {
+ w.iterateRangeCanonical(start, end)
+ }
+}
+
+// next returns the next address quantized by the given size.
+//
+//go:nosplit
+func mapnext(start uintptr, size uintptr) uintptr {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
diff --git a/pkg/ring0/pagetables/walker_map_arm64.go b/pkg/ring0/pagetables/walker_map_arm64.go
new file mode 100644
index 000000000..6d70b7091
--- /dev/null
+++ b/pkg/ring0/pagetables/walker_map_arm64.go
@@ -0,0 +1,276 @@
+//go:build arm64
+// +build arm64
+
+package pagetables
+
+// iterateRangeCanonical walks a canonical range.
+//
+//go:nosplit
+func (w *mapWalker) iterateRangeCanonical(start, end uintptr) bool {
+ pgdEntryIndex := w.pageTables.root
+ if start >= upperBottom {
+ pgdEntryIndex = w.pageTables.archPageTables.root
+ }
+
+ for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ var (
+ pgdEntry = &pgdEntryIndex[pgdIndex]
+ pudEntries *PTEs
+ )
+ if !pgdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ start = mapnext(start, pgdSize)
+ continue
+ }
+
+ pudEntries = w.pageTables.Allocator.NewPTEs()
+ pgdEntry.setPageTable(w.pageTables, pudEntries)
+ } else {
+ pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+ }
+
+ clearPUDEntries := uint16(0)
+
+ for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ var (
+ pudEntry = &pudEntries[pudIndex]
+ pmdEntries *PTEs
+ )
+ if !pudEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPUDEntries++
+ start = mapnext(start, pudSize)
+ continue
+ }
+
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSect()
+ if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) {
+ return false
+ }
+ if pudEntry.Valid() {
+ start = mapnext(start, pudSize)
+ continue
+ }
+ }
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+ } else if pudEntry.IsSect() {
+
+ if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < mapnext(start, pudSize)) {
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pmdEntries[index].SetSect()
+ pmdEntries[index].Set(
+ pudEntry.Address()+(pmdSize*uintptr(index)),
+ pudEntry.Opts())
+ }
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) {
+ return false
+ }
+
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ start = mapnext(start, pudSize)
+ continue
+ }
+
+ } else {
+ pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+ }
+
+ clearPMDEntries := uint16(0)
+
+ for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ var (
+ pmdEntry = &pmdEntries[pmdIndex]
+ pteEntries *PTEs
+ )
+ if !pmdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPMDEntries++
+ start = mapnext(start, pmdSize)
+ continue
+ }
+
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSect()
+ if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) {
+ return false
+ }
+ if pmdEntry.Valid() {
+ start = mapnext(start, pmdSize)
+ continue
+ }
+ }
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+ } else if pmdEntry.IsSect() {
+
+ if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < mapnext(start, pmdSize)) {
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pteEntries[index].Set(
+ pmdEntry.Address()+(pteSize*uintptr(index)),
+ pmdEntry.Opts())
+ }
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) {
+ return false
+ }
+
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ start = mapnext(start, pmdSize)
+ continue
+ }
+
+ } else {
+ pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+ }
+
+ clearPTEEntries := uint16(0)
+
+ for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ var (
+ pteEntry = &pteEntries[pteIndex]
+ )
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ if !w.visitor.visit(uintptr(start), pteEntry, pteSize-1) {
+ return false
+ }
+ if !pteEntry.Valid() {
+ if w.visitor.requiresAlloc() {
+ panic("PTE not set after iteration with requiresAlloc!")
+ }
+ clearPTEEntries++
+ }
+
+ start += pteSize
+ continue
+ }
+
+ if clearPTEEntries == entriesPerPage {
+ pmdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pteEntries)
+ clearPMDEntries++
+ }
+ }
+
+ if clearPMDEntries == entriesPerPage {
+ pudEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pmdEntries)
+ clearPUDEntries++
+ }
+ }
+
+ if clearPUDEntries == entriesPerPage {
+ pgdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pudEntries)
+ }
+ }
+ return true
+}
+
+// Walker walks page tables.
+type mapWalker struct {
+ // pageTables are the tables to walk.
+ pageTables *PageTables
+
+ // Visitor is the set of arguments.
+ visitor mapVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super/sect pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+// Precondition: start must be less than end.
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *mapWalker) iterateRange(start, end uintptr) {
+ if start%pteSize != 0 {
+ panic("unaligned start")
+ }
+ if end < start {
+ panic("start > end")
+ }
+ if start < lowerTop {
+ if end <= lowerTop {
+ w.iterateRangeCanonical(start, end)
+ } else if end > lowerTop && end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ if !w.iterateRangeCanonical(start, lowerTop) {
+ return
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else if start < upperBottom {
+ if end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else {
+ w.iterateRangeCanonical(start, end)
+ }
+}
+
+// next returns the next address quantized by the given size.
+//
+//go:nosplit
+func mapnext(start uintptr, size uintptr) uintptr {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
diff --git a/pkg/ring0/pagetables/walker_unmap_amd64.go b/pkg/ring0/pagetables/walker_unmap_amd64.go
new file mode 100644
index 000000000..25d3a1710
--- /dev/null
+++ b/pkg/ring0/pagetables/walker_unmap_amd64.go
@@ -0,0 +1,266 @@
+//go:build amd64
+// +build amd64
+
+package pagetables
+
+// iterateRangeCanonical walks a canonical range.
+//
+//go:nosplit
+func (w *unmapWalker) iterateRangeCanonical(start, end uintptr) bool {
+ for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ var (
+ pgdEntry = &w.pageTables.root[pgdIndex]
+ pudEntries *PTEs
+ )
+ if !pgdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ start = unmapnext(start, pgdSize)
+ continue
+ }
+
+ pudEntries = w.pageTables.Allocator.NewPTEs()
+ pgdEntry.setPageTable(w.pageTables, pudEntries)
+ } else {
+ pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+ }
+
+ clearPUDEntries := uint16(0)
+
+ for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ var (
+ pudEntry = &pudEntries[pudIndex]
+ pmdEntries *PTEs
+ )
+ if !pudEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPUDEntries++
+ start = unmapnext(start, pudSize)
+ continue
+ }
+
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSuper()
+ if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) {
+ return false
+ }
+ if pudEntry.Valid() {
+ start = unmapnext(start, pudSize)
+ continue
+ }
+ }
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+ } else if pudEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < unmapnext(start, pudSize)) {
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pmdEntries[index].SetSuper()
+ pmdEntries[index].Set(
+ pudEntry.Address()+(pmdSize*uintptr(index)),
+ pudEntry.Opts())
+ }
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start&^(pudSize-1)), pudEntry, pudSize-1) {
+ return false
+ }
+
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ start = unmapnext(start, pudSize)
+ continue
+ }
+ } else {
+ pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+ }
+
+ clearPMDEntries := uint16(0)
+
+ for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ var (
+ pmdEntry = &pmdEntries[pmdIndex]
+ pteEntries *PTEs
+ )
+ if !pmdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPMDEntries++
+ start = unmapnext(start, pmdSize)
+ continue
+ }
+
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSuper()
+ if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) {
+ return false
+ }
+ if pmdEntry.Valid() {
+ start = unmapnext(start, pmdSize)
+ continue
+ }
+ }
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+ } else if pmdEntry.IsSuper() {
+
+ if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < unmapnext(start, pmdSize)) {
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pteEntries[index].Set(
+ pmdEntry.Address()+(pteSize*uintptr(index)),
+ pmdEntry.Opts())
+ }
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start&^(pmdSize-1)), pmdEntry, pmdSize-1) {
+ return false
+ }
+
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ start = unmapnext(start, pmdSize)
+ continue
+ }
+ } else {
+ pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+ }
+
+ clearPTEEntries := uint16(0)
+
+ for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ var (
+ pteEntry = &pteEntries[pteIndex]
+ )
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ if !w.visitor.visit(uintptr(start&^(pteSize-1)), pteEntry, pteSize-1) {
+ return false
+ }
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ }
+
+ start += pteSize
+ continue
+ }
+
+ if clearPTEEntries == entriesPerPage {
+ pmdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pteEntries)
+ clearPMDEntries++
+ }
+ }
+
+ if clearPMDEntries == entriesPerPage {
+ pudEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pmdEntries)
+ clearPUDEntries++
+ }
+ }
+
+ if clearPUDEntries == entriesPerPage {
+ pgdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pudEntries)
+ }
+ }
+ return true
+}
+
+// Walker walks page tables.
+type unmapWalker struct {
+ // pageTables are the tables to walk.
+ pageTables *PageTables
+
+ // Visitor is the set of arguments.
+ visitor unmapVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super/sect pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+// Precondition: start must be less than end.
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *unmapWalker) iterateRange(start, end uintptr) {
+ if start%pteSize != 0 {
+ panic("unaligned start")
+ }
+ if end < start {
+ panic("start > end")
+ }
+ if start < lowerTop {
+ if end <= lowerTop {
+ w.iterateRangeCanonical(start, end)
+ } else if end > lowerTop && end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ if !w.iterateRangeCanonical(start, lowerTop) {
+ return
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else if start < upperBottom {
+ if end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else {
+ w.iterateRangeCanonical(start, end)
+ }
+}
+
+// next returns the next address quantized by the given size.
+//
+//go:nosplit
+func unmapnext(start uintptr, size uintptr) uintptr {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
diff --git a/pkg/ring0/pagetables/walker_unmap_arm64.go b/pkg/ring0/pagetables/walker_unmap_arm64.go
new file mode 100644
index 000000000..7ad732e39
--- /dev/null
+++ b/pkg/ring0/pagetables/walker_unmap_arm64.go
@@ -0,0 +1,276 @@
+//go:build arm64
+// +build arm64
+
+package pagetables
+
+// iterateRangeCanonical walks a canonical range.
+//
+//go:nosplit
+func (w *unmapWalker) iterateRangeCanonical(start, end uintptr) bool {
+ pgdEntryIndex := w.pageTables.root
+ if start >= upperBottom {
+ pgdEntryIndex = w.pageTables.archPageTables.root
+ }
+
+ for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ var (
+ pgdEntry = &pgdEntryIndex[pgdIndex]
+ pudEntries *PTEs
+ )
+ if !pgdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ start = unmapnext(start, pgdSize)
+ continue
+ }
+
+ pudEntries = w.pageTables.Allocator.NewPTEs()
+ pgdEntry.setPageTable(w.pageTables, pudEntries)
+ } else {
+ pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+ }
+
+ clearPUDEntries := uint16(0)
+
+ for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ var (
+ pudEntry = &pudEntries[pudIndex]
+ pmdEntries *PTEs
+ )
+ if !pudEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPUDEntries++
+ start = unmapnext(start, pudSize)
+ continue
+ }
+
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSect()
+ if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) {
+ return false
+ }
+ if pudEntry.Valid() {
+ start = unmapnext(start, pudSize)
+ continue
+ }
+ }
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+ } else if pudEntry.IsSect() {
+
+ if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < unmapnext(start, pudSize)) {
+
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pmdEntries[index].SetSect()
+ pmdEntries[index].Set(
+ pudEntry.Address()+(pmdSize*uintptr(index)),
+ pudEntry.Opts())
+ }
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start), pudEntry, pudSize-1) {
+ return false
+ }
+
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ start = unmapnext(start, pudSize)
+ continue
+ }
+
+ } else {
+ pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+ }
+
+ clearPMDEntries := uint16(0)
+
+ for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ var (
+ pmdEntry = &pmdEntries[pmdIndex]
+ pteEntries *PTEs
+ )
+ if !pmdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+
+ clearPMDEntries++
+ start = unmapnext(start, pmdSize)
+ continue
+ }
+
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSect()
+ if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) {
+ return false
+ }
+ if pmdEntry.Valid() {
+ start = unmapnext(start, pmdSize)
+ continue
+ }
+ }
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+ } else if pmdEntry.IsSect() {
+
+ if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < unmapnext(start, pmdSize)) {
+
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pteEntries[index].Set(
+ pmdEntry.Address()+(pteSize*uintptr(index)),
+ pmdEntry.Opts())
+ }
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+ } else {
+
+ if !w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) {
+ return false
+ }
+
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ start = unmapnext(start, pmdSize)
+ continue
+ }
+
+ } else {
+ pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+ }
+
+ clearPTEEntries := uint16(0)
+
+ for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ var (
+ pteEntry = &pteEntries[pteIndex]
+ )
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ if !w.visitor.visit(uintptr(start), pteEntry, pteSize-1) {
+ return false
+ }
+ if !pteEntry.Valid() {
+ if w.visitor.requiresAlloc() {
+ panic("PTE not set after iteration with requiresAlloc!")
+ }
+ clearPTEEntries++
+ }
+
+ start += pteSize
+ continue
+ }
+
+ if clearPTEEntries == entriesPerPage {
+ pmdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pteEntries)
+ clearPMDEntries++
+ }
+ }
+
+ if clearPMDEntries == entriesPerPage {
+ pudEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pmdEntries)
+ clearPUDEntries++
+ }
+ }
+
+ if clearPUDEntries == entriesPerPage {
+ pgdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pudEntries)
+ }
+ }
+ return true
+}
+
+// Walker walks page tables.
+type unmapWalker struct {
+ // pageTables are the tables to walk.
+ pageTables *PageTables
+
+ // Visitor is the set of arguments.
+ visitor unmapVisitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super/sect pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if alloc is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+// Precondition: start must be less than end.
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *unmapWalker) iterateRange(start, end uintptr) {
+ if start%pteSize != 0 {
+ panic("unaligned start")
+ }
+ if end < start {
+ panic("start > end")
+ }
+ if start < lowerTop {
+ if end <= lowerTop {
+ w.iterateRangeCanonical(start, end)
+ } else if end > lowerTop && end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ if !w.iterateRangeCanonical(start, lowerTop) {
+ return
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else if start < upperBottom {
+ if end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else {
+ w.iterateRangeCanonical(start, end)
+ }
+}
+
+// next returns the next address quantized by the given size.
+//
+//go:nosplit
+func unmapnext(start uintptr, size uintptr) uintptr {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
diff --git a/pkg/ring0/ring0_amd64_state_autogen.go b/pkg/ring0/ring0_amd64_state_autogen.go
new file mode 100644
index 000000000..2bd773254
--- /dev/null
+++ b/pkg/ring0/ring0_amd64_state_autogen.go
@@ -0,0 +1,6 @@
+// automatically generated by stateify.
+
+//go:build amd64 && amd64 && amd64
+// +build amd64,amd64,amd64
+
+package ring0
diff --git a/pkg/ring0/ring0_arm64_state_autogen.go b/pkg/ring0/ring0_arm64_state_autogen.go
new file mode 100644
index 000000000..d87b068ec
--- /dev/null
+++ b/pkg/ring0/ring0_arm64_state_autogen.go
@@ -0,0 +1,6 @@
+// automatically generated by stateify.
+
+//go:build arm64 && arm64 && arm64
+// +build arm64,arm64,arm64
+
+package ring0
diff --git a/pkg/ring0/ring0_impl_amd64_state_autogen.go b/pkg/ring0/ring0_impl_amd64_state_autogen.go
new file mode 100644
index 000000000..81a3dd14f
--- /dev/null
+++ b/pkg/ring0/ring0_impl_amd64_state_autogen.go
@@ -0,0 +1,8 @@
+// automatically generated by stateify.
+
+//go:build amd64 && amd64 && (386 || amd64)
+// +build amd64
+// +build amd64
+// +build 386 amd64
+
+package ring0
diff --git a/pkg/ring0/ring0_impl_arm64_state_autogen.go b/pkg/ring0/ring0_impl_arm64_state_autogen.go
new file mode 100644
index 000000000..d87b068ec
--- /dev/null
+++ b/pkg/ring0/ring0_impl_arm64_state_autogen.go
@@ -0,0 +1,6 @@
+// automatically generated by stateify.
+
+//go:build arm64 && arm64 && arm64
+// +build arm64,arm64,arm64
+
+package ring0
diff --git a/pkg/ring0/ring0_state_autogen.go b/pkg/ring0/ring0_state_autogen.go
new file mode 100644
index 000000000..327aba163
--- /dev/null
+++ b/pkg/ring0/ring0_state_autogen.go
@@ -0,0 +1,3 @@
+// automatically generated by stateify.
+
+package ring0
diff --git a/pkg/ring0/ring0_unsafe_state_autogen.go b/pkg/ring0/ring0_unsafe_state_autogen.go
new file mode 100644
index 000000000..327aba163
--- /dev/null
+++ b/pkg/ring0/ring0_unsafe_state_autogen.go
@@ -0,0 +1,3 @@
+// automatically generated by stateify.
+
+package ring0
diff --git a/pkg/ring0/x86.go b/pkg/ring0/x86.go
deleted file mode 100644
index 9a98703da..000000000
--- a/pkg/ring0/x86.go
+++ /dev/null
@@ -1,298 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//go:build 386 || amd64
-// +build 386 amd64
-
-package ring0
-
-import (
- "gvisor.dev/gvisor/pkg/cpuid"
-)
-
-// Useful bits.
-const (
- _CR0_PE = 1 << 0
- _CR0_ET = 1 << 4
- _CR0_NE = 1 << 5
- _CR0_AM = 1 << 18
- _CR0_PG = 1 << 31
-
- _CR4_PSE = 1 << 4
- _CR4_PAE = 1 << 5
- _CR4_PGE = 1 << 7
- _CR4_OSFXSR = 1 << 9
- _CR4_OSXMMEXCPT = 1 << 10
- _CR4_FSGSBASE = 1 << 16
- _CR4_PCIDE = 1 << 17
- _CR4_OSXSAVE = 1 << 18
- _CR4_SMEP = 1 << 20
-
- _RFLAGS_AC = 1 << 18
- _RFLAGS_NT = 1 << 14
- _RFLAGS_IOPL0 = 1 << 12
- _RFLAGS_IOPL1 = 1 << 13
- _RFLAGS_IOPL = _RFLAGS_IOPL0 | _RFLAGS_IOPL1
- _RFLAGS_DF = 1 << 10
- _RFLAGS_IF = 1 << 9
- _RFLAGS_STEP = 1 << 8
- _RFLAGS_RESERVED = 1 << 1
-
- _EFER_SCE = 0x001
- _EFER_LME = 0x100
- _EFER_LMA = 0x400
- _EFER_NX = 0x800
-
- _MSR_STAR = 0xc0000081
- _MSR_LSTAR = 0xc0000082
- _MSR_CSTAR = 0xc0000083
- _MSR_SYSCALL_MASK = 0xc0000084
- _MSR_PLATFORM_INFO = 0xce
- _MSR_MISC_FEATURES = 0x140
-
- _PLATFORM_INFO_CPUID_FAULT = 1 << 31
-
- _MISC_FEATURE_CPUID_TRAP = 0x1
-)
-
-const (
- // KernelFlagsSet should always be set in the kernel.
- KernelFlagsSet = _RFLAGS_RESERVED
-
- // UserFlagsSet are always set in userspace.
- //
- // _RFLAGS_IOPL is a set of two bits and it shows the I/O privilege
- // level. The Current Privilege Level (CPL) of the task must be less
- // than or equal to the IOPL in order for the task or program to access
- // I/O ports.
- //
- // Here, _RFLAGS_IOPL0 is used only to determine whether the task is
- // running in the kernel or userspace mode. In the user mode, the CPL is
- // always 3 and it doesn't matter what IOPL is set if it is bellow CPL.
- //
- // We need to have one bit which will be always different in user and
- // kernel modes. And we have to remember that even though we have
- // KernelFlagsClear, we still can see some of these flags in the kernel
- // mode. This can happen when the goruntime switches on a goroutine
- // which has been saved in the host mode. On restore, the popf
- // instruction is used to restore flags and this means that all flags
- // what the goroutine has in the host mode will be restored in the
- // kernel mode.
- //
- // _RFLAGS_IOPL0 is never set in host and kernel modes and we always set
- // it in the user mode. So if this flag is set, the task is running in
- // the user mode and if it isn't set, the task is running in the kernel
- // mode.
- UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF | _RFLAGS_IOPL0
-
- // KernelFlagsClear should always be clear in the kernel.
- KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT
-
- // UserFlagsClear are always cleared in userspace.
- UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL1
-)
-
-// IsKernelFlags returns true if rflags coresponds to the kernel mode.
-//
-// go:nosplit
-func IsKernelFlags(rflags uint64) bool {
- return rflags&_RFLAGS_IOPL0 == 0
-}
-
-// Vector is an exception vector.
-type Vector uintptr
-
-// Exception vectors.
-const (
- DivideByZero Vector = iota
- Debug
- NMI
- Breakpoint
- Overflow
- BoundRangeExceeded
- InvalidOpcode
- DeviceNotAvailable
- DoubleFault
- CoprocessorSegmentOverrun
- InvalidTSS
- SegmentNotPresent
- StackSegmentFault
- GeneralProtectionFault
- PageFault
- _
- X87FloatingPointException
- AlignmentCheck
- MachineCheck
- SIMDFloatingPointException
- VirtualizationException
- SecurityException = 0x1e
- SyscallInt80 = 0x80
- _NR_INTERRUPTS = 0x100
-)
-
-// System call vectors.
-const (
- Syscall Vector = _NR_INTERRUPTS
-)
-
-// VirtualAddressBits returns the number bits available for virtual addresses.
-//
-// Note that sign-extension semantics apply to the highest order bit.
-//
-// FIXME(b/69382326): This should use the cpuid passed to Init.
-func VirtualAddressBits() uint32 {
- ax, _, _, _ := cpuid.HostID(0x80000008, 0)
- return (ax >> 8) & 0xff
-}
-
-// PhysicalAddressBits returns the number of bits available for physical addresses.
-//
-// FIXME(b/69382326): This should use the cpuid passed to Init.
-func PhysicalAddressBits() uint32 {
- ax, _, _, _ := cpuid.HostID(0x80000008, 0)
- return ax & 0xff
-}
-
-// Selector is a segment Selector.
-type Selector uint16
-
-// SegmentDescriptor is a segment descriptor.
-type SegmentDescriptor struct {
- bits [2]uint32
-}
-
-// descriptorTable is a collection of descriptors.
-type descriptorTable [32]SegmentDescriptor
-
-// SegmentDescriptorFlags are typed flags within a descriptor.
-type SegmentDescriptorFlags uint32
-
-// SegmentDescriptorFlag declarations.
-const (
- SegmentDescriptorAccess SegmentDescriptorFlags = 1 << 8 // Access bit (always set).
- SegmentDescriptorWrite = 1 << 9 // Write permission.
- SegmentDescriptorExpandDown = 1 << 10 // Grows down, not used.
- SegmentDescriptorExecute = 1 << 11 // Execute permission.
- SegmentDescriptorSystem = 1 << 12 // Zero => system, 1 => user code/data.
- SegmentDescriptorPresent = 1 << 15 // Present.
- SegmentDescriptorAVL = 1 << 20 // Available.
- SegmentDescriptorLong = 1 << 21 // Long mode.
- SegmentDescriptorDB = 1 << 22 // 16 or 32-bit.
- SegmentDescriptorG = 1 << 23 // Granularity: page or byte.
-)
-
-// Base returns the descriptor's base linear address.
-func (d *SegmentDescriptor) Base() uint32 {
- return d.bits[1]&0xFF000000 | (d.bits[1]&0x000000FF)<<16 | d.bits[0]>>16
-}
-
-// Limit returns the descriptor size.
-func (d *SegmentDescriptor) Limit() uint32 {
- l := d.bits[0]&0xFFFF | d.bits[1]&0xF0000
- if d.bits[1]&uint32(SegmentDescriptorG) != 0 {
- l <<= 12
- l |= 0xFFF
- }
- return l
-}
-
-// Flags returns descriptor flags.
-func (d *SegmentDescriptor) Flags() SegmentDescriptorFlags {
- return SegmentDescriptorFlags(d.bits[1] & 0x00F09F00)
-}
-
-// DPL returns the descriptor privilege level.
-func (d *SegmentDescriptor) DPL() int {
- return int((d.bits[1] >> 13) & 3)
-}
-
-func (d *SegmentDescriptor) setNull() {
- d.bits[0] = 0
- d.bits[1] = 0
-}
-
-func (d *SegmentDescriptor) set(base, limit uint32, dpl int, flags SegmentDescriptorFlags) {
- flags |= SegmentDescriptorPresent
- if limit>>12 != 0 {
- limit >>= 12
- flags |= SegmentDescriptorG
- }
- d.bits[0] = base<<16 | limit&0xFFFF
- d.bits[1] = base&0xFF000000 | (base>>16)&0xFF | limit&0x000F0000 | uint32(flags) | uint32(dpl)<<13
-}
-
-func (d *SegmentDescriptor) setCode32(base, limit uint32, dpl int) {
- d.set(base, limit, dpl,
- SegmentDescriptorDB|
- SegmentDescriptorExecute|
- SegmentDescriptorSystem)
-}
-
-func (d *SegmentDescriptor) setCode64(base, limit uint32, dpl int) {
- d.set(base, limit, dpl,
- SegmentDescriptorG|
- SegmentDescriptorLong|
- SegmentDescriptorExecute|
- SegmentDescriptorSystem)
-}
-
-func (d *SegmentDescriptor) setData(base, limit uint32, dpl int) {
- d.set(base, limit, dpl,
- SegmentDescriptorWrite|
- SegmentDescriptorSystem)
-}
-
-// setHi is only used for the TSS segment, which is magically 64-bits.
-func (d *SegmentDescriptor) setHi(base uint32) {
- d.bits[0] = base
- d.bits[1] = 0
-}
-
-// Gate64 is a 64-bit task, trap, or interrupt gate.
-type Gate64 struct {
- bits [4]uint32
-}
-
-// idt64 is a 64-bit interrupt descriptor table.
-type idt64 [_NR_INTERRUPTS]Gate64
-
-func (g *Gate64) setInterrupt(cs Selector, rip uint64, dpl int, ist int) {
- g.bits[0] = uint32(cs)<<16 | uint32(rip)&0xFFFF
- g.bits[1] = uint32(rip)&0xFFFF0000 | SegmentDescriptorPresent | uint32(dpl)<<13 | 14<<8 | uint32(ist)&0x7
- g.bits[2] = uint32(rip >> 32)
-}
-
-func (g *Gate64) setTrap(cs Selector, rip uint64, dpl int, ist int) {
- g.setInterrupt(cs, rip, dpl, ist)
- g.bits[1] |= 1 << 8
-}
-
-// TaskState64 is a 64-bit task state structure.
-type TaskState64 struct {
- _ uint32
- rsp0Lo, rsp0Hi uint32
- rsp1Lo, rsp1Hi uint32
- rsp2Lo, rsp2Hi uint32
- _ [2]uint32
- ist1Lo, ist1Hi uint32
- ist2Lo, ist2Hi uint32
- ist3Lo, ist3Hi uint32
- ist4Lo, ist4Hi uint32
- ist5Lo, ist5Hi uint32
- ist6Lo, ist6Hi uint32
- ist7Lo, ist7Hi uint32
- _ [2]uint32
- _ uint16
- ioPerm uint16
-}