diff options
Diffstat (limited to 'pkg/sentry/platform')
65 files changed, 2278 insertions, 3023 deletions
diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD deleted file mode 100644 index db7d55ef2..000000000 --- a/pkg/sentry/platform/BUILD +++ /dev/null @@ -1,22 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "platform", - srcs = [ - "context.go", - "mmap_min_addr.go", - "platform.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/context", - "//pkg/seccomp", - "//pkg/sentry/arch", - "//pkg/sentry/hostmm", - "//pkg/sentry/memmap", - "//pkg/usermem", - ], -) diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD deleted file mode 100644 index 83b385f14..000000000 --- a/pkg/sentry/platform/interrupt/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") - -package(licenses = ["notice"]) - -go_library( - name = "interrupt", - srcs = [ - "interrupt.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = ["//pkg/sync"], -) - -go_test( - name = "interrupt_test", - size = "small", - srcs = ["interrupt_test.go"], - library = ":interrupt", -) diff --git a/pkg/sentry/platform/interrupt/interrupt_state_autogen.go b/pkg/sentry/platform/interrupt/interrupt_state_autogen.go new file mode 100644 index 000000000..1336e5f01 --- /dev/null +++ b/pkg/sentry/platform/interrupt/interrupt_state_autogen.go @@ -0,0 +1,3 @@ +// automatically generated by stateify. + +package interrupt diff --git a/pkg/sentry/platform/interrupt/interrupt_test.go b/pkg/sentry/platform/interrupt/interrupt_test.go deleted file mode 100644 index 0ecdf6e7a..000000000 --- a/pkg/sentry/platform/interrupt/interrupt_test.go +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package interrupt - -import ( - "testing" -) - -type countingReceiver struct { - interrupts int -} - -// NotifyInterrupt implements Receiver.NotifyInterrupt. -func (r *countingReceiver) NotifyInterrupt() { - r.interrupts++ -} - -func TestSingleInterruptBeforeEnable(t *testing.T) { - var ( - f Forwarder - r countingReceiver - ) - f.NotifyInterrupt() - // The interrupt should cause the first Enable to fail. - if f.Enable(&r) { - f.Disable() - t.Fatalf("Enable: got true, wanted false") - } - // The failing Enable "acknowledges" the interrupt, allowing future Enables - // to succeed. - if !f.Enable(&r) { - t.Fatalf("Enable: got false, wanted true") - } - f.Disable() -} - -func TestMultipleInterruptsBeforeEnable(t *testing.T) { - var ( - f Forwarder - r countingReceiver - ) - f.NotifyInterrupt() - f.NotifyInterrupt() - // The interrupts should cause the first Enable to fail. - if f.Enable(&r) { - f.Disable() - t.Fatalf("Enable: got true, wanted false") - } - // Interrupts are deduplicated while the Forwarder is disabled, so the - // failing Enable "acknowledges" all interrupts, allowing future Enables to - // succeed. - if !f.Enable(&r) { - t.Fatalf("Enable: got false, wanted true") - } - f.Disable() -} - -func TestSingleInterruptAfterEnable(t *testing.T) { - var ( - f Forwarder - r countingReceiver - ) - if !f.Enable(&r) { - t.Fatalf("Enable: got false, wanted true") - } - defer f.Disable() - f.NotifyInterrupt() - if r.interrupts != 1 { - t.Errorf("interrupts: got %d, wanted 1", r.interrupts) - } -} - -func TestMultipleInterruptsAfterEnable(t *testing.T) { - var ( - f Forwarder - r countingReceiver - ) - if !f.Enable(&r) { - t.Fatalf("Enable: got false, wanted true") - } - defer f.Disable() - f.NotifyInterrupt() - f.NotifyInterrupt() - if r.interrupts != 2 { - t.Errorf("interrupts: got %d, wanted 2", r.interrupts) - } -} diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD deleted file mode 100644 index 8ce411102..000000000 --- a/pkg/sentry/platform/kvm/BUILD +++ /dev/null @@ -1,94 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") - -package(licenses = ["notice"]) - -go_library( - name = "kvm", - srcs = [ - "address_space.go", - "bluepill.go", - "bluepill_allocator.go", - "bluepill_amd64.go", - "bluepill_amd64_unsafe.go", - "bluepill_arm64.go", - "bluepill_arm64.s", - "bluepill_arm64_unsafe.go", - "bluepill_fault.go", - "bluepill_impl_amd64.s", - "bluepill_unsafe.go", - "context.go", - "filters_amd64.go", - "filters_arm64.go", - "kvm.go", - "kvm_amd64.go", - "kvm_amd64_unsafe.go", - "kvm_arm64.go", - "kvm_arm64_unsafe.go", - "kvm_const.go", - "kvm_const_arm64.go", - "machine.go", - "machine_amd64.go", - "machine_amd64_unsafe.go", - "machine_arm64.go", - "machine_arm64_unsafe.go", - "machine_unsafe.go", - "physical_map.go", - "physical_map_amd64.go", - "physical_map_arm64.go", - "virtual_map.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/atomicbitops", - "//pkg/context", - "//pkg/cpuid", - "//pkg/log", - "//pkg/procid", - "//pkg/safecopy", - "//pkg/seccomp", - "//pkg/sentry/arch", - "//pkg/sentry/memmap", - "//pkg/sentry/platform", - "//pkg/sentry/platform/interrupt", - "//pkg/sentry/platform/ring0", - "//pkg/sentry/platform/ring0/pagetables", - "//pkg/sentry/time", - "//pkg/sync", - "//pkg/usermem", - "@org_golang_x_sys//unix:go_default_library", - ], -) - -go_test( - name = "kvm_test", - srcs = [ - "kvm_amd64_test.go", - "kvm_arm64_test.go", - "kvm_test.go", - "virtual_map_test.go", - ], - library = ":kvm", - tags = [ - "manual", - "nogotsan", - "requires-kvm", - ], - deps = [ - "//pkg/sentry/arch", - "//pkg/sentry/platform", - "//pkg/sentry/platform/kvm/testutil", - "//pkg/sentry/platform/ring0", - "//pkg/sentry/platform/ring0/pagetables", - "//pkg/sentry/time", - "//pkg/usermem", - ], -) - -genrule( - name = "bluepill_impl_amd64", - srcs = ["bluepill_amd64.s"], - outs = ["bluepill_impl_amd64.s"], - cmd = "(echo -e '// build +amd64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@", - tools = ["//pkg/sentry/platform/ring0/gen_offsets"], -) diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_impl_amd64.s index 025ea93b5..2bbf06b18 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.s +++ b/pkg/sentry/platform/kvm/bluepill_impl_amd64.s @@ -1,3 +1,73 @@ +// build +amd64 + +// Automatically generated, do not edit. + +// CPU offsets. +#define CPU_REGISTERS 0x28 +#define CPU_ERROR_CODE 0x10 +#define CPU_ERROR_TYPE 0x18 +#define CPU_ENTRY 0x20 + +// CPU entry offsets. +#define ENTRY_SCRATCH0 0x100 +#define ENTRY_STACK_TOP 0x108 +#define ENTRY_CPU_SELF 0x110 +#define ENTRY_KERNEL_CR3 0x118 + +// Bits. +#define _RFLAGS_IF 0x200 +#define _RFLAGS_IOPL0 0x1000 +#define _KERNEL_FLAGS 0x02 + +// Vectors. +#define DivideByZero 0x00 +#define Debug 0x01 +#define NMI 0x02 +#define Breakpoint 0x03 +#define Overflow 0x04 +#define BoundRangeExceeded 0x05 +#define InvalidOpcode 0x06 +#define DeviceNotAvailable 0x07 +#define DoubleFault 0x08 +#define CoprocessorSegmentOverrun 0x09 +#define InvalidTSS 0x0a +#define SegmentNotPresent 0x0b +#define StackSegmentFault 0x0c +#define GeneralProtectionFault 0x0d +#define PageFault 0x0e +#define X87FloatingPointException 0x10 +#define AlignmentCheck 0x11 +#define MachineCheck 0x12 +#define SIMDFloatingPointException 0x13 +#define VirtualizationException 0x14 +#define SecurityException 0x1e +#define SyscallInt80 0x80 +#define Syscall 0x100 + +// Ptrace registers. +#define PTRACE_R15 0x00 +#define PTRACE_R14 0x08 +#define PTRACE_R13 0x10 +#define PTRACE_R12 0x18 +#define PTRACE_RBP 0x20 +#define PTRACE_RBX 0x28 +#define PTRACE_R11 0x30 +#define PTRACE_R10 0x38 +#define PTRACE_R9 0x40 +#define PTRACE_R8 0x48 +#define PTRACE_RAX 0x50 +#define PTRACE_RCX 0x58 +#define PTRACE_RDX 0x60 +#define PTRACE_RSI 0x68 +#define PTRACE_RDI 0x70 +#define PTRACE_ORIGRAX 0x78 +#define PTRACE_RIP 0x80 +#define PTRACE_CS 0x88 +#define PTRACE_FLAGS 0x90 +#define PTRACE_RSP 0x98 +#define PTRACE_SS 0xa0 +#define PTRACE_FS 0xa8 +#define PTRACE_GS 0xb0 // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/pkg/sentry/platform/kvm/kvm_amd64_state_autogen.go b/pkg/sentry/platform/kvm/kvm_amd64_state_autogen.go new file mode 100644 index 000000000..a69cbee8b --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_amd64_state_autogen.go @@ -0,0 +1,7 @@ +// automatically generated by stateify. + +// +build amd64 +// +build amd64 +// +build amd64 + +package kvm diff --git a/pkg/sentry/platform/kvm/kvm_amd64_test.go b/pkg/sentry/platform/kvm/kvm_amd64_test.go deleted file mode 100644 index c0b4fd374..000000000 --- a/pkg/sentry/platform/kvm/kvm_amd64_test.go +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package kvm - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" -) - -func TestSegments(t *testing.T) { - applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - testutil.SetTestSegments(regs) - for { - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - FullRestore: true, - }, &si); err == platform.ErrContextInterrupt { - continue // Retry. - } else if err != nil { - t.Errorf("application segment check with full restore got unexpected error: %v", err) - } - if err := testutil.CheckTestSegments(regs); err != nil { - t.Errorf("application segment check with full restore failed: %v", err) - } - break // Done. - } - return false - }) -} diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe_state_autogen.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe_state_autogen.go new file mode 100644 index 000000000..a69cbee8b --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe_state_autogen.go @@ -0,0 +1,7 @@ +// automatically generated by stateify. + +// +build amd64 +// +build amd64 +// +build amd64 + +package kvm diff --git a/pkg/sentry/platform/kvm/kvm_arm64_state_autogen.go b/pkg/sentry/platform/kvm/kvm_arm64_state_autogen.go new file mode 100644 index 000000000..90183b764 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_arm64_state_autogen.go @@ -0,0 +1,7 @@ +// automatically generated by stateify. + +// +build arm64 +// +build arm64 +// +build arm64 + +package kvm diff --git a/pkg/sentry/platform/kvm/kvm_arm64_test.go b/pkg/sentry/platform/kvm/kvm_arm64_test.go deleted file mode 100644 index 0e3d84d95..000000000 --- a/pkg/sentry/platform/kvm/kvm_arm64_test.go +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package kvm - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" -) - -func TestKernelTLS(t *testing.T) { - bluepillTest(t, func(c *vCPU) { - if !testutil.TLSWorks() { - t.Errorf("tls does not work, and it should!") - } - }) -} diff --git a/pkg/sentry/platform/kvm/kvm_arm64_unsafe_state_autogen.go b/pkg/sentry/platform/kvm/kvm_arm64_unsafe_state_autogen.go new file mode 100644 index 000000000..90183b764 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_arm64_unsafe_state_autogen.go @@ -0,0 +1,7 @@ +// automatically generated by stateify. + +// +build arm64 +// +build arm64 +// +build arm64 + +package kvm diff --git a/pkg/sentry/platform/kvm/kvm_state_autogen.go b/pkg/sentry/platform/kvm/kvm_state_autogen.go new file mode 100644 index 000000000..8d85b96d0 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_state_autogen.go @@ -0,0 +1,3 @@ +// automatically generated by stateify. + +package kvm diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go deleted file mode 100644 index e58acc071..000000000 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ /dev/null @@ -1,526 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package kvm - -import ( - "math/rand" - "reflect" - "sync/atomic" - "syscall" - "testing" - "time" - - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - ktime "gvisor.dev/gvisor/pkg/sentry/time" - "gvisor.dev/gvisor/pkg/usermem" -) - -var dummyFPState = (*byte)(arch.NewFloatingPointData()) - -type testHarness interface { - Errorf(format string, args ...interface{}) - Fatalf(format string, args ...interface{}) -} - -func kvmTest(t testHarness, setup func(*KVM), fn func(*vCPU) bool) { - // Create the machine. - deviceFile, err := OpenDevice() - if err != nil { - t.Fatalf("error opening device file: %v", err) - } - k, err := New(deviceFile) - if err != nil { - t.Fatalf("error creating KVM instance: %v", err) - } - defer k.machine.Destroy() - - // Call additional setup. - if setup != nil { - setup(k) - } - - var c *vCPU // For recovery. - defer func() { - redpill() - if c != nil { - k.machine.Put(c) - } - }() - for { - c = k.machine.Get() - if !fn(c) { - break - } - - // We put the vCPU here and clear the value so that the - // deferred recovery will not re-put it above. - k.machine.Put(c) - c = nil - } -} - -func bluepillTest(t testHarness, fn func(*vCPU)) { - kvmTest(t, nil, func(c *vCPU) bool { - bluepill(c) - fn(c) - return false - }) -} - -func TestKernelSyscall(t *testing.T) { - bluepillTest(t, func(c *vCPU) { - redpill() // Leave guest mode. - if got := atomic.LoadUint32(&c.state); got != vCPUUser { - t.Errorf("vCPU not in ready state: got %v", got) - } - }) -} - -func hostFault() { - defer func() { - recover() - }() - var foo *int - *foo = 0 -} - -func TestKernelFault(t *testing.T) { - hostFault() // Ensure recovery works. - bluepillTest(t, func(c *vCPU) { - hostFault() - if got := atomic.LoadUint32(&c.state); got != vCPUUser { - t.Errorf("vCPU not in ready state: got %v", got) - } - }) -} - -func TestKernelFloatingPoint(t *testing.T) { - bluepillTest(t, func(c *vCPU) { - if !testutil.FloatingPointWorks() { - t.Errorf("floating point does not work, and it should!") - } - }) -} - -func applicationTest(t testHarness, useHostMappings bool, target func(), fn func(*vCPU, *arch.Registers, *pagetables.PageTables) bool) { - // Initialize registers & page tables. - var ( - regs arch.Registers - pt *pagetables.PageTables - ) - testutil.SetTestTarget(®s, target) - - kvmTest(t, func(k *KVM) { - // Create new page tables. - as, _, err := k.NewAddressSpace(nil /* invalidator */) - if err != nil { - t.Fatalf("can't create new address space: %v", err) - } - pt = as.(*addressSpace).pageTables - - if useHostMappings { - // Apply the physical mappings to these page tables. - // (This is normally dangerous, since they point to - // physical pages that may not exist. This shouldn't be - // done for regular user code, but is fine for test - // purposes.) - applyPhysicalRegions(func(pr physicalRegion) bool { - pt.Map(usermem.Addr(pr.virtual), pr.length, pagetables.MapOpts{ - AccessType: usermem.AnyAccess, - User: true, - }, pr.physical) - return true // Keep iterating. - }) - } - }, func(c *vCPU) bool { - // Invoke the function with the extra data. - return fn(c, ®s, pt) - }) -} - -func TestApplicationSyscall(t *testing.T) { - applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - FullRestore: true, - }, &si); err == platform.ErrContextInterrupt { - return true // Retry. - } else if err != nil { - t.Errorf("application syscall with full restore failed: %v", err) - } - return false - }) - applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - }, &si); err == platform.ErrContextInterrupt { - return true // Retry. - } else if err != nil { - t.Errorf("application syscall with partial restore failed: %v", err) - } - return false - }) -} - -func TestApplicationFault(t *testing.T) { - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - testutil.SetTouchTarget(regs, nil) // Cause fault. - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - FullRestore: true, - }, &si); err == platform.ErrContextInterrupt { - return true // Retry. - } else if err != platform.ErrContextSignal || si.Signo != int32(syscall.SIGSEGV) { - t.Errorf("application fault with full restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal) - } - return false - }) - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - testutil.SetTouchTarget(regs, nil) // Cause fault. - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - }, &si); err == platform.ErrContextInterrupt { - return true // Retry. - } else if err != platform.ErrContextSignal || si.Signo != int32(syscall.SIGSEGV) { - t.Errorf("application fault with partial restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal) - } - return false - }) -} - -func TestRegistersSyscall(t *testing.T) { - applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - testutil.SetTestRegs(regs) // Fill values for all registers. - for { - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - }, &si); err == platform.ErrContextInterrupt { - continue // Retry. - } else if err != nil { - t.Errorf("application register check with partial restore got unexpected error: %v", err) - } - if err := testutil.CheckTestRegs(regs, false); err != nil { - t.Errorf("application register check with partial restore failed: %v", err) - } - break // Done. - } - return false - }) -} - -func TestRegistersFault(t *testing.T) { - applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - testutil.SetTestRegs(regs) // Fill values for all registers. - for { - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - FullRestore: true, - }, &si); err == platform.ErrContextInterrupt { - continue // Retry. - } else if err != platform.ErrContextSignal || si.Signo != int32(syscall.SIGSEGV) { - t.Errorf("application register check with full restore got unexpected error: %v", err) - } - if err := testutil.CheckTestRegs(regs, true); err != nil { - t.Errorf("application register check with full restore failed: %v", err) - } - break // Done. - } - return false - }) -} - -func TestBounce(t *testing.T) { - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - go func() { - time.Sleep(time.Millisecond) - c.BounceToKernel() - }() - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - }, &si); err != platform.ErrContextInterrupt { - t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) - } - return false - }) - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - go func() { - time.Sleep(time.Millisecond) - c.BounceToKernel() - }() - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - FullRestore: true, - }, &si); err != platform.ErrContextInterrupt { - t.Errorf("application full restore: got %v, wanted %v", err, platform.ErrContextInterrupt) - } - return false - }) -} - -func TestBounceStress(t *testing.T) { - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - randomSleep := func() { - // O(hundreds of microseconds) is appropriate to ensure - // different overlaps and different schedules. - if n := rand.Intn(1000); n > 100 { - time.Sleep(time.Duration(n) * time.Microsecond) - } - } - for i := 0; i < 1000; i++ { - // Start an asynchronously executing goroutine that - // calls Bounce at pseudo-random point in time. - // This should wind up calling Bounce when the - // kernel is in various stages of the switch. - go func() { - randomSleep() - c.BounceToKernel() - }() - randomSleep() - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - }, &si); err != platform.ErrContextInterrupt { - t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) - } - c.unlock() - randomSleep() - c.lock() - } - return false - }) -} - -func TestInvalidate(t *testing.T) { - var data uintptr // Used below. - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - testutil.SetTouchTarget(regs, &data) // Read legitimate value. - for { - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - }, &si); err == platform.ErrContextInterrupt { - continue // Retry. - } else if err != nil { - t.Errorf("application partial restore: got %v, wanted nil", err) - } - break // Done. - } - // Unmap the page containing data & invalidate. - pt.Unmap(usermem.Addr(reflect.ValueOf(&data).Pointer() & ^uintptr(usermem.PageSize-1)), usermem.PageSize) - for { - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - Flush: true, - }, &si); err == platform.ErrContextInterrupt { - continue // Retry. - } else if err != platform.ErrContextSignal { - t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextSignal) - } - break // Success. - } - return false - }) -} - -// IsFault returns true iff the given signal represents a fault. -func IsFault(err error, si *arch.SignalInfo) bool { - return err == platform.ErrContextSignal && si.Signo == int32(syscall.SIGSEGV) -} - -func TestEmptyAddressSpace(t *testing.T) { - applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - }, &si); err == platform.ErrContextInterrupt { - return true // Retry. - } else if !IsFault(err, &si) { - t.Errorf("first fault with partial restore failed got %v", err) - t.Logf("registers: %#v", ®s) - } - return false - }) - applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - FullRestore: true, - }, &si); err == platform.ErrContextInterrupt { - return true // Retry. - } else if !IsFault(err, &si) { - t.Errorf("first fault with full restore failed got %v", err) - t.Logf("registers: %#v", ®s) - } - return false - }) -} - -func TestWrongVCPU(t *testing.T) { - kvmTest(t, nil, func(c1 *vCPU) bool { - kvmTest(t, nil, func(c2 *vCPU) bool { - // Basic test, one then the other. - bluepill(c1) - bluepill(c2) - if c2.guestExits == 0 { - // Don't allow the test to proceed if this fails. - t.Fatalf("wrong vCPU#2 exits: vCPU1=%+v,vCPU2=%+v", c1, c2) - } - - // Alternate vCPUs; we expect to need to trigger the - // wrong vCPU path on each switch. - for i := 0; i < 100; i++ { - bluepill(c1) - bluepill(c2) - } - if count := c1.guestExits; count < 90 { - t.Errorf("wrong vCPU#1 exits: vCPU1=%+v,vCPU2=%+v", c1, c2) - } - if count := c2.guestExits; count < 90 { - t.Errorf("wrong vCPU#2 exits: vCPU1=%+v,vCPU2=%+v", c1, c2) - } - return false - }) - return false - }) - kvmTest(t, nil, func(c1 *vCPU) bool { - kvmTest(t, nil, func(c2 *vCPU) bool { - bluepill(c1) - bluepill(c2) - return false - }) - return false - }) -} - -func TestRdtsc(t *testing.T) { - var i int // Iteration count. - kvmTest(t, nil, func(c *vCPU) bool { - start := ktime.Rdtsc() - bluepill(c) - guest := ktime.Rdtsc() - redpill() - end := ktime.Rdtsc() - if start > guest || guest > end { - t.Errorf("inconsistent time: start=%d, guest=%d, end=%d", start, guest, end) - } - i++ - return i < 100 - }) -} - -func BenchmarkApplicationSyscall(b *testing.B) { - var ( - i int // Iteration includes machine.Get() / machine.Put(). - a int // Count for ErrContextInterrupt. - ) - applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - }, &si); err == platform.ErrContextInterrupt { - a++ - return true // Ignore. - } else if err != nil { - b.Fatalf("benchmark failed: %v", err) - } - i++ - return i < b.N - }) - if a != 0 { - b.Logf("ErrContextInterrupt occurred %d times (in %d iterations).", a, a+i) - } -} - -func BenchmarkKernelSyscall(b *testing.B) { - // Note that the target passed here is irrelevant, we never execute SwitchToUser. - applicationTest(b, true, testutil.Getpid, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - // iteration does not include machine.Get() / machine.Put(). - for i := 0; i < b.N; i++ { - testutil.Getpid() - } - return false - }) -} - -func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) { - // see BenchmarkApplicationSyscall. - var ( - i int - a int - ) - applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { - var si arch.SignalInfo - if _, err := c.SwitchToUser(ring0.SwitchOpts{ - Registers: regs, - FloatingPointState: dummyFPState, - PageTables: pt, - }, &si); err == platform.ErrContextInterrupt { - a++ - return true // Ignore. - } else if err != nil { - b.Fatalf("benchmark failed: %v", err) - } - // This will intentionally cause the world switch. By executing - // a host syscall here, we force the transition between guest - // and host mode. - testutil.Getpid() - i++ - return i < b.N - }) - if a != 0 { - b.Logf("ErrContextInterrupt occurred %d times (in %d iterations).", a, a+i) - } -} diff --git a/pkg/sentry/platform/kvm/kvm_unsafe_state_autogen.go b/pkg/sentry/platform/kvm/kvm_unsafe_state_autogen.go new file mode 100644 index 000000000..04d30ac6d --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_unsafe_state_autogen.go @@ -0,0 +1,8 @@ +// automatically generated by stateify. + +// +build go1.12 +// +build !go1.17 +// +build go1.12 +// +build !go1.17 + +package kvm diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD deleted file mode 100644 index f7feb8683..000000000 --- a/pkg/sentry/platform/kvm/testutil/BUILD +++ /dev/null @@ -1,17 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "testutil", - testonly = 1, - srcs = [ - "testutil.go", - "testutil_amd64.go", - "testutil_amd64.s", - "testutil_arm64.go", - "testutil_arm64.s", - ], - visibility = ["//pkg/sentry/platform/kvm:__pkg__"], - deps = ["//pkg/sentry/arch"], -) diff --git a/pkg/sentry/platform/kvm/testutil/testutil.go b/pkg/sentry/platform/kvm/testutil/testutil.go deleted file mode 100644 index 5c1efa0fd..000000000 --- a/pkg/sentry/platform/kvm/testutil/testutil.go +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package testutil provides common assembly stubs for testing. -package testutil - -import ( - "fmt" - "strings" -) - -// Getpid executes a trivial system call. -func Getpid() - -// Touch touches the value in the first register. -func Touch() - -// SyscallLoop executes a syscall and loops. -func SyscallLoop() - -// SpinLoop spins on the CPU. -func SpinLoop() - -// HaltLoop immediately halts and loops. -func HaltLoop() - -// TwiddleRegsFault twiddles registers then faults. -func TwiddleRegsFault() - -// TwiddleRegsSyscall twiddles registers then executes a syscall. -func TwiddleRegsSyscall() - -// FloatingPointWorks is a floating point test. -// -// It returns true or false. -func FloatingPointWorks() bool - -// RegisterMismatchError is used for checking registers. -type RegisterMismatchError []string - -// Error returns a human-readable error. -func (r RegisterMismatchError) Error() string { - return strings.Join([]string(r), ";") -} - -// addRegisterMisatch allows simple chaining of register mismatches. -func addRegisterMismatch(err error, reg string, got, expected interface{}) error { - errStr := fmt.Sprintf("%s got %08x, expected %08x", reg, got, expected) - switch r := err.(type) { - case nil: - // Return a new register mismatch. - return RegisterMismatchError{errStr} - case RegisterMismatchError: - // Append the error. - r = append(r, errStr) - return r - default: - // Leave as is. - return err - } -} diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go deleted file mode 100644 index 8048eedec..000000000 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package testutil - -import ( - "reflect" - - "gvisor.dev/gvisor/pkg/sentry/arch" -) - -// TwiddleSegments reads segments into known registers. -func TwiddleSegments() - -// SetTestTarget sets the rip appropriately. -func SetTestTarget(regs *arch.Registers, fn func()) { - regs.Rip = uint64(reflect.ValueOf(fn).Pointer()) -} - -// SetTouchTarget sets rax appropriately. -func SetTouchTarget(regs *arch.Registers, target *uintptr) { - if target != nil { - regs.Rax = uint64(reflect.ValueOf(target).Pointer()) - } else { - regs.Rax = 0 - } -} - -// RewindSyscall rewinds a syscall RIP. -func RewindSyscall(regs *arch.Registers) { - regs.Rip -= 2 -} - -// SetTestRegs initializes registers to known values. -func SetTestRegs(regs *arch.Registers) { - regs.R15 = 0x15 - regs.R14 = 0x14 - regs.R13 = 0x13 - regs.R12 = 0x12 - regs.Rbp = 0xb9 - regs.Rbx = 0xb4 - regs.R11 = 0x11 - regs.R10 = 0x10 - regs.R9 = 0x09 - regs.R8 = 0x08 - regs.Rax = 0x44 - regs.Rcx = 0xc4 - regs.Rdx = 0xd4 - regs.Rsi = 0x51 - regs.Rdi = 0xd1 - regs.Rsp = 0x59 -} - -// CheckTestRegs checks that registers were twiddled per TwiddleRegs. -func CheckTestRegs(regs *arch.Registers, full bool) (err error) { - if need := ^uint64(0x15); regs.R15 != need { - err = addRegisterMismatch(err, "R15", regs.R15, need) - } - if need := ^uint64(0x14); regs.R14 != need { - err = addRegisterMismatch(err, "R14", regs.R14, need) - } - if need := ^uint64(0x13); regs.R13 != need { - err = addRegisterMismatch(err, "R13", regs.R13, need) - } - if need := ^uint64(0x12); regs.R12 != need { - err = addRegisterMismatch(err, "R12", regs.R12, need) - } - if need := ^uint64(0xb9); regs.Rbp != need { - err = addRegisterMismatch(err, "Rbp", regs.Rbp, need) - } - if need := ^uint64(0xb4); regs.Rbx != need { - err = addRegisterMismatch(err, "Rbx", regs.Rbx, need) - } - if need := ^uint64(0x10); regs.R10 != need { - err = addRegisterMismatch(err, "R10", regs.R10, need) - } - if need := ^uint64(0x09); regs.R9 != need { - err = addRegisterMismatch(err, "R9", regs.R9, need) - } - if need := ^uint64(0x08); regs.R8 != need { - err = addRegisterMismatch(err, "R8", regs.R8, need) - } - if need := ^uint64(0x44); regs.Rax != need { - err = addRegisterMismatch(err, "Rax", regs.Rax, need) - } - if need := ^uint64(0xd4); regs.Rdx != need { - err = addRegisterMismatch(err, "Rdx", regs.Rdx, need) - } - if need := ^uint64(0x51); regs.Rsi != need { - err = addRegisterMismatch(err, "Rsi", regs.Rsi, need) - } - if need := ^uint64(0xd1); regs.Rdi != need { - err = addRegisterMismatch(err, "Rdi", regs.Rdi, need) - } - if need := ^uint64(0x59); regs.Rsp != need { - err = addRegisterMismatch(err, "Rsp", regs.Rsp, need) - } - // Rcx & R11 are ignored if !full is set. - if need := ^uint64(0x11); full && regs.R11 != need { - err = addRegisterMismatch(err, "R11", regs.R11, need) - } - if need := ^uint64(0xc4); full && regs.Rcx != need { - err = addRegisterMismatch(err, "Rcx", regs.Rcx, need) - } - return -} - -var fsData uint64 = 0x55 -var gsData uint64 = 0x85 - -// SetTestSegments initializes segments to known values. -func SetTestSegments(regs *arch.Registers) { - regs.Fs_base = uint64(reflect.ValueOf(&fsData).Pointer()) - regs.Gs_base = uint64(reflect.ValueOf(&gsData).Pointer()) -} - -// CheckTestSegments checks that registers were twiddled per TwiddleSegments. -func CheckTestSegments(regs *arch.Registers) (err error) { - if regs.Rax != fsData { - err = addRegisterMismatch(err, "Rax", regs.Rax, fsData) - } - if regs.Rbx != gsData { - err = addRegisterMismatch(err, "Rbx", regs.Rcx, gsData) - } - return -} diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s deleted file mode 100644 index 491ec0c2a..000000000 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -// test_util_amd64.s provides AMD64 test functions. - -#include "funcdata.h" -#include "textflag.h" - -TEXT ·Getpid(SB),NOSPLIT,$0 - NO_LOCAL_POINTERS - MOVQ $39, AX // getpid - SYSCALL - RET - -TEXT ·Touch(SB),NOSPLIT,$0 -start: - MOVQ 0(AX), BX // deref AX - MOVQ $39, AX // getpid - SYSCALL - JMP start - -TEXT ·HaltLoop(SB),NOSPLIT,$0 -start: - HLT - JMP start - -TEXT ·SyscallLoop(SB),NOSPLIT,$0 -start: - SYSCALL - JMP start - -TEXT ·SpinLoop(SB),NOSPLIT,$0 -start: - JMP start - -TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 - NO_LOCAL_POINTERS - MOVQ $1, AX - MOVQ AX, X0 - MOVQ $39, AX // getpid - SYSCALL - MOVQ X0, AX - CMPQ AX, $1 - SETEQ ret+0(FP) - RET - -#define TWIDDLE_REGS() \ - NOTQ R15; \ - NOTQ R14; \ - NOTQ R13; \ - NOTQ R12; \ - NOTQ BP; \ - NOTQ BX; \ - NOTQ R11; \ - NOTQ R10; \ - NOTQ R9; \ - NOTQ R8; \ - NOTQ AX; \ - NOTQ CX; \ - NOTQ DX; \ - NOTQ SI; \ - NOTQ DI; \ - NOTQ SP; - -TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0 - TWIDDLE_REGS() - SYSCALL - RET // never reached - -TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0 - TWIDDLE_REGS() - JMP AX // must fault - RET // never reached - -#define READ_FS() BYTE $0x64; BYTE $0x48; BYTE $0x8b; BYTE $0x00; -#define READ_GS() BYTE $0x65; BYTE $0x48; BYTE $0x8b; BYTE $0x00; - -TEXT ·TwiddleSegments(SB),NOSPLIT,$0 - MOVQ $0x0, AX - READ_GS() - MOVQ AX, BX - MOVQ $0x0, AX - READ_FS() - SYSCALL - RET // never reached diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go b/pkg/sentry/platform/kvm/testutil/testutil_arm64.go deleted file mode 100644 index c5235ca9d..000000000 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.go +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package testutil - -import ( - "fmt" - "reflect" - - "gvisor.dev/gvisor/pkg/sentry/arch" -) - -// TLSWorks is a tls test. -// -// It returns true or false. -func TLSWorks() bool - -// SetTestTarget sets the rip appropriately. -func SetTestTarget(regs *arch.Registers, fn func()) { - regs.Pc = uint64(reflect.ValueOf(fn).Pointer()) -} - -// SetTouchTarget sets rax appropriately. -func SetTouchTarget(regs *arch.Registers, target *uintptr) { - if target != nil { - regs.Regs[8] = uint64(reflect.ValueOf(target).Pointer()) - } else { - regs.Regs[8] = 0 - } -} - -// RewindSyscall rewinds a syscall RIP. -func RewindSyscall(regs *arch.Registers) { - regs.Pc -= 4 -} - -// SetTestRegs initializes registers to known values. -func SetTestRegs(regs *arch.Registers) { - for i := 0; i <= 30; i++ { - regs.Regs[i] = uint64(i) + 1 - } -} - -// CheckTestRegs checks that registers were twiddled per TwiddleRegs. -func CheckTestRegs(regs *arch.Registers, full bool) (err error) { - for i := 0; i <= 30; i++ { - if need := ^uint64(i + 1); regs.Regs[i] != need { - err = addRegisterMismatch(err, fmt.Sprintf("R%d", i), regs.Regs[i], need) - } - } - // Check tls. - if need := ^uint64(11); regs.TPIDR_EL0 != need { - err = addRegisterMismatch(err, "tpdir_el0", regs.TPIDR_EL0, need) - } - return -} diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s deleted file mode 100644 index 7348c29a5..000000000 --- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -// test_util_arm64.s provides ARM64 test functions. - -#include "funcdata.h" -#include "textflag.h" - -#define SYS_GETPID 172 - -// This function simulates the getpid syscall. -TEXT ·Getpid(SB),NOSPLIT,$0 - NO_LOCAL_POINTERS - MOVD $SYS_GETPID, R8 - SVC - RET - -TEXT ·Touch(SB),NOSPLIT,$0 -start: - MOVD 0(R8), R1 - MOVD $SYS_GETPID, R8 // getpid - SVC - B start - -TEXT ·HaltLoop(SB),NOSPLIT,$0 -start: - HLT - B start - -// This function simulates a loop of syscall. -TEXT ·SyscallLoop(SB),NOSPLIT,$0 -start: - SVC - B start - -TEXT ·SpinLoop(SB),NOSPLIT,$0 -start: - B start - -TEXT ·TLSWorks(SB),NOSPLIT,$0-8 - NO_LOCAL_POINTERS - MOVD $0x6789, R5 - MSR R5, TPIDR_EL0 - MOVD $SYS_GETPID, R8 // getpid - SVC - MRS TPIDR_EL0, R6 - CMP R5, R6 - BNE isNaN - MOVD $1, R0 - MOVD R0, ret+0(FP) - RET -isNaN: - MOVD $0, ret+0(FP) - RET - -TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 - NO_LOCAL_POINTERS - // gc will touch fpsimd, so we should test it. - // such as in <runtime.deductSweepCredit>. - FMOVD $(9.9), F0 - MOVD $SYS_GETPID, R8 // getpid - SVC - FMOVD $(9.9), F1 - FCMPD F0, F1 - BNE isNaN - MOVD $1, R0 - MOVD R0, ret+0(FP) - RET -isNaN: - MOVD $0, ret+0(FP) - RET - -// MVN: bitwise logical NOT -// This case simulates an application that modified R0-R30. -#define TWIDDLE_REGS() \ - MVN R0, R0; \ - MVN R1, R1; \ - MVN R2, R2; \ - MVN R3, R3; \ - MVN R4, R4; \ - MVN R5, R5; \ - MVN R6, R6; \ - MVN R7, R7; \ - MVN R8, R8; \ - MVN R9, R9; \ - MVN R10, R10; \ - MVN R11, R11; \ - MVN R12, R12; \ - MVN R13, R13; \ - MVN R14, R14; \ - MVN R15, R15; \ - MVN R16, R16; \ - MVN R17, R17; \ - MVN R18_PLATFORM, R18_PLATFORM; \ - MVN R19, R19; \ - MVN R20, R20; \ - MVN R21, R21; \ - MVN R22, R22; \ - MVN R23, R23; \ - MVN R24, R24; \ - MVN R25, R25; \ - MVN R26, R26; \ - MVN R27, R27; \ - MVN g, g; \ - MVN R29, R29; \ - MVN R30, R30; - -TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0 - TWIDDLE_REGS() - MSR R10, TPIDR_EL0 - // Trapped in el0_svc. - SVC - RET // never reached - -TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0 - TWIDDLE_REGS() - MSR R10, TPIDR_EL0 - // Trapped in el0_ia. - // Branch to Register branches unconditionally to an address in <Rn>. - JMP (R6) // <=> br x6, must fault - RET // never reached diff --git a/pkg/sentry/platform/kvm/virtual_map_test.go b/pkg/sentry/platform/kvm/virtual_map_test.go deleted file mode 100644 index 327e2be4f..000000000 --- a/pkg/sentry/platform/kvm/virtual_map_test.go +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package kvm - -import ( - "syscall" - "testing" - - "gvisor.dev/gvisor/pkg/usermem" -) - -type checker struct { - ok bool - accessType usermem.AccessType -} - -func (c *checker) Containing(addr uintptr) func(virtualRegion) { - c.ok = false // Reset for below calls. - return func(vr virtualRegion) { - if vr.virtual <= addr && addr < vr.virtual+vr.length { - c.ok = true - c.accessType = vr.accessType - } - } -} - -func TestParseMaps(t *testing.T) { - c := new(checker) - - // Simple test. - if err := applyVirtualRegions(c.Containing(0)); err != nil { - t.Fatalf("unexpected error: %v", err) - } - - // MMap a new page. - addr, _, errno := syscall.RawSyscall6( - syscall.SYS_MMAP, 0, usermem.PageSize, - syscall.PROT_READ|syscall.PROT_WRITE, - syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE, 0, 0) - if errno != 0 { - t.Fatalf("unexpected map error: %v", errno) - } - - // Re-parse maps. - if err := applyVirtualRegions(c.Containing(addr)); err != nil { - syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0) - t.Fatalf("unexpected error: %v", err) - } - - // Assert that it now does contain the region. - if !c.ok { - syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0) - t.Fatalf("updated map does not contain 0x%08x, expected true", addr) - } - - // Map the region as PROT_NONE. - newAddr, _, errno := syscall.RawSyscall6( - syscall.SYS_MMAP, addr, usermem.PageSize, - syscall.PROT_NONE, - syscall.MAP_ANONYMOUS|syscall.MAP_FIXED|syscall.MAP_PRIVATE, 0, 0) - if errno != 0 { - t.Fatalf("unexpected map error: %v", errno) - } - if newAddr != addr { - t.Fatalf("unable to remap address: got 0x%08x, wanted 0x%08x", newAddr, addr) - } - - // Re-parse maps. - if err := applyVirtualRegions(c.Containing(addr)); err != nil { - t.Fatalf("unexpected error: %v", err) - } - if !c.ok { - t.Fatalf("final map does not contain 0x%08x, expected true", addr) - } - if c.accessType.Read || c.accessType.Write || c.accessType.Execute { - t.Fatalf("final map has incorrect permissions for 0x%08x", addr) - } - - // Unmap the region. - syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0) -} diff --git a/pkg/sentry/platform/platform_state_autogen.go b/pkg/sentry/platform/platform_state_autogen.go new file mode 100644 index 000000000..7a4d04589 --- /dev/null +++ b/pkg/sentry/platform/platform_state_autogen.go @@ -0,0 +1,3 @@ +// automatically generated by stateify. + +package platform diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD deleted file mode 100644 index fc43cc3c0..000000000 --- a/pkg/sentry/platform/ptrace/BUILD +++ /dev/null @@ -1,40 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "ptrace", - srcs = [ - "filters.go", - "ptrace.go", - "ptrace_amd64.go", - "ptrace_arm64.go", - "ptrace_arm64_unsafe.go", - "ptrace_unsafe.go", - "stub_amd64.s", - "stub_arm64.s", - "stub_unsafe.go", - "subprocess.go", - "subprocess_amd64.go", - "subprocess_arm64.go", - "subprocess_linux.go", - "subprocess_linux_unsafe.go", - "subprocess_unsafe.go", - ], - visibility = ["//:sandbox"], - deps = [ - "//pkg/abi/linux", - "//pkg/context", - "//pkg/log", - "//pkg/procid", - "//pkg/safecopy", - "//pkg/seccomp", - "//pkg/sentry/arch", - "//pkg/sentry/memmap", - "//pkg/sentry/platform", - "//pkg/sentry/platform/interrupt", - "//pkg/sync", - "//pkg/usermem", - "@org_golang_x_sys//unix:go_default_library", - ], -) diff --git a/pkg/sentry/platform/ptrace/ptrace_amd64_state_autogen.go b/pkg/sentry/platform/ptrace/ptrace_amd64_state_autogen.go new file mode 100644 index 000000000..f730ab393 --- /dev/null +++ b/pkg/sentry/platform/ptrace/ptrace_amd64_state_autogen.go @@ -0,0 +1,5 @@ +// automatically generated by stateify. + +// +build amd64 + +package ptrace diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64_state_autogen.go b/pkg/sentry/platform/ptrace/ptrace_arm64_state_autogen.go new file mode 100644 index 000000000..6239d1305 --- /dev/null +++ b/pkg/sentry/platform/ptrace/ptrace_arm64_state_autogen.go @@ -0,0 +1,5 @@ +// automatically generated by stateify. + +// +build arm64 + +package ptrace diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe_state_autogen.go b/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe_state_autogen.go new file mode 100644 index 000000000..6239d1305 --- /dev/null +++ b/pkg/sentry/platform/ptrace/ptrace_arm64_unsafe_state_autogen.go @@ -0,0 +1,5 @@ +// automatically generated by stateify. + +// +build arm64 + +package ptrace diff --git a/pkg/sentry/platform/ptrace/ptrace_linux_state_autogen.go b/pkg/sentry/platform/ptrace/ptrace_linux_state_autogen.go new file mode 100644 index 000000000..9f90aef93 --- /dev/null +++ b/pkg/sentry/platform/ptrace/ptrace_linux_state_autogen.go @@ -0,0 +1,5 @@ +// automatically generated by stateify. + +// +build linux + +package ptrace diff --git a/pkg/sentry/platform/ptrace/ptrace_linux_unsafe_state_autogen.go b/pkg/sentry/platform/ptrace/ptrace_linux_unsafe_state_autogen.go new file mode 100644 index 000000000..45d94c547 --- /dev/null +++ b/pkg/sentry/platform/ptrace/ptrace_linux_unsafe_state_autogen.go @@ -0,0 +1,6 @@ +// automatically generated by stateify. + +// +build linux +// +build amd64 arm64 + +package ptrace diff --git a/pkg/sentry/platform/ptrace/ptrace_state_autogen.go b/pkg/sentry/platform/ptrace/ptrace_state_autogen.go new file mode 100644 index 000000000..1bf0526f9 --- /dev/null +++ b/pkg/sentry/platform/ptrace/ptrace_state_autogen.go @@ -0,0 +1,3 @@ +// automatically generated by stateify. + +package ptrace diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe_state_autogen.go b/pkg/sentry/platform/ptrace/ptrace_unsafe_state_autogen.go new file mode 100644 index 000000000..8a2ebe8da --- /dev/null +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe_state_autogen.go @@ -0,0 +1,6 @@ +// automatically generated by stateify. + +// +build go1.12 +// +build !go1.17 + +package ptrace diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD deleted file mode 100644 index 679b287c3..000000000 --- a/pkg/sentry/platform/ring0/BUILD +++ /dev/null @@ -1,86 +0,0 @@ -load("//tools:defs.bzl", "go_library") -load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") - -package(licenses = ["notice"]) - -go_template( - name = "defs_amd64", - srcs = [ - "defs.go", - "defs_amd64.go", - "offsets_amd64.go", - "x86.go", - ], - visibility = [":__subpackages__"], -) - -go_template( - name = "defs_arm64", - srcs = [ - "aarch64.go", - "defs.go", - "defs_arm64.go", - "offsets_arm64.go", - ], - visibility = [":__subpackages__"], -) - -go_template_instance( - name = "defs_impl_amd64", - out = "defs_impl_amd64.go", - package = "ring0", - template = ":defs_amd64", -) - -go_template_instance( - name = "defs_impl_arm64", - out = "defs_impl_arm64.go", - package = "ring0", - template = ":defs_arm64", -) - -genrule( - name = "entry_impl_amd64", - srcs = ["entry_amd64.s"], - outs = ["entry_impl_amd64.s"], - cmd = "(echo -e '// build +amd64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@", - tools = ["//pkg/sentry/platform/ring0/gen_offsets"], -) - -genrule( - name = "entry_impl_arm64", - srcs = ["entry_arm64.s"], - outs = ["entry_impl_arm64.s"], - cmd = "(echo -e '// build +arm64\\n' && $(location //pkg/sentry/platform/ring0/gen_offsets) && cat $(SRCS)) > $@", - tools = ["//pkg/sentry/platform/ring0/gen_offsets"], -) - -go_library( - name = "ring0", - srcs = [ - "defs_impl_amd64.go", - "defs_impl_arm64.go", - "entry_amd64.go", - "entry_arm64.go", - "entry_impl_amd64.s", - "entry_impl_arm64.s", - "kernel.go", - "kernel_amd64.go", - "kernel_arm64.go", - "kernel_unsafe.go", - "lib_amd64.go", - "lib_amd64.s", - "lib_arm64.go", - "lib_arm64.s", - "lib_arm64_unsafe.go", - "ring0.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/cpuid", - "//pkg/safecopy", - "//pkg/sentry/arch", - "//pkg/sentry/platform/ring0/pagetables", - "//pkg/usermem", - ], -) diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go deleted file mode 100644 index 327d48465..000000000 --- a/pkg/sentry/platform/ring0/aarch64.go +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package ring0 - -// Useful bits. -const ( - _PGD_PGT_BASE = 0x1000 - _PGD_PGT_SIZE = 0x1000 - _PUD_PGT_BASE = 0x2000 - _PUD_PGT_SIZE = 0x1000 - _PMD_PGT_BASE = 0x3000 - _PMD_PGT_SIZE = 0x4000 - _PTE_PGT_BASE = 0x7000 - _PTE_PGT_SIZE = 0x1000 -) - -const ( - // DAIF bits:debug, sError, IRQ, FIQ. - _PSR_D_BIT = 0x00000200 - _PSR_A_BIT = 0x00000100 - _PSR_I_BIT = 0x00000080 - _PSR_F_BIT = 0x00000040 - _PSR_DAIF_SHIFT = 6 - _PSR_DAIF_MASK = 0xf << _PSR_DAIF_SHIFT - - // PSR bits. - _PSR_MODE_EL0t = 0x00000000 - _PSR_MODE_EL1t = 0x00000004 - _PSR_MODE_EL1h = 0x00000005 - _PSR_MODE_MASK = 0x0000000f - - PsrFlagsClear = _PSR_MODE_MASK | _PSR_DAIF_MASK - PsrModeMask = _PSR_MODE_MASK - - // KernelFlagsSet should always be set in the kernel. - KernelFlagsSet = _PSR_MODE_EL1h | _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT - - // UserFlagsSet are always set in userspace. - UserFlagsSet = _PSR_MODE_EL0t -) - -// Vector is an exception vector. -type Vector uintptr - -// Exception vectors. -const ( - El1InvSync = iota - El1InvIrq - El1InvFiq - El1InvError - - El1Sync - El1Irq - El1Fiq - El1Err - - El0Sync - El0Irq - El0Fiq - El0Err - - El0InvSync - El0InvIrq - El0InvFiq - El0InvErr - - El1SyncDa - El1SyncIa - El1SyncSpPc - El1SyncUndef - El1SyncDbg - El1SyncInv - - El0SyncSVC - El0SyncDa - El0SyncIa - El0SyncFpsimdAcc - El0SyncSveAcc - El0SyncSys - El0SyncSpPc - El0SyncUndef - El0SyncDbg - El0SyncInv - - El0ErrNMI - El0ErrBounce - - _NR_INTERRUPTS -) - -// System call vectors. -const ( - Syscall Vector = El0SyncSVC - PageFault Vector = El0SyncDa - VirtualizationException Vector = El0ErrBounce -) - -// VirtualAddressBits returns the number bits available for virtual addresses. -func VirtualAddressBits() uint32 { - return 48 -} - -// PhysicalAddressBits returns the number of bits available for physical addresses. -func PhysicalAddressBits() uint32 { - return 40 -} diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go deleted file mode 100644 index f9765771e..000000000 --- a/pkg/sentry/platform/ring0/defs.go +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ring0 - -import ( - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" -) - -// Kernel is a global kernel object. -// -// This contains global state, shared by multiple CPUs. -type Kernel struct { - // PageTables are the kernel pagetables; this must be provided. - PageTables *pagetables.PageTables - - KernelArchState -} - -// Hooks are hooks for kernel functions. -type Hooks interface { - // KernelSyscall is called for kernel system calls. - // - // Return from this call will restore registers and return to the kernel: the - // registers must be modified directly. - // - // If this function is not provided, a kernel exception results in halt. - // - // This must be go:nosplit, as this will be on the interrupt stack. - // Closures are permitted, as the pointer to the closure frame is not - // passed on the stack. - KernelSyscall() - - // KernelException handles an exception during kernel execution. - // - // Return from this call will restore registers and return to the kernel: the - // registers must be modified directly. - // - // If this function is not provided, a kernel exception results in halt. - // - // This must be go:nosplit, as this will be on the interrupt stack. - // Closures are permitted, as the pointer to the closure frame is not - // passed on the stack. - KernelException(Vector) -} - -// CPU is the per-CPU struct. -type CPU struct { - // self is a self reference. - // - // This is always guaranteed to be at offset zero. - self *CPU - - // kernel is reference to the kernel that this CPU was initialized - // with. This reference is kept for garbage collection purposes: CPU - // registers may refer to objects within the Kernel object that cannot - // be safely freed. - kernel *Kernel - - // CPUArchState is architecture-specific state. - CPUArchState - - // registers is a set of registers; these may be used on kernel system - // calls and exceptions via the Registers function. - registers arch.Registers - - // hooks are kernel hooks. - hooks Hooks -} - -// Registers returns a modifiable-copy of the kernel registers. -// -// This is explicitly safe to call during KernelException and KernelSyscall. -// -//go:nosplit -func (c *CPU) Registers() *arch.Registers { - return &c.registers -} - -// SwitchOpts are passed to the Switch function. -type SwitchOpts struct { - // Registers are the user register state. - Registers *arch.Registers - - // FloatingPointState is a byte pointer where floating point state is - // saved and restored. - FloatingPointState *byte - - // PageTables are the application page tables. - PageTables *pagetables.PageTables - - // Flush indicates that a TLB flush should be forced on switch. - Flush bool - - // FullRestore indicates that an iret-based restore should be used. - FullRestore bool - - // SwitchArchOpts are architecture-specific options. - SwitchArchOpts -} diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go deleted file mode 100644 index 7a2275558..000000000 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ /dev/null @@ -1,162 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package ring0 - -import ( - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/usermem" -) - -var ( - // UserspaceSize is the total size of userspace. - UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1) - - // MaximumUserAddress is the largest possible user address. - MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1) - - // KernelStartAddress is the starting kernel address. - KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) -) - -// Segment indices and Selectors. -const ( - // Index into GDT array. - _ = iota // Null descriptor first. - _ // Reserved (Linux is kernel 32). - segKcode // Kernel code (64-bit). - segKdata // Kernel data. - segUcode32 // User code (32-bit). - segUdata // User data. - segUcode64 // User code (64-bit). - segTss // Task segment descriptor. - segTssHi // Upper bits for TSS. - segLast // Last segment (terminal, not included). -) - -// Selectors. -const ( - Kcode Selector = segKcode << 3 - Kdata Selector = segKdata << 3 - Ucode32 Selector = (segUcode32 << 3) | 3 - Udata Selector = (segUdata << 3) | 3 - Ucode64 Selector = (segUcode64 << 3) | 3 - Tss Selector = segTss << 3 -) - -// Standard segments. -var ( - UserCodeSegment32 SegmentDescriptor - UserDataSegment SegmentDescriptor - UserCodeSegment64 SegmentDescriptor - KernelCodeSegment SegmentDescriptor - KernelDataSegment SegmentDescriptor -) - -// KernelArchState contains architecture-specific state. -type KernelArchState struct { - // cpuEntries is array of kernelEntry for all cpus. - cpuEntries []kernelEntry - - // globalIDT is our set of interrupt gates. - globalIDT *idt64 -} - -// kernelEntry contains minimal CPU-specific arch state -// that can be mapped at the upper of the address space. -// Malicious APP might steal info from it via CPU bugs. -type kernelEntry struct { - // stack is the stack used for interrupts on this CPU. - stack [256]byte - - // scratch space for temporary usage. - scratch0 uint64 - - // stackTop is the top of the stack. - stackTop uint64 - - // cpuSelf is back reference to CPU. - cpuSelf *CPU - - // kernelCR3 is the cr3 used for sentry kernel. - kernelCR3 uintptr - - // gdt is the CPU's descriptor table. - gdt descriptorTable - - // tss is the CPU's task state. - tss TaskState64 -} - -// CPUArchState contains CPU-specific arch state. -type CPUArchState struct { - // errorCode is the error code from the last exception. - errorCode uintptr - - // errorType indicates the type of error code here, it is always set - // along with the errorCode value above. - // - // It will either by 1, which indicates a user error, or 0 indicating a - // kernel error. If the error code below returns false (kernel error), - // then it cannot provide relevant information about the last - // exception. - errorType uintptr - - *kernelEntry -} - -// ErrorCode returns the last error code. -// -// The returned boolean indicates whether the error code corresponds to the -// last user error or not. If it does not, then fault information must be -// ignored. This is generally the result of a kernel fault while servicing a -// user fault. -// -//go:nosplit -func (c *CPU) ErrorCode() (value uintptr, user bool) { - return c.errorCode, c.errorType != 0 -} - -// ClearErrorCode resets the error code. -// -//go:nosplit -func (c *CPU) ClearErrorCode() { - c.errorCode = 0 // No code. - c.errorType = 1 // User mode. -} - -// SwitchArchOpts are embedded in SwitchOpts. -type SwitchArchOpts struct { - // UserPCID indicates that the application PCID to be used on switch, - // assuming that PCIDs are supported. - // - // Per pagetables_x86.go, a zero PCID implies a flush. - UserPCID uint16 - - // KernelPCID indicates that the kernel PCID to be used on return, - // assuming that PCIDs are supported. - // - // Per pagetables_x86.go, a zero PCID implies a flush. - KernelPCID uint16 -} - -func init() { - KernelCodeSegment.setCode64(0, 0, 0) - KernelDataSegment.setData(0, 0xffffffff, 0) - UserCodeSegment32.setCode64(0, 0, 3) - UserDataSegment.setData(0, 0xffffffff, 3) - UserCodeSegment64.setCode64(0, 0, 3) -} diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go deleted file mode 100644 index a014dcbc0..000000000 --- a/pkg/sentry/platform/ring0/defs_arm64.go +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package ring0 - -import ( - "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/usermem" -) - -var ( - // UserspaceSize is the total size of userspace. - UserspaceSize = uintptr(1) << (VirtualAddressBits()) - - // MaximumUserAddress is the largest possible user address. - MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1) - - // KernelStartAddress is the starting kernel address. - KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) -) - -// KernelArchState contains architecture-specific state. -type KernelArchState struct { -} - -// CPUArchState contains CPU-specific arch state. -type CPUArchState struct { - // stack is the stack used for interrupts on this CPU. - stack [512]byte - - // errorCode is the error code from the last exception. - errorCode uintptr - - // errorType indicates the type of error code here, it is always set - // along with the errorCode value above. - // - // It will either by 1, which indicates a user error, or 0 indicating a - // kernel error. If the error code below returns false (kernel error), - // then it cannot provide relevant information about the last - // exception. - errorType uintptr - - // faultAddr is the value of far_el1. - faultAddr uintptr - - // ttbr0Kvm is the value of ttbr0_el1 for sentry. - ttbr0Kvm uintptr - - // ttbr0App is the value of ttbr0_el1 for applicaton. - ttbr0App uintptr - - // exception vector. - vecCode Vector - - // application context pointer. - appAddr uintptr - - // lazyVFP is the value of cpacr_el1. - lazyVFP uintptr - - // appASID is the asid value of guest application. - appASID uintptr -} - -// ErrorCode returns the last error code. -// -// The returned boolean indicates whether the error code corresponds to the -// last user error or not. If it does not, then fault information must be -// ignored. This is generally the result of a kernel fault while servicing a -// user fault. -// -//go:nosplit -func (c *CPU) ErrorCode() (value uintptr, user bool) { - return c.errorCode, c.errorType != 0 -} - -// ClearErrorCode resets the error code. -// -//go:nosplit -func (c *CPU) ClearErrorCode() { - c.errorCode = 0 // No code. - c.errorType = 1 // User mode. -} - -//go:nosplit -func (c *CPU) GetFaultAddr() (value uintptr) { - return c.faultAddr -} - -//go:nosplit -func (c *CPU) SetTtbr0Kvm(value uintptr) { - c.ttbr0Kvm = value -} - -//go:nosplit -func (c *CPU) SetTtbr0App(value uintptr) { - c.ttbr0App = value -} - -//go:nosplit -func (c *CPU) GetVector() (value Vector) { - return c.vecCode -} - -//go:nosplit -func (c *CPU) SetAppAddr(value uintptr) { - c.appAddr = value -} - -// GetLazyVFP returns the value of cpacr_el1. -//go:nosplit -func (c *CPU) GetLazyVFP() (value uintptr) { - return c.lazyVFP -} - -// SwitchArchOpts are embedded in SwitchOpts. -type SwitchArchOpts struct { - // UserASID indicates that the application ASID to be used on switch, - UserASID uint16 - - // KernelASID indicates that the kernel ASID to be used on return, - KernelASID uint16 -} - -func init() { -} diff --git a/pkg/sentry/platform/ring0/defs_impl_amd64.go b/pkg/sentry/platform/ring0/defs_impl_amd64.go new file mode 100644 index 000000000..c0d7a8d7e --- /dev/null +++ b/pkg/sentry/platform/ring0/defs_impl_amd64.go @@ -0,0 +1,594 @@ +package ring0 + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + + "fmt" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/usermem" + "io" + "reflect" +) + +// Kernel is a global kernel object. +// +// This contains global state, shared by multiple CPUs. +type Kernel struct { + // PageTables are the kernel pagetables; this must be provided. + PageTables *pagetables.PageTables + + KernelArchState +} + +// Hooks are hooks for kernel functions. +type Hooks interface { + // KernelSyscall is called for kernel system calls. + // + // Return from this call will restore registers and return to the kernel: the + // registers must be modified directly. + // + // If this function is not provided, a kernel exception results in halt. + // + // This must be go:nosplit, as this will be on the interrupt stack. + // Closures are permitted, as the pointer to the closure frame is not + // passed on the stack. + KernelSyscall() + + // KernelException handles an exception during kernel execution. + // + // Return from this call will restore registers and return to the kernel: the + // registers must be modified directly. + // + // If this function is not provided, a kernel exception results in halt. + // + // This must be go:nosplit, as this will be on the interrupt stack. + // Closures are permitted, as the pointer to the closure frame is not + // passed on the stack. + KernelException(Vector) +} + +// CPU is the per-CPU struct. +type CPU struct { + // self is a self reference. + // + // This is always guaranteed to be at offset zero. + self *CPU + + // kernel is reference to the kernel that this CPU was initialized + // with. This reference is kept for garbage collection purposes: CPU + // registers may refer to objects within the Kernel object that cannot + // be safely freed. + kernel *Kernel + + // CPUArchState is architecture-specific state. + CPUArchState + + // registers is a set of registers; these may be used on kernel system + // calls and exceptions via the Registers function. + registers arch.Registers + + // hooks are kernel hooks. + hooks Hooks +} + +// Registers returns a modifiable-copy of the kernel registers. +// +// This is explicitly safe to call during KernelException and KernelSyscall. +// +//go:nosplit +func (c *CPU) Registers() *arch.Registers { + return &c.registers +} + +// SwitchOpts are passed to the Switch function. +type SwitchOpts struct { + // Registers are the user register state. + Registers *arch.Registers + + // FloatingPointState is a byte pointer where floating point state is + // saved and restored. + FloatingPointState *byte + + // PageTables are the application page tables. + PageTables *pagetables.PageTables + + // Flush indicates that a TLB flush should be forced on switch. + Flush bool + + // FullRestore indicates that an iret-based restore should be used. + FullRestore bool + + // SwitchArchOpts are architecture-specific options. + SwitchArchOpts +} + +var ( + // UserspaceSize is the total size of userspace. + UserspaceSize = uintptr(1) << (VirtualAddressBits() - 1) + + // MaximumUserAddress is the largest possible user address. + MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1) + + // KernelStartAddress is the starting kernel address. + KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) +) + +// Segment indices and Selectors. +const ( + // Index into GDT array. + _ = iota // Null descriptor first. + _ // Reserved (Linux is kernel 32). + segKcode // Kernel code (64-bit). + segKdata // Kernel data. + segUcode32 // User code (32-bit). + segUdata // User data. + segUcode64 // User code (64-bit). + segTss // Task segment descriptor. + segTssHi // Upper bits for TSS. + segLast // Last segment (terminal, not included). +) + +// Selectors. +const ( + Kcode Selector = segKcode << 3 + Kdata Selector = segKdata << 3 + Ucode32 Selector = (segUcode32 << 3) | 3 + Udata Selector = (segUdata << 3) | 3 + Ucode64 Selector = (segUcode64 << 3) | 3 + Tss Selector = segTss << 3 +) + +// Standard segments. +var ( + UserCodeSegment32 SegmentDescriptor + UserDataSegment SegmentDescriptor + UserCodeSegment64 SegmentDescriptor + KernelCodeSegment SegmentDescriptor + KernelDataSegment SegmentDescriptor +) + +// KernelArchState contains architecture-specific state. +type KernelArchState struct { + // cpuEntries is array of kernelEntry for all cpus. + cpuEntries []kernelEntry + + // globalIDT is our set of interrupt gates. + globalIDT *idt64 +} + +// kernelEntry contains minimal CPU-specific arch state +// that can be mapped at the upper of the address space. +// Malicious APP might steal info from it via CPU bugs. +type kernelEntry struct { + // stack is the stack used for interrupts on this CPU. + stack [256]byte + + // scratch space for temporary usage. + scratch0 uint64 + + // stackTop is the top of the stack. + stackTop uint64 + + // cpuSelf is back reference to CPU. + cpuSelf *CPU + + // kernelCR3 is the cr3 used for sentry kernel. + kernelCR3 uintptr + + // gdt is the CPU's descriptor table. + gdt descriptorTable + + // tss is the CPU's task state. + tss TaskState64 +} + +// CPUArchState contains CPU-specific arch state. +type CPUArchState struct { + // errorCode is the error code from the last exception. + errorCode uintptr + + // errorType indicates the type of error code here, it is always set + // along with the errorCode value above. + // + // It will either by 1, which indicates a user error, or 0 indicating a + // kernel error. If the error code below returns false (kernel error), + // then it cannot provide relevant information about the last + // exception. + errorType uintptr + + *kernelEntry +} + +// ErrorCode returns the last error code. +// +// The returned boolean indicates whether the error code corresponds to the +// last user error or not. If it does not, then fault information must be +// ignored. This is generally the result of a kernel fault while servicing a +// user fault. +// +//go:nosplit +func (c *CPU) ErrorCode() (value uintptr, user bool) { + return c.errorCode, c.errorType != 0 +} + +// ClearErrorCode resets the error code. +// +//go:nosplit +func (c *CPU) ClearErrorCode() { + c.errorCode = 0 + c.errorType = 1 +} + +// SwitchArchOpts are embedded in SwitchOpts. +type SwitchArchOpts struct { + // UserPCID indicates that the application PCID to be used on switch, + // assuming that PCIDs are supported. + // + // Per pagetables_x86.go, a zero PCID implies a flush. + UserPCID uint16 + + // KernelPCID indicates that the kernel PCID to be used on return, + // assuming that PCIDs are supported. + // + // Per pagetables_x86.go, a zero PCID implies a flush. + KernelPCID uint16 +} + +func init() { + KernelCodeSegment.setCode64(0, 0, 0) + KernelDataSegment.setData(0, 0xffffffff, 0) + UserCodeSegment32.setCode64(0, 0, 3) + UserDataSegment.setData(0, 0xffffffff, 3) + UserCodeSegment64.setCode64(0, 0, 3) +} + +// Emit prints architecture-specific offsets. +func Emit(w io.Writer) { + fmt.Fprintf(w, "// Automatically generated, do not edit.\n") + + c := &CPU{} + fmt.Fprintf(w, "\n// CPU offsets.\n") + fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_ENTRY 0x%02x\n", reflect.ValueOf(&c.kernelEntry).Pointer()-reflect.ValueOf(c).Pointer()) + + e := &kernelEntry{} + fmt.Fprintf(w, "\n// CPU entry offsets.\n") + fmt.Fprintf(w, "#define ENTRY_SCRATCH0 0x%02x\n", reflect.ValueOf(&e.scratch0).Pointer()-reflect.ValueOf(e).Pointer()) + fmt.Fprintf(w, "#define ENTRY_STACK_TOP 0x%02x\n", reflect.ValueOf(&e.stackTop).Pointer()-reflect.ValueOf(e).Pointer()) + fmt.Fprintf(w, "#define ENTRY_CPU_SELF 0x%02x\n", reflect.ValueOf(&e.cpuSelf).Pointer()-reflect.ValueOf(e).Pointer()) + fmt.Fprintf(w, "#define ENTRY_KERNEL_CR3 0x%02x\n", reflect.ValueOf(&e.kernelCR3).Pointer()-reflect.ValueOf(e).Pointer()) + + fmt.Fprintf(w, "\n// Bits.\n") + fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF) + fmt.Fprintf(w, "#define _RFLAGS_IOPL0 0x%02x\n", _RFLAGS_IOPL0) + fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet) + + fmt.Fprintf(w, "\n// Vectors.\n") + fmt.Fprintf(w, "#define DivideByZero 0x%02x\n", DivideByZero) + fmt.Fprintf(w, "#define Debug 0x%02x\n", Debug) + fmt.Fprintf(w, "#define NMI 0x%02x\n", NMI) + fmt.Fprintf(w, "#define Breakpoint 0x%02x\n", Breakpoint) + fmt.Fprintf(w, "#define Overflow 0x%02x\n", Overflow) + fmt.Fprintf(w, "#define BoundRangeExceeded 0x%02x\n", BoundRangeExceeded) + fmt.Fprintf(w, "#define InvalidOpcode 0x%02x\n", InvalidOpcode) + fmt.Fprintf(w, "#define DeviceNotAvailable 0x%02x\n", DeviceNotAvailable) + fmt.Fprintf(w, "#define DoubleFault 0x%02x\n", DoubleFault) + fmt.Fprintf(w, "#define CoprocessorSegmentOverrun 0x%02x\n", CoprocessorSegmentOverrun) + fmt.Fprintf(w, "#define InvalidTSS 0x%02x\n", InvalidTSS) + fmt.Fprintf(w, "#define SegmentNotPresent 0x%02x\n", SegmentNotPresent) + fmt.Fprintf(w, "#define StackSegmentFault 0x%02x\n", StackSegmentFault) + fmt.Fprintf(w, "#define GeneralProtectionFault 0x%02x\n", GeneralProtectionFault) + fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault) + fmt.Fprintf(w, "#define X87FloatingPointException 0x%02x\n", X87FloatingPointException) + fmt.Fprintf(w, "#define AlignmentCheck 0x%02x\n", AlignmentCheck) + fmt.Fprintf(w, "#define MachineCheck 0x%02x\n", MachineCheck) + fmt.Fprintf(w, "#define SIMDFloatingPointException 0x%02x\n", SIMDFloatingPointException) + fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException) + fmt.Fprintf(w, "#define SecurityException 0x%02x\n", SecurityException) + fmt.Fprintf(w, "#define SyscallInt80 0x%02x\n", SyscallInt80) + fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) + + p := &arch.Registers{} + fmt.Fprintf(w, "\n// Ptrace registers.\n") + fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.R13).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.R12).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RBP 0x%02x\n", reflect.ValueOf(&p.Rbp).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RBX 0x%02x\n", reflect.ValueOf(&p.Rbx).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.R11).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.R10).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.R9).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.R8).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RAX 0x%02x\n", reflect.ValueOf(&p.Rax).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RCX 0x%02x\n", reflect.ValueOf(&p.Rcx).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RDX 0x%02x\n", reflect.ValueOf(&p.Rdx).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RSI 0x%02x\n", reflect.ValueOf(&p.Rsi).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RDI 0x%02x\n", reflect.ValueOf(&p.Rdi).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_ORIGRAX 0x%02x\n", reflect.ValueOf(&p.Orig_rax).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RIP 0x%02x\n", reflect.ValueOf(&p.Rip).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_CS 0x%02x\n", reflect.ValueOf(&p.Cs).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_FS 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_GS 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer()) +} + +// Useful bits. +const ( + _CR0_PE = 1 << 0 + _CR0_ET = 1 << 4 + _CR0_AM = 1 << 18 + _CR0_PG = 1 << 31 + + _CR4_PSE = 1 << 4 + _CR4_PAE = 1 << 5 + _CR4_PGE = 1 << 7 + _CR4_OSFXSR = 1 << 9 + _CR4_OSXMMEXCPT = 1 << 10 + _CR4_FSGSBASE = 1 << 16 + _CR4_PCIDE = 1 << 17 + _CR4_OSXSAVE = 1 << 18 + _CR4_SMEP = 1 << 20 + + _RFLAGS_AC = 1 << 18 + _RFLAGS_NT = 1 << 14 + _RFLAGS_IOPL0 = 1 << 12 + _RFLAGS_IOPL1 = 1 << 13 + _RFLAGS_IOPL = _RFLAGS_IOPL0 | _RFLAGS_IOPL1 + _RFLAGS_DF = 1 << 10 + _RFLAGS_IF = 1 << 9 + _RFLAGS_STEP = 1 << 8 + _RFLAGS_RESERVED = 1 << 1 + + _EFER_SCE = 0x001 + _EFER_LME = 0x100 + _EFER_LMA = 0x400 + _EFER_NX = 0x800 + + _MSR_STAR = 0xc0000081 + _MSR_LSTAR = 0xc0000082 + _MSR_CSTAR = 0xc0000083 + _MSR_SYSCALL_MASK = 0xc0000084 + _MSR_PLATFORM_INFO = 0xce + _MSR_MISC_FEATURES = 0x140 + + _PLATFORM_INFO_CPUID_FAULT = 1 << 31 + + _MISC_FEATURE_CPUID_TRAP = 0x1 +) + +const ( + // KernelFlagsSet should always be set in the kernel. + KernelFlagsSet = _RFLAGS_RESERVED + + // UserFlagsSet are always set in userspace. + // + // _RFLAGS_IOPL is a set of two bits and it shows the I/O privilege + // level. The Current Privilege Level (CPL) of the task must be less + // than or equal to the IOPL in order for the task or program to access + // I/O ports. + // + // Here, _RFLAGS_IOPL0 is used only to determine whether the task is + // running in the kernel or userspace mode. In the user mode, the CPL is + // always 3 and it doesn't matter what IOPL is set if it is bellow CPL. + // + // We need to have one bit which will be always different in user and + // kernel modes. And we have to remember that even though we have + // KernelFlagsClear, we still can see some of these flags in the kernel + // mode. This can happen when the goruntime switches on a goroutine + // which has been saved in the host mode. On restore, the popf + // instruction is used to restore flags and this means that all flags + // what the goroutine has in the host mode will be restored in the + // kernel mode. + // + // _RFLAGS_IOPL0 is never set in host and kernel modes and we always set + // it in the user mode. So if this flag is set, the task is running in + // the user mode and if it isn't set, the task is running in the kernel + // mode. + UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF | _RFLAGS_IOPL0 + + // KernelFlagsClear should always be clear in the kernel. + KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT + + // UserFlagsClear are always cleared in userspace. + UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL1 +) + +// IsKernelFlags returns true if rflags coresponds to the kernel mode. +// +// go:nosplit +func IsKernelFlags(rflags uint64) bool { + return rflags&_RFLAGS_IOPL0 == 0 +} + +// Vector is an exception vector. +type Vector uintptr + +// Exception vectors. +const ( + DivideByZero Vector = iota + Debug + NMI + Breakpoint + Overflow + BoundRangeExceeded + InvalidOpcode + DeviceNotAvailable + DoubleFault + CoprocessorSegmentOverrun + InvalidTSS + SegmentNotPresent + StackSegmentFault + GeneralProtectionFault + PageFault + _ + X87FloatingPointException + AlignmentCheck + MachineCheck + SIMDFloatingPointException + VirtualizationException + SecurityException = 0x1e + SyscallInt80 = 0x80 + _NR_INTERRUPTS = 0x100 +) + +// System call vectors. +const ( + Syscall Vector = _NR_INTERRUPTS +) + +// VirtualAddressBits returns the number bits available for virtual addresses. +// +// Note that sign-extension semantics apply to the highest order bit. +// +// FIXME(b/69382326): This should use the cpuid passed to Init. +func VirtualAddressBits() uint32 { + ax, _, _, _ := cpuid.HostID(0x80000008, 0) + return (ax >> 8) & 0xff +} + +// PhysicalAddressBits returns the number of bits available for physical addresses. +// +// FIXME(b/69382326): This should use the cpuid passed to Init. +func PhysicalAddressBits() uint32 { + ax, _, _, _ := cpuid.HostID(0x80000008, 0) + return ax & 0xff +} + +// Selector is a segment Selector. +type Selector uint16 + +// SegmentDescriptor is a segment descriptor. +type SegmentDescriptor struct { + bits [2]uint32 +} + +// descriptorTable is a collection of descriptors. +type descriptorTable [32]SegmentDescriptor + +// SegmentDescriptorFlags are typed flags within a descriptor. +type SegmentDescriptorFlags uint32 + +// SegmentDescriptorFlag declarations. +const ( + SegmentDescriptorAccess SegmentDescriptorFlags = 1 << 8 // Access bit (always set). + SegmentDescriptorWrite = 1 << 9 // Write permission. + SegmentDescriptorExpandDown = 1 << 10 // Grows down, not used. + SegmentDescriptorExecute = 1 << 11 // Execute permission. + SegmentDescriptorSystem = 1 << 12 // Zero => system, 1 => user code/data. + SegmentDescriptorPresent = 1 << 15 // Present. + SegmentDescriptorAVL = 1 << 20 // Available. + SegmentDescriptorLong = 1 << 21 // Long mode. + SegmentDescriptorDB = 1 << 22 // 16 or 32-bit. + SegmentDescriptorG = 1 << 23 // Granularity: page or byte. +) + +// Base returns the descriptor's base linear address. +func (d *SegmentDescriptor) Base() uint32 { + return d.bits[1]&0xFF000000 | (d.bits[1]&0x000000FF)<<16 | d.bits[0]>>16 +} + +// Limit returns the descriptor size. +func (d *SegmentDescriptor) Limit() uint32 { + l := d.bits[0]&0xFFFF | d.bits[1]&0xF0000 + if d.bits[1]&uint32(SegmentDescriptorG) != 0 { + l <<= 12 + l |= 0xFFF + } + return l +} + +// Flags returns descriptor flags. +func (d *SegmentDescriptor) Flags() SegmentDescriptorFlags { + return SegmentDescriptorFlags(d.bits[1] & 0x00F09F00) +} + +// DPL returns the descriptor privilege level. +func (d *SegmentDescriptor) DPL() int { + return int((d.bits[1] >> 13) & 3) +} + +func (d *SegmentDescriptor) setNull() { + d.bits[0] = 0 + d.bits[1] = 0 +} + +func (d *SegmentDescriptor) set(base, limit uint32, dpl int, flags SegmentDescriptorFlags) { + flags |= SegmentDescriptorPresent + if limit>>12 != 0 { + limit >>= 12 + flags |= SegmentDescriptorG + } + d.bits[0] = base<<16 | limit&0xFFFF + d.bits[1] = base&0xFF000000 | (base>>16)&0xFF | limit&0x000F0000 | uint32(flags) | uint32(dpl)<<13 +} + +func (d *SegmentDescriptor) setCode32(base, limit uint32, dpl int) { + d.set(base, limit, dpl, + SegmentDescriptorDB| + SegmentDescriptorExecute| + SegmentDescriptorSystem) +} + +func (d *SegmentDescriptor) setCode64(base, limit uint32, dpl int) { + d.set(base, limit, dpl, + SegmentDescriptorG| + SegmentDescriptorLong| + SegmentDescriptorExecute| + SegmentDescriptorSystem) +} + +func (d *SegmentDescriptor) setData(base, limit uint32, dpl int) { + d.set(base, limit, dpl, + SegmentDescriptorWrite| + SegmentDescriptorSystem) +} + +// setHi is only used for the TSS segment, which is magically 64-bits. +func (d *SegmentDescriptor) setHi(base uint32) { + d.bits[0] = base + d.bits[1] = 0 +} + +// Gate64 is a 64-bit task, trap, or interrupt gate. +type Gate64 struct { + bits [4]uint32 +} + +// idt64 is a 64-bit interrupt descriptor table. +type idt64 [_NR_INTERRUPTS]Gate64 + +func (g *Gate64) setInterrupt(cs Selector, rip uint64, dpl int, ist int) { + g.bits[0] = uint32(cs)<<16 | uint32(rip)&0xFFFF + g.bits[1] = uint32(rip)&0xFFFF0000 | SegmentDescriptorPresent | uint32(dpl)<<13 | 14<<8 | uint32(ist)&0x7 + g.bits[2] = uint32(rip >> 32) +} + +func (g *Gate64) setTrap(cs Selector, rip uint64, dpl int, ist int) { + g.setInterrupt(cs, rip, dpl, ist) + g.bits[1] |= 1 << 8 +} + +// TaskState64 is a 64-bit task state structure. +type TaskState64 struct { + _ uint32 + rsp0Lo, rsp0Hi uint32 + rsp1Lo, rsp1Hi uint32 + rsp2Lo, rsp2Hi uint32 + _ [2]uint32 + ist1Lo, ist1Hi uint32 + ist2Lo, ist2Hi uint32 + ist3Lo, ist3Hi uint32 + ist4Lo, ist4Hi uint32 + ist5Lo, ist5Hi uint32 + ist6Lo, ist6Hi uint32 + ist7Lo, ist7Hi uint32 + _ [2]uint32 + _ uint16 + ioPerm uint16 +} diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/defs_impl_arm64.go index 53bc3353c..1990580f3 100644 --- a/pkg/sentry/platform/ring0/offsets_arm64.go +++ b/pkg/sentry/platform/ring0/defs_impl_arm64.go @@ -1,29 +1,327 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - package ring0 import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + "fmt" + "gvisor.dev/gvisor/pkg/usermem" "io" "reflect" +) - "gvisor.dev/gvisor/pkg/sentry/arch" +// Useful bits. +const ( + _PGD_PGT_BASE = 0x1000 + _PGD_PGT_SIZE = 0x1000 + _PUD_PGT_BASE = 0x2000 + _PUD_PGT_SIZE = 0x1000 + _PMD_PGT_BASE = 0x3000 + _PMD_PGT_SIZE = 0x4000 + _PTE_PGT_BASE = 0x7000 + _PTE_PGT_SIZE = 0x1000 +) + +const ( + // DAIF bits:debug, sError, IRQ, FIQ. + _PSR_D_BIT = 0x00000200 + _PSR_A_BIT = 0x00000100 + _PSR_I_BIT = 0x00000080 + _PSR_F_BIT = 0x00000040 + _PSR_DAIF_SHIFT = 6 + _PSR_DAIF_MASK = 0xf << _PSR_DAIF_SHIFT + + // PSR bits. + _PSR_MODE_EL0t = 0x00000000 + _PSR_MODE_EL1t = 0x00000004 + _PSR_MODE_EL1h = 0x00000005 + _PSR_MODE_MASK = 0x0000000f + + PsrFlagsClear = _PSR_MODE_MASK | _PSR_DAIF_MASK + PsrModeMask = _PSR_MODE_MASK + + // KernelFlagsSet should always be set in the kernel. + KernelFlagsSet = _PSR_MODE_EL1h | _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT + + // UserFlagsSet are always set in userspace. + UserFlagsSet = _PSR_MODE_EL0t +) + +// Vector is an exception vector. +type Vector uintptr + +// Exception vectors. +const ( + El1InvSync = iota + El1InvIrq + El1InvFiq + El1InvError + + El1Sync + El1Irq + El1Fiq + El1Err + + El0Sync + El0Irq + El0Fiq + El0Err + + El0InvSync + El0InvIrq + El0InvFiq + El0InvErr + + El1SyncDa + El1SyncIa + El1SyncSpPc + El1SyncUndef + El1SyncDbg + El1SyncInv + + El0SyncSVC + El0SyncDa + El0SyncIa + El0SyncFpsimdAcc + El0SyncSveAcc + El0SyncSys + El0SyncSpPc + El0SyncUndef + El0SyncDbg + El0SyncInv + + El0ErrNMI + El0ErrBounce + + _NR_INTERRUPTS +) + +// System call vectors. +const ( + Syscall Vector = El0SyncSVC + PageFault Vector = El0SyncDa + VirtualizationException Vector = El0ErrBounce +) + +// VirtualAddressBits returns the number bits available for virtual addresses. +func VirtualAddressBits() uint32 { + return 48 +} + +// PhysicalAddressBits returns the number of bits available for physical addresses. +func PhysicalAddressBits() uint32 { + return 40 +} + +// Kernel is a global kernel object. +// +// This contains global state, shared by multiple CPUs. +type Kernel struct { + // PageTables are the kernel pagetables; this must be provided. + PageTables *pagetables.PageTables + + KernelArchState +} + +// Hooks are hooks for kernel functions. +type Hooks interface { + // KernelSyscall is called for kernel system calls. + // + // Return from this call will restore registers and return to the kernel: the + // registers must be modified directly. + // + // If this function is not provided, a kernel exception results in halt. + // + // This must be go:nosplit, as this will be on the interrupt stack. + // Closures are permitted, as the pointer to the closure frame is not + // passed on the stack. + KernelSyscall() + + // KernelException handles an exception during kernel execution. + // + // Return from this call will restore registers and return to the kernel: the + // registers must be modified directly. + // + // If this function is not provided, a kernel exception results in halt. + // + // This must be go:nosplit, as this will be on the interrupt stack. + // Closures are permitted, as the pointer to the closure frame is not + // passed on the stack. + KernelException(Vector) +} + +// CPU is the per-CPU struct. +type CPU struct { + // self is a self reference. + // + // This is always guaranteed to be at offset zero. + self *CPU + + // kernel is reference to the kernel that this CPU was initialized + // with. This reference is kept for garbage collection purposes: CPU + // registers may refer to objects within the Kernel object that cannot + // be safely freed. + kernel *Kernel + + // CPUArchState is architecture-specific state. + CPUArchState + + // registers is a set of registers; these may be used on kernel system + // calls and exceptions via the Registers function. + registers arch.Registers + + // hooks are kernel hooks. + hooks Hooks +} + +// Registers returns a modifiable-copy of the kernel registers. +// +// This is explicitly safe to call during KernelException and KernelSyscall. +// +//go:nosplit +func (c *CPU) Registers() *arch.Registers { + return &c.registers +} + +// SwitchOpts are passed to the Switch function. +type SwitchOpts struct { + // Registers are the user register state. + Registers *arch.Registers + + // FloatingPointState is a byte pointer where floating point state is + // saved and restored. + FloatingPointState *byte + + // PageTables are the application page tables. + PageTables *pagetables.PageTables + + // Flush indicates that a TLB flush should be forced on switch. + Flush bool + + // FullRestore indicates that an iret-based restore should be used. + FullRestore bool + + // SwitchArchOpts are architecture-specific options. + SwitchArchOpts +} + +var ( + // UserspaceSize is the total size of userspace. + UserspaceSize = uintptr(1) << (VirtualAddressBits()) + + // MaximumUserAddress is the largest possible user address. + MaximumUserAddress = (UserspaceSize - 1) & ^uintptr(usermem.PageSize-1) + + // KernelStartAddress is the starting kernel address. + KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1) ) +// KernelArchState contains architecture-specific state. +type KernelArchState struct { +} + +// CPUArchState contains CPU-specific arch state. +type CPUArchState struct { + // stack is the stack used for interrupts on this CPU. + stack [512]byte + + // errorCode is the error code from the last exception. + errorCode uintptr + + // errorType indicates the type of error code here, it is always set + // along with the errorCode value above. + // + // It will either by 1, which indicates a user error, or 0 indicating a + // kernel error. If the error code below returns false (kernel error), + // then it cannot provide relevant information about the last + // exception. + errorType uintptr + + // faultAddr is the value of far_el1. + faultAddr uintptr + + // ttbr0Kvm is the value of ttbr0_el1 for sentry. + ttbr0Kvm uintptr + + // ttbr0App is the value of ttbr0_el1 for applicaton. + ttbr0App uintptr + + // exception vector. + vecCode Vector + + // application context pointer. + appAddr uintptr + + // lazyVFP is the value of cpacr_el1. + lazyVFP uintptr + + // appASID is the asid value of guest application. + appASID uintptr +} + +// ErrorCode returns the last error code. +// +// The returned boolean indicates whether the error code corresponds to the +// last user error or not. If it does not, then fault information must be +// ignored. This is generally the result of a kernel fault while servicing a +// user fault. +// +//go:nosplit +func (c *CPU) ErrorCode() (value uintptr, user bool) { + return c.errorCode, c.errorType != 0 +} + +// ClearErrorCode resets the error code. +// +//go:nosplit +func (c *CPU) ClearErrorCode() { + c.errorCode = 0 + c.errorType = 1 +} + +//go:nosplit +func (c *CPU) GetFaultAddr() (value uintptr) { + return c.faultAddr +} + +//go:nosplit +func (c *CPU) SetTtbr0Kvm(value uintptr) { + c.ttbr0Kvm = value +} + +//go:nosplit +func (c *CPU) SetTtbr0App(value uintptr) { + c.ttbr0App = value +} + +//go:nosplit +func (c *CPU) GetVector() (value Vector) { + return c.vecCode +} + +//go:nosplit +func (c *CPU) SetAppAddr(value uintptr) { + c.appAddr = value +} + +// GetLazyVFP returns the value of cpacr_el1. +//go:nosplit +func (c *CPU) GetLazyVFP() (value uintptr) { + return c.lazyVFP +} + +// SwitchArchOpts are embedded in SwitchOpts. +type SwitchArchOpts struct { + // UserASID indicates that the application ASID to be used on switch, + UserASID uint16 + + // KernelASID indicates that the kernel ASID to be used on return, + KernelASID uint16 +} + +func init() { +} + // Emit prints architecture-specific offsets. func Emit(w io.Writer) { fmt.Fprintf(w, "// Automatically generated, do not edit.\n") diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_impl_amd64.s index f59747df3..9a5d6c064 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_impl_amd64.s @@ -1,3 +1,73 @@ +// build +amd64 + +// Automatically generated, do not edit. + +// CPU offsets. +#define CPU_REGISTERS 0x28 +#define CPU_ERROR_CODE 0x10 +#define CPU_ERROR_TYPE 0x18 +#define CPU_ENTRY 0x20 + +// CPU entry offsets. +#define ENTRY_SCRATCH0 0x100 +#define ENTRY_STACK_TOP 0x108 +#define ENTRY_CPU_SELF 0x110 +#define ENTRY_KERNEL_CR3 0x118 + +// Bits. +#define _RFLAGS_IF 0x200 +#define _RFLAGS_IOPL0 0x1000 +#define _KERNEL_FLAGS 0x02 + +// Vectors. +#define DivideByZero 0x00 +#define Debug 0x01 +#define NMI 0x02 +#define Breakpoint 0x03 +#define Overflow 0x04 +#define BoundRangeExceeded 0x05 +#define InvalidOpcode 0x06 +#define DeviceNotAvailable 0x07 +#define DoubleFault 0x08 +#define CoprocessorSegmentOverrun 0x09 +#define InvalidTSS 0x0a +#define SegmentNotPresent 0x0b +#define StackSegmentFault 0x0c +#define GeneralProtectionFault 0x0d +#define PageFault 0x0e +#define X87FloatingPointException 0x10 +#define AlignmentCheck 0x11 +#define MachineCheck 0x12 +#define SIMDFloatingPointException 0x13 +#define VirtualizationException 0x14 +#define SecurityException 0x1e +#define SyscallInt80 0x80 +#define Syscall 0x100 + +// Ptrace registers. +#define PTRACE_R15 0x00 +#define PTRACE_R14 0x08 +#define PTRACE_R13 0x10 +#define PTRACE_R12 0x18 +#define PTRACE_RBP 0x20 +#define PTRACE_RBX 0x28 +#define PTRACE_R11 0x30 +#define PTRACE_R10 0x38 +#define PTRACE_R9 0x40 +#define PTRACE_R8 0x48 +#define PTRACE_RAX 0x50 +#define PTRACE_RCX 0x58 +#define PTRACE_RDX 0x60 +#define PTRACE_RSI 0x68 +#define PTRACE_RDI 0x70 +#define PTRACE_ORIGRAX 0x78 +#define PTRACE_RIP 0x80 +#define PTRACE_CS 0x88 +#define PTRACE_FLAGS 0x90 +#define PTRACE_RSP 0x98 +#define PTRACE_SS 0xa0 +#define PTRACE_FS 0xa8 +#define PTRACE_GS 0xb0 // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_impl_arm64.s index f489ad352..1dedf9a03 100644 --- a/pkg/sentry/platform/ring0/entry_arm64.s +++ b/pkg/sentry/platform/ring0/entry_impl_arm64.s @@ -1,3 +1,73 @@ +// build +arm64 + +// Automatically generated, do not edit. + +// CPU offsets. +#define CPU_REGISTERS 0x28 +#define CPU_ERROR_CODE 0x10 +#define CPU_ERROR_TYPE 0x18 +#define CPU_ENTRY 0x20 + +// CPU entry offsets. +#define ENTRY_SCRATCH0 0x100 +#define ENTRY_STACK_TOP 0x108 +#define ENTRY_CPU_SELF 0x110 +#define ENTRY_KERNEL_CR3 0x118 + +// Bits. +#define _RFLAGS_IF 0x200 +#define _RFLAGS_IOPL0 0x1000 +#define _KERNEL_FLAGS 0x02 + +// Vectors. +#define DivideByZero 0x00 +#define Debug 0x01 +#define NMI 0x02 +#define Breakpoint 0x03 +#define Overflow 0x04 +#define BoundRangeExceeded 0x05 +#define InvalidOpcode 0x06 +#define DeviceNotAvailable 0x07 +#define DoubleFault 0x08 +#define CoprocessorSegmentOverrun 0x09 +#define InvalidTSS 0x0a +#define SegmentNotPresent 0x0b +#define StackSegmentFault 0x0c +#define GeneralProtectionFault 0x0d +#define PageFault 0x0e +#define X87FloatingPointException 0x10 +#define AlignmentCheck 0x11 +#define MachineCheck 0x12 +#define SIMDFloatingPointException 0x13 +#define VirtualizationException 0x14 +#define SecurityException 0x1e +#define SyscallInt80 0x80 +#define Syscall 0x100 + +// Ptrace registers. +#define PTRACE_R15 0x00 +#define PTRACE_R14 0x08 +#define PTRACE_R13 0x10 +#define PTRACE_R12 0x18 +#define PTRACE_RBP 0x20 +#define PTRACE_RBX 0x28 +#define PTRACE_R11 0x30 +#define PTRACE_R10 0x38 +#define PTRACE_R9 0x40 +#define PTRACE_R8 0x48 +#define PTRACE_RAX 0x50 +#define PTRACE_RCX 0x58 +#define PTRACE_RDX 0x60 +#define PTRACE_RSI 0x68 +#define PTRACE_RDI 0x70 +#define PTRACE_ORIGRAX 0x78 +#define PTRACE_RIP 0x80 +#define PTRACE_CS 0x88 +#define PTRACE_FLAGS 0x90 +#define PTRACE_RSP 0x98 +#define PTRACE_SS 0xa0 +#define PTRACE_FS 0xa8 +#define PTRACE_GS 0xb0 // Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD deleted file mode 100644 index 9742308d8..000000000 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ /dev/null @@ -1,37 +0,0 @@ -load("//tools:defs.bzl", "go_binary") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -package(licenses = ["notice"]) - -go_template_instance( - name = "defs_impl_arm64", - out = "defs_impl_arm64.go", - package = "main", - template = "//pkg/sentry/platform/ring0:defs_arm64", -) - -go_template_instance( - name = "defs_impl_amd64", - out = "defs_impl_amd64.go", - package = "main", - template = "//pkg/sentry/platform/ring0:defs_amd64", -) - -go_binary( - name = "gen_offsets", - srcs = [ - "defs_impl_amd64.go", - "defs_impl_arm64.go", - "main.go", - ], - visibility = [ - "//pkg/sentry/platform/kvm:__pkg__", - "//pkg/sentry/platform/ring0:__pkg__", - ], - deps = [ - "//pkg/cpuid", - "//pkg/sentry/arch", - "//pkg/sentry/platform/ring0/pagetables", - "//pkg/usermem", - ], -) diff --git a/pkg/sentry/platform/ring0/gen_offsets/main.go b/pkg/sentry/platform/ring0/gen_offsets/main.go deleted file mode 100644 index a4927da2f..000000000 --- a/pkg/sentry/platform/ring0/gen_offsets/main.go +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Binary gen_offsets is a helper for generating offset headers. -package main - -import ( - "os" -) - -func main() { - Emit(os.Stdout) -} diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go deleted file mode 100644 index ca4075b09..000000000 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package ring0 - -import ( - "fmt" - "io" - "reflect" - - "gvisor.dev/gvisor/pkg/sentry/arch" -) - -// Emit prints architecture-specific offsets. -func Emit(w io.Writer) { - fmt.Fprintf(w, "// Automatically generated, do not edit.\n") - - c := &CPU{} - fmt.Fprintf(w, "\n// CPU offsets.\n") - fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_ENTRY 0x%02x\n", reflect.ValueOf(&c.kernelEntry).Pointer()-reflect.ValueOf(c).Pointer()) - - e := &kernelEntry{} - fmt.Fprintf(w, "\n// CPU entry offsets.\n") - fmt.Fprintf(w, "#define ENTRY_SCRATCH0 0x%02x\n", reflect.ValueOf(&e.scratch0).Pointer()-reflect.ValueOf(e).Pointer()) - fmt.Fprintf(w, "#define ENTRY_STACK_TOP 0x%02x\n", reflect.ValueOf(&e.stackTop).Pointer()-reflect.ValueOf(e).Pointer()) - fmt.Fprintf(w, "#define ENTRY_CPU_SELF 0x%02x\n", reflect.ValueOf(&e.cpuSelf).Pointer()-reflect.ValueOf(e).Pointer()) - fmt.Fprintf(w, "#define ENTRY_KERNEL_CR3 0x%02x\n", reflect.ValueOf(&e.kernelCR3).Pointer()-reflect.ValueOf(e).Pointer()) - - fmt.Fprintf(w, "\n// Bits.\n") - fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF) - fmt.Fprintf(w, "#define _RFLAGS_IOPL0 0x%02x\n", _RFLAGS_IOPL0) - fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet) - - fmt.Fprintf(w, "\n// Vectors.\n") - fmt.Fprintf(w, "#define DivideByZero 0x%02x\n", DivideByZero) - fmt.Fprintf(w, "#define Debug 0x%02x\n", Debug) - fmt.Fprintf(w, "#define NMI 0x%02x\n", NMI) - fmt.Fprintf(w, "#define Breakpoint 0x%02x\n", Breakpoint) - fmt.Fprintf(w, "#define Overflow 0x%02x\n", Overflow) - fmt.Fprintf(w, "#define BoundRangeExceeded 0x%02x\n", BoundRangeExceeded) - fmt.Fprintf(w, "#define InvalidOpcode 0x%02x\n", InvalidOpcode) - fmt.Fprintf(w, "#define DeviceNotAvailable 0x%02x\n", DeviceNotAvailable) - fmt.Fprintf(w, "#define DoubleFault 0x%02x\n", DoubleFault) - fmt.Fprintf(w, "#define CoprocessorSegmentOverrun 0x%02x\n", CoprocessorSegmentOverrun) - fmt.Fprintf(w, "#define InvalidTSS 0x%02x\n", InvalidTSS) - fmt.Fprintf(w, "#define SegmentNotPresent 0x%02x\n", SegmentNotPresent) - fmt.Fprintf(w, "#define StackSegmentFault 0x%02x\n", StackSegmentFault) - fmt.Fprintf(w, "#define GeneralProtectionFault 0x%02x\n", GeneralProtectionFault) - fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault) - fmt.Fprintf(w, "#define X87FloatingPointException 0x%02x\n", X87FloatingPointException) - fmt.Fprintf(w, "#define AlignmentCheck 0x%02x\n", AlignmentCheck) - fmt.Fprintf(w, "#define MachineCheck 0x%02x\n", MachineCheck) - fmt.Fprintf(w, "#define SIMDFloatingPointException 0x%02x\n", SIMDFloatingPointException) - fmt.Fprintf(w, "#define VirtualizationException 0x%02x\n", VirtualizationException) - fmt.Fprintf(w, "#define SecurityException 0x%02x\n", SecurityException) - fmt.Fprintf(w, "#define SyscallInt80 0x%02x\n", SyscallInt80) - fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall) - - p := &arch.Registers{} - fmt.Fprintf(w, "\n// Ptrace registers.\n") - fmt.Fprintf(w, "#define PTRACE_R15 0x%02x\n", reflect.ValueOf(&p.R15).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R14 0x%02x\n", reflect.ValueOf(&p.R14).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R13 0x%02x\n", reflect.ValueOf(&p.R13).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R12 0x%02x\n", reflect.ValueOf(&p.R12).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RBP 0x%02x\n", reflect.ValueOf(&p.Rbp).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RBX 0x%02x\n", reflect.ValueOf(&p.Rbx).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R11 0x%02x\n", reflect.ValueOf(&p.R11).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R10 0x%02x\n", reflect.ValueOf(&p.R10).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R9 0x%02x\n", reflect.ValueOf(&p.R9).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_R8 0x%02x\n", reflect.ValueOf(&p.R8).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RAX 0x%02x\n", reflect.ValueOf(&p.Rax).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RCX 0x%02x\n", reflect.ValueOf(&p.Rcx).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RDX 0x%02x\n", reflect.ValueOf(&p.Rdx).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RSI 0x%02x\n", reflect.ValueOf(&p.Rsi).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RDI 0x%02x\n", reflect.ValueOf(&p.Rdi).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_ORIGRAX 0x%02x\n", reflect.ValueOf(&p.Orig_rax).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RIP 0x%02x\n", reflect.ValueOf(&p.Rip).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_CS 0x%02x\n", reflect.ValueOf(&p.Cs).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_FS 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_GS 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer()) -} diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD deleted file mode 100644 index 16d5f478b..000000000 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ /dev/null @@ -1,115 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test", "select_arch") -load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") - -package(licenses = ["notice"]) - -go_template( - name = "generic_walker", - srcs = select_arch( - amd64 = ["walker_amd64.go"], - arm64 = ["walker_arm64.go"], - ), - opt_types = [ - "Visitor", - ], - visibility = [":__pkg__"], -) - -go_template_instance( - name = "walker_map", - out = "walker_map.go", - package = "pagetables", - prefix = "map", - template = ":generic_walker", - types = { - "Visitor": "mapVisitor", - }, -) - -go_template_instance( - name = "walker_unmap", - out = "walker_unmap.go", - package = "pagetables", - prefix = "unmap", - template = ":generic_walker", - types = { - "Visitor": "unmapVisitor", - }, -) - -go_template_instance( - name = "walker_lookup", - out = "walker_lookup.go", - package = "pagetables", - prefix = "lookup", - template = ":generic_walker", - types = { - "Visitor": "lookupVisitor", - }, -) - -go_template_instance( - name = "walker_empty", - out = "walker_empty.go", - package = "pagetables", - prefix = "empty", - template = ":generic_walker", - types = { - "Visitor": "emptyVisitor", - }, -) - -go_template_instance( - name = "walker_check", - out = "walker_check.go", - package = "pagetables", - prefix = "check", - template = ":generic_walker", - types = { - "Visitor": "checkVisitor", - }, -) - -go_library( - name = "pagetables", - srcs = [ - "allocator.go", - "allocator_unsafe.go", - "pagetables.go", - "pagetables_aarch64.go", - "pagetables_amd64.go", - "pagetables_arm64.go", - "pagetables_x86.go", - "pcids.go", - "pcids_aarch64.go", - "pcids_aarch64.s", - "pcids_x86.go", - "walker_amd64.go", - "walker_arm64.go", - "walker_empty.go", - "walker_lookup.go", - "walker_map.go", - "walker_unmap.go", - ], - visibility = [ - "//pkg/sentry/platform/kvm:__subpackages__", - "//pkg/sentry/platform/ring0:__subpackages__", - ], - deps = [ - "//pkg/sync", - "//pkg/usermem", - ], -) - -go_test( - name = "pagetables_test", - size = "small", - srcs = [ - "pagetables_amd64_test.go", - "pagetables_arm64_test.go", - "pagetables_test.go", - "walker_check.go", - ], - library = ":pagetables", - deps = ["//pkg/usermem"], -) diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64_state_autogen.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64_state_autogen.go new file mode 100644 index 000000000..a24523f87 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64_state_autogen.go @@ -0,0 +1,6 @@ +// automatically generated by stateify. + +// +build arm64 +// +build arm64 + +package pagetables diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_state_autogen.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_state_autogen.go new file mode 100644 index 000000000..f48a8acd1 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_state_autogen.go @@ -0,0 +1,5 @@ +// automatically generated by stateify. + +// +build amd64 + +package pagetables diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go deleted file mode 100644 index 54e8e554f..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 - -package pagetables - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/usermem" -) - -func Test2MAnd4K(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a small page and a huge page. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) - pt.Map(0x00007f0000000000, pmdSize, MapOpts{AccessType: usermem.Read}, pmdSize*47) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, - {0x00007f0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: usermem.Read}}, - }) -} - -func Test1GAnd4K(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a small page and a super page. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) - pt.Map(0x00007f0000000000, pudSize, MapOpts{AccessType: usermem.Read}, pudSize*47) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, - {0x00007f0000000000, pudSize, pudSize * 47, MapOpts{AccessType: usermem.Read}}, - }) -} - -func TestSplit1GPage(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a super page and knock out the middle. - pt.Map(0x00007f0000000000, pudSize, MapOpts{AccessType: usermem.Read}, pudSize*42) - pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pudSize-(2*pteSize)) - - checkMappings(t, pt, []mapping{ - {0x00007f0000000000, pteSize, pudSize * 42, MapOpts{AccessType: usermem.Read}}, - {0x00007f0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, MapOpts{AccessType: usermem.Read}}, - }) -} - -func TestSplit2MPage(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a huge page and knock out the middle. - pt.Map(0x00007f0000000000, pmdSize, MapOpts{AccessType: usermem.Read}, pmdSize*42) - pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pmdSize-(2*pteSize)) - - checkMappings(t, pt, []mapping{ - {0x00007f0000000000, pteSize, pmdSize * 42, MapOpts{AccessType: usermem.Read}}, - {0x00007f0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: usermem.Read}}, - }) -} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_state_autogen.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_state_autogen.go new file mode 100644 index 000000000..ae9d2b272 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_state_autogen.go @@ -0,0 +1,5 @@ +// automatically generated by stateify. + +// +build arm64 + +package pagetables diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go deleted file mode 100644 index 2f73d424f..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package pagetables - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/usermem" -) - -func Test2MAnd4K(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a small page and a huge page. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: true}, pteSize*42) - pt.Map(0x0000ff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: true}, pmdSize*47) - - pt.Map(0xffff000000400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: false}, pteSize*42) - pt.Map(0xffffff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: false}, pmdSize*47) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: true}}, - {0x0000ff0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: usermem.Read, User: true}}, - {0xffff000000400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: false}}, - {0xffffff0000000000, pmdSize, pmdSize * 47, MapOpts{AccessType: usermem.Read, User: false}}, - }) -} - -func Test1GAnd4K(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a small page and a super page. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite, User: true}, pteSize*42) - pt.Map(0x0000ff0000000000, pudSize, MapOpts{AccessType: usermem.Read, User: true}, pudSize*47) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite, User: true}}, - {0x0000ff0000000000, pudSize, pudSize * 47, MapOpts{AccessType: usermem.Read, User: true}}, - }) -} - -func TestSplit1GPage(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a super page and knock out the middle. - pt.Map(0x0000ff0000000000, pudSize, MapOpts{AccessType: usermem.Read, User: true}, pudSize*42) - pt.Unmap(usermem.Addr(0x0000ff0000000000+pteSize), pudSize-(2*pteSize)) - - checkMappings(t, pt, []mapping{ - {0x0000ff0000000000, pteSize, pudSize * 42, MapOpts{AccessType: usermem.Read, User: true}}, - {0x0000ff0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, MapOpts{AccessType: usermem.Read, User: true}}, - }) -} - -func TestSplit2MPage(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map a huge page and knock out the middle. - pt.Map(0x0000ff0000000000, pmdSize, MapOpts{AccessType: usermem.Read, User: true}, pmdSize*42) - pt.Unmap(usermem.Addr(0x0000ff0000000000+pteSize), pmdSize-(2*pteSize)) - - checkMappings(t, pt, []mapping{ - {0x0000ff0000000000, pteSize, pmdSize * 42, MapOpts{AccessType: usermem.Read, User: true}}, - {0x0000ff0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, MapOpts{AccessType: usermem.Read, User: true}}, - }) -} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go b/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go new file mode 100644 index 000000000..4c4540603 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go @@ -0,0 +1,3 @@ +// automatically generated by stateify. + +package pagetables diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go deleted file mode 100644 index 5c88d087d..000000000 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package pagetables - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/usermem" -) - -type mapping struct { - start uintptr - length uintptr - addr uintptr - opts MapOpts -} - -type checkVisitor struct { - expected []mapping // Input. - current int // Temporary. - found []mapping // Output. - failed string // Output. -} - -func (v *checkVisitor) visit(start uintptr, pte *PTE, align uintptr) { - v.found = append(v.found, mapping{ - start: start, - length: align + 1, - addr: pte.Address(), - opts: pte.Opts(), - }) - if v.failed != "" { - // Don't keep looking for errors. - return - } - - if v.current >= len(v.expected) { - v.failed = "more mappings than expected" - } else if v.expected[v.current].start != start { - v.failed = "start didn't match expected" - } else if v.expected[v.current].length != (align + 1) { - v.failed = "end didn't match expected" - } else if v.expected[v.current].addr != pte.Address() { - v.failed = "address didn't match expected" - } else if v.expected[v.current].opts != pte.Opts() { - v.failed = "opts didn't match" - } - v.current++ -} - -func (*checkVisitor) requiresAlloc() bool { return false } - -func (*checkVisitor) requiresSplit() bool { return false } - -func checkMappings(t *testing.T, pt *PageTables, m []mapping) { - // Iterate over all the mappings. - w := checkWalker{ - pageTables: pt, - visitor: checkVisitor{ - expected: m, - }, - } - w.iterateRange(0, ^uintptr(0)) - - // Were we expected additional mappings? - if w.visitor.failed == "" && w.visitor.current != len(w.visitor.expected) { - w.visitor.failed = "insufficient mappings found" - } - - // Emit a meaningful error message on failure. - if w.visitor.failed != "" { - t.Errorf("%s; got %#v, wanted %#v", w.visitor.failed, w.visitor.found, w.visitor.expected) - } -} - -func TestUnmap(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map and unmap one entry. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) - pt.Unmap(0x400000, pteSize) - - checkMappings(t, pt, nil) -} - -func TestReadOnly(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map one entry. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.Read}, pteSize*42) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.Read}}, - }) -} - -func TestReadWrite(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map one entry. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, - }) -} - -func TestSerialEntries(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map two sequential entries. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) - pt.Map(0x401000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*47) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, - {0x401000, pteSize, pteSize * 47, MapOpts{AccessType: usermem.ReadWrite}}, - }) -} - -func TestSpanningEntries(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Span a pgd with two pages. - pt.Map(0x00007efffffff000, 2*pteSize, MapOpts{AccessType: usermem.Read}, pteSize*42) - - checkMappings(t, pt, []mapping{ - {0x00007efffffff000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.Read}}, - {0x00007f0000000000, pteSize, pteSize * 43, MapOpts{AccessType: usermem.Read}}, - }) -} - -func TestSparseEntries(t *testing.T) { - pt := New(NewRuntimeAllocator()) - - // Map two entries in different pgds. - pt.Map(0x400000, pteSize, MapOpts{AccessType: usermem.ReadWrite}, pteSize*42) - pt.Map(0x00007f0000000000, pteSize, MapOpts{AccessType: usermem.Read}, pteSize*47) - - checkMappings(t, pt, []mapping{ - {0x400000, pteSize, pteSize * 42, MapOpts{AccessType: usermem.ReadWrite}}, - {0x00007f0000000000, pteSize, pteSize * 47, MapOpts{AccessType: usermem.Read}}, - }) -} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe_state_autogen.go b/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe_state_autogen.go new file mode 100644 index 000000000..4c4540603 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe_state_autogen.go @@ -0,0 +1,3 @@ +// automatically generated by stateify. + +package pagetables diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86_state_autogen.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86_state_autogen.go new file mode 100644 index 000000000..6fe78c51c --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86_state_autogen.go @@ -0,0 +1,6 @@ +// automatically generated by stateify. + +// +build 386 amd64 +// +build i386 amd64 + +package pagetables diff --git a/pkg/sentry/platform/ring0/pagetables/walker_empty.go b/pkg/sentry/platform/ring0/pagetables/walker_empty.go new file mode 100644 index 000000000..417784e17 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_empty.go @@ -0,0 +1,255 @@ +package pagetables + +// Walker walks page tables. +type emptyWalker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor emptyVisitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is super pages. If a valid super page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *emptyWalker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func emptynext(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. +// +//go:nosplit +func (w *emptyWalker) iterateRangeCanonical(start, end uintptr) { + for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &w.pageTables.root[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + start = emptynext(start, pgdSize) + continue + } + + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPUDEntries++ + start = emptynext(start, pudSize) + continue + } + + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = emptynext(start, pudSize) + continue + } + } + + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < emptynext(start, pudSize)) { + + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSuper() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + if !pudEntry.Valid() { + clearPUDEntries++ + } + + start = emptynext(start, pudSize) + continue + } + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPMDEntries++ + start = emptynext(start, pmdSize) + continue + } + + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = emptynext(start, pmdSize) + continue + } + } + + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < emptynext(start, pmdSize)) { + + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + start = emptynext(start, pmdSize) + continue + } + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = &pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + w.visitor.visit(uintptr(start), pteEntry, pteSize-1) + if !pteEntry.Valid() { + if w.visitor.requiresAlloc() { + panic("PTE not set after iteration with requiresAlloc!") + } + clearPTEEntries++ + } + + start += pteSize + continue + } + + if clearPTEEntries == entriesPerPage { + pmdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pteEntries) + clearPMDEntries++ + } + } + + if clearPMDEntries == entriesPerPage { + pudEntry.Clear() + w.pageTables.Allocator.FreePTEs(pmdEntries) + clearPUDEntries++ + } + } + + if clearPUDEntries == entriesPerPage { + pgdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pudEntries) + } + } +} diff --git a/pkg/sentry/platform/ring0/pagetables/walker_lookup.go b/pkg/sentry/platform/ring0/pagetables/walker_lookup.go new file mode 100644 index 000000000..906c9c50f --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_lookup.go @@ -0,0 +1,255 @@ +package pagetables + +// Walker walks page tables. +type lookupWalker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor lookupVisitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is super pages. If a valid super page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *lookupWalker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func lookupnext(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. +// +//go:nosplit +func (w *lookupWalker) iterateRangeCanonical(start, end uintptr) { + for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &w.pageTables.root[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + start = lookupnext(start, pgdSize) + continue + } + + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPUDEntries++ + start = lookupnext(start, pudSize) + continue + } + + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = lookupnext(start, pudSize) + continue + } + } + + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < lookupnext(start, pudSize)) { + + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSuper() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + if !pudEntry.Valid() { + clearPUDEntries++ + } + + start = lookupnext(start, pudSize) + continue + } + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPMDEntries++ + start = lookupnext(start, pmdSize) + continue + } + + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = lookupnext(start, pmdSize) + continue + } + } + + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < lookupnext(start, pmdSize)) { + + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + start = lookupnext(start, pmdSize) + continue + } + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = &pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + w.visitor.visit(uintptr(start), pteEntry, pteSize-1) + if !pteEntry.Valid() { + if w.visitor.requiresAlloc() { + panic("PTE not set after iteration with requiresAlloc!") + } + clearPTEEntries++ + } + + start += pteSize + continue + } + + if clearPTEEntries == entriesPerPage { + pmdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pteEntries) + clearPMDEntries++ + } + } + + if clearPMDEntries == entriesPerPage { + pudEntry.Clear() + w.pageTables.Allocator.FreePTEs(pmdEntries) + clearPUDEntries++ + } + } + + if clearPUDEntries == entriesPerPage { + pgdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pudEntries) + } + } +} diff --git a/pkg/sentry/platform/ring0/pagetables/walker_map.go b/pkg/sentry/platform/ring0/pagetables/walker_map.go new file mode 100644 index 000000000..61ee3c825 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_map.go @@ -0,0 +1,255 @@ +package pagetables + +// Walker walks page tables. +type mapWalker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor mapVisitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is super pages. If a valid super page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *mapWalker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func mapnext(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. +// +//go:nosplit +func (w *mapWalker) iterateRangeCanonical(start, end uintptr) { + for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &w.pageTables.root[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + start = mapnext(start, pgdSize) + continue + } + + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPUDEntries++ + start = mapnext(start, pudSize) + continue + } + + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = mapnext(start, pudSize) + continue + } + } + + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < mapnext(start, pudSize)) { + + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSuper() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + if !pudEntry.Valid() { + clearPUDEntries++ + } + + start = mapnext(start, pudSize) + continue + } + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPMDEntries++ + start = mapnext(start, pmdSize) + continue + } + + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = mapnext(start, pmdSize) + continue + } + } + + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < mapnext(start, pmdSize)) { + + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + start = mapnext(start, pmdSize) + continue + } + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = &pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + w.visitor.visit(uintptr(start), pteEntry, pteSize-1) + if !pteEntry.Valid() { + if w.visitor.requiresAlloc() { + panic("PTE not set after iteration with requiresAlloc!") + } + clearPTEEntries++ + } + + start += pteSize + continue + } + + if clearPTEEntries == entriesPerPage { + pmdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pteEntries) + clearPMDEntries++ + } + } + + if clearPMDEntries == entriesPerPage { + pudEntry.Clear() + w.pageTables.Allocator.FreePTEs(pmdEntries) + clearPUDEntries++ + } + } + + if clearPUDEntries == entriesPerPage { + pgdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pudEntries) + } + } +} diff --git a/pkg/sentry/platform/ring0/pagetables/walker_unmap.go b/pkg/sentry/platform/ring0/pagetables/walker_unmap.go new file mode 100644 index 000000000..be2aa0ce4 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_unmap.go @@ -0,0 +1,255 @@ +package pagetables + +// Walker walks page tables. +type unmapWalker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor unmapVisitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is super pages. If a valid super page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *unmapWalker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func unmapnext(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. +// +//go:nosplit +func (w *unmapWalker) iterateRangeCanonical(start, end uintptr) { + for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &w.pageTables.root[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + start = unmapnext(start, pgdSize) + continue + } + + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPUDEntries++ + start = unmapnext(start, pudSize) + continue + } + + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = unmapnext(start, pudSize) + continue + } + } + + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < unmapnext(start, pudSize)) { + + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSuper() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + if !pudEntry.Valid() { + clearPUDEntries++ + } + + start = unmapnext(start, pudSize) + continue + } + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPMDEntries++ + start = unmapnext(start, pmdSize) + continue + } + + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = unmapnext(start, pmdSize) + continue + } + } + + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < unmapnext(start, pmdSize)) { + + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + start = unmapnext(start, pmdSize) + continue + } + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = &pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + w.visitor.visit(uintptr(start), pteEntry, pteSize-1) + if !pteEntry.Valid() { + if w.visitor.requiresAlloc() { + panic("PTE not set after iteration with requiresAlloc!") + } + clearPTEEntries++ + } + + start += pteSize + continue + } + + if clearPTEEntries == entriesPerPage { + pmdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pteEntries) + clearPMDEntries++ + } + } + + if clearPMDEntries == entriesPerPage { + pudEntry.Clear() + w.pageTables.Allocator.FreePTEs(pmdEntries) + clearPUDEntries++ + } + } + + if clearPUDEntries == entriesPerPage { + pgdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pudEntries) + } + } +} diff --git a/pkg/sentry/platform/ring0/ring0_amd64_state_autogen.go b/pkg/sentry/platform/ring0/ring0_amd64_state_autogen.go new file mode 100644 index 000000000..96cf5d331 --- /dev/null +++ b/pkg/sentry/platform/ring0/ring0_amd64_state_autogen.go @@ -0,0 +1,7 @@ +// automatically generated by stateify. + +// +build amd64 +// +build amd64 +// +build amd64 + +package ring0 diff --git a/pkg/sentry/platform/ring0/ring0_arm64_state_autogen.go b/pkg/sentry/platform/ring0/ring0_arm64_state_autogen.go new file mode 100644 index 000000000..7f2ab3537 --- /dev/null +++ b/pkg/sentry/platform/ring0/ring0_arm64_state_autogen.go @@ -0,0 +1,7 @@ +// automatically generated by stateify. + +// +build arm64 +// +build arm64 +// +build arm64 + +package ring0 diff --git a/pkg/sentry/platform/ring0/ring0_arm64_unsafe_state_autogen.go b/pkg/sentry/platform/ring0/ring0_arm64_unsafe_state_autogen.go new file mode 100644 index 000000000..fa617c57e --- /dev/null +++ b/pkg/sentry/platform/ring0/ring0_arm64_unsafe_state_autogen.go @@ -0,0 +1,5 @@ +// automatically generated by stateify. + +// +build arm64 + +package ring0 diff --git a/pkg/sentry/platform/ring0/ring0_impl_amd64_state_autogen.go b/pkg/sentry/platform/ring0/ring0_impl_amd64_state_autogen.go new file mode 100644 index 000000000..327aba163 --- /dev/null +++ b/pkg/sentry/platform/ring0/ring0_impl_amd64_state_autogen.go @@ -0,0 +1,3 @@ +// automatically generated by stateify. + +package ring0 diff --git a/pkg/sentry/platform/ring0/ring0_impl_arm64_state_autogen.go b/pkg/sentry/platform/ring0/ring0_impl_arm64_state_autogen.go new file mode 100644 index 000000000..327aba163 --- /dev/null +++ b/pkg/sentry/platform/ring0/ring0_impl_arm64_state_autogen.go @@ -0,0 +1,3 @@ +// automatically generated by stateify. + +package ring0 diff --git a/pkg/sentry/platform/ring0/ring0_state_autogen.go b/pkg/sentry/platform/ring0/ring0_state_autogen.go new file mode 100644 index 000000000..327aba163 --- /dev/null +++ b/pkg/sentry/platform/ring0/ring0_state_autogen.go @@ -0,0 +1,3 @@ +// automatically generated by stateify. + +package ring0 diff --git a/pkg/sentry/platform/ring0/ring0_unsafe_state_autogen.go b/pkg/sentry/platform/ring0/ring0_unsafe_state_autogen.go new file mode 100644 index 000000000..327aba163 --- /dev/null +++ b/pkg/sentry/platform/ring0/ring0_unsafe_state_autogen.go @@ -0,0 +1,3 @@ +// automatically generated by stateify. + +package ring0 diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go deleted file mode 100644 index 34fbc1c35..000000000 --- a/pkg/sentry/platform/ring0/x86.go +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build 386 amd64 - -package ring0 - -import ( - "gvisor.dev/gvisor/pkg/cpuid" -) - -// Useful bits. -const ( - _CR0_PE = 1 << 0 - _CR0_ET = 1 << 4 - _CR0_AM = 1 << 18 - _CR0_PG = 1 << 31 - - _CR4_PSE = 1 << 4 - _CR4_PAE = 1 << 5 - _CR4_PGE = 1 << 7 - _CR4_OSFXSR = 1 << 9 - _CR4_OSXMMEXCPT = 1 << 10 - _CR4_FSGSBASE = 1 << 16 - _CR4_PCIDE = 1 << 17 - _CR4_OSXSAVE = 1 << 18 - _CR4_SMEP = 1 << 20 - - _RFLAGS_AC = 1 << 18 - _RFLAGS_NT = 1 << 14 - _RFLAGS_IOPL0 = 1 << 12 - _RFLAGS_IOPL1 = 1 << 13 - _RFLAGS_IOPL = _RFLAGS_IOPL0 | _RFLAGS_IOPL1 - _RFLAGS_DF = 1 << 10 - _RFLAGS_IF = 1 << 9 - _RFLAGS_STEP = 1 << 8 - _RFLAGS_RESERVED = 1 << 1 - - _EFER_SCE = 0x001 - _EFER_LME = 0x100 - _EFER_LMA = 0x400 - _EFER_NX = 0x800 - - _MSR_STAR = 0xc0000081 - _MSR_LSTAR = 0xc0000082 - _MSR_CSTAR = 0xc0000083 - _MSR_SYSCALL_MASK = 0xc0000084 - _MSR_PLATFORM_INFO = 0xce - _MSR_MISC_FEATURES = 0x140 - - _PLATFORM_INFO_CPUID_FAULT = 1 << 31 - - _MISC_FEATURE_CPUID_TRAP = 0x1 -) - -const ( - // KernelFlagsSet should always be set in the kernel. - KernelFlagsSet = _RFLAGS_RESERVED - - // UserFlagsSet are always set in userspace. - // - // _RFLAGS_IOPL is a set of two bits and it shows the I/O privilege - // level. The Current Privilege Level (CPL) of the task must be less - // than or equal to the IOPL in order for the task or program to access - // I/O ports. - // - // Here, _RFLAGS_IOPL0 is used only to determine whether the task is - // running in the kernel or userspace mode. In the user mode, the CPL is - // always 3 and it doesn't matter what IOPL is set if it is bellow CPL. - // - // We need to have one bit which will be always different in user and - // kernel modes. And we have to remember that even though we have - // KernelFlagsClear, we still can see some of these flags in the kernel - // mode. This can happen when the goruntime switches on a goroutine - // which has been saved in the host mode. On restore, the popf - // instruction is used to restore flags and this means that all flags - // what the goroutine has in the host mode will be restored in the - // kernel mode. - // - // _RFLAGS_IOPL0 is never set in host and kernel modes and we always set - // it in the user mode. So if this flag is set, the task is running in - // the user mode and if it isn't set, the task is running in the kernel - // mode. - UserFlagsSet = _RFLAGS_RESERVED | _RFLAGS_IF | _RFLAGS_IOPL0 - - // KernelFlagsClear should always be clear in the kernel. - KernelFlagsClear = _RFLAGS_STEP | _RFLAGS_IF | _RFLAGS_IOPL | _RFLAGS_AC | _RFLAGS_NT - - // UserFlagsClear are always cleared in userspace. - UserFlagsClear = _RFLAGS_NT | _RFLAGS_IOPL1 -) - -// IsKernelFlags returns true if rflags coresponds to the kernel mode. -// -// go:nosplit -func IsKernelFlags(rflags uint64) bool { - return rflags&_RFLAGS_IOPL0 == 0 -} - -// Vector is an exception vector. -type Vector uintptr - -// Exception vectors. -const ( - DivideByZero Vector = iota - Debug - NMI - Breakpoint - Overflow - BoundRangeExceeded - InvalidOpcode - DeviceNotAvailable - DoubleFault - CoprocessorSegmentOverrun - InvalidTSS - SegmentNotPresent - StackSegmentFault - GeneralProtectionFault - PageFault - _ - X87FloatingPointException - AlignmentCheck - MachineCheck - SIMDFloatingPointException - VirtualizationException - SecurityException = 0x1e - SyscallInt80 = 0x80 - _NR_INTERRUPTS = 0x100 -) - -// System call vectors. -const ( - Syscall Vector = _NR_INTERRUPTS -) - -// VirtualAddressBits returns the number bits available for virtual addresses. -// -// Note that sign-extension semantics apply to the highest order bit. -// -// FIXME(b/69382326): This should use the cpuid passed to Init. -func VirtualAddressBits() uint32 { - ax, _, _, _ := cpuid.HostID(0x80000008, 0) - return (ax >> 8) & 0xff -} - -// PhysicalAddressBits returns the number of bits available for physical addresses. -// -// FIXME(b/69382326): This should use the cpuid passed to Init. -func PhysicalAddressBits() uint32 { - ax, _, _, _ := cpuid.HostID(0x80000008, 0) - return ax & 0xff -} - -// Selector is a segment Selector. -type Selector uint16 - -// SegmentDescriptor is a segment descriptor. -type SegmentDescriptor struct { - bits [2]uint32 -} - -// descriptorTable is a collection of descriptors. -type descriptorTable [32]SegmentDescriptor - -// SegmentDescriptorFlags are typed flags within a descriptor. -type SegmentDescriptorFlags uint32 - -// SegmentDescriptorFlag declarations. -const ( - SegmentDescriptorAccess SegmentDescriptorFlags = 1 << 8 // Access bit (always set). - SegmentDescriptorWrite = 1 << 9 // Write permission. - SegmentDescriptorExpandDown = 1 << 10 // Grows down, not used. - SegmentDescriptorExecute = 1 << 11 // Execute permission. - SegmentDescriptorSystem = 1 << 12 // Zero => system, 1 => user code/data. - SegmentDescriptorPresent = 1 << 15 // Present. - SegmentDescriptorAVL = 1 << 20 // Available. - SegmentDescriptorLong = 1 << 21 // Long mode. - SegmentDescriptorDB = 1 << 22 // 16 or 32-bit. - SegmentDescriptorG = 1 << 23 // Granularity: page or byte. -) - -// Base returns the descriptor's base linear address. -func (d *SegmentDescriptor) Base() uint32 { - return d.bits[1]&0xFF000000 | (d.bits[1]&0x000000FF)<<16 | d.bits[0]>>16 -} - -// Limit returns the descriptor size. -func (d *SegmentDescriptor) Limit() uint32 { - l := d.bits[0]&0xFFFF | d.bits[1]&0xF0000 - if d.bits[1]&uint32(SegmentDescriptorG) != 0 { - l <<= 12 - l |= 0xFFF - } - return l -} - -// Flags returns descriptor flags. -func (d *SegmentDescriptor) Flags() SegmentDescriptorFlags { - return SegmentDescriptorFlags(d.bits[1] & 0x00F09F00) -} - -// DPL returns the descriptor privilege level. -func (d *SegmentDescriptor) DPL() int { - return int((d.bits[1] >> 13) & 3) -} - -func (d *SegmentDescriptor) setNull() { - d.bits[0] = 0 - d.bits[1] = 0 -} - -func (d *SegmentDescriptor) set(base, limit uint32, dpl int, flags SegmentDescriptorFlags) { - flags |= SegmentDescriptorPresent - if limit>>12 != 0 { - limit >>= 12 - flags |= SegmentDescriptorG - } - d.bits[0] = base<<16 | limit&0xFFFF - d.bits[1] = base&0xFF000000 | (base>>16)&0xFF | limit&0x000F0000 | uint32(flags) | uint32(dpl)<<13 -} - -func (d *SegmentDescriptor) setCode32(base, limit uint32, dpl int) { - d.set(base, limit, dpl, - SegmentDescriptorDB| - SegmentDescriptorExecute| - SegmentDescriptorSystem) -} - -func (d *SegmentDescriptor) setCode64(base, limit uint32, dpl int) { - d.set(base, limit, dpl, - SegmentDescriptorG| - SegmentDescriptorLong| - SegmentDescriptorExecute| - SegmentDescriptorSystem) -} - -func (d *SegmentDescriptor) setData(base, limit uint32, dpl int) { - d.set(base, limit, dpl, - SegmentDescriptorWrite| - SegmentDescriptorSystem) -} - -// setHi is only used for the TSS segment, which is magically 64-bits. -func (d *SegmentDescriptor) setHi(base uint32) { - d.bits[0] = base - d.bits[1] = 0 -} - -// Gate64 is a 64-bit task, trap, or interrupt gate. -type Gate64 struct { - bits [4]uint32 -} - -// idt64 is a 64-bit interrupt descriptor table. -type idt64 [_NR_INTERRUPTS]Gate64 - -func (g *Gate64) setInterrupt(cs Selector, rip uint64, dpl int, ist int) { - g.bits[0] = uint32(cs)<<16 | uint32(rip)&0xFFFF - g.bits[1] = uint32(rip)&0xFFFF0000 | SegmentDescriptorPresent | uint32(dpl)<<13 | 14<<8 | uint32(ist)&0x7 - g.bits[2] = uint32(rip >> 32) -} - -func (g *Gate64) setTrap(cs Selector, rip uint64, dpl int, ist int) { - g.setInterrupt(cs, rip, dpl, ist) - g.bits[1] |= 1 << 8 -} - -// TaskState64 is a 64-bit task state structure. -type TaskState64 struct { - _ uint32 - rsp0Lo, rsp0Hi uint32 - rsp1Lo, rsp1Hi uint32 - rsp2Lo, rsp2Hi uint32 - _ [2]uint32 - ist1Lo, ist1Hi uint32 - ist2Lo, ist2Hi uint32 - ist3Lo, ist3Hi uint32 - ist4Lo, ist4Hi uint32 - ist5Lo, ist5Hi uint32 - ist6Lo, ist6Hi uint32 - ist7Lo, ist7Hi uint32 - _ [2]uint32 - _ uint16 - ioPerm uint16 -} |