diff options
author | Michael Pratt <mpratt@google.com> | 2021-07-12 07:59:47 -0700 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2021-07-12 08:01:53 -0700 |
commit | 36a17a814bf90bad33eac25ddbb7a416143a4be7 (patch) | |
tree | faa4d72267f1efc227cd56d0e8f8cb23c5dacb16 | |
parent | d78713e2da5331a22fc51fb9a9ad33cc1873879c (diff) |
Go 1.17 support for the KVM platform
Go 1.17 adds a new register-based calling convention. While transparent for
most applications, the KVM platform needs special work in a few cases.
First of all, we need the actual address of some assembly functions, rather
than the address of a wrapper. See http://gvisor.dev/pr/5832 for complete
discussion of this.
More relevant to this CL is that ABI0-to-ABIInternal wrappers (i.e., calls from
assembly to Go) access the G via FS_BASE. The KVM quite fast-and-loose about
the Go environment, often calling into (nosplit) Go functions with
uninitialized FS_BASE.
That will no longer work in Go 1.17, so this CL changes the platform to
consistently restore FS_BASE before calling into Go code.
This CL does not affect arm64 code. Go 1.17 does not support the register-based
calling convention for arm64 (it will come in 1.18), but arm64 also does not
use a non-standard register like FS_BASE for TLS, so it may not require any
changes.
PiperOrigin-RevId: 384234305
-rw-r--r-- | pkg/ring0/entry_amd64.go | 97 | ||||
-rw-r--r-- | pkg/ring0/entry_amd64.s | 147 | ||||
-rw-r--r-- | pkg/ring0/kernel_amd64.go | 33 | ||||
-rw-r--r-- | pkg/ring0/kernel_unsafe.go | 5 | ||||
-rw-r--r-- | pkg/ring0/lib_amd64.go | 6 | ||||
-rw-r--r-- | pkg/ring0/lib_amd64.s | 23 | ||||
-rw-r--r-- | pkg/ring0/offsets_amd64.go | 4 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/bluepill_amd64.go | 27 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/bluepill_unsafe.go | 7 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/kvm_amd64_test.go | 4 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/kvm_test.go | 34 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine_amd64.go | 2 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/testutil/testutil.go | 42 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/testutil/testutil_amd64.go | 10 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/testutil/testutil_amd64.s | 57 |
15 files changed, 378 insertions, 120 deletions
diff --git a/pkg/ring0/entry_amd64.go b/pkg/ring0/entry_amd64.go index d87b1fd00..397ccac7b 100644 --- a/pkg/ring0/entry_amd64.go +++ b/pkg/ring0/entry_amd64.go @@ -31,6 +31,13 @@ import ( // executed from kernel mode or not and the appropriate stub is called. func sysenter() +// addrOfSysenter returns the start address of sysenter. +// +// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal +// wrapper function rather than the function itself. We must reference from +// assembly to get the ABI0 (i.e., primary) address. +func addrOfSysenter() uintptr + // swapgs swaps the current GS value. // // This must be called prior to sysret/iret. @@ -39,6 +46,9 @@ func swapgs() // jumpToKernel jumps to the kernel version of the current RIP. func jumpToKernel() +// jumpToUser jumps to the user version of the current RIP. +func jumpToUser() + // sysret returns to userspace from a system call. // // The return code is the vector that interrupted execution. @@ -65,7 +75,12 @@ func exception() // This is used when processing kernel exceptions and syscalls. func resume() -// Start is the CPU entrypoint. +// start is the CPU entrypoint. +// +// See requirements below. +func start() + +// AddrOfStart return the address of the CPU entrypoint. // // The following start conditions must be satisfied: // @@ -78,7 +93,11 @@ func resume() // * c.EFER() should be the current EFER value. // // The CPU state will be set to c.Registers(). -func Start() +// +// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal +// wrapper function rather than the function itself. We must reference from +// assembly to get the ABI0 (i.e., primary) address. +func AddrOfStart() uintptr // Exception stubs. func divideByZero() @@ -104,28 +123,56 @@ func virtualizationException() func securityException() func syscallInt80() +// These returns the start address of the functions above. +// +// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal +// wrapper function rather than the function itself. We must reference from +// assembly to get the ABI0 (i.e., primary) address. +func addrOfDivideByZero() uintptr +func addrOfDebug() uintptr +func addrOfNMI() uintptr +func addrOfBreakpoint() uintptr +func addrOfOverflow() uintptr +func addrOfBoundRangeExceeded() uintptr +func addrOfInvalidOpcode() uintptr +func addrOfDeviceNotAvailable() uintptr +func addrOfDoubleFault() uintptr +func addrOfCoprocessorSegmentOverrun() uintptr +func addrOfInvalidTSS() uintptr +func addrOfSegmentNotPresent() uintptr +func addrOfStackSegmentFault() uintptr +func addrOfGeneralProtectionFault() uintptr +func addrOfPageFault() uintptr +func addrOfX87FloatingPointException() uintptr +func addrOfAlignmentCheck() uintptr +func addrOfMachineCheck() uintptr +func addrOfSimdFloatingPointException() uintptr +func addrOfVirtualizationException() uintptr +func addrOfSecurityException() uintptr +func addrOfSyscallInt80() uintptr + // Exception handler index. -var handlers = map[Vector]func(){ - DivideByZero: divideByZero, - Debug: debug, - NMI: nmi, - Breakpoint: breakpoint, - Overflow: overflow, - BoundRangeExceeded: boundRangeExceeded, - InvalidOpcode: invalidOpcode, - DeviceNotAvailable: deviceNotAvailable, - DoubleFault: doubleFault, - CoprocessorSegmentOverrun: coprocessorSegmentOverrun, - InvalidTSS: invalidTSS, - SegmentNotPresent: segmentNotPresent, - StackSegmentFault: stackSegmentFault, - GeneralProtectionFault: generalProtectionFault, - PageFault: pageFault, - X87FloatingPointException: x87FloatingPointException, - AlignmentCheck: alignmentCheck, - MachineCheck: machineCheck, - SIMDFloatingPointException: simdFloatingPointException, - VirtualizationException: virtualizationException, - SecurityException: securityException, - SyscallInt80: syscallInt80, +var handlers = map[Vector]uintptr{ + DivideByZero: addrOfDivideByZero(), + Debug: addrOfDebug(), + NMI: addrOfNMI(), + Breakpoint: addrOfBreakpoint(), + Overflow: addrOfOverflow(), + BoundRangeExceeded: addrOfBoundRangeExceeded(), + InvalidOpcode: addrOfInvalidOpcode(), + DeviceNotAvailable: addrOfDeviceNotAvailable(), + DoubleFault: addrOfDoubleFault(), + CoprocessorSegmentOverrun: addrOfCoprocessorSegmentOverrun(), + InvalidTSS: addrOfInvalidTSS(), + SegmentNotPresent: addrOfSegmentNotPresent(), + StackSegmentFault: addrOfStackSegmentFault(), + GeneralProtectionFault: addrOfGeneralProtectionFault(), + PageFault: addrOfPageFault(), + X87FloatingPointException: addrOfX87FloatingPointException(), + AlignmentCheck: addrOfAlignmentCheck(), + MachineCheck: addrOfMachineCheck(), + SIMDFloatingPointException: addrOfSimdFloatingPointException(), + VirtualizationException: addrOfVirtualizationException(), + SecurityException: addrOfSecurityException(), + SyscallInt80: addrOfSyscallInt80(), } diff --git a/pkg/ring0/entry_amd64.s b/pkg/ring0/entry_amd64.s index f59747df3..520bd9f57 100644 --- a/pkg/ring0/entry_amd64.s +++ b/pkg/ring0/entry_amd64.s @@ -88,11 +88,33 @@ #define LOAD_KERNEL_STACK(entry) \ MOVQ ENTRY_STACK_TOP(entry), SP; +// ADDR_OF_FUNC defines a function named 'name' that returns the address of +// 'symbol'. +#define ADDR_OF_FUNC(name, symbol) \ +TEXT name,$0-8; \ + MOVQ $symbol, AX; \ + MOVQ AX, ret+0(FP); \ + RET + // See kernel.go. TEXT ·Halt(SB),NOSPLIT,$0 HLT RET +// See kernel_amd64.go. +TEXT ·HaltAndWriteFSBase(SB),NOSPLIT,$8-8 + HLT + + // Restore FS_BASE. + MOVQ regs+0(FP), AX + MOVQ PTRACE_FS_BASE(AX), AX + + PUSHQ AX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ AX + + RET + // See entry_amd64.go. TEXT ·swapgs(SB),NOSPLIT,$0 SWAP_GS() @@ -107,8 +129,29 @@ TEXT ·jumpToKernel(SB),NOSPLIT,$0 MOVQ AX, 0(SP) RET +// jumpToUser changes execution to the user address space. +// +// This works by changing the return value to the user version. +TEXT ·jumpToUser(SB),NOSPLIT,$0 + // N.B. we can't access KernelStartAddress from the upper half (data + // pages not available), so just naively clear all the upper bits. + // We are assuming a 47-bit virtual address space. + MOVQ $0x00007fffffffffff, AX + MOVQ 0(SP), BX + ANDQ BX, AX // Future return value. + MOVQ AX, 0(SP) + RET + // See entry_amd64.go. TEXT ·sysret(SB),NOSPLIT,$0-24 + // Set application FS. We can't do this in Go because Go code needs FS. + MOVQ regs+8(FP), AX + MOVQ PTRACE_FS_BASE(AX), AX + + PUSHQ AX + CALL ·writeFS(SB) + POPQ AX + CALL ·jumpToKernel(SB) // Save original state and stack. sysenter() or exception() // from APP(gr3) will switch to this stack, set the return @@ -142,6 +185,14 @@ TEXT ·sysret(SB),NOSPLIT,$0-24 // See entry_amd64.go. TEXT ·iret(SB),NOSPLIT,$0-24 + // Set application FS. We can't do this in Go because Go code needs FS. + MOVQ regs+8(FP), AX + MOVQ PTRACE_FS_BASE(AX), AX + + PUSHQ AX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ AX + CALL ·jumpToKernel(SB) // Save original state and stack. sysenter() or exception() // from APP(gr3) will switch to this stack, set the return @@ -184,13 +235,29 @@ TEXT ·resume(SB),NOSPLIT,$0 IRET() // See entry_amd64.go. -TEXT ·Start(SB),NOSPLIT,$0 +TEXT ·start(SB),NOSPLIT,$0 + // N.B. This is the vCPU entrypoint. It is not called from Go code and + // thus pushes and pops values on the stack until calling into Go + // (startGo) because we aren't usually a typical Go assembly frame. + PUSHQ $0x0 // Previous frame pointer. MOVQ SP, BP // Set frame pointer. - PUSHQ AX // First argument (CPU). - CALL ·start(SB) // Call Go hook. + + PUSHQ AX // Save CPU. + + // Set up environment required by Go before calling startGo: Go needs + // FS_BASE and floating point initialized. + MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX + PUSHQ BX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ BX + + // First argument (CPU) already at bottom of stack. + CALL ·startGo(SB) // Call Go hook. JMP ·resume(SB) // Restore to registers. +ADDR_OF_FUNC(·AddrOfStart(SB), ·start(SB)); + // See entry_amd64.go. TEXT ·sysenter(SB),NOSPLIT,$0 // _RFLAGS_IOPL0 is always set in the user mode and it is never set in @@ -218,6 +285,18 @@ user: MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user. + CALL ·jumpToUser(SB) + + // Restore kernel FS_BASE. + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX + + PUSHQ BX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ BX + + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + // Return to the kernel, where the frame is: // // vector (sp+32) @@ -252,6 +331,8 @@ kernel: POPQ AX // Pop vCPU. JMP ·resume(SB) +ADDR_OF_FUNC(·addrOfSysenter(SB), ·sysenter(SB)); + // exception is a generic exception handler. // // There are two cases handled: @@ -298,6 +379,16 @@ user: MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX) MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX) + CALL ·jumpToUser(SB) + + // Restore kernel FS_BASE. + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX + + PUSHQ BX // First argument (FS_BASE) + CALL ·writeFS(SB) + POPQ BX + // Copy out and return. MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. MOVQ 0(SP), BX // Load vector. @@ -336,36 +427,38 @@ kernel: POPQ AX // Pop vCPU. JMP ·resume(SB) -#define EXCEPTION_WITH_ERROR(value, symbol) \ +#define EXCEPTION_WITH_ERROR(value, symbol, addr) \ +ADDR_OF_FUNC(addr, symbol); \ TEXT symbol,NOSPLIT,$0; \ PUSHQ $value; \ JMP ·exception(SB); -#define EXCEPTION_WITHOUT_ERROR(value, symbol) \ +#define EXCEPTION_WITHOUT_ERROR(value, symbol, addr) \ +ADDR_OF_FUNC(addr, symbol); \ TEXT symbol,NOSPLIT,$0; \ PUSHQ $0x0; \ PUSHQ $value; \ JMP ·exception(SB); -EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB)) -EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB)) -EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB)) -EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB)) -EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB)) -EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB)) -EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB)) -EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB)) -EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB)) -EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB)) -EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB)) -EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB)) -EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB)) -EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB)) -EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB)) -EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB)) -EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB)) -EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB)) -EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB)) -EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB)) -EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB)) -EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB)) +EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB), ·addrOfDivideByZero(SB)) +EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB), ·addrOfDebug(SB)) +EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB), ·addrOfNMI(SB)) +EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB), ·addrOfBreakpoint(SB)) +EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB), ·addrOfOverflow(SB)) +EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB), ·addrOfBoundRangeExceeded(SB)) +EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB), ·addrOfInvalidOpcode(SB)) +EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB), ·addrOfDeviceNotAvailable(SB)) +EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB), ·addrOfDoubleFault(SB)) +EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB), ·addrOfCoprocessorSegmentOverrun(SB)) +EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB), ·addrOfInvalidTSS(SB)) +EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB), ·addrOfSegmentNotPresent(SB)) +EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB), ·addrOfStackSegmentFault(SB)) +EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB), ·addrOfGeneralProtectionFault(SB)) +EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB), ·addrOfPageFault(SB)) +EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB), ·addrOfX87FloatingPointException(SB)) +EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB), ·addrOfAlignmentCheck(SB)) +EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB), ·addrOfMachineCheck(SB)) +EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB), ·addrOfSimdFloatingPointException(SB)) +EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB), ·addrOfVirtualizationException(SB)) +EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB), ·addrOfSecurityException(SB)) +EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB), ·addrOfSyscallInt80(SB)) diff --git a/pkg/ring0/kernel_amd64.go b/pkg/ring0/kernel_amd64.go index f63af8b76..b5c4a39e3 100644 --- a/pkg/ring0/kernel_amd64.go +++ b/pkg/ring0/kernel_amd64.go @@ -21,8 +21,13 @@ import ( "reflect" "gvisor.dev/gvisor/pkg/hostarch" + "gvisor.dev/gvisor/pkg/sentry/arch" ) +// HaltAndWriteFSBase halts execution. On resume, it sets FS_BASE from the +// value in regs. +func HaltAndWriteFSBase(regs *arch.Registers) + // init initializes architecture-specific state. func (k *Kernel) init(maxCPUs int) { entrySize := reflect.TypeOf(kernelEntry{}).Size() @@ -240,7 +245,6 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { // Perform the switch. swapgs() // GS will be swapped on return. - WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS. WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS. LoadFloatingPoint(switchOpts.FloatingPointState.BytePointer()) // escapes: no. Copy in floating point. if switchOpts.FullRestore { @@ -249,38 +253,49 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { vector = sysret(c, regs, uintptr(userCR3)) } SaveFloatingPoint(switchOpts.FloatingPointState.BytePointer()) // escapes: no. Copy out floating point. - WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS. RestoreKernelFPState() // escapes: no. Restore kernel MXCSR. return } var sentryXCR0 = xgetbv(0) -// start is the CPU entrypoint. +// startGo is the CPU entrypoint. // -// This is called from the Start asm stub (see entry_amd64.go); on return the +// This is called from the start asm stub (see entry_amd64.go); on return the // registers in c.registers will be restored (not segments). // +// Note that any code written in Go should adhere to Go expected environment: +// * Initialized floating point state (required for optimizations using +// floating point instructions). +// * Go TLS in FS_BASE (this is required by splittable functions, calls into +// the runtime, calls to assembly functions (Go 1.17+ ABI wrappers access +// TLS)). +// //go:nosplit -func start(c *CPU) { - // Save per-cpu & FS segment. +func startGo(c *CPU) { + // Save per-cpu. WriteGS(kernelAddr(c.kernelEntry)) - WriteFS(uintptr(c.registers.Fs_base)) + // + // TODO(mpratt): Note that per the note above, this should be done + // before entering Go code. However for simplicity we leave it here for + // now, since the small critical sections with undefined FPU state + // should only contain very limited use of floating point instructions + // (notably, use of XMM15 as a zero register). fninit() // Need to sync XCR0 with the host, because xsave and xrstor can be // called from different contexts. xsetbv(0, sentryXCR0) // Set the syscall target. - wrmsr(_MSR_LSTAR, kernelFunc(sysenter)) + wrmsr(_MSR_LSTAR, kernelFunc(addrOfSysenter())) wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF) // NOTE: This depends on having the 64-bit segments immediately // following the 32-bit user segments. This is simply the way the // sysret instruction is designed to work (it assumes they follow). wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48)) - wrmsr(_MSR_CSTAR, kernelFunc(sysenter)) + wrmsr(_MSR_CSTAR, kernelFunc(addrOfSysenter())) } // SetCPUIDFaulting sets CPUID faulting per the boolean value. diff --git a/pkg/ring0/kernel_unsafe.go b/pkg/ring0/kernel_unsafe.go index 16955ad91..04c60d0a7 100644 --- a/pkg/ring0/kernel_unsafe.go +++ b/pkg/ring0/kernel_unsafe.go @@ -35,7 +35,6 @@ func kernelAddr(obj interface{}) uintptr { // kernelFunc returns the address of the given function. // //go:nosplit -func kernelFunc(fn func()) uintptr { - fnptr := (**uintptr)(unsafe.Pointer(&fn)) - return KernelStartAddress | **fnptr +func kernelFunc(fn uintptr) uintptr { + return KernelStartAddress | fn } diff --git a/pkg/ring0/lib_amd64.go b/pkg/ring0/lib_amd64.go index 3e6bb9663..46746fd80 100644 --- a/pkg/ring0/lib_amd64.go +++ b/pkg/ring0/lib_amd64.go @@ -43,8 +43,8 @@ func xsave(*byte) // xsaveopt uses xsaveopt to save floating point state. func xsaveopt(*byte) -// WriteFS sets the GS address (set by init). -var WriteFS func(addr uintptr) +// writeFS sets the FS base address (selects one of wrfsbase or wrfsmsr). +func writeFS(addr uintptr) // wrfsbase writes to the GS base address. func wrfsbase(addr uintptr) @@ -116,10 +116,8 @@ func Init(featureSet *cpuid.FeatureSet) { LoadFloatingPoint = fxrstor } if hasFSGSBASE { - WriteFS = wrfsbase WriteGS = wrgsbase } else { - WriteFS = wrfsmsr WriteGS = wrgsmsr } } diff --git a/pkg/ring0/lib_amd64.s b/pkg/ring0/lib_amd64.s index 70a43e79e..8ed98fc84 100644 --- a/pkg/ring0/lib_amd64.s +++ b/pkg/ring0/lib_amd64.s @@ -80,6 +80,29 @@ TEXT ·xsaveopt(SB),NOSPLIT,$0-8 BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; RET +// writeFS writes to the FS base. +// +// This is written in assembly because it must be safe to call before the Go +// environment is set up. See comment on start(). +// +// Preconditions: must be running in the lower address space, as it accesses +// global data. +TEXT ·writeFS(SB),NOSPLIT,$8-8 + MOVQ addr+0(FP), AX + + CMPB ·hasFSGSBASE(SB), $1 + JNE msr + + PUSHQ AX + CALL ·wrfsbase(SB) + POPQ AX + RET +msr: + PUSHQ AX + CALL ·wrfsmsr(SB) + POPQ AX + RET + // wrfsbase writes to the FS base. // // The code corresponds to: diff --git a/pkg/ring0/offsets_amd64.go b/pkg/ring0/offsets_amd64.go index ca4075b09..dd48118af 100644 --- a/pkg/ring0/offsets_amd64.go +++ b/pkg/ring0/offsets_amd64.go @@ -95,6 +95,6 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer()) fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer()) fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_FS 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer()) - fmt.Fprintf(w, "#define PTRACE_GS 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_FS_BASE 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer()) + fmt.Fprintf(w, "#define PTRACE_GS_BASE 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer()) } diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index d761bbdee..73ea73742 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -74,8 +74,27 @@ func (c *vCPU) KernelSyscall() { // therefore be guaranteed that there is no floating point state to be // loaded on resuming from halt. We only worry about saving on exit. ring0.SaveFloatingPoint(c.floatingPointState.BytePointer()) // escapes: no. - ring0.Halt() - ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no, reload host segment. + // N.B. Since KernelSyscall is called when the kernel makes a syscall, + // FS_BASE is already set for correct execution of this function. + // + // Refresher on syscall/exception handling: + // 1. When the sentry is in guest mode and makes a syscall, it goes to + // sysenter(), which saves the register state (including RIP of SYSCALL + // instruction) to vCPU.registers. + // 2. It then calls KernelSyscall, which rewinds the IP and executes + // HLT. + // 3. HLT does a VM-exit to bluepillHandler, which returns from the + // signal handler using vCPU.registers, directly to the SYSCALL + // instruction. + // 4. Later, when we want to re-use the vCPU (perhaps on a different + // host thread), we set the new thread's registers in vCPU.registers + // (as opposed to setting the KVM registers with KVM_SET_REGS). + // 5. KVM_RUN thus enters the guest with the old register state, + // immediately following the HLT instruction, returning here. + // 6. We then restore FS_BASE and the full registers from vCPU.register + // to return from sysenter() back to the desired bluepill point from + // the host. + ring0.HaltAndWriteFSBase(regs) // escapes: no, reload host segment. } // KernelException handles kernel exceptions. @@ -93,8 +112,8 @@ func (c *vCPU) KernelException(vector ring0.Vector) { } // See above. ring0.SaveFloatingPoint(c.floatingPointState.BytePointer()) // escapes: no. - ring0.Halt() - ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no; reload host segment. + // See above. + ring0.HaltAndWriteFSBase(regs) // escapes: no, reload host segment. } // bluepillArchExit is called during bluepillEnter. diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 6f87236ad..06fcf1d2e 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -85,6 +85,13 @@ func bluepillGuestExit(c *vCPU, context unsafe.Pointer) { // signal stack. It should only execute raw system calls and functions that are // explicitly marked go:nosplit. // +// Ideally, this function should switch to gsignal, as runtime.sigtramp does, +// but that is tedious given all the runtime internals. That said, using +// gsignal inside a signal handler is not _required_, provided we avoid stack +// splits and allocations. Note that calling any splittable function here will +// be flaky; if the signal stack is below the G stack then we will trigger a +// split and crash. If above, we won't trigger a split. +// // +checkescape:all // //go:nosplit diff --git a/pkg/sentry/platform/kvm/kvm_amd64_test.go b/pkg/sentry/platform/kvm/kvm_amd64_test.go index b1cab89a0..a002ae00c 100644 --- a/pkg/sentry/platform/kvm/kvm_amd64_test.go +++ b/pkg/sentry/platform/kvm/kvm_amd64_test.go @@ -28,7 +28,7 @@ import ( ) func TestSegments(t *testing.T) { - applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTwiddleSegments(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestSegments(regs) for { var si linux.SignalInfo @@ -55,7 +55,7 @@ func TestSegments(t *testing.T) { func stmxcsr(addr *uint32) func TestMXCSR(t *testing.T) { - applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo switchOpts := ring0.SwitchOpts{ Registers: regs, diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index fe570aff9..3a30286e2 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -120,13 +120,13 @@ func TestKernelFloatingPoint(t *testing.T) { }) } -func applicationTest(t testHarness, useHostMappings bool, target func(), fn func(*vCPU, *arch.Registers, *pagetables.PageTables) bool) { +func applicationTest(t testHarness, useHostMappings bool, targetFn uintptr, fn func(*vCPU, *arch.Registers, *pagetables.PageTables) bool) { // Initialize registers & page tables. var ( regs arch.Registers pt *pagetables.PageTables ) - testutil.SetTestTarget(®s, target) + testutil.SetTestTarget(®s, targetFn) kvmTest(t, func(k *KVM) { // Create new page tables. @@ -157,7 +157,7 @@ func applicationTest(t testHarness, useHostMappings bool, target func(), fn func } func TestApplicationSyscall(t *testing.T) { - applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -171,7 +171,7 @@ func TestApplicationSyscall(t *testing.T) { } return false }) - applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -187,7 +187,7 @@ func TestApplicationSyscall(t *testing.T) { } func TestApplicationFault(t *testing.T) { - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTouch(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, nil) // Cause fault. var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ @@ -202,7 +202,7 @@ func TestApplicationFault(t *testing.T) { } return false }) - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTouch(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, nil) // Cause fault. var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ @@ -219,7 +219,7 @@ func TestApplicationFault(t *testing.T) { } func TestRegistersSyscall(t *testing.T) { - applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTwiddleRegsSyscall(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. for { var si linux.SignalInfo @@ -242,7 +242,7 @@ func TestRegistersSyscall(t *testing.T) { } func TestRegistersFault(t *testing.T) { - applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTwiddleRegsFault(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTestRegs(regs) // Fill values for all registers. for { var si linux.SignalInfo @@ -266,7 +266,7 @@ func TestRegistersFault(t *testing.T) { } func TestBounce(t *testing.T) { - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSpinLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { go func() { time.Sleep(time.Millisecond) c.BounceToKernel() @@ -281,7 +281,7 @@ func TestBounce(t *testing.T) { } return false }) - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSpinLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { go func() { time.Sleep(time.Millisecond) c.BounceToKernel() @@ -300,7 +300,7 @@ func TestBounce(t *testing.T) { } func TestBounceStress(t *testing.T) { - applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfSpinLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { randomSleep := func() { // O(hundreds of microseconds) is appropriate to ensure // different overlaps and different schedules. @@ -336,7 +336,7 @@ func TestBounceStress(t *testing.T) { func TestInvalidate(t *testing.T) { var data uintptr // Used below. - applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, true, testutil.AddrOfTouch(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { testutil.SetTouchTarget(regs, &data) // Read legitimate value. for { var si linux.SignalInfo @@ -377,7 +377,7 @@ func IsFault(err error, si *linux.SignalInfo) bool { } func TestEmptyAddressSpace(t *testing.T) { - applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, false, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -391,7 +391,7 @@ func TestEmptyAddressSpace(t *testing.T) { } return false }) - applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(t, false, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -467,7 +467,7 @@ func BenchmarkApplicationSyscall(b *testing.B) { i int // Iteration includes machine.Get() / machine.Put(). a int // Count for ErrContextInterrupt. ) - applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(b, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, @@ -489,7 +489,7 @@ func BenchmarkApplicationSyscall(b *testing.B) { func BenchmarkKernelSyscall(b *testing.B) { // Note that the target passed here is irrelevant, we never execute SwitchToUser. - applicationTest(b, true, testutil.Getpid, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(b, true, testutil.AddrOfGetpid(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { // iteration does not include machine.Get() / machine.Put(). for i := 0; i < b.N; i++ { testutil.Getpid() @@ -504,7 +504,7 @@ func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) { i int a int ) - applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { + applicationTest(b, true, testutil.AddrOfSyscallLoop(), func(c *vCPU, regs *arch.Registers, pt *pagetables.PageTables) bool { var si linux.SignalInfo if _, err := c.SwitchToUser(ring0.SwitchOpts{ Registers: regs, diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 7a10fd812..b28a2c4e8 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -136,7 +136,7 @@ func (c *vCPU) initArchState() error { } // Set the entrypoint for the kernel. - kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer()) + kernelUserRegs.RIP = uint64(ring0.AddrOfStart()) kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer()) kernelUserRegs.RSP = c.StackTop() kernelUserRegs.RFLAGS = ring0.KernelFlagsSet diff --git a/pkg/sentry/platform/kvm/testutil/testutil.go b/pkg/sentry/platform/kvm/testutil/testutil.go index 5c1efa0fd..d8c273796 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil.go +++ b/pkg/sentry/platform/kvm/testutil/testutil.go @@ -23,23 +23,41 @@ import ( // Getpid executes a trivial system call. func Getpid() -// Touch touches the value in the first register. -func Touch() +// AddrOfGetpid returns the address of Getpid. +// +// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal +// wrapper function rather than the function itself. We must reference from +// assembly to get the ABI0 (i.e., primary) address. +func AddrOfGetpid() uintptr + +// AddrOfTouch returns the address of a function that touches the value in the +// first register. +func AddrOfTouch() uintptr +func touch() -// SyscallLoop executes a syscall and loops. -func SyscallLoop() +// AddrOfSyscallLoop returns the address of a function that executes a syscall +// and loops. +func AddrOfSyscallLoop() uintptr +func syscallLoop() -// SpinLoop spins on the CPU. -func SpinLoop() +// AddrOfSpinLoop returns the address of a function that spins on the CPU. +func AddrOfSpinLoop() uintptr +func spinLoop() -// HaltLoop immediately halts and loops. -func HaltLoop() +// AddrOfHaltLoop returns the address of a function that immediately halts and +// loops. +func AddrOfHaltLoop() uintptr +func haltLoop() -// TwiddleRegsFault twiddles registers then faults. -func TwiddleRegsFault() +// AddrOfTwiddleRegsFault returns the address of a function that twiddles +// registers then faults. +func AddrOfTwiddleRegsFault() uintptr +func twiddleRegsFault() -// TwiddleRegsSyscall twiddles registers then executes a syscall. -func TwiddleRegsSyscall() +// AddrOfTwiddleRegsSyscall returns the address of a function that twiddles +// registers then executes a syscall. +func AddrOfTwiddleRegsSyscall() uintptr +func twiddleRegsSyscall() // FloatingPointWorks is a floating point test. // diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go index 8048eedec..7c19b6a8f 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go @@ -22,12 +22,14 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" ) -// TwiddleSegments reads segments into known registers. -func TwiddleSegments() +// AddrOfTwiddleSegments return the address of a function that reads segments +// into known registers. +func AddrOfTwiddleSegments() uintptr +func twiddleSegments() // SetTestTarget sets the rip appropriately. -func SetTestTarget(regs *arch.Registers, fn func()) { - regs.Rip = uint64(reflect.ValueOf(fn).Pointer()) +func SetTestTarget(regs *arch.Registers, fn uintptr) { + regs.Rip = uint64(fn) } // SetTouchTarget sets rax appropriately. diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s index 491ec0c2a..65e7c05ea 100644 --- a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s @@ -25,27 +25,46 @@ TEXT ·Getpid(SB),NOSPLIT,$0 SYSCALL RET -TEXT ·Touch(SB),NOSPLIT,$0 +// func AddrOfGetpid() uintptr +TEXT ·AddrOfGetpid(SB), $0-8 + MOVQ $·Getpid(SB), AX + MOVQ AX, ret+0(FP) + RET + +TEXT ·touch(SB),NOSPLIT,$0 start: MOVQ 0(AX), BX // deref AX MOVQ $39, AX // getpid SYSCALL JMP start -TEXT ·HaltLoop(SB),NOSPLIT,$0 -start: - HLT - JMP start +// func AddrOfTouch() uintptr +TEXT ·AddrOfTouch(SB), $0-8 + MOVQ $·touch(SB), AX + MOVQ AX, ret+0(FP) + RET -TEXT ·SyscallLoop(SB),NOSPLIT,$0 +TEXT ·syscallLoop(SB),NOSPLIT,$0 start: SYSCALL JMP start -TEXT ·SpinLoop(SB),NOSPLIT,$0 +// func AddrOfSyscallLoop() uintptr +TEXT ·AddrOfSyscallLoop(SB), $0-8 + MOVQ $·syscallLoop(SB), AX + MOVQ AX, ret+0(FP) + RET + +TEXT ·spinLoop(SB),NOSPLIT,$0 start: JMP start +// func AddrOfSpinLoop() uintptr +TEXT ·AddrOfSpinLoop(SB), $0-8 + MOVQ $·spinLoop(SB), AX + MOVQ AX, ret+0(FP) + RET + TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 NO_LOCAL_POINTERS MOVQ $1, AX @@ -75,20 +94,32 @@ TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 NOTQ DI; \ NOTQ SP; -TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0 +TEXT ·twiddleRegsSyscall(SB),NOSPLIT,$0 TWIDDLE_REGS() SYSCALL RET // never reached -TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0 +// func AddrOfTwiddleRegsSyscall() uintptr +TEXT ·AddrOfTwiddleRegsSyscall(SB), $0-8 + MOVQ $·twiddleRegsSyscall(SB), AX + MOVQ AX, ret+0(FP) + RET + +TEXT ·twiddleRegsFault(SB),NOSPLIT,$0 TWIDDLE_REGS() JMP AX // must fault RET // never reached +// func AddrOfTwiddleRegsFault() uintptr +TEXT ·AddrOfTwiddleRegsFault(SB), $0-8 + MOVQ $·twiddleRegsFault(SB), AX + MOVQ AX, ret+0(FP) + RET + #define READ_FS() BYTE $0x64; BYTE $0x48; BYTE $0x8b; BYTE $0x00; #define READ_GS() BYTE $0x65; BYTE $0x48; BYTE $0x8b; BYTE $0x00; -TEXT ·TwiddleSegments(SB),NOSPLIT,$0 +TEXT ·twiddleSegments(SB),NOSPLIT,$0 MOVQ $0x0, AX READ_GS() MOVQ AX, BX @@ -96,3 +127,9 @@ TEXT ·TwiddleSegments(SB),NOSPLIT,$0 READ_FS() SYSCALL RET // never reached + +// func AddrOfTwiddleSegments() uintptr +TEXT ·AddrOfTwiddleSegments(SB), $0-8 + MOVQ $·twiddleSegments(SB), AX + MOVQ AX, ret+0(FP) + RET |