diff options
Diffstat (limited to 'pkg/ring0')
-rw-r--r-- | pkg/ring0/defs_impl_amd64.go | 20 | ||||
-rw-r--r-- | pkg/ring0/defs_impl_arm64.go | 12 | ||||
-rw-r--r-- | pkg/ring0/entry_amd64.go | 5 | ||||
-rw-r--r-- | pkg/ring0/entry_impl_amd64.s | 182 | ||||
-rw-r--r-- | pkg/ring0/kernel.go | 5 | ||||
-rw-r--r-- | pkg/ring0/kernel_amd64.go | 23 | ||||
-rw-r--r-- | pkg/ring0/lib_amd64.go | 42 | ||||
-rw-r--r-- | pkg/ring0/lib_amd64.s | 23 |
8 files changed, 246 insertions, 66 deletions
diff --git a/pkg/ring0/defs_impl_amd64.go b/pkg/ring0/defs_impl_amd64.go index d22b41549..df5b4462f 100644 --- a/pkg/ring0/defs_impl_amd64.go +++ b/pkg/ring0/defs_impl_amd64.go @@ -73,6 +73,9 @@ type CPU struct { // calls and exceptions via the Registers function. registers arch.Registers + // floatingPointState holds floating point state. + floatingPointState fpu.State + // hooks are kernel hooks. hooks Hooks } @@ -86,6 +89,15 @@ func (c *CPU) Registers() *arch.Registers { return &c.registers } +// FloatingPointState returns the kernel floating point state. +// +// This is explicitly safe to call during KernelException and KernelSyscall. +// +//go:nosplit +func (c *CPU) FloatingPointState() *fpu.State { + return &c.floatingPointState +} + // SwitchOpts are passed to the Switch function. type SwitchOpts struct { // Registers are the user register state. @@ -203,6 +215,11 @@ type CPUArchState struct { errorType uintptr *kernelEntry + + // Copies of global variables, stored in CPU so that they can be used by + // syscall and exception handlers (in the upper address space). + hasXSAVE bool + hasXSAVEOPT bool } // ErrorCode returns the last error code. 
@@ -258,6 +275,9 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_ENTRY 0x%02x\n", reflect.ValueOf(&c.kernelEntry).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_HAS_XSAVE 0x%02x\n", reflect.ValueOf(&c.hasXSAVE).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_HAS_XSAVEOPT 0x%02x\n", reflect.ValueOf(&c.hasXSAVEOPT).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_FPU_STATE 0x%02x\n", reflect.ValueOf(&c.floatingPointState).Pointer()-reflect.ValueOf(c).Pointer()) e := &kernelEntry{} fmt.Fprintf(w, "\n// CPU entry offsets.\n") diff --git a/pkg/ring0/defs_impl_arm64.go b/pkg/ring0/defs_impl_arm64.go index c3c543c88..0e73d2ea9 100644 --- a/pkg/ring0/defs_impl_arm64.go +++ b/pkg/ring0/defs_impl_arm64.go @@ -175,6 +175,9 @@ type CPU struct { // calls and exceptions via the Registers function. registers arch.Registers + // floatingPointState holds floating point state. + floatingPointState fpu.State + // hooks are kernel hooks. hooks Hooks } @@ -188,6 +191,15 @@ func (c *CPU) Registers() *arch.Registers { return &c.registers } +// FloatingPointState returns the kernel floating point state. +// +// This is explicitly safe to call during KernelException and KernelSyscall. +// +//go:nosplit +func (c *CPU) FloatingPointState() *fpu.State { + return &c.floatingPointState +} + // SwitchOpts are passed to the Switch function. type SwitchOpts struct { // Registers are the user register state. diff --git a/pkg/ring0/entry_amd64.go b/pkg/ring0/entry_amd64.go index afd646b0b..13ad4e4df 100644 --- a/pkg/ring0/entry_amd64.go +++ b/pkg/ring0/entry_amd64.go @@ -39,11 +39,6 @@ func sysenter() // assembly to get the ABI0 (i.e., primary) address. 
func addrOfSysenter() uintptr -// swapgs swaps the current GS value. -// -// This must be called prior to sysret/iret. -func swapgs() - // jumpToKernel jumps to the kernel version of the current RIP. func jumpToKernel() diff --git a/pkg/ring0/entry_impl_amd64.s b/pkg/ring0/entry_impl_amd64.s index 1d0262a18..2bb80d8af 100644 --- a/pkg/ring0/entry_impl_amd64.s +++ b/pkg/ring0/entry_impl_amd64.s @@ -3,10 +3,13 @@ // Automatically generated, do not edit. // CPU offsets. -#define CPU_REGISTERS 0x28 +#define CPU_REGISTERS 0x30 #define CPU_ERROR_CODE 0x10 #define CPU_ERROR_TYPE 0x18 #define CPU_ENTRY 0x20 +#define CPU_HAS_XSAVE 0x28 +#define CPU_HAS_XSAVEOPT 0x29 +#define CPU_FPU_STATE 0x108 // CPU entry offsets. #define ENTRY_SCRATCH0 0x100 @@ -212,8 +215,103 @@ TEXT ·jumpToUser(SB),NOSPLIT,$0 MOVQ AX, 0(SP) RET +// See kernel_amd64.go. +// +// The 16-byte frame size is for the saved values of MXCSR and the x87 control +// word. +TEXT ·doSwitchToUser(SB),NOSPLIT,$16-48 + // We are passed pointers to heap objects, but do not store them in our + // local frame. + NO_LOCAL_POINTERS + + // MXCSR and the x87 control word are the only floating point state + // that is callee-save and thus we must save. + STMXCSR mxcsr-0(SP) + FSTCW cw-8(SP) + + // Restore application floating point state. + MOVQ cpu+0(FP), SI + MOVQ fpState+16(FP), DI + MOVB ·hasXSAVE(SB), BX + TESTB BX, BX + JZ no_xrstor + // Use xrstor to restore all available fp state. For now, we restore + // everything unconditionally by setting the implicit operand edx:eax + // (the "requested feature bitmap") to all 1's. + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x2f // XRSTOR64 0(DI) + JMP fprestore_done +no_xrstor: + // Fall back to fxrstor if xsave is not available. + FXRSTOR64 0(DI) +fprestore_done: + + // Set application GS. + MOVQ regs+8(FP), R8 + SWAP_GS() + MOVQ PTRACE_GS_BASE(R8), AX + PUSHQ AX + CALL ·writeGS(SB) + POPQ AX + + // Call sysret() or iret(). 
+ MOVQ userCR3+24(FP), CX + MOVQ needIRET+32(FP), R9 + ADDQ $-32, SP + MOVQ SI, 0(SP) // cpu + MOVQ R8, 8(SP) // regs + MOVQ CX, 16(SP) // userCR3 + TESTQ R9, R9 + JNZ do_iret + CALL ·sysret(SB) + JMP done_sysret_or_iret +do_iret: + CALL ·iret(SB) +done_sysret_or_iret: + MOVQ 24(SP), AX // vector + ADDQ $32, SP + MOVQ AX, vector+40(FP) + + // Save application floating point state. + MOVQ fpState+16(FP), DI + MOVB ·hasXSAVE(SB), BX + MOVB ·hasXSAVEOPT(SB), CX + TESTB BX, BX + JZ no_xsave + // Use xsave/xsaveopt to save all extended state. + // We save everything unconditionally by setting RFBM to all 1's. + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + TESTB CX, CX + JZ no_xsaveopt + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI) + JMP fpsave_done +no_xsaveopt: + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI) + JMP fpsave_done +no_xsave: + FXSAVE64 0(DI) +fpsave_done: + + // Restore MXCSR and the x87 control word after one of the two floating + // point save cases above, to ensure the application versions are saved + // before being clobbered here. + LDMXCSR mxcsr-0(SP) + + // FLDCW is a "waiting" x87 instruction, meaning it checks for pending + // unmasked exceptions before executing. Thus if userspace has unmasked + // an exception and has one pending, it can be raised by FLDCW even + // though the new control word will mask exceptions. To prevent this, + // we must first clear pending exceptions (which will be restored by + // XRSTOR, et al). + BYTE $0xDB; BYTE $0xE2; // FNCLEX + FLDCW cw-8(SP) + + RET + // See entry_amd64.go. -TEXT ·sysret(SB),NOSPLIT,$0-24 +TEXT ·sysret(SB),NOSPLIT,$0-32 // Set application FS. We can't do this in Go because Go code needs FS. MOVQ regs+8(FP), AX MOVQ PTRACE_FS_BASE(AX), AX @@ -252,9 +350,11 @@ TEXT ·sysret(SB),NOSPLIT,$0-24 POPQ AX // Restore AX. POPQ SP // Restore SP. SYSRET64() + // sysenter or exception will write our return value and return to our + // caller. // See entry_amd64.go. 
-TEXT ·iret(SB),NOSPLIT,$0-24 +TEXT ·iret(SB),NOSPLIT,$0-32 // Set application FS. We can't do this in Go because Go code needs FS. MOVQ regs+8(FP), AX MOVQ PTRACE_FS_BASE(AX), AX @@ -290,6 +390,8 @@ TEXT ·iret(SB),NOSPLIT,$0-24 WRITE_CR3() // Switch to userCR3. POPQ AX // Restore AX. IRET() + // sysenter or exception will write our return value and return to our + // caller. // See entry_amd64.go. TEXT ·resume(SB),NOSPLIT,$0 @@ -394,11 +496,39 @@ kernel: MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. + // Save floating point state. CPU.floatingPointState is a slice, so the + // first word of CPU.floatingPointState is a pointer to the destination + // array. + MOVQ CPU_FPU_STATE(AX), DI + MOVB CPU_HAS_XSAVE(AX), BX + MOVB CPU_HAS_XSAVEOPT(AX), CX + TESTB BX, BX + JZ no_xsave + // Use xsave/xsaveopt to save all extended state. + // We save everything unconditionally by setting RFBM to all 1's. + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + TESTB CX, CX + JZ no_xsaveopt + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI) + JMP fpsave_done +no_xsaveopt: + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI) + JMP fpsave_done +no_xsave: + FXSAVE64 0(DI) +fpsave_done: + // Call the syscall trampoline. LOAD_KERNEL_STACK(GS) - PUSHQ AX // First argument (vCPU). - CALL ·kernelSyscall(SB) // Call the trampoline. - POPQ AX // Pop vCPU. + MOVQ ENTRY_CPU_SELF(GS), AX // AX contains the vCPU. + PUSHQ AX // First argument (vCPU). + CALL ·kernelSyscall(SB) // Call the trampoline. + POPQ AX // Pop vCPU. + + // We only trigger a bluepill entry in the bluepill function, and can + // therefore be guaranteed that there is no floating point state to be + // loaded on resuming from halt. JMP ·resume(SB) ADDR_OF_FUNC(·addrOfSysenter(SB), ·sysenter(SB)); @@ -486,15 +616,43 @@ kernel: MOVQ 8(SP), BX // Load the error code. MOVQ BX, CPU_ERROR_CODE(AX) // Copy out to the CPU. 
MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. - MOVQ 0(SP), BX // BX contains the vector. + + // Save floating point state. CPU.floatingPointState is a slice, so the + // first word of CPU.floatingPointState is a pointer to the destination + // array. + MOVQ CPU_FPU_STATE(AX), DI + MOVB CPU_HAS_XSAVE(AX), BX + MOVB CPU_HAS_XSAVEOPT(AX), CX + TESTB BX, BX + JZ no_xsave + // Use xsave/xsaveopt to save all extended state. + // We save everything unconditionally by setting RFBM to all 1's. + MOVL $0xffffffff, AX + MOVL $0xffffffff, DX + TESTB CX, CX + JZ no_xsaveopt + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37; // XSAVEOPT64 0(DI) + JMP fpsave_done +no_xsaveopt: + BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x27; // XSAVE64 0(DI) + JMP fpsave_done +no_xsave: + FXSAVE64 0(DI) +fpsave_done: // Call the exception trampoline. + MOVQ 0(SP), BX // BX contains the vector. LOAD_KERNEL_STACK(GS) - PUSHQ BX // Second argument (vector). - PUSHQ AX // First argument (vCPU). - CALL ·kernelException(SB) // Call the trampoline. - POPQ BX // Pop vector. - POPQ AX // Pop vCPU. + MOVQ ENTRY_CPU_SELF(GS), AX // AX contains the vCPU. + PUSHQ BX // Second argument (vector). + PUSHQ AX // First argument (vCPU). + CALL ·kernelException(SB) // Call the trampoline. + POPQ BX // Pop vector. + POPQ AX // Pop vCPU. + + // We only trigger a bluepill entry in the bluepill function, and can + // therefore be guaranteed that there is no floating point state to be + // loaded on resuming from halt. JMP ·resume(SB) #define EXCEPTION_WITH_ERROR(value, symbol, addr) \ diff --git a/pkg/ring0/kernel.go b/pkg/ring0/kernel.go index 292f9d0cc..e7dd84929 100644 --- a/pkg/ring0/kernel.go +++ b/pkg/ring0/kernel.go @@ -14,6 +14,10 @@ package ring0 +import ( + "gvisor.dev/gvisor/pkg/sentry/arch/fpu" +) + // Init initializes a new kernel. // //go:nosplit @@ -80,6 +84,7 @@ func (c *CPU) Init(k *Kernel, cpuID int, hooks Hooks) { c.self = c // Set self reference. c.kernel = k // Set kernel reference. 
c.init(cpuID) // Perform architectural init. + c.floatingPointState = fpu.NewState() // Require hooks. if hooks != nil { diff --git a/pkg/ring0/kernel_amd64.go b/pkg/ring0/kernel_amd64.go index 4a4c0ae26..7e55011b5 100644 --- a/pkg/ring0/kernel_amd64.go +++ b/pkg/ring0/kernel_amd64.go @@ -143,6 +143,9 @@ func (c *CPU) init(cpuID int) { // Set mandatory flags. c.registers.Eflags = KernelFlagsSet + + c.hasXSAVE = hasXSAVE + c.hasXSAVEOPT = hasXSAVEOPT } // StackTop returns the kernel's stack address. @@ -248,19 +251,21 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { regs.Ss = uint64(Udata) // Ditto. // Perform the switch. - swapgs() // GS will be swapped on return. - WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS. - LoadFloatingPoint(switchOpts.FloatingPointState.BytePointer()) // escapes: no. Copy in floating point. + needIRET := uint64(0) if switchOpts.FullRestore { - vector = iret(c, regs, uintptr(userCR3)) - } else { - vector = sysret(c, regs, uintptr(userCR3)) + needIRET = 1 } - SaveFloatingPoint(switchOpts.FloatingPointState.BytePointer()) // escapes: no. Copy out floating point. - RestoreKernelFPState() // escapes: no. Restore kernel MXCSR. + vector = doSwitchToUser(c, regs, switchOpts.FloatingPointState.BytePointer(), userCR3, needIRET) // escapes: no. return } +func doSwitchToUser( + cpu *CPU, // +0(FP) + regs *arch.Registers, // +8(FP) + fpState *byte, // +16(FP) + userCR3 uint64, // +24(FP) + needIRET uint64) Vector // +32(FP), +40(FP) + var ( sentryXCR0 uintptr sentryXCR0Once sync.Once @@ -287,7 +292,7 @@ func initSentryXCR0() { //go:nosplit func startGo(c *CPU) { // Save per-cpu. 
- WriteGS(kernelAddr(c.kernelEntry)) + writeGS(kernelAddr(c.kernelEntry)) // // TODO(mpratt): Note that per the note above, this should be done diff --git a/pkg/ring0/lib_amd64.go b/pkg/ring0/lib_amd64.go index 05c394ff5..c42a5b205 100644 --- a/pkg/ring0/lib_amd64.go +++ b/pkg/ring0/lib_amd64.go @@ -21,29 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" ) -// LoadFloatingPoint loads floating point state by the most efficient mechanism -// available (set by Init). -var LoadFloatingPoint func(*byte) - -// SaveFloatingPoint saves floating point state by the most efficient mechanism -// available (set by Init). -var SaveFloatingPoint func(*byte) - -// fxrstor uses fxrstor64 to load floating point state. -func fxrstor(*byte) - -// xrstor uses xrstor to load floating point state. -func xrstor(*byte) - -// fxsave uses fxsave64 to save floating point state. -func fxsave(*byte) - -// xsave uses xsave to save floating point state. -func xsave(*byte) - -// xsaveopt uses xsaveopt to save floating point state. -func xsaveopt(*byte) - // writeFS sets the FS base address (selects one of wrfsbase or wrfsmsr). func writeFS(addr uintptr) @@ -53,8 +30,8 @@ func wrfsbase(addr uintptr) // wrfsmsr writes to the FS_BASE MSR. func wrfsmsr(addr uintptr) -// WriteGS sets the GS address (set by init). -var WriteGS func(addr uintptr) +// writeGS sets the GS address (selects one of wrgsbase or wrgsmsr). +func writeGS(addr uintptr) // wrgsbase writes to the GS base address. 
func wrgsbase(addr uintptr) @@ -106,19 +83,4 @@ func Init(featureSet *cpuid.FeatureSet) { hasXSAVE = featureSet.UseXsave() hasFSGSBASE = featureSet.HasFeature(cpuid.X86FeatureFSGSBase) validXCR0Mask = uintptr(featureSet.ValidXCR0Mask()) - if hasXSAVEOPT { - SaveFloatingPoint = xsaveopt - LoadFloatingPoint = xrstor - } else if hasXSAVE { - SaveFloatingPoint = xsave - LoadFloatingPoint = xrstor - } else { - SaveFloatingPoint = fxsave - LoadFloatingPoint = fxrstor - } - if hasFSGSBASE { - WriteGS = wrgsbase - } else { - WriteGS = wrgsmsr - } } diff --git a/pkg/ring0/lib_amd64.s b/pkg/ring0/lib_amd64.s index 8ed98fc84..0f283aaae 100644 --- a/pkg/ring0/lib_amd64.s +++ b/pkg/ring0/lib_amd64.s @@ -128,6 +128,29 @@ TEXT ·wrfsmsr(SB),NOSPLIT,$0-8 BYTE $0x0f; BYTE $0x30; RET +// writeGS writes to the GS base. +// +// This is written in assembly because it must be callable from assembly (ABI0) +// without an intermediate transition to ABIInternal. +// +// Preconditions: must be running in the lower address space, as it accesses +// global data. +TEXT ·writeGS(SB),NOSPLIT,$8-8 + MOVQ addr+0(FP), AX + + CMPB ·hasFSGSBASE(SB), $1 + JNE msr + + PUSHQ AX + CALL ·wrgsbase(SB) + POPQ AX + RET +msr: + PUSHQ AX + CALL ·wrgsmsr(SB) + POPQ AX + RET + // wrgsbase writes to the GS base. // // The code corresponds to: |