From fb613020c7db323c705adf6ae0f954bee4ab5fec Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 31 Oct 2018 15:58:21 -0700 Subject: kvm: simplify floating point logic. This reduces the number of floating point save/restore cycles required (since we don't need to restore immediately following the switch, this always happens in a known context) and allows the kernel hooks to capture state. This lets us remove calls like "Current()". PiperOrigin-RevId: 219552844 Change-Id: I7676fa2f6c18b9919718458aa888b832a7db8cab --- pkg/sentry/platform/kvm/bluepill_amd64.go | 46 ++++++++++++-------------- pkg/sentry/platform/kvm/bluepill_unsafe.go | 7 ++++ pkg/sentry/platform/kvm/machine.go | 4 +-- pkg/sentry/platform/kvm/machine_amd64.go | 10 ++++++ pkg/sentry/platform/ring0/defs.go | 52 +++++++++++++++++------------- pkg/sentry/platform/ring0/entry_amd64.s | 41 +++++++---------------- pkg/sentry/platform/ring0/kernel.go | 34 ++++++++++++------- pkg/sentry/platform/ring0/kernel_amd64.go | 2 +- pkg/sentry/platform/ring0/offsets_amd64.go | 2 -- 9 files changed, 103 insertions(+), 95 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index f013d1dc9..6520682d7 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -47,8 +47,8 @@ func redpill() { // bluepillArchEnter is called during bluepillEnter. // //go:nosplit -func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) { - c = vCPUPtr(uintptr(context.Rax)) +func bluepillArchEnter(context *arch.SignalContext64) *vCPU { + c := vCPUPtr(uintptr(context.Rax)) regs := c.CPU.Registers() regs.R8 = context.R8 regs.R9 = context.R9 @@ -73,50 +73,41 @@ func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) { regs.Cs = uint64(ring0.Kcode) regs.Ds = uint64(ring0.Udata) regs.Es = uint64(ring0.Udata) - regs.Fs = uint64(ring0.Udata) regs.Ss = uint64(ring0.Kdata) - - // ring0 uses GS exclusively, so we use GS_base to store the location - // of the floating point address. - // - // The address will be restored directly after running the VCPU, and - // will be saved again prior to halting. We rely on the fact that the - // SaveFloatingPointer/LoadFloatingPoint functions use the most - // efficient mechanism available (including compression) so the state - // size is guaranteed to be less than what's pointed to here. - regs.Gs_base = uint64(context.Fpstate) - return + return c } -// bluepillSyscall handles kernel syscalls. +// KernelSyscall handles kernel syscalls. // //go:nosplit -func bluepillSyscall() { - regs := ring0.Current().Registers() +func (c *vCPU) KernelSyscall() { + regs := c.Registers() if regs.Rax != ^uint64(0) { regs.Rip -= 2 // Rewind. } - ring0.SaveFloatingPoint(bytePtr(uintptr(regs.Gs_base))) + // We only trigger a bluepill entry in the bluepill function, and can + // therefore be guaranteed that there is no floating point state to be + // loaded on resuming from halt. We only worry about saving on exit. + ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) ring0.Halt() ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment. - ring0.LoadFloatingPoint(bytePtr(uintptr(regs.Gs_base))) } -// bluepillException handles kernel exceptions. +// KernelException handles kernel exceptions. // //go:nosplit -func bluepillException(vector ring0.Vector) { - regs := ring0.Current().Registers() +func (c *vCPU) KernelException(vector ring0.Vector) { + regs := c.Registers() if vector == ring0.Vector(bounce) { // These should not interrupt kernel execution; point the Rip // to zero to ensure that we get a reasonable panic when we - // attempt to return. + // attempt to return and a full stack trace. regs.Rip = 0 } - ring0.SaveFloatingPoint(bytePtr(uintptr(regs.Gs_base))) + // See above. + ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) ring0.Halt() ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment. - ring0.LoadFloatingPoint(bytePtr(uintptr(regs.Gs_base))) } // bluepillArchExit is called during bluepillEnter. @@ -142,4 +133,9 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { context.Rsp = regs.Rsp context.Rip = regs.Rip context.Eflags = regs.Eflags + + // Set the context pointer to the saved floating point state. This is + // where the guest data has been serialized, the kernel will restore + // from this new pointer value. + context.Fpstate = uint64(uintptrValue((*byte)(c.floatingPointState))) } diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go index 77cf7e800..2605f8c93 100644 --- a/pkg/sentry/platform/kvm/bluepill_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -37,6 +37,13 @@ func bytePtr(addr uintptr) *byte { return (*byte)(unsafe.Pointer(addr)) } +// uintptrValue returns a uintptr for the given address. +// +//go:nosplit +func uintptrValue(addr *byte) uintptr { + return (uintptr)(unsafe.Pointer(addr)) +} + // bluepillHandler is called from the signal stub. // // The world may be stopped while this is executing, and it executes on the diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 4ba3a185a..deead1b5f 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -142,9 +142,7 @@ func (m *machine) newVCPU() *vCPU { fd: int(fd), machine: m, } - c.CPU.Init(&m.kernel) - c.CPU.KernelSyscall = bluepillSyscall - c.CPU.KernelException = bluepillException + c.CPU.Init(&m.kernel, c) m.vCPUsByID[c.id] = c // Ensure the signal mask is correct. diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index c03792a1b..5ad805b8b 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -63,6 +63,10 @@ type vCPUArchState struct { // // This starts above fixedKernelPCID. PCIDs *pagetables.PCIDs + + // floatingPointState is the floating point state buffer used in guest + // to host transitions. See usage in bluepill_amd64.go. + floatingPointState *arch.FloatingPointData } const ( @@ -149,6 +153,12 @@ func (c *vCPU) initArchState() error { return err } + // Allocate some floating point state save area for the local vCPU. + // This will be saved prior to leaving the guest, and we restore from + // this always. We cannot use the pointer in the context alone because + // we don't know how large the area there is in reality. + c.floatingPointState = arch.NewFloatingPointData() + // Set the time offset to the host native time. return c.setSystemTime() } diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go index 18137e55d..98d0a6de0 100644 --- a/pkg/sentry/platform/ring0/defs.go +++ b/pkg/sentry/platform/ring0/defs.go @@ -38,6 +38,33 @@ type Kernel struct { KernelArchState } +// Hooks are hooks for kernel functions. +type Hooks interface { + // KernelSyscall is called for kernel system calls. + // + // Return from this call will restore registers and return to the kernel: the + // registers must be modified directly. + // + // If this function is not provided, a kernel exception results in halt. + // + // This must be go:nosplit, as this will be on the interrupt stack. + // Closures are permitted, as the pointer to the closure frame is not + // passed on the stack. + KernelSyscall() + + // KernelException handles an exception during kernel execution. + // + // Return from this call will restore registers and return to the kernel: the + // registers must be modified directly. + // + // If this function is not provided, a kernel exception results in halt. + // + // This must be go:nosplit, as this will be on the interrupt stack. + // Closures are permitted, as the pointer to the closure frame is not + // passed on the stack. + KernelException(Vector) +} + // CPU is the per-CPU struct. type CPU struct { // self is a self reference. @@ -58,29 +85,8 @@ type CPU struct { // calls and exceptions via the Registers function. registers syscall.PtraceRegs - // KernelException handles an exception during kernel execution. - // - // Return from this call will restore registers and return to the kernel: the - // registers must be modified directly. - // - // If this function is not provided, a kernel exception results in halt. - // - // This must be go:nosplit, as this will be on the interrupt stack. - // Closures are permitted, as the pointer to the closure frame is not - // passed on the stack. - KernelException func(Vector) - - // KernelSyscall is called for kernel system calls. - // - // Return from this call will restore registers and return to the kernel: the - // registers must be modified directly. - // - // If this function is not provided, a kernel exception results in halt. - // - // This must be go:nosplit, as this will be on the interrupt stack. - // Closures are permitted, as the pointer to the closure frame is not - // passed on the stack. - KernelSyscall func() + // hooks are kernel hooks. + hooks Hooks } // Registers returns a modifiable-copy of the kernel registers. diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s index d48fbd2d1..afb040a6f 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -90,12 +90,6 @@ TEXT ·Halt(SB),NOSPLIT,$0 HLT RET -// See kernel.go. -TEXT ·Current(SB),NOSPLIT,$0-8 - MOVQ CPU_SELF(GS), AX - MOVQ AX, ret+0(FP) - RET - // See entry_amd64.go. TEXT ·swapgs(SB),NOSPLIT,$0 SWAP_GS() @@ -205,19 +199,12 @@ kernel: MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code. MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel. - // Load the function stored in KernelSyscall. - // - // Note that this function needs to be executed on the stack in case - // the runtime decides to make use of the redzone (grumble). This also - // protects against any functions that might not be go:nosplit, since - // this will cause a failure immediately. + // Call the syscall trampoline. LOAD_KERNEL_STACK(GS) - MOVQ CPU_KERNEL_SYSCALL(GS), DX // Function data. - MOVQ 0(DX), AX // Function pointer. - PUSHQ BP // Push the frame pointer. - MOVQ SP, BP // Set frame pointer value. - CALL *AX // Call the function. - POPQ BP // Restore the frame pointer. + MOVQ CPU_SELF(GS), AX // Load vCPU. + PUSHQ AX // First argument (vCPU). + CALL ·kernelSyscall(SB) // Call the trampoline. + POPQ AX // Pop vCPU. JMP ·resume(SB) // exception is a generic exception handler. @@ -287,18 +274,14 @@ kernel: MOVQ 0(SP), BX // BX contains the vector. ADDQ $48, SP // Drop the exception frame. - // Load the function stored in KernelException. - // - // See note above re: the kernel stack. + // Call the exception trampoline. LOAD_KERNEL_STACK(GS) - MOVQ CPU_KERNEL_EXCEPTION(GS), DX // Function data. - MOVQ 0(DX), AX // Function pointer. - PUSHQ BP // Push the frame pointer. - MOVQ SP, BP // Set frame pointer value. - PUSHQ BX // First argument (vector). - CALL *AX // Call the function. - POPQ BX // Discard the argument. - POPQ BP // Restore the frame pointer. + MOVQ CPU_SELF(GS), AX // Load vCPU. + PUSHQ BX // Second argument (vector). + PUSHQ AX // First argument (vCPU). + CALL ·kernelException(SB) // Call the trampoline. + POPQ BX // Pop vector. + POPQ AX // Pop vCPU. JMP ·resume(SB) #define EXCEPTION_WITH_ERROR(value, symbol) \ diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go index e70eafde2..19ac6eb7c 100644 --- a/pkg/sentry/platform/ring0/kernel.go +++ b/pkg/sentry/platform/ring0/kernel.go @@ -26,31 +26,41 @@ func (k *Kernel) Init(opts KernelOpts) { // Halt halts execution. func Halt() -// Current returns the current CPU. +// defaultHooks implements hooks. +type defaultHooks struct{} + +// KernelSyscall implements Hooks.KernelSyscall. // -// Its use is only legal in the KernelSyscall and KernelException contexts, -// which must all be guarded go:nosplit. -func Current() *CPU +//go:nosplit +func (defaultHooks) KernelSyscall() { Halt() } + +// KernelException implements Hooks.KernelException. +// +//go:nosplit +func (defaultHooks) KernelException(Vector) { Halt() } -// defaultSyscall is the default syscall hook. +// kernelSyscall is a trampoline. // //go:nosplit -func defaultSyscall() { Halt() } +func kernelSyscall(c *CPU) { c.hooks.KernelSyscall() } -// defaultException is the default exception hook. +// kernelException is a trampoline. // //go:nosplit -func defaultException(Vector) { Halt() } +func kernelException(c *CPU, vector Vector) { c.hooks.KernelException(vector) } // Init initializes a new CPU. // // Init allows embedding in other objects. -func (c *CPU) Init(k *Kernel) { +func (c *CPU) Init(k *Kernel, hooks Hooks) { c.self = c // Set self reference. c.kernel = k // Set kernel reference. c.init() // Perform architectural init. - // Defaults. - c.KernelSyscall = defaultSyscall - c.KernelException = defaultException + // Require hooks. + if hooks != nil { + c.hooks = hooks + } else { + c.hooks = defaultHooks{} + } } diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index ab562bca7..9e8c56a54 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -204,7 +204,7 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { func start(c *CPU) { // Save per-cpu & FS segment. WriteGS(kernelAddr(c)) - WriteFS(uintptr(c.Registers().Fs_base)) + WriteFS(uintptr(c.registers.Fs_base)) // Initialize floating point. // diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go index 753d31ef8..806e07ec0 100644 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -34,8 +34,6 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack))) fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_KERNEL_EXCEPTION 0x%02x\n", reflect.ValueOf(&c.KernelException).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_KERNEL_SYSCALL 0x%02x\n", reflect.ValueOf(&c.KernelSyscall).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "\n// Bits.\n") fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF) -- cgit v1.2.3