diff options
author | Andrei Vagin <avagin@gmail.com> | 2021-03-16 09:15:03 -0700 |
---|---|---|
committer | Andrei Vagin <avagin@gmail.com> | 2021-03-16 21:55:20 -0700 |
commit | 2f3dac78ca9aa1abb9d27570bc9ece0f486ddb60 (patch) | |
tree | 195f9161e491c3f31ac6a1191e651f25f9743976 | |
parent | f7e841c2cede357c4cbd6117605e3f3d54f1961c (diff) |
kvm: prefault a floating point state before restoring it
If physical pages of a memory region are not mapped yet, the kernel will
trigger KVM_EXIT_MMIO and we will map physical pages in bluepillHandler().
The instruction that triggered the fault will not be re-executed; it
will be emulated in the kernel, but the kernel can't emulate complex
instructions like xsave and xrstor. We can touch the memory with
simple instructions to work around this problem.
-rw-r--r-- | pkg/ring0/defs.go | 2 | ||||
-rw-r--r-- | pkg/ring0/kernel_amd64.go | 12 | ||||
-rw-r--r-- | pkg/ring0/kernel_arm64.go | 4 | ||||
-rw-r--r-- | pkg/sentry/arch/arch.go | 4 | ||||
-rw-r--r-- | pkg/sentry/arch/arch_aarch64.go | 8 | ||||
-rw-r--r-- | pkg/sentry/arch/arch_x86.go | 14 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/bluepill_amd64.go | 6 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/bluepill_arm64.go | 10 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/context.go | 2 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine_amd64.go | 25 | ||||
-rw-r--r-- | pkg/sentry/platform/kvm/machine_arm64.go | 2 | ||||
-rw-r--r-- | pkg/sentry/platform/ptrace/ptrace_unsafe.go | 8 |
12 files changed, 60 insertions, 37 deletions
diff --git a/pkg/ring0/defs.go b/pkg/ring0/defs.go index e2561e4c2..e8ce608ba 100644 --- a/pkg/ring0/defs.go +++ b/pkg/ring0/defs.go @@ -96,7 +96,7 @@ type SwitchOpts struct { // FloatingPointState is a byte pointer where floating point state is // saved and restored. - FloatingPointState *byte + FloatingPointState arch.FloatingPointData // PageTables are the application page tables. PageTables *pagetables.PageTables diff --git a/pkg/ring0/kernel_amd64.go b/pkg/ring0/kernel_amd64.go index 36a60700e..e9e706716 100644 --- a/pkg/ring0/kernel_amd64.go +++ b/pkg/ring0/kernel_amd64.go @@ -239,17 +239,17 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { regs.Ss = uint64(Udata) // Ditto. // Perform the switch. - swapgs() // GS will be swapped on return. - WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS. - WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS. - LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point. + swapgs() // GS will be swapped on return. + WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS. + WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS. + LoadFloatingPoint(&switchOpts.FloatingPointState[0]) // escapes: no. Copy in floating point. if switchOpts.FullRestore { vector = iret(c, regs, uintptr(userCR3)) } else { vector = sysret(c, regs, uintptr(userCR3)) } - SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point. - WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS. + SaveFloatingPoint(&switchOpts.FloatingPointState[0]) // escapes: no. Copy out floating point. + WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS. 
return } diff --git a/pkg/ring0/kernel_arm64.go b/pkg/ring0/kernel_arm64.go index 41909b3a0..c9a120952 100644 --- a/pkg/ring0/kernel_arm64.go +++ b/pkg/ring0/kernel_arm64.go @@ -62,7 +62,7 @@ func IsCanonical(addr uint64) bool { //go:nosplit func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { storeAppASID(uintptr(switchOpts.UserASID)) - storeEl0Fpstate(switchOpts.FloatingPointState) + storeEl0Fpstate(&switchOpts.FloatingPointState[0]) if switchOpts.Flush { FlushTlbByASID(uintptr(switchOpts.UserASID)) @@ -82,7 +82,7 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { fpDisableTrap = CPACREL1() if fpDisableTrap != 0 { - SaveFloatingPoint(switchOpts.FloatingPointState) + SaveFloatingPoint(&switchOpts.FloatingPointState[0]) } vector = c.vecCode diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index dd2effdf9..3443b9e1b 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -54,7 +54,7 @@ func (a Arch) String() string { // We rely on the individual arch implementations to meet all the necessary // requirements. For example, on x86 the region must be 16-byte aligned and 512 // bytes in size. -type FloatingPointData byte +type FloatingPointData []byte // Context provides architecture-dependent information for a specific thread. // @@ -187,7 +187,7 @@ type Context interface { ClearSingleStep() // FloatingPointData will be passed to underlying save routines. - FloatingPointData() *FloatingPointData + FloatingPointData() FloatingPointData // NewMmapLayout returns a layout for a new MM, where MinAddr for the // returned layout must be no lower than min, and MaxAddr for the returned diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go index fd73751e7..6b81e9708 100644 --- a/pkg/sentry/arch/arch_aarch64.go +++ b/pkg/sentry/arch/arch_aarch64.go @@ -88,15 +88,15 @@ func (f aarch64FPState) fork() aarch64FPState { } // FloatingPointData returns the raw data pointer. 
-func (f aarch64FPState) FloatingPointData() *FloatingPointData { - return (*FloatingPointData)(&f[0]) +func (f aarch64FPState) FloatingPointData() FloatingPointData { + return ([]byte)(f) } // NewFloatingPointData returns a new floating point data blob. // // This is primarily for use in tests. -func NewFloatingPointData() *FloatingPointData { - return (*FloatingPointData)(&(newAarch64FPState()[0])) +func NewFloatingPointData() FloatingPointData { + return ([]byte)(newAarch64FPState()) } // State contains the common architecture bits for aarch64 (the build tag of this diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index 641ada92f..91edf0703 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -115,7 +115,7 @@ var ( type x86FPState []byte // initX86FPState (defined in asm files) sets up initial state. -func initX86FPState(data *FloatingPointData, useXsave bool) +func initX86FPState(data *byte, useXsave bool) func newX86FPStateSlice() []byte { size, align := cpuid.HostFeatureSet().ExtendedStateSize() @@ -139,7 +139,7 @@ func newX86FPStateSlice() []byte { // CPUID we must ensure it does not contain any sentry state. func newX86FPState() x86FPState { f := x86FPState(newX86FPStateSlice()) - initX86FPState(f.FloatingPointData(), cpuid.HostFeatureSet().UseXsave()) + initX86FPState(&f.FloatingPointData()[0], cpuid.HostFeatureSet().UseXsave()) return f } @@ -151,15 +151,15 @@ func (f x86FPState) fork() x86FPState { } // FloatingPointData returns the raw data pointer. -func (f x86FPState) FloatingPointData() *FloatingPointData { - return (*FloatingPointData)(&f[0]) +func (f x86FPState) FloatingPointData() FloatingPointData { + return []byte(f) } // NewFloatingPointData returns a new floating point data blob. // // This is primarily for use in tests. 
-func NewFloatingPointData() *FloatingPointData { - return (*FloatingPointData)(&(newX86FPState()[0])) +func NewFloatingPointData() FloatingPointData { + return (FloatingPointData)(newX86FPState()) } // Proto returns a protobuf representation of the system registers in State. @@ -442,7 +442,7 @@ func sanitizeMXCSR(f x86FPState) { mxcsr := usermem.ByteOrder.Uint32(f[mxcsrOffset:]) initMXCSRMask.Do(func() { temp := x86FPState(alignedBytes(uint(ptraceFPRegsSize), 16)) - initX86FPState(temp.FloatingPointData(), false /* useXsave */) + initX86FPState(&temp.FloatingPointData()[0], false /* useXsave */) mxcsrMask = usermem.ByteOrder.Uint32(temp[mxcsrMaskOffset:]) if mxcsrMask == 0 { // "If the value of the MXCSR_MASK field is 00000000H, then the diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go index f4b9a5321..308696efe 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.go +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -73,7 +73,7 @@ func (c *vCPU) KernelSyscall() { // We only trigger a bluepill entry in the bluepill function, and can // therefore be guaranteed that there is no floating point state to be // loaded on resuming from halt. We only worry about saving on exit. - ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no. + ring0.SaveFloatingPoint(&c.floatingPointState[0]) // escapes: no. ring0.Halt() ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no, reload host segment. } @@ -92,7 +92,7 @@ func (c *vCPU) KernelException(vector ring0.Vector) { regs.Rip = 0 } // See above. - ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no. + ring0.SaveFloatingPoint(&c.floatingPointState[0]) // escapes: no. ring0.Halt() ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no; reload host segment. } @@ -124,5 +124,5 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { // Set the context pointer to the saved floating point state. 
This is // where the guest data has been serialized, the kernel will restore // from this new pointer value. - context.Fpstate = uint64(uintptrValue((*byte)(c.floatingPointState))) + context.Fpstate = uint64(uintptrValue(&c.floatingPointState[0])) } diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go index e26b7da8d..c317f1e99 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64.go @@ -92,7 +92,7 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { lazyVfp := c.GetLazyVFP() if lazyVfp != 0 { - fpsimd := fpsimdPtr((*byte)(c.floatingPointState)) + fpsimd := fpsimdPtr(&c.floatingPointState[0]) context.Fpsimd64.Fpsr = fpsimd.Fpsr context.Fpsimd64.Fpcr = fpsimd.Fpcr context.Fpsimd64.Vregs = fpsimd.Vregs @@ -112,12 +112,12 @@ func (c *vCPU) KernelSyscall() { fpDisableTrap := ring0.CPACREL1() if fpDisableTrap != 0 { - fpsimd := fpsimdPtr((*byte)(c.floatingPointState)) + fpsimd := fpsimdPtr(&c.floatingPointState[0]) fpcr := ring0.GetFPCR() fpsr := ring0.GetFPSR() fpsimd.Fpcr = uint32(fpcr) fpsimd.Fpsr = uint32(fpsr) - ring0.SaveVRegs((*byte)(c.floatingPointState)) + ring0.SaveVRegs(&c.floatingPointState[0]) } ring0.Halt() @@ -136,12 +136,12 @@ func (c *vCPU) KernelException(vector ring0.Vector) { fpDisableTrap := ring0.CPACREL1() if fpDisableTrap != 0 { - fpsimd := fpsimdPtr((*byte)(c.floatingPointState)) + fpsimd := fpsimdPtr(&c.floatingPointState[0]) fpcr := ring0.GetFPCR() fpsr := ring0.GetFPSR() fpsimd.Fpcr = uint32(fpcr) fpsimd.Fpsr = uint32(fpsr) - ring0.SaveVRegs((*byte)(c.floatingPointState)) + ring0.SaveVRegs(&c.floatingPointState[0]) } ring0.Halt() diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index aeae01dbd..706fa53dc 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -65,7 +65,7 @@ func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac a // Prepare 
switch options. switchOpts := ring0.SwitchOpts{ Registers: &ac.StateData().Regs, - FloatingPointState: (*byte)(ac.FloatingPointData()), + FloatingPointState: ac.FloatingPointData(), PageTables: localAS.pageTables, Flush: localAS.Touch(cpu), FullRestore: ac.FullRestore(), diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 6e583baa3..916903881 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -70,7 +70,7 @@ type vCPUArchState struct { // floatingPointState is the floating point state buffer used in guest // to host transitions. See usage in bluepill_amd64.go. - floatingPointState *arch.FloatingPointData + floatingPointState arch.FloatingPointData } const ( @@ -293,6 +293,28 @@ func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, e return accessType, platform.ErrContextSignal } +//go:nosplit +//go:noinline +func loadByte(ptr *byte) byte { + return *ptr +} + +// prefaultFloatingPointState touches each page of the floating point state to +// be sure that its physical pages are mapped. +// +// Otherwise the kernel can trigger KVM_EXIT_MMIO and an instruction that +// triggered a fault will be emulated by the kvm kernel code, but it can't +// emulate instructions like xsave and xrstor. +// +//go:nosplit +func prefaultFloatingPointState(data arch.FloatingPointData) { + size := len(data) + for i := 0; i < size; i += usermem.PageSize { + loadByte(&(data)[i]) + } + loadByte(&(data)[size-1]) +} + // SwitchToUser unpacks architectural-details. func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) { // Check for canonical addresses. @@ -323,6 +345,7 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) // allocations occur. 
entersyscall() bluepill(c) + prefaultFloatingPointState(switchOpts.FloatingPointState) vector = c.CPU.SwitchToUser(switchOpts) exitsyscall() diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index 7d7857067..3d715e570 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -32,7 +32,7 @@ type vCPUArchState struct { // floatingPointState is the floating point state buffer used in guest // to host transitions. See usage in bluepill_arm64.go. - floatingPointState *arch.FloatingPointData + floatingPointState arch.FloatingPointData } const ( diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index 2c21f946e..6259350ec 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -62,9 +62,9 @@ func (t *thread) setRegs(regs *arch.Registers) error { } // getFPRegs gets the floating-point data via the GETREGSET ptrace unix. -func (t *thread) getFPRegs(fpState *arch.FloatingPointData, fpLen uint64, useXsave bool) error { +func (t *thread) getFPRegs(fpState arch.FloatingPointData, fpLen uint64, useXsave bool) error { iovec := unix.Iovec{ - Base: (*byte)(fpState), + Base: (*byte)(&fpState[0]), Len: fpLen, } _, _, errno := unix.RawSyscall6( @@ -81,9 +81,9 @@ func (t *thread) getFPRegs(fpState *arch.FloatingPointData, fpLen uint64, useXsa } // setFPRegs sets the floating-point data via the SETREGSET ptrace unix. -func (t *thread) setFPRegs(fpState *arch.FloatingPointData, fpLen uint64, useXsave bool) error { +func (t *thread) setFPRegs(fpState arch.FloatingPointData, fpLen uint64, useXsave bool) error { iovec := unix.Iovec{ - Base: (*byte)(fpState), + Base: (*byte)(&fpState[0]), Len: fpLen, } _, _, errno := unix.RawSyscall6( |