summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorAndrei Vagin <avagin@gmail.com>2021-03-16 09:15:03 -0700
committerAndrei Vagin <avagin@gmail.com>2021-03-16 21:55:20 -0700
commit2f3dac78ca9aa1abb9d27570bc9ece0f486ddb60 (patch)
tree195f9161e491c3f31ac6a1191e651f25f9743976
parentf7e841c2cede357c4cbd6117605e3f3d54f1961c (diff)
kvm: prefault a floating point state before restoring it
If physical pages of a memory region are not mapped yet, the kernel will trigger KVM_EXIT_MMIO and we will map physical pages in bluepillHandler(). An instruction that triggered a fault will not be re-executed, it will be emulated in the kernel, but it can't emulate complex instructions like xsave, xrstor. We can touch the memory with simple instructions to workaround this problem.
-rw-r--r--pkg/ring0/defs.go2
-rw-r--r--pkg/ring0/kernel_amd64.go12
-rw-r--r--pkg/ring0/kernel_arm64.go4
-rw-r--r--pkg/sentry/arch/arch.go4
-rw-r--r--pkg/sentry/arch/arch_aarch64.go8
-rw-r--r--pkg/sentry/arch/arch_x86.go14
-rw-r--r--pkg/sentry/platform/kvm/bluepill_amd64.go6
-rw-r--r--pkg/sentry/platform/kvm/bluepill_arm64.go10
-rw-r--r--pkg/sentry/platform/kvm/context.go2
-rw-r--r--pkg/sentry/platform/kvm/machine_amd64.go25
-rw-r--r--pkg/sentry/platform/kvm/machine_arm64.go2
-rw-r--r--pkg/sentry/platform/ptrace/ptrace_unsafe.go8
12 files changed, 60 insertions, 37 deletions
diff --git a/pkg/ring0/defs.go b/pkg/ring0/defs.go
index e2561e4c2..e8ce608ba 100644
--- a/pkg/ring0/defs.go
+++ b/pkg/ring0/defs.go
@@ -96,7 +96,7 @@ type SwitchOpts struct {
// FloatingPointState is a byte pointer where floating point state is
// saved and restored.
- FloatingPointState *byte
+ FloatingPointState arch.FloatingPointData
// PageTables are the application page tables.
PageTables *pagetables.PageTables
diff --git a/pkg/ring0/kernel_amd64.go b/pkg/ring0/kernel_amd64.go
index 36a60700e..e9e706716 100644
--- a/pkg/ring0/kernel_amd64.go
+++ b/pkg/ring0/kernel_amd64.go
@@ -239,17 +239,17 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
regs.Ss = uint64(Udata) // Ditto.
// Perform the switch.
- swapgs() // GS will be swapped on return.
- WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS.
- WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS.
- LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point.
+ swapgs() // GS will be swapped on return.
+ WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS.
+ WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS.
+ LoadFloatingPoint(&switchOpts.FloatingPointState[0]) // escapes: no. Copy in floating point.
if switchOpts.FullRestore {
vector = iret(c, regs, uintptr(userCR3))
} else {
vector = sysret(c, regs, uintptr(userCR3))
}
- SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point.
- WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS.
+ SaveFloatingPoint(&switchOpts.FloatingPointState[0]) // escapes: no. Copy out floating point.
+ WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS.
return
}
diff --git a/pkg/ring0/kernel_arm64.go b/pkg/ring0/kernel_arm64.go
index 41909b3a0..c9a120952 100644
--- a/pkg/ring0/kernel_arm64.go
+++ b/pkg/ring0/kernel_arm64.go
@@ -62,7 +62,7 @@ func IsCanonical(addr uint64) bool {
//go:nosplit
func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
storeAppASID(uintptr(switchOpts.UserASID))
- storeEl0Fpstate(switchOpts.FloatingPointState)
+ storeEl0Fpstate(&switchOpts.FloatingPointState[0])
if switchOpts.Flush {
FlushTlbByASID(uintptr(switchOpts.UserASID))
@@ -82,7 +82,7 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
fpDisableTrap = CPACREL1()
if fpDisableTrap != 0 {
- SaveFloatingPoint(switchOpts.FloatingPointState)
+ SaveFloatingPoint(&switchOpts.FloatingPointState[0])
}
vector = c.vecCode
diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go
index dd2effdf9..3443b9e1b 100644
--- a/pkg/sentry/arch/arch.go
+++ b/pkg/sentry/arch/arch.go
@@ -54,7 +54,7 @@ func (a Arch) String() string {
// We rely on the individual arch implementations to meet all the necessary
// requirements. For example, on x86 the region must be 16-byte aligned and 512
// bytes in size.
-type FloatingPointData byte
+type FloatingPointData []byte
// Context provides architecture-dependent information for a specific thread.
//
@@ -187,7 +187,7 @@ type Context interface {
ClearSingleStep()
// FloatingPointData will be passed to underlying save routines.
- FloatingPointData() *FloatingPointData
+ FloatingPointData() FloatingPointData
// NewMmapLayout returns a layout for a new MM, where MinAddr for the
// returned layout must be no lower than min, and MaxAddr for the returned
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
index fd73751e7..6b81e9708 100644
--- a/pkg/sentry/arch/arch_aarch64.go
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -88,15 +88,15 @@ func (f aarch64FPState) fork() aarch64FPState {
}
// FloatingPointData returns the raw data pointer.
-func (f aarch64FPState) FloatingPointData() *FloatingPointData {
- return (*FloatingPointData)(&f[0])
+func (f aarch64FPState) FloatingPointData() FloatingPointData {
+ return ([]byte)(f)
}
// NewFloatingPointData returns a new floating point data blob.
//
// This is primarily for use in tests.
-func NewFloatingPointData() *FloatingPointData {
- return (*FloatingPointData)(&(newAarch64FPState()[0]))
+func NewFloatingPointData() FloatingPointData {
+ return ([]byte)(newAarch64FPState())
}
// State contains the common architecture bits for aarch64 (the build tag of this
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index 641ada92f..91edf0703 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -115,7 +115,7 @@ var (
type x86FPState []byte
// initX86FPState (defined in asm files) sets up initial state.
-func initX86FPState(data *FloatingPointData, useXsave bool)
+func initX86FPState(data *byte, useXsave bool)
func newX86FPStateSlice() []byte {
size, align := cpuid.HostFeatureSet().ExtendedStateSize()
@@ -139,7 +139,7 @@ func newX86FPStateSlice() []byte {
// CPUID we must ensure it does not contain any sentry state.
func newX86FPState() x86FPState {
f := x86FPState(newX86FPStateSlice())
- initX86FPState(f.FloatingPointData(), cpuid.HostFeatureSet().UseXsave())
+ initX86FPState(&f.FloatingPointData()[0], cpuid.HostFeatureSet().UseXsave())
return f
}
@@ -151,15 +151,15 @@ func (f x86FPState) fork() x86FPState {
}
// FloatingPointData returns the raw data pointer.
-func (f x86FPState) FloatingPointData() *FloatingPointData {
- return (*FloatingPointData)(&f[0])
+func (f x86FPState) FloatingPointData() FloatingPointData {
+ return []byte(f)
}
// NewFloatingPointData returns a new floating point data blob.
//
// This is primarily for use in tests.
-func NewFloatingPointData() *FloatingPointData {
- return (*FloatingPointData)(&(newX86FPState()[0]))
+func NewFloatingPointData() FloatingPointData {
+ return (FloatingPointData)(newX86FPState())
}
// Proto returns a protobuf representation of the system registers in State.
@@ -442,7 +442,7 @@ func sanitizeMXCSR(f x86FPState) {
mxcsr := usermem.ByteOrder.Uint32(f[mxcsrOffset:])
initMXCSRMask.Do(func() {
temp := x86FPState(alignedBytes(uint(ptraceFPRegsSize), 16))
- initX86FPState(temp.FloatingPointData(), false /* useXsave */)
+ initX86FPState(&temp.FloatingPointData()[0], false /* useXsave */)
mxcsrMask = usermem.ByteOrder.Uint32(temp[mxcsrMaskOffset:])
if mxcsrMask == 0 {
// "If the value of the MXCSR_MASK field is 00000000H, then the
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go
index f4b9a5321..308696efe 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.go
@@ -73,7 +73,7 @@ func (c *vCPU) KernelSyscall() {
// We only trigger a bluepill entry in the bluepill function, and can
// therefore be guaranteed that there is no floating point state to be
// loaded on resuming from halt. We only worry about saving on exit.
- ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no.
+ ring0.SaveFloatingPoint(&c.floatingPointState[0]) // escapes: no.
ring0.Halt()
ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no, reload host segment.
}
@@ -92,7 +92,7 @@ func (c *vCPU) KernelException(vector ring0.Vector) {
regs.Rip = 0
}
// See above.
- ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) // escapes: no.
+ ring0.SaveFloatingPoint(&c.floatingPointState[0]) // escapes: no.
ring0.Halt()
ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no; reload host segment.
}
@@ -124,5 +124,5 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
// Set the context pointer to the saved floating point state. This is
// where the guest data has been serialized, the kernel will restore
// from this new pointer value.
- context.Fpstate = uint64(uintptrValue((*byte)(c.floatingPointState)))
+ context.Fpstate = uint64(uintptrValue(&c.floatingPointState[0]))
}
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go
index e26b7da8d..c317f1e99 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.go
@@ -92,7 +92,7 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
lazyVfp := c.GetLazyVFP()
if lazyVfp != 0 {
- fpsimd := fpsimdPtr((*byte)(c.floatingPointState))
+ fpsimd := fpsimdPtr(&c.floatingPointState[0])
context.Fpsimd64.Fpsr = fpsimd.Fpsr
context.Fpsimd64.Fpcr = fpsimd.Fpcr
context.Fpsimd64.Vregs = fpsimd.Vregs
@@ -112,12 +112,12 @@ func (c *vCPU) KernelSyscall() {
fpDisableTrap := ring0.CPACREL1()
if fpDisableTrap != 0 {
- fpsimd := fpsimdPtr((*byte)(c.floatingPointState))
+ fpsimd := fpsimdPtr(&c.floatingPointState[0])
fpcr := ring0.GetFPCR()
fpsr := ring0.GetFPSR()
fpsimd.Fpcr = uint32(fpcr)
fpsimd.Fpsr = uint32(fpsr)
- ring0.SaveVRegs((*byte)(c.floatingPointState))
+ ring0.SaveVRegs(&c.floatingPointState[0])
}
ring0.Halt()
@@ -136,12 +136,12 @@ func (c *vCPU) KernelException(vector ring0.Vector) {
fpDisableTrap := ring0.CPACREL1()
if fpDisableTrap != 0 {
- fpsimd := fpsimdPtr((*byte)(c.floatingPointState))
+ fpsimd := fpsimdPtr(&c.floatingPointState[0])
fpcr := ring0.GetFPCR()
fpsr := ring0.GetFPSR()
fpsimd.Fpcr = uint32(fpcr)
fpsimd.Fpsr = uint32(fpsr)
- ring0.SaveVRegs((*byte)(c.floatingPointState))
+ ring0.SaveVRegs(&c.floatingPointState[0])
}
ring0.Halt()
diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go
index aeae01dbd..706fa53dc 100644
--- a/pkg/sentry/platform/kvm/context.go
+++ b/pkg/sentry/platform/kvm/context.go
@@ -65,7 +65,7 @@ func (c *context) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac a
// Prepare switch options.
switchOpts := ring0.SwitchOpts{
Registers: &ac.StateData().Regs,
- FloatingPointState: (*byte)(ac.FloatingPointData()),
+ FloatingPointState: ac.FloatingPointData(),
PageTables: localAS.pageTables,
Flush: localAS.Touch(cpu),
FullRestore: ac.FullRestore(),
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 6e583baa3..916903881 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -70,7 +70,7 @@ type vCPUArchState struct {
// floatingPointState is the floating point state buffer used in guest
// to host transitions. See usage in bluepill_amd64.go.
- floatingPointState *arch.FloatingPointData
+ floatingPointState arch.FloatingPointData
}
const (
@@ -293,6 +293,28 @@ func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, e
return accessType, platform.ErrContextSignal
}
+//go:nosplit
+//go:noinline
+func loadByte(ptr *byte) byte {
+ return *ptr
+}
+
+// prefaultFloatingPointState touches each page of the floating point state to
+// be sure that its physical pages are mapped.
+//
+// Otherwise the kernel can trigger KVM_EXIT_MMIO and an instruction that
+// triggered a fault will be emulated by the kvm kernel code, but it can't
+// emulate instructions like xsave and xrstor.
+//
+//go:nosplit
+func prefaultFloatingPointState(data arch.FloatingPointData) {
+ size := len(data)
+ for i := 0; i < size; i += usermem.PageSize {
+ loadByte(&(data)[i])
+ }
+ loadByte(&(data)[size-1])
+}
+
// SwitchToUser unpacks architectural-details.
func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) {
// Check for canonical addresses.
@@ -323,6 +345,7 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
// allocations occur.
entersyscall()
bluepill(c)
+ prefaultFloatingPointState(switchOpts.FloatingPointState)
vector = c.CPU.SwitchToUser(switchOpts)
exitsyscall()
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index 7d7857067..3d715e570 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -32,7 +32,7 @@ type vCPUArchState struct {
// floatingPointState is the floating point state buffer used in guest
// to host transitions. See usage in bluepill_arm64.go.
- floatingPointState *arch.FloatingPointData
+ floatingPointState arch.FloatingPointData
}
const (
diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
index 2c21f946e..6259350ec 100644
--- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go
+++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
@@ -62,9 +62,9 @@ func (t *thread) setRegs(regs *arch.Registers) error {
}
// getFPRegs gets the floating-point data via the GETREGSET ptrace unix.
-func (t *thread) getFPRegs(fpState *arch.FloatingPointData, fpLen uint64, useXsave bool) error {
+func (t *thread) getFPRegs(fpState arch.FloatingPointData, fpLen uint64, useXsave bool) error {
iovec := unix.Iovec{
- Base: (*byte)(fpState),
+ Base: (*byte)(&fpState[0]),
Len: fpLen,
}
_, _, errno := unix.RawSyscall6(
@@ -81,9 +81,9 @@ func (t *thread) getFPRegs(fpState *arch.FloatingPointData, fpLen uint64, useXsa
}
// setFPRegs sets the floating-point data via the SETREGSET ptrace unix.
-func (t *thread) setFPRegs(fpState *arch.FloatingPointData, fpLen uint64, useXsave bool) error {
+func (t *thread) setFPRegs(fpState arch.FloatingPointData, fpLen uint64, useXsave bool) error {
iovec := unix.Iovec{
- Base: (*byte)(fpState),
+ Base: (*byte)(&fpState[0]),
Len: fpLen,
}
_, _, errno := unix.RawSyscall6(