summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorgVisor bot <gvisor-bot@google.com>2021-07-12 15:06:49 +0000
committergVisor bot <gvisor-bot@google.com>2021-07-12 15:06:49 +0000
commitbf97c23d1679690f0df5f7daa32795a839399f49 (patch)
treed2acc0cacccf318e5266bed7ee9cc8807e1ae4b8
parenta6333d999d80e980576c11a558b69ad3106952d2 (diff)
parent36a17a814bf90bad33eac25ddbb7a416143a4be7 (diff)
Merge release-20210628.0-35-g36a17a814 (automated)
-rw-r--r--pkg/ring0/defs_impl_amd64.go4
-rw-r--r--pkg/ring0/entry_amd64.go97
-rw-r--r--pkg/ring0/entry_impl_amd64.s151
-rw-r--r--pkg/ring0/kernel_amd64.go33
-rw-r--r--pkg/ring0/kernel_unsafe.go5
-rw-r--r--pkg/ring0/lib_amd64.go6
-rw-r--r--pkg/ring0/lib_amd64.s23
-rw-r--r--pkg/sentry/platform/kvm/bluepill_amd64.go27
-rw-r--r--pkg/sentry/platform/kvm/bluepill_impl_amd64.s4
-rw-r--r--pkg/sentry/platform/kvm/bluepill_unsafe.go7
-rw-r--r--pkg/sentry/platform/kvm/machine_amd64.go2
11 files changed, 280 insertions, 79 deletions
diff --git a/pkg/ring0/defs_impl_amd64.go b/pkg/ring0/defs_impl_amd64.go
index cd8f735c0..8005c6d94 100644
--- a/pkg/ring0/defs_impl_amd64.go
+++ b/pkg/ring0/defs_impl_amd64.go
@@ -318,8 +318,8 @@ func Emit(w io.Writer) {
fmt.Fprintf(w, "#define PTRACE_FLAGS 0x%02x\n", reflect.ValueOf(&p.Eflags).Pointer()-reflect.ValueOf(p).Pointer())
fmt.Fprintf(w, "#define PTRACE_RSP 0x%02x\n", reflect.ValueOf(&p.Rsp).Pointer()-reflect.ValueOf(p).Pointer())
fmt.Fprintf(w, "#define PTRACE_SS 0x%02x\n", reflect.ValueOf(&p.Ss).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_FS 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer())
- fmt.Fprintf(w, "#define PTRACE_GS 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_FS_BASE 0x%02x\n", reflect.ValueOf(&p.Fs_base).Pointer()-reflect.ValueOf(p).Pointer())
+ fmt.Fprintf(w, "#define PTRACE_GS_BASE 0x%02x\n", reflect.ValueOf(&p.Gs_base).Pointer()-reflect.ValueOf(p).Pointer())
}
// Useful bits.
diff --git a/pkg/ring0/entry_amd64.go b/pkg/ring0/entry_amd64.go
index d87b1fd00..397ccac7b 100644
--- a/pkg/ring0/entry_amd64.go
+++ b/pkg/ring0/entry_amd64.go
@@ -31,6 +31,13 @@ import (
// executed from kernel mode or not and the appropriate stub is called.
func sysenter()
+// addrOfSysenter returns the start address of sysenter.
+//
+// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal
+// wrapper function rather than the function itself. We must reference from
+// assembly to get the ABI0 (i.e., primary) address.
+func addrOfSysenter() uintptr
+
// swapgs swaps the current GS value.
//
// This must be called prior to sysret/iret.
@@ -39,6 +46,9 @@ func swapgs()
// jumpToKernel jumps to the kernel version of the current RIP.
func jumpToKernel()
+// jumpToUser jumps to the user version of the current RIP.
+func jumpToUser()
+
// sysret returns to userspace from a system call.
//
// The return code is the vector that interrupted execution.
@@ -65,7 +75,12 @@ func exception()
// This is used when processing kernel exceptions and syscalls.
func resume()
-// Start is the CPU entrypoint.
+// start is the CPU entrypoint.
+//
+// See requirements below.
+func start()
+
+// AddrOfStart returns the address of the CPU entrypoint.
//
// The following start conditions must be satisfied:
//
@@ -78,7 +93,11 @@ func resume()
// * c.EFER() should be the current EFER value.
//
// The CPU state will be set to c.Registers().
-func Start()
+//
+// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal
+// wrapper function rather than the function itself. We must reference from
+// assembly to get the ABI0 (i.e., primary) address.
+func AddrOfStart() uintptr
// Exception stubs.
func divideByZero()
@@ -104,28 +123,56 @@ func virtualizationException()
func securityException()
func syscallInt80()
+// These return the start addresses of the functions above.
+//
+// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal
+// wrapper function rather than the function itself. We must reference from
+// assembly to get the ABI0 (i.e., primary) address.
+func addrOfDivideByZero() uintptr
+func addrOfDebug() uintptr
+func addrOfNMI() uintptr
+func addrOfBreakpoint() uintptr
+func addrOfOverflow() uintptr
+func addrOfBoundRangeExceeded() uintptr
+func addrOfInvalidOpcode() uintptr
+func addrOfDeviceNotAvailable() uintptr
+func addrOfDoubleFault() uintptr
+func addrOfCoprocessorSegmentOverrun() uintptr
+func addrOfInvalidTSS() uintptr
+func addrOfSegmentNotPresent() uintptr
+func addrOfStackSegmentFault() uintptr
+func addrOfGeneralProtectionFault() uintptr
+func addrOfPageFault() uintptr
+func addrOfX87FloatingPointException() uintptr
+func addrOfAlignmentCheck() uintptr
+func addrOfMachineCheck() uintptr
+func addrOfSimdFloatingPointException() uintptr
+func addrOfVirtualizationException() uintptr
+func addrOfSecurityException() uintptr
+func addrOfSyscallInt80() uintptr
+
// Exception handler index.
-var handlers = map[Vector]func(){
- DivideByZero: divideByZero,
- Debug: debug,
- NMI: nmi,
- Breakpoint: breakpoint,
- Overflow: overflow,
- BoundRangeExceeded: boundRangeExceeded,
- InvalidOpcode: invalidOpcode,
- DeviceNotAvailable: deviceNotAvailable,
- DoubleFault: doubleFault,
- CoprocessorSegmentOverrun: coprocessorSegmentOverrun,
- InvalidTSS: invalidTSS,
- SegmentNotPresent: segmentNotPresent,
- StackSegmentFault: stackSegmentFault,
- GeneralProtectionFault: generalProtectionFault,
- PageFault: pageFault,
- X87FloatingPointException: x87FloatingPointException,
- AlignmentCheck: alignmentCheck,
- MachineCheck: machineCheck,
- SIMDFloatingPointException: simdFloatingPointException,
- VirtualizationException: virtualizationException,
- SecurityException: securityException,
- SyscallInt80: syscallInt80,
+var handlers = map[Vector]uintptr{
+ DivideByZero: addrOfDivideByZero(),
+ Debug: addrOfDebug(),
+ NMI: addrOfNMI(),
+ Breakpoint: addrOfBreakpoint(),
+ Overflow: addrOfOverflow(),
+ BoundRangeExceeded: addrOfBoundRangeExceeded(),
+ InvalidOpcode: addrOfInvalidOpcode(),
+ DeviceNotAvailable: addrOfDeviceNotAvailable(),
+ DoubleFault: addrOfDoubleFault(),
+ CoprocessorSegmentOverrun: addrOfCoprocessorSegmentOverrun(),
+ InvalidTSS: addrOfInvalidTSS(),
+ SegmentNotPresent: addrOfSegmentNotPresent(),
+ StackSegmentFault: addrOfStackSegmentFault(),
+ GeneralProtectionFault: addrOfGeneralProtectionFault(),
+ PageFault: addrOfPageFault(),
+ X87FloatingPointException: addrOfX87FloatingPointException(),
+ AlignmentCheck: addrOfAlignmentCheck(),
+ MachineCheck: addrOfMachineCheck(),
+ SIMDFloatingPointException: addrOfSimdFloatingPointException(),
+ VirtualizationException: addrOfVirtualizationException(),
+ SecurityException: addrOfSecurityException(),
+ SyscallInt80: addrOfSyscallInt80(),
}
diff --git a/pkg/ring0/entry_impl_amd64.s b/pkg/ring0/entry_impl_amd64.s
index 9a5d6c064..1d0262a18 100644
--- a/pkg/ring0/entry_impl_amd64.s
+++ b/pkg/ring0/entry_impl_amd64.s
@@ -66,8 +66,8 @@
#define PTRACE_FLAGS 0x90
#define PTRACE_RSP 0x98
#define PTRACE_SS 0xa0
-#define PTRACE_FS 0xa8
-#define PTRACE_GS 0xb0
+#define PTRACE_FS_BASE 0xa8
+#define PTRACE_GS_BASE 0xb0
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
@@ -158,11 +158,33 @@
#define LOAD_KERNEL_STACK(entry) \
MOVQ ENTRY_STACK_TOP(entry), SP;
+// ADDR_OF_FUNC defines a function named 'name' that returns the address of
+// 'symbol'.
+#define ADDR_OF_FUNC(name, symbol) \
+TEXT name,$0-8; \
+ MOVQ $symbol, AX; \
+ MOVQ AX, ret+0(FP); \
+ RET
+
// See kernel.go.
TEXT ·Halt(SB),NOSPLIT,$0
HLT
RET
+// See kernel_amd64.go.
+TEXT ·HaltAndWriteFSBase(SB),NOSPLIT,$8-8
+ HLT
+
+ // Restore FS_BASE.
+ MOVQ regs+0(FP), AX
+ MOVQ PTRACE_FS_BASE(AX), AX
+
+ PUSHQ AX // First argument (FS_BASE)
+ CALL ·writeFS(SB)
+ POPQ AX
+
+ RET
+
// See entry_amd64.go.
TEXT ·swapgs(SB),NOSPLIT,$0
SWAP_GS()
@@ -177,8 +199,29 @@ TEXT ·jumpToKernel(SB),NOSPLIT,$0
MOVQ AX, 0(SP)
RET
+// jumpToUser changes execution to the user address space.
+//
+// This works by changing the return value to the user version.
+TEXT ·jumpToUser(SB),NOSPLIT,$0
+ // N.B. we can't access KernelStartAddress from the upper half (data
+ // pages not available), so just naively clear all the upper bits.
+ // We are assuming a 47-bit virtual address space.
+ MOVQ $0x00007fffffffffff, AX
+ MOVQ 0(SP), BX
+ ANDQ BX, AX // Future return value.
+ MOVQ AX, 0(SP)
+ RET
+
// See entry_amd64.go.
TEXT ·sysret(SB),NOSPLIT,$0-24
+ // Set application FS. We can't do this in Go because Go code needs FS.
+ MOVQ regs+8(FP), AX
+ MOVQ PTRACE_FS_BASE(AX), AX
+
+ PUSHQ AX
+ CALL ·writeFS(SB)
+ POPQ AX
+
CALL ·jumpToKernel(SB)
// Save original state and stack. sysenter() or exception()
// from APP(gr3) will switch to this stack, set the return
@@ -212,6 +255,14 @@ TEXT ·sysret(SB),NOSPLIT,$0-24
// See entry_amd64.go.
TEXT ·iret(SB),NOSPLIT,$0-24
+ // Set application FS. We can't do this in Go because Go code needs FS.
+ MOVQ regs+8(FP), AX
+ MOVQ PTRACE_FS_BASE(AX), AX
+
+ PUSHQ AX // First argument (FS_BASE)
+ CALL ·writeFS(SB)
+ POPQ AX
+
CALL ·jumpToKernel(SB)
// Save original state and stack. sysenter() or exception()
// from APP(gr3) will switch to this stack, set the return
@@ -254,13 +305,29 @@ TEXT ·resume(SB),NOSPLIT,$0
IRET()
// See entry_amd64.go.
-TEXT ·Start(SB),NOSPLIT,$0
+TEXT ·start(SB),NOSPLIT,$0
+ // N.B. This is the vCPU entrypoint. It is not called from Go code and
+ // thus pushes and pops values on the stack until calling into Go
+ // (startGo) because we aren't usually a typical Go assembly frame.
+
PUSHQ $0x0 // Previous frame pointer.
MOVQ SP, BP // Set frame pointer.
- PUSHQ AX // First argument (CPU).
- CALL ·start(SB) // Call Go hook.
+
+ PUSHQ AX // Save CPU.
+
+ // Set up environment required by Go before calling startGo: Go needs
+ // FS_BASE and floating point initialized.
+ MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX
+ PUSHQ BX // First argument (FS_BASE)
+ CALL ·writeFS(SB)
+ POPQ BX
+
+ // First argument (CPU) already at bottom of stack.
+ CALL ·startGo(SB) // Call Go hook.
JMP ·resume(SB) // Restore to registers.
+ADDR_OF_FUNC(·AddrOfStart(SB), ·start(SB));
+
// See entry_amd64.go.
TEXT ·sysenter(SB),NOSPLIT,$0
// _RFLAGS_IOPL0 is always set in the user mode and it is never set in
@@ -288,6 +355,18 @@ user:
MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code.
MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user.
+ CALL ·jumpToUser(SB)
+
+ // Restore kernel FS_BASE.
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+ MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX
+
+ PUSHQ BX // First argument (FS_BASE)
+ CALL ·writeFS(SB)
+ POPQ BX
+
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+
// Return to the kernel, where the frame is:
//
// vector (sp+32)
@@ -322,6 +401,8 @@ kernel:
POPQ AX // Pop vCPU.
JMP ·resume(SB)
+ADDR_OF_FUNC(·addrOfSysenter(SB), ·sysenter(SB));
+
// exception is a generic exception handler.
//
// There are two cases handled:
@@ -368,6 +449,16 @@ user:
MOVQ 40(SP), DI; MOVQ DI, PTRACE_RSP(AX)
MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX)
+ CALL ·jumpToUser(SB)
+
+ // Restore kernel FS_BASE.
+ MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
+ MOVQ CPU_REGISTERS+PTRACE_FS_BASE(AX), BX
+
+ PUSHQ BX // First argument (FS_BASE)
+ CALL ·writeFS(SB)
+ POPQ BX
+
// Copy out and return.
MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU.
MOVQ 0(SP), BX // Load vector.
@@ -406,36 +497,38 @@ kernel:
POPQ AX // Pop vCPU.
JMP ·resume(SB)
-#define EXCEPTION_WITH_ERROR(value, symbol) \
+#define EXCEPTION_WITH_ERROR(value, symbol, addr) \
+ADDR_OF_FUNC(addr, symbol); \
TEXT symbol,NOSPLIT,$0; \
PUSHQ $value; \
JMP ·exception(SB);
-#define EXCEPTION_WITHOUT_ERROR(value, symbol) \
+#define EXCEPTION_WITHOUT_ERROR(value, symbol, addr) \
+ADDR_OF_FUNC(addr, symbol); \
TEXT symbol,NOSPLIT,$0; \
PUSHQ $0x0; \
PUSHQ $value; \
JMP ·exception(SB);
-EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB))
-EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB))
-EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB))
-EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB))
-EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB))
-EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB))
-EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB))
-EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB))
-EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB))
-EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB))
-EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB))
-EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB))
-EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB))
-EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB))
-EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB))
-EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB))
-EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB))
-EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB))
-EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB))
-EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB))
-EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB))
-EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB))
+EXCEPTION_WITHOUT_ERROR(DivideByZero, ·divideByZero(SB), ·addrOfDivideByZero(SB))
+EXCEPTION_WITHOUT_ERROR(Debug, ·debug(SB), ·addrOfDebug(SB))
+EXCEPTION_WITHOUT_ERROR(NMI, ·nmi(SB), ·addrOfNMI(SB))
+EXCEPTION_WITHOUT_ERROR(Breakpoint, ·breakpoint(SB), ·addrOfBreakpoint(SB))
+EXCEPTION_WITHOUT_ERROR(Overflow, ·overflow(SB), ·addrOfOverflow(SB))
+EXCEPTION_WITHOUT_ERROR(BoundRangeExceeded, ·boundRangeExceeded(SB), ·addrOfBoundRangeExceeded(SB))
+EXCEPTION_WITHOUT_ERROR(InvalidOpcode, ·invalidOpcode(SB), ·addrOfInvalidOpcode(SB))
+EXCEPTION_WITHOUT_ERROR(DeviceNotAvailable, ·deviceNotAvailable(SB), ·addrOfDeviceNotAvailable(SB))
+EXCEPTION_WITH_ERROR(DoubleFault, ·doubleFault(SB), ·addrOfDoubleFault(SB))
+EXCEPTION_WITHOUT_ERROR(CoprocessorSegmentOverrun, ·coprocessorSegmentOverrun(SB), ·addrOfCoprocessorSegmentOverrun(SB))
+EXCEPTION_WITH_ERROR(InvalidTSS, ·invalidTSS(SB), ·addrOfInvalidTSS(SB))
+EXCEPTION_WITH_ERROR(SegmentNotPresent, ·segmentNotPresent(SB), ·addrOfSegmentNotPresent(SB))
+EXCEPTION_WITH_ERROR(StackSegmentFault, ·stackSegmentFault(SB), ·addrOfStackSegmentFault(SB))
+EXCEPTION_WITH_ERROR(GeneralProtectionFault, ·generalProtectionFault(SB), ·addrOfGeneralProtectionFault(SB))
+EXCEPTION_WITH_ERROR(PageFault, ·pageFault(SB), ·addrOfPageFault(SB))
+EXCEPTION_WITHOUT_ERROR(X87FloatingPointException, ·x87FloatingPointException(SB), ·addrOfX87FloatingPointException(SB))
+EXCEPTION_WITH_ERROR(AlignmentCheck, ·alignmentCheck(SB), ·addrOfAlignmentCheck(SB))
+EXCEPTION_WITHOUT_ERROR(MachineCheck, ·machineCheck(SB), ·addrOfMachineCheck(SB))
+EXCEPTION_WITHOUT_ERROR(SIMDFloatingPointException, ·simdFloatingPointException(SB), ·addrOfSimdFloatingPointException(SB))
+EXCEPTION_WITHOUT_ERROR(VirtualizationException, ·virtualizationException(SB), ·addrOfVirtualizationException(SB))
+EXCEPTION_WITH_ERROR(SecurityException, ·securityException(SB), ·addrOfSecurityException(SB))
+EXCEPTION_WITHOUT_ERROR(SyscallInt80, ·syscallInt80(SB), ·addrOfSyscallInt80(SB))
diff --git a/pkg/ring0/kernel_amd64.go b/pkg/ring0/kernel_amd64.go
index f63af8b76..b5c4a39e3 100644
--- a/pkg/ring0/kernel_amd64.go
+++ b/pkg/ring0/kernel_amd64.go
@@ -21,8 +21,13 @@ import (
"reflect"
"gvisor.dev/gvisor/pkg/hostarch"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
)
+// HaltAndWriteFSBase halts execution. On resume, it sets FS_BASE from the
+// value in regs.
+func HaltAndWriteFSBase(regs *arch.Registers)
+
// init initializes architecture-specific state.
func (k *Kernel) init(maxCPUs int) {
entrySize := reflect.TypeOf(kernelEntry{}).Size()
@@ -240,7 +245,6 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
// Perform the switch.
swapgs() // GS will be swapped on return.
- WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS.
WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS.
LoadFloatingPoint(switchOpts.FloatingPointState.BytePointer()) // escapes: no. Copy in floating point.
if switchOpts.FullRestore {
@@ -249,38 +253,49 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
vector = sysret(c, regs, uintptr(userCR3))
}
SaveFloatingPoint(switchOpts.FloatingPointState.BytePointer()) // escapes: no. Copy out floating point.
- WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS.
RestoreKernelFPState() // escapes: no. Restore kernel MXCSR.
return
}
var sentryXCR0 = xgetbv(0)
-// start is the CPU entrypoint.
+// startGo is the CPU entrypoint.
//
-// This is called from the Start asm stub (see entry_amd64.go); on return the
+// This is called from the start asm stub (see entry_amd64.go); on return the
// registers in c.registers will be restored (not segments).
//
+// Note that any code written in Go should adhere to Go's expected environment:
+// * Initialized floating point state (required for optimizations using
+// floating point instructions).
+// * Go TLS in FS_BASE (this is required by splittable functions, calls into
+// the runtime, calls to assembly functions (Go 1.17+ ABI wrappers access
+// TLS)).
+//
//go:nosplit
-func start(c *CPU) {
- // Save per-cpu & FS segment.
+func startGo(c *CPU) {
+ // Save per-cpu.
WriteGS(kernelAddr(c.kernelEntry))
- WriteFS(uintptr(c.registers.Fs_base))
+ //
+ // TODO(mpratt): Note that per the note above, this should be done
+ // before entering Go code. However for simplicity we leave it here for
+ // now, since the small critical sections with undefined FPU state
+ // should only contain very limited use of floating point instructions
+ // (notably, use of XMM15 as a zero register).
fninit()
// Need to sync XCR0 with the host, because xsave and xrstor can be
// called from different contexts.
xsetbv(0, sentryXCR0)
// Set the syscall target.
- wrmsr(_MSR_LSTAR, kernelFunc(sysenter))
+ wrmsr(_MSR_LSTAR, kernelFunc(addrOfSysenter()))
wrmsr(_MSR_SYSCALL_MASK, KernelFlagsClear|_RFLAGS_DF)
// NOTE: This depends on having the 64-bit segments immediately
// following the 32-bit user segments. This is simply the way the
// sysret instruction is designed to work (it assumes they follow).
wrmsr(_MSR_STAR, uintptr(uint64(Kcode)<<32|uint64(Ucode32)<<48))
- wrmsr(_MSR_CSTAR, kernelFunc(sysenter))
+ wrmsr(_MSR_CSTAR, kernelFunc(addrOfSysenter()))
}
// SetCPUIDFaulting sets CPUID faulting per the boolean value.
diff --git a/pkg/ring0/kernel_unsafe.go b/pkg/ring0/kernel_unsafe.go
index 16955ad91..04c60d0a7 100644
--- a/pkg/ring0/kernel_unsafe.go
+++ b/pkg/ring0/kernel_unsafe.go
@@ -35,7 +35,6 @@ func kernelAddr(obj interface{}) uintptr {
// kernelFunc returns the address of the given function.
//
//go:nosplit
-func kernelFunc(fn func()) uintptr {
- fnptr := (**uintptr)(unsafe.Pointer(&fn))
- return KernelStartAddress | **fnptr
+func kernelFunc(fn uintptr) uintptr {
+ return KernelStartAddress | fn
}
diff --git a/pkg/ring0/lib_amd64.go b/pkg/ring0/lib_amd64.go
index 3e6bb9663..46746fd80 100644
--- a/pkg/ring0/lib_amd64.go
+++ b/pkg/ring0/lib_amd64.go
@@ -43,8 +43,8 @@ func xsave(*byte)
// xsaveopt uses xsaveopt to save floating point state.
func xsaveopt(*byte)
-// WriteFS sets the GS address (set by init).
-var WriteFS func(addr uintptr)
+// writeFS sets the FS base address (selects one of wrfsbase or wrfsmsr).
+func writeFS(addr uintptr)
// wrfsbase writes to the GS base address.
func wrfsbase(addr uintptr)
@@ -116,10 +116,8 @@ func Init(featureSet *cpuid.FeatureSet) {
LoadFloatingPoint = fxrstor
}
if hasFSGSBASE {
- WriteFS = wrfsbase
WriteGS = wrgsbase
} else {
- WriteFS = wrfsmsr
WriteGS = wrgsmsr
}
}
diff --git a/pkg/ring0/lib_amd64.s b/pkg/ring0/lib_amd64.s
index 70a43e79e..8ed98fc84 100644
--- a/pkg/ring0/lib_amd64.s
+++ b/pkg/ring0/lib_amd64.s
@@ -80,6 +80,29 @@ TEXT ·xsaveopt(SB),NOSPLIT,$0-8
BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x37;
RET
+// writeFS writes to the FS base.
+//
+// This is written in assembly because it must be safe to call before the Go
+// environment is set up. See comment on start().
+//
+// Preconditions: must be running in the lower address space, as it accesses
+// global data.
+TEXT ·writeFS(SB),NOSPLIT,$8-8
+ MOVQ addr+0(FP), AX
+
+ CMPB ·hasFSGSBASE(SB), $1
+ JNE msr
+
+ PUSHQ AX
+ CALL ·wrfsbase(SB)
+ POPQ AX
+ RET
+msr:
+ PUSHQ AX
+ CALL ·wrfsmsr(SB)
+ POPQ AX
+ RET
+
// wrfsbase writes to the FS base.
//
// The code corresponds to:
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go
index d761bbdee..73ea73742 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.go
@@ -74,8 +74,27 @@ func (c *vCPU) KernelSyscall() {
// therefore be guaranteed that there is no floating point state to be
// loaded on resuming from halt. We only worry about saving on exit.
ring0.SaveFloatingPoint(c.floatingPointState.BytePointer()) // escapes: no.
- ring0.Halt()
- ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no, reload host segment.
+ // N.B. Since KernelSyscall is called when the kernel makes a syscall,
+ // FS_BASE is already set for correct execution of this function.
+ //
+ // Refresher on syscall/exception handling:
+ // 1. When the sentry is in guest mode and makes a syscall, it goes to
+ // sysenter(), which saves the register state (including RIP of SYSCALL
+ // instruction) to vCPU.registers.
+ // 2. It then calls KernelSyscall, which rewinds the IP and executes
+ // HLT.
+ // 3. HLT does a VM-exit to bluepillHandler, which returns from the
+ // signal handler using vCPU.registers, directly to the SYSCALL
+ // instruction.
+ // 4. Later, when we want to re-use the vCPU (perhaps on a different
+ // host thread), we set the new thread's registers in vCPU.registers
+ // (as opposed to setting the KVM registers with KVM_SET_REGS).
+ // 5. KVM_RUN thus enters the guest with the old register state,
+ // immediately following the HLT instruction, returning here.
+ // 6. We then restore FS_BASE and the full registers from vCPU.registers
+ // to return from sysenter() back to the desired bluepill point from
+ // the host.
+ ring0.HaltAndWriteFSBase(regs) // escapes: no, reload host segment.
}
// KernelException handles kernel exceptions.
@@ -93,8 +112,8 @@ func (c *vCPU) KernelException(vector ring0.Vector) {
}
// See above.
ring0.SaveFloatingPoint(c.floatingPointState.BytePointer()) // escapes: no.
- ring0.Halt()
- ring0.WriteFS(uintptr(regs.Fs_base)) // escapes: no; reload host segment.
+ // See above.
+ ring0.HaltAndWriteFSBase(regs) // escapes: no, reload host segment.
}
// bluepillArchExit is called during bluepillEnter.
diff --git a/pkg/sentry/platform/kvm/bluepill_impl_amd64.s b/pkg/sentry/platform/kvm/bluepill_impl_amd64.s
index b7f1dd5ac..7ad9e4e76 100644
--- a/pkg/sentry/platform/kvm/bluepill_impl_amd64.s
+++ b/pkg/sentry/platform/kvm/bluepill_impl_amd64.s
@@ -66,8 +66,8 @@
#define PTRACE_FLAGS 0x90
#define PTRACE_RSP 0x98
#define PTRACE_SS 0xa0
-#define PTRACE_FS 0xa8
-#define PTRACE_GS 0xb0
+#define PTRACE_FS_BASE 0xa8
+#define PTRACE_GS_BASE 0xb0
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 6f87236ad..06fcf1d2e 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -85,6 +85,13 @@ func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {
// signal stack. It should only execute raw system calls and functions that are
// explicitly marked go:nosplit.
//
+// Ideally, this function should switch to gsignal, as runtime.sigtramp does,
+// but that is tedious given all the runtime internals. That said, using
+// gsignal inside a signal handler is not _required_, provided we avoid stack
+// splits and allocations. Note that calling any splittable function here will
+// be flaky; if the signal stack is below the G stack then we will trigger a
+// split and crash. If above, we won't trigger a split.
+//
// +checkescape:all
//
//go:nosplit
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 7a10fd812..b28a2c4e8 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -136,7 +136,7 @@ func (c *vCPU) initArchState() error {
}
// Set the entrypoint for the kernel.
- kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer())
+ kernelUserRegs.RIP = uint64(ring0.AddrOfStart())
kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer())
kernelUserRegs.RSP = c.StackTop()
kernelUserRegs.RFLAGS = ring0.KernelFlagsSet