From 5104bc5d60b0f21e90499ccba473aadb7d6f91e3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 24 Mar 2020 02:15:46 +0000 Subject: amd64: switch to userCR3 just before return to gr3 UserCR3 takes effect as late as possible so that less code is in the userCR3 environment. It is prepared for the next patches that make less code and data in the upper half, which is prepared for KPTI for gvisor. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan --- pkg/sentry/platform/ring0/entry_amd64.go | 4 +-- pkg/sentry/platform/ring0/entry_amd64.s | 50 +++++++++++++++++++++++-------- pkg/sentry/platform/ring0/kernel_amd64.go | 5 ++-- 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go index 7fa43c2f5..60e90af91 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.go +++ b/pkg/sentry/platform/ring0/entry_amd64.go @@ -41,7 +41,7 @@ func swapgs() // The return code is the vector that interrupted execution. // // See stubs.go for a note regarding the frame size of this function. -func sysret(*CPU, *arch.Registers) Vector +func sysret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector // "iret is the cadillac of CPL switching." // @@ -50,7 +50,7 @@ func sysret(*CPU, *arch.Registers) Vector // iret is nearly identical to sysret, except an iret is used to fully restore // all user state. This must be called in cases where all registers need to be // restored. -func iret(*CPU, *arch.Registers) Vector +func iret(cpu *CPU, regs *arch.Registers, userCR3 uintptr) Vector // exception is the generic exception entry. // diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s index 02df38331..f9aa00b4c 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -63,6 +63,15 @@ MOVQ offset+PTRACE_RSI(reg), SI; \ MOVQ offset+PTRACE_RDI(reg), DI; +// WRITE_CR3() writes the given CR3 value. +// +// The code corresponds to: +// +// mov %rax, %cr3 +// +#define WRITE_CR3() \ + BYTE $0x0f; BYTE $0x22; BYTE $0xd8; + // SWAP_GS swaps the kernel GS (CPU). #define SWAP_GS() \ BYTE $0x0F; BYTE $0x01; BYTE $0xf8; @@ -100,16 +109,27 @@ TEXT ·sysret(SB),NOSPLIT,$0-24 // Save original state. LOAD_KERNEL_ADDRESS(cpu+0(FP), BX) LOAD_KERNEL_ADDRESS(regs+8(FP), AX) + MOVQ userCR3+16(FP), CX MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) + // save SP AX userCR3 on the kernel stack. + LOAD_KERNEL_STACK(BX) + PUSHQ PTRACE_RSP(AX) + PUSHQ PTRACE_RAX(AX) + PUSHQ CX + // Restore user register state. REGISTERS_LOAD(AX, 0) MOVQ PTRACE_RIP(AX), CX // Needed for SYSRET. MOVQ PTRACE_FLAGS(AX), R11 // Needed for SYSRET. - MOVQ PTRACE_RSP(AX), SP // Restore the stack directly. - MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch). + + // restore userCR3, AX, SP. + POPQ AX // Get userCR3. + WRITE_CR3() // Switch to userCR3. + POPQ AX // Restore AX. + POPQ SP // Restore SP. SYSRET64() // See entry_amd64.go. @@ -117,19 +137,24 @@ TEXT ·iret(SB),NOSPLIT,$0-24 // Save original state. LOAD_KERNEL_ADDRESS(cpu+0(FP), BX) LOAD_KERNEL_ADDRESS(regs+8(FP), AX) + MOVQ userCR3+16(FP), CX MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) // Build an IRET frame & restore state. LOAD_KERNEL_STACK(BX) - MOVQ PTRACE_SS(AX), BX; PUSHQ BX - MOVQ PTRACE_RSP(AX), CX; PUSHQ CX - MOVQ PTRACE_FLAGS(AX), DX; PUSHQ DX - MOVQ PTRACE_CS(AX), DI; PUSHQ DI - MOVQ PTRACE_RIP(AX), SI; PUSHQ SI - REGISTERS_LOAD(AX, 0) // Restore most registers. - MOVQ PTRACE_RAX(AX), AX // Restore AX (scratch). + PUSHQ PTRACE_SS(AX) + PUSHQ PTRACE_RSP(AX) + PUSHQ PTRACE_FLAGS(AX) + PUSHQ PTRACE_CS(AX) + PUSHQ PTRACE_RIP(AX) + PUSHQ PTRACE_RAX(AX) // Save AX on kernel stack. + PUSHQ CX // Save userCR3 on kernel stack. + REGISTERS_LOAD(AX, 0) // Restore most registers. + POPQ AX // Get userCR3. + WRITE_CR3() // Switch to userCR3. + POPQ AX // Restore AX. IRET() // See entry_amd64.go. @@ -178,13 +203,14 @@ user: // Return to the kernel, where the frame is: // - // vector (sp+24) + // vector (sp+32) + // userCR3 (sp+24) // regs (sp+16) // cpu (sp+8) // vcpu.Switch (sp+0) // MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer. - MOVQ $Syscall, 24(SP) // Output vector. + MOVQ $Syscall, 32(SP) // Output vector. RET kernel: @@ -255,7 +281,7 @@ user: MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer. MOVQ CX, CPU_ERROR_CODE(GS) // Set error code. MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user. - MOVQ BX, 24(SP) // Output vector. + MOVQ BX, 32(SP) // Output vector. RET kernel: diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index d37981dbf..d3352933f 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -198,11 +198,10 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS. LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point. jumpToKernel() // Switch to upper half. - writeCR3(uintptr(userCR3)) // Change to user address space. if switchOpts.FullRestore { - vector = iret(c, regs) + vector = iret(c, regs, uintptr(userCR3)) } else { - vector = sysret(c, regs) + vector = sysret(c, regs, uintptr(userCR3)) } writeCR3(uintptr(kernelCR3)) // Return to kernel address space. jumpToUser() // Return to lower half. -- cgit v1.2.3 From 5a0f2f6e499125c4b0bde2f84cf0eccbb6d6df94 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 24 Mar 2020 04:20:00 +0000 Subject: amd64: switch to kernelCR3 when just return to gr0 KernelCR3 takes effect as early as possible so that less code is in the userCR3 environment. It is prepared for the next patches that make less code and data in the upper half, which is prepared for KPTI for gvisor. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan --- pkg/sentry/platform/ring0/defs_amd64.go | 3 +++ pkg/sentry/platform/ring0/entry_amd64.s | 16 ++++++++++++---- pkg/sentry/platform/ring0/kernel_amd64.go | 3 +-- pkg/sentry/platform/ring0/lib_amd64.go | 6 ------ pkg/sentry/platform/ring0/lib_amd64.s | 22 ---------------------- pkg/sentry/platform/ring0/offsets_amd64.go | 1 + 6 files changed, 17 insertions(+), 34 deletions(-) diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 9c6c2cf5c..9c709f271 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -85,6 +85,9 @@ type CPUArchState struct { // stack is the stack used for interrupts on this CPU. stack [256]byte + // kernelCR3 is the cr3 used for sentry kernel. + kernelCR3 uintptr + // errorCode is the error code from the last exception. errorCode uintptr diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s index f9aa00b4c..89ca1dcfa 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -189,10 +189,14 @@ TEXT ·sysenter(SB),NOSPLIT,$0 user: SWAP_GS() + MOVQ AX, CPU_REGISTERS+PTRACE_RCX(GS) // Save user AX on scratch. + MOVQ CPU_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. + WRITE_CR3() // Switch to kernel cr3. + XCHGQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Swap stacks. - XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for AX (regs). + MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Get user regs. REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. - MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Load saved AX value. + MOVQ CPU_REGISTERS+PTRACE_RCX(GS), BX // Load saved user AX value. MOVQ BX, PTRACE_RAX(AX) // Save everything else. MOVQ BX, PTRACE_ORIGRAX(AX) MOVQ CX, PTRACE_RIP(AX) @@ -263,9 +267,13 @@ user: SWAP_GS() ADDQ $-8, SP // Adjust for flags. MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ). - XCHGQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Swap for user regs. + PUSHQ AX // Save user AX on stack. + MOVQ CPU_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. + WRITE_CR3() // Switch to kernel cr3. + + MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Get user regs. REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. - MOVQ CPU_REGISTERS+PTRACE_RAX(GS), BX // Restore original AX. + POPQ BX // Restore original AX. MOVQ BX, PTRACE_RAX(AX) // Save it. MOVQ BX, PTRACE_ORIGRAX(AX) MOVQ 16(SP), BX; MOVQ BX, PTRACE_RIP(AX) diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index d3352933f..266bbe372 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -183,7 +183,7 @@ func IsCanonical(addr uint64) bool { //go:nosplit func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { userCR3 := switchOpts.PageTables.CR3(!switchOpts.Flush, switchOpts.UserPCID) - kernelCR3 := c.kernel.PageTables.CR3(true, switchOpts.KernelPCID) + c.kernelCR3 = uintptr(c.kernel.PageTables.CR3(true, switchOpts.KernelPCID)) // Sanitize registers. regs := switchOpts.Registers @@ -203,7 +203,6 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { } else { vector = sysret(c, regs, uintptr(userCR3)) } - writeCR3(uintptr(kernelCR3)) // Return to kernel address space. jumpToUser() // Return to lower half. SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point. WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS. diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go index ca968a036..7c9517153 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.go +++ b/pkg/sentry/platform/ring0/lib_amd64.go @@ -61,12 +61,6 @@ func wrgsbase(addr uintptr) // wrgsmsr writes to the GS_BASE MSR. func wrgsmsr(addr uintptr) -// writeCR3 writes the CR3 value. -func writeCR3(phys uintptr) - -// readCR3 reads the current CR3 value. -func readCR3() uintptr - // readCR2 reads the current CR2 value. func readCR2() uintptr diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s index 75d742750..c81061639 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.s +++ b/pkg/sentry/platform/ring0/lib_amd64.s @@ -152,28 +152,6 @@ TEXT ·jumpToKernel(SB),NOSPLIT,$0 MOVQ AX, 0(SP) RET -// writeCR3 writes the given CR3 value. -// -// The code corresponds to: -// -// mov %rax, %cr3 -// -TEXT ·writeCR3(SB),NOSPLIT,$0-8 - MOVQ cr3+0(FP), AX - BYTE $0x0f; BYTE $0x22; BYTE $0xd8; - RET - -// readCR3 reads the current CR3 value. -// -// The code corresponds to: -// -// mov %cr3, %rax -// -TEXT ·readCR3(SB),NOSPLIT,$0-8 - BYTE $0x0f; BYTE $0x20; BYTE $0xd8; - MOVQ AX, ret+0(FP) - RET - // readCR2 reads the current CR2 value. // // The code corresponds to: diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go index b8ab120a0..d6df12c2f 100644 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -33,6 +33,7 @@ func Emit(w io.Writer) { fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack))) + fmt.Fprintf(w, "#define CPU_KERNEL_CR3 0x%02x\n", reflect.ValueOf(&c.kernelCR3).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) -- cgit v1.2.3 From c4d364c7cea23e29befa5b708d14a7912f47ce37 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 24 Mar 2020 09:54:02 +0000 Subject: amd64: less code and data in the upper half Call jumpToKernel() in sysret()/iret() so that there is less code and data in the upper half, and, especially, current goroutine's stack and user regs will not be accessed from the upper half (also with the help from previous patches which make less code in userCR3 context). jumpToUser() will not be needed, because current goroutine's stack and return value in the stack is lower half address. It is prepared for KPTI for gvisor. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan --- pkg/sentry/platform/ring0/entry_amd64.go | 3 +++ pkg/sentry/platform/ring0/entry_amd64.s | 29 +++++++++++++++++++++++------ pkg/sentry/platform/ring0/kernel_amd64.go | 2 -- pkg/sentry/platform/ring0/lib_amd64.go | 6 ------ pkg/sentry/platform/ring0/lib_amd64.s | 25 ------------------------- 5 files changed, 26 insertions(+), 39 deletions(-) diff --git a/pkg/sentry/platform/ring0/entry_amd64.go b/pkg/sentry/platform/ring0/entry_amd64.go index 60e90af91..d87b1fd00 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.go +++ b/pkg/sentry/platform/ring0/entry_amd64.go @@ -36,6 +36,9 @@ func sysenter() // This must be called prior to sysret/iret. func swapgs() +// jumpToKernel jumps to the kernel version of the current RIP. +func jumpToKernel() + // sysret returns to userspace from a system call. // // The return code is the vector that interrupted execution. diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s index 89ca1dcfa..e54582ede 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -104,11 +104,24 @@ TEXT ·swapgs(SB),NOSPLIT,$0 SWAP_GS() RET +// jumpToKernel changes execution to the kernel address space. +// +// This works by changing the return value to the kernel version. +TEXT ·jumpToKernel(SB),NOSPLIT,$0 + MOVQ 0(SP), AX + ORQ ·KernelStartAddress(SB), AX // Future return value. + MOVQ AX, 0(SP) + RET + // See entry_amd64.go. TEXT ·sysret(SB),NOSPLIT,$0-24 - // Save original state. - LOAD_KERNEL_ADDRESS(cpu+0(FP), BX) - LOAD_KERNEL_ADDRESS(regs+8(FP), AX) + CALL ·jumpToKernel(SB) + // Save original state and stack. sysenter() or exception() + // from APP(gr3) will switch to this stack, set the return + // value (vector: 32(SP)) and then do RET, which will also + // automatically return to the lower half. + MOVQ cpu+0(FP), BX + MOVQ regs+8(FP), AX MOVQ userCR3+16(FP), CX MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) @@ -134,9 +147,13 @@ TEXT ·sysret(SB),NOSPLIT,$0-24 // See entry_amd64.go. TEXT ·iret(SB),NOSPLIT,$0-24 - // Save original state. - LOAD_KERNEL_ADDRESS(cpu+0(FP), BX) - LOAD_KERNEL_ADDRESS(regs+8(FP), AX) + CALL ·jumpToKernel(SB) + // Save original state and stack. sysenter() or exception() + // from APP(gr3) will switch to this stack, set the return + // value (vector: 32(SP)) and then do RET, which will also + // automatically return to the lower half. + MOVQ cpu+0(FP), BX + MOVQ regs+8(FP), AX MOVQ userCR3+16(FP), CX MOVQ SP, CPU_REGISTERS+PTRACE_RSP(BX) MOVQ BP, CPU_REGISTERS+PTRACE_RBP(BX) diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 266bbe372..196afd6bb 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -197,13 +197,11 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { WriteFS(uintptr(regs.Fs_base)) // escapes: no. Set application FS. WriteGS(uintptr(regs.Gs_base)) // escapes: no. Set application GS. LoadFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy in floating point. - jumpToKernel() // Switch to upper half. if switchOpts.FullRestore { vector = iret(c, regs, uintptr(userCR3)) } else { vector = sysret(c, regs, uintptr(userCR3)) } - jumpToUser() // Return to lower half. SaveFloatingPoint(switchOpts.FloatingPointState) // escapes: no. Copy out floating point. WriteFS(uintptr(c.registers.Fs_base)) // escapes: no. Restore kernel FS. return diff --git a/pkg/sentry/platform/ring0/lib_amd64.go b/pkg/sentry/platform/ring0/lib_amd64.go index 7c9517153..0ec5c3bc5 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.go +++ b/pkg/sentry/platform/ring0/lib_amd64.go @@ -64,12 +64,6 @@ func wrgsmsr(addr uintptr) // readCR2 reads the current CR2 value. func readCR2() uintptr -// jumpToKernel jumps to the kernel version of the current RIP. -func jumpToKernel() - -// jumpToUser jumps to the user version of the current RIP. -func jumpToUser() - // fninit initializes the floating point unit. func fninit() diff --git a/pkg/sentry/platform/ring0/lib_amd64.s b/pkg/sentry/platform/ring0/lib_amd64.s index c81061639..2fe83568a 100644 --- a/pkg/sentry/platform/ring0/lib_amd64.s +++ b/pkg/sentry/platform/ring0/lib_amd64.s @@ -127,31 +127,6 @@ TEXT ·wrgsmsr(SB),NOSPLIT,$0-8 BYTE $0x0f; BYTE $0x30; // WRMSR RET -// jumpToUser changes execution to the user address. -// -// This works by changing the return value to the user version. -TEXT ·jumpToUser(SB),NOSPLIT,$0 - MOVQ 0(SP), AX - MOVQ ·KernelStartAddress(SB), BX - NOTQ BX - ANDQ BX, SP // Switch the stack. - ANDQ BX, BP // Switch the frame pointer. - ANDQ BX, AX // Future return value. - MOVQ AX, 0(SP) - RET - -// jumpToKernel changes execution to the kernel address space. -// -// This works by changing the return value to the kernel version. -TEXT ·jumpToKernel(SB),NOSPLIT,$0 - MOVQ 0(SP), AX - MOVQ ·KernelStartAddress(SB), BX - ORQ BX, SP // Switch the stack. - ORQ BX, BP // Switch the frame pointer. - ORQ BX, AX // Future return value. - MOVQ AX, 0(SP) - RET - // readCR2 reads the current CR2 value. // // The code corresponds to: -- cgit v1.2.3 From d17425082d24109f79f5235dd6d1ced36ac2dd67 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 26 Mar 2020 02:57:25 +0000 Subject: amd64: don't check vcpu in bluepill() m.Get() has guaranteed that if any OS thread TID is in guest, m.vCPUs[TID] points to the vCPU in which the OS thread TID is running. So if m.Get() returns with the corrent context in guest, the vCPU of it must be the same as what Get() returns. So bluepill() doesn't need to check if the vCPU is matched or not. The check need to access to %gs register which will not points to vCPU later when KPTI for gvisor is enabled. We can still fetch the vCPU pointer from %gs later (when %gs points to kernelEntry), but it needs the ENTRY_CPU_SELF which is generated by ring0/offset_amd64.go. So we just simply remove the check. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan --- pkg/sentry/platform/kvm/bluepill_amd64.s | 20 -------------------- pkg/sentry/platform/kvm/machine.go | 5 +++++ 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s index 2bc34a435..13b58d2c4 100644 --- a/pkg/sentry/platform/kvm/bluepill_amd64.s +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -14,16 +14,6 @@ #include "textflag.h" -// VCPU_CPU is the location of the CPU in the vCPU struct. -// -// This is guaranteed to be zero. -#define VCPU_CPU 0x0 - -// CPU_SELF is the self reference in ring0's percpu. -// -// This is guaranteed to be zero. -#define CPU_SELF 0x0 - // Context offsets. // // Only limited use of the context is done in the assembly stub below, most is @@ -39,18 +29,8 @@ // See bluepill.go. TEXT ·bluepill(SB),NOSPLIT,$0 -begin: MOVQ vcpu+0(FP), AX - LEAQ VCPU_CPU(AX), BX BYTE CLI; -check_vcpu: - MOVQ CPU_SELF(GS), CX - CMPQ BX, CX - JE right_vCPU -wrong_vcpu: - CALL ·redpill(SB) - JMP begin -right_vCPU: RET // sighandler: see bluepill.go for documentation. diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 6c54712d1..1c8a1dd6f 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -339,6 +339,11 @@ func (m *machine) Destroy() { // Get gets an available vCPU. // // This will return with the OS thread locked. +// +// It is guaranteed that if any OS thread TID is in guest, m.vCPUs[TID] points +// to the vCPU in which the OS thread TID is running. So if Get() returns with +// the corrent context in guest, the vCPU of it must be the same as what +// Get() returns. func (m *machine) Get() *vCPU { m.mu.RLock() runtime.LockOSThread() -- cgit v1.2.3 From 6ce10c3c2ff516aabdef8b4d198690e7e7a87ca4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 24 Mar 2020 12:40:52 +0000 Subject: amd64: introduce kernelEntry kernelEntry is split from CPU that contains minimal CPU-specific arch state that can be mapped at the upper of the address space. It is prepared for KPTI for gvisor. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan --- pkg/sentry/platform/kvm/machine_amd64.go | 1 + pkg/sentry/platform/ring0/defs_amd64.go | 31 ++++++-- pkg/sentry/platform/ring0/entry_amd64.s | 110 +++++++++++++++-------------- pkg/sentry/platform/ring0/kernel_amd64.go | 5 +- pkg/sentry/platform/ring0/offsets_amd64.go | 12 +++- 5 files changed, 95 insertions(+), 64 deletions(-) diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index acc823ba6..81ee1efce 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -144,6 +144,7 @@ func (c *vCPU) initArchState() error { // Set the entrypoint for the kernel. kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer()) kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer()) + kernelUserRegs.RSP = c.StackTop() kernelUserRegs.RFLAGS = ring0.KernelFlagsSet // Set the system registers. diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 9c709f271..2c48bf49d 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -80,14 +80,35 @@ type KernelArchState struct { globalIDT idt64 } -// CPUArchState contains CPU-specific arch state. -type CPUArchState struct { +// kernelEntry contains minimal CPU-specific arch state +// that can be mapped at the upper of the address space. +// Malicious APP might steal info from it via CPU bugs. +type kernelEntry struct { // stack is the stack used for interrupts on this CPU. stack [256]byte + // scratch space for temporary usage. + scratch0 uint64 + scratch1 uint64 + + // stackTop is the top of the stack. + stackTop uint64 + + // cpuSelf is back reference to CPU. + cpuSelf *CPU + // kernelCR3 is the cr3 used for sentry kernel. kernelCR3 uintptr + // gdt is the CPU's descriptor table. + gdt descriptorTable + + // tss is the CPU's task state. + tss TaskState64 +} + +// CPUArchState contains CPU-specific arch state. +type CPUArchState struct { // errorCode is the error code from the last exception. errorCode uintptr @@ -100,11 +121,7 @@ type CPUArchState struct { // exception. errorType uintptr - // gdt is the CPU's descriptor table. - gdt descriptorTable - - // tss is the CPU's task state. - tss TaskState64 + *kernelEntry } // ErrorCode returns the last error code. diff --git a/pkg/sentry/platform/ring0/entry_amd64.s b/pkg/sentry/platform/ring0/entry_amd64.s index e54582ede..82689a194 100644 --- a/pkg/sentry/platform/ring0/entry_amd64.s +++ b/pkg/sentry/platform/ring0/entry_amd64.s @@ -84,15 +84,9 @@ #define SYSRET64() \ BYTE $0x48; BYTE $0x0f; BYTE $0x07; -// LOAD_KERNEL_ADDRESS loads a kernel address. -#define LOAD_KERNEL_ADDRESS(from, to) \ - MOVQ from, to; \ - ORQ ·KernelStartAddress(SB), to; - // LOAD_KERNEL_STACK loads the kernel stack. -#define LOAD_KERNEL_STACK(from) \ - LOAD_KERNEL_ADDRESS(CPU_SELF(from), SP); \ - LEAQ CPU_STACK_TOP(SP), SP; +#define LOAD_KERNEL_STACK(entry) \ + MOVQ ENTRY_STACK_TOP(entry), SP; // See kernel.go. TEXT ·Halt(SB),NOSPLIT,$0 @@ -128,6 +122,7 @@ TEXT ·sysret(SB),NOSPLIT,$0-24 MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) // save SP AX userCR3 on the kernel stack. + MOVQ CPU_ENTRY(BX), BX LOAD_KERNEL_STACK(BX) PUSHQ PTRACE_RSP(AX) PUSHQ PTRACE_RAX(AX) @@ -160,6 +155,7 @@ TEXT ·iret(SB),NOSPLIT,$0-24 MOVQ AX, CPU_REGISTERS+PTRACE_RAX(BX) // Build an IRET frame & restore state. + MOVQ CPU_ENTRY(BX), BX LOAD_KERNEL_STACK(BX) PUSHQ PTRACE_SS(AX) PUSHQ PTRACE_RSP(AX) @@ -177,18 +173,18 @@ TEXT ·iret(SB),NOSPLIT,$0-24 // See entry_amd64.go. TEXT ·resume(SB),NOSPLIT,$0 // See iret, above. - MOVQ CPU_REGISTERS+PTRACE_SS(GS), BX; PUSHQ BX - MOVQ CPU_REGISTERS+PTRACE_RSP(GS), CX; PUSHQ CX - MOVQ CPU_REGISTERS+PTRACE_FLAGS(GS), DX; PUSHQ DX - MOVQ CPU_REGISTERS+PTRACE_CS(GS), DI; PUSHQ DI - MOVQ CPU_REGISTERS+PTRACE_RIP(GS), SI; PUSHQ SI - REGISTERS_LOAD(GS, CPU_REGISTERS) - MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + PUSHQ CPU_REGISTERS+PTRACE_SS(AX) + PUSHQ CPU_REGISTERS+PTRACE_RSP(AX) + PUSHQ CPU_REGISTERS+PTRACE_FLAGS(AX) + PUSHQ CPU_REGISTERS+PTRACE_CS(AX) + PUSHQ CPU_REGISTERS+PTRACE_RIP(AX) + REGISTERS_LOAD(AX, CPU_REGISTERS) + MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX IRET() // See entry_amd64.go. TEXT ·Start(SB),NOSPLIT,$0 - LOAD_KERNEL_STACK(AX) // Set the stack. PUSHQ $0x0 // Previous frame pointer. MOVQ SP, BP // Set frame pointer. PUSHQ AX // First argument (CPU). @@ -206,21 +202,24 @@ TEXT ·sysenter(SB),NOSPLIT,$0 user: SWAP_GS() - MOVQ AX, CPU_REGISTERS+PTRACE_RCX(GS) // Save user AX on scratch. - MOVQ CPU_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. + MOVQ AX, ENTRY_SCRATCH0(GS) // Save user AX on scratch. + MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. WRITE_CR3() // Switch to kernel cr3. - XCHGQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Swap stacks. - MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Get user regs. + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs. REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. - MOVQ CPU_REGISTERS+PTRACE_RCX(GS), BX // Load saved user AX value. - MOVQ BX, PTRACE_RAX(AX) // Save everything else. - MOVQ BX, PTRACE_ORIGRAX(AX) MOVQ CX, PTRACE_RIP(AX) MOVQ R11, PTRACE_FLAGS(AX) - MOVQ CPU_REGISTERS+PTRACE_RSP(GS), BX; MOVQ BX, PTRACE_RSP(AX) - MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code. - MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user. + MOVQ SP, PTRACE_RSP(AX) + MOVQ ENTRY_SCRATCH0(GS), CX // Load saved user AX value. + MOVQ CX, PTRACE_RAX(AX) // Save everything else. + MOVQ CX, PTRACE_ORIGRAX(AX) + + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Get stacks. + MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. + MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user. // Return to the kernel, where the frame is: // @@ -230,25 +229,27 @@ user: // cpu (sp+8) // vcpu.Switch (sp+0) // - MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer. + MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer. MOVQ $Syscall, 32(SP) // Output vector. RET kernel: // We can't restore the original stack, but we can access the registers // in the CPU state directly. No need for temporary juggling. - MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS) - MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS) - REGISTERS_SAVE(GS, CPU_REGISTERS) - MOVQ CX, CPU_REGISTERS+PTRACE_RIP(GS) - MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(GS) - MOVQ SP, CPU_REGISTERS+PTRACE_RSP(GS) - MOVQ $0, CPU_ERROR_CODE(GS) // Clear error code. - MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel. + MOVQ AX, ENTRY_SCRATCH0(GS) + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + REGISTERS_SAVE(AX, CPU_REGISTERS) + MOVQ CX, CPU_REGISTERS+PTRACE_RIP(AX) + MOVQ R11, CPU_REGISTERS+PTRACE_FLAGS(AX) + MOVQ SP, CPU_REGISTERS+PTRACE_RSP(AX) + MOVQ ENTRY_SCRATCH0(GS), BX + MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX) + MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX) + MOVQ $0, CPU_ERROR_CODE(AX) // Clear error code. + MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. // Call the syscall trampoline. LOAD_KERNEL_STACK(GS) - MOVQ CPU_SELF(GS), AX // Load vCPU. PUSHQ AX // First argument (vCPU). CALL ·kernelSyscall(SB) // Call the trampoline. POPQ AX // Pop vCPU. @@ -285,10 +286,11 @@ user: ADDQ $-8, SP // Adjust for flags. MOVQ $_KERNEL_FLAGS, 0(SP); BYTE $0x9d; // Reset flags (POPFQ). PUSHQ AX // Save user AX on stack. - MOVQ CPU_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. + MOVQ ENTRY_KERNEL_CR3(GS), AX // Get kernel cr3 on AX. WRITE_CR3() // Switch to kernel cr3. - MOVQ CPU_REGISTERS+PTRACE_RAX(GS), AX // Get user regs. + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + MOVQ CPU_REGISTERS+PTRACE_RAX(AX), AX // Get user regs. REGISTERS_SAVE(AX, 0) // Save all except IP, FLAGS, SP, AX. POPQ BX // Restore original AX. MOVQ BX, PTRACE_RAX(AX) // Save it. @@ -300,34 +302,36 @@ user: MOVQ 48(SP), SI; MOVQ SI, PTRACE_SS(AX) // Copy out and return. + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. MOVQ 0(SP), BX // Load vector. MOVQ 8(SP), CX // Load error code. - MOVQ CPU_REGISTERS+PTRACE_RSP(GS), SP // Original stack (kernel version). - MOVQ CPU_REGISTERS+PTRACE_RBP(GS), BP // Original base pointer. - MOVQ CX, CPU_ERROR_CODE(GS) // Set error code. - MOVQ $1, CPU_ERROR_TYPE(GS) // Set error type to user. + MOVQ CPU_REGISTERS+PTRACE_RSP(AX), SP // Original stack (kernel version). + MOVQ CPU_REGISTERS+PTRACE_RBP(AX), BP // Original base pointer. + MOVQ CX, CPU_ERROR_CODE(AX) // Set error code. + MOVQ $1, CPU_ERROR_TYPE(AX) // Set error type to user. MOVQ BX, 32(SP) // Output vector. RET kernel: // As per above, we can save directly. - MOVQ AX, CPU_REGISTERS+PTRACE_RAX(GS) - MOVQ AX, CPU_REGISTERS+PTRACE_ORIGRAX(GS) - REGISTERS_SAVE(GS, CPU_REGISTERS) - MOVQ 16(SP), AX; MOVQ AX, CPU_REGISTERS+PTRACE_RIP(GS) - MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(GS) - MOVQ 40(SP), CX; MOVQ CX, CPU_REGISTERS+PTRACE_RSP(GS) + PUSHQ AX + MOVQ ENTRY_CPU_SELF(GS), AX // Load vCPU. + REGISTERS_SAVE(AX, CPU_REGISTERS) + POPQ BX + MOVQ BX, CPU_REGISTERS+PTRACE_RAX(AX) + MOVQ BX, CPU_REGISTERS+PTRACE_ORIGRAX(AX) + MOVQ 16(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RIP(AX) + MOVQ 32(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_FLAGS(AX) + MOVQ 40(SP), BX; MOVQ BX, CPU_REGISTERS+PTRACE_RSP(AX) // Set the error code and adjust the stack. - MOVQ 8(SP), AX // Load the error code. - MOVQ AX, CPU_ERROR_CODE(GS) // Copy out to the CPU. - MOVQ $0, CPU_ERROR_TYPE(GS) // Set error type to kernel. + MOVQ 8(SP), BX // Load the error code. + MOVQ BX, CPU_ERROR_CODE(AX) // Copy out to the CPU. + MOVQ $0, CPU_ERROR_TYPE(AX) // Set error type to kernel. MOVQ 0(SP), BX // BX contains the vector. - ADDQ $48, SP // Drop the exception frame. // Call the exception trampoline. LOAD_KERNEL_STACK(GS) - MOVQ CPU_SELF(GS), AX // Load vCPU. PUSHQ BX // Second argument (vector). PUSHQ AX // First argument (vCPU). CALL ·kernelException(SB) // Call the trampoline. diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 196afd6bb..7c420ecf4 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -41,6 +41,8 @@ func (k *Kernel) init(opts KernelOpts) { // init initializes architecture-specific state. func (c *CPU) init() { + c.kernelEntry = &kernelEntry{} + c.cpuSelf = c // Null segment. c.gdt[0].setNull() @@ -65,6 +67,7 @@ func (c *CPU) init() { // Set the kernel stack pointer in the TSS (virtual address). stackAddr := c.StackTop() + c.stackTop = stackAddr c.tss.rsp0Lo = uint32(stackAddr) c.tss.rsp0Hi = uint32(stackAddr >> 32) c.tss.ist1Lo = uint32(stackAddr) @@ -215,7 +218,7 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) { //go:nosplit func start(c *CPU) { // Save per-cpu & FS segment. - WriteGS(kernelAddr(c)) + WriteGS(kernelAddr(c.kernelEntry)) WriteFS(uintptr(c.registers.Fs_base)) // Initialize floating point. diff --git a/pkg/sentry/platform/ring0/offsets_amd64.go b/pkg/sentry/platform/ring0/offsets_amd64.go index d6df12c2f..bcb73cb31 100644 --- a/pkg/sentry/platform/ring0/offsets_amd64.go +++ b/pkg/sentry/platform/ring0/offsets_amd64.go @@ -30,12 +30,18 @@ func Emit(w io.Writer) { c := &CPU{} fmt.Fprintf(w, "\n// CPU offsets.\n") - fmt.Fprintf(w, "#define CPU_SELF 0x%02x\n", reflect.ValueOf(&c.self).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_REGISTERS 0x%02x\n", reflect.ValueOf(&c.registers).Pointer()-reflect.ValueOf(c).Pointer()) - fmt.Fprintf(w, "#define CPU_STACK_TOP 0x%02x\n", reflect.ValueOf(&c.stack[0]).Pointer()-reflect.ValueOf(c).Pointer()+uintptr(len(c.stack))) - fmt.Fprintf(w, "#define CPU_KERNEL_CR3 0x%02x\n", reflect.ValueOf(&c.kernelCR3).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_ERROR_CODE 0x%02x\n", reflect.ValueOf(&c.errorCode).Pointer()-reflect.ValueOf(c).Pointer()) fmt.Fprintf(w, "#define CPU_ERROR_TYPE 0x%02x\n", reflect.ValueOf(&c.errorType).Pointer()-reflect.ValueOf(c).Pointer()) + fmt.Fprintf(w, "#define CPU_ENTRY 0x%02x\n", reflect.ValueOf(&c.kernelEntry).Pointer()-reflect.ValueOf(c).Pointer()) + + e := &kernelEntry{} + fmt.Fprintf(w, "\n// CPU entry offsets.\n") + fmt.Fprintf(w, "#define ENTRY_SCRATCH0 0x%02x\n", reflect.ValueOf(&e.scratch0).Pointer()-reflect.ValueOf(e).Pointer()) + fmt.Fprintf(w, "#define ENTRY_SCRATCH1 0x%02x\n", reflect.ValueOf(&e.scratch1).Pointer()-reflect.ValueOf(e).Pointer()) + fmt.Fprintf(w, "#define ENTRY_STACK_TOP 0x%02x\n", reflect.ValueOf(&e.stackTop).Pointer()-reflect.ValueOf(e).Pointer()) + fmt.Fprintf(w, "#define ENTRY_CPU_SELF 0x%02x\n", reflect.ValueOf(&e.cpuSelf).Pointer()-reflect.ValueOf(e).Pointer()) + fmt.Fprintf(w, "#define ENTRY_KERNEL_CR3 0x%02x\n", reflect.ValueOf(&e.kernelCR3).Pointer()-reflect.ValueOf(e).Pointer()) fmt.Fprintf(w, "\n// Bits.\n") fmt.Fprintf(w, "#define _RFLAGS_IF 0x%02x\n", _RFLAGS_IF) -- cgit v1.2.3 From 9cae407b270b20ff78370d33602191d37ddf5530 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 26 Mar 2020 09:23:23 +0000 Subject: amd64: implement KPTI for gvisor Actually, gvisor has KPTI (Kernel PageTable Isolation) between gr0 and gr3. But the upper half of the userCR3 contains the whole sentry kernel which makes the kernel vulnerable to gr3 APP through CPU bugs. This patch implement full KPTI functionality for gvisor. It doesn't map the whole kernel in the upper. It maps only the text section of the binary and the entry area required by the ISA. The entry area contains the global idt, the percpu gdt/tss etc. The entry area packs all these together which is less than 350k for 512 vCPUs. The text section is normally nonsensitive. It is possible to map only the entry functions (interrupt handler etc.) only. But it requires some hacks. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan --- pkg/sentry/platform/kvm/kvm.go | 10 +----- pkg/sentry/platform/kvm/machine.go | 16 +++------- pkg/sentry/platform/kvm/machine_amd64.go | 40 ++++++++++++++++++++++++ pkg/sentry/platform/kvm/machine_arm64.go | 13 ++++++++ pkg/sentry/platform/ring0/defs_amd64.go | 5 ++- pkg/sentry/platform/ring0/kernel.go | 22 +++++++++---- pkg/sentry/platform/ring0/kernel_amd64.go | 51 +++++++++++++++++++++++++++++-- pkg/sentry/platform/ring0/kernel_arm64.go | 4 +-- pkg/sentry/platform/ring0/x86.go | 2 +- 9 files changed, 130 insertions(+), 33 deletions(-) diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index ae813e24e..d46946402 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -156,15 +156,7 @@ func (*KVM) MaxUserAddress() usermem.Addr { func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) { // Allocate page tables and install system mappings. pageTables := pagetables.New(newAllocator()) - applyPhysicalRegions(func(pr physicalRegion) bool { - // Map the kernel in the upper half. - pageTables.Map( - usermem.Addr(ring0.KernelStartAddress|pr.virtual), - pr.length, - pagetables.MapOpts{AccessType: usermem.AnyAccess}, - pr.physical) - return true // Keep iterating. - }) + k.machine.mapUpperHalf(pageTables) // Return the new address space. return &addressSpace{ diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 1c8a1dd6f..0069d2d46 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -152,7 +152,7 @@ func (m *machine) newVCPU() *vCPU { fd: int(fd), machine: m, } - c.CPU.Init(&m.kernel, c) + c.CPU.Init(&m.kernel, c.id, c) m.vCPUsByID[c.id] = c // Ensure the signal mask is correct. @@ -180,9 +180,6 @@ func newMachine(vm int) (*machine, error) { // Create the machine. m := &machine{fd: vm} m.available.L = &m.mu - m.kernel.Init(ring0.KernelOpts{ - PageTables: pagetables.New(newAllocator()), - }) maxVCPUs, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) if errno != 0 { @@ -195,6 +192,9 @@ func newMachine(vm int) (*machine, error) { // Create the vCPUs map/slices. m.vCPUsByTID = make(map[uint64]*vCPU) m.vCPUsByID = make([]*vCPU, m.maxVCPUs) + m.kernel.Init(ring0.KernelOpts{ + PageTables: pagetables.New(newAllocator()), + }, m.maxVCPUs) // Apply the physical mappings. Note that these mappings may point to // guest physical addresses that are not actually available. These @@ -207,15 +207,9 @@ func newMachine(vm int) (*machine, error) { pagetables.MapOpts{AccessType: usermem.AnyAccess}, pr.physical) - // And keep everything in the upper half. - m.kernel.PageTables.Map( - usermem.Addr(ring0.KernelStartAddress|pr.virtual), - pr.length, - pagetables.MapOpts{AccessType: usermem.AnyAccess}, - pr.physical) - return true // Keep iterating. }) + m.mapUpperHalf(m.kernel.PageTables) var physicalRegionsReadOnly []physicalRegion var physicalRegionsAvailable []physicalRegion diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 81ee1efce..6849ab113 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -346,3 +346,43 @@ func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) { func availableRegionsForSetMem() (phyRegions []physicalRegion) { return physicalRegions } + +var execRegions []region + +func init() { + applyVirtualRegions(func(vr virtualRegion) { + if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" { + return + } + + if vr.accessType.Execute { + execRegions = append(execRegions, vr.region) + } + }) +} + +func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { + for _, r := range execRegions { + physical, length, ok := translateToPhysical(r.virtual) + if !ok || length < r.length { + panic("impossilbe translation") + } + pageTable.Map( + usermem.Addr(ring0.KernelStartAddress|r.virtual), + r.length, + pagetables.MapOpts{AccessType: usermem.Execute}, + physical) + } + for start, end := range m.kernel.EntryRegions() { + regionLen := end - start + physical, length, ok := translateToPhysical(start) + if !ok || length < regionLen { + panic("impossible translation") + } + pageTable.Map( + usermem.Addr(ring0.KernelStartAddress|start), + regionLen, + pagetables.MapOpts{AccessType: usermem.ReadWrite}, + physical) + } +} diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index 9db171af9..2df762991 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -19,6 +19,7 @@ package kvm import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/usermem" ) @@ -48,6 +49,18 @@ const ( poolPCIDs = 8 ) +func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) { + applyPhysicalRegions(func(pr physicalRegion) bool { + pageTable.Map( + usermem.Addr(ring0.KernelStartAddress|pr.virtual), + pr.length, + pagetables.MapOpts{AccessType: usermem.AnyAccess}, + pr.physical) + + return true // Keep iterating. + }) +} + // Get all read-only physicalRegions. func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) { var rdonlyRegions []region diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 2c48bf49d..f617519fa 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -76,8 +76,11 @@ type KernelOpts struct { type KernelArchState struct { KernelOpts + // cpuEntries is array of kernelEntry for all cpus + cpuEntries []kernelEntry + // globalIDT is our set of interrupt gates. - globalIDT idt64 + globalIDT *idt64 } // kernelEntry contains minimal CPU-specific arch state diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go index 021693791..264be23d3 100644 --- a/pkg/sentry/platform/ring0/kernel.go +++ b/pkg/sentry/platform/ring0/kernel.go @@ -19,8 +19,8 @@ package ring0 // N.B. that constraints on KernelOpts must be satisfied. // //go:nosplit -func (k *Kernel) Init(opts KernelOpts) { - k.init(opts) +func (k *Kernel) Init(opts KernelOpts, maxCPUs int) { + k.init(opts, maxCPUs) } // Halt halts execution. @@ -49,6 +49,11 @@ func (defaultHooks) KernelException(Vector) { // kernelSyscall is a trampoline. // +// When in amd64, it is called with %rip on the upper half, so it can +// NOT access to any global data which is not mapped on upper and must +// call to function pointers or interfaces to switch to the lower half +// so that callee can access to global data. +// // +checkescape:hard,stack // //go:nosplit @@ -58,6 +63,11 @@ func kernelSyscall(c *CPU) { // kernelException is a trampoline. // +// When in amd64, it is called with %rip on the upper half, so it can +// NOT access to any global data which is not mapped on upper and must +// call to function pointers or interfaces to switch to the lower half +// so that callee can access to global data. +// // +checkescape:hard,stack // //go:nosplit @@ -68,10 +78,10 @@ func kernelException(c *CPU, vector Vector) { // Init initializes a new CPU. // // Init allows embedding in other objects. -func (c *CPU) Init(k *Kernel, hooks Hooks) { - c.self = c // Set self reference. - c.kernel = k // Set kernel reference. - c.init() // Perform architectural init. +func (c *CPU) Init(k *Kernel, cpuID int, hooks Hooks) { + c.self = c // Set self reference. + c.kernel = k // Set kernel reference. + c.init(cpuID) // Perform architectural init. // Require hooks. if hooks != nil { diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go index 7c420ecf4..622e2d918 100644 --- a/pkg/sentry/platform/ring0/kernel_amd64.go +++ b/pkg/sentry/platform/ring0/kernel_amd64.go @@ -18,13 +18,42 @@ package ring0 import ( "encoding/binary" + "reflect" + + "gvisor.dev/gvisor/pkg/usermem" ) // init initializes architecture-specific state. -func (k *Kernel) init(opts KernelOpts) { +func (k *Kernel) init(opts KernelOpts, maxCPUs int) { // Save the root page tables. k.PageTables = opts.PageTables + entrySize := reflect.TypeOf(kernelEntry{}).Size() + var ( + entries []kernelEntry + padding = 1 + ) + for { + entries = make([]kernelEntry, maxCPUs + padding - 1) + totalSize := entrySize * uintptr(maxCPUs + padding - 1) + addr := reflect.ValueOf(&entries[0]).Pointer() + if addr&(usermem.PageSize-1) == 0 && totalSize >= usermem.PageSize { + // The runtime forces power-of-2 alignment for allocations, and we are therefore + // safe once the first address is aligned and the chunk is at least a full page. + break + } + padding = padding << 1 + } + k.cpuEntries = entries + + k.globalIDT = &idt64{} + if reflect.TypeOf(idt64{}).Size() != usermem.PageSize { + panic("Size of globalIDT should be PageSize") + } + if reflect.ValueOf(k.globalIDT).Pointer() & (usermem.PageSize-1) != 0 { + panic("Allocated globalIDT should be page aligned") + } + // Setup the IDT, which is uniform. for v, handler := range handlers { // Allow Breakpoint and Overflow to be called from all @@ -39,9 +68,25 @@ func (k *Kernel) init(opts KernelOpts) { } } +func (k *Kernel) EntryRegions() map[uintptr]uintptr { + regions := make(map[uintptr]uintptr) + + addr := reflect.ValueOf(&k.cpuEntries[0]).Pointer() + size := reflect.TypeOf(kernelEntry{}).Size() * uintptr(len(k.cpuEntries)) + end, _ := usermem.Addr(addr + size).RoundUp() + regions[uintptr(usermem.Addr(addr).RoundDown())] = uintptr(end) + + addr = reflect.ValueOf(k.globalIDT).Pointer() + size = reflect.TypeOf(idt64{}).Size() + end, _ = usermem.Addr(addr + size).RoundUp() + regions[uintptr(usermem.Addr(addr).RoundDown())] = uintptr(end) + + return regions +} + // init initializes architecture-specific state. -func (c *CPU) init() { - c.kernelEntry = &kernelEntry{} +func (c *CPU) init(cpuID int) { + c.kernelEntry = &c.kernel.cpuEntries[cpuID] c.cpuSelf = c // Null segment. c.gdt[0].setNull() diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go index d483ff03c..0a71f903a 100644 --- a/pkg/sentry/platform/ring0/kernel_arm64.go +++ b/pkg/sentry/platform/ring0/kernel_arm64.go @@ -25,13 +25,13 @@ func HaltAndResume() func HaltEl1SvcAndResume() // init initializes architecture-specific state. -func (k *Kernel) init(opts KernelOpts) { +func (k *Kernel) init(opts KernelOpts, maxCPUs int) { // Save the root page tables. k.PageTables = opts.PageTables } // init initializes architecture-specific state. -func (c *CPU) init() { +func (c *CPU) init(cpuID int) { // Set the kernel stack pointer(virtual address). c.registers.Sp = uint64(c.StackTop()) diff --git a/pkg/sentry/platform/ring0/x86.go b/pkg/sentry/platform/ring0/x86.go index 9da0ea685..e99da0b35 100644 --- a/pkg/sentry/platform/ring0/x86.go +++ b/pkg/sentry/platform/ring0/x86.go @@ -104,7 +104,7 @@ const ( VirtualizationException SecurityException = 0x1e SyscallInt80 = 0x80 - _NR_INTERRUPTS = SyscallInt80 + 1 + _NR_INTERRUPTS = 0x100 ) // System call vectors. -- cgit v1.2.3