diff options
Diffstat (limited to 'pkg/sentry/platform/kvm')
20 files changed, 3150 insertions, 0 deletions
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go new file mode 100644 index 000000000..689122175 --- /dev/null +++ b/pkg/sentry/platform/kvm/address_space.go @@ -0,0 +1,234 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/atomicbitops" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// dirtySet tracks vCPUs for invalidation. +type dirtySet struct { + vCPUs []uint64 +} + +// forEach iterates over all CPUs in the dirty set. +func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) { + m.mu.RLock() + defer m.mu.RUnlock() + + for index := range ds.vCPUs { + mask := atomic.SwapUint64(&ds.vCPUs[index], 0) + if mask != 0 { + for bit := 0; bit < 64; bit++ { + if mask&(1<<uint64(bit)) == 0 { + continue + } + id := 64*index + bit + fn(m.vCPUsByID[id]) + } + } + } +} + +// mark marks the given vCPU as dirty and returns whether it was previously +// clean. Being previously clean implies that a flush is needed on entry. +func (ds *dirtySet) mark(c *vCPU) bool { + index := uint64(c.id) / 64 + bit := uint64(1) << uint(c.id%64) + + oldValue := atomic.LoadUint64(&ds.vCPUs[index]) + if oldValue&bit != 0 { + return false // Not clean. + } + + // Set the bit unilaterally, and ensure that a flush takes place. Note + // that it's possible for races to occur here, but since the flush is + // taking place long after these lines there's no race in practice. + atomicbitops.OrUint64(&ds.vCPUs[index], bit) + return true // Previously clean. +} + +// addressSpace is a wrapper for PageTables. +type addressSpace struct { + platform.NoAddressSpaceIO + + // mu is the lock for modifications to the address space. + // + // Note that the page tables themselves are not locked. + mu sync.Mutex + + // machine is the underlying machine. + machine *machine + + // pageTables are for this particular address space. + pageTables *pagetables.PageTables + + // dirtySet is the set of dirty vCPUs. + dirtySet *dirtySet +} + +// invalidate is the implementation for Invalidate. +func (as *addressSpace) invalidate() { + as.dirtySet.forEach(as.machine, func(c *vCPU) { + if c.active.get() == as { // If this happens to be active, + c.BounceToKernel() // ... force a kernel transition. + } + }) +} + +// Invalidate interrupts all dirty contexts. +func (as *addressSpace) Invalidate() { + as.mu.Lock() + defer as.mu.Unlock() + as.invalidate() +} + +// Touch adds the given vCPU to the dirty list. +// +// The return value indicates whether a flush is required. +func (as *addressSpace) Touch(c *vCPU) bool { + return as.dirtySet.mark(c) +} + +type hostMapEntry struct { + addr uintptr + length uintptr +} + +func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) { + for m.length > 0 { + physical, length, ok := translateToPhysical(m.addr) + if !ok { + panic("unable to translate segment") + } + if length > m.length { + length = m.length + } + + // Ensure that this map has physical mappings. If the page does + // not have physical mappings, the KVM module may inject + // spurious exceptions when emulation fails (i.e. it tries to + // emulate because the RIP is pointed at those pages). + as.machine.mapPhysical(physical, length) + + // Install the page table mappings. Note that the ordering is + // important; if the pagetable mappings were installed before + // ensuring the physical pages were available, then some other + // thread could theoretically access them. + // + // Due to the way KVM's shadow paging implementation works, + // modifications to the page tables while in host mode may not + // be trapped, leading to the shadow pages being out of sync. + // Therefore, we need to ensure that we are in guest mode for + // page table modifications. See the call to bluepill, below. + as.machine.retryInGuest(func() { + inv = as.pageTables.Map(addr, length, pagetables.MapOpts{ + AccessType: at, + User: true, + }, physical) || inv + }) + m.addr += length + m.length -= length + addr += usermem.Addr(length) + } + + return inv +} + +// MapFile implements platform.AddressSpace.MapFile. +func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + as.mu.Lock() + defer as.mu.Unlock() + + // Get mappings in the sentry's address space, which are guaranteed to be + // valid as long as a reference is held on the mapped pages (which is in + // turn required by AddressSpace.MapFile precondition). + // + // If precommit is true, we will touch mappings to commit them, so ensure + // that mappings are readable from sentry context. + // + // We don't execute from application file-mapped memory, and guest page + // tables don't care if we have execute permission (but they do need pages + // to be readable). + bs, err := f.MapInternal(fr, usermem.AccessType{ + Read: at.Read || at.Execute || precommit, + Write: at.Write, + }) + if err != nil { + return err + } + + // Map the mappings in the sentry's address space (guest physical memory) + // into the application's address space (guest virtual memory). + inv := false + for !bs.IsEmpty() { + b := bs.Head() + bs = bs.Tail() + // Since fr was page-aligned, b should also be page-aligned. We do the + // lookup in our host page tables for this translation. + if precommit { + s := b.ToSlice() + for i := 0; i < len(s); i += usermem.PageSize { + _ = s[i] // Touch to commit. + } + } + prev := as.mapHost(addr, hostMapEntry{ + addr: b.Addr(), + length: uintptr(b.Len()), + }, at) + inv = inv || prev + addr += usermem.Addr(b.Len()) + } + if inv { + as.invalidate() + } + + return nil +} + +// Unmap unmaps the given range by calling pagetables.PageTables.Unmap. +func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) { + as.mu.Lock() + defer as.mu.Unlock() + + // See above re: retryInGuest. + var prev bool + as.machine.retryInGuest(func() { + prev = as.pageTables.Unmap(addr, uintptr(length)) || prev + }) + if prev { + as.invalidate() + + // Recycle any freed intermediate pages. + as.pageTables.Allocator.Recycle() + } +} + +// Release releases the page tables. +func (as *addressSpace) Release() { + as.Unmap(0, ^uint64(0)) + + // Free all pages from the allocator. + as.pageTables.Allocator.(allocator).base.Drain() + + // Drop all cached machine references. + as.machine.dropPageTables(as.pageTables) +} diff --git a/pkg/sentry/platform/kvm/allocator.go b/pkg/sentry/platform/kvm/allocator.go new file mode 100644 index 000000000..42bcc9733 --- /dev/null +++ b/pkg/sentry/platform/kvm/allocator.go @@ -0,0 +1,76 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" +) + +type allocator struct { + base *pagetables.RuntimeAllocator +} + +// newAllocator is used to define the allocator. +func newAllocator() allocator { + return allocator{ + base: pagetables.NewRuntimeAllocator(), + } +} + +// NewPTEs implements pagetables.Allocator.NewPTEs. +// +//go:nosplit +func (a allocator) NewPTEs() *pagetables.PTEs { + return a.base.NewPTEs() +} + +// PhysicalFor returns the physical address for a set of PTEs. +// +//go:nosplit +func (a allocator) PhysicalFor(ptes *pagetables.PTEs) uintptr { + virtual := a.base.PhysicalFor(ptes) + physical, _, ok := translateToPhysical(virtual) + if !ok { + panic(fmt.Sprintf("PhysicalFor failed for %p", ptes)) + } + return physical +} + +// LookupPTEs implements pagetables.Allocator.LookupPTEs. +// +//go:nosplit +func (a allocator) LookupPTEs(physical uintptr) *pagetables.PTEs { + virtualStart, physicalStart, _, ok := calculateBluepillFault(physical) + if !ok { + panic(fmt.Sprintf("LookupPTEs failed for 0x%x", physical)) + } + return a.base.LookupPTEs(virtualStart + (physical - physicalStart)) +} + +// FreePTEs implements pagetables.Allocator.FreePTEs. +// +//go:nosplit +func (a allocator) FreePTEs(ptes *pagetables.PTEs) { + a.base.FreePTEs(ptes) +} + +// Recycle implements pagetables.Allocator.Recycle. +// +//go:nosplit +func (a allocator) Recycle() { + a.base.Recycle() +} diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go new file mode 100644 index 000000000..a926e6f8b --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -0,0 +1,82 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "fmt" + "reflect" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy" +) + +// bluepill enters guest mode. +func bluepill(*vCPU) + +// sighandler is the signal entry point. +func sighandler() + +// dieTrampoline is the assembly trampoline. This calls dieHandler. +// +// This uses an architecture-specific calling convention, documented in +// dieArchSetup and the assembly implementation for dieTrampoline. +func dieTrampoline() + +var ( + // savedHandler is a pointer to the previous handler. + // + // This is called by bluepillHandler. + savedHandler uintptr + + // dieTrampolineAddr is the address of dieTrampoline. + dieTrampolineAddr uintptr +) + +// dieHandler is called by dieTrampoline. +// +//go:nosplit +func dieHandler(c *vCPU) { + throw(c.dieState.message) +} + +// die is called to set the vCPU up to panic. +// +// This loads vCPU state, and sets up a call for the trampoline. +// +//go:nosplit +func (c *vCPU) die(context *arch.SignalContext64, msg string) { + // Save the death message, which will be thrown. + c.dieState.message = msg + + // Reload all registers to have an accurate stack trace when we return + // to host mode. This means that the stack should be unwound correctly. + if errno := c.getUserRegisters(&c.dieState.guestRegs); errno != 0 { + throw(msg) + } + + // Setup the trampoline. + dieArchSetup(c, context, &c.dieState.guestRegs) +} + +func init() { + // Install the handler. + if err := safecopy.ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil { + panic(fmt.Sprintf("Unable to set handler for signal %d: %v", syscall.SIGSEGV, err)) + } + + // Extract the address for the trampoline. + dieTrampolineAddr = reflect.ValueOf(dieTrampoline).Pointer() +} diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go new file mode 100644 index 000000000..c258408f9 --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -0,0 +1,141 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" +) + +var ( + // bounceSignal is the signal used for bouncing KVM. + // + // We use SIGCHLD because it is not masked by the runtime, and + // it will be ignored properly by other parts of the kernel. + bounceSignal = syscall.SIGCHLD + + // bounceSignalMask has only bounceSignal set. + bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1)) + + // bounce is the interrupt vector used to return to the kernel. + bounce = uint32(ring0.VirtualizationException) +) + +// redpill on amd64 invokes a syscall with -1. +// +//go:nosplit +func redpill() { + syscall.RawSyscall(^uintptr(0), 0, 0, 0) +} + +// bluepillArchEnter is called during bluepillEnter. +// +//go:nosplit +func bluepillArchEnter(context *arch.SignalContext64) *vCPU { + c := vCPUPtr(uintptr(context.Rax)) + regs := c.CPU.Registers() + regs.R8 = context.R8 + regs.R9 = context.R9 + regs.R10 = context.R10 + regs.R11 = context.R11 + regs.R12 = context.R12 + regs.R13 = context.R13 + regs.R14 = context.R14 + regs.R15 = context.R15 + regs.Rdi = context.Rdi + regs.Rsi = context.Rsi + regs.Rbp = context.Rbp + regs.Rbx = context.Rbx + regs.Rdx = context.Rdx + regs.Rax = context.Rax + regs.Rcx = context.Rcx + regs.Rsp = context.Rsp + regs.Rip = context.Rip + regs.Eflags = context.Eflags + regs.Eflags &^= uint64(ring0.KernelFlagsClear) + regs.Eflags |= ring0.KernelFlagsSet + regs.Cs = uint64(ring0.Kcode) + regs.Ds = uint64(ring0.Udata) + regs.Es = uint64(ring0.Udata) + regs.Ss = uint64(ring0.Kdata) + return c +} + +// KernelSyscall handles kernel syscalls. +// +//go:nosplit +func (c *vCPU) KernelSyscall() { + regs := c.Registers() + if regs.Rax != ^uint64(0) { + regs.Rip -= 2 // Rewind. + } + // We only trigger a bluepill entry in the bluepill function, and can + // therefore be guaranteed that there is no floating point state to be + // loaded on resuming from halt. We only worry about saving on exit. + ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) + ring0.Halt() + ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment. +} + +// KernelException handles kernel exceptions. +// +//go:nosplit +func (c *vCPU) KernelException(vector ring0.Vector) { + regs := c.Registers() + if vector == ring0.Vector(bounce) { + // These should not interrupt kernel execution; point the Rip + // to zero to ensure that we get a reasonable panic when we + // attempt to return and a full stack trace. + regs.Rip = 0 + } + // See above. + ring0.SaveFloatingPoint((*byte)(c.floatingPointState)) + ring0.Halt() + ring0.WriteFS(uintptr(regs.Fs_base)) // Reload host segment. +} + +// bluepillArchExit is called during bluepillEnter. +// +//go:nosplit +func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { + regs := c.CPU.Registers() + context.R8 = regs.R8 + context.R9 = regs.R9 + context.R10 = regs.R10 + context.R11 = regs.R11 + context.R12 = regs.R12 + context.R13 = regs.R13 + context.R14 = regs.R14 + context.R15 = regs.R15 + context.Rdi = regs.Rdi + context.Rsi = regs.Rsi + context.Rbp = regs.Rbp + context.Rbx = regs.Rbx + context.Rdx = regs.Rdx + context.Rax = regs.Rax + context.Rcx = regs.Rcx + context.Rsp = regs.Rsp + context.Rip = regs.Rip + context.Eflags = regs.Eflags + + // Set the context pointer to the saved floating point state. This is + // where the guest data has been serialized, the kernel will restore + // from this new pointer value. + context.Fpstate = uint64(uintptrValue((*byte)(c.floatingPointState))) +} diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s new file mode 100644 index 000000000..2bc34a435 --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -0,0 +1,93 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// VCPU_CPU is the location of the CPU in the vCPU struct. +// +// This is guaranteed to be zero. +#define VCPU_CPU 0x0 + +// CPU_SELF is the self reference in ring0's percpu. +// +// This is guaranteed to be zero. +#define CPU_SELF 0x0 + +// Context offsets. +// +// Only limited use of the context is done in the assembly stub below, most is +// done in the Go handlers. However, the RIP must be examined. +#define CONTEXT_RAX 0x90 +#define CONTEXT_RIP 0xa8 +#define CONTEXT_FP 0xe0 + +// CLI is the literal byte for the disable interrupts instruction. +// +// This is checked as the source of the fault. +#define CLI $0xfa + +// See bluepill.go. +TEXT ·bluepill(SB),NOSPLIT,$0 +begin: + MOVQ vcpu+0(FP), AX + LEAQ VCPU_CPU(AX), BX + BYTE CLI; +check_vcpu: + MOVQ CPU_SELF(GS), CX + CMPQ BX, CX + JE right_vCPU +wrong_vcpu: + CALL ·redpill(SB) + JMP begin +right_vCPU: + RET + +// sighandler: see bluepill.go for documentation. +// +// The arguments are the following: +// +// DI - The signal number. +// SI - Pointer to siginfo_t structure. +// DX - Pointer to ucontext structure. +// +TEXT ·sighandler(SB),NOSPLIT,$0 + // Check if the signal is from the kernel. + MOVQ $0x80, CX + CMPL CX, 0x8(SI) + JNE fallback + + // Check if RIP is disable interrupts. + MOVQ CONTEXT_RIP(DX), CX + CMPQ CX, $0x0 + JE fallback + CMPB 0(CX), CLI + JNE fallback + + // Call the bluepillHandler. + PUSHQ DX // First argument (context). + CALL ·bluepillHandler(SB) // Call the handler. + POPQ DX // Discard the argument. + RET + +fallback: + // Jump to the previous signal handler. + XORQ CX, CX + MOVQ ·savedHandler(SB), AX + JMP AX + +// dieTrampoline: see bluepill.go, bluepill_amd64_unsafe.go for documentation. +TEXT ·dieTrampoline(SB),NOSPLIT,$0 + PUSHQ BX // First argument (vCPU). + PUSHQ AX // Fake the old RIP as caller. + JMP ·dieHandler(SB) diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go new file mode 100644 index 000000000..92fde7ee0 --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -0,0 +1,56 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" +) + +// bluepillArchContext returns the arch-specific context. +// +//go:nosplit +func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { + return &((*arch.UContext64)(context).MContext) +} + +// dieArchSetup initialies the state for dieTrampoline. +// +// The amd64 dieTrampoline requires the vCPU to be set in BX, and the last RIP +// to be in AX. The trampoline then simulates a call to dieHandler from the +// provided RIP. +// +//go:nosplit +func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) { + // If the vCPU is in user mode, we set the stack to the stored stack + // value in the vCPU itself. We don't want to unwind the user stack. + if guestRegs.RFLAGS&ring0.UserFlagsSet == ring0.UserFlagsSet { + regs := c.CPU.Registers() + context.Rax = regs.Rax + context.Rsp = regs.Rsp + context.Rbp = regs.Rbp + } else { + context.Rax = guestRegs.RIP + context.Rsp = guestRegs.RSP + context.Rbp = guestRegs.RBP + context.Eflags = guestRegs.RFLAGS + } + context.Rbx = uint64(uintptr(unsafe.Pointer(c))) + context.Rip = uint64(dieTrampolineAddr) +} diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go new file mode 100644 index 000000000..3c452f5ba --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -0,0 +1,127 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // faultBlockSize is the size used for servicing memory faults. + // + // This should be large enough to avoid frequent faults and avoid using + // all available KVM slots (~512), but small enough that KVM does not + // complain about slot sizes (~4GB). See handleBluepillFault for how + // this block is used. + faultBlockSize = 2 << 30 + + // faultBlockMask is the mask for the fault blocks. + // + // This must be typed to avoid overflow complaints (ugh). + faultBlockMask = ^uintptr(faultBlockSize - 1) +) + +// yield yields the CPU. +// +//go:nosplit +func yield() { + syscall.RawSyscall(syscall.SYS_SCHED_YIELD, 0, 0, 0) +} + +// calculateBluepillFault calculates the fault address range. +// +//go:nosplit +func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, length uintptr, ok bool) { + alignedPhysical := physical &^ uintptr(usermem.PageSize-1) + for _, pr := range physicalRegions { + end := pr.physical + pr.length + if physical < pr.physical || physical >= end { + continue + } + + // Adjust the block to match our size. + physicalStart = alignedPhysical & faultBlockMask + if physicalStart < pr.physical { + // Bound the starting point to the start of the region. + physicalStart = pr.physical + } + virtualStart = pr.virtual + (physicalStart - pr.physical) + physicalEnd := physicalStart + faultBlockSize + if physicalEnd > end { + physicalEnd = end + } + length = physicalEnd - physicalStart + return virtualStart, physicalStart, length, true + } + + return 0, 0, 0, false +} + +// handleBluepillFault handles a physical fault. +// +// The corresponding virtual address is returned. This may throw on error. +// +//go:nosplit +func handleBluepillFault(m *machine, physical uintptr) (uintptr, bool) { + // Paging fault: we need to map the underlying physical pages for this + // fault. This all has to be done in this function because we're in a + // signal handler context. (We can't call any functions that might + // split the stack.) + virtualStart, physicalStart, length, ok := calculateBluepillFault(physical) + if !ok { + return 0, false + } + + // Set the KVM slot. + // + // First, we need to acquire the exclusive right to set a slot. See + // machine.nextSlot for information about the protocol. + slot := atomic.SwapUint32(&m.nextSlot, ^uint32(0)) + for slot == ^uint32(0) { + yield() // Race with another call. + slot = atomic.SwapUint32(&m.nextSlot, ^uint32(0)) + } + errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart) + if errno == 0 { + // Successfully added region; we can increment nextSlot and + // allow another set to proceed here. + atomic.StoreUint32(&m.nextSlot, slot+1) + return virtualStart + (physical - physicalStart), true + } + + // Release our slot (still available). + atomic.StoreUint32(&m.nextSlot, slot) + + switch errno { + case syscall.EEXIST: + // The region already exists. It's possible that we raced with + // another vCPU here. We just revert nextSlot and return true, + // because this must have been satisfied by some other vCPU. + return virtualStart + (physical - physicalStart), true + case syscall.EINVAL: + throw("set memory region failed; out of slots") + case syscall.ENOMEM: + throw("set memory region failed: out of memory") + case syscall.EFAULT: + throw("set memory region failed: invalid physical range") + default: + throw("set memory region failed: unknown reason") + } + + panic("unreachable") +} diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go new file mode 100644 index 000000000..7e8e9f42a --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -0,0 +1,213 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build go1.12 +// +build !go1.14 + +// Check go:linkname function signatures when updating Go version. + +package kvm + +import ( + "sync/atomic" + "syscall" + "unsafe" +) + +//go:linkname throw runtime.throw +func throw(string) + +// vCPUPtr returns a CPU for the given address. +// +//go:nosplit +func vCPUPtr(addr uintptr) *vCPU { + return (*vCPU)(unsafe.Pointer(addr)) +} + +// bytePtr returns a bytePtr for the given address. +// +//go:nosplit +func bytePtr(addr uintptr) *byte { + return (*byte)(unsafe.Pointer(addr)) +} + +// uintptrValue returns a uintptr for the given address. +// +//go:nosplit +func uintptrValue(addr *byte) uintptr { + return (uintptr)(unsafe.Pointer(addr)) +} + +// bluepillHandler is called from the signal stub. +// +// The world may be stopped while this is executing, and it executes on the +// signal stack. It should only execute raw system calls and functions that are +// explicitly marked go:nosplit. +// +//go:nosplit +func bluepillHandler(context unsafe.Pointer) { + // Sanitize the registers; interrupts must always be disabled. + c := bluepillArchEnter(bluepillArchContext(context)) + + // Increment the number of switches. + atomic.AddUint32(&c.switches, 1) + + // Mark this as guest mode. + switch atomic.SwapUint32(&c.state, vCPUGuest|vCPUUser) { + case vCPUUser: // Expected case. + case vCPUUser | vCPUWaiter: + c.notify() + default: + throw("invalid state") + } + + for { + switch _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0); errno { + case 0: // Expected case. + case syscall.EINTR: + // First, we process whatever pending signal + // interrupted KVM. Since we're in a signal handler + // currently, all signals are masked and the signal + // must have been delivered directly to this thread. + sig, _, errno := syscall.RawSyscall6( + syscall.SYS_RT_SIGTIMEDWAIT, + uintptr(unsafe.Pointer(&bounceSignalMask)), + 0, // siginfo. + 0, // timeout. + 8, // sigset size. + 0, 0) + if errno != 0 { + throw("error waiting for pending signal") + } + if sig != uintptr(bounceSignal) { + throw("unexpected signal") + } + + // Check whether the current state of the vCPU is ready + // for interrupt injection. Because we don't have a + // PIC, we can't inject an interrupt while they are + // masked. We need to request a window if it's not + // ready. + if c.runData.readyForInterruptInjection == 0 { + c.runData.requestInterruptWindow = 1 + continue // Rerun vCPU. + } else { + // Force injection below; the vCPU is ready. + c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN + } + case syscall.EFAULT: + // If a fault is not serviceable due to the host + // backing pages having page permissions, instead of an + // MMIO exit we receive EFAULT from the run ioctl. We + // always inject an NMI here since we may be in kernel + // mode and have interrupts disabled. + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_NMI, 0); errno != 0 { + throw("NMI injection failed") + } + continue // Rerun vCPU. + default: + throw("run failed") + } + + switch c.runData.exitReason { + case _KVM_EXIT_EXCEPTION: + c.die(bluepillArchContext(context), "exception") + return + case _KVM_EXIT_IO: + c.die(bluepillArchContext(context), "I/O") + return + case _KVM_EXIT_INTERNAL_ERROR: + // An internal error is typically thrown when emulation + // fails. This can occur via the MMIO path below (and + // it might fail because we have multiple regions that + // are not mapped). We would actually prefer that no + // emulation occur, and don't mind at all if it fails. + case _KVM_EXIT_HYPERCALL: + c.die(bluepillArchContext(context), "hypercall") + return + case _KVM_EXIT_DEBUG: + c.die(bluepillArchContext(context), "debug") + return + case _KVM_EXIT_HLT: + // Copy out registers. + bluepillArchExit(c, bluepillArchContext(context)) + + // Return to the vCPUReady state; notify any waiters. + user := atomic.LoadUint32(&c.state) & vCPUUser + switch atomic.SwapUint32(&c.state, user) { + case user | vCPUGuest: // Expected case. + case user | vCPUGuest | vCPUWaiter: + c.notify() + default: + throw("invalid state") + } + return + case _KVM_EXIT_MMIO: + // Increment the fault count. + atomic.AddUint32(&c.faults, 1) + + // For MMIO, the physical address is the first data item. + physical := uintptr(c.runData.data[0]) + virtual, ok := handleBluepillFault(c.machine, physical) + if !ok { + c.die(bluepillArchContext(context), "invalid physical address") + return + } + + // We now need to fill in the data appropriately. KVM + // expects us to provide the result of the given MMIO + // operation in the runData struct. This is safe + // because, if a fault occurs here, the same fault + // would have occurred in guest mode. The kernel should + // not create invalid page table mappings. + data := (*[8]byte)(unsafe.Pointer(&c.runData.data[1])) + length := (uintptr)((uint32)(c.runData.data[2])) + write := (uint8)(((c.runData.data[2] >> 32) & 0xff)) != 0 + for i := uintptr(0); i < length; i++ { + b := bytePtr(uintptr(virtual) + i) + if write { + // Write to the given address. + *b = data[i] + } else { + // Read from the given address. + data[i] = *b + } + } + case _KVM_EXIT_IRQ_WINDOW_OPEN: + // Interrupt: we must have requested an interrupt + // window; set the interrupt line. + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_INTERRUPT, + uintptr(unsafe.Pointer(&bounce))); errno != 0 { + throw("interrupt injection failed") + } + // Clear previous injection request. + c.runData.requestInterruptWindow = 0 + case _KVM_EXIT_SHUTDOWN: + c.die(bluepillArchContext(context), "shutdown") + return + case _KVM_EXIT_FAIL_ENTRY: + c.die(bluepillArchContext(context), "entry failed") + return + default: + c.die(bluepillArchContext(context), "unknown") + return + } + } +} diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go new file mode 100644 index 000000000..0eb0020f7 --- /dev/null +++ b/pkg/sentry/platform/kvm/context.go @@ -0,0 +1,87 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/interrupt" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// context is an implementation of the platform context. +// +// This is a thin wrapper around the machine. +type context struct { + // machine is the parent machine, and is immutable. + machine *machine + + // info is the arch.SignalInfo cached for this context. + info arch.SignalInfo + + // interrupt is the interrupt context. + interrupt interrupt.Forwarder +} + +// Switch runs the provided context in the given address space. +func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) { + localAS := as.(*addressSpace) + + // Grab a vCPU. + cpu := c.machine.Get() + + // Enable interrupts (i.e. calls to vCPU.Notify). + if !c.interrupt.Enable(cpu) { + c.machine.Put(cpu) // Already preempted. + return nil, usermem.NoAccess, platform.ErrContextInterrupt + } + + // Set the active address space. + // + // This must be done prior to the call to Touch below. If the address + // space is invalidated between this line and the call below, we will + // flag on entry anyways. When the active address space below is + // cleared, it indicates that we don't need an explicit interrupt and + // that the flush can occur naturally on the next user entry. + cpu.active.set(localAS) + + // Prepare switch options. + switchOpts := ring0.SwitchOpts{ + Registers: &ac.StateData().Regs, + FloatingPointState: (*byte)(ac.FloatingPointData()), + PageTables: localAS.pageTables, + Flush: localAS.Touch(cpu), + FullRestore: ac.FullRestore(), + } + + // Take the blue pill. + at, err := cpu.SwitchToUser(switchOpts, &c.info) + + // Clear the address space. + cpu.active.set(nil) + + // Release resources. + c.machine.Put(cpu) + + // All done. + c.interrupt.Disable() + return &c.info, at, err +} + +// Interrupt interrupts the running context. +func (c *context) Interrupt() { + c.interrupt.NotifyInterrupt() +} diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go new file mode 100644 index 000000000..ed0521c3f --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm.go @@ -0,0 +1,143 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kvm provides a kvm-based implementation of the platform interface. +package kvm + +import ( + "fmt" + "os" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// KVM represents a lightweight VM context. +type KVM struct { + platform.NoCPUPreemptionDetection + + // machine is the backing VM. + machine *machine +} + +var ( + globalOnce sync.Once + globalErr error +) + +// OpenDevice opens the KVM device at /dev/kvm and returns the File. +func OpenDevice() (*os.File, error) { + f, err := os.OpenFile("/dev/kvm", syscall.O_RDWR, 0) + if err != nil { + return nil, fmt.Errorf("error opening /dev/kvm: %v", err) + } + return f, nil +} + +// New returns a new KVM-based implementation of the platform interface. +func New(deviceFile *os.File) (*KVM, error) { + fd := deviceFile.Fd() + + // Ensure global initialization is done. + globalOnce.Do(func() { + physicalInit() + globalErr = updateSystemValues(int(fd)) + ring0.Init(cpuid.HostFeatureSet()) + }) + if globalErr != nil { + return nil, globalErr + } + + // Create a new VM fd. + vm, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, fd, _KVM_CREATE_VM, 0) + if errno != 0 { + return nil, fmt.Errorf("creating VM: %v", errno) + } + // We are done with the device file. + deviceFile.Close() + + // Create a VM context. + machine, err := newMachine(int(vm)) + if err != nil { + return nil, err + } + + // All set. + return &KVM{ + machine: machine, + }, nil +} + +// SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO. +func (*KVM) SupportsAddressSpaceIO() bool { + return false +} + +// CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace. +func (*KVM) CooperativelySchedulesAddressSpace() bool { + return false +} + +// MapUnit implements platform.Platform.MapUnit. +func (*KVM) MapUnit() uint64 { + // We greedily creates PTEs in MapFile, so extremely large mappings can + // be expensive. Not _that_ expensive since we allow super pages, but + // even though can get out of hand if you're creating multi-terabyte + // mappings. For this reason, we limit mappings to an arbitrary 16MB. + return 16 << 20 +} + +// MinUserAddress returns the lowest available address. +func (*KVM) MinUserAddress() usermem.Addr { + return usermem.PageSize +} + +// MaxUserAddress returns the first address that may not be used. +func (*KVM) MaxUserAddress() usermem.Addr { + return usermem.Addr(ring0.MaximumUserAddress) +} + +// NewAddressSpace returns a new pagetable root. +func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) { + // Allocate page tables and install system mappings. + pageTables := pagetables.New(newAllocator()) + applyPhysicalRegions(func(pr physicalRegion) bool { + // Map the kernel in the upper half. + pageTables.Map( + usermem.Addr(ring0.KernelStartAddress|pr.virtual), + pr.length, + pagetables.MapOpts{AccessType: usermem.AnyAccess}, + pr.physical) + return true // Keep iterating. + }) + + // Return the new address space. + return &addressSpace{ + machine: k.machine, + pageTables: pageTables, + dirtySet: k.machine.newDirtySet(), + }, nil, nil +} + +// NewContext returns an interruptible context. +func (k *KVM) NewContext() platform.Context { + return &context{ + machine: k.machine, + } +} diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go new file mode 100644 index 000000000..61493ccaf --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_amd64.go @@ -0,0 +1,213 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" +) + +// userMemoryRegion is a region of physical memory. +// +// This mirrors kvm_memory_region. +type userMemoryRegion struct { + slot uint32 + flags uint32 + guestPhysAddr uint64 + memorySize uint64 + userspaceAddr uint64 +} + +// userRegs represents KVM user registers. +// +// This mirrors kvm_regs. +type userRegs struct { + RAX uint64 + RBX uint64 + RCX uint64 + RDX uint64 + RSI uint64 + RDI uint64 + RSP uint64 + RBP uint64 + R8 uint64 + R9 uint64 + R10 uint64 + R11 uint64 + R12 uint64 + R13 uint64 + R14 uint64 + R15 uint64 + RIP uint64 + RFLAGS uint64 +} + +// systemRegs represents KVM system registers. +// +// This mirrors kvm_sregs. +type systemRegs struct { + CS segment + DS segment + ES segment + FS segment + GS segment + SS segment + TR segment + LDT segment + GDT descriptor + IDT descriptor + CR0 uint64 + CR2 uint64 + CR3 uint64 + CR4 uint64 + CR8 uint64 + EFER uint64 + apicBase uint64 + interruptBitmap [(_KVM_NR_INTERRUPTS + 63) / 64]uint64 +} + +// segment is the expanded form of a segment register. +// +// This mirrors kvm_segment. +type segment struct { + base uint64 + limit uint32 + selector uint16 + typ uint8 + present uint8 + DPL uint8 + DB uint8 + S uint8 + L uint8 + G uint8 + AVL uint8 + unusable uint8 + _ uint8 +} + +// Clear clears the segment and marks it unusable. +func (s *segment) Clear() { + *s = segment{unusable: 1} +} + +// selector is a segment selector. +type selector uint16 + +// tobool is a simple helper. +func tobool(x ring0.SegmentDescriptorFlags) uint8 { + if x != 0 { + return 1 + } + return 0 +} + +// Load loads the segment described by d into the segment s. +// +// The argument sel is recorded as the segment selector index. +func (s *segment) Load(d *ring0.SegmentDescriptor, sel ring0.Selector) { + flag := d.Flags() + if flag&ring0.SegmentDescriptorPresent == 0 { + s.Clear() + return + } + s.base = uint64(d.Base()) + s.limit = d.Limit() + s.typ = uint8((flag>>8)&0xF) | 1 + s.S = tobool(flag & ring0.SegmentDescriptorSystem) + s.DPL = uint8(d.DPL()) + s.present = tobool(flag & ring0.SegmentDescriptorPresent) + s.AVL = tobool(flag & ring0.SegmentDescriptorAVL) + s.L = tobool(flag & ring0.SegmentDescriptorLong) + s.DB = tobool(flag & ring0.SegmentDescriptorDB) + s.G = tobool(flag & ring0.SegmentDescriptorG) + if s.L != 0 { + s.limit = 0xffffffff + } + s.unusable = 0 + s.selector = uint16(sel) +} + +// descriptor describes a region of physical memory. +// +// It corresponds to the pseudo-descriptor used in the x86 LGDT and LIDT +// instructions, and mirrors kvm_dtable. +type descriptor struct { + base uint64 + limit uint16 + _ [3]uint16 +} + +// modelControlRegister is an MSR entry. +// +// This mirrors kvm_msr_entry. +type modelControlRegister struct { + index uint32 + _ uint32 + data uint64 +} + +// modelControlRegisers is a collection of MSRs. +// +// This mirrors kvm_msrs. +type modelControlRegisters struct { + nmsrs uint32 + _ uint32 + entries [16]modelControlRegister +} + +// runData is the run structure. This may be mapped for synchronous register +// access (although that doesn't appear to be supported by my kernel at least). +// +// This mirrors kvm_run. +type runData struct { + requestInterruptWindow uint8 + _ [7]uint8 + + exitReason uint32 + readyForInterruptInjection uint8 + ifFlag uint8 + _ [2]uint8 + + cr8 uint64 + apicBase uint64 + + // This is the union data for exits. Interpretation depends entirely on + // the exitReason above (see vCPU code for more information). + data [32]uint64 +} + +// cpuidEntry is a single CPUID entry. +// +// This mirrors kvm_cpuid_entry2. +type cpuidEntry struct { + function uint32 + index uint32 + flags uint32 + eax uint32 + ebx uint32 + ecx uint32 + edx uint32 + _ [3]uint32 +} + +// cpuidEntries is a collection of CPUID entries. +// +// This mirrors kvm_cpuid2. +type cpuidEntries struct { + nr uint32 + _ uint32 + entries [_KVM_NR_CPUID_ENTRIES]cpuidEntry +} diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go new file mode 100644 index 000000000..46c4b9113 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go @@ -0,0 +1,77 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "fmt" + "syscall" + "unsafe" +) + +var ( + runDataSize int + hasGuestPCID bool + cpuidSupported = cpuidEntries{nr: _KVM_NR_CPUID_ENTRIES} +) + +func updateSystemValues(fd int) error { + // Extract the mmap size. + sz, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_GET_VCPU_MMAP_SIZE, 0) + if errno != 0 { + return fmt.Errorf("getting VCPU mmap size: %v", errno) + } + + // Save the data. + runDataSize = int(sz) + + // Must do the dance to figure out the number of entries. + _, _, errno = syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(fd), + _KVM_GET_SUPPORTED_CPUID, + uintptr(unsafe.Pointer(&cpuidSupported))) + if errno != 0 && errno != syscall.ENOMEM { + // Some other error occurred. + return fmt.Errorf("getting supported CPUID: %v", errno) + } + + // The number should now be correct. + _, _, errno = syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(fd), + _KVM_GET_SUPPORTED_CPUID, + uintptr(unsafe.Pointer(&cpuidSupported))) + if errno != 0 { + // Didn't work with the right number. + return fmt.Errorf("getting supported CPUID (2nd attempt): %v", errno) + } + + // Calculate whether guestPCID is supported. + // + // FIXME(ascannell): These should go through the much more pleasant + // cpuid package interfaces, once a way to accept raw kvm CPUID entries + // is plumbed (or some rough equivalent). + for i := 0; i < int(cpuidSupported.nr); i++ { + entry := cpuidSupported.entries[i] + if entry.function == 1 && entry.index == 0 && entry.ecx&(1<<17) != 0 { + hasGuestPCID = true // Found matching PCID in guest feature set. + } + } + + // Success. + return nil +} diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go new file mode 100644 index 000000000..d05f05c29 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -0,0 +1,64 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +// KVM ioctls. +// +// Only the ioctls we need in Go appear here; some additional ioctls are used +// within the assembly stubs (KVM_INTERRUPT, etc.). +const ( + _KVM_CREATE_VM = 0xae01 + _KVM_GET_VCPU_MMAP_SIZE = 0xae04 + _KVM_CREATE_VCPU = 0xae41 + _KVM_SET_TSS_ADDR = 0xae47 + _KVM_RUN = 0xae80 + _KVM_NMI = 0xae9a + _KVM_CHECK_EXTENSION = 0xae03 + _KVM_INTERRUPT = 0x4004ae86 + _KVM_SET_MSRS = 0x4008ae89 + _KVM_SET_USER_MEMORY_REGION = 0x4020ae46 + _KVM_SET_REGS = 0x4090ae82 + _KVM_SET_SREGS = 0x4138ae84 + _KVM_GET_REGS = 0x8090ae81 + _KVM_GET_SUPPORTED_CPUID = 0xc008ae05 + _KVM_SET_CPUID2 = 0x4008ae90 + _KVM_SET_SIGNAL_MASK = 0x4004ae8b +) + +// KVM exit reasons. +const ( + _KVM_EXIT_EXCEPTION = 0x1 + _KVM_EXIT_IO = 0x2 + _KVM_EXIT_HYPERCALL = 0x3 + _KVM_EXIT_DEBUG = 0x4 + _KVM_EXIT_HLT = 0x5 + _KVM_EXIT_MMIO = 0x6 + _KVM_EXIT_IRQ_WINDOW_OPEN = 0x7 + _KVM_EXIT_SHUTDOWN = 0x8 + _KVM_EXIT_FAIL_ENTRY = 0x9 + _KVM_EXIT_INTERNAL_ERROR = 0x11 +) + +// KVM capability options. +const ( + _KVM_CAP_MAX_VCPUS = 0x42 +) + +// KVM limits. +const ( + _KVM_NR_VCPUS = 0xff + _KVM_NR_INTERRUPTS = 0x100 + _KVM_NR_CPUID_ENTRIES = 0x100 +) diff --git a/pkg/sentry/platform/kvm/kvm_state_autogen.go b/pkg/sentry/platform/kvm/kvm_state_autogen.go new file mode 100755 index 000000000..5ab0e0735 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_state_autogen.go @@ -0,0 +1,4 @@ +// automatically generated by stateify. + +package kvm + diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go new file mode 100644 index 000000000..f5953b96e --- /dev/null +++ b/pkg/sentry/platform/kvm/machine.go @@ -0,0 +1,525 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "fmt" + "runtime" + "sync" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/atomicbitops" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// machine contains state associated with the VM as a whole. +type machine struct { + // fd is the vm fd. + fd int + + // nextSlot is the next slot for setMemoryRegion. + // + // This must be accessed atomically. If nextSlot is ^uint32(0), then + // slots are currently being updated, and the caller should retry. + nextSlot uint32 + + // kernel is the set of global structures. + kernel ring0.Kernel + + // mappingCache is used for mapPhysical. + mappingCache sync.Map + + // mu protects vCPUs. + mu sync.RWMutex + + // available is notified when vCPUs are available. + available sync.Cond + + // vCPUs are the machine vCPUs. + // + // These are populated dynamically. + vCPUs map[uint64]*vCPU + + // vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID. + vCPUsByID map[int]*vCPU + + // maxVCPUs is the maximum number of vCPUs supported by the machine. + maxVCPUs int +} + +const ( + // vCPUReady is an alias for all the below clear. + vCPUReady uint32 = 0 + + // vCPUser indicates that the vCPU is in or about to enter user mode. + vCPUUser uint32 = 1 << 0 + + // vCPUGuest indicates the vCPU is in guest mode. + vCPUGuest uint32 = 1 << 1 + + // vCPUWaiter indicates that there is a waiter. + // + // If this is set, then notify must be called on any state transitions. + vCPUWaiter uint32 = 1 << 2 +) + +// vCPU is a single KVM vCPU. +type vCPU struct { + // CPU is the kernel CPU data. + // + // This must be the first element of this structure, it is referenced + // by the bluepill code (see bluepill_amd64.s). + ring0.CPU + + // id is the vCPU id. + id int + + // fd is the vCPU fd. + fd int + + // tid is the last set tid. + tid uint64 + + // switches is a count of world switches (informational only). + switches uint32 + + // faults is a count of world faults (informational only). + faults uint32 + + // state is the vCPU state. + // + // This is a bitmask of the three fields (vCPU*) described above. + state uint32 + + // runData for this vCPU. + runData *runData + + // machine associated with this vCPU. + machine *machine + + // active is the current addressSpace: this is set and read atomically, + // it is used to elide unnecessary interrupts due to invalidations. + active atomicAddressSpace + + // vCPUArchState is the architecture-specific state. + vCPUArchState + + dieState dieState +} + +type dieState struct { + // message is thrown from die. + message string + + // guestRegs is used to store register state during vCPU.die() to prevent + // allocation inside nosplit function. + guestRegs userRegs +} + +// newVCPU creates a returns a new vCPU. +// +// Precondtion: mu must be held. +func (m *machine) newVCPU() *vCPU { + id := len(m.vCPUs) + + // Create the vCPU. + fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id)) + if errno != 0 { + panic(fmt.Sprintf("error creating new vCPU: %v", errno)) + } + + c := &vCPU{ + id: id, + fd: int(fd), + machine: m, + } + c.CPU.Init(&m.kernel, c) + m.vCPUsByID[c.id] = c + + // Ensure the signal mask is correct. + if err := c.setSignalMask(); err != nil { + panic(fmt.Sprintf("error setting signal mask: %v", err)) + } + + // Map the run data. + runData, err := mapRunData(int(fd)) + if err != nil { + panic(fmt.Sprintf("error mapping run data: %v", err)) + } + c.runData = runData + + // Initialize architecture state. + if err := c.initArchState(); err != nil { + panic(fmt.Sprintf("error initialization vCPU state: %v", err)) + } + + return c // Done. +} + +// newMachine returns a new VM context. +func newMachine(vm int) (*machine, error) { + // Create the machine. + m := &machine{ + fd: vm, + vCPUs: make(map[uint64]*vCPU), + vCPUsByID: make(map[int]*vCPU), + } + m.available.L = &m.mu + m.kernel.Init(ring0.KernelOpts{ + PageTables: pagetables.New(newAllocator()), + }) + + maxVCPUs, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS) + if errno != 0 { + m.maxVCPUs = _KVM_NR_VCPUS + } else { + m.maxVCPUs = int(maxVCPUs) + } + log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs) + + // Apply the physical mappings. Note that these mappings may point to + // guest physical addresses that are not actually available. These + // physical pages are mapped on demand, see kernel_unsafe.go. + applyPhysicalRegions(func(pr physicalRegion) bool { + // Map everything in the lower half. + m.kernel.PageTables.Map( + usermem.Addr(pr.virtual), + pr.length, + pagetables.MapOpts{AccessType: usermem.AnyAccess}, + pr.physical) + + // And keep everything in the upper half. + m.kernel.PageTables.Map( + usermem.Addr(ring0.KernelStartAddress|pr.virtual), + pr.length, + pagetables.MapOpts{AccessType: usermem.AnyAccess}, + pr.physical) + + return true // Keep iterating. + }) + + // Ensure that the currently mapped virtual regions are actually + // available in the VM. Note that this doesn't guarantee no future + // faults, however it should guarantee that everything is available to + // ensure successful vCPU entry. + applyVirtualRegions(func(vr virtualRegion) { + if excludeVirtualRegion(vr) { + return // skip region. + } + for virtual := vr.virtual; virtual < vr.virtual+vr.length; { + physical, length, ok := translateToPhysical(virtual) + if !ok { + // This must be an invalid region that was + // knocked out by creation of the physical map. + return + } + if virtual+length > vr.virtual+vr.length { + // Cap the length to the end of the area. + length = vr.virtual + vr.length - virtual + } + + // Ensure the physical range is mapped. + m.mapPhysical(physical, length) + virtual += length + } + }) + + // Initialize architecture state. + if err := m.initArchState(); err != nil { + m.Destroy() + return nil, err + } + + // Ensure the machine is cleaned up properly. + runtime.SetFinalizer(m, (*machine).Destroy) + return m, nil +} + +// mapPhysical checks for the mapping of a physical range, and installs one if +// not available. This attempts to be efficient for calls in the hot path. +// +// This panics on error. +func (m *machine) mapPhysical(physical, length uintptr) { + for end := physical + length; physical < end; { + _, physicalStart, length, ok := calculateBluepillFault(physical) + if !ok { + // Should never happen. + panic("mapPhysical on unknown physical address") + } + + if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok { + // Not present in the cache; requires setting the slot. + if _, ok := handleBluepillFault(m, physical); !ok { + panic("handleBluepillFault failed") + } + } + + // Move to the next chunk. + physical = physicalStart + length + } +} + +// Destroy frees associated resources. +// +// Destroy should only be called once all active users of the machine are gone. +// The machine object should not be used after calling Destroy. +// +// Precondition: all vCPUs must be returned to the machine. +func (m *machine) Destroy() { + runtime.SetFinalizer(m, nil) + + // Destroy vCPUs. + for _, c := range m.vCPUs { + // Ensure the vCPU is not still running in guest mode. This is + // possible iff teardown has been done by other threads, and + // somehow a single thread has not executed any system calls. + c.BounceToHost() + + // Note that the runData may not be mapped if an error occurs + // during the middle of initialization. + if c.runData != nil { + if err := unmapRunData(c.runData); err != nil { + panic(fmt.Sprintf("error unmapping rundata: %v", err)) + } + } + if err := syscall.Close(int(c.fd)); err != nil { + panic(fmt.Sprintf("error closing vCPU fd: %v", err)) + } + } + + // vCPUs are gone: teardown machine state. + if err := syscall.Close(m.fd); err != nil { + panic(fmt.Sprintf("error closing VM fd: %v", err)) + } +} + +// Get gets an available vCPU. +func (m *machine) Get() *vCPU { + runtime.LockOSThread() + tid := procid.Current() + m.mu.RLock() + + // Check for an exact match. + if c := m.vCPUs[tid]; c != nil { + c.lock() + m.mu.RUnlock() + return c + } + + // The happy path failed. We now proceed to acquire an exclusive lock + // (because the vCPU map may change), and scan all available vCPUs. + m.mu.RUnlock() + m.mu.Lock() + + for { + // Scan for an available vCPU. + for origTID, c := range m.vCPUs { + if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) { + delete(m.vCPUs, origTID) + m.vCPUs[tid] = c + m.mu.Unlock() + c.loadSegments(tid) + return c + } + } + + // Create a new vCPU (maybe). + if len(m.vCPUs) < m.maxVCPUs { + c := m.newVCPU() + c.lock() + m.vCPUs[tid] = c + m.mu.Unlock() + c.loadSegments(tid) + return c + } + + // Scan for something not in user mode. + for origTID, c := range m.vCPUs { + if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) { + continue + } + + // The vCPU is not be able to transition to + // vCPUGuest|vCPUUser or to vCPUUser because that + // transition requires holding the machine mutex, as we + // do now. There is no path to register a waiter on + // just the vCPUReady state. + for { + c.waitUntilNot(vCPUGuest | vCPUWaiter) + if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) { + break + } + } + + // Steal the vCPU. + delete(m.vCPUs, origTID) + m.vCPUs[tid] = c + m.mu.Unlock() + c.loadSegments(tid) + return c + } + + // Everything is executing in user mode. Wait until something + // is available. Note that signaling the condition variable + // will have the extra effect of kicking the vCPUs out of guest + // mode if that's where they were. + m.available.Wait() + } +} + +// Put puts the current vCPU. +func (m *machine) Put(c *vCPU) { + c.unlock() + runtime.UnlockOSThread() + m.available.Signal() +} + +// newDirtySet returns a new dirty set. +func (m *machine) newDirtySet() *dirtySet { + return &dirtySet{ + vCPUs: make([]uint64, (m.maxVCPUs+63)/64, (m.maxVCPUs+63)/64), + } +} + +// lock marks the vCPU as in user mode. +// +// This should only be called directly when known to be safe, i.e. when +// the vCPU is owned by the current TID with no chance of theft. +// +//go:nosplit +func (c *vCPU) lock() { + atomicbitops.OrUint32(&c.state, vCPUUser) +} + +// unlock clears the vCPUUser bit. +// +//go:nosplit +func (c *vCPU) unlock() { + if atomic.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest) { + // Happy path: no exits are forced, and we can continue + // executing on our merry way with a single atomic access. + return + } + + // Clear the lock. + origState := atomic.LoadUint32(&c.state) + atomicbitops.AndUint32(&c.state, ^vCPUUser) + switch origState { + case vCPUUser: + // Normal state. + case vCPUUser | vCPUGuest | vCPUWaiter: + // Force a transition: this must trigger a notification when we + // return from guest mode. + c.notify() + case vCPUUser | vCPUWaiter: + // Waiting for the lock to be released; the responsibility is + // on us to notify the waiter and clear the associated bit. + atomicbitops.AndUint32(&c.state, ^vCPUWaiter) + c.notify() + default: + panic("invalid state") + } +} + +// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. +// +//go:nosplit +func (c *vCPU) NotifyInterrupt() { + c.BounceToKernel() +} + +// pid is used below in bounce. +var pid = syscall.Getpid() + +// bounce forces a return to the kernel or to host mode. +// +// This effectively unwinds the state machine. +func (c *vCPU) bounce(forceGuestExit bool) { + for { + switch state := atomic.LoadUint32(&c.state); state { + case vCPUReady, vCPUWaiter: + // There is nothing to be done, we're already in the + // kernel pre-acquisition. The Bounce criteria have + // been satisfied. + return + case vCPUUser: + // We need to register a waiter for the actual guest + // transition. When the transition takes place, then we + // can inject an interrupt to ensure a return to host + // mode. + atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter) + case vCPUUser | vCPUWaiter: + // Wait for the transition to guest mode. This should + // come from the bluepill handler. + c.waitUntilNot(state) + case vCPUGuest, vCPUUser | vCPUGuest: + if state == vCPUGuest && !forceGuestExit { + // The vCPU is already not acquired, so there's + // no need to do a fresh injection here. + return + } + // The vCPU is in user or kernel mode. Attempt to + // register a notification on change. + if !atomic.CompareAndSwapUint32(&c.state, state, state|vCPUWaiter) { + break // Retry. + } + for { + // We need to spin here until the signal is + // delivered, because Tgkill can return EAGAIN + // under memory pressure. Since we already + // marked ourselves as a waiter, we need to + // ensure that a signal is actually delivered. + if err := syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal); err == nil { + break + } else if err.(syscall.Errno) == syscall.EAGAIN { + continue + } else { + // Nothing else should be returned by tgkill. + panic(fmt.Sprintf("unexpected tgkill error: %v", err)) + } + } + case vCPUGuest | vCPUWaiter, vCPUUser | vCPUGuest | vCPUWaiter: + if state == vCPUGuest|vCPUWaiter && !forceGuestExit { + // See above. + return + } + // Wait for the transition. This again should happen + // from the bluepill handler, but on the way out. + c.waitUntilNot(state) + default: + // Should not happen: the above is exhaustive. + panic("invalid state") + } + } +} + +// BounceToKernel ensures that the vCPU bounces back to the kernel. +// +//go:nosplit +func (c *vCPU) BounceToKernel() { + c.bounce(false) +} + +// BounceToHost ensures that the vCPU is in host mode. +// +//go:nosplit +func (c *vCPU) BounceToHost() { + c.bounce(true) +} diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go new file mode 100644 index 000000000..b6821122a --- /dev/null +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -0,0 +1,357 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "fmt" + "reflect" + "runtime/debug" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// initArchState initializes architecture-specific state. +func (m *machine) initArchState() error { + // Set the legacy TSS address. This address is covered by the reserved + // range (up to 4GB). In fact, this is a main reason it exists. + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(m.fd), + _KVM_SET_TSS_ADDR, + uintptr(reservedMemory-(3*usermem.PageSize))); errno != 0 { + return errno + } + + // Enable CPUID faulting, if possible. Note that this also serves as a + // basic platform sanity tests, since we will enter guest mode for the + // first time here. The recovery is necessary, since if we fail to read + // the platform info register, we will retry to host mode and + // ultimately need to handle a segmentation fault. + old := debug.SetPanicOnFault(true) + defer func() { + recover() + debug.SetPanicOnFault(old) + }() + m.retryInGuest(func() { + ring0.SetCPUIDFaulting(true) + }) + + return nil +} + +type vCPUArchState struct { + // PCIDs is the set of PCIDs for this vCPU. + // + // This starts above fixedKernelPCID. + PCIDs *pagetables.PCIDs + + // floatingPointState is the floating point state buffer used in guest + // to host transitions. See usage in bluepill_amd64.go. + floatingPointState *arch.FloatingPointData +} + +const ( + // fixedKernelPCID is a fixed kernel PCID used for the kernel page + // tables. We must start allocating user PCIDs above this in order to + // avoid any conflict (see below). + fixedKernelPCID = 1 + + // poolPCIDs is the number of PCIDs to record in the database. As this + // grows, assignment can take longer, since it is a simple linear scan. + // Beyond a relatively small number, there are likely few perform + // benefits, since the TLB has likely long since lost any translations + // from more than a few PCIDs past. + poolPCIDs = 8 +) + +// dropPageTables drops cached page table entries. +func (m *machine) dropPageTables(pt *pagetables.PageTables) { + m.mu.Lock() + defer m.mu.Unlock() + + // Clear from all PCIDs. + for _, c := range m.vCPUs { + c.PCIDs.Drop(pt) + } +} + +// initArchState initializes architecture-specific state. +func (c *vCPU) initArchState() error { + var ( + kernelSystemRegs systemRegs + kernelUserRegs userRegs + ) + + // Set base control registers. + kernelSystemRegs.CR0 = c.CR0() + kernelSystemRegs.CR4 = c.CR4() + kernelSystemRegs.EFER = c.EFER() + + // Set the IDT & GDT in the registers. + kernelSystemRegs.IDT.base, kernelSystemRegs.IDT.limit = c.IDT() + kernelSystemRegs.GDT.base, kernelSystemRegs.GDT.limit = c.GDT() + kernelSystemRegs.CS.Load(&ring0.KernelCodeSegment, ring0.Kcode) + kernelSystemRegs.DS.Load(&ring0.UserDataSegment, ring0.Udata) + kernelSystemRegs.ES.Load(&ring0.UserDataSegment, ring0.Udata) + kernelSystemRegs.SS.Load(&ring0.KernelDataSegment, ring0.Kdata) + kernelSystemRegs.FS.Load(&ring0.UserDataSegment, ring0.Udata) + kernelSystemRegs.GS.Load(&ring0.UserDataSegment, ring0.Udata) + tssBase, tssLimit, tss := c.TSS() + kernelSystemRegs.TR.Load(tss, ring0.Tss) + kernelSystemRegs.TR.base = tssBase + kernelSystemRegs.TR.limit = uint32(tssLimit) + + // Point to kernel page tables, with no initial PCID. + kernelSystemRegs.CR3 = c.machine.kernel.PageTables.CR3(false, 0) + + // Initialize the PCID database. + if hasGuestPCID { + // Note that NewPCIDs may return a nil table here, in which + // case we simply don't use PCID support (see below). In + // practice, this should not happen, however. + c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs) + } + + // Set the CPUID; this is required before setting system registers, + // since KVM will reject several CR4 bits if the CPUID does not + // indicate the support is available. + if err := c.setCPUID(); err != nil { + return err + } + + // Set the entrypoint for the kernel. + kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer()) + kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer()) + kernelUserRegs.RFLAGS = ring0.KernelFlagsSet + + // Set the system registers. + if err := c.setSystemRegisters(&kernelSystemRegs); err != nil { + return err + } + + // Set the user registers. + if err := c.setUserRegisters(&kernelUserRegs); err != nil { + return err + } + + // Allocate some floating point state save area for the local vCPU. + // This will be saved prior to leaving the guest, and we restore from + // this always. We cannot use the pointer in the context alone because + // we don't know how large the area there is in reality. + c.floatingPointState = arch.NewFloatingPointData() + + // Set the time offset to the host native time. + return c.setSystemTime() +} + +// nonCanonical generates a canonical address return. +// +//go:nosplit +func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.AccessType, error) { + *info = arch.SignalInfo{ + Signo: signal, + Code: arch.SignalInfoKernel, + } + info.SetAddr(addr) // Include address. + return usermem.NoAccess, platform.ErrContextSignal +} + +// fault generates an appropriate fault return. +// +//go:nosplit +func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) { + bluepill(c) // Probably no-op, but may not be. + faultAddr := ring0.ReadCR2() + code, user := c.ErrorCode() + if !user { + // The last fault serviced by this CPU was not a user + // fault, so we can't reliably trust the faultAddr or + // the code provided here. We need to re-execute. + return usermem.NoAccess, platform.ErrContextInterrupt + } + // Reset the pointed SignalInfo. + *info = arch.SignalInfo{Signo: signal} + info.SetAddr(uint64(faultAddr)) + accessType := usermem.AccessType{ + Read: code&(1<<1) == 0, + Write: code&(1<<1) != 0, + Execute: code&(1<<4) != 0, + } + if !accessType.Write && !accessType.Execute { + info.Code = 1 // SEGV_MAPERR. + } else { + info.Code = 2 // SEGV_ACCERR. + } + return accessType, platform.ErrContextSignal +} + +// SwitchToUser unpacks architectural-details. +func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) { + // Check for canonical addresses. + if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Rip) { + return nonCanonical(regs.Rip, int32(syscall.SIGSEGV), info) + } else if !ring0.IsCanonical(regs.Rsp) { + return nonCanonical(regs.Rsp, int32(syscall.SIGBUS), info) + } else if !ring0.IsCanonical(regs.Fs_base) { + return nonCanonical(regs.Fs_base, int32(syscall.SIGBUS), info) + } else if !ring0.IsCanonical(regs.Gs_base) { + return nonCanonical(regs.Gs_base, int32(syscall.SIGBUS), info) + } + + // Assign PCIDs. + if c.PCIDs != nil { + var requireFlushPCID bool // Force a flush? + switchOpts.UserPCID, requireFlushPCID = c.PCIDs.Assign(switchOpts.PageTables) + switchOpts.KernelPCID = fixedKernelPCID + switchOpts.Flush = switchOpts.Flush || requireFlushPCID + } + + // See below. + var vector ring0.Vector + + // Past this point, stack growth can cause system calls (and a break + // from guest mode). So we need to ensure that between the bluepill + // call here and the switch call immediately below, no additional + // allocations occur. + entersyscall() + bluepill(c) + vector = c.CPU.SwitchToUser(switchOpts) + exitsyscall() + + switch vector { + case ring0.Syscall, ring0.SyscallInt80: + // Fast path: system call executed. + return usermem.NoAccess, nil + + case ring0.PageFault: + return c.fault(int32(syscall.SIGSEGV), info) + + case ring0.Debug, ring0.Breakpoint: + *info = arch.SignalInfo{ + Signo: int32(syscall.SIGTRAP), + Code: 1, // TRAP_BRKPT (breakpoint). + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. + return usermem.AccessType{}, platform.ErrContextSignal + + case ring0.GeneralProtectionFault, + ring0.SegmentNotPresent, + ring0.BoundRangeExceeded, + ring0.InvalidTSS, + ring0.StackSegmentFault: + *info = arch.SignalInfo{ + Signo: int32(syscall.SIGSEGV), + Code: arch.SignalInfoKernel, + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. + if vector == ring0.GeneralProtectionFault { + // When CPUID faulting is enabled, we will generate a #GP(0) when + // userspace executes a CPUID instruction. This is handled above, + // because we need to be able to map and read user memory. + return usermem.AccessType{}, platform.ErrContextSignalCPUID + } + return usermem.AccessType{}, platform.ErrContextSignal + + case ring0.InvalidOpcode: + *info = arch.SignalInfo{ + Signo: int32(syscall.SIGILL), + Code: 1, // ILL_ILLOPC (illegal opcode). + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. + return usermem.AccessType{}, platform.ErrContextSignal + + case ring0.DivideByZero: + *info = arch.SignalInfo{ + Signo: int32(syscall.SIGFPE), + Code: 1, // FPE_INTDIV (divide by zero). + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. + return usermem.AccessType{}, platform.ErrContextSignal + + case ring0.Overflow: + *info = arch.SignalInfo{ + Signo: int32(syscall.SIGFPE), + Code: 2, // FPE_INTOVF (integer overflow). + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. + return usermem.AccessType{}, platform.ErrContextSignal + + case ring0.X87FloatingPointException, + ring0.SIMDFloatingPointException: + *info = arch.SignalInfo{ + Signo: int32(syscall.SIGFPE), + Code: 7, // FPE_FLTINV (invalid operation). + } + info.SetAddr(switchOpts.Registers.Rip) // Include address. + return usermem.AccessType{}, platform.ErrContextSignal + + case ring0.Vector(bounce): // ring0.VirtualizationException + return usermem.NoAccess, platform.ErrContextInterrupt + + case ring0.AlignmentCheck: + *info = arch.SignalInfo{ + Signo: int32(syscall.SIGBUS), + Code: 2, // BUS_ADRERR (physical address does not exist). + } + return usermem.NoAccess, platform.ErrContextSignal + + case ring0.NMI: + // An NMI is generated only when a fault is not servicable by + // KVM itself, so we think some mapping is writeable but it's + // really not. This could happen, e.g. if some file is + // truncated (and would generate a SIGBUS) and we map it + // directly into the instance. + return c.fault(int32(syscall.SIGBUS), info) + + case ring0.DeviceNotAvailable, + ring0.DoubleFault, + ring0.CoprocessorSegmentOverrun, + ring0.MachineCheck, + ring0.SecurityException: + fallthrough + default: + panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) + } +} + +// retryInGuest runs the given function in guest mode. +// +// If the function does not complete in guest mode (due to execution of a +// system call due to a GC stall, for example), then it will be retried. The +// given function must be idempotent as a result of the retry mechanism. +func (m *machine) retryInGuest(fn func()) { + c := m.Get() + defer m.Put(c) + for { + c.ClearErrorCode() // See below. + bluepill(c) // Force guest mode. + fn() // Execute the given function. + _, user := c.ErrorCode() + if user { + // If user is set, then we haven't bailed back to host + // mode via a kernel exception or system call. We + // consider the full function to have executed in guest + // mode and we can return. + break + } + } +} diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go new file mode 100644 index 000000000..06a2e3b0c --- /dev/null +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -0,0 +1,161 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "fmt" + "sync/atomic" + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/time" +) + +// setMemoryRegion initializes a region. +// +// This may be called from bluepillHandler, and therefore returns an errno +// directly (instead of wrapping in an error) to avoid allocations. +// +//go:nosplit +func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno { + userRegion := userMemoryRegion{ + slot: uint32(slot), + flags: 0, + guestPhysAddr: uint64(physical), + memorySize: uint64(length), + userspaceAddr: uint64(virtual), + } + + // Set the region. + _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(m.fd), + _KVM_SET_USER_MEMORY_REGION, + uintptr(unsafe.Pointer(&userRegion))) + return errno +} + +// loadSegments copies the current segments. +// +// This may be called from within the signal context and throws on error. +// +//go:nosplit +func (c *vCPU) loadSegments(tid uint64) { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_ARCH_PRCTL, + linux.ARCH_GET_FS, + uintptr(unsafe.Pointer(&c.CPU.Registers().Fs_base)), + 0); errno != 0 { + throw("getting FS segment") + } + if _, _, errno := syscall.RawSyscall( + syscall.SYS_ARCH_PRCTL, + linux.ARCH_GET_GS, + uintptr(unsafe.Pointer(&c.CPU.Registers().Gs_base)), + 0); errno != 0 { + throw("getting GS segment") + } + atomic.StoreUint64(&c.tid, tid) +} + +// setCPUID sets the CPUID to be used by the guest. +func (c *vCPU) setCPUID() error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_CPUID2, + uintptr(unsafe.Pointer(&cpuidSupported))); errno != 0 { + return fmt.Errorf("error setting CPUID: %v", errno) + } + return nil +} + +// setSystemTime sets the TSC for the vCPU. +// +// This has to make the call many times in order to minimize the intrinstic +// error in the offset. Unfortunately KVM does not expose a relative offset via +// the API, so this is an approximation. We do this via an iterative algorithm. +// This has the advantage that it can generally deal with highly variable +// system call times and should converge on the correct offset. +func (c *vCPU) setSystemTime() error { + const ( + _MSR_IA32_TSC = 0x00000010 + calibrateTries = 10 + ) + registers := modelControlRegisters{ + nmsrs: 1, + } + registers.entries[0] = modelControlRegister{ + index: _MSR_IA32_TSC, + } + target := uint64(^uint32(0)) + for done := 0; done < calibrateTries; { + start := uint64(time.Rdtsc()) + registers.entries[0].data = start + target + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_MSRS, + uintptr(unsafe.Pointer(®isters))); errno != 0 { + return fmt.Errorf("error setting system time: %v", errno) + } + // See if this is our new minimum call time. Note that this + // serves two functions: one, we make sure that we are + // accurately predicting the offset we need to set. Second, we + // don't want to do the final set on a slow call, which could + // produce a really bad result. So we only count attempts + // within +/- 6.25% of our minimum as an attempt. + end := uint64(time.Rdtsc()) + if end < start { + continue // Totally bogus. + } + half := (end - start) / 2 + if half < target { + target = half + } + if (half - target) < target/8 { + done++ + } + } + return nil +} + +// setSignalMask sets the vCPU signal mask. +// +// This must be called prior to running the vCPU. +func (c *vCPU) setSignalMask() error { + // The layout of this structure implies that it will not necessarily be + // the same layout chosen by the Go compiler. It gets fudged here. + var data struct { + length uint32 + mask1 uint32 + mask2 uint32 + _ uint32 + } + data.length = 8 // Fixed sigset size. + data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) + data.mask2 = ^uint32(bounceSignalMask >> 32) + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_SIGNAL_MASK, + uintptr(unsafe.Pointer(&data))); errno != 0 { + return fmt.Errorf("error setting signal mask: %v", errno) + } + return nil +} diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go new file mode 100644 index 000000000..1d3c6d2d6 --- /dev/null +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -0,0 +1,160 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build go1.12 +// +build !go1.14 + +// Check go:linkname function signatures when updating Go version. + +package kvm + +import ( + "fmt" + "sync/atomic" + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +//go:linkname entersyscall runtime.entersyscall +func entersyscall() + +//go:linkname exitsyscall runtime.exitsyscall +func exitsyscall() + +// mapRunData maps the vCPU run data. +func mapRunData(fd int) (*runData, error) { + r, _, errno := syscall.RawSyscall6( + syscall.SYS_MMAP, + 0, + uintptr(runDataSize), + syscall.PROT_READ|syscall.PROT_WRITE, + syscall.MAP_SHARED, + uintptr(fd), + 0) + if errno != 0 { + return nil, fmt.Errorf("error mapping runData: %v", errno) + } + return (*runData)(unsafe.Pointer(r)), nil +} + +// unmapRunData unmaps the vCPU run data. +func unmapRunData(r *runData) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_MUNMAP, + uintptr(unsafe.Pointer(r)), + uintptr(runDataSize), + 0); errno != 0 { + return fmt.Errorf("error unmapping runData: %v", errno) + } + return nil +} + +// setUserRegisters sets user registers in the vCPU. +func (c *vCPU) setUserRegisters(uregs *userRegs) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_REGS, + uintptr(unsafe.Pointer(uregs))); errno != 0 { + return fmt.Errorf("error setting user registers: %v", errno) + } + return nil +} + +// getUserRegisters reloads user registers in the vCPU. +// +// This is safe to call from a nosplit context. +// +//go:nosplit +func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_GET_REGS, + uintptr(unsafe.Pointer(uregs))); errno != 0 { + return errno + } + return 0 +} + +// setSystemRegisters sets system registers. +func (c *vCPU) setSystemRegisters(sregs *systemRegs) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_SREGS, + uintptr(unsafe.Pointer(sregs))); errno != 0 { + return fmt.Errorf("error setting system registers: %v", errno) + } + return nil +} + +// atomicAddressSpace is an atomic address space pointer. +type atomicAddressSpace struct { + pointer unsafe.Pointer +} + +// set sets the address space value. +// +//go:nosplit +func (a *atomicAddressSpace) set(as *addressSpace) { + atomic.StorePointer(&a.pointer, unsafe.Pointer(as)) +} + +// get gets the address space value. +// +// Note that this should be considered best-effort, and may have changed by the +// time this function returns. +// +//go:nosplit +func (a *atomicAddressSpace) get() *addressSpace { + return (*addressSpace)(atomic.LoadPointer(&a.pointer)) +} + +// notify notifies that the vCPU has transitioned modes. +// +// This may be called by a signal handler and therefore throws on error. +// +//go:nosplit +func (c *vCPU) notify() { + _, _, errno := syscall.RawSyscall6( + syscall.SYS_FUTEX, + uintptr(unsafe.Pointer(&c.state)), + linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG, + ^uintptr(0), // Number of waiters. + 0, 0, 0) + if errno != 0 { + throw("futex wake error") + } +} + +// waitUntilNot waits for the vCPU to transition modes. +// +// The state should have been previously set to vCPUWaiter after performing an +// appropriate action to cause a transition (e.g. interrupt injection). +// +// This panics on error. +func (c *vCPU) waitUntilNot(state uint32) { + _, _, errno := syscall.Syscall6( + syscall.SYS_FUTEX, + uintptr(unsafe.Pointer(&c.state)), + linux.FUTEX_WAIT|linux.FUTEX_PRIVATE_FLAG, + uintptr(state), + 0, 0, 0) + if errno != 0 && errno != syscall.EINTR && errno != syscall.EAGAIN { + panic("futex wait error") + } +} diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go new file mode 100644 index 000000000..450eb8201 --- /dev/null +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -0,0 +1,224 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "fmt" + "sort" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // reservedMemory is a chunk of physical memory reserved starting at + // physical address zero. There are some special pages in this region, + // so we just call the whole thing off. + // + // Other architectures may define this to be zero. + reservedMemory = 0x100000000 +) + +type region struct { + virtual uintptr + length uintptr +} + +type physicalRegion struct { + region + physical uintptr +} + +// physicalRegions contains a list of available physical regions. +// +// The physical value used in physicalRegions is a number indicating the +// physical offset, aligned appropriately and starting above reservedMemory. +var physicalRegions []physicalRegion + +// fillAddressSpace fills the host address space with PROT_NONE mappings until +// we have a host address space size that is less than or equal to the physical +// address space. This allows us to have an injective host virtual to guest +// physical mapping. +// +// The excluded regions are returned. +func fillAddressSpace() (excludedRegions []region) { + // We can cut vSize in half, because the kernel will be using the top + // half and we ignore it while constructing mappings. It's as if we've + // already excluded half the possible addresses. + vSize := uintptr(1) << ring0.VirtualAddressBits() + vSize = vSize >> 1 + + // We exclude reservedMemory below from our physical memory size, so it + // needs to be dropped here as well. Otherwise, we could end up with + // physical addresses that are beyond what is mapped. + pSize := uintptr(1) << ring0.PhysicalAddressBits() + pSize -= reservedMemory + + // Add specifically excluded regions; see excludeVirtualRegion. + applyVirtualRegions(func(vr virtualRegion) { + if excludeVirtualRegion(vr) { + excludedRegions = append(excludedRegions, vr.region) + vSize -= vr.length + log.Infof("excluded: virtual [%x,%x)", vr.virtual, vr.virtual+vr.length) + } + }) + + // Do we need any more work? + if vSize < pSize { + return excludedRegions + } + + // Calculate the required space and fill it. + // + // Note carefully that we add faultBlockSize to required up front, and + // on each iteration of the loop below (i.e. each new physical region + // we define), we add faultBlockSize again. This is done because the + // computation of physical regions will ensure proper alignments with + // faultBlockSize, potentially causing up to faultBlockSize bytes in + // internal fragmentation for each physical region. So we need to + // account for this properly during allocation. + requiredAddr, ok := usermem.Addr(vSize - pSize + faultBlockSize).RoundUp() + if !ok { + panic(fmt.Sprintf( + "overflow for vSize (%x) - pSize (%x) + faultBlockSize (%x)", + vSize, pSize, faultBlockSize)) + } + required := uintptr(requiredAddr) + current := required // Attempted mmap size. + for filled := uintptr(0); filled < required && current > 0; { + addr, _, errno := syscall.RawSyscall6( + syscall.SYS_MMAP, + 0, // Suggested address. + current, + syscall.PROT_NONE, + syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE|syscall.MAP_NORESERVE, + 0, 0) + if errno != 0 { + // Attempt half the size; overflow not possible. + currentAddr, _ := usermem.Addr(current >> 1).RoundUp() + current = uintptr(currentAddr) + continue + } + // We filled a block. + filled += current + excludedRegions = append(excludedRegions, region{ + virtual: addr, + length: current, + }) + // See comment above. + if filled != required { + required += faultBlockSize + } + } + if current == 0 { + panic("filling address space failed") + } + sort.Slice(excludedRegions, func(i, j int) bool { + return excludedRegions[i].virtual < excludedRegions[j].virtual + }) + for _, r := range excludedRegions { + log.Infof("region: virtual [%x,%x)", r.virtual, r.virtual+r.length) + } + return excludedRegions +} + +// computePhysicalRegions computes physical regions. +func computePhysicalRegions(excludedRegions []region) (physicalRegions []physicalRegion) { + physical := uintptr(reservedMemory) + addValidRegion := func(virtual, length uintptr) { + if length == 0 { + return + } + if virtual == 0 { + virtual += usermem.PageSize + length -= usermem.PageSize + } + if end := virtual + length; end > ring0.MaximumUserAddress { + length -= (end - ring0.MaximumUserAddress) + } + if length == 0 { + return + } + // Round physical up to the same alignment as the virtual + // address (with respect to faultBlockSize). + if offset := virtual &^ faultBlockMask; physical&^faultBlockMask != offset { + if newPhysical := (physical & faultBlockMask) + offset; newPhysical > physical { + physical = newPhysical // Round up by only a little bit. + } else { + physical = ((physical + faultBlockSize) & faultBlockMask) + offset + } + } + physicalRegions = append(physicalRegions, physicalRegion{ + region: region{ + virtual: virtual, + length: length, + }, + physical: physical, + }) + physical += length + } + lastExcludedEnd := uintptr(0) + for _, r := range excludedRegions { + addValidRegion(lastExcludedEnd, r.virtual-lastExcludedEnd) + lastExcludedEnd = r.virtual + r.length + } + addValidRegion(lastExcludedEnd, ring0.MaximumUserAddress-lastExcludedEnd) + + // Dump our all physical regions. + for _, r := range physicalRegions { + log.Infof("physicalRegion: virtual [%x,%x) => physical [%x,%x)", + r.virtual, r.virtual+r.length, r.physical, r.physical+r.length) + } + return physicalRegions +} + +// physicalInit initializes physical address mappings. +func physicalInit() { + physicalRegions = computePhysicalRegions(fillAddressSpace()) +} + +// applyPhysicalRegions applies the given function on physical regions. +// +// Iteration continues as long as true is returned. The return value is the +// return from the last call to fn, or true if there are no entries. +// +// Precondition: physicalInit must have been called. +func applyPhysicalRegions(fn func(pr physicalRegion) bool) bool { + for _, pr := range physicalRegions { + if !fn(pr) { + return false + } + } + return true +} + +// translateToPhysical translates the given virtual address. +// +// Precondition: physicalInit must have been called. +// +//go:nosplit +func translateToPhysical(virtual uintptr) (physical uintptr, length uintptr, ok bool) { + for _, pr := range physicalRegions { + if pr.virtual <= virtual && virtual < pr.virtual+pr.length { + physical = pr.physical + (virtual - pr.virtual) + length = pr.length - (virtual - pr.virtual) + ok = true + return + } + } + return +} diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go new file mode 100644 index 000000000..28a1b4414 --- /dev/null +++ b/pkg/sentry/platform/kvm/virtual_map.go @@ -0,0 +1,113 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "bufio" + "fmt" + "io" + "os" + "regexp" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type virtualRegion struct { + region + accessType usermem.AccessType + shared bool + offset uintptr + filename string +} + +// mapsLine matches a single line from /proc/PID/maps. +var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2}:[0-9a-f]{2,} [0-9]+\\s+(.*)") + +// excludeRegion returns true if these regions should be excluded from the +// physical map. Virtual regions need to be excluded if get_user_pages will +// fail on those addresses, preventing KVM from satisfying EPT faults. +// +// This includes the VVAR page because the VVAR page may be mapped as I/O +// memory. And the VDSO page is knocked out because the VVAR page is not even +// recorded in /proc/self/maps on older kernels; knocking out the VDSO page +// prevents code in the VDSO from accessing the VVAR address. +// +// This is called by the physical map functions, not applyVirtualRegions. +func excludeVirtualRegion(r virtualRegion) bool { + return r.filename == "[vvar]" || r.filename == "[vdso]" +} + +// applyVirtualRegions parses the process maps file. +// +// Unlike mappedRegions, these are not consistent over time. +func applyVirtualRegions(fn func(vr virtualRegion)) error { + // Open /proc/self/maps. + f, err := os.Open("/proc/self/maps") + if err != nil { + return err + } + defer f.Close() + + // Parse all entries. + r := bufio.NewReader(f) + for { + b, err := r.ReadBytes('\n') + if b != nil && len(b) > 0 { + m := mapsLine.FindSubmatch(b) + if m == nil { + // This should not happen: kernel bug? + return fmt.Errorf("badly formed line: %v", string(b)) + } + start, err := strconv.ParseUint(string(m[1]), 16, 64) + if err != nil { + return fmt.Errorf("bad start address: %v", string(b)) + } + end, err := strconv.ParseUint(string(m[2]), 16, 64) + if err != nil { + return fmt.Errorf("bad end address: %v", string(b)) + } + read := m[3][0] == 'r' + write := m[3][1] == 'w' + execute := m[3][2] == 'x' + shared := m[3][3] == 's' + offset, err := strconv.ParseUint(string(m[4]), 16, 64) + if err != nil { + return fmt.Errorf("bad offset: %v", string(b)) + } + fn(virtualRegion{ + region: region{ + virtual: uintptr(start), + length: uintptr(end - start), + }, + accessType: usermem.AccessType{ + Read: read, + Write: write, + Execute: execute, + }, + shared: shared, + offset: uintptr(offset), + filename: string(m[5]), + }) + } + if err != nil && err == io.EOF { + break + } else if err != nil { + return err + } + } + + return nil +} |