Diffstat (limited to 'pkg/sentry/platform/kvm')
26 files changed, 3656 insertions, 0 deletions
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD new file mode 100644 index 000000000..d902e344a --- /dev/null +++ b/pkg/sentry/platform/kvm/BUILD @@ -0,0 +1,90 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +go_template_instance( + name = "host_map_set", + out = "host_map_set.go", + consts = { + "minDegree": "15", + }, + imports = { + "usermem": "gvisor.googlesource.com/gvisor/pkg/sentry/usermem", + }, + package = "kvm", + prefix = "hostMap", + template = "//pkg/segment:generic_set", + types = { + "Key": "usermem.Addr", + "Range": "usermem.AddrRange", + "Value": "uintptr", + "Functions": "hostMapSetFunctions", + }, +) + +go_library( + name = "kvm", + srcs = [ + "address_space.go", + "bluepill.go", + "bluepill_amd64.go", + "bluepill_amd64.s", + "bluepill_amd64_unsafe.go", + "bluepill_fault.go", + "bluepill_unsafe.go", + "context.go", + "host_map.go", + "host_map_set.go", + "kvm.go", + "kvm_amd64.go", + "kvm_amd64_unsafe.go", + "kvm_const.go", + "machine.go", + "machine_amd64.go", + "machine_amd64_unsafe.go", + "machine_unsafe.go", + "physical_map.go", + "virtual_map.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/cpuid", + "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/platform", + "//pkg/sentry/platform/filemem", + "//pkg/sentry/platform/interrupt", + "//pkg/sentry/platform/procid", + "//pkg/sentry/platform/ring0", + "//pkg/sentry/platform/ring0/pagetables", + "//pkg/sentry/platform/safecopy", + "//pkg/sentry/time", + "//pkg/sentry/usermem", + "//pkg/tmutex", + ], +) + +go_test( + name = "kvm_test", + size = "small", + srcs = [ + "kvm_test.go", + "virtual_map_test.go", + ], + embed = [":kvm"], + tags = [ + "nogotsan", + "requires-kvm", + ], + deps = [ + "//pkg/sentry/arch", + "//pkg/sentry/platform", + "//pkg/sentry/platform/kvm/testutil", + "//pkg/sentry/platform/ring0", + "//pkg/sentry/platform/ring0/pagetables", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go new file mode 100644 index 000000000..791f038b0 --- /dev/null +++ b/pkg/sentry/platform/kvm/address_space.go @@ -0,0 +1,207 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "reflect" + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// addressSpace is a wrapper for PageTables. +type addressSpace struct { + platform.NoAddressSpaceIO + + // filemem is the memory instance. + filemem *filemem.FileMem + + // machine is the underlying machine. 
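+	//
+	// It is used by mapHost to ensure that physical mappings exist before
+	// guest page table entries are installed.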
+ machine *machine + + // pageTables are for this particular address space. + pageTables *pagetables.PageTables + + // dirtySet is the set of dirty vCPUs. + // + // The key is the vCPU, the value is a shared uint32 pointer that + // indicates whether or not the context is clean. A zero here indicates + // that the context should be cleaned prior to re-entry. + dirtySet sync.Map + + // files contains files mapped in the host address space. + files hostMap +} + +// Invalidate interrupts all dirty contexts. +func (as *addressSpace) Invalidate() { + as.dirtySet.Range(func(key, value interface{}) bool { + c := key.(*vCPU) + v := value.(*uint32) + atomic.StoreUint32(v, 0) // Invalidation required. + c.Bounce() // Force a kernel transition. + return true // Keep iterating. + }) +} + +// Touch adds the given vCPU to the dirty list. +func (as *addressSpace) Touch(c *vCPU) *uint32 { + value, ok := as.dirtySet.Load(c) + if !ok { + value, _ = as.dirtySet.LoadOrStore(c, new(uint32)) + } + return value.(*uint32) +} + +func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) { + for m.length > 0 { + physical, length, ok := TranslateToPhysical(m.addr) + if !ok { + panic("unable to translate segment") + } + if length > m.length { + length = m.length + } + + // Ensure that this map has physical mappings. If the page does + // not have physical mappings, the KVM module may inject + // spurious exceptions when emulation fails (i.e. it tries to + // emulate because the RIP is pointed at those pages). + as.machine.mapPhysical(physical, length) + + // Install the page table mappings. Note that the ordering is + // important; if the pagetable mappings were installed before + // ensuring the physical pages were available, then some other + // thread could theoretically access them. + prev := as.pageTables.Map(addr, length, true /* user */, at, physical) + inv = inv || prev + m.addr += length + m.length -= length + addr += usermem.Addr(length) + } + + return inv +} + +func (as *addressSpace) mapHostFile(addr usermem.Addr, fd int, fr platform.FileRange, at usermem.AccessType) error { + // Create custom host mappings. + ms, err := as.files.CreateMappings(usermem.AddrRange{ + Start: addr, + End: addr + usermem.Addr(fr.End-fr.Start), + }, at, fd, fr.Start) + if err != nil { + return err + } + + inv := false + for _, m := range ms { + // The host mapped slices are guaranteed to be aligned. + inv = inv || as.mapHost(addr, m, at) + addr += usermem.Addr(m.length) + } + if inv { + as.Invalidate() + } + + return nil +} + +func (as *addressSpace) mapFilemem(addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + // TODO: Lock order at the platform level is not sufficiently + // well-defined to guarantee that the caller (FileMem.MapInto) is not + // holding any locks that FileMem.MapInternal may take. + + // Retrieve mappings for the underlying filemem. Note that the + // permissions here are largely irrelevant, since it corresponds to + // physical memory for the guest. We enforce the given access type + // below, in the guest page tables. + bs, err := as.filemem.MapInternal(fr, usermem.AccessType{ + Read: true, + Write: true, + }) + if err != nil { + return err + } + + // Save the original range for invalidation. + orig := usermem.AddrRange{ + Start: addr, + End: addr + usermem.Addr(fr.End-fr.Start), + } + + inv := false + for !bs.IsEmpty() { + b := bs.Head() + bs = bs.Tail() + // Since fr was page-aligned, b should also be page-aligned. 
We do the + // lookup in our host page tables for this translation. + s := b.ToSlice() + if precommit { + for i := 0; i < len(s); i += usermem.PageSize { + _ = s[i] // Touch to commit. + } + } + inv = inv || as.mapHost(addr, hostMapEntry{ + addr: reflect.ValueOf(&s[0]).Pointer(), + length: uintptr(len(s)), + }, at) + addr += usermem.Addr(len(s)) + } + if inv { + as.Invalidate() + as.files.DeleteMapping(orig) + } + + return nil +} + +// MapFile implements platform.AddressSpace.MapFile. +func (as *addressSpace) MapFile(addr usermem.Addr, fd int, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + // Create an appropriate mapping. If this is filemem, we don't create + // custom mappings for each in-application mapping. For files however, + // we create distinct mappings for each address space. Unfortunately, + // there's not a better way to manage this here. The file underlying + // this fd can change at any time, so we can't actually index the file + // and share between address space. Oh well. It's all refering to the + // same physical pages, hopefully we don't run out of address space. + if fd != int(as.filemem.File().Fd()) { + // N.B. precommit is ignored for host files. + return as.mapHostFile(addr, fd, fr, at) + } + + return as.mapFilemem(addr, fr, at, precommit) +} + +// Unmap unmaps the given range by calling pagetables.PageTables.Unmap. +func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) { + if prev := as.pageTables.Unmap(addr, uintptr(length)); prev { + as.Invalidate() + as.files.DeleteMapping(usermem.AddrRange{ + Start: addr, + End: addr + usermem.Addr(length), + }) + } +} + +// Release releases the page tables. +func (as *addressSpace) Release() error { + as.Unmap(0, ^uint64(0)) + as.pageTables.Release() + return nil +} diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go new file mode 100644 index 000000000..ecc33d7dd --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -0,0 +1,41 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "fmt" + "reflect" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy" +) + +// bluepill enters guest mode. +func bluepill(*vCPU) + +// sighandler is the signal entry point. +func sighandler() + +// savedHandler is a pointer to the previous handler. +// +// This is called by bluepillHandler. +var savedHandler uintptr + +func init() { + // Install the handler. + if err := safecopy.ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil { + panic(fmt.Sprintf("Unable to set handler for signal %d: %v", syscall.SIGSEGV, err)) + } +} diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go new file mode 100644 index 000000000..a2baefb7d --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_amd64.go @@ -0,0 +1,143 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" +) + +var ( + // bounceSignal is the signal used for bouncing KVM. + // + // We use SIGCHLD because it is not masked by the runtime, and + // it will be ignored properly by other parts of the kernel. + bounceSignal = syscall.SIGCHLD + + // bounceSignalMask has only bounceSignal set. + bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1)) + + // bounce is the interrupt vector used to return to the kernel. + bounce = uint32(ring0.VirtualizationException) +) + +// redpill on amd64 invokes a syscall with -1. +// +//go:nosplit +func redpill() { + syscall.RawSyscall(^uintptr(0), 0, 0, 0) +} + +// bluepillArchEnter is called during bluepillEnter. +// +//go:nosplit +func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) { + c = vCPUPtr(uintptr(context.Rax)) + regs := c.CPU.Registers() + regs.R8 = context.R8 + regs.R9 = context.R9 + regs.R10 = context.R10 + regs.R11 = context.R11 + regs.R12 = context.R12 + regs.R13 = context.R13 + regs.R14 = context.R14 + regs.R15 = context.R15 + regs.Rdi = context.Rdi + regs.Rsi = context.Rsi + regs.Rbp = context.Rbp + regs.Rbx = context.Rbx + regs.Rdx = context.Rdx + regs.Rax = context.Rax + regs.Rcx = context.Rcx + regs.Rsp = context.Rsp + regs.Rip = context.Rip + regs.Eflags = context.Eflags + regs.Eflags &^= uint64(ring0.KernelFlagsClear) + regs.Eflags |= ring0.KernelFlagsSet + regs.Cs = uint64(ring0.Kcode) + regs.Ds = uint64(ring0.Udata) + regs.Es = uint64(ring0.Udata) + regs.Fs = uint64(ring0.Udata) + regs.Ss = uint64(ring0.Kdata) + + // ring0 uses GS exclusively, so we use GS_base to store the location + // of the floating point address. + // + // The address will be restored directly after running the VCPU, and + // will be saved again prior to halting. We rely on the fact that the + // SaveFloatingPointer/LoadFloatingPoint functions use the most + // efficient mechanism available (including compression) so the state + // size is guaranteed to be less than what's pointed to here. + regs.Gs_base = uint64(context.Fpstate) + return +} + +// bluepillSyscall handles kernel syscalls. +// +//go:nosplit +func bluepillSyscall() { + regs := ring0.Current().Registers() + if regs.Rax != ^uint64(0) { + regs.Rip -= 2 // Rewind. + } + ring0.SaveFloatingPoint(bytePtr(uintptr(regs.Gs_base))) + ring0.Halt() + ring0.LoadFloatingPoint(bytePtr(uintptr(regs.Gs_base))) +} + +// bluepillException handles kernel exceptions. +// +//go:nosplit +func bluepillException(vector ring0.Vector) { + regs := ring0.Current().Registers() + if vector == ring0.Vector(bounce) { + // These should not interrupt kernel execution; point the Rip + // to zero to ensure that we get a reasonable panic when we + // attempt to return. 
+ regs.Rip = 0 + } + ring0.SaveFloatingPoint(bytePtr(uintptr(regs.Gs_base))) + ring0.Halt() + ring0.LoadFloatingPoint(bytePtr(uintptr(regs.Gs_base))) +} + +// bluepillArchExit is called during bluepillEnter. +// +//go:nosplit +func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { + regs := c.CPU.Registers() + context.R8 = regs.R8 + context.R9 = regs.R9 + context.R10 = regs.R10 + context.R11 = regs.R11 + context.R12 = regs.R12 + context.R13 = regs.R13 + context.R14 = regs.R14 + context.R15 = regs.R15 + context.Rdi = regs.Rdi + context.Rsi = regs.Rsi + context.Rbp = regs.Rbp + context.Rbx = regs.Rbx + context.Rdx = regs.Rdx + context.Rax = regs.Rax + context.Rcx = regs.Rcx + context.Rsp = regs.Rsp + context.Rip = regs.Rip + context.Eflags = regs.Eflags +} diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s new file mode 100644 index 000000000..0881bd5f5 --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_amd64.s @@ -0,0 +1,87 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// VCPU_CPU is the location of the CPU in the vCPU struct. +// +// This is guaranteed to be zero. +#define VCPU_CPU 0x0 + +// CPU_SELF is the self reference in ring0's percpu. +// +// This is guaranteed to be zero. +#define CPU_SELF 0x0 + +// Context offsets. +// +// Only limited use of the context is done in the assembly stub below, most is +// done in the Go handlers. However, the RIP must be examined. +#define CONTEXT_RAX 0x90 +#define CONTEXT_RIP 0xa8 +#define CONTEXT_FP 0xe0 + +// CLI is the literal byte for the disable interrupts instruction. +// +// This is checked as the source of the fault. +#define CLI $0xfa + +// See bluepill.go. +TEXT ·bluepill(SB),NOSPLIT,$0 +begin: + MOVQ vcpu+0(FP), AX + LEAQ VCPU_CPU(AX), BX + BYTE CLI; +check_vcpu: + MOVQ CPU_SELF(GS), CX + CMPQ BX, CX + JE right_vCPU +wrong_vcpu: + CALL ·redpill(SB) + JMP begin +right_vCPU: + RET + +// sighandler: see bluepill.go for documentation. +// +// The arguments are the following: +// +// DI - The signal number. +// SI - Pointer to siginfo_t structure. +// DX - Pointer to ucontext structure. +// +TEXT ·sighandler(SB),NOSPLIT,$0 + // Check if the signal is from the kernel. + MOVQ $0x80, CX + CMPL CX, 0x8(SI) + JNE fallback + + // Check if RIP is disable interrupts. + MOVQ CONTEXT_RIP(DX), CX + CMPQ CX, $0x0 + JE fallback + CMPB 0(CX), CLI + JNE fallback + + // Call the bluepillHandler. + PUSHQ DX // First argument (context). + CALL ·bluepillHandler(SB) // Call the handler. + POPQ DX // Discard the argument. + RET + +fallback: + // Jump to the previous signal handler. + XORQ CX, CX + MOVQ ·savedHandler(SB), AX + JMP AX diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go new file mode 100644 index 000000000..61ca61dcb --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go @@ -0,0 +1,28 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" +) + +// bluepillArchContext returns the arch-specific context. +func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 { + return &((*arch.UContext64)(context).MContext) +} diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go new file mode 100644 index 000000000..7c8c7bc37 --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -0,0 +1,127 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // faultBlockSize is the size used for servicing memory faults. + // + // This should be large enough to avoid frequent faults and avoid using + // all available KVM slots (~512), but small enough that KVM does not + // complain about slot sizes (~4GB). See handleBluepillFault for how + // this block is used. + faultBlockSize = 2 << 30 + + // faultBlockMask is the mask for the fault blocks. + // + // This must be typed to avoid overflow complaints (ugh). + faultBlockMask = ^uintptr(faultBlockSize - 1) +) + +// yield yields the CPU. +// +//go:nosplit +func yield() { + syscall.RawSyscall(syscall.SYS_SCHED_YIELD, 0, 0, 0) +} + +// calculateBluepillFault calculates the fault address range. +// +//go:nosplit +func calculateBluepillFault(m *machine, physical uintptr) (virtualStart, physicalStart, length uintptr, ok bool) { + alignedPhysical := physical &^ uintptr(usermem.PageSize-1) + for _, pr := range physicalRegions { + end := pr.physical + pr.length + if physical < pr.physical || physical >= end { + continue + } + + // Adjust the block to match our size. + physicalStart = alignedPhysical & faultBlockMask + if physicalStart < pr.physical { + // Bound the starting point to the start of the region. + physicalStart = pr.physical + } + virtualStart = pr.virtual + (physicalStart - pr.physical) + physicalEnd := physicalStart + faultBlockSize + if physicalEnd > end { + physicalEnd = end + } + length = physicalEnd - physicalStart + return virtualStart, physicalStart, length, true + } + + return 0, 0, 0, false +} + +// handleBluepillFault handles a physical fault. +// +// The corresponding virtual address is returned. This may throw on error. 
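+// The mapping is installed one faultBlockSize-aligned block at a time
+// (bounded by the enclosing physical region); see calculateBluepillFault.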
+// +//go:nosplit +func handleBluepillFault(m *machine, physical uintptr) (uintptr, bool) { + // Paging fault: we need to map the underlying physical pages for this + // fault. This all has to be done in this function because we're in a + // signal handler context. (We can't call any functions that might + // split the stack.) + virtualStart, physicalStart, length, ok := calculateBluepillFault(m, physical) + if !ok { + return 0, false + } + + // Set the KVM slot. + // + // First, we need to acquire the exclusive right to set a slot. See + // machine.nextSlot for information about the protocol. + slot := atomic.SwapUint32(&m.nextSlot, ^uint32(0)) + for slot == ^uint32(0) { + yield() // Race with another call. + slot = atomic.SwapUint32(&m.nextSlot, ^uint32(0)) + } + errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart) + if errno == 0 { + // Successfully added region; we can increment nextSlot and + // allow another set to proceed here. + atomic.StoreUint32(&m.nextSlot, slot+1) + return virtualStart + (physical - physicalStart), true + } + + // Release our slot (still available). + atomic.StoreUint32(&m.nextSlot, slot) + + switch errno { + case syscall.EEXIST: + // The region already exists. It's possible that we raced with + // another vCPU here. We just revert nextSlot and return true, + // because this must have been satisfied by some other vCPU. + return virtualStart + (physical - physicalStart), true + case syscall.EINVAL: + throw("set memory region failed; out of slots") + case syscall.ENOMEM: + throw("set memory region failed: out of memory") + case syscall.EFAULT: + throw("set memory region failed: invalid physical range") + default: + throw("set memory region failed: unknown reason") + } + + panic("unreachable") +} diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go new file mode 100644 index 000000000..85703ff18 --- /dev/null +++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go @@ -0,0 +1,175 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "sync/atomic" + "syscall" + "unsafe" +) + +//go:linkname throw runtime.throw +func throw(string) + +// vCPUPtr returns a CPU for the given address. +// +//go:nosplit +func vCPUPtr(addr uintptr) *vCPU { + return (*vCPU)(unsafe.Pointer(addr)) +} + +// bytePtr returns a bytePtr for the given address. +// +//go:nosplit +func bytePtr(addr uintptr) *byte { + return (*byte)(unsafe.Pointer(addr)) +} + +// bluepillHandler is called from the signal stub. +// +// The world may be stopped while this is executing, and it executes on the +// signal stack. It should only execute raw system calls and functions that are +// explicitly marked go:nosplit. +// +//go:nosplit +func bluepillHandler(context unsafe.Pointer) { + // Sanitize the registers; interrupts must always be disabled. + c := bluepillArchEnter(bluepillArchContext(context)) + + // Increment the number of switches. 
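+	// (The switch count is checked by tests such as TestWrongVCPU.)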
+ atomic.AddUint32(&c.switches, 1) + + // Store vCPUGuest. + // + // This is fine even if we're not in guest mode yet. In this signal + // handler, we'll already have all the relevant signals blocked, so an + // interrupt is only deliverable when we actually execute the KVM_RUN. + // + // The state will be returned to vCPUReady by Phase2. + if state := atomic.SwapUintptr(&c.state, vCPUGuest); state != vCPUReady { + throw("vCPU not in ready state") + } + + for { + _, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0) + if errno == syscall.EINTR { + // First, we process whatever pending signal + // interrupted KVM. Since we're in a signal handler + // currently, all signals are masked and the signal + // must have been delivered directly to this thread. + sig, _, errno := syscall.RawSyscall6( + syscall.SYS_RT_SIGTIMEDWAIT, + uintptr(unsafe.Pointer(&bounceSignalMask)), + 0, // siginfo. + 0, // timeout. + 8, // sigset size. + 0, 0) + if errno != 0 { + throw("error waiting for pending signal") + } + if sig != uintptr(bounceSignal) { + throw("unexpected signal") + } + + // Check whether the current state of the vCPU is ready + // for interrupt injection. Because we don't have a + // PIC, we can't inject an interrupt while they are + // masked. We need to request a window if it's not + // ready. + if c.runData.readyForInterruptInjection == 0 { + c.runData.requestInterruptWindow = 1 + continue // Rerun vCPU. + } else { + // Force injection below; the vCPU is ready. + c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN + } + } else if errno != 0 { + throw("run failed") + } + + switch c.runData.exitReason { + case _KVM_EXIT_EXCEPTION: + throw("exception") + case _KVM_EXIT_IO: + throw("I/O") + case _KVM_EXIT_INTERNAL_ERROR: + throw("internal error") + case _KVM_EXIT_HYPERCALL: + throw("hypercall") + case _KVM_EXIT_DEBUG: + throw("debug") + case _KVM_EXIT_HLT: + // Copy out registers. + bluepillArchExit(c, bluepillArchContext(context)) + + // Notify any waiters. + switch state := atomic.SwapUintptr(&c.state, vCPUReady); state { + case vCPUGuest: + case vCPUWaiter: + c.notify() // Safe from handler. + default: + throw("invalid state") + } + return + case _KVM_EXIT_MMIO: + // Increment the fault count. + atomic.AddUint32(&c.faults, 1) + + // For MMIO, the physical address is the first data item. + virtual, ok := handleBluepillFault(c.machine, uintptr(c.runData.data[0])) + if !ok { + throw("physical address not valid") + } + + // We now need to fill in the data appropriately. KVM + // expects us to provide the result of the given MMIO + // operation in the runData struct. This is safe + // because, if a fault occurs here, the same fault + // would have occurred in guest mode. The kernel should + // not create invalid page table mappings. + data := (*[8]byte)(unsafe.Pointer(&c.runData.data[1])) + length := (uintptr)((uint32)(c.runData.data[2])) + write := (uint8)((c.runData.data[2] >> 32 & 0xff)) != 0 + for i := uintptr(0); i < length; i++ { + b := bytePtr(uintptr(virtual) + i) + if write { + // Write to the given address. + *b = data[i] + } else { + // Read from the given address. + data[i] = *b + } + } + case _KVM_EXIT_IRQ_WINDOW_OPEN: + // Interrupt: we must have requested an interrupt + // window; set the interrupt line. + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_INTERRUPT, + uintptr(unsafe.Pointer(&bounce))); errno != 0 { + throw("interrupt injection failed") + } + // Clear previous injection request. 
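+			// (Otherwise KVM would keep exiting with _KVM_EXIT_IRQ_WINDOW_OPEN.)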
+ c.runData.requestInterruptWindow = 0 + case _KVM_EXIT_SHUTDOWN: + throw("shutdown") + case _KVM_EXIT_FAIL_ENTRY: + throw("entry failed") + default: + throw("unknown failure") + } + } +} diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go new file mode 100644 index 000000000..fd04a2c47 --- /dev/null +++ b/pkg/sentry/platform/kvm/context.go @@ -0,0 +1,81 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/interrupt" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// context is an implementation of the platform context. +// +// This is a thin wrapper around the machine. +type context struct { + // machine is the parent machine, and is immutable. + machine *machine + + // interrupt is the interrupt context. + interrupt interrupt.Forwarder +} + +// Switch runs the provided context in the given address space. +func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) { + // Extract data. + localAS := as.(*addressSpace) + regs := &ac.StateData().Regs + fp := (*byte)(ac.FloatingPointData()) + + // Grab a vCPU. + cpu, err := c.machine.Get() + if err != nil { + return nil, usermem.NoAccess, err + } + + // Enable interrupts (i.e. calls to vCPU.Notify). + if !c.interrupt.Enable(cpu) { + c.machine.Put(cpu) // Already preempted. + return nil, usermem.NoAccess, platform.ErrContextInterrupt + } + + // Mark the address space as dirty. + flags := ring0.Flags(0) + dirty := localAS.Touch(cpu) + if v := atomic.SwapUint32(dirty, 1); v == 0 { + flags |= ring0.FlagFlush + } + if ac.FullRestore() { + flags |= ring0.FlagFull + } + + // Take the blue pill. + si, at, err := cpu.SwitchToUser(regs, fp, localAS.pageTables, flags) + + // Release resources. + c.machine.Put(cpu) + + // All done. + c.interrupt.Disable() + return si, at, err +} + +// Interrupt interrupts the running context. +func (c *context) Interrupt() { + c.interrupt.NotifyInterrupt() +} diff --git a/pkg/sentry/platform/kvm/host_map.go b/pkg/sentry/platform/kvm/host_map.go new file mode 100644 index 000000000..357f8c92e --- /dev/null +++ b/pkg/sentry/platform/kvm/host_map.go @@ -0,0 +1,168 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "fmt" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type hostMap struct { + // mu protects below. + mu sync.RWMutex + + // set contains host mappings. + set hostMapSet +} + +type hostMapEntry struct { + addr uintptr + length uintptr +} + +func (hm *hostMap) forEachEntry(r usermem.AddrRange, fn func(offset uint64, m hostMapEntry)) { + for seg := hm.set.FindSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { + length := uintptr(seg.Range().Length()) + segOffset := uint64(0) // Adjusted below. + if seg.End() > r.End { + length -= uintptr(seg.End() - r.End) + } + if seg.Start() < r.Start { + length -= uintptr(r.Start - seg.Start()) + } else { + segOffset = uint64(seg.Start() - r.Start) + } + fn(segOffset, hostMapEntry{ + addr: seg.Value(), + length: length, + }) + } +} + +func (hm *hostMap) createMappings(r usermem.AddrRange, at usermem.AccessType, fd int, offset uint64) (ms []hostMapEntry, err error) { + // Replace any existing mappings. + hm.forEachEntry(r, func(segOffset uint64, m hostMapEntry) { + _, _, errno := syscall.RawSyscall6( + syscall.SYS_MMAP, + m.addr, + m.length, + uintptr(at.Prot()), + syscall.MAP_FIXED|syscall.MAP_SHARED, + uintptr(fd), + uintptr(offset+segOffset)) + if errno != 0 && err == nil { + err = errno + } + }) + if err != nil { + return nil, err + } + + // Add in necessary new mappings. + for gap := hm.set.FindGap(r.Start); gap.Ok() && gap.Start() < r.End; { + length := uintptr(gap.Range().Length()) + gapOffset := uint64(0) // Adjusted below. + if gap.End() > r.End { + length -= uintptr(gap.End() - r.End) + } + if gap.Start() < r.Start { + length -= uintptr(r.Start - gap.Start()) + } else { + gapOffset = uint64(gap.Start() - r.Start) + } + + // Map the host file memory. + hostAddr, _, errno := syscall.RawSyscall6( + syscall.SYS_MMAP, + 0, + length, + uintptr(at.Prot()), + syscall.MAP_SHARED, + uintptr(fd), + uintptr(offset+gapOffset)) + if errno != 0 { + return nil, errno + } + + // Insert into the host set and move to the next gap. + gap = hm.set.Insert(gap, gap.Range().Intersect(r), hostAddr).NextGap() + } + + // Collect all slices. + hm.forEachEntry(r, func(_ uint64, m hostMapEntry) { + ms = append(ms, m) + }) + + return ms, nil +} + +// CreateMappings creates a new set of host mapping entries. +func (hm *hostMap) CreateMappings(r usermem.AddrRange, at usermem.AccessType, fd int, offset uint64) (ms []hostMapEntry, err error) { + hm.mu.Lock() + ms, err = hm.createMappings(r, at, fd, offset) + hm.mu.Unlock() + return +} + +func (hm *hostMap) deleteMapping(r usermem.AddrRange) { + // Remove all the existing mappings. + hm.forEachEntry(r, func(_ uint64, m hostMapEntry) { + _, _, errno := syscall.RawSyscall( + syscall.SYS_MUNMAP, + m.addr, + m.length, + 0) + if errno != 0 { + // Should never happen. + panic(fmt.Sprintf("unmap error: %v", errno)) + } + }) + + // Knock the range out. + hm.set.RemoveRange(r) +} + +// DeleteMapping deletes the given range. +func (hm *hostMap) DeleteMapping(r usermem.AddrRange) { + hm.mu.Lock() + hm.deleteMapping(r) + hm.mu.Unlock() +} + +// hostMapSetFunctions is used in the implementation of mapSet. 
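+//
+// The hostMapSet container itself is generated from //pkg/segment:generic_set
+// (see the host_map_set rule in BUILD); these callbacks allow contiguous host
+// mappings to be merged and split along with their address ranges.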
+type hostMapSetFunctions struct{} + +func (hostMapSetFunctions) MinKey() usermem.Addr { return 0 } +func (hostMapSetFunctions) MaxKey() usermem.Addr { return ^usermem.Addr(0) } +func (hostMapSetFunctions) ClearValue(val *uintptr) { *val = 0 } + +func (hostMapSetFunctions) Merge(r1 usermem.AddrRange, addr1 uintptr, r2 usermem.AddrRange, addr2 uintptr) (uintptr, bool) { + if addr1+uintptr(r1.Length()) != addr2 { + return 0, false + } + + // Since the two regions are contiguous in both the key space and the + // value space, we can just store a single segment with the first host + // virtual address; the logic above operates based on the size of the + // segments. + return addr1, true +} + +func (hostMapSetFunctions) Split(r usermem.AddrRange, hostAddr uintptr, split usermem.Addr) (uintptr, uintptr) { + return hostAddr, hostAddr + uintptr(split-r.Start) +} diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go new file mode 100644 index 000000000..31928c9f0 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm.go @@ -0,0 +1,149 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kvm provides a kvm-based implementation of the platform interface. +package kvm + +import ( + "fmt" + "runtime" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/cpuid" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// KVM represents a lightweight VM context. +type KVM struct { + platform.NoCPUPreemptionDetection + + // filemem is our memory source. + *filemem.FileMem + + // machine is the backing VM. + machine *machine +} + +var ( + globalOnce sync.Once + globalErr error +) + +// New returns a new KVM-based implementation of the platform interface. +func New() (*KVM, error) { + // Allocate physical memory for the vCPUs. + fm, err := filemem.New("kvm-memory") + if err != nil { + return nil, err + } + + // Try opening KVM. + fd, err := syscall.Open("/dev/kvm", syscall.O_RDWR, 0) + if err != nil { + return nil, fmt.Errorf("opening /dev/kvm: %v", err) + } + defer syscall.Close(fd) + + // Ensure global initialization is done. + globalOnce.Do(func() { + physicalInit() + globalErr = updateSystemValues(fd) + ring0.Init(cpuid.HostFeatureSet()) + }) + if globalErr != nil { + return nil, err + } + + // Create a new VM fd. + vm, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_CREATE_VM, 0) + if errno != 0 { + return nil, fmt.Errorf("creating VM: %v", errno) + } + + // Create a VM context. + machine, err := newMachine(int(vm), runtime.NumCPU()) + if err != nil { + return nil, err + } + + // All set. + return &KVM{ + FileMem: fm, + machine: machine, + }, nil +} + +// SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO. 
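+//
+// The KVM platform accesses application memory only through established
+// mappings (note the embedded platform.NoAddressSpaceIO in addressSpace), so
+// direct address space I/O is not supported.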
+func (*KVM) SupportsAddressSpaceIO() bool { + return false +} + +// CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace. +func (*KVM) CooperativelySchedulesAddressSpace() bool { + return false +} + +// MapUnit implements platform.Platform.MapUnit. +func (*KVM) MapUnit() uint64 { + // We greedily creates PTEs in MapFile, so extremely large mappings can + // be expensive. Not _that_ expensive since we allow super pages, but + // even though can get out of hand if you're creating multi-terabyte + // mappings. For this reason, we limit mappings to an arbitrary 16MB. + return 16 << 20 +} + +// MinUserAddress returns the lowest available address. +func (*KVM) MinUserAddress() usermem.Addr { + return usermem.PageSize +} + +// MaxUserAddress returns the first address that may not be used. +func (*KVM) MaxUserAddress() usermem.Addr { + return usermem.Addr(ring0.MaximumUserAddress) +} + +// NewAddressSpace returns a new pagetable root. +func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) { + // Allocate page tables and install system mappings. + pageTables := k.machine.kernel.PageTables.New() + applyPhysicalRegions(func(pr physicalRegion) bool { + // Map the kernel in the upper half. + kernelVirtual := usermem.Addr(ring0.KernelStartAddress | pr.virtual) + pageTables.Map(kernelVirtual, pr.length, false /* kernel */, usermem.AnyAccess, pr.physical) + return true // Keep iterating. + }) + + // Return the new address space. + return &addressSpace{ + filemem: k.FileMem, + machine: k.machine, + pageTables: pageTables, + }, nil, nil +} + +// NewContext returns an interruptible context. +func (k *KVM) NewContext() platform.Context { + return &context{ + machine: k.machine, + } +} + +// Memory returns the platform memory used to do allocations. +func (k *KVM) Memory() platform.Memory { + return k.FileMem +} diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go new file mode 100644 index 000000000..3d56ed895 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_amd64.go @@ -0,0 +1,213 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" +) + +// userMemoryRegion is a region of physical memory. +// +// This mirrors kvm_memory_region. +type userMemoryRegion struct { + slot uint32 + flags uint32 + guestPhysAddr uint64 + memorySize uint64 + userspaceAddr uint64 +} + +// userRegs represents KVM user registers. +// +// This mirrors kvm_regs. +type userRegs struct { + RAX uint64 + RBX uint64 + RCX uint64 + RDX uint64 + RSI uint64 + RDI uint64 + RSP uint64 + RBP uint64 + R8 uint64 + R9 uint64 + R10 uint64 + R11 uint64 + R12 uint64 + R13 uint64 + R14 uint64 + R15 uint64 + RIP uint64 + RFLAGS uint64 +} + +// systemRegs represents KVM system registers. +// +// This mirrors kvm_sregs. 
+type systemRegs struct { + CS segment + DS segment + ES segment + FS segment + GS segment + SS segment + TR segment + LDT segment + GDT descriptor + IDT descriptor + CR0 uint64 + CR2 uint64 + CR3 uint64 + CR4 uint64 + CR8 uint64 + EFER uint64 + apicBase uint64 + interruptBitmap [(_KVM_NR_INTERRUPTS + 63) / 64]uint64 +} + +// segment is the expanded form of a segment register. +// +// This mirrors kvm_segment. +type segment struct { + base uint64 + limit uint32 + selector uint16 + typ uint8 + present uint8 + DPL uint8 + DB uint8 + S uint8 + L uint8 + G uint8 + AVL uint8 + unusable uint8 + _ uint8 +} + +// Clear clears the segment and marks it unusable. +func (s *segment) Clear() { + *s = segment{unusable: 1} +} + +// selector is a segment selector. +type selector uint16 + +// tobool is a simple helper. +func tobool(x ring0.SegmentDescriptorFlags) uint8 { + if x != 0 { + return 1 + } + return 0 +} + +// Load loads the segment described by d into the segment s. +// +// The argument sel is recorded as the segment selector index. +func (s *segment) Load(d *ring0.SegmentDescriptor, sel ring0.Selector) { + flag := d.Flags() + if flag&ring0.SegmentDescriptorPresent == 0 { + s.Clear() + return + } + s.base = uint64(d.Base()) + s.limit = d.Limit() + s.typ = uint8((flag>>8)&0xF) | 1 + s.S = tobool(flag & ring0.SegmentDescriptorSystem) + s.DPL = uint8(d.DPL()) + s.present = tobool(flag & ring0.SegmentDescriptorPresent) + s.AVL = tobool(flag & ring0.SegmentDescriptorAVL) + s.L = tobool(flag & ring0.SegmentDescriptorLong) + s.DB = tobool(flag & ring0.SegmentDescriptorDB) + s.G = tobool(flag & ring0.SegmentDescriptorG) + if s.L != 0 { + s.limit = 0xffffffff + } + s.unusable = 0 + s.selector = uint16(sel) +} + +// descriptor describes a region of physical memory. +// +// It corresponds to the pseudo-descriptor used in the x86 LGDT and LIDT +// instructions, and mirrors kvm_dtable. +type descriptor struct { + base uint64 + limit uint16 + _ [3]uint16 +} + +// modelControlRegister is an MSR entry. +// +// This mirrors kvm_msr_entry. +type modelControlRegister struct { + index uint32 + _ uint32 + data uint64 +} + +// modelControlRegisers is a collection of MSRs. +// +// This mirrors kvm_msrs. +type modelControlRegisters struct { + nmsrs uint32 + _ uint32 + entries [16]modelControlRegister +} + +// runData is the run structure. This may be mapped for synchronous register +// access (although that doesn't appear to be supported by my kernel at least). +// +// This mirrors kvm_run. +type runData struct { + requestInterruptWindow uint8 + _ [7]uint8 + + exitReason uint32 + readyForInterruptInjection uint8 + ifFlag uint8 + _ [2]uint8 + + cr8 uint64 + apicBase uint64 + + // This is the union data for exits. Interpretation depends entirely on + // the exitReason above (see vCPU code for more information). + data [32]uint64 +} + +// cpuidEntry is a single CPUID entry. +// +// This mirrors kvm_cpuid_entry2. +type cpuidEntry struct { + function uint32 + index uint32 + flags uint32 + eax uint32 + ebx uint32 + ecx uint32 + edx uint32 + _ [3]uint32 +} + +// cpuidEntries is a collection of CPUID entries. +// +// This mirrors kvm_cpuid2. +type cpuidEntries struct { + nr uint32 + _ uint32 + entries [_KVM_NR_CPUID_ENTRIES]cpuidEntry +} diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go new file mode 100644 index 000000000..389412d87 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go @@ -0,0 +1,93 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "fmt" + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" +) + +var ( + runDataSize int + hasGuestPCID bool + hasGuestINVPCID bool + pagetablesOpts pagetables.Opts + cpuidSupported = cpuidEntries{nr: _KVM_NR_CPUID_ENTRIES} +) + +func updateSystemValues(fd int) error { + // Extract the mmap size. + sz, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_GET_VCPU_MMAP_SIZE, 0) + if errno != 0 { + return fmt.Errorf("getting VCPU mmap size: %v", errno) + } + + // Save the data. + runDataSize = int(sz) + + // Must do the dance to figure out the number of entries. + _, _, errno = syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(fd), + _KVM_GET_SUPPORTED_CPUID, + uintptr(unsafe.Pointer(&cpuidSupported))) + if errno != 0 && errno != syscall.ENOMEM { + // Some other error occurred. + return fmt.Errorf("getting supported CPUID: %v", errno) + } + + // The number should now be correct. + _, _, errno = syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(fd), + _KVM_GET_SUPPORTED_CPUID, + uintptr(unsafe.Pointer(&cpuidSupported))) + if errno != 0 { + // Didn't work with the right number. + return fmt.Errorf("getting supported CPUID (2nd attempt): %v", errno) + } + + // Calculate whether guestPCID is supported. + // + // FIXME: These should go through the much more pleasant + // cpuid package interfaces, once a way to accept raw kvm CPUID entries + // is plumbed (or some rough equivalent). + for i := 0; i < int(cpuidSupported.nr); i++ { + entry := cpuidSupported.entries[i] + if entry.function == 1 && entry.index == 0 && entry.ecx&(1<<17) != 0 { + hasGuestPCID = true // Found matching PCID in guest feature set. + } + if entry.function == 7 && entry.index == 0 && entry.ebx&(1<<10) != 0 { + hasGuestINVPCID = true // Found matching INVPCID in guest feature set. + } + } + + // A basic sanity check: ensure that we don't attempt to + // invpcid if guest PCIDs are not supported; it's not clear + // what the semantics of this would be (or why some CPU or + // hypervisor would export this particular combination). + hasGuestINVPCID = hasGuestPCID && hasGuestINVPCID + + // Set the pagetables to use PCID if it's available. + pagetablesOpts.EnablePCID = hasGuestPCID + + // Success. + return nil +} diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go new file mode 100644 index 000000000..0ec6a4a00 --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_const.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +// KVM ioctls. +// +// Only the ioctls we need in Go appear here; some additional ioctls are used +// within the assembly stubs (KVM_INTERRUPT, etc.). +const ( + _KVM_CREATE_VM = 0xae01 + _KVM_GET_VCPU_MMAP_SIZE = 0xae04 + _KVM_CREATE_VCPU = 0xae41 + _KVM_SET_TSS_ADDR = 0xae47 + _KVM_RUN = 0xae80 + _KVM_INTERRUPT = 0x4004ae86 + _KVM_SET_MSRS = 0x4008ae89 + _KVM_SET_USER_MEMORY_REGION = 0x4020ae46 + _KVM_SET_REGS = 0x4090ae82 + _KVM_SET_SREGS = 0x4138ae84 + _KVM_GET_SUPPORTED_CPUID = 0xc008ae05 + _KVM_SET_CPUID2 = 0x4008ae90 + _KVM_SET_SIGNAL_MASK = 0x4004ae8b +) + +// KVM exit reasons. +const ( + _KVM_EXIT_EXCEPTION = 0x1 + _KVM_EXIT_IO = 0x2 + _KVM_EXIT_HYPERCALL = 0x3 + _KVM_EXIT_DEBUG = 0x4 + _KVM_EXIT_HLT = 0x5 + _KVM_EXIT_MMIO = 0x6 + _KVM_EXIT_IRQ_WINDOW_OPEN = 0x7 + _KVM_EXIT_SHUTDOWN = 0x8 + _KVM_EXIT_FAIL_ENTRY = 0x9 + _KVM_EXIT_INTERNAL_ERROR = 0x11 +) + +// KVM limits. +const ( + _KVM_NR_VCPUS = 0x100 + _KVM_NR_INTERRUPTS = 0x100 + _KVM_NR_CPUID_ENTRIES = 0x100 +) diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go new file mode 100644 index 000000000..61cfdd8fd --- /dev/null +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -0,0 +1,415 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "math/rand" + "reflect" + "syscall" + "testing" + "time" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm/testutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +var dummyFPState = (*byte)(arch.NewFloatingPointData()) + +type testHarness interface { + Errorf(format string, args ...interface{}) + Fatalf(format string, args ...interface{}) +} + +func kvmTest(t testHarness, setup func(*KVM), fn func(*vCPU) bool) { + // Create the machine. + k, err := New() + if err != nil { + t.Fatalf("error creating KVM instance: %v", err) + } + defer k.machine.Destroy() + defer k.FileMem.Destroy() + + // Call additional setup. + if setup != nil { + setup(k) + } + + var c *vCPU // For recovery. + defer func() { + redpill() + if c != nil { + k.machine.Put(c) + } + }() + for { + c, err = k.machine.Get() + if err != nil { + t.Fatalf("error getting vCPU: %v", err) + } + if !fn(c) { + break + } + + // We put the vCPU here and clear the value so that the + // deferred recovery will not re-put it above. 
+ k.machine.Put(c) + c = nil + } +} + +func bluepillTest(t testHarness, fn func(*vCPU)) { + kvmTest(t, nil, func(c *vCPU) bool { + bluepill(c) + fn(c) + return false + }) +} + +func TestKernelSyscall(t *testing.T) { + bluepillTest(t, func(c *vCPU) { + redpill() // Leave guest mode. + if got := c.State(); got != vCPUReady { + t.Errorf("vCPU not in ready state: got %v", got) + } + }) +} + +func hostFault() { + defer func() { + recover() + }() + var foo *int + *foo = 0 +} + +func TestKernelFault(t *testing.T) { + hostFault() // Ensure recovery works. + bluepillTest(t, func(c *vCPU) { + hostFault() + if got := c.State(); got != vCPUReady { + t.Errorf("vCPU not in ready state: got %v", got) + } + }) +} + +func TestKernelFloatingPoint(t *testing.T) { + bluepillTest(t, func(c *vCPU) { + if !testutil.FloatingPointWorks() { + t.Errorf("floating point does not work, and it should!") + } + }) +} + +func applicationTest(t testHarness, useHostMappings bool, target func(), fn func(*vCPU, *syscall.PtraceRegs, *pagetables.PageTables) bool) { + // Initialize registers & page tables. + var ( + regs syscall.PtraceRegs + pt *pagetables.PageTables + ) + testutil.SetTestTarget(®s, target) + defer func() { + if pt != nil { + pt.Release() + } + }() + + kvmTest(t, func(k *KVM) { + // Create new page tables. + as, _, err := k.NewAddressSpace(nil /* invalidator */) + if err != nil { + t.Fatalf("can't create new address space: %v", err) + } + pt = as.(*addressSpace).pageTables + + if useHostMappings { + // Apply the physical mappings to these page tables. + // (This is normally dangerous, since they point to + // physical pages that may not exist. This shouldn't be + // done for regular user code, but is fine for test + // purposes.) + applyPhysicalRegions(func(pr physicalRegion) bool { + pt.Map(usermem.Addr(pr.virtual), pr.length, true /* user */, usermem.AnyAccess, pr.physical) + return true // Keep iterating. + }) + } + }, func(c *vCPU) bool { + // Invoke the function with the extra data. + return fn(c, ®s, pt) + }) +} + +func TestApplicationSyscall(t *testing.T) { + applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != nil { + t.Errorf("application syscall with full restore failed: %v", err) + } + return false + }) + applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + t.Errorf("application syscall with partial restore failed: %v", err) + } + return false + }) +} + +func TestApplicationFault(t *testing.T) { + applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + testutil.SetTouchTarget(regs, nil) // Cause fault. + if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) { + t.Errorf("application fault with full restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal) + } + return false + }) + applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + testutil.SetTouchTarget(regs, nil) // Cause fault. 
+ if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) { + t.Errorf("application fault with partial restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal) + } + return false + }) +} + +func TestRegistersSyscall(t *testing.T) { + applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + testutil.SetTestRegs(regs) // Fill values for all registers. + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + t.Errorf("application register check with partial restore got unexpected error: %v", err) + } + if err := testutil.CheckTestRegs(regs, false); err != nil { + t.Errorf("application register check with partial restore failed: %v", err) + } + return false + }) +} + +func TestRegistersFault(t *testing.T) { + applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + testutil.SetTestRegs(regs) // Fill values for all registers. + if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != platform.ErrContextSignal || si.Signo != int32(syscall.SIGSEGV) { + t.Errorf("application register check with full restore got unexpected error: %v", err) + } + if err := testutil.CheckTestRegs(regs, true); err != nil { + t.Errorf("application register check with full restore failed: %v", err) + } + return false + }) +} + +func TestSegments(t *testing.T) { + applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + testutil.SetTestSegments(regs) + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != nil { + t.Errorf("application segment check with full restore got unexpected error: %v", err) + } + if err := testutil.CheckTestSegments(regs); err != nil { + t.Errorf("application segment check with full restore failed: %v", err) + } + return false + }) +} + +func TestBounce(t *testing.T) { + applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + go func() { + time.Sleep(time.Millisecond) + c.Bounce() + }() + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextInterrupt { + t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) + } + return false + }) + applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + go func() { + time.Sleep(time.Millisecond) + c.Bounce() + }() + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != platform.ErrContextInterrupt { + t.Errorf("application full restore: got %v, wanted %v", err, platform.ErrContextInterrupt) + } + return false + }) +} + +func TestBounceStress(t *testing.T) { + applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + randomSleep := func() { + // O(hundreds of microseconds) is appropriate to ensure + // different overlaps and different schedules. + if n := rand.Intn(1000); n > 100 { + time.Sleep(time.Duration(n) * time.Microsecond) + } + } + for i := 0; i < 1000; i++ { + // Start an asynchronously executing goroutine that + // calls Bounce at pseudo-random point in time. + // This should wind up calling Bounce when the + // kernel is in various stages of the switch. 
+ go func() { + randomSleep() + c.Bounce() + }() + randomSleep() + // Execute the switch. + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextInterrupt { + t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt) + } + // Simulate work. + c.Unlock() + randomSleep() + c.Lock() + } + return false + }) +} + +func TestInvalidate(t *testing.T) { + var data uintptr // Used below. + applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + testutil.SetTouchTarget(regs, &data) // Read legitimate value. + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + t.Errorf("application partial restore: got %v, wanted nil", err) + } + // Unmap the page containing data & invalidate. + pt.Unmap(usermem.Addr(reflect.ValueOf(&data).Pointer() & ^uintptr(usermem.PageSize-1)), usermem.PageSize) + c.Invalidate() // Ensure invalidation. + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextSignal { + t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextSignal) + } + return false + }) +} + +// IsFault returns true iff the given signal represents a fault. +func IsFault(err error, si *arch.SignalInfo) bool { + return err == platform.ErrContextSignal && si.Signo == int32(syscall.SIGSEGV) +} + +func TestEmptyAddressSpace(t *testing.T) { + applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); !IsFault(err, si) { + t.Errorf("first fault with partial restore failed got %v", err) + t.Logf("registers: %#v", ®s) + } + return false + }) + applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); !IsFault(err, si) { + t.Errorf("first fault with full restore failed got %v", err) + t.Logf("registers: %#v", ®s) + } + return false + }) +} + +func TestWrongVCPU(t *testing.T) { + kvmTest(t, nil, func(c1 *vCPU) bool { + kvmTest(t, nil, func(c2 *vCPU) bool { + // Basic test, one then the other. + bluepill(c1) + bluepill(c2) + if c2.switches == 0 { + // Don't allow the test to proceed if this fails. + t.Fatalf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2) + } + + // Alternate vCPUs; we expect to need to trigger the + // wrong vCPU path on each switch. + for i := 0; i < 100; i++ { + bluepill(c1) + bluepill(c2) + } + if count := c1.switches; count < 90 { + t.Errorf("wrong vCPU#1 switches: vCPU1=%+v,vCPU2=%+v", c1, c2) + } + if count := c2.switches; count < 90 { + t.Errorf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2) + } + return false + }) + return false + }) + kvmTest(t, nil, func(c1 *vCPU) bool { + kvmTest(t, nil, func(c2 *vCPU) bool { + bluepill(c1) + bluepill(c2) + return false + }) + return false + }) +} + +func BenchmarkApplicationSyscall(b *testing.B) { + var ( + i int // Iteration includes machine.Get() / machine.Put(). + a int // Count for ErrContextInterrupt. + ) + applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + if err == platform.ErrContextInterrupt { + a++ + return true // Ignore. 
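TestInvalidate above rounds the address of data down to its page boundary before unmapping that page from the page tables. The masking trick is generic; a small standalone example (using os.Getpagesize() rather than usermem.PageSize):

package main

import (
	"fmt"
	"os"
	"reflect"
)

func main() {
	var data uintptr
	pageSize := uintptr(os.Getpagesize())

	// Round the address of data down to the start of its page, exactly as
	// TestInvalidate does before calling pt.Unmap.
	addr := reflect.ValueOf(&data).Pointer()
	pageBase := addr &^ (pageSize - 1)

	fmt.Printf("addr=%#x pageBase=%#x offset=%#x\n", addr, pageBase, addr-pageBase)
}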
+ } + b.Fatalf("benchmark failed: %v", err) + } + i++ + return i < b.N + }) + if a != 0 { + b.Logf("ErrContextInterrupt occurred %d times (in %d iterations).", a, a+i) + } +} + +func BenchmarkKernelSyscall(b *testing.B) { + // Note that the target passed here is irrelevant, we never execute SwitchToUser. + applicationTest(b, true, testutil.Getpid, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + // iteration does not include machine.Get() / machine.Put(). + for i := 0; i < b.N; i++ { + testutil.Getpid() + } + return false + }) +} + +func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) { + // see BenchmarkApplicationSyscall. + var ( + i int + a int + ) + applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool { + if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil { + if err == platform.ErrContextInterrupt { + a++ + return true // Ignore. + } + b.Fatalf("benchmark failed: %v", err) + } + // This will intentionally cause the world switch. By executing + // a host syscall here, we force the transition between guest + // and host mode. + testutil.Getpid() + i++ + return i < b.N + }) + if a != 0 { + b.Logf("EAGAIN occurred %d times (in %d iterations).", a, a+i) + } +} diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go new file mode 100644 index 000000000..a5be0cee3 --- /dev/null +++ b/pkg/sentry/platform/kvm/machine.go @@ -0,0 +1,412 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "fmt" + "runtime" + "sync" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/tmutex" +) + +// machine contains state associated with the VM as a whole. +type machine struct { + // fd is the vm fd. + fd int + + // nextSlot is the next slot for setMemoryRegion. + // + // This must be accessed atomically. If nextSlot is ^uint32(0), then + // slots are currently being updated, and the caller should retry. + nextSlot uint32 + + // kernel is the set of global structures. + kernel *ring0.Kernel + + // mappingCache is used for mapPhysical. + mappingCache sync.Map + + // mu protects vCPUs. + mu sync.Mutex + + // vCPUs are the machine vCPUs. + // + // This is eventually keyed by system TID, but is initially indexed by + // the negative vCPU id. This is merely an optimization, so while + // collisions here are not possible, it wouldn't matter anyways. + vCPUs map[uint64]*vCPU +} + +const ( + // vCPUReady is the lock value for an available vCPU. + // + // Legal transitions: vCPUGuest (bluepill). + vCPUReady uintptr = iota + + // vCPUGuest indicates the vCPU is in guest mode. + // + // Legal transition: vCPUReady (bluepill), vCPUWaiter (wait). 
+ vCPUGuest + + // vCPUWaiter indicates that the vCPU should be released. + // + // Legal transition: vCPUReady (bluepill). + vCPUWaiter +) + +// vCPU is a single KVM vCPU. +type vCPU struct { + // CPU is the kernel CPU data. + // + // This must be the first element of this structure, it is referenced + // by the bluepill code (see bluepill_amd64.s). + ring0.CPU + + // fd is the vCPU fd. + fd int + + // tid is the last set tid. + tid uint64 + + // switches is a count of world switches (informational only). + switches uint32 + + // faults is a count of world faults (informational only). + faults uint32 + + // state is the vCPU state; all are described above. + state uintptr + + // runData for this vCPU. + runData *runData + + // machine associated with this vCPU. + machine *machine + + // mu applies across get/put; it does not protect the above. + mu tmutex.Mutex +} + +// newMachine returns a new VM context. +func newMachine(vm int, vCPUs int) (*machine, error) { + // Create the machine. + m := &machine{ + fd: vm, + vCPUs: make(map[uint64]*vCPU), + } + if vCPUs > _KVM_NR_VCPUS { + // Hard cap at KVM's limit. + vCPUs = _KVM_NR_VCPUS + } + if n := 2 * runtime.NumCPU(); vCPUs > n { + // Cap at twice the number of physical cores. Otherwise we're + // just wasting memory and thrashing. (There may be scheduling + // issues when you've got > n active threads.) + vCPUs = n + } + m.kernel = ring0.New(ring0.KernelOpts{ + PageTables: pagetables.New(m, pagetablesOpts), + }) + + // Initialize architecture state. + if err := m.initArchState(vCPUs); err != nil { + m.Destroy() + return nil, err + } + + // Create all the vCPUs. + for id := 0; id < vCPUs; id++ { + // Create the vCPU. + fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(vm), _KVM_CREATE_VCPU, uintptr(id)) + if errno != 0 { + m.Destroy() + return nil, fmt.Errorf("error creating VCPU: %v", errno) + } + c := &vCPU{ + fd: int(fd), + machine: m, + } + c.mu.Init() + c.CPU.Init(m.kernel) + c.CPU.KernelSyscall = bluepillSyscall + c.CPU.KernelException = bluepillException + m.vCPUs[uint64(-id)] = c // See above. + + // Ensure the signal mask is correct. + if err := c.setSignalMask(); err != nil { + m.Destroy() + return nil, err + } + + // Initialize architecture state. + if err := c.initArchState(); err != nil { + m.Destroy() + return nil, err + } + + // Map the run data. + runData, err := mapRunData(int(fd)) + if err != nil { + m.Destroy() + return nil, err + } + c.runData = runData + } + + // Apply the physical mappings. Note that these mappings may point to + // guest physical addresses that are not actually available. These + // physical pages are mapped on demand, see kernel_unsafe.go. + applyPhysicalRegions(func(pr physicalRegion) bool { + // Map everything in the lower half. + m.kernel.PageTables.Map(usermem.Addr(pr.virtual), pr.length, false /* kernel */, usermem.AnyAccess, pr.physical) + // And keep everything in the upper half. + kernelAddr := usermem.Addr(ring0.KernelStartAddress | pr.virtual) + m.kernel.PageTables.Map(kernelAddr, pr.length, false /* kernel */, usermem.AnyAccess, pr.physical) + return true // Keep iterating. + }) + + // Ensure that the currently mapped virtual regions are actually + // available in the VM. Note that this doesn't guarantee no future + // faults, however it should guarantee that everything is available to + // ensure successful vCPU entry. + applyVirtualRegions(func(vr virtualRegion) { + if excludeVirtualRegion(vr) { + return // skip region. 
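The vCPU state constants above (vCPUReady, vCPUGuest, vCPUWaiter) describe a small state machine whose legal transitions are driven by atomic operations. A standalone sketch of those documented transitions, not the package's actual implementation (which also wakes waiters via futexes in machine_unsafe.go):

package main

import (
	"fmt"
	"sync/atomic"
)

// Illustrative copies of the states documented above.
const (
	vCPUReady uintptr = iota
	vCPUGuest
	vCPUWaiter
)

type fakeVCPU struct{ state uintptr }

// enterGuest models bluepill: only a ready vCPU may enter guest mode.
func (c *fakeVCPU) enterGuest() bool {
	return atomic.CompareAndSwapUintptr(&c.state, vCPUReady, vCPUGuest)
}

// requestRelease models wait: a guest-mode vCPU is marked as waited-on.
func (c *fakeVCPU) requestRelease() bool {
	return atomic.CompareAndSwapUintptr(&c.state, vCPUGuest, vCPUWaiter)
}

// exitGuest models the return to host mode from either in-guest state.
func (c *fakeVCPU) exitGuest() {
	atomic.StoreUintptr(&c.state, vCPUReady)
}

func main() {
	c := &fakeVCPU{}
	fmt.Println(c.enterGuest(), c.requestRelease())        // true true
	c.exitGuest()
	fmt.Println(atomic.LoadUintptr(&c.state) == vCPUReady) // true
}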
+ } + for virtual := vr.virtual; virtual < vr.virtual+vr.length; { + physical, length, ok := TranslateToPhysical(virtual) + if !ok { + // This must be an invalid region that was + // knocked out by creation of the physical map. + return + } + if virtual+length > vr.virtual+vr.length { + // Cap the length to the end of the area. + length = vr.virtual + vr.length - virtual + } + + // Ensure the physical range is mapped. + m.mapPhysical(physical, length) + virtual += length + } + }) + + // Ensure the machine is cleaned up properly. + runtime.SetFinalizer(m, (*machine).Destroy) + return m, nil +} + +// mapPhysical checks for the mapping of a physical range, and installs one if +// not available. This attempts to be efficient for calls in the hot path. +// +// This panics on error. +func (m *machine) mapPhysical(physical, length uintptr) { + for end := physical + length; physical < end; { + _, physicalStart, length, ok := calculateBluepillFault(m, physical) + if !ok { + // Should never happen. + panic("mapPhysical on unknown physical address") + } + + if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok { + // Not present in the cache; requires setting the slot. + if _, ok := handleBluepillFault(m, physical); !ok { + panic("handleBluepillFault failed") + } + } + + // Move to the next chunk. + physical = physicalStart + length + } +} + +// Destroy frees associated resources. +// +// Destroy should only be called once all active users of the machine are gone. +// The machine object should not be used after calling Destroy. +// +// Precondition: all vCPUs must be returned to the machine. +func (m *machine) Destroy() { + runtime.SetFinalizer(m, nil) + + // Destroy vCPUs. + for _, c := range m.vCPUs { + // Ensure the vCPU is not still running in guest mode. This is + // possible iff teardown has been done by other threads, and + // somehow a single thread has not executed any system calls. + c.wait() + + // Teardown the vCPU itself. + switch state := c.State(); state { + case vCPUReady: + // Note that the runData may not be mapped if an error + // occurs during the middle of initialization. + if c.runData != nil { + if err := unmapRunData(c.runData); err != nil { + panic(fmt.Sprintf("error unmapping rundata: %v", err)) + } + } + if err := syscall.Close(int(c.fd)); err != nil { + panic(fmt.Sprintf("error closing vCPU fd: %v", err)) + } + case vCPUGuest, vCPUWaiter: + // Should never happen; waited above. + panic("vCPU disposed in guest state") + default: + // Should never happen; not a valid state. + panic(fmt.Sprintf("vCPU in invalid state: %v", state)) + } + } + + // Release host mappings. + if m.kernel.PageTables != nil { + m.kernel.PageTables.Release() + } + + // vCPUs are gone: teardown machine state. + if err := syscall.Close(m.fd); err != nil { + panic(fmt.Sprintf("error closing VM fd: %v", err)) + } +} + +// Get gets an available vCPU. +func (m *machine) Get() (*vCPU, error) { + runtime.LockOSThread() + tid := procid.Current() + m.mu.Lock() + + for { + // Check for an exact match. + if c := m.vCPUs[tid]; c != nil && c.mu.TryLock() { + m.mu.Unlock() + return c, nil + } + + // Scan for an available vCPU. + for origTID, c := range m.vCPUs { + if c.LockInState(vCPUReady) { + delete(m.vCPUs, origTID) + m.vCPUs[tid] = c + m.mu.Unlock() + + // We need to reload thread-local segments as + // we have origTID != tid and the vCPU state + // may be stale. + c.loadSegments() + atomic.StoreUint64(&c.tid, tid) + return c, nil + } + } + + // Everything is busy executing user code (locked). 
+ // + // We hold the pool lock here, so we should be able to kick something + // out of kernel mode and have it bounce into host mode when it tries + // to grab the vCPU again. + for _, c := range m.vCPUs { + if c.State() != vCPUWaiter { + c.Bounce() + } + } + + // Give other threads an opportunity to run. + yield() + } +} + +// Put puts the current vCPU. +func (m *machine) Put(c *vCPU) { + c.Unlock() + runtime.UnlockOSThread() +} + +// State returns the current state. +func (c *vCPU) State() uintptr { + return atomic.LoadUintptr(&c.state) +} + +// Lock locks the vCPU. +func (c *vCPU) Lock() { + c.mu.Lock() +} + +// Invalidate invalidates caches. +func (c *vCPU) Invalidate() { +} + +// LockInState locks the vCPU if it is in the given state and TryLock succeeds. +func (c *vCPU) LockInState(state uintptr) bool { + if c.State() == state && c.mu.TryLock() { + if c.State() != state { + c.mu.Unlock() + return false + } + return true + } + return false +} + +// Unlock unlocks the given vCPU. +func (c *vCPU) Unlock() { + // Ensure we're out of guest mode, if necessary. + if c.State() == vCPUWaiter { + redpill() // Force guest mode exit. + } + c.mu.Unlock() +} + +// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. +func (c *vCPU) NotifyInterrupt() { + c.Bounce() +} + +// pid is used below in bounce. +var pid = syscall.Getpid() + +// Bounce ensures that the vCPU bounces back to the kernel. +// +// In practice, this means returning EAGAIN from running user code. The vCPU +// will be unlocked and relock, and the kernel is guaranteed to check for +// interrupt notifications (e.g. injected via Notify) and invalidations. +func (c *vCPU) Bounce() { + for { + if c.mu.TryLock() { + // We know that the vCPU must be in the kernel already, + // because the lock was not acquired. We specifically + // don't want to call bounce in this case, because it's + // not necessary to knock the vCPU out of guest mode. + c.mu.Unlock() + return + } + + if state := c.State(); state == vCPUGuest || state == vCPUWaiter { + // We know that the vCPU was in guest mode, so a single signal + // interruption will guarantee that a transition takes place. + syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal) + return + } + + // Someone holds the lock, but the vCPU is not yet transitioned + // into guest mode. It's in the critical section; give it time. + yield() + } +} diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go new file mode 100644 index 000000000..dfa691e88 --- /dev/null +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -0,0 +1,168 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
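machine.Get above prefers the vCPU already keyed by the current thread ID, then scans for any ready vCPU and re-keys it, and finally bounces busy vCPUs and yields before retrying. A simplified standalone sketch of that pooling pattern, with an atomic try-lock standing in for tmutex; it omits the pool mutex, the segment reload, and the bouncing:

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

// pooled is a stand-in for a vCPU: a resource guarded by a try-lock.
type pooled struct {
	locked int32
	id     int
}

func (p *pooled) tryLock() bool { return atomic.CompareAndSwapInt32(&p.locked, 0, 1) }
func (p *pooled) unlock()       { atomic.StoreInt32(&p.locked, 0) }

// get mimics the shape of machine.Get: prefer the entry already keyed by
// this thread, otherwise scan for any free entry and re-key it; spin and
// yield if everything is busy.
func get(pool map[uint64]*pooled, tid uint64) *pooled {
	for {
		if p := pool[tid]; p != nil && p.tryLock() {
			return p
		}
		for origTID, p := range pool {
			if p.tryLock() {
				delete(pool, origTID)
				pool[tid] = p
				return p
			}
		}
		runtime.Gosched() // analogous to yield().
	}
}

func main() {
	pool := map[uint64]*pooled{1000: {id: 0}, 1001: {id: 1}}
	p := get(pool, 42)
	fmt.Println("got resource", p.id)
	p.unlock()
}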
+ +// +build amd64 + +package kvm + +import ( + "fmt" + "reflect" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// initArchState initializes architecture-specific state. +func (m *machine) initArchState(vCPUs int) error { + // Set the legacy TSS address. This address is covered by the reserved + // range (up to 4GB). In fact, this is a main reason it exists. + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(m.fd), + _KVM_SET_TSS_ADDR, + uintptr(reservedMemory-(3*usermem.PageSize))); errno != 0 { + return errno + } + return nil +} + +// initArchState initializes architecture-specific state. +func (c *vCPU) initArchState() error { + var ( + kernelSystemRegs systemRegs + kernelUserRegs userRegs + ) + + // Set base control registers. + kernelSystemRegs.CR0 = c.CR0() + kernelSystemRegs.CR4 = c.CR4() + kernelSystemRegs.EFER = c.EFER() + + // Set the IDT & GDT in the registers. + kernelSystemRegs.IDT.base, kernelSystemRegs.IDT.limit = c.IDT() + kernelSystemRegs.GDT.base, kernelSystemRegs.GDT.limit = c.GDT() + kernelSystemRegs.CS.Load(&ring0.KernelCodeSegment, ring0.Kcode) + kernelSystemRegs.DS.Load(&ring0.UserDataSegment, ring0.Udata) + kernelSystemRegs.ES.Load(&ring0.UserDataSegment, ring0.Udata) + kernelSystemRegs.SS.Load(&ring0.KernelDataSegment, ring0.Kdata) + kernelSystemRegs.FS.Load(&ring0.UserDataSegment, ring0.Udata) + kernelSystemRegs.GS.Load(&ring0.UserDataSegment, ring0.Udata) + tssBase, tssLimit, tss := c.TSS() + kernelSystemRegs.TR.Load(tss, ring0.Tss) + kernelSystemRegs.TR.base = tssBase + kernelSystemRegs.TR.limit = uint32(tssLimit) + + // Point to kernel page tables. + kernelSystemRegs.CR3 = c.machine.kernel.PageTables.FlushCR3() + + // Set the CPUID; this is required before setting system registers, + // since KVM will reject several CR4 bits if the CPUID does not + // indicate the support is available. + if err := c.setCPUID(); err != nil { + return err + } + + // Set the entrypoint for the kernel. + kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer()) + kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer()) + kernelUserRegs.RFLAGS = ring0.KernelFlagsSet + + // Set the system registers. + if err := c.setSystemRegisters(&kernelSystemRegs); err != nil { + return err + } + + // Set the user registers. + if err := c.setUserRegisters(&kernelUserRegs); err != nil { + return err + } + + // Set the time offset to the host native time. + return c.setSystemTime() +} + +// SwitchToUser unpacks architectural-details. +func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetables.PageTables, flags ring0.Flags) (*arch.SignalInfo, usermem.AccessType, error) { + // See below. + var vector ring0.Vector + + // Past this point, stack growth can cause system calls (and a break + // from guest mode). So we need to ensure that between the bluepill + // call here and the switch call immediately below, no additional + // allocations occur. + entersyscall() + bluepill(c) + vector = c.CPU.SwitchToUser(regs, fpState, pt, flags) + exitsyscall() + + // Free and clear. 
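initArchState above points the guest RIP at ring0.Start by taking the entry PC of a Go function value through reflect. That trick is easy to see in isolation (illustrative only; entryPoint is a made-up function):

package main

import (
	"fmt"
	"reflect"
)

func entryPoint() {}

func main() {
	// reflect.ValueOf(fn).Pointer() yields the function's entry PC, which
	// is what gets loaded into the guest's RIP register above.
	rip := uint64(reflect.ValueOf(entryPoint).Pointer())
	fmt.Printf("entry PC: %#x\n", rip)
}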
+ switch vector { + case ring0.Debug, ring0.Breakpoint: + info := &arch.SignalInfo{Signo: int32(syscall.SIGTRAP)} + return info, usermem.AccessType{}, platform.ErrContextSignal + + case ring0.PageFault: + bluepill(c) // Probably no-op, but may not be. + faultAddr := ring0.ReadCR2() + code, user := c.ErrorCode() + if !user { + // The last fault serviced by this CPU was not a user + // fault, so we can't reliably trust the faultAddr or + // the code provided here. We need to re-execute. + return nil, usermem.NoAccess, platform.ErrContextInterrupt + } + info := &arch.SignalInfo{Signo: int32(syscall.SIGSEGV)} + info.SetAddr(uint64(faultAddr)) + accessType := usermem.AccessType{ + Read: code&(1<<1) == 0, + Write: code&(1<<1) != 0, + Execute: code&(1<<4) != 0, + } + return info, accessType, platform.ErrContextSignal + + case ring0.GeneralProtectionFault: + if !ring0.IsCanonical(regs.Rip) { + // If the RIP is non-canonical, it's a SEGV. + info := &arch.SignalInfo{Signo: int32(syscall.SIGSEGV)} + return info, usermem.AccessType{}, platform.ErrContextSignal + } + // Otherwise, we deliver a SIGBUS. + info := &arch.SignalInfo{Signo: int32(syscall.SIGBUS)} + return info, usermem.AccessType{}, platform.ErrContextSignal + + case ring0.InvalidOpcode: + info := &arch.SignalInfo{Signo: int32(syscall.SIGILL)} + return info, usermem.AccessType{}, platform.ErrContextSignal + + case ring0.X87FloatingPointException: + info := &arch.SignalInfo{Signo: int32(syscall.SIGFPE)} + return info, usermem.AccessType{}, platform.ErrContextSignal + + case ring0.Vector(bounce): + redpill() // Bail and reacqire. + return nil, usermem.NoAccess, platform.ErrContextInterrupt + + case ring0.Syscall, ring0.SyscallInt80: + // System call executed. + return nil, usermem.NoAccess, nil + + default: + panic(fmt.Sprintf("unexpected vector: 0x%x", vector)) + } +} diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go new file mode 100644 index 000000000..c2bcb3a47 --- /dev/null +++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go @@ -0,0 +1,156 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package kvm + +import ( + "fmt" + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/time" +) + +// setMemoryRegion initializes a region. +// +// This may be called from bluepillHandler, and therefore returns an errno +// directly (instead of wrapping in an error) to avoid allocations. +// +//go:nosplit +func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno { + userRegion := userMemoryRegion{ + slot: uint32(slot), + flags: 0, + guestPhysAddr: uint64(physical), + memorySize: uint64(length), + userspaceAddr: uint64(virtual), + } + + // Set the region. 
+ _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(m.fd), + _KVM_SET_USER_MEMORY_REGION, + uintptr(unsafe.Pointer(&userRegion))) + return errno +} + +// loadSegments copies the current segments. +// +// This may be called from within the signal context and throws on error. +// +//go:nosplit +func (c *vCPU) loadSegments() { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_ARCH_PRCTL, + linux.ARCH_GET_FS, + uintptr(unsafe.Pointer(&c.CPU.Registers().Fs_base)), + 0); errno != 0 { + throw("getting FS segment") + } + if _, _, errno := syscall.RawSyscall( + syscall.SYS_ARCH_PRCTL, + linux.ARCH_GET_GS, + uintptr(unsafe.Pointer(&c.CPU.Registers().Gs_base)), + 0); errno != 0 { + throw("getting GS segment") + } +} + +// setUserRegisters sets user registers in the vCPU. +func (c *vCPU) setUserRegisters(uregs *userRegs) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_REGS, + uintptr(unsafe.Pointer(uregs))); errno != 0 { + return fmt.Errorf("error setting user registers: %v", errno) + } + return nil +} + +// setSystemRegisters sets system registers. +func (c *vCPU) setSystemRegisters(sregs *systemRegs) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_SREGS, + uintptr(unsafe.Pointer(sregs))); errno != 0 { + return fmt.Errorf("error setting system registers: %v", errno) + } + return nil +} + +// setCPUID sets the CPUID to be used by the guest. +func (c *vCPU) setCPUID() error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_CPUID2, + uintptr(unsafe.Pointer(&cpuidSupported))); errno != 0 { + return fmt.Errorf("error setting CPUID: %v", errno) + } + return nil +} + +// setSystemTime sets the TSC for the vCPU. +// +// FIXME: This introduces a slight TSC offset between host and +// guest, which may vary per vCPU. +func (c *vCPU) setSystemTime() error { + const _MSR_IA32_TSC = 0x00000010 + registers := modelControlRegisters{ + nmsrs: 1, + } + registers.entries[0] = modelControlRegister{ + index: _MSR_IA32_TSC, + data: uint64(time.Rdtsc()), + } + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_MSRS, + uintptr(unsafe.Pointer(®isters))); errno != 0 { + return fmt.Errorf("error setting system time: %v", errno) + } + return nil +} + +// setSignalMask sets the vCPU signal mask. +// +// This must be called prior to running the vCPU. +func (c *vCPU) setSignalMask() error { + // The layout of this structure implies that it will not necessarily be + // the same layout chosen by the Go compiler. It gets fudged here. + var data struct { + length uint32 + mask1 uint32 + mask2 uint32 + _ uint32 + } + data.length = 8 // Fixed sigset size. + data.mask1 = ^uint32(bounceSignalMask & 0xffffffff) + data.mask2 = ^uint32(bounceSignalMask >> 32) + if _, _, errno := syscall.RawSyscall( + syscall.SYS_IOCTL, + uintptr(c.fd), + _KVM_SET_SIGNAL_MASK, + uintptr(unsafe.Pointer(&data))); errno != 0 { + return fmt.Errorf("error setting signal mask: %v", errno) + } + return nil +} diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go new file mode 100644 index 000000000..da67e23f6 --- /dev/null +++ b/pkg/sentry/platform/kvm/machine_unsafe.go @@ -0,0 +1,112 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "fmt" + "sync/atomic" + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables" +) + +//go:linkname entersyscall runtime.entersyscall +func entersyscall() + +//go:linkname exitsyscall runtime.exitsyscall +func exitsyscall() + +// TranslateToVirtual implements pagetables.Translater.TranslateToPhysical. +func (m *machine) TranslateToPhysical(ptes *pagetables.PTEs) uintptr { + // The length doesn't matter because all these translations require + // only a single page, which is guaranteed to be satisfied. + physical, _, ok := TranslateToPhysical(uintptr(unsafe.Pointer(ptes))) + if !ok { + panic("unable to translate pagetables.Node to physical address") + } + return physical +} + +// mapRunData maps the vCPU run data. +func mapRunData(fd int) (*runData, error) { + r, _, errno := syscall.RawSyscall6( + syscall.SYS_MMAP, + 0, + uintptr(runDataSize), + syscall.PROT_READ|syscall.PROT_WRITE, + syscall.MAP_SHARED, + uintptr(fd), + 0) + if errno != 0 { + return nil, fmt.Errorf("error mapping runData: %v", errno) + } + return (*runData)(unsafe.Pointer(r)), nil +} + +// unmapRunData unmaps the vCPU run data. +func unmapRunData(r *runData) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_MUNMAP, + uintptr(unsafe.Pointer(r)), + uintptr(runDataSize), + 0); errno != 0 { + return fmt.Errorf("error unmapping runData: %v", errno) + } + return nil +} + +// notify notifies that the vCPU has returned to host mode. +// +// This may be called by a signal handler and therefore throws on error. +// +//go:nosplit +func (c *vCPU) notify() { + _, _, errno := syscall.RawSyscall6( + syscall.SYS_FUTEX, + uintptr(unsafe.Pointer(&c.state)), + linux.FUTEX_WAKE, + ^uintptr(0), // Number of waiters. + 0, 0, 0) + if errno != 0 { + throw("futex wake error") + } +} + +// wait waits for the vCPU to return to host mode. +// +// This panics on error. +func (c *vCPU) wait() { + if !atomic.CompareAndSwapUintptr(&c.state, vCPUGuest, vCPUWaiter) { + return // Nothing to wait for. + } + for { + _, _, errno := syscall.Syscall6( + syscall.SYS_FUTEX, + uintptr(unsafe.Pointer(&c.state)), + linux.FUTEX_WAIT, + uintptr(vCPUWaiter), // Expected value. + 0, 0, 0) + if errno == syscall.EINTR { + continue + } else if errno == syscall.EAGAIN { + break + } else if errno != 0 { + panic("futex wait error") + } + break + } +} diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go new file mode 100644 index 000000000..5d55c9486 --- /dev/null +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -0,0 +1,221 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "fmt" + "sort" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // reservedMemory is a chunk of physical memory reserved starting at + // physical address zero. There are some special pages in this region, + // so we just call the whole thing off. + // + // Other architectures may define this to be zero. + reservedMemory = 0x100000000 +) + +type region struct { + virtual uintptr + length uintptr +} + +type physicalRegion struct { + region + physical uintptr +} + +// physicalRegions contains a list of available physical regions. +// +// The physical value used in physicalRegions is a number indicating the +// physical offset, aligned appropriately and starting above reservedMemory. +var physicalRegions []physicalRegion + +// fillAddressSpace fills the host address space with PROT_NONE mappings until +// the number of available bits until we have a host address space size that is +// equal to the physical address space. +// +// The excluded regions are returned. +func fillAddressSpace() (excludedRegions []region) { + // We can cut vSize in half, because the kernel will be using the top + // half and we ignore it while constructing mappings. It's as if we've + // already excluded half the possible addresses. + vSize := uintptr(1) << ring0.VirtualAddressBits() + vSize = vSize >> 1 + + // We exclude reservedMemory below from our physical memory size, so it + // needs to be dropped here as well. Otherwise, we could end up with + // physical addresses that are beyond what is mapped. + pSize := uintptr(1) << ring0.PhysicalAddressBits() + pSize -= reservedMemory + + // Sanity check. + if vSize < pSize { + panic(fmt.Sprintf("vSize (%x) < pSize (%x)", vSize, pSize)) + } + + // Add specifically excluded regions; see excludeVirtualRegion. + applyVirtualRegions(func(vr virtualRegion) { + if excludeVirtualRegion(vr) { + excludedRegions = append(excludedRegions, vr.region) + vSize -= vr.length + log.Infof("excluded: virtual [%x,%x)", vr.virtual, vr.virtual+vr.length) + } + }) + + // Calculate the required space and fill it. + // + // Note carefully that we add faultBlockSize to required up front, and + // on each iteration of the loop below (i.e. each new physical region + // we define), we add faultBlockSize again. This is done because the + // computation of physical regions will ensure proper alignments with + // faultBlockSize, potentially causing up to faultBlockSize bytes in + // internal fragmentation for each physical region. So we need to + // account for this properly during allocation. + requiredAddr, ok := usermem.Addr(vSize - pSize + faultBlockSize).RoundUp() + if !ok { + panic(fmt.Sprintf( + "overflow for vSize (%x) - pSize (%x) + faultBlockSize (%x)", + vSize, pSize, faultBlockSize)) + } + required := uintptr(requiredAddr) + current := required // Attempted mmap size. 
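fillAddressSpace above sizes the reservation from the virtual and physical address-space widths: only the lower half of the virtual space is usable, and reservedMemory is subtracted from the physical size. A standalone sketch of that arithmetic, with the bit widths assumed here rather than read from ring0:

package main

import "fmt"

func main() {
	const (
		virtualAddressBits  = 48          // assumed; ring0.VirtualAddressBits() on the host.
		physicalAddressBits = 46          // assumed; ring0.PhysicalAddressBits() on the host.
		reservedMemory      = 0x100000000 // 4GB knocked out at physical address zero.
	)

	// Only the lower half of the virtual address space is available for
	// mappings, and the reserved region is never mapped.
	vSize := uintptr(1) << virtualAddressBits >> 1
	pSize := uintptr(1)<<physicalAddressBits - reservedMemory

	fmt.Printf("usable virtual: %#x, physical to cover: %#x, fits=%v\n",
		vSize, pSize, vSize >= pSize)
}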
+ for filled := uintptr(0); filled < required && current > 0; { + addr, _, errno := syscall.RawSyscall6( + syscall.SYS_MMAP, + 0, // Suggested address. + current, + syscall.PROT_NONE, + syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE|syscall.MAP_NORESERVE, + 0, 0) + if errno != 0 { + // Attempt half the size; overflow not possible. + currentAddr, _ := usermem.Addr(current >> 1).RoundUp() + current = uintptr(currentAddr) + continue + } + // We filled a block. + filled += current + excludedRegions = append(excludedRegions, region{ + virtual: addr, + length: current, + }) + // See comment above. + if filled != required { + required += faultBlockSize + } + } + if current == 0 { + panic("filling address space failed") + } + sort.Slice(excludedRegions, func(i, j int) bool { + return excludedRegions[i].virtual < excludedRegions[j].virtual + }) + for _, r := range excludedRegions { + log.Infof("region: virtual [%x,%x)", r.virtual, r.virtual+r.length) + } + return excludedRegions +} + +// computePhysicalRegions computes physical regions. +func computePhysicalRegions(excludedRegions []region) (physicalRegions []physicalRegion) { + physical := uintptr(reservedMemory) + addValidRegion := func(virtual, length uintptr) { + if length == 0 { + return + } + if virtual == 0 { + virtual += usermem.PageSize + length -= usermem.PageSize + } + if end := virtual + length; end > ring0.MaximumUserAddress { + length -= (end - ring0.MaximumUserAddress) + } + if length == 0 { + return + } + // Round physical up to the same alignment as the virtual + // address (with respect to faultBlockSize). + if offset := virtual &^ faultBlockMask; physical&^faultBlockMask != offset { + if newPhysical := (physical & faultBlockMask) + offset; newPhysical > physical { + physical = newPhysical // Round up by only a little bit. + } else { + physical = ((physical + faultBlockSize) & faultBlockMask) + offset + } + } + physicalRegions = append(physicalRegions, physicalRegion{ + region: region{ + virtual: virtual, + length: length, + }, + physical: physical, + }) + physical += length + } + lastExcludedEnd := uintptr(0) + for _, r := range excludedRegions { + addValidRegion(lastExcludedEnd, r.virtual-lastExcludedEnd) + lastExcludedEnd = r.virtual + r.length + } + addValidRegion(lastExcludedEnd, ring0.MaximumUserAddress-lastExcludedEnd) + + // Dump our all physical regions. + for _, r := range physicalRegions { + log.Infof("physicalRegion: virtual [%x,%x) => physical [%x,%x)", + r.virtual, r.virtual+r.length, r.physical, r.physical+r.length) + } + return physicalRegions +} + +// physicalInit initializes physical address mappings. +func physicalInit() { + physicalRegions = computePhysicalRegions(fillAddressSpace()) +} + +// applyPhysicalRegions applies the given function on physical regions. +// +// Iteration continues as long as true is returned. The return value is the +// return from the last call to fn, or true if there are no entries. +// +// Precondition: physicalInit must have been called. +func applyPhysicalRegions(fn func(pr physicalRegion) bool) bool { + for _, pr := range physicalRegions { + if !fn(pr) { + return false + } + } + return true +} + +// TranslateToPhysical translates the given virtual address. +// +// Precondition: physicalInit must have been called. 
+func TranslateToPhysical(virtual uintptr) (physical uintptr, length uintptr, ok bool) { + ok = !applyPhysicalRegions(func(pr physicalRegion) bool { + if pr.virtual <= virtual && virtual < pr.virtual+pr.length { + physical = pr.physical + (virtual - pr.virtual) + length = pr.length - (virtual - pr.virtual) + return false + } + return true + }) + return +} diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD new file mode 100644 index 000000000..8533a8d89 --- /dev/null +++ b/pkg/sentry/platform/kvm/testutil/BUILD @@ -0,0 +1,15 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "testutil", + testonly = 1, + srcs = [ + "testutil.go", + "testutil_amd64.go", + "testutil_amd64.s", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm/testutil", + visibility = ["//pkg/sentry/platform/kvm:__pkg__"], +) diff --git a/pkg/sentry/platform/kvm/testutil/testutil.go b/pkg/sentry/platform/kvm/testutil/testutil.go new file mode 100644 index 000000000..8a614e25d --- /dev/null +++ b/pkg/sentry/platform/kvm/testutil/testutil.go @@ -0,0 +1,75 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package testutil provides common assembly stubs for testing. +package testutil + +import ( + "fmt" + "strings" +) + +// Getpid executes a trivial system call. +func Getpid() + +// Touch touches the value in the first register. +func Touch() + +// SyscallLoop executes a syscall and loops. +func SyscallLoop() + +// SpinLoop spins on the CPU. +func SpinLoop() + +// HaltLoop immediately halts and loops. +func HaltLoop() + +// TwiddleRegsFault twiddles registers then faults. +func TwiddleRegsFault() + +// TwiddleRegsSyscall twiddles registers then executes a syscall. +func TwiddleRegsSyscall() + +// TwiddleSegments reads segments into known registers. +func TwiddleSegments() + +// FloatingPointWorks is a floating point test. +// +// It returns true or false. +func FloatingPointWorks() bool + +// RegisterMismatchError is used for checking registers. +type RegisterMismatchError []string + +// Error returns a human-readable error. +func (r RegisterMismatchError) Error() string { + return strings.Join([]string(r), ";") +} + +// addRegisterMisatch allows simple chaining of register mismatches. +func addRegisterMismatch(err error, reg string, got, expected interface{}) error { + errStr := fmt.Sprintf("%s got %08x, expected %08x", reg, got, expected) + switch r := err.(type) { + case nil: + // Return a new register mismatch. + return RegisterMismatchError{errStr} + case RegisterMismatchError: + // Append the error. + r = append(r, errStr) + return r + default: + // Leave as is. 
+ return err + } +} diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go new file mode 100644 index 000000000..39286a0af --- /dev/null +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go @@ -0,0 +1,135 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package testutil + +import ( + "reflect" + "syscall" +) + +// SetTestTarget sets the rip appropriately. +func SetTestTarget(regs *syscall.PtraceRegs, fn func()) { + regs.Rip = uint64(reflect.ValueOf(fn).Pointer()) +} + +// SetTouchTarget sets rax appropriately. +func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) { + if target != nil { + regs.Rax = uint64(reflect.ValueOf(target).Pointer()) + } else { + regs.Rax = 0 + } +} + +// RewindSyscall rewinds a syscall RIP. +func RewindSyscall(regs *syscall.PtraceRegs) { + regs.Rip -= 2 +} + +// SetTestRegs initializes registers to known values. +func SetTestRegs(regs *syscall.PtraceRegs) { + regs.R15 = 0x15 + regs.R14 = 0x14 + regs.R13 = 0x13 + regs.R12 = 0x12 + regs.Rbp = 0xb9 + regs.Rbx = 0xb4 + regs.R11 = 0x11 + regs.R10 = 0x10 + regs.R9 = 0x09 + regs.R8 = 0x08 + regs.Rax = 0x44 + regs.Rcx = 0xc4 + regs.Rdx = 0xd4 + regs.Rsi = 0x51 + regs.Rdi = 0xd1 + regs.Rsp = 0x59 +} + +// CheckTestRegs checks that registers were twiddled per TwiddleRegs. +func CheckTestRegs(regs *syscall.PtraceRegs, full bool) (err error) { + if need := ^uint64(0x15); regs.R15 != need { + err = addRegisterMismatch(err, "R15", regs.R15, need) + } + if need := ^uint64(0x14); regs.R14 != need { + err = addRegisterMismatch(err, "R14", regs.R14, need) + } + if need := ^uint64(0x13); regs.R13 != need { + err = addRegisterMismatch(err, "R13", regs.R13, need) + } + if need := ^uint64(0x12); regs.R12 != need { + err = addRegisterMismatch(err, "R12", regs.R12, need) + } + if need := ^uint64(0xb9); regs.Rbp != need { + err = addRegisterMismatch(err, "Rbp", regs.Rbp, need) + } + if need := ^uint64(0xb4); regs.Rbx != need { + err = addRegisterMismatch(err, "Rbx", regs.Rbx, need) + } + if need := ^uint64(0x10); regs.R10 != need { + err = addRegisterMismatch(err, "R10", regs.R10, need) + } + if need := ^uint64(0x09); regs.R9 != need { + err = addRegisterMismatch(err, "R9", regs.R9, need) + } + if need := ^uint64(0x08); regs.R8 != need { + err = addRegisterMismatch(err, "R8", regs.R8, need) + } + if need := ^uint64(0x44); regs.Rax != need { + err = addRegisterMismatch(err, "Rax", regs.Rax, need) + } + if need := ^uint64(0xd4); regs.Rdx != need { + err = addRegisterMismatch(err, "Rdx", regs.Rdx, need) + } + if need := ^uint64(0x51); regs.Rsi != need { + err = addRegisterMismatch(err, "Rsi", regs.Rsi, need) + } + if need := ^uint64(0xd1); regs.Rdi != need { + err = addRegisterMismatch(err, "Rdi", regs.Rdi, need) + } + if need := ^uint64(0x59); regs.Rsp != need { + err = addRegisterMismatch(err, "Rsp", regs.Rsp, need) + } + // Rcx & R11 are ignored if !full is set. 
+ if need := ^uint64(0x11); full && regs.R11 != need { + err = addRegisterMismatch(err, "R11", regs.R11, need) + } + if need := ^uint64(0xc4); full && regs.Rcx != need { + err = addRegisterMismatch(err, "Rcx", regs.Rcx, need) + } + return +} + +var fsData uint64 = 0x55 +var gsData uint64 = 0x85 + +// SetTestSegments initializes segments to known values. +func SetTestSegments(regs *syscall.PtraceRegs) { + regs.Fs_base = uint64(reflect.ValueOf(&fsData).Pointer()) + regs.Gs_base = uint64(reflect.ValueOf(&gsData).Pointer()) +} + +// CheckTestSegments checks that registers were twiddled per TwiddleSegments. +func CheckTestSegments(regs *syscall.PtraceRegs) (err error) { + if regs.Rax != fsData { + err = addRegisterMismatch(err, "Rax", regs.Rax, fsData) + } + if regs.Rbx != gsData { + err = addRegisterMismatch(err, "Rbx", regs.Rcx, gsData) + } + return +} diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s new file mode 100644 index 000000000..3b5ad8817 --- /dev/null +++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s @@ -0,0 +1,98 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +// test_util_amd64.s provides AMD64 test functions. + +#include "funcdata.h" +#include "textflag.h" + +TEXT ·Getpid(SB),NOSPLIT,$0 + NO_LOCAL_POINTERS + MOVQ $39, AX // getpid + SYSCALL + RET + +TEXT ·Touch(SB),NOSPLIT,$0 +start: + MOVQ 0(AX), BX // deref AX + MOVQ $39, AX // getpid + SYSCALL + JMP start + +TEXT ·HaltLoop(SB),NOSPLIT,$0 +start: + HLT + JMP start + +TEXT ·SyscallLoop(SB),NOSPLIT,$0 +start: + SYSCALL + JMP start + +TEXT ·SpinLoop(SB),NOSPLIT,$0 +start: + JMP start + +TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8 + NO_LOCAL_POINTERS + MOVQ $1, AX + MOVQ AX, X0 + MOVQ $39, AX // getpid + SYSCALL + MOVQ X0, AX + CMPQ AX, $1 + SETEQ ret+0(FP) + RET + +#define TWIDDLE_REGS() \ + NOTQ R15; \ + NOTQ R14; \ + NOTQ R13; \ + NOTQ R12; \ + NOTQ BP; \ + NOTQ BX; \ + NOTQ R11; \ + NOTQ R10; \ + NOTQ R9; \ + NOTQ R8; \ + NOTQ AX; \ + NOTQ CX; \ + NOTQ DX; \ + NOTQ SI; \ + NOTQ DI; \ + NOTQ SP; + +TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0 + TWIDDLE_REGS() + SYSCALL + RET // never reached + +TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0 + TWIDDLE_REGS() + JMP AX // must fault + RET // never reached + +#define READ_FS() BYTE $0x64; BYTE $0x48; BYTE $0x8b; BYTE $0x00; +#define READ_GS() BYTE $0x65; BYTE $0x48; BYTE $0x8b; BYTE $0x00; + +TEXT ·TwiddleSegments(SB),NOSPLIT,$0 + MOVQ $0x0, AX + READ_GS() + MOVQ AX, BX + MOVQ $0x0, AX + READ_FS() + SYSCALL + RET // never reached diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go new file mode 100644 index 000000000..0d3fbe043 --- /dev/null +++ b/pkg/sentry/platform/kvm/virtual_map.go @@ -0,0 +1,113 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "bufio" + "fmt" + "io" + "os" + "regexp" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type virtualRegion struct { + region + accessType usermem.AccessType + shared bool + offset uintptr + filename string +} + +// mapsLine matches a single line from /proc/PID/maps. +var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2}:[0-9a-f]{2,} [0-9]+\\s+(.*)") + +// excludeRegion returns true if these regions should be excluded from the +// physical map. Virtual regions need to be excluded if get_user_pages will +// fail on those addresses, preventing KVM from satisfying EPT faults. +// +// This includes the VVAR page because the VVAR page may be mapped as I/O +// memory. And the VDSO page is knocked out because the VVAR page is not even +// recorded in /proc/self/maps on older kernels; knocking out the VDSO page +// prevents code in the VDSO from accessing the VVAR address. +// +// This is called by the physical map functions, not applyVirtualRegions. +func excludeVirtualRegion(r virtualRegion) bool { + return r.filename == "[vvar]" || r.filename == "[vdso]" +} + +// applyVirtualRegions parses the process maps file. +// +// Unlike mappedRegions, these are not consistent over time. +func applyVirtualRegions(fn func(vr virtualRegion)) error { + // Open /proc/self/maps. + f, err := os.Open("/proc/self/maps") + if err != nil { + return err + } + defer f.Close() + + // Parse all entries. + r := bufio.NewReader(f) + for { + b, err := r.ReadBytes('\n') + if b != nil && len(b) > 0 { + m := mapsLine.FindSubmatch(b) + if m == nil { + // This should not happen: kernel bug? + return fmt.Errorf("badly formed line: %v", string(b)) + } + start, err := strconv.ParseUint(string(m[1]), 16, 64) + if err != nil { + return fmt.Errorf("bad start address: %v", string(b)) + } + end, err := strconv.ParseUint(string(m[2]), 16, 64) + if err != nil { + return fmt.Errorf("bad end address: %v", string(b)) + } + read := m[3][0] == 'r' + write := m[3][1] == 'w' + execute := m[3][2] == 'x' + shared := m[3][3] == 's' + offset, err := strconv.ParseUint(string(m[4]), 16, 64) + if err != nil { + return fmt.Errorf("bad offset: %v", string(b)) + } + fn(virtualRegion{ + region: region{ + virtual: uintptr(start), + length: uintptr(end - start), + }, + accessType: usermem.AccessType{ + Read: read, + Write: write, + Execute: execute, + }, + shared: shared, + offset: uintptr(offset), + filename: string(m[5]), + }) + } + if err != nil && err == io.EOF { + break + } else if err != nil { + return err + } + } + + return nil +} diff --git a/pkg/sentry/platform/kvm/virtual_map_test.go b/pkg/sentry/platform/kvm/virtual_map_test.go new file mode 100644 index 000000000..31e5b0e61 --- /dev/null +++ b/pkg/sentry/platform/kvm/virtual_map_test.go @@ -0,0 +1,78 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kvm + +import ( + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type checker struct { + ok bool +} + +func (c *checker) Contains(addr uintptr) func(virtualRegion) { + c.ok = false // Reset for below calls. + return func(vr virtualRegion) { + if vr.virtual <= addr && addr < vr.virtual+vr.length { + c.ok = true + } + } +} + +func TestParseMaps(t *testing.T) { + c := new(checker) + + // Simple test. + if err := applyVirtualRegions(c.Contains(0)); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // MMap a new page. + addr, _, errno := syscall.RawSyscall6( + syscall.SYS_MMAP, 0, usermem.PageSize, + syscall.PROT_READ|syscall.PROT_WRITE, + syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE, 0, 0) + if errno != 0 { + t.Fatalf("unexpected map error: %v", errno) + } + + // Re-parse maps. + if err := applyVirtualRegions(c.Contains(addr)); err != nil { + syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0) + t.Fatalf("unexpected error: %v", err) + } + + // Assert that it now does contain the region. + if !c.ok { + syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0) + t.Fatalf("updated map does not contain 0x%08x, expected true", addr) + } + + // Unmap the region. + syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0) + + // Re-parse maps. + if err := applyVirtualRegions(c.Contains(addr)); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Assert that it once again does _not_ contain the region. + if c.ok { + t.Fatalf("final map does contain 0x%08x, expected false", addr) + } +} |
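For reference, the mapsLine regular expression used by applyVirtualRegions above can be exercised on its own; the sample line below is made up, but it has the same shape as a /proc/self/maps entry:

package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// The same line format that applyVirtualRegions parses out of /proc/self/maps.
var mapsLine = regexp.MustCompile(`([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2}:[0-9a-f]{2,} [0-9]+\s+(.*)`)

func main() {
	sample := "7f0000000000-7f0000200000 r-xp 00000000 08:01 1234567    /lib/x86_64-linux-gnu/libc.so.6"
	m := mapsLine.FindStringSubmatch(sample)
	if m == nil {
		fmt.Println("badly formed line")
		return
	}
	start, _ := strconv.ParseUint(m[1], 16, 64)
	end, _ := strconv.ParseUint(m[2], 16, 64)
	fmt.Printf("virtual [%#x,%#x) perms=%s file=%s\n", start, end, m[3], m[5])
}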