summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/platform/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/platform/kvm')
-rw-r--r--pkg/sentry/platform/kvm/BUILD90
-rw-r--r--pkg/sentry/platform/kvm/address_space.go207
-rw-r--r--pkg/sentry/platform/kvm/bluepill.go41
-rw-r--r--pkg/sentry/platform/kvm/bluepill_amd64.go143
-rw-r--r--pkg/sentry/platform/kvm/bluepill_amd64.s87
-rw-r--r--pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go28
-rw-r--r--pkg/sentry/platform/kvm/bluepill_fault.go127
-rw-r--r--pkg/sentry/platform/kvm/bluepill_unsafe.go175
-rw-r--r--pkg/sentry/platform/kvm/context.go81
-rw-r--r--pkg/sentry/platform/kvm/host_map.go168
-rw-r--r--pkg/sentry/platform/kvm/kvm.go149
-rw-r--r--pkg/sentry/platform/kvm/kvm_amd64.go213
-rw-r--r--pkg/sentry/platform/kvm/kvm_amd64_unsafe.go93
-rw-r--r--pkg/sentry/platform/kvm/kvm_const.go56
-rw-r--r--pkg/sentry/platform/kvm/kvm_test.go415
-rw-r--r--pkg/sentry/platform/kvm/machine.go412
-rw-r--r--pkg/sentry/platform/kvm/machine_amd64.go168
-rw-r--r--pkg/sentry/platform/kvm/machine_amd64_unsafe.go156
-rw-r--r--pkg/sentry/platform/kvm/machine_unsafe.go112
-rw-r--r--pkg/sentry/platform/kvm/physical_map.go221
-rw-r--r--pkg/sentry/platform/kvm/testutil/BUILD15
-rw-r--r--pkg/sentry/platform/kvm/testutil/testutil.go75
-rw-r--r--pkg/sentry/platform/kvm/testutil/testutil_amd64.go135
-rw-r--r--pkg/sentry/platform/kvm/testutil/testutil_amd64.s98
-rw-r--r--pkg/sentry/platform/kvm/virtual_map.go113
-rw-r--r--pkg/sentry/platform/kvm/virtual_map_test.go78
26 files changed, 3656 insertions, 0 deletions
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
new file mode 100644
index 000000000..d902e344a
--- /dev/null
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -0,0 +1,90 @@
+package(licenses = ["notice"])  # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+# Instantiate the generic segment set as hostMapSet, used by host_map.go to
+# track host mappings (usermem.AddrRange keys -> host address uintptr values).
+go_template_instance(
+    name = "host_map_set",
+    out = "host_map_set.go",
+    consts = {
+        "minDegree": "15",
+    },
+    imports = {
+        "usermem": "gvisor.googlesource.com/gvisor/pkg/sentry/usermem",
+    },
+    package = "kvm",
+    prefix = "hostMap",
+    template = "//pkg/segment:generic_set",
+    types = {
+        "Key": "usermem.Addr",
+        "Range": "usermem.AddrRange",
+        "Value": "uintptr",
+        "Functions": "hostMapSetFunctions",
+    },
+)
+
+# The KVM platform library; includes the generated host_map_set.go above.
+go_library(
+    name = "kvm",
+    srcs = [
+        "address_space.go",
+        "bluepill.go",
+        "bluepill_amd64.go",
+        "bluepill_amd64.s",
+        "bluepill_amd64_unsafe.go",
+        "bluepill_fault.go",
+        "bluepill_unsafe.go",
+        "context.go",
+        "host_map.go",
+        "host_map_set.go",
+        "kvm.go",
+        "kvm_amd64.go",
+        "kvm_amd64_unsafe.go",
+        "kvm_const.go",
+        "machine.go",
+        "machine_amd64.go",
+        "machine_amd64_unsafe.go",
+        "machine_unsafe.go",
+        "physical_map.go",
+        "virtual_map.go",
+    ],
+    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm",
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/cpuid",
+        "//pkg/log",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/platform/filemem",
+        "//pkg/sentry/platform/interrupt",
+        "//pkg/sentry/platform/procid",
+        "//pkg/sentry/platform/ring0",
+        "//pkg/sentry/platform/ring0/pagetables",
+        "//pkg/sentry/platform/safecopy",
+        "//pkg/sentry/time",
+        "//pkg/sentry/usermem",
+        "//pkg/tmutex",
+    ],
+)
+
+# Tests require real /dev/kvm access ("requires-kvm") and are incompatible
+# with TSAN ("nogotsan") because guest transitions bypass the race detector.
+go_test(
+    name = "kvm_test",
+    size = "small",
+    srcs = [
+        "kvm_test.go",
+        "virtual_map_test.go",
+    ],
+    embed = [":kvm"],
+    tags = [
+        "nogotsan",
+        "requires-kvm",
+    ],
+    deps = [
+        "//pkg/sentry/arch",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/platform/kvm/testutil",
+        "//pkg/sentry/platform/ring0",
+        "//pkg/sentry/platform/ring0/pagetables",
+        "//pkg/sentry/usermem",
+    ],
+)
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
new file mode 100644
index 000000000..791f038b0
--- /dev/null
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -0,0 +1,207 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "reflect"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// addressSpace is a wrapper for PageTables.
+//
+// It implements platform.AddressSpace on top of a set of guest page tables,
+// tracking which vCPUs may hold stale translations (dirtySet) and any host
+// file mappings created on behalf of this address space (files).
+type addressSpace struct {
+	platform.NoAddressSpaceIO
+
+	// filemem is the memory instance.
+	filemem *filemem.FileMem
+
+	// machine is the underlying machine.
+	machine *machine
+
+	// pageTables are for this particular address space.
+	pageTables *pagetables.PageTables
+
+	// dirtySet is the set of dirty vCPUs.
+	//
+	// The key is the vCPU, the value is a shared uint32 pointer that
+	// indicates whether or not the context is clean. A zero here indicates
+	// that the context should be cleaned prior to re-entry.
+	dirtySet sync.Map
+
+	// files contains files mapped in the host address space.
+	files hostMap
+}
+
+// Invalidate interrupts all dirty contexts.
+//
+// Each vCPU that has run with these page tables is marked as requiring a
+// clean (flag set to zero) and then bounced out of guest mode, so the
+// invalidation takes effect before that vCPU re-enters the guest.
+func (as *addressSpace) Invalidate() {
+	as.dirtySet.Range(func(key, value interface{}) bool {
+		c := key.(*vCPU)
+		v := value.(*uint32)
+		atomic.StoreUint32(v, 0) // Invalidation required.
+		c.Bounce()               // Force a kernel transition.
+		return true              // Keep iterating.
+	})
+}
+
+// Touch adds the given vCPU to the dirty list.
+//
+// It returns the shared dirty flag for (as, c); see dirtySet. The Load
+// followed by LoadOrStore avoids allocating a fresh uint32 on the common
+// path where the vCPU is already present.
+func (as *addressSpace) Touch(c *vCPU) *uint32 {
+	value, ok := as.dirtySet.Load(c)
+	if !ok {
+		value, _ = as.dirtySet.LoadOrStore(c, new(uint32))
+	}
+	return value.(*uint32)
+}
+
+// mapHost installs guest page table mappings at addr for the host mapping m.
+//
+// A single host virtual range may span multiple physical regions, so the
+// entry is translated and mapped piecewise. The returned inv indicates
+// whether a previous mapping was replaced, in which case the caller must
+// invalidate dirty vCPUs.
+func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
+	for m.length > 0 {
+		physical, length, ok := TranslateToPhysical(m.addr)
+		if !ok {
+			panic("unable to translate segment")
+		}
+		if length > m.length {
+			// Clamp to the remaining host mapping.
+			length = m.length
+		}
+
+		// Ensure that this map has physical mappings. If the page does
+		// not have physical mappings, the KVM module may inject
+		// spurious exceptions when emulation fails (i.e. it tries to
+		// emulate because the RIP is pointed at those pages).
+		as.machine.mapPhysical(physical, length)
+
+		// Install the page table mappings. Note that the ordering is
+		// important; if the pagetable mappings were installed before
+		// ensuring the physical pages were available, then some other
+		// thread could theoretically access them.
+		prev := as.pageTables.Map(addr, length, true /* user */, at, physical)
+		inv = inv || prev
+		m.addr += length
+		m.length -= length
+		addr += usermem.Addr(length)
+	}
+
+	return inv
+}
+
+// mapHostFile maps fr from the host file fd at addr with access type at.
+//
+// A dedicated host mapping is first created via as.files (the file underlying
+// fd can change at any time, so these mappings cannot be shared across
+// address spaces; see MapFile), then guest page tables are installed for each
+// resulting host mapping entry.
+func (as *addressSpace) mapHostFile(addr usermem.Addr, fd int, fr platform.FileRange, at usermem.AccessType) error {
+	// Create custom host mappings.
+	ms, err := as.files.CreateMappings(usermem.AddrRange{
+		Start: addr,
+		End:   addr + usermem.Addr(fr.End-fr.Start),
+	}, at, fd, fr.Start)
+	if err != nil {
+		return err
+	}
+
+	inv := false
+	for _, m := range ms {
+		// The host mapped slices are guaranteed to be aligned.
+		//
+		// N.B. mapHost must run for every entry. The previous form,
+		// "inv = inv || as.mapHost(...)", short-circuited once inv was
+		// true and silently skipped installing later mappings.
+		if as.mapHost(addr, m, at) {
+			inv = true
+		}
+		addr += usermem.Addr(m.length)
+	}
+	if inv {
+		// Some previous mapping was replaced; flush dirty vCPUs.
+		as.Invalidate()
+	}
+
+	return nil
+}
+
+// mapFilemem maps fr from the backing filemem instance at addr.
+//
+// The filemem range is mapped into the sentry via MapInternal and each
+// resulting internal mapping is installed in the guest page tables with the
+// requested access type. If precommit is set, pages are touched to force
+// them to be committed up front.
+func (as *addressSpace) mapFilemem(addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error {
+	// TODO: Lock order at the platform level is not sufficiently
+	// well-defined to guarantee that the caller (FileMem.MapInto) is not
+	// holding any locks that FileMem.MapInternal may take.
+
+	// Retrieve mappings for the underlying filemem. Note that the
+	// permissions here are largely irrelevant, since it corresponds to
+	// physical memory for the guest. We enforce the given access type
+	// below, in the guest page tables.
+	bs, err := as.filemem.MapInternal(fr, usermem.AccessType{
+		Read:  true,
+		Write: true,
+	})
+	if err != nil {
+		return err
+	}
+
+	// Save the original range for invalidation.
+	orig := usermem.AddrRange{
+		Start: addr,
+		End:   addr + usermem.Addr(fr.End-fr.Start),
+	}
+
+	inv := false
+	for !bs.IsEmpty() {
+		b := bs.Head()
+		bs = bs.Tail()
+		// Since fr was page-aligned, b should also be page-aligned. We do the
+		// lookup in our host page tables for this translation.
+		s := b.ToSlice()
+		if precommit {
+			for i := 0; i < len(s); i += usermem.PageSize {
+				_ = s[i] // Touch to commit.
+			}
+		}
+		// N.B. mapHost must run for every block. The previous form,
+		// "inv = inv || as.mapHost(...)", short-circuited once inv was
+		// true and silently skipped installing later mappings.
+		if as.mapHost(addr, hostMapEntry{
+			addr:   reflect.ValueOf(&s[0]).Pointer(),
+			length: uintptr(len(s)),
+		}, at) {
+			inv = true
+		}
+		addr += usermem.Addr(len(s))
+	}
+	if inv {
+		// Some previous mapping was replaced; flush dirty vCPUs and
+		// drop any stale host file mappings over this range.
+		as.Invalidate()
+		as.files.DeleteMapping(orig)
+	}
+
+	return nil
+}
+
+// MapFile implements platform.AddressSpace.MapFile.
+//
+// Dispatches to mapFilemem for the backing memory file and mapHostFile for
+// any other host file descriptor.
+func (as *addressSpace) MapFile(addr usermem.Addr, fd int, fr platform.FileRange, at usermem.AccessType, precommit bool) error {
+	// Create an appropriate mapping. If this is filemem, we don't create
+	// custom mappings for each in-application mapping. For files however,
+	// we create distinct mappings for each address space. Unfortunately,
+	// there's not a better way to manage this here. The file underlying
+	// this fd can change at any time, so we can't actually index the file
+	// and share between address spaces. Oh well. It's all referring to the
+	// same physical pages, hopefully we don't run out of address space.
+	if fd != int(as.filemem.File().Fd()) {
+		// N.B. precommit is ignored for host files.
+		return as.mapHostFile(addr, fd, fr, at)
+	}
+
+	return as.mapFilemem(addr, fr, at, precommit)
+}
+
+// Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
+//
+// If any mapping existed, dirty vCPUs are invalidated and any host file
+// mappings over the range are dropped.
+func (as *addressSpace) Unmap(addr usermem.Addr, length uint64) {
+	if prev := as.pageTables.Unmap(addr, uintptr(length)); prev {
+		as.Invalidate()
+		as.files.DeleteMapping(usermem.AddrRange{
+			Start: addr,
+			End:   addr + usermem.Addr(length),
+		})
+	}
+}
+
+// Release releases the page tables.
+//
+// The entire address range is unmapped first so that host mappings and
+// dirty-vCPU state are torn down before the tables themselves are freed.
+func (as *addressSpace) Release() error {
+	as.Unmap(0, ^uint64(0))
+	as.pageTables.Release()
+	return nil
+}
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
new file mode 100644
index 000000000..ecc33d7dd
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -0,0 +1,41 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+ "reflect"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy"
+)
+
+// bluepill enters guest mode.
+//
+// Implemented in assembly (bluepill_amd64.s): it executes CLI, which faults
+// in host mode; the SIGSEGV handler installed below then transitions into
+// the guest via bluepillHandler.
+func bluepill(*vCPU)
+
+// sighandler is the signal entry point.
+//
+// Implemented in assembly; it dispatches kernel-originated faults at the CLI
+// in bluepill to bluepillHandler, and everything else to savedHandler.
+func sighandler()
+
+// savedHandler is a pointer to the previous handler.
+//
+// This is used as the fallback target by sighandler for signals that are not
+// bluepill transitions.
+var savedHandler uintptr
+
+func init() {
+	// Install the handler, saving the original SIGSEGV handler so that
+	// unrelated faults can still be forwarded to it.
+	if err := safecopy.ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil {
+		panic(fmt.Sprintf("Unable to set handler for signal %d: %v", syscall.SIGSEGV, err))
+	}
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go
new file mode 100644
index 000000000..a2baefb7d
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.go
@@ -0,0 +1,143 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+)
+
+var (
+	// bounceSignal is the signal used for bouncing KVM.
+	//
+	// We use SIGCHLD because it is not masked by the runtime, and
+	// it will be ignored properly by other parts of the kernel.
+	bounceSignal = syscall.SIGCHLD
+
+	// bounceSignalMask has only bounceSignal set.
+	//
+	// Used with rt_sigtimedwait in bluepillHandler to consume the pending
+	// bounce signal after KVM_RUN returns EINTR.
+	bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1))
+
+	// bounce is the interrupt vector used to return to the kernel.
+	bounce = uint32(ring0.VirtualizationException)
+)
+
+// redpill on amd64 invokes a syscall with -1.
+//
+// In guest mode this syscall is intercepted and causes a transition back to
+// host mode; see bluepillSyscall, which recognizes the -1 value.
+//
+//go:nosplit
+func redpill() {
+	syscall.RawSyscall(^uintptr(0), 0, 0, 0)
+}
+
+// bluepillArchEnter is called during bluepillEnter.
+//
+// It recovers the vCPU pointer from RAX (placed there by the bluepill
+// assembly stub), copies the faulting signal context into the vCPU's
+// register state, and sanitizes flags and segment registers for kernel
+// (ring0) execution in the guest.
+//
+//go:nosplit
+func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) {
+	c = vCPUPtr(uintptr(context.Rax))
+	regs := c.CPU.Registers()
+	regs.R8 = context.R8
+	regs.R9 = context.R9
+	regs.R10 = context.R10
+	regs.R11 = context.R11
+	regs.R12 = context.R12
+	regs.R13 = context.R13
+	regs.R14 = context.R14
+	regs.R15 = context.R15
+	regs.Rdi = context.Rdi
+	regs.Rsi = context.Rsi
+	regs.Rbp = context.Rbp
+	regs.Rbx = context.Rbx
+	regs.Rdx = context.Rdx
+	regs.Rax = context.Rax
+	regs.Rcx = context.Rcx
+	regs.Rsp = context.Rsp
+	regs.Rip = context.Rip
+	regs.Eflags = context.Eflags
+	regs.Eflags &^= uint64(ring0.KernelFlagsClear)
+	regs.Eflags |= ring0.KernelFlagsSet
+	regs.Cs = uint64(ring0.Kcode)
+	regs.Ds = uint64(ring0.Udata)
+	regs.Es = uint64(ring0.Udata)
+	regs.Fs = uint64(ring0.Udata)
+	regs.Ss = uint64(ring0.Kdata)
+
+	// ring0 uses GS exclusively, so we use GS_base to store the location
+	// of the floating point address.
+	//
+	// The address will be restored directly after running the VCPU, and
+	// will be saved again prior to halting. We rely on the fact that the
+	// SaveFloatingPoint/LoadFloatingPoint functions use the most
+	// efficient mechanism available (including compression) so the state
+	// size is guaranteed to be less than what's pointed to here.
+	regs.Gs_base = uint64(context.Fpstate)
+	return
+}
+
+// bluepillSyscall handles kernel syscalls.
+//
+// Only the redpill syscall (RAX == -1) is expected; any other syscall is
+// rewound so it is re-executed after returning to host mode. The floating
+// point state (pointed to by Gs_base; see bluepillArchEnter) is saved
+// across the halt and restored on re-entry.
+//
+//go:nosplit
+func bluepillSyscall() {
+	regs := ring0.Current().Registers()
+	if regs.Rax != ^uint64(0) {
+		regs.Rip -= 2 // Rewind the 2-byte SYSCALL instruction.
+	}
+	ring0.SaveFloatingPoint(bytePtr(uintptr(regs.Gs_base)))
+	ring0.Halt()
+	ring0.LoadFloatingPoint(bytePtr(uintptr(regs.Gs_base)))
+}
+
+// bluepillException handles kernel exceptions.
+//
+// Like bluepillSyscall, it halts back to host mode, preserving floating
+// point state around the transition.
+//
+//go:nosplit
+func bluepillException(vector ring0.Vector) {
+	regs := ring0.Current().Registers()
+	if vector == ring0.Vector(bounce) {
+		// These should not interrupt kernel execution; point the Rip
+		// to zero to ensure that we get a reasonable panic when we
+		// attempt to return.
+		regs.Rip = 0
+	}
+	ring0.SaveFloatingPoint(bytePtr(uintptr(regs.Gs_base)))
+	ring0.Halt()
+	ring0.LoadFloatingPoint(bytePtr(uintptr(regs.Gs_base)))
+}
+
+// bluepillArchExit is called when the guest halts; see bluepillHandler.
+//
+// It copies the vCPU's register state back into the interrupted signal
+// context so that sigreturn resumes host execution with the guest's final
+// registers. (The original comment said "during bluepillEnter", which was a
+// copy-paste of the bluepillArchEnter header.)
+//
+//go:nosplit
+func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
+	regs := c.CPU.Registers()
+	context.R8 = regs.R8
+	context.R9 = regs.R9
+	context.R10 = regs.R10
+	context.R11 = regs.R11
+	context.R12 = regs.R12
+	context.R13 = regs.R13
+	context.R14 = regs.R14
+	context.R15 = regs.R15
+	context.Rdi = regs.Rdi
+	context.Rsi = regs.Rsi
+	context.Rbp = regs.Rbp
+	context.Rbx = regs.Rbx
+	context.Rdx = regs.Rdx
+	context.Rax = regs.Rax
+	context.Rcx = regs.Rcx
+	context.Rsp = regs.Rsp
+	context.Rip = regs.Rip
+	context.Eflags = regs.Eflags
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.s b/pkg/sentry/platform/kvm/bluepill_amd64.s
new file mode 100644
index 000000000..0881bd5f5
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.s
@@ -0,0 +1,87 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// VCPU_CPU is the location of the CPU in the vCPU struct.
+//
+// This is guaranteed to be zero.
+#define VCPU_CPU 0x0
+
+// CPU_SELF is the self reference in ring0's percpu.
+//
+// This is guaranteed to be zero.
+#define CPU_SELF 0x0
+
+// Context offsets.
+//
+// Only limited use of the context is done in the assembly stub below, most is
+// done in the Go handlers. However, the RIP must be examined.
+#define CONTEXT_RAX 0x90
+#define CONTEXT_RIP 0xa8
+#define CONTEXT_FP 0xe0
+
+// CLI is the literal byte for the disable interrupts instruction.
+//
+// This is checked as the source of the fault.
+#define CLI $0xfa
+
+// See bluepill.go.
+//
+// Loads the vCPU argument into AX (read by bluepillArchEnter via the faulting
+// context) and executes CLI, which faults to sighandler. If execution resumes
+// on the wrong vCPU, redpill back to host mode and retry.
+TEXT ·bluepill(SB),NOSPLIT,$0
+begin:
+	MOVQ vcpu+0(FP), AX
+	LEAQ VCPU_CPU(AX), BX
+	BYTE CLI;
+check_vcpu:
+	// Compare the requested vCPU's CPU against the per-CPU self pointer.
+	MOVQ CPU_SELF(GS), CX
+	CMPQ BX, CX
+	JE right_vCPU
+wrong_vcpu:
+	// Exit the (wrong) guest and try the transition again.
+	CALL ·redpill(SB)
+	JMP begin
+right_vCPU:
+	RET
+
+// sighandler: see bluepill.go for documentation.
+//
+// The arguments are the following:
+//
+//	DI - The signal number.
+//	SI - Pointer to siginfo_t structure.
+//	DX - Pointer to ucontext structure.
+//
+TEXT ·sighandler(SB),NOSPLIT,$0
+	// Check if the signal is from the kernel (si_code at SI+0x8 == 0x80,
+	// i.e. SI_KERNEL); user-generated signals fall through.
+	MOVQ $0x80, CX
+	CMPL CX, 0x8(SI)
+	JNE fallback
+
+	// Check if RIP is disable interrupts, i.e. the fault came from the
+	// CLI byte in bluepill. Anything else is an unrelated SIGSEGV.
+	MOVQ CONTEXT_RIP(DX), CX
+	CMPQ CX, $0x0
+	JE fallback
+	CMPB 0(CX), CLI
+	JNE fallback
+
+	// Call the bluepillHandler.
+	PUSHQ DX                  // First argument (context).
+	CALL ·bluepillHandler(SB) // Call the handler.
+	POPQ DX                   // Discard the argument.
+	RET
+
+fallback:
+	// Jump to the previous signal handler.
+	XORQ CX, CX
+	MOVQ ·savedHandler(SB), AX
+	JMP AX
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
new file mode 100644
index 000000000..61ca61dcb
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -0,0 +1,28 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+)
+
+// bluepillArchContext returns the arch-specific context.
+//
+// context points to the raw ucontext passed to the signal handler; the
+// machine context (general registers) is extracted from it.
+func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
+	return &((*arch.UContext64)(context).MContext)
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go
new file mode 100644
index 000000000..7c8c7bc37
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_fault.go
@@ -0,0 +1,127 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+const (
+	// faultBlockSize is the size used for servicing memory faults.
+	//
+	// This should be large enough to avoid frequent faults and avoid using
+	// all available KVM slots (~512), but small enough that KVM does not
+	// complain about slot sizes (~4GB). See handleBluepillFault for how
+	// this block is used.
+	faultBlockSize = 2 << 30 // i.e. 2GB.
+
+	// faultBlockMask is the mask for the fault blocks.
+	//
+	// This must be typed to avoid overflow complaints (ugh).
+	faultBlockMask = ^uintptr(faultBlockSize - 1)
+)
+
+// yield yields the CPU.
+//
+// Raw syscall only: this is called from nosplit signal-handler context
+// (see handleBluepillFault).
+//
+//go:nosplit
+func yield() {
+	syscall.RawSyscall(syscall.SYS_SCHED_YIELD, 0, 0, 0)
+}
+
+// calculateBluepillFault calculates the fault address range.
+//
+// Given a faulting physical address, it finds the containing physical
+// region and returns the faultBlockSize-aligned block around the fault,
+// clamped to the region boundaries, along with the corresponding host
+// virtual start. ok is false if physical lies in no known region.
+//
+//go:nosplit
+func calculateBluepillFault(m *machine, physical uintptr) (virtualStart, physicalStart, length uintptr, ok bool) {
+	alignedPhysical := physical &^ uintptr(usermem.PageSize-1)
+	for _, pr := range physicalRegions {
+		end := pr.physical + pr.length
+		if physical < pr.physical || physical >= end {
+			continue
+		}
+
+		// Adjust the block to match our size.
+		physicalStart = alignedPhysical & faultBlockMask
+		if physicalStart < pr.physical {
+			// Bound the starting point to the start of the region.
+			physicalStart = pr.physical
+		}
+		virtualStart = pr.virtual + (physicalStart - pr.physical)
+		physicalEnd := physicalStart + faultBlockSize
+		if physicalEnd > end {
+			// Bound the end to the end of the region.
+			physicalEnd = end
+		}
+		length = physicalEnd - physicalStart
+		return virtualStart, physicalStart, length, true
+	}
+
+	return 0, 0, 0, false
+}
+
+// handleBluepillFault handles a physical fault.
+//
+// The corresponding virtual address is returned. This may throw on error.
+//
+// The nextSlot field acts as a spinlock: a vCPU claims the exclusive right
+// to add a memory region by swapping in ^uint32(0), and releases it by
+// storing the next (on success) or same (on failure) slot number.
+//
+//go:nosplit
+func handleBluepillFault(m *machine, physical uintptr) (uintptr, bool) {
+	// Paging fault: we need to map the underlying physical pages for this
+	// fault. This all has to be done in this function because we're in a
+	// signal handler context. (We can't call any functions that might
+	// split the stack.)
+	virtualStart, physicalStart, length, ok := calculateBluepillFault(m, physical)
+	if !ok {
+		return 0, false
+	}
+
+	// Set the KVM slot.
+	//
+	// First, we need to acquire the exclusive right to set a slot. See
+	// machine.nextSlot for information about the protocol.
+	slot := atomic.SwapUint32(&m.nextSlot, ^uint32(0))
+	for slot == ^uint32(0) {
+		yield() // Race with another call.
+		slot = atomic.SwapUint32(&m.nextSlot, ^uint32(0))
+	}
+	errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart)
+	if errno == 0 {
+		// Successfully added region; we can increment nextSlot and
+		// allow another set to proceed here.
+		atomic.StoreUint32(&m.nextSlot, slot+1)
+		return virtualStart + (physical - physicalStart), true
+	}
+
+	// Release our slot (still available).
+	atomic.StoreUint32(&m.nextSlot, slot)
+
+	switch errno {
+	case syscall.EEXIST:
+		// The region already exists. It's possible that we raced with
+		// another vCPU here. We just revert nextSlot and return true,
+		// because this must have been satisfied by some other vCPU.
+		return virtualStart + (physical - physicalStart), true
+	case syscall.EINVAL:
+		// Message punctuation made consistent with the siblings below
+		// (was "failed; out of slots").
+		throw("set memory region failed: out of slots")
+	case syscall.ENOMEM:
+		throw("set memory region failed: out of memory")
+	case syscall.EFAULT:
+		throw("set memory region failed: invalid physical range")
+	default:
+		throw("set memory region failed: unknown reason")
+	}
+
+	panic("unreachable")
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
new file mode 100644
index 000000000..85703ff18
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -0,0 +1,175 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "sync/atomic"
+ "syscall"
+ "unsafe"
+)
+
+// throw aborts the program; linked to the runtime so it is safe to call
+// from nosplit/signal-handler context (unlike panic).
+//
+//go:linkname throw runtime.throw
+func throw(string)
+
+// vCPUPtr returns a CPU for the given address.
+//
+//go:nosplit
+func vCPUPtr(addr uintptr) *vCPU {
+	return (*vCPU)(unsafe.Pointer(addr))
+}
+
+// bytePtr returns a bytePtr for the given address.
+//
+//go:nosplit
+func bytePtr(addr uintptr) *byte {
+	return (*byte)(unsafe.Pointer(addr))
+}
+
+// bluepillHandler is called from the signal stub.
+//
+// The world may be stopped while this is executing, and it executes on the
+// signal stack. It should only execute raw system calls and functions that are
+// explicitly marked go:nosplit.
+//
+// It drives KVM_RUN in a loop, servicing exits (MMIO faults, interrupt
+// windows) until the guest halts, at which point registers are copied back
+// into the signal context and the handler returns.
+//
+//go:nosplit
+func bluepillHandler(context unsafe.Pointer) {
+	// Sanitize the registers; interrupts must always be disabled.
+	c := bluepillArchEnter(bluepillArchContext(context))
+
+	// Increment the number of switches.
+	atomic.AddUint32(&c.switches, 1)
+
+	// Store vCPUGuest.
+	//
+	// This is fine even if we're not in guest mode yet. In this signal
+	// handler, we'll already have all the relevant signals blocked, so an
+	// interrupt is only deliverable when we actually execute the KVM_RUN.
+	//
+	// The state will be returned to vCPUReady by Phase2.
+	if state := atomic.SwapUintptr(&c.state, vCPUGuest); state != vCPUReady {
+		throw("vCPU not in ready state")
+	}
+
+	for {
+		_, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(c.fd), _KVM_RUN, 0)
+		if errno == syscall.EINTR {
+			// First, we process whatever pending signal
+			// interrupted KVM. Since we're in a signal handler
+			// currently, all signals are masked and the signal
+			// must have been delivered directly to this thread.
+			sig, _, errno := syscall.RawSyscall6(
+				syscall.SYS_RT_SIGTIMEDWAIT,
+				uintptr(unsafe.Pointer(&bounceSignalMask)),
+				0, // siginfo.
+				0, // timeout.
+				8, // sigset size.
+				0, 0)
+			if errno != 0 {
+				throw("error waiting for pending signal")
+			}
+			if sig != uintptr(bounceSignal) {
+				throw("unexpected signal")
+			}
+
+			// Check whether the current state of the vCPU is ready
+			// for interrupt injection. Because we don't have a
+			// PIC, we can't inject an interrupt while they are
+			// masked. We need to request a window if it's not
+			// ready.
+			if c.runData.readyForInterruptInjection == 0 {
+				c.runData.requestInterruptWindow = 1
+				continue // Rerun vCPU.
+			} else {
+				// Force injection below; the vCPU is ready.
+				c.runData.exitReason = _KVM_EXIT_IRQ_WINDOW_OPEN
+			}
+		} else if errno != 0 {
+			throw("run failed")
+		}
+
+		switch c.runData.exitReason {
+		case _KVM_EXIT_EXCEPTION:
+			throw("exception")
+		case _KVM_EXIT_IO:
+			throw("I/O")
+		case _KVM_EXIT_INTERNAL_ERROR:
+			throw("internal error")
+		case _KVM_EXIT_HYPERCALL:
+			throw("hypercall")
+		case _KVM_EXIT_DEBUG:
+			throw("debug")
+		case _KVM_EXIT_HLT:
+			// The guest halted: this is the normal exit path.
+			// Copy out registers.
+			bluepillArchExit(c, bluepillArchContext(context))
+
+			// Notify any waiters.
+			switch state := atomic.SwapUintptr(&c.state, vCPUReady); state {
+			case vCPUGuest:
+			case vCPUWaiter:
+				c.notify() // Safe from handler.
+			default:
+				throw("invalid state")
+			}
+			return
+		case _KVM_EXIT_MMIO:
+			// Increment the fault count.
+			atomic.AddUint32(&c.faults, 1)
+
+			// For MMIO, the physical address is the first data item.
+			virtual, ok := handleBluepillFault(c.machine, uintptr(c.runData.data[0]))
+			if !ok {
+				throw("physical address not valid")
+			}
+
+			// We now need to fill in the data appropriately. KVM
+			// expects us to provide the result of the given MMIO
+			// operation in the runData struct. This is safe
+			// because, if a fault occurs here, the same fault
+			// would have occurred in guest mode. The kernel should
+			// not create invalid page table mappings.
+			data := (*[8]byte)(unsafe.Pointer(&c.runData.data[1]))
+			length := (uintptr)((uint32)(c.runData.data[2]))
+			write := (uint8)((c.runData.data[2] >> 32 & 0xff)) != 0
+			for i := uintptr(0); i < length; i++ {
+				b := bytePtr(uintptr(virtual) + i)
+				if write {
+					// Write to the given address.
+					*b = data[i]
+				} else {
+					// Read from the given address.
+					data[i] = *b
+				}
+			}
+		case _KVM_EXIT_IRQ_WINDOW_OPEN:
+			// Interrupt: we must have requested an interrupt
+			// window; set the interrupt line.
+			if _, _, errno := syscall.RawSyscall(
+				syscall.SYS_IOCTL,
+				uintptr(c.fd),
+				_KVM_INTERRUPT,
+				uintptr(unsafe.Pointer(&bounce))); errno != 0 {
+				throw("interrupt injection failed")
+			}
+			// Clear previous injection request.
+			c.runData.requestInterruptWindow = 0
+		case _KVM_EXIT_SHUTDOWN:
+			throw("shutdown")
+		case _KVM_EXIT_FAIL_ENTRY:
+			throw("entry failed")
+		default:
+			throw("unknown failure")
+		}
+	}
+}
diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go
new file mode 100644
index 000000000..fd04a2c47
--- /dev/null
+++ b/pkg/sentry/platform/kvm/context.go
@@ -0,0 +1,81 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/interrupt"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// context is an implementation of the platform context.
+//
+// This is a thin wrapper around the machine.
+type context struct {
+	// machine is the parent machine, and is immutable.
+	machine *machine
+
+	// interrupt is the interrupt context.
+	//
+	// Used to forward Interrupt() calls to whichever vCPU is currently
+	// executing on behalf of this context; see Switch.
+	interrupt interrupt.Forwarder
+}
+
+// Switch runs the provided context in the given address space.
+//
+// It acquires a vCPU from the machine, arranges for interrupt forwarding,
+// marks the address space dirty on that vCPU (forcing a TLB flush if this is
+// the first entry since an invalidation), and enters the guest. Returns the
+// resulting signal info and access type, or platform.ErrContextInterrupt if
+// preempted before entry.
+func (c *context) Switch(as platform.AddressSpace, ac arch.Context, _ int32) (*arch.SignalInfo, usermem.AccessType, error) {
+	// Extract data.
+	localAS := as.(*addressSpace)
+	regs := &ac.StateData().Regs
+	fp := (*byte)(ac.FloatingPointData())
+
+	// Grab a vCPU.
+	cpu, err := c.machine.Get()
+	if err != nil {
+		return nil, usermem.NoAccess, err
+	}
+
+	// Enable interrupts (i.e. calls to vCPU.Notify).
+	if !c.interrupt.Enable(cpu) {
+		c.machine.Put(cpu) // Already preempted.
+		return nil, usermem.NoAccess, platform.ErrContextInterrupt
+	}
+
+	// Mark the address space as dirty.
+	flags := ring0.Flags(0)
+	dirty := localAS.Touch(cpu)
+	if v := atomic.SwapUint32(dirty, 1); v == 0 {
+		// First entry since invalidation: flush on entry.
+		flags |= ring0.FlagFlush
+	}
+	if ac.FullRestore() {
+		flags |= ring0.FlagFull
+	}
+
+	// Take the blue pill.
+	si, at, err := cpu.SwitchToUser(regs, fp, localAS.pageTables, flags)
+
+	// Release resources.
+	c.machine.Put(cpu)
+
+	// All done.
+	c.interrupt.Disable()
+	return si, at, err
+}
+
+// Interrupt interrupts the running context.
+//
+// The forwarder bounces the vCPU registered in Switch, if any.
+func (c *context) Interrupt() {
+	c.interrupt.NotifyInterrupt()
+}
diff --git a/pkg/sentry/platform/kvm/host_map.go b/pkg/sentry/platform/kvm/host_map.go
new file mode 100644
index 000000000..357f8c92e
--- /dev/null
+++ b/pkg/sentry/platform/kvm/host_map.go
@@ -0,0 +1,168 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+type hostMap struct {
+ // mu protects below.
+ mu sync.RWMutex
+
+ // set contains host mappings.
+ set hostMapSet
+}
+
+type hostMapEntry struct {
+ addr uintptr
+ length uintptr
+}
+
+// forEachEntry iterates over all existing segments overlapping the range
+// r, invoking fn for each. The offset passed to fn is the offset of the
+// segment's start relative to r.Start (zero when the segment begins
+// before r), and the entry's length is clipped to lie within r.
+//
+// NOTE(review): when a segment begins before r.Start, the entry's addr is
+// still the segment's base host address (not advanced by the clipped
+// head); callers appear to rely on ranges being segment-aligned — confirm.
+func (hm *hostMap) forEachEntry(r usermem.AddrRange, fn func(offset uint64, m hostMapEntry)) {
+	for seg := hm.set.FindSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() {
+		length := uintptr(seg.Range().Length())
+		segOffset := uint64(0) // Adjusted below.
+		if seg.End() > r.End {
+			// Clip the tail to r.End.
+			length -= uintptr(seg.End() - r.End)
+		}
+		if seg.Start() < r.Start {
+			// Clip the head to r.Start.
+			length -= uintptr(r.Start - seg.Start())
+		} else {
+			segOffset = uint64(seg.Start() - r.Start)
+		}
+		fn(segOffset, hostMapEntry{
+			addr:   seg.Value(), // Host virtual base of the segment.
+			length: length,
+		})
+	}
+}
+
+// createMappings maps the given file range into the host address space,
+// re-protecting existing mappings in place and filling gaps with fresh
+// mmaps, then returns the resulting set of host map entries covering r.
+//
+// Precondition: hm.mu must be held.
+func (hm *hostMap) createMappings(r usermem.AddrRange, at usermem.AccessType, fd int, offset uint64) (ms []hostMapEntry, err error) {
+	// Replace any existing mappings in place via MAP_FIXED. Note that
+	// the named return err is captured by the closure; only the first
+	// failing errno is recorded.
+	hm.forEachEntry(r, func(segOffset uint64, m hostMapEntry) {
+		_, _, errno := syscall.RawSyscall6(
+			syscall.SYS_MMAP,
+			m.addr,
+			m.length,
+			uintptr(at.Prot()),
+			syscall.MAP_FIXED|syscall.MAP_SHARED,
+			uintptr(fd),
+			uintptr(offset+segOffset))
+		if errno != 0 && err == nil {
+			err = errno
+		}
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	// Add in necessary new mappings.
+	for gap := hm.set.FindGap(r.Start); gap.Ok() && gap.Start() < r.End; {
+		length := uintptr(gap.Range().Length())
+		gapOffset := uint64(0) // Adjusted below.
+		if gap.End() > r.End {
+			// Clip the tail to r.End.
+			length -= uintptr(gap.End() - r.End)
+		}
+		if gap.Start() < r.Start {
+			// Clip the head to r.Start.
+			length -= uintptr(r.Start - gap.Start())
+		} else {
+			gapOffset = uint64(gap.Start() - r.Start)
+		}
+
+		// Map the host file memory; the kernel chooses the address
+		// (first argument is zero, no MAP_FIXED).
+		hostAddr, _, errno := syscall.RawSyscall6(
+			syscall.SYS_MMAP,
+			0,
+			length,
+			uintptr(at.Prot()),
+			syscall.MAP_SHARED,
+			uintptr(fd),
+			uintptr(offset+gapOffset))
+		if errno != 0 {
+			return nil, errno
+		}
+
+		// Insert into the host set and move to the next gap.
+		gap = hm.set.Insert(gap, gap.Range().Intersect(r), hostAddr).NextGap()
+	}
+
+	// Collect all slices.
+	hm.forEachEntry(r, func(_ uint64, m hostMapEntry) {
+		ms = append(ms, m)
+	})
+
+	return ms, nil
+}
+
+// CreateMappings creates a new set of host mapping entries.
+//
+// It is the locked wrapper around createMappings.
+func (hm *hostMap) CreateMappings(r usermem.AddrRange, at usermem.AccessType, fd int, offset uint64) (ms []hostMapEntry, err error) {
+	hm.mu.Lock()
+	defer hm.mu.Unlock()
+	return hm.createMappings(r, at, fd, offset)
+}
+
+// deleteMapping unmaps and removes all host mapping entries overlapping r.
+//
+// Precondition: hm.mu must be held.
+func (hm *hostMap) deleteMapping(r usermem.AddrRange) {
+	// Remove all the existing mappings.
+	hm.forEachEntry(r, func(_ uint64, m hostMapEntry) {
+		_, _, errno := syscall.RawSyscall(
+			syscall.SYS_MUNMAP,
+			m.addr,
+			m.length,
+			0)
+		if errno != 0 {
+			// Should never happen: we are unmapping a range we own.
+			panic(fmt.Sprintf("unmap error: %v", errno))
+		}
+	})
+
+	// Knock the range out of the segment set.
+	hm.set.RemoveRange(r)
+}
+
+// DeleteMapping deletes the given range.
+//
+// It is the locked wrapper around deleteMapping.
+func (hm *hostMap) DeleteMapping(r usermem.AddrRange) {
+	hm.mu.Lock()
+	defer hm.mu.Unlock()
+	hm.deleteMapping(r)
+}
+
+// hostMapSetFunctions is used in the implementation of mapSet.
+type hostMapSetFunctions struct{}
+
+// MinKey returns the lowest possible key (address zero).
+func (hostMapSetFunctions) MinKey() usermem.Addr { return 0 }
+
+// MaxKey returns the highest possible key.
+func (hostMapSetFunctions) MaxKey() usermem.Addr { return ^usermem.Addr(0) }
+
+// ClearValue resets a host address value.
+func (hostMapSetFunctions) ClearValue(val *uintptr) { *val = 0 }
+
+// Merge joins two adjacent segments iff their host virtual addresses are
+// also contiguous (the key ranges are adjacent by construction).
+func (hostMapSetFunctions) Merge(r1 usermem.AddrRange, addr1 uintptr, r2 usermem.AddrRange, addr2 uintptr) (uintptr, bool) {
+	if addr1+uintptr(r1.Length()) != addr2 {
+		return 0, false
+	}
+
+	// Since the two regions are contiguous in both the key space and the
+	// value space, we can just store a single segment with the first host
+	// virtual address; the logic above operates based on the size of the
+	// segments.
+	return addr1, true
+}
+
+// Split divides a segment's host address at the given key, returning the
+// host addresses of the two halves.
+func (hostMapSetFunctions) Split(r usermem.AddrRange, hostAddr uintptr, split usermem.Addr) (uintptr, uintptr) {
+	return hostAddr, hostAddr + uintptr(split-r.Start)
+}
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
new file mode 100644
index 000000000..31928c9f0
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -0,0 +1,149 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kvm provides a kvm-based implementation of the platform interface.
+package kvm
+
+import (
+ "fmt"
+ "runtime"
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/cpuid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// KVM represents a lightweight VM context.
+type KVM struct {
+ platform.NoCPUPreemptionDetection
+
+ // filemem is our memory source.
+ *filemem.FileMem
+
+ // machine is the backing VM.
+ machine *machine
+}
+
+var (
+ globalOnce sync.Once
+ globalErr error
+)
+
+// New returns a new KVM-based implementation of the platform interface.
+//
+// It allocates backing memory, opens /dev/kvm, performs one-time global
+// initialization, creates a VM fd, and wraps it in a machine with one
+// vCPU per host CPU.
+func New() (*KVM, error) {
+	// Allocate physical memory for the vCPUs.
+	//
+	// NOTE(review): fm is not destroyed on the error paths below;
+	// consider adding cleanup.
+	fm, err := filemem.New("kvm-memory")
+	if err != nil {
+		return nil, err
+	}
+
+	// Try opening KVM.
+	fd, err := syscall.Open("/dev/kvm", syscall.O_RDWR, 0)
+	if err != nil {
+		return nil, fmt.Errorf("opening /dev/kvm: %v", err)
+	}
+	defer syscall.Close(fd)
+
+	// Ensure global initialization is done.
+	globalOnce.Do(func() {
+		physicalInit()
+		globalErr = updateSystemValues(fd)
+		ring0.Init(cpuid.HostFeatureSet())
+	})
+	if globalErr != nil {
+		// Fix: previously this returned err, which is always nil here
+		// (the Open above succeeded), silently masking initialization
+		// failures. Return the actual initialization error.
+		return nil, globalErr
+	}
+
+	// Create a new VM fd.
+	vm, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_CREATE_VM, 0)
+	if errno != 0 {
+		return nil, fmt.Errorf("creating VM: %v", errno)
+	}
+
+	// Create a VM context; the machine takes over the vm fd.
+	machine, err := newMachine(int(vm), runtime.NumCPU())
+	if err != nil {
+		return nil, err
+	}
+
+	// All set.
+	return &KVM{
+		FileMem: fm,
+		machine: machine,
+	}, nil
+}
+
+// SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO.
+//
+// The KVM platform does not provide address-space-based I/O.
+func (*KVM) SupportsAddressSpaceIO() bool {
+	return false
+}
+
+// CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace.
+func (*KVM) CooperativelySchedulesAddressSpace() bool {
+	return false
+}
+
+// MapUnit implements platform.Platform.MapUnit.
+func (*KVM) MapUnit() uint64 {
+	// We greedily create PTEs in MapFile, so extremely large mappings can
+	// be expensive. They are not _that_ expensive, since we allow super
+	// pages, but things can still get out of hand for multi-terabyte
+	// mappings. For this reason, we limit mappings to an arbitrary 16MB.
+	return 16 << 20
+}
+
+// MinUserAddress returns the lowest available address.
+func (*KVM) MinUserAddress() usermem.Addr {
+	return usermem.PageSize
+}
+
+// MaxUserAddress returns the first address that may not be used.
+func (*KVM) MaxUserAddress() usermem.Addr {
+	return usermem.Addr(ring0.MaximumUserAddress)
+}
+
+// NewAddressSpace returns a new pagetable root.
+//
+// The invalidator argument is ignored and the returned channel is nil;
+// invalidation is handled via per-vCPU dirty tracking (see
+// context.Switch).
+func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) {
+	// Allocate page tables and install system mappings.
+	pageTables := k.machine.kernel.PageTables.New()
+	applyPhysicalRegions(func(pr physicalRegion) bool {
+		// Map the kernel in the upper half.
+		kernelVirtual := usermem.Addr(ring0.KernelStartAddress | pr.virtual)
+		pageTables.Map(kernelVirtual, pr.length, false /* kernel */, usermem.AnyAccess, pr.physical)
+		return true // Keep iterating.
+	})
+
+	// Return the new address space.
+	return &addressSpace{
+		filemem:    k.FileMem,
+		machine:    k.machine,
+		pageTables: pageTables,
+	}, nil, nil
+}
+
+// NewContext returns an interruptible context.
+//
+// The context is a thin wrapper around the machine; its interrupt
+// forwarder is initially disabled.
+func (k *KVM) NewContext() platform.Context {
+	return &context{
+		machine: k.machine,
+	}
+}
+
+// Memory returns the platform memory used to do allocations.
+func (k *KVM) Memory() platform.Memory {
+	return k.FileMem
+}
diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go
new file mode 100644
index 000000000..3d56ed895
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_amd64.go
@@ -0,0 +1,213 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+)
+
+// userMemoryRegion is a region of physical memory.
+//
+// This mirrors kvm_memory_region.
+type userMemoryRegion struct {
+ slot uint32
+ flags uint32
+ guestPhysAddr uint64
+ memorySize uint64
+ userspaceAddr uint64
+}
+
+// userRegs represents KVM user registers.
+//
+// This mirrors kvm_regs.
+type userRegs struct {
+ RAX uint64
+ RBX uint64
+ RCX uint64
+ RDX uint64
+ RSI uint64
+ RDI uint64
+ RSP uint64
+ RBP uint64
+ R8 uint64
+ R9 uint64
+ R10 uint64
+ R11 uint64
+ R12 uint64
+ R13 uint64
+ R14 uint64
+ R15 uint64
+ RIP uint64
+ RFLAGS uint64
+}
+
+// systemRegs represents KVM system registers.
+//
+// This mirrors kvm_sregs.
+type systemRegs struct {
+ CS segment
+ DS segment
+ ES segment
+ FS segment
+ GS segment
+ SS segment
+ TR segment
+ LDT segment
+ GDT descriptor
+ IDT descriptor
+ CR0 uint64
+ CR2 uint64
+ CR3 uint64
+ CR4 uint64
+ CR8 uint64
+ EFER uint64
+ apicBase uint64
+ interruptBitmap [(_KVM_NR_INTERRUPTS + 63) / 64]uint64
+}
+
+// segment is the expanded form of a segment register.
+//
+// This mirrors kvm_segment.
+type segment struct {
+ base uint64
+ limit uint32
+ selector uint16
+ typ uint8
+ present uint8
+ DPL uint8
+ DB uint8
+ S uint8
+ L uint8
+ G uint8
+ AVL uint8
+ unusable uint8
+ _ uint8
+}
+
+// Clear clears the segment and marks it unusable.
+func (s *segment) Clear() {
+ *s = segment{unusable: 1}
+}
+
+// selector is a segment selector.
+type selector uint16
+
+// tobool is a simple helper: it maps any non-zero flag value to 1 and
+// zero to 0.
+func tobool(x ring0.SegmentDescriptorFlags) uint8 {
+	if x == 0 {
+		return 0
+	}
+	return 1
+}
+
+// Load loads the segment described by d into the segment s.
+//
+// The argument sel is recorded as the segment selector index. Descriptors
+// that are not marked present are simply cleared (marked unusable).
+func (s *segment) Load(d *ring0.SegmentDescriptor, sel ring0.Selector) {
+	flag := d.Flags()
+	if flag&ring0.SegmentDescriptorPresent == 0 {
+		s.Clear()
+		return
+	}
+	s.base = uint64(d.Base())
+	s.limit = d.Limit()
+	// Bits 8-11 of the flags hold the segment type; the low bit is
+	// forced on (presumably the accessed bit — confirm against the SDM).
+	s.typ = uint8((flag>>8)&0xF) | 1
+	s.S = tobool(flag & ring0.SegmentDescriptorSystem)
+	s.DPL = uint8(d.DPL())
+	s.present = tobool(flag & ring0.SegmentDescriptorPresent)
+	s.AVL = tobool(flag & ring0.SegmentDescriptorAVL)
+	s.L = tobool(flag & ring0.SegmentDescriptorLong)
+	s.DB = tobool(flag & ring0.SegmentDescriptorDB)
+	s.G = tobool(flag & ring0.SegmentDescriptorG)
+	if s.L != 0 {
+		// Long-mode segment: force the maximum limit.
+		s.limit = 0xffffffff
+	}
+	s.unusable = 0
+	s.selector = uint16(sel)
+}
+
+// descriptor describes a region of physical memory.
+//
+// It corresponds to the pseudo-descriptor used in the x86 LGDT and LIDT
+// instructions, and mirrors kvm_dtable.
+type descriptor struct {
+ base uint64
+ limit uint16
+ _ [3]uint16
+}
+
+// modelControlRegister is an MSR entry.
+//
+// This mirrors kvm_msr_entry.
+type modelControlRegister struct {
+ index uint32
+ _ uint32
+ data uint64
+}
+
+// modelControlRegisers is a collection of MSRs.
+//
+// This mirrors kvm_msrs.
+type modelControlRegisters struct {
+ nmsrs uint32
+ _ uint32
+ entries [16]modelControlRegister
+}
+
+// runData is the run structure. This may be mapped for synchronous register
+// access (although that doesn't appear to be supported by my kernel at least).
+//
+// This mirrors kvm_run.
+type runData struct {
+ requestInterruptWindow uint8
+ _ [7]uint8
+
+ exitReason uint32
+ readyForInterruptInjection uint8
+ ifFlag uint8
+ _ [2]uint8
+
+ cr8 uint64
+ apicBase uint64
+
+ // This is the union data for exits. Interpretation depends entirely on
+ // the exitReason above (see vCPU code for more information).
+ data [32]uint64
+}
+
+// cpuidEntry is a single CPUID entry.
+//
+// This mirrors kvm_cpuid_entry2.
+type cpuidEntry struct {
+ function uint32
+ index uint32
+ flags uint32
+ eax uint32
+ ebx uint32
+ ecx uint32
+ edx uint32
+ _ [3]uint32
+}
+
+// cpuidEntries is a collection of CPUID entries.
+//
+// This mirrors kvm_cpuid2.
+type cpuidEntries struct {
+ nr uint32
+ _ uint32
+ entries [_KVM_NR_CPUID_ENTRIES]cpuidEntry
+}
diff --git a/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go
new file mode 100644
index 000000000..389412d87
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_amd64_unsafe.go
@@ -0,0 +1,93 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "fmt"
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+)
+
+var (
+ runDataSize int
+ hasGuestPCID bool
+ hasGuestINVPCID bool
+ pagetablesOpts pagetables.Opts
+ cpuidSupported = cpuidEntries{nr: _KVM_NR_CPUID_ENTRIES}
+)
+
+// updateSystemValues queries the given KVM fd for system-wide values: the
+// size of the per-vCPU run structure and the supported guest CPUID
+// entries, from which PCID/INVPCID guest support is derived. The results
+// populate the package-level runDataSize, hasGuestPCID, hasGuestINVPCID
+// and pagetablesOpts.
+func updateSystemValues(fd int) error {
+	// Extract the mmap size.
+	sz, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_GET_VCPU_MMAP_SIZE, 0)
+	if errno != 0 {
+		return fmt.Errorf("getting VCPU mmap size: %v", errno)
+	}
+
+	// Save the data.
+	runDataSize = int(sz)
+
+	// Must do the dance to figure out the number of entries: the first
+	// call may fail with ENOMEM (tolerated here, presumably after the
+	// kernel fills in the real entry count — confirm against the KVM API
+	// docs), and the second call must then succeed.
+	_, _, errno = syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(fd),
+		_KVM_GET_SUPPORTED_CPUID,
+		uintptr(unsafe.Pointer(&cpuidSupported)))
+	if errno != 0 && errno != syscall.ENOMEM {
+		// Some other error occurred.
+		return fmt.Errorf("getting supported CPUID: %v", errno)
+	}
+
+	// The number should now be correct.
+	_, _, errno = syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(fd),
+		_KVM_GET_SUPPORTED_CPUID,
+		uintptr(unsafe.Pointer(&cpuidSupported)))
+	if errno != 0 {
+		// Didn't work with the right number.
+		return fmt.Errorf("getting supported CPUID (2nd attempt): %v", errno)
+	}
+
+	// Calculate whether guestPCID is supported.
+	//
+	// FIXME: These should go through the much more pleasant
+	// cpuid package interfaces, once a way to accept raw kvm CPUID entries
+	// is plumbed (or some rough equivalent).
+	for i := 0; i < int(cpuidSupported.nr); i++ {
+		entry := cpuidSupported.entries[i]
+		// CPUID.1:ECX bit 17 is PCID support.
+		if entry.function == 1 && entry.index == 0 && entry.ecx&(1<<17) != 0 {
+			hasGuestPCID = true // Found matching PCID in guest feature set.
+		}
+		// CPUID.7.0:EBX bit 10 is INVPCID support.
+		if entry.function == 7 && entry.index == 0 && entry.ebx&(1<<10) != 0 {
+			hasGuestINVPCID = true // Found matching INVPCID in guest feature set.
+		}
+	}
+
+	// A basic sanity check: ensure that we don't attempt to
+	// invpcid if guest PCIDs are not supported; it's not clear
+	// what the semantics of this would be (or why some CPU or
+	// hypervisor would export this particular combination).
+	hasGuestINVPCID = hasGuestPCID && hasGuestINVPCID
+
+	// Set the pagetables to use PCID if it's available.
+	pagetablesOpts.EnablePCID = hasGuestPCID
+
+	// Success.
+	return nil
+}
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
new file mode 100644
index 000000000..0ec6a4a00
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -0,0 +1,56 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+// KVM ioctls.
+//
+// Only the ioctls we need in Go appear here; some additional ioctls are used
+// within the assembly stubs (KVM_INTERRUPT, etc.). The values mirror the
+// amd64 encodings from the Linux UAPI header (include/uapi/linux/kvm.h).
+const (
+	_KVM_CREATE_VM              = 0xae01
+	_KVM_GET_VCPU_MMAP_SIZE     = 0xae04
+	_KVM_CREATE_VCPU            = 0xae41
+	_KVM_SET_TSS_ADDR           = 0xae47
+	_KVM_RUN                    = 0xae80
+	_KVM_INTERRUPT              = 0x4004ae86
+	_KVM_SET_MSRS               = 0x4008ae89
+	_KVM_SET_USER_MEMORY_REGION = 0x4020ae46
+	_KVM_SET_REGS               = 0x4090ae82
+	_KVM_SET_SREGS              = 0x4138ae84
+	_KVM_GET_SUPPORTED_CPUID    = 0xc008ae05
+	_KVM_SET_CPUID2             = 0x4008ae90
+	_KVM_SET_SIGNAL_MASK        = 0x4004ae8b
+)
+
+// KVM exit reasons, as reported in runData.exitReason after KVM_RUN.
+const (
+	_KVM_EXIT_EXCEPTION       = 0x1
+	_KVM_EXIT_IO              = 0x2
+	_KVM_EXIT_HYPERCALL       = 0x3
+	_KVM_EXIT_DEBUG           = 0x4
+	_KVM_EXIT_HLT             = 0x5
+	_KVM_EXIT_MMIO            = 0x6
+	_KVM_EXIT_IRQ_WINDOW_OPEN = 0x7
+	_KVM_EXIT_SHUTDOWN        = 0x8
+	_KVM_EXIT_FAIL_ENTRY      = 0x9
+	_KVM_EXIT_INTERNAL_ERROR  = 0x11
+)
+
+// KVM limits (used for array sizing in the mirrored structures).
+const (
+	_KVM_NR_VCPUS         = 0x100
+	_KVM_NR_INTERRUPTS    = 0x100
+	_KVM_NR_CPUID_ENTRIES = 0x100
+)
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
new file mode 100644
index 000000000..61cfdd8fd
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -0,0 +1,415 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "math/rand"
+ "reflect"
+ "syscall"
+ "testing"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm/testutil"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+var dummyFPState = (*byte)(arch.NewFloatingPointData())
+
+type testHarness interface {
+ Errorf(format string, args ...interface{})
+ Fatalf(format string, args ...interface{})
+}
+
+// kvmTest creates a fresh KVM instance, runs the optional setup callback,
+// and then repeatedly acquires a vCPU and invokes fn with it until fn
+// returns false. The deferred redpill/Put pair recovers a held vCPU if fn
+// panics or fails mid-iteration.
+func kvmTest(t testHarness, setup func(*KVM), fn func(*vCPU) bool) {
+	// Create the machine.
+	k, err := New()
+	if err != nil {
+		t.Fatalf("error creating KVM instance: %v", err)
+	}
+	defer k.machine.Destroy()
+	defer k.FileMem.Destroy()
+
+	// Call additional setup.
+	if setup != nil {
+		setup(k)
+	}
+
+	var c *vCPU // For recovery.
+	defer func() {
+		redpill() // Leave guest mode, if still in it.
+		if c != nil {
+			k.machine.Put(c)
+		}
+	}()
+	for {
+		c, err = k.machine.Get()
+		if err != nil {
+			t.Fatalf("error getting vCPU: %v", err)
+		}
+		if !fn(c) {
+			break
+		}
+
+		// We put the vCPU here and clear the value so that the
+		// deferred recovery will not re-put it above.
+		k.machine.Put(c)
+		c = nil
+	}
+}
+
+// bluepillTest runs fn exactly once on a vCPU that has just entered guest
+// mode via bluepill.
+func bluepillTest(t testHarness, fn func(*vCPU)) {
+	kvmTest(t, nil, func(c *vCPU) bool {
+		bluepill(c)
+		fn(c)
+		return false // Single iteration.
+	})
+}
+
+func TestKernelSyscall(t *testing.T) {
+ bluepillTest(t, func(c *vCPU) {
+ redpill() // Leave guest mode.
+ if got := c.State(); got != vCPUReady {
+ t.Errorf("vCPU not in ready state: got %v", got)
+ }
+ })
+}
+
+func hostFault() {
+ defer func() {
+ recover()
+ }()
+ var foo *int
+ *foo = 0
+}
+
+func TestKernelFault(t *testing.T) {
+ hostFault() // Ensure recovery works.
+ bluepillTest(t, func(c *vCPU) {
+ hostFault()
+ if got := c.State(); got != vCPUReady {
+ t.Errorf("vCPU not in ready state: got %v", got)
+ }
+ })
+}
+
+func TestKernelFloatingPoint(t *testing.T) {
+ bluepillTest(t, func(c *vCPU) {
+ if !testutil.FloatingPointWorks() {
+ t.Errorf("floating point does not work, and it should!")
+ }
+ })
+}
+
+// applicationTest runs fn with a vCPU, a register set whose entry point is
+// target, and a fresh address space's page tables. If useHostMappings is
+// true, the host physical regions are installed into the page tables so
+// that test code/data is directly reachable (safe only for tests).
+func applicationTest(t testHarness, useHostMappings bool, target func(), fn func(*vCPU, *syscall.PtraceRegs, *pagetables.PageTables) bool) {
+	// Initialize registers & page tables.
+	var (
+		regs syscall.PtraceRegs
+		pt   *pagetables.PageTables
+	)
+	testutil.SetTestTarget(&regs, target)
+	defer func() {
+		if pt != nil {
+			pt.Release()
+		}
+	}()
+
+	kvmTest(t, func(k *KVM) {
+		// Create new page tables.
+		as, _, err := k.NewAddressSpace(nil /* invalidator */)
+		if err != nil {
+			t.Fatalf("can't create new address space: %v", err)
+		}
+		pt = as.(*addressSpace).pageTables
+
+		if useHostMappings {
+			// Apply the physical mappings to these page tables.
+			// (This is normally dangerous, since they point to
+			// physical pages that may not exist. This shouldn't be
+			// done for regular user code, but is fine for test
+			// purposes.)
+			applyPhysicalRegions(func(pr physicalRegion) bool {
+				pt.Map(usermem.Addr(pr.virtual), pr.length, true /* user */, usermem.AnyAccess, pr.physical)
+				return true // Keep iterating.
+			})
+		}
+	}, func(c *vCPU) bool {
+		// Invoke the function with the extra data.
+		return fn(c, &regs, pt)
+	})
+}
+
+func TestApplicationSyscall(t *testing.T) {
+ applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != nil {
+ t.Errorf("application syscall with full restore failed: %v", err)
+ }
+ return false
+ })
+ applicationTest(t, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil {
+ t.Errorf("application syscall with partial restore failed: %v", err)
+ }
+ return false
+ })
+}
+
+func TestApplicationFault(t *testing.T) {
+ applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ testutil.SetTouchTarget(regs, nil) // Cause fault.
+ if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) {
+ t.Errorf("application fault with full restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal)
+ }
+ return false
+ })
+ applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ testutil.SetTouchTarget(regs, nil) // Cause fault.
+ if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) {
+ t.Errorf("application fault with partial restore got (%v, %v), expected (%v, SIGSEGV)", err, si, platform.ErrContextSignal)
+ }
+ return false
+ })
+}
+
+func TestRegistersSyscall(t *testing.T) {
+ applicationTest(t, true, testutil.TwiddleRegsSyscall, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ testutil.SetTestRegs(regs) // Fill values for all registers.
+ if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil {
+ t.Errorf("application register check with partial restore got unexpected error: %v", err)
+ }
+ if err := testutil.CheckTestRegs(regs, false); err != nil {
+ t.Errorf("application register check with partial restore failed: %v", err)
+ }
+ return false
+ })
+}
+
+// TestRegistersFault verifies that registers survive a full-restore switch
+// that ends in an application fault.
+func TestRegistersFault(t *testing.T) {
+	applicationTest(t, true, testutil.TwiddleRegsFault, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+		testutil.SetTestRegs(regs) // Fill values for all registers.
+		// Guard si against nil before dereferencing, consistent with
+		// TestApplicationFault; previously a nil si would panic here.
+		if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != platform.ErrContextSignal || (si != nil && si.Signo != int32(syscall.SIGSEGV)) {
+			t.Errorf("application register check with full restore got unexpected error: %v", err)
+		}
+		if err := testutil.CheckTestRegs(regs, true); err != nil {
+			t.Errorf("application register check with full restore failed: %v", err)
+		}
+		return false
+	})
+}
+
+func TestSegments(t *testing.T) {
+ applicationTest(t, true, testutil.TwiddleSegments, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ testutil.SetTestSegments(regs)
+ if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != nil {
+ t.Errorf("application segment check with full restore got unexpected error: %v", err)
+ }
+ if err := testutil.CheckTestSegments(regs); err != nil {
+ t.Errorf("application segment check with full restore failed: %v", err)
+ }
+ return false
+ })
+}
+
+func TestBounce(t *testing.T) {
+ applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ go func() {
+ time.Sleep(time.Millisecond)
+ c.Bounce()
+ }()
+ if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextInterrupt {
+ t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt)
+ }
+ return false
+ })
+ applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ go func() {
+ time.Sleep(time.Millisecond)
+ c.Bounce()
+ }()
+ if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); err != platform.ErrContextInterrupt {
+ t.Errorf("application full restore: got %v, wanted %v", err, platform.ErrContextInterrupt)
+ }
+ return false
+ })
+}
+
+func TestBounceStress(t *testing.T) {
+ applicationTest(t, true, testutil.SpinLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ randomSleep := func() {
+ // O(hundreds of microseconds) is appropriate to ensure
+ // different overlaps and different schedules.
+ if n := rand.Intn(1000); n > 100 {
+ time.Sleep(time.Duration(n) * time.Microsecond)
+ }
+ }
+ for i := 0; i < 1000; i++ {
+ // Start an asynchronously executing goroutine that
+ // calls Bounce at pseudo-random point in time.
+ // This should wind up calling Bounce when the
+ // kernel is in various stages of the switch.
+ go func() {
+ randomSleep()
+ c.Bounce()
+ }()
+ randomSleep()
+ // Execute the switch.
+ if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextInterrupt {
+ t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextInterrupt)
+ }
+ // Simulate work.
+ c.Unlock()
+ randomSleep()
+ c.Lock()
+ }
+ return false
+ })
+}
+
+func TestInvalidate(t *testing.T) {
+ var data uintptr // Used below.
+ applicationTest(t, true, testutil.Touch, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ testutil.SetTouchTarget(regs, &data) // Read legitimate value.
+ if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil {
+ t.Errorf("application partial restore: got %v, wanted nil", err)
+ }
+ // Unmap the page containing data & invalidate.
+ pt.Unmap(usermem.Addr(reflect.ValueOf(&data).Pointer() & ^uintptr(usermem.PageSize-1)), usermem.PageSize)
+ c.Invalidate() // Ensure invalidation.
+ if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != platform.ErrContextSignal {
+ t.Errorf("application partial restore: got %v, wanted %v", err, platform.ErrContextSignal)
+ }
+ return false
+ })
+}
+
+// IsFault returns true iff the given signal represents a fault (SIGSEGV).
+//
+// A nil si is treated as "not a fault" rather than being dereferenced, so
+// callers can pass the results of SwitchToUser directly.
+func IsFault(err error, si *arch.SignalInfo) bool {
+	return err == platform.ErrContextSignal && si != nil && si.Signo == int32(syscall.SIGSEGV)
+}
+
+func TestEmptyAddressSpace(t *testing.T) {
+ applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); !IsFault(err, si) {
+ t.Errorf("first fault with partial restore failed got %v", err)
+ t.Logf("registers: %#v", &regs)
+ }
+ return false
+ })
+ applicationTest(t, false, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ if si, _, err := c.SwitchToUser(regs, dummyFPState, pt, ring0.FlagFull); !IsFault(err, si) {
+ t.Errorf("first fault with full restore failed got %v", err)
+ t.Logf("registers: %#v", &regs)
+ }
+ return false
+ })
+}
+
+func TestWrongVCPU(t *testing.T) {
+ kvmTest(t, nil, func(c1 *vCPU) bool {
+ kvmTest(t, nil, func(c2 *vCPU) bool {
+ // Basic test, one then the other.
+ bluepill(c1)
+ bluepill(c2)
+ if c2.switches == 0 {
+ // Don't allow the test to proceed if this fails.
+ t.Fatalf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+ }
+
+ // Alternate vCPUs; we expect to need to trigger the
+ // wrong vCPU path on each switch.
+ for i := 0; i < 100; i++ {
+ bluepill(c1)
+ bluepill(c2)
+ }
+ if count := c1.switches; count < 90 {
+ t.Errorf("wrong vCPU#1 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+ }
+ if count := c2.switches; count < 90 {
+ t.Errorf("wrong vCPU#2 switches: vCPU1=%+v,vCPU2=%+v", c1, c2)
+ }
+ return false
+ })
+ return false
+ })
+ kvmTest(t, nil, func(c1 *vCPU) bool {
+ kvmTest(t, nil, func(c2 *vCPU) bool {
+ bluepill(c1)
+ bluepill(c2)
+ return false
+ })
+ return false
+ })
+}
+
+func BenchmarkApplicationSyscall(b *testing.B) {
+ var (
+ i int // Iteration includes machine.Get() / machine.Put().
+ a int // Count for ErrContextInterrupt.
+ )
+ applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil {
+ if err == platform.ErrContextInterrupt {
+ a++
+ return true // Ignore.
+ }
+ b.Fatalf("benchmark failed: %v", err)
+ }
+ i++
+ return i < b.N
+ })
+ if a != 0 {
+ b.Logf("ErrContextInterrupt occurred %d times (in %d iterations).", a, a+i)
+ }
+}
+
+func BenchmarkKernelSyscall(b *testing.B) {
+ // Note that the target passed here is irrelevant, we never execute SwitchToUser.
+ applicationTest(b, true, testutil.Getpid, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ // iteration does not include machine.Get() / machine.Put().
+ for i := 0; i < b.N; i++ {
+ testutil.Getpid()
+ }
+ return false
+ })
+}
+
+func BenchmarkWorldSwitchToUserRoundtrip(b *testing.B) {
+ // see BenchmarkApplicationSyscall.
+ var (
+ i int
+ a int
+ )
+ applicationTest(b, true, testutil.SyscallLoop, func(c *vCPU, regs *syscall.PtraceRegs, pt *pagetables.PageTables) bool {
+ if _, _, err := c.SwitchToUser(regs, dummyFPState, pt, 0); err != nil {
+ if err == platform.ErrContextInterrupt {
+ a++
+ return true // Ignore.
+ }
+ b.Fatalf("benchmark failed: %v", err)
+ }
+ // This will intentionally cause the world switch. By executing
+ // a host syscall here, we force the transition between guest
+ // and host mode.
+ testutil.Getpid()
+ i++
+ return i < b.N
+ })
+ if a != 0 {
+ b.Logf("EAGAIN occurred %d times (in %d iterations).", a, a+i)
+ }
+}
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
new file mode 100644
index 000000000..a5be0cee3
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -0,0 +1,412 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+ "runtime"
+ "sync"
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/tmutex"
+)
+
+// machine contains state associated with the VM as a whole.
+type machine struct {
+ // fd is the vm fd.
+ fd int
+
+ // nextSlot is the next slot for setMemoryRegion.
+ //
+ // This must be accessed atomically. If nextSlot is ^uint32(0), then
+ // slots are currently being updated, and the caller should retry.
+ nextSlot uint32
+
+ // kernel is the set of global structures.
+ kernel *ring0.Kernel
+
+ // mappingCache is used for mapPhysical.
+ mappingCache sync.Map
+
+ // mu protects vCPUs.
+ //
+ // Note that this protects only map membership; the per-vCPU state
+ // field is managed atomically (see vCPU.state below).
+ mu sync.Mutex
+
+ // vCPUs are the machine vCPUs.
+ //
+ // This is eventually keyed by system TID, but is initially indexed by
+ // the negative vCPU id. This is merely an optimization, so while
+ // collisions here are not possible, it wouldn't matter anyways.
+ vCPUs map[uint64]*vCPU
+}
+
+// vCPU state values. These are stored in vCPU.state and manipulated
+// atomically (see State, wait and notify).
+const (
+ // vCPUReady is the lock value for an available vCPU.
+ //
+ // Legal transitions: vCPUGuest (bluepill).
+ vCPUReady uintptr = iota
+
+ // vCPUGuest indicates the vCPU is in guest mode.
+ //
+ // Legal transition: vCPUReady (bluepill), vCPUWaiter (wait).
+ vCPUGuest
+
+ // vCPUWaiter indicates that the vCPU should be released.
+ //
+ // Legal transition: vCPUReady (bluepill).
+ vCPUWaiter
+)
+
+// vCPU is a single KVM vCPU.
+type vCPU struct {
+ // CPU is the kernel CPU data.
+ //
+ // This must be the first element of this structure, it is referenced
+ // by the bluepill code (see bluepill_amd64.s).
+ ring0.CPU
+
+ // fd is the vCPU fd.
+ fd int
+
+ // tid is the last set tid.
+ //
+ // This is read and written atomically (see Get and Bounce).
+ tid uint64
+
+ // switches is a count of world switches (informational only).
+ switches uint32
+
+ // faults is a count of world faults (informational only).
+ faults uint32
+
+ // state is the vCPU state; all are described above.
+ state uintptr
+
+ // runData for this vCPU.
+ runData *runData
+
+ // machine associated with this vCPU.
+ machine *machine
+
+ // mu applies across get/put; it does not protect the above.
+ mu tmutex.Mutex
+}
+
+// newMachine returns a new VM context.
+//
+// vm is the VM file descriptor (from KVM_CREATE_VM) and vCPUs is the
+// requested vCPU count; the count is capped below. On any error the partially
+// constructed machine is destroyed before returning.
+func newMachine(vm int, vCPUs int) (*machine, error) {
+ // Create the machine.
+ m := &machine{
+ fd: vm,
+ vCPUs: make(map[uint64]*vCPU),
+ }
+ if vCPUs > _KVM_NR_VCPUS {
+ // Hard cap at KVM's limit.
+ vCPUs = _KVM_NR_VCPUS
+ }
+ if n := 2 * runtime.NumCPU(); vCPUs > n {
+ // Cap at twice the number of physical cores. Otherwise we're
+ // just wasting memory and thrashing. (There may be scheduling
+ // issues when you've got > n active threads.)
+ vCPUs = n
+ }
+ // The machine itself is the pagetables translater: physical addresses
+ // for page table nodes come from TranslateToPhysical (machine_unsafe.go).
+ m.kernel = ring0.New(ring0.KernelOpts{
+ PageTables: pagetables.New(m, pagetablesOpts),
+ })
+
+ // Initialize architecture state.
+ if err := m.initArchState(vCPUs); err != nil {
+ m.Destroy()
+ return nil, err
+ }
+
+ // Create all the vCPUs.
+ for id := 0; id < vCPUs; id++ {
+ // Create the vCPU.
+ fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(vm), _KVM_CREATE_VCPU, uintptr(id))
+ if errno != 0 {
+ m.Destroy()
+ return nil, fmt.Errorf("error creating VCPU: %v", errno)
+ }
+ c := &vCPU{
+ fd: int(fd),
+ machine: m,
+ }
+ c.mu.Init()
+ c.CPU.Init(m.kernel)
+ c.CPU.KernelSyscall = bluepillSyscall
+ c.CPU.KernelException = bluepillException
+ m.vCPUs[uint64(-id)] = c // See above.
+
+ // Ensure the signal mask is correct.
+ if err := c.setSignalMask(); err != nil {
+ m.Destroy()
+ return nil, err
+ }
+
+ // Initialize architecture state.
+ if err := c.initArchState(); err != nil {
+ m.Destroy()
+ return nil, err
+ }
+
+ // Map the run data.
+ runData, err := mapRunData(int(fd))
+ if err != nil {
+ m.Destroy()
+ return nil, err
+ }
+ c.runData = runData
+ }
+
+ // Apply the physical mappings. Note that these mappings may point to
+ // guest physical addresses that are not actually available. These
+ // physical pages are mapped on demand, see kernel_unsafe.go.
+ applyPhysicalRegions(func(pr physicalRegion) bool {
+ // Map everything in the lower half.
+ m.kernel.PageTables.Map(usermem.Addr(pr.virtual), pr.length, false /* kernel */, usermem.AnyAccess, pr.physical)
+ // And keep everything in the upper half.
+ kernelAddr := usermem.Addr(ring0.KernelStartAddress | pr.virtual)
+ m.kernel.PageTables.Map(kernelAddr, pr.length, false /* kernel */, usermem.AnyAccess, pr.physical)
+ return true // Keep iterating.
+ })
+
+ // Ensure that the currently mapped virtual regions are actually
+ // available in the VM. Note that this doesn't guarantee no future
+ // faults, however it should guarantee that everything is available to
+ // ensure successful vCPU entry.
+ applyVirtualRegions(func(vr virtualRegion) {
+ if excludeVirtualRegion(vr) {
+ return // skip region.
+ }
+ for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
+ physical, length, ok := TranslateToPhysical(virtual)
+ if !ok {
+ // This must be an invalid region that was
+ // knocked out by creation of the physical map.
+ return
+ }
+ if virtual+length > vr.virtual+vr.length {
+ // Cap the length to the end of the area.
+ length = vr.virtual + vr.length - virtual
+ }
+
+ // Ensure the physical range is mapped.
+ m.mapPhysical(physical, length)
+ virtual += length
+ }
+ })
+
+ // Ensure the machine is cleaned up properly.
+ runtime.SetFinalizer(m, (*machine).Destroy)
+ return m, nil
+}
+
+// mapPhysical checks for the mapping of a physical range, and installs one if
+// not available. This attempts to be efficient for calls in the hot path.
+//
+// The mappingCache is used so that the common (already-mapped) case requires
+// only a sync.Map lookup and no ioctl.
+//
+// This panics on error.
+func (m *machine) mapPhysical(physical, length uintptr) {
+ for end := physical + length; physical < end; {
+ _, physicalStart, length, ok := calculateBluepillFault(m, physical)
+ if !ok {
+ // Should never happen.
+ panic("mapPhysical on unknown physical address")
+ }
+
+ // LoadOrStore returns false when the key was newly stored,
+ // i.e. this chunk has not been mapped yet.
+ if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok {
+ // Not present in the cache; requires setting the slot.
+ if _, ok := handleBluepillFault(m, physical); !ok {
+ panic("handleBluepillFault failed")
+ }
+ }
+
+ // Move to the next chunk.
+ physical = physicalStart + length
+ }
+}
+
+// Destroy frees associated resources.
+//
+// Destroy should only be called once all active users of the machine are gone.
+// The machine object should not be used after calling Destroy.
+//
+// Precondition: all vCPUs must be returned to the machine.
+func (m *machine) Destroy() {
+ // Clear the finalizer installed by newMachine so it cannot run twice.
+ runtime.SetFinalizer(m, nil)
+
+ // Destroy vCPUs.
+ for _, c := range m.vCPUs {
+ // Ensure the vCPU is not still running in guest mode. This is
+ // possible iff teardown has been done by other threads, and
+ // somehow a single thread has not executed any system calls.
+ c.wait()
+
+ // Teardown the vCPU itself.
+ switch state := c.State(); state {
+ case vCPUReady:
+ // Note that the runData may not be mapped if an error
+ // occurs during the middle of initialization.
+ if c.runData != nil {
+ if err := unmapRunData(c.runData); err != nil {
+ panic(fmt.Sprintf("error unmapping rundata: %v", err))
+ }
+ }
+ if err := syscall.Close(int(c.fd)); err != nil {
+ panic(fmt.Sprintf("error closing vCPU fd: %v", err))
+ }
+ case vCPUGuest, vCPUWaiter:
+ // Should never happen; waited above.
+ panic("vCPU disposed in guest state")
+ default:
+ // Should never happen; not a valid state.
+ panic(fmt.Sprintf("vCPU in invalid state: %v", state))
+ }
+ }
+
+ // Release host mappings.
+ if m.kernel.PageTables != nil {
+ m.kernel.PageTables.Release()
+ }
+
+ // vCPUs are gone: teardown machine state.
+ if err := syscall.Close(m.fd); err != nil {
+ panic(fmt.Sprintf("error closing VM fd: %v", err))
+ }
+}
+
+// Get gets an available vCPU.
+//
+// The calling goroutine is locked to its OS thread (so the TID association
+// stays valid) until the vCPU is returned via Put. The method loops until a
+// vCPU can be acquired, bouncing busy vCPUs out of guest mode as needed.
+func (m *machine) Get() (*vCPU, error) {
+ runtime.LockOSThread()
+ tid := procid.Current()
+ m.mu.Lock()
+
+ for {
+ // Check for an exact match.
+ if c := m.vCPUs[tid]; c != nil && c.mu.TryLock() {
+ m.mu.Unlock()
+ return c, nil
+ }
+
+ // Scan for an available vCPU.
+ for origTID, c := range m.vCPUs {
+ if c.LockInState(vCPUReady) {
+ // Re-key the vCPU under the current TID.
+ delete(m.vCPUs, origTID)
+ m.vCPUs[tid] = c
+ m.mu.Unlock()
+
+ // We need to reload thread-local segments as
+ // we have origTID != tid and the vCPU state
+ // may be stale.
+ c.loadSegments()
+ atomic.StoreUint64(&c.tid, tid)
+ return c, nil
+ }
+ }
+
+ // Everything is busy executing user code (locked).
+ //
+ // We hold the pool lock here, so we should be able to kick something
+ // out of kernel mode and have it bounce into host mode when it tries
+ // to grab the vCPU again.
+ for _, c := range m.vCPUs {
+ if c.State() != vCPUWaiter {
+ c.Bounce()
+ }
+ }
+
+ // Give other threads an opportunity to run.
+ yield()
+ }
+}
+
+// Put puts the current vCPU.
+//
+// This releases the vCPU lock taken by Get and unpins the goroutine from its
+// OS thread.
+func (m *machine) Put(c *vCPU) {
+ c.Unlock()
+ runtime.UnlockOSThread()
+}
+
+// State returns the current state.
+//
+// The state is read atomically; see the vCPUReady/vCPUGuest/vCPUWaiter
+// constants for the possible values and legal transitions.
+func (c *vCPU) State() uintptr {
+ return atomic.LoadUintptr(&c.state)
+}
+
+// Lock locks the vCPU.
+//
+// This blocks until the vCPU's get/put mutex is acquired.
+func (c *vCPU) Lock() {
+ c.mu.Lock()
+}
+
+// Invalidate invalidates caches.
+//
+// This is currently a no-op for this platform.
+func (c *vCPU) Invalidate() {
+}
+
+// LockInState locks the vCPU if it is in the given state and TryLock succeeds.
+//
+// The state is checked once before attempting the lock and re-checked once
+// the lock is held, since it may change in between.
+func (c *vCPU) LockInState(state uintptr) bool {
+	if c.State() != state || !c.mu.TryLock() {
+		return false
+	}
+	// Re-verify under the lock.
+	if c.State() != state {
+		c.mu.Unlock()
+		return false
+	}
+	return true
+}
+
+// Unlock unlocks the given vCPU.
+//
+// If a waiter has been registered (vCPUWaiter), the vCPU is first forced out
+// of guest mode via redpill before the lock is dropped.
+func (c *vCPU) Unlock() {
+	// Ensure we're out of guest mode, if necessary.
+	if state := c.State(); state == vCPUWaiter {
+		redpill() // Force guest mode exit.
+	}
+	c.mu.Unlock()
+}
+
+// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
+//
+// This simply bounces the vCPU back to the kernel; see Bounce.
+func (c *vCPU) NotifyInterrupt() {
+ c.Bounce()
+}
+
+// pid is the host process ID, cached once for use by Bounce (tgkill).
+var pid = syscall.Getpid()
+
+// Bounce ensures that the vCPU bounces back to the kernel.
+//
+// In practice, this means returning EAGAIN from running user code. The vCPU
+// will be unlocked and relock, and the kernel is guaranteed to check for
+// interrupt notifications (e.g. injected via Notify) and invalidations.
+func (c *vCPU) Bounce() {
+ for {
+ if c.mu.TryLock() {
+ // We know that the vCPU must be in the kernel already,
+ // because the lock was not acquired. We specifically
+ // don't want to call bounce in this case, because it's
+ // not necessary to knock the vCPU out of guest mode.
+ c.mu.Unlock()
+ return
+ }
+
+ if state := c.State(); state == vCPUGuest || state == vCPUWaiter {
+ // We know that the vCPU was in guest mode, so a single signal
+ // interruption will guarantee that a transition takes place.
+ syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal)
+ return
+ }
+
+ // Someone holds the lock, but the vCPU is not yet transitioned
+ // into guest mode. It's in the critical section; give it time.
+ yield()
+ }
+}
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
new file mode 100644
index 000000000..dfa691e88
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -0,0 +1,168 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "fmt"
+ "reflect"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// initArchState initializes architecture-specific state.
+//
+// This places the legacy TSS three pages below the top of the reserved
+// region via the KVM_SET_TSS_ADDR ioctl; vCPUs is unused on amd64.
+func (m *machine) initArchState(vCPUs int) error {
+ // Set the legacy TSS address. This address is covered by the reserved
+ // range (up to 4GB). In fact, this is a main reason it exists.
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(m.fd),
+ _KVM_SET_TSS_ADDR,
+ uintptr(reservedMemory-(3*usermem.PageSize))); errno != 0 {
+ return errno
+ }
+ return nil
+}
+
+// initArchState initializes architecture-specific state.
+//
+// This configures the vCPU's control registers, segment descriptors, page
+// tables, CPUID, entry point and TSC. The ordering below matters: CPUID must
+// be set before the system registers (see comment inline).
+func (c *vCPU) initArchState() error {
+ var (
+ kernelSystemRegs systemRegs
+ kernelUserRegs userRegs
+ )
+
+ // Set base control registers.
+ kernelSystemRegs.CR0 = c.CR0()
+ kernelSystemRegs.CR4 = c.CR4()
+ kernelSystemRegs.EFER = c.EFER()
+
+ // Set the IDT & GDT in the registers.
+ kernelSystemRegs.IDT.base, kernelSystemRegs.IDT.limit = c.IDT()
+ kernelSystemRegs.GDT.base, kernelSystemRegs.GDT.limit = c.GDT()
+ kernelSystemRegs.CS.Load(&ring0.KernelCodeSegment, ring0.Kcode)
+ kernelSystemRegs.DS.Load(&ring0.UserDataSegment, ring0.Udata)
+ kernelSystemRegs.ES.Load(&ring0.UserDataSegment, ring0.Udata)
+ kernelSystemRegs.SS.Load(&ring0.KernelDataSegment, ring0.Kdata)
+ kernelSystemRegs.FS.Load(&ring0.UserDataSegment, ring0.Udata)
+ kernelSystemRegs.GS.Load(&ring0.UserDataSegment, ring0.Udata)
+ tssBase, tssLimit, tss := c.TSS()
+ kernelSystemRegs.TR.Load(tss, ring0.Tss)
+ kernelSystemRegs.TR.base = tssBase
+ kernelSystemRegs.TR.limit = uint32(tssLimit)
+
+ // Point to kernel page tables.
+ kernelSystemRegs.CR3 = c.machine.kernel.PageTables.FlushCR3()
+
+ // Set the CPUID; this is required before setting system registers,
+ // since KVM will reject several CR4 bits if the CPUID does not
+ // indicate the support is available.
+ if err := c.setCPUID(); err != nil {
+ return err
+ }
+
+ // Set the entrypoint for the kernel.
+ //
+ // RAX carries the CPU pointer expected by ring0.Start.
+ kernelUserRegs.RIP = uint64(reflect.ValueOf(ring0.Start).Pointer())
+ kernelUserRegs.RAX = uint64(reflect.ValueOf(&c.CPU).Pointer())
+ kernelUserRegs.RFLAGS = ring0.KernelFlagsSet
+
+ // Set the system registers.
+ if err := c.setSystemRegisters(&kernelSystemRegs); err != nil {
+ return err
+ }
+
+ // Set the user registers.
+ if err := c.setUserRegisters(&kernelUserRegs); err != nil {
+ return err
+ }
+
+ // Set the time offset to the host native time.
+ return c.setSystemTime()
+}
+
+// SwitchToUser unpacks architectural-details.
+//
+// It enters guest mode (bluepill), runs the application until the next
+// vector, and translates that vector into a (signal, access, error) triple
+// for the caller.
+func (c *vCPU) SwitchToUser(regs *syscall.PtraceRegs, fpState *byte, pt *pagetables.PageTables, flags ring0.Flags) (*arch.SignalInfo, usermem.AccessType, error) {
+ // See below.
+ var vector ring0.Vector
+
+ // Past this point, stack growth can cause system calls (and a break
+ // from guest mode). So we need to ensure that between the bluepill
+ // call here and the switch call immediately below, no additional
+ // allocations occur.
+ entersyscall()
+ bluepill(c)
+ vector = c.CPU.SwitchToUser(regs, fpState, pt, flags)
+ exitsyscall()
+
+ // Free and clear.
+ switch vector {
+ case ring0.Debug, ring0.Breakpoint:
+ info := &arch.SignalInfo{Signo: int32(syscall.SIGTRAP)}
+ return info, usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.PageFault:
+ bluepill(c) // Probably no-op, but may not be.
+ faultAddr := ring0.ReadCR2()
+ code, user := c.ErrorCode()
+ if !user {
+ // The last fault serviced by this CPU was not a user
+ // fault, so we can't reliably trust the faultAddr or
+ // the code provided here. We need to re-execute.
+ return nil, usermem.NoAccess, platform.ErrContextInterrupt
+ }
+ info := &arch.SignalInfo{Signo: int32(syscall.SIGSEGV)}
+ info.SetAddr(uint64(faultAddr))
+ // Decode the page-fault error code bits: bit 1 distinguishes
+ // read vs. write, bit 4 indicates an instruction fetch
+ // (presumably per the x86 PFEC layout — confirm against ring0).
+ accessType := usermem.AccessType{
+ Read: code&(1<<1) == 0,
+ Write: code&(1<<1) != 0,
+ Execute: code&(1<<4) != 0,
+ }
+ return info, accessType, platform.ErrContextSignal
+
+ case ring0.GeneralProtectionFault:
+ if !ring0.IsCanonical(regs.Rip) {
+ // If the RIP is non-canonical, it's a SEGV.
+ info := &arch.SignalInfo{Signo: int32(syscall.SIGSEGV)}
+ return info, usermem.AccessType{}, platform.ErrContextSignal
+ }
+ // Otherwise, we deliver a SIGBUS.
+ info := &arch.SignalInfo{Signo: int32(syscall.SIGBUS)}
+ return info, usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.InvalidOpcode:
+ info := &arch.SignalInfo{Signo: int32(syscall.SIGILL)}
+ return info, usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.X87FloatingPointException:
+ info := &arch.SignalInfo{Signo: int32(syscall.SIGFPE)}
+ return info, usermem.AccessType{}, platform.ErrContextSignal
+
+ case ring0.Vector(bounce):
+ // Interrupted by Bounce; the caller should retry.
+ redpill() // Bail and reacqire.
+ return nil, usermem.NoAccess, platform.ErrContextInterrupt
+
+ case ring0.Syscall, ring0.SyscallInt80:
+ // System call executed.
+ return nil, usermem.NoAccess, nil
+
+ default:
+ panic(fmt.Sprintf("unexpected vector: 0x%x", vector))
+ }
+}
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
new file mode 100644
index 000000000..c2bcb3a47
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -0,0 +1,156 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package kvm
+
+import (
+ "fmt"
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/time"
+)
+
+// setMemoryRegion initializes a region.
+//
+// This installs a KVM user memory region (slot) mapping the host virtual
+// range [virtual, virtual+length) at guest physical address physical.
+//
+// This may be called from bluepillHandler, and therefore returns an errno
+// directly (instead of wrapping in an error) to avoid allocations.
+//
+//go:nosplit
+func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno {
+ userRegion := userMemoryRegion{
+ slot: uint32(slot),
+ flags: 0,
+ guestPhysAddr: uint64(physical),
+ memorySize: uint64(length),
+ userspaceAddr: uint64(virtual),
+ }
+
+ // Set the region.
+ _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(m.fd),
+ _KVM_SET_USER_MEMORY_REGION,
+ uintptr(unsafe.Pointer(&userRegion)))
+ return errno
+}
+
+// loadSegments copies the current segments.
+//
+// The host thread's FS and GS segment bases are captured into the vCPU's
+// register state via arch_prctl, so the guest sees the correct thread-local
+// storage after a vCPU migrates between host threads (see machine.Get).
+//
+// This may be called from within the signal context and throws on error.
+//
+//go:nosplit
+func (c *vCPU) loadSegments() {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_ARCH_PRCTL,
+ linux.ARCH_GET_FS,
+ uintptr(unsafe.Pointer(&c.CPU.Registers().Fs_base)),
+ 0); errno != 0 {
+ throw("getting FS segment")
+ }
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_ARCH_PRCTL,
+ linux.ARCH_GET_GS,
+ uintptr(unsafe.Pointer(&c.CPU.Registers().Gs_base)),
+ 0); errno != 0 {
+ throw("getting GS segment")
+ }
+}
+
+// setUserRegisters sets user registers in the vCPU.
+//
+// This wraps the KVM_SET_REGS ioctl on the vCPU fd.
+func (c *vCPU) setUserRegisters(uregs *userRegs) error {
+	_, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_REGS,
+		uintptr(unsafe.Pointer(uregs)))
+	if errno != 0 {
+		return fmt.Errorf("error setting user registers: %v", errno)
+	}
+	return nil
+}
+
+// setSystemRegisters sets system registers.
+//
+// This wraps the KVM_SET_SREGS ioctl on the vCPU fd.
+func (c *vCPU) setSystemRegisters(sregs *systemRegs) error {
+	_, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_SREGS,
+		uintptr(unsafe.Pointer(sregs)))
+	if errno != 0 {
+		return fmt.Errorf("error setting system registers: %v", errno)
+	}
+	return nil
+}
+
+// setCPUID sets the CPUID to be used by the guest.
+//
+// This wraps the KVM_SET_CPUID2 ioctl, installing the package-level
+// cpuidSupported table on the vCPU.
+func (c *vCPU) setCPUID() error {
+	_, _, errno := syscall.RawSyscall(
+		syscall.SYS_IOCTL,
+		uintptr(c.fd),
+		_KVM_SET_CPUID2,
+		uintptr(unsafe.Pointer(&cpuidSupported)))
+	if errno != 0 {
+		return fmt.Errorf("error setting CPUID: %v", errno)
+	}
+	return nil
+}
+
+// setSystemTime sets the TSC for the vCPU.
+//
+// The current host TSC value is written to the guest's IA32_TSC MSR via
+// KVM_SET_MSRS, aligning the guest clock with the host at this instant.
+//
+// FIXME: This introduces a slight TSC offset between host and
+// guest, which may vary per vCPU.
+func (c *vCPU) setSystemTime() error {
+ const _MSR_IA32_TSC = 0x00000010
+ registers := modelControlRegisters{
+ nmsrs: 1,
+ }
+ registers.entries[0] = modelControlRegister{
+ index: _MSR_IA32_TSC,
+ data: uint64(time.Rdtsc()),
+ }
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_MSRS,
+ uintptr(unsafe.Pointer(&registers))); errno != 0 {
+ return fmt.Errorf("error setting system time: %v", errno)
+ }
+ return nil
+}
+
+// setSignalMask sets the vCPU signal mask.
+//
+// The mask is the complement of bounceSignalMask — presumably so that only
+// the bounce signal can interrupt KVM_RUN; confirm against the KVM
+// KVM_SET_SIGNAL_MASK semantics.
+//
+// This must be called prior to running the vCPU.
+func (c *vCPU) setSignalMask() error {
+ // The layout of this structure implies that it will not necessarily be
+ // the same layout chosen by the Go compiler. It gets fudged here.
+ var data struct {
+ length uint32
+ mask1 uint32
+ mask2 uint32
+ _ uint32
+ }
+ data.length = 8 // Fixed sigset size.
+ data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
+ data.mask2 = ^uint32(bounceSignalMask >> 32)
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_SIGNAL_MASK,
+ uintptr(unsafe.Pointer(&data))); errno != 0 {
+ return fmt.Errorf("error setting signal mask: %v", errno)
+ }
+ return nil
+}
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
new file mode 100644
index 000000000..da67e23f6
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -0,0 +1,112 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+ "sync/atomic"
+ "syscall"
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
+)
+
+// entersyscall is the runtime's syscall-entry hook, linked in directly so
+// the transition into guest mode is visible to the Go scheduler.
+//
+//go:linkname entersyscall runtime.entersyscall
+func entersyscall()
+
+// exitsyscall is the runtime's syscall-exit hook; see entersyscall.
+//
+//go:linkname exitsyscall runtime.exitsyscall
+func exitsyscall()
+
+// TranslateToPhysical implements pagetables.Translater.TranslateToPhysical.
+//
+// The returned length is ignored: these translations are always for a single
+// page, which is guaranteed to be satisfied by any physical region.
+func (m *machine) TranslateToPhysical(ptes *pagetables.PTEs) uintptr {
+	virtual := uintptr(unsafe.Pointer(ptes))
+	if physical, _, ok := TranslateToPhysical(virtual); ok {
+		return physical
+	}
+	panic("unable to translate pagetables.Node to physical address")
+}
+
+// mapRunData maps the vCPU run data.
+//
+// The kvm_run structure for the given vCPU fd is mapped shared and
+// read-write at a kernel-chosen address.
+func mapRunData(fd int) (*runData, error) {
+	addr, _, errno := syscall.RawSyscall6(
+		syscall.SYS_MMAP,
+		0, // Let the kernel choose the address.
+		uintptr(runDataSize),
+		syscall.PROT_READ|syscall.PROT_WRITE,
+		syscall.MAP_SHARED,
+		uintptr(fd),
+		0)
+	if errno != 0 {
+		return nil, fmt.Errorf("error mapping runData: %v", errno)
+	}
+	return (*runData)(unsafe.Pointer(addr)), nil
+}
+
+// unmapRunData unmaps the vCPU run data.
+//
+// This is the inverse of mapRunData and is called during machine teardown.
+func unmapRunData(r *runData) error {
+	_, _, errno := syscall.RawSyscall(
+		syscall.SYS_MUNMAP,
+		uintptr(unsafe.Pointer(r)),
+		uintptr(runDataSize),
+		0)
+	if errno != 0 {
+		return fmt.Errorf("error unmapping runData: %v", errno)
+	}
+	return nil
+}
+
+// notify notifies that the vCPU has returned to host mode.
+//
+// A FUTEX_WAKE on the state word wakes any thread blocked in wait().
+//
+// This may be called by a signal handler and therefore throws on error.
+//
+//go:nosplit
+func (c *vCPU) notify() {
+ _, _, errno := syscall.RawSyscall6(
+ syscall.SYS_FUTEX,
+ uintptr(unsafe.Pointer(&c.state)),
+ linux.FUTEX_WAKE,
+ ^uintptr(0), // Number of waiters.
+ 0, 0, 0)
+ if errno != 0 {
+ throw("futex wake error")
+ }
+}
+
+// wait waits for the vCPU to return to host mode.
+//
+// The state is atomically moved from vCPUGuest to vCPUWaiter (if it isn't in
+// guest mode there is nothing to wait for), then the caller blocks on a futex
+// until notify() wakes it or the state word changes.
+//
+// This panics on error.
+func (c *vCPU) wait() {
+ if !atomic.CompareAndSwapUintptr(&c.state, vCPUGuest, vCPUWaiter) {
+ return // Nothing to wait for.
+ }
+ for {
+ _, _, errno := syscall.Syscall6(
+ syscall.SYS_FUTEX,
+ uintptr(unsafe.Pointer(&c.state)),
+ linux.FUTEX_WAIT,
+ uintptr(vCPUWaiter), // Expected value.
+ 0, 0, 0)
+ if errno == syscall.EINTR {
+ continue
+ } else if errno == syscall.EAGAIN {
+ // The state already changed; no need to block.
+ break
+ } else if errno != 0 {
+ panic("futex wait error")
+ }
+ break
+ }
+}
diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go
new file mode 100644
index 000000000..5d55c9486
--- /dev/null
+++ b/pkg/sentry/platform/kvm/physical_map.go
@@ -0,0 +1,221 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "fmt"
+ "sort"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+const (
+ // reservedMemory is a chunk of physical memory reserved starting at
+ // physical address zero. There are some special pages in this region,
+ // so we just call the whole thing off.
+ //
+ // The legacy TSS address, for example, is placed just below this
+ // boundary (see machine_amd64.go).
+ //
+ // Other architectures may define this to be zero.
+ reservedMemory = 0x100000000
+)
+
+// region is a contiguous virtual address range.
+type region struct {
+ virtual uintptr
+ length uintptr
+}
+
+// physicalRegion is a virtual region together with the guest physical
+// address it is mapped at.
+type physicalRegion struct {
+ region
+ physical uintptr
+}
+
+// physicalRegions contains a list of available physical regions.
+//
+// The physical value used in physicalRegions is a number indicating the
+// physical offset, aligned appropriately and starting above reservedMemory.
+//
+// It is populated once by physicalInit and treated as read-only thereafter.
+var physicalRegions []physicalRegion
+
+// fillAddressSpace fills the host address space with PROT_NONE mappings until
+// the number of available bits until we have a host address space size that is
+// equal to the physical address space.
+//
+// The excluded regions are returned.
+func fillAddressSpace() (excludedRegions []region) {
+ // We can cut vSize in half, because the kernel will be using the top
+ // half and we ignore it while constructing mappings. It's as if we've
+ // already excluded half the possible addresses.
+ vSize := uintptr(1) << ring0.VirtualAddressBits()
+ vSize = vSize >> 1
+
+ // We exclude reservedMemory below from our physical memory size, so it
+ // needs to be dropped here as well. Otherwise, we could end up with
+ // physical addresses that are beyond what is mapped.
+ pSize := uintptr(1) << ring0.PhysicalAddressBits()
+ pSize -= reservedMemory
+
+ // Sanity check.
+ if vSize < pSize {
+ panic(fmt.Sprintf("vSize (%x) < pSize (%x)", vSize, pSize))
+ }
+
+ // Add specifically excluded regions; see excludeVirtualRegion.
+ applyVirtualRegions(func(vr virtualRegion) {
+ if excludeVirtualRegion(vr) {
+ excludedRegions = append(excludedRegions, vr.region)
+ vSize -= vr.length
+ log.Infof("excluded: virtual [%x,%x)", vr.virtual, vr.virtual+vr.length)
+ }
+ })
+
+ // Calculate the required space and fill it.
+ //
+ // Note carefully that we add faultBlockSize to required up front, and
+ // on each iteration of the loop below (i.e. each new physical region
+ // we define), we add faultBlockSize again. This is done because the
+ // computation of physical regions will ensure proper alignments with
+ // faultBlockSize, potentially causing up to faultBlockSize bytes in
+ // internal fragmentation for each physical region. So we need to
+ // account for this properly during allocation.
+ requiredAddr, ok := usermem.Addr(vSize - pSize + faultBlockSize).RoundUp()
+ if !ok {
+ panic(fmt.Sprintf(
+ "overflow for vSize (%x) - pSize (%x) + faultBlockSize (%x)",
+ vSize, pSize, faultBlockSize))
+ }
+ required := uintptr(requiredAddr)
+ current := required // Attempted mmap size.
+ for filled := uintptr(0); filled < required && current > 0; {
+ addr, _, errno := syscall.RawSyscall6(
+ syscall.SYS_MMAP,
+ 0, // Suggested address.
+ current,
+ syscall.PROT_NONE,
+ syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE|syscall.MAP_NORESERVE,
+ 0, 0)
+ if errno != 0 {
+ // The mmap of the current chunk size failed (likely
+ // address space exhaustion at this size).
+ // Attempt half the size; overflow not possible.
+ currentAddr, _ := usermem.Addr(current >> 1).RoundUp()
+ current = uintptr(currentAddr)
+ continue
+ }
+ // We filled a block.
+ filled += current
+ excludedRegions = append(excludedRegions, region{
+ virtual: addr,
+ length: current,
+ })
+ // See comment above.
+ if filled != required {
+ required += faultBlockSize
+ }
+ }
+ if current == 0 {
+ panic("filling address space failed")
+ }
+ // computePhysicalRegions expects the exclusions in ascending order.
+ sort.Slice(excludedRegions, func(i, j int) bool {
+ return excludedRegions[i].virtual < excludedRegions[j].virtual
+ })
+ for _, r := range excludedRegions {
+ log.Infof("region: virtual [%x,%x)", r.virtual, r.virtual+r.length)
+ }
+ return excludedRegions
+}
+
+// computePhysicalRegions computes physical regions.
+//
+// The virtual gaps between the (sorted) excludedRegions become usable
+// regions, each assigned a physical offset starting above reservedMemory.
+// Physical offsets are adjusted so that each region's physical address has
+// the same alignment as its virtual address with respect to faultBlockSize.
+func computePhysicalRegions(excludedRegions []region) (physicalRegions []physicalRegion) {
+ physical := uintptr(reservedMemory)
+ addValidRegion := func(virtual, length uintptr) {
+ if length == 0 {
+ return
+ }
+ // Never hand out the zero page.
+ if virtual == 0 {
+ virtual += usermem.PageSize
+ length -= usermem.PageSize
+ }
+ // Clamp to the maximum user address.
+ if end := virtual + length; end > ring0.MaximumUserAddress {
+ length -= (end - ring0.MaximumUserAddress)
+ }
+ if length == 0 {
+ return
+ }
+ // Round physical up to the same alignment as the virtual
+ // address (with respect to faultBlockSize).
+ if offset := virtual &^ faultBlockMask; physical&^faultBlockMask != offset {
+ if newPhysical := (physical & faultBlockMask) + offset; newPhysical > physical {
+ physical = newPhysical // Round up by only a little bit.
+ } else {
+ physical = ((physical + faultBlockSize) & faultBlockMask) + offset
+ }
+ }
+ physicalRegions = append(physicalRegions, physicalRegion{
+ region: region{
+ virtual: virtual,
+ length: length,
+ },
+ physical: physical,
+ })
+ physical += length
+ }
+ // Walk the gaps between excluded regions (which are sorted; see
+ // fillAddressSpace).
+ lastExcludedEnd := uintptr(0)
+ for _, r := range excludedRegions {
+ addValidRegion(lastExcludedEnd, r.virtual-lastExcludedEnd)
+ lastExcludedEnd = r.virtual + r.length
+ }
+ addValidRegion(lastExcludedEnd, ring0.MaximumUserAddress-lastExcludedEnd)
+
+ // Dump our all physical regions.
+ for _, r := range physicalRegions {
+ log.Infof("physicalRegion: virtual [%x,%x) => physical [%x,%x)",
+ r.virtual, r.virtual+r.length, r.physical, r.physical+r.length)
+ }
+ return physicalRegions
+}
+
+// physicalInit initializes physical address mappings.
+//
+// This populates the package-level physicalRegions slice and must run before
+// applyPhysicalRegions or TranslateToPhysical are used.
+func physicalInit() {
+ physicalRegions = computePhysicalRegions(fillAddressSpace())
+}
+
+// applyPhysicalRegions applies the given function on physical regions.
+//
+// Iteration continues as long as true is returned. The return value is the
+// return from the last call to fn, or true if there are no entries.
+//
+// Precondition: physicalInit must have been called.
+func applyPhysicalRegions(fn func(pr physicalRegion) bool) bool {
+	for i := range physicalRegions {
+		if cont := fn(physicalRegions[i]); !cont {
+			return false
+		}
+	}
+	return true
+}
+
+// TranslateToPhysical translates the given virtual address.
+//
+// On success it returns the corresponding physical address and the number of
+// bytes remaining in the containing region.
+//
+// Precondition: physicalInit must have been called.
+func TranslateToPhysical(virtual uintptr) (physical uintptr, length uintptr, ok bool) {
+	for _, pr := range physicalRegions {
+		if virtual < pr.virtual || virtual >= pr.virtual+pr.length {
+			continue
+		}
+		offset := virtual - pr.virtual
+		return pr.physical + offset, pr.length - offset, true
+	}
+	return 0, 0, false
+}
diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD
new file mode 100644
index 000000000..8533a8d89
--- /dev/null
+++ b/pkg/sentry/platform/kvm/testutil/BUILD
@@ -0,0 +1,15 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+ name = "testutil",
+ testonly = 1,
+ srcs = [
+ "testutil.go",
+ "testutil_amd64.go",
+ "testutil_amd64.s",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm/testutil",
+ visibility = ["//pkg/sentry/platform/kvm:__pkg__"],
+)
diff --git a/pkg/sentry/platform/kvm/testutil/testutil.go b/pkg/sentry/platform/kvm/testutil/testutil.go
new file mode 100644
index 000000000..8a614e25d
--- /dev/null
+++ b/pkg/sentry/platform/kvm/testutil/testutil.go
@@ -0,0 +1,75 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package testutil provides common assembly stubs for testing.
+package testutil
+
+import (
+ "fmt"
+ "strings"
+)
+
+// Getpid executes a trivial system call.
+func Getpid()
+
+// Touch touches the value in the first register.
+func Touch()
+
+// SyscallLoop executes a syscall and loops.
+func SyscallLoop()
+
+// SpinLoop spins on the CPU.
+func SpinLoop()
+
+// HaltLoop immediately halts and loops.
+func HaltLoop()
+
+// TwiddleRegsFault twiddles registers then faults.
+func TwiddleRegsFault()
+
+// TwiddleRegsSyscall twiddles registers then executes a syscall.
+func TwiddleRegsSyscall()
+
+// TwiddleSegments reads segments into known registers.
+func TwiddleSegments()
+
+// FloatingPointWorks is a floating point test.
+//
+// It returns true or false.
+func FloatingPointWorks() bool
+
+// RegisterMismatchError is used for checking registers.
+type RegisterMismatchError []string
+
+// Error returns a human-readable error.
+func (r RegisterMismatchError) Error() string {
+	// r's underlying type is []string, so it may be joined directly.
+	return strings.Join(r, ";")
+}
+
+// addRegisterMismatch allows simple chaining of register mismatches.
+//
+// A nil err starts a fresh RegisterMismatchError; an existing
+// RegisterMismatchError accumulates the new entry; any other error is
+// returned unchanged.
+func addRegisterMismatch(err error, reg string, got, expected interface{}) error {
+ errStr := fmt.Sprintf("%s got %08x, expected %08x", reg, got, expected)
+ switch r := err.(type) {
+ case nil:
+ // Return a new register mismatch.
+ return RegisterMismatchError{errStr}
+ case RegisterMismatchError:
+ // Append the error.
+ r = append(r, errStr)
+ return r
+ default:
+ // Leave as is.
+ return err
+ }
+}
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.go b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go
new file mode 100644
index 000000000..39286a0af
--- /dev/null
+++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.go
@@ -0,0 +1,135 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package testutil
+
+import (
+ "reflect"
+ "syscall"
+)
+
+// SetTestTarget sets the rip appropriately.
+//
+// Execution resumed with these registers will start at fn's entry point.
+func SetTestTarget(regs *syscall.PtraceRegs, fn func()) {
+ regs.Rip = uint64(reflect.ValueOf(fn).Pointer())
+}
+
+// SetTouchTarget sets rax appropriately.
+//
+// A nil target clears rax instead.
+func SetTouchTarget(regs *syscall.PtraceRegs, target *uintptr) {
+	regs.Rax = 0
+	if target != nil {
+		regs.Rax = uint64(reflect.ValueOf(target).Pointer())
+	}
+}
+
+// RewindSyscall rewinds a syscall RIP.
+//
+// The SYSCALL instruction is two bytes long, so backing rip up by two
+// causes the syscall to be re-executed on resume.
+func RewindSyscall(regs *syscall.PtraceRegs) {
+ regs.Rip -= 2
+}
+
+// SetTestRegs initializes registers to known values.
+//
+// Each register receives a distinct, recognizable constant so that
+// CheckTestRegs can verify the TWIDDLE_REGS transformation.
+func SetTestRegs(regs *syscall.PtraceRegs) {
+	regs.Rax = 0x44
+	regs.Rbx = 0xb4
+	regs.Rcx = 0xc4
+	regs.Rdx = 0xd4
+	regs.Rsi = 0x51
+	regs.Rdi = 0xd1
+	regs.Rbp = 0xb9
+	regs.Rsp = 0x59
+	regs.R8 = 0x08
+	regs.R9 = 0x09
+	regs.R10 = 0x10
+	regs.R11 = 0x11
+	regs.R12 = 0x12
+	regs.R13 = 0x13
+	regs.R14 = 0x14
+	regs.R15 = 0x15
+}
+
+// CheckTestRegs checks that registers were twiddled per TwiddleRegs.
+func CheckTestRegs(regs *syscall.PtraceRegs, full bool) (err error) {
+ if need := ^uint64(0x15); regs.R15 != need {
+ err = addRegisterMismatch(err, "R15", regs.R15, need)
+ }
+ if need := ^uint64(0x14); regs.R14 != need {
+ err = addRegisterMismatch(err, "R14", regs.R14, need)
+ }
+ if need := ^uint64(0x13); regs.R13 != need {
+ err = addRegisterMismatch(err, "R13", regs.R13, need)
+ }
+ if need := ^uint64(0x12); regs.R12 != need {
+ err = addRegisterMismatch(err, "R12", regs.R12, need)
+ }
+ if need := ^uint64(0xb9); regs.Rbp != need {
+ err = addRegisterMismatch(err, "Rbp", regs.Rbp, need)
+ }
+ if need := ^uint64(0xb4); regs.Rbx != need {
+ err = addRegisterMismatch(err, "Rbx", regs.Rbx, need)
+ }
+ if need := ^uint64(0x10); regs.R10 != need {
+ err = addRegisterMismatch(err, "R10", regs.R10, need)
+ }
+ if need := ^uint64(0x09); regs.R9 != need {
+ err = addRegisterMismatch(err, "R9", regs.R9, need)
+ }
+ if need := ^uint64(0x08); regs.R8 != need {
+ err = addRegisterMismatch(err, "R8", regs.R8, need)
+ }
+ if need := ^uint64(0x44); regs.Rax != need {
+ err = addRegisterMismatch(err, "Rax", regs.Rax, need)
+ }
+ if need := ^uint64(0xd4); regs.Rdx != need {
+ err = addRegisterMismatch(err, "Rdx", regs.Rdx, need)
+ }
+ if need := ^uint64(0x51); regs.Rsi != need {
+ err = addRegisterMismatch(err, "Rsi", regs.Rsi, need)
+ }
+ if need := ^uint64(0xd1); regs.Rdi != need {
+ err = addRegisterMismatch(err, "Rdi", regs.Rdi, need)
+ }
+ if need := ^uint64(0x59); regs.Rsp != need {
+ err = addRegisterMismatch(err, "Rsp", regs.Rsp, need)
+ }
+ // Rcx & R11 are ignored if !full is set.
+ if need := ^uint64(0x11); full && regs.R11 != need {
+ err = addRegisterMismatch(err, "R11", regs.R11, need)
+ }
+ if need := ^uint64(0xc4); full && regs.Rcx != need {
+ err = addRegisterMismatch(err, "Rcx", regs.Rcx, need)
+ }
+ return
+}
+
+// fsData and gsData hold known values that TwiddleSegments reads through
+// the FS and GS segment bases respectively.
+var fsData uint64 = 0x55
+var gsData uint64 = 0x85
+
+// SetTestSegments initializes segments to known values.
+//
+// The FS and GS bases are pointed at fsData and gsData so segment-relative
+// loads observe those values.
+func SetTestSegments(regs *syscall.PtraceRegs) {
+	regs.Fs_base = uint64(reflect.ValueOf(&fsData).Pointer())
+	regs.Gs_base = uint64(reflect.ValueOf(&gsData).Pointer())
+}
+
+// CheckTestSegments checks that registers were twiddled per TwiddleSegments.
+//
+// TwiddleSegments leaves the FS-relative load in rax and the GS-relative
+// load in rbx.
+func CheckTestSegments(regs *syscall.PtraceRegs) (err error) {
+	if regs.Rax != fsData {
+		err = addRegisterMismatch(err, "Rax", regs.Rax, fsData)
+	}
+	if regs.Rbx != gsData {
+		// Report the actual rbx value; previously this mistakenly
+		// reported rcx, making mismatch messages misleading.
+		err = addRegisterMismatch(err, "Rbx", regs.Rbx, gsData)
+	}
+	return
+}
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_amd64.s b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s
new file mode 100644
index 000000000..3b5ad8817
--- /dev/null
+++ b/pkg/sentry/platform/kvm/testutil/testutil_amd64.s
@@ -0,0 +1,98 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+// test_util_amd64.s provides AMD64 test functions.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+// Getpid executes a single getpid syscall and returns.
+TEXT ·Getpid(SB),NOSPLIT,$0
+ NO_LOCAL_POINTERS
+ MOVQ $39, AX // getpid
+ SYSCALL
+ RET
+
+// Touch forever dereferences the address in AX (faulting if unmapped),
+// then issues a getpid syscall.
+TEXT ·Touch(SB),NOSPLIT,$0
+start:
+ MOVQ 0(AX), BX // deref AX
+ MOVQ $39, AX // getpid
+ SYSCALL
+ JMP start
+
+// HaltLoop executes HLT in a tight loop. HLT is a privileged instruction,
+// so this traps unless run in a privileged (guest ring 0) context.
+TEXT ·HaltLoop(SB),NOSPLIT,$0
+start:
+ HLT
+ JMP start
+
+// SyscallLoop issues syscalls forever.
+TEXT ·SyscallLoop(SB),NOSPLIT,$0
+start:
+ SYSCALL
+ JMP start
+
+// SpinLoop spins on the CPU without trapping.
+TEXT ·SpinLoop(SB),NOSPLIT,$0
+start:
+ JMP start
+
+// FloatingPointWorks loads 1 into X0, performs a getpid syscall, and
+// returns true iff X0 still holds 1 afterwards — i.e. floating point
+// state survived the syscall.
+TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8
+ NO_LOCAL_POINTERS
+ MOVQ $1, AX
+ MOVQ AX, X0
+ MOVQ $39, AX // getpid
+ SYSCALL
+ MOVQ X0, AX
+ CMPQ AX, $1
+ SETEQ ret+0(FP)
+ RET
+
+// TWIDDLE_REGS complements every general purpose register, including the
+// stack pointer, matching the expectations of CheckTestRegs.
+#define TWIDDLE_REGS() \
+ NOTQ R15; \
+ NOTQ R14; \
+ NOTQ R13; \
+ NOTQ R12; \
+ NOTQ BP; \
+ NOTQ BX; \
+ NOTQ R11; \
+ NOTQ R10; \
+ NOTQ R9; \
+ NOTQ R8; \
+ NOTQ AX; \
+ NOTQ CX; \
+ NOTQ DX; \
+ NOTQ SI; \
+ NOTQ DI; \
+ NOTQ SP;
+
+// TwiddleRegsSyscall complements all registers then issues a syscall.
+TEXT ·TwiddleRegsSyscall(SB),NOSPLIT,$0
+ TWIDDLE_REGS()
+ SYSCALL
+ RET // never reached
+
+// TwiddleRegsFault complements all registers then jumps through the
+// (complemented, hence bogus) AX, which must fault.
+TEXT ·TwiddleRegsFault(SB),NOSPLIT,$0
+ TWIDDLE_REGS()
+ JMP AX // must fault
+ RET // never reached
+
+// READ_FS is the raw encoding of "movq %fs:(%rax), %rax": an FS-relative
+// load the Go assembler cannot express directly (0x64 is the FS prefix).
+#define READ_FS() BYTE $0x64; BYTE $0x48; BYTE $0x8b; BYTE $0x00;
+// READ_GS is the raw encoding of "movq %gs:(%rax), %rax" (0x65 GS prefix).
+#define READ_GS() BYTE $0x65; BYTE $0x48; BYTE $0x8b; BYTE $0x00;
+
+// TwiddleSegments loads through the GS base into BX and the FS base into
+// AX, then issues a syscall so the values can be inspected (see
+// CheckTestSegments).
+TEXT ·TwiddleSegments(SB),NOSPLIT,$0
+ MOVQ $0x0, AX
+ READ_GS()
+ MOVQ AX, BX
+ MOVQ $0x0, AX
+ READ_FS()
+ SYSCALL
+ RET // never reached
diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go
new file mode 100644
index 000000000..0d3fbe043
--- /dev/null
+++ b/pkg/sentry/platform/kvm/virtual_map.go
@@ -0,0 +1,113 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "os"
+ "regexp"
+ "strconv"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// virtualRegion describes a single parsed entry from /proc/self/maps.
+type virtualRegion struct {
+ region
+ accessType usermem.AccessType // Read/write/execute permission bits.
+ shared bool // True for shared ('s') mappings.
+ offset uintptr // File offset of the mapping.
+ filename string // Backing file, or a pseudo-name like "[vdso]".
+}
+
+// mapsLine matches a single line from /proc/PID/maps.
+var mapsLine = regexp.MustCompile("([0-9a-f]+)-([0-9a-f]+) ([r-][w-][x-][sp]) ([0-9a-f]+) [0-9a-f]{2}:[0-9a-f]{2,} [0-9]+\\s+(.*)")
+
+// excludeVirtualRegion returns true if these regions should be excluded
+// from the physical map. Virtual regions need to be excluded if
+// get_user_pages will fail on those addresses, preventing KVM from
+// satisfying EPT faults.
+//
+// This includes the VVAR page because the VVAR page may be mapped as I/O
+// memory. And the VDSO page is knocked out because the VVAR page is not even
+// recorded in /proc/self/maps on older kernels; knocking out the VDSO page
+// prevents code in the VDSO from accessing the VVAR address.
+//
+// This is called by the physical map functions, not applyVirtualRegions.
+func excludeVirtualRegion(r virtualRegion) bool {
+	switch r.filename {
+	case "[vvar]", "[vdso]":
+		return true
+	default:
+		return false
+	}
+}
+
+// applyVirtualRegions parses the process maps file.
+//
+// Unlike mappedRegions, these are not consistent over time.
+func applyVirtualRegions(fn func(vr virtualRegion)) error {
+	// Open /proc/self/maps.
+	f, err := os.Open("/proc/self/maps")
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	// Parse all entries.
+	r := bufio.NewReader(f)
+	for {
+		b, readErr := r.ReadBytes('\n')
+		// A final line without a trailing newline arrives with
+		// readErr == io.EOF and must still be parsed.
+		if len(b) > 0 {
+			vr, err := parseMapsLine(b)
+			if err != nil {
+				return err
+			}
+			fn(vr)
+		}
+		if readErr == io.EOF {
+			break
+		} else if readErr != nil {
+			return readErr
+		}
+	}
+
+	return nil
+}
+
+// parseMapsLine parses a single /proc/self/maps line into a virtualRegion.
+func parseMapsLine(b []byte) (virtualRegion, error) {
+	m := mapsLine.FindSubmatch(b)
+	if m == nil {
+		// This should not happen: kernel bug?
+		return virtualRegion{}, fmt.Errorf("badly formed line: %v", string(b))
+	}
+	start, err := strconv.ParseUint(string(m[1]), 16, 64)
+	if err != nil {
+		return virtualRegion{}, fmt.Errorf("bad start address: %v", string(b))
+	}
+	end, err := strconv.ParseUint(string(m[2]), 16, 64)
+	if err != nil {
+		return virtualRegion{}, fmt.Errorf("bad end address: %v", string(b))
+	}
+	offset, err := strconv.ParseUint(string(m[4]), 16, 64)
+	if err != nil {
+		return virtualRegion{}, fmt.Errorf("bad offset: %v", string(b))
+	}
+	return virtualRegion{
+		region: region{
+			virtual: uintptr(start),
+			length:  uintptr(end - start),
+		},
+		accessType: usermem.AccessType{
+			Read:    m[3][0] == 'r',
+			Write:   m[3][1] == 'w',
+			Execute: m[3][2] == 'x',
+		},
+		shared:   m[3][3] == 's',
+		offset:   uintptr(offset),
+		filename: string(m[5]),
+	}, nil
+}
diff --git a/pkg/sentry/platform/kvm/virtual_map_test.go b/pkg/sentry/platform/kvm/virtual_map_test.go
new file mode 100644
index 000000000..31e5b0e61
--- /dev/null
+++ b/pkg/sentry/platform/kvm/virtual_map_test.go
@@ -0,0 +1,78 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "syscall"
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// checker records whether any visited region contained a probed address.
+type checker struct {
+ ok bool
+}
+
+// Contains returns a visitor that sets c.ok if any visited region covers
+// addr. The flag is reset each time Contains is called, so one checker can
+// be reused across multiple applyVirtualRegions passes.
+func (c *checker) Contains(addr uintptr) func(virtualRegion) {
+ c.ok = false // Reset for below calls.
+ return func(vr virtualRegion) {
+ if vr.virtual <= addr && addr < vr.virtual+vr.length {
+ c.ok = true
+ }
+ }
+}
+
+// TestParseMaps verifies that applyVirtualRegions reflects live mmap and
+// munmap changes in the current address space.
+func TestParseMaps(t *testing.T) {
+ c := new(checker)
+
+ // Simple test.
+ if err := applyVirtualRegions(c.Contains(0)); err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ // MMap a new page.
+ addr, _, errno := syscall.RawSyscall6(
+ syscall.SYS_MMAP, 0, usermem.PageSize,
+ syscall.PROT_READ|syscall.PROT_WRITE,
+ syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE, 0, 0)
+ if errno != 0 {
+ t.Fatalf("unexpected map error: %v", errno)
+ }
+
+ // Re-parse maps.
+ if err := applyVirtualRegions(c.Contains(addr)); err != nil {
+ // Unmap before failing: Fatalf stops the test immediately.
+ syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0)
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ // Assert that it now does contain the region.
+ if !c.ok {
+ // Unmap before failing: Fatalf stops the test immediately.
+ syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0)
+ t.Fatalf("updated map does not contain 0x%08x, expected true", addr)
+ }
+
+ // Unmap the region.
+ syscall.RawSyscall(syscall.SYS_MUNMAP, addr, usermem.PageSize, 0)
+
+ // Re-parse maps.
+ if err := applyVirtualRegions(c.Contains(addr)); err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+
+ // Assert that it once again does _not_ contain the region.
+ if c.ok {
+ t.Fatalf("final map does contain 0x%08x, expected false", addr)
+ }
+}