-rw-r--r--   pkg/sentry/platform/kvm/BUILD                     |   1
-rw-r--r--   pkg/sentry/platform/kvm/address_space.go          |  67
-rw-r--r--   pkg/sentry/platform/kvm/address_space_unsafe.go   |  44
-rw-r--r--   pkg/sentry/platform/kvm/kvm.go                    |   4
-rw-r--r--   pkg/sentry/platform/kvm/machine.go                | 131
-rw-r--r--   pkg/sentry/platform/kvm/machine_amd64.go          |   2
-rw-r--r--   pkg/sentry/platform/kvm/machine_amd64_unsafe.go   |   4
7 files changed, 123 insertions, 130 deletions
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 135861368..673393fad 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -27,7 +27,6 @@ go_library(
name = "kvm",
srcs = [
"address_space.go",
- "address_space_unsafe.go",
"allocator.go",
"bluepill.go",
"bluepill_amd64.go",
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
index f74c98dd0..fbd11ed71 100644
--- a/pkg/sentry/platform/kvm/address_space.go
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -17,13 +17,62 @@ package kvm
import (
"reflect"
"sync"
+ "sync/atomic"
+ "gvisor.googlesource.com/gvisor/pkg/atomicbitops"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)
+type vCPUBitArray [(_KVM_NR_VCPUS + 63) / 64]uint64
+
+// dirtySet tracks vCPUs for invalidation.
+type dirtySet struct {
+ vCPUs vCPUBitArray
+}
+
+// forEach iterates over all CPUs in the dirty set.
+func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
+ var localSet vCPUBitArray
+ for index := 0; index < len(ds.vCPUs); index++ {
+ // Clear the dirty set, copy to the local one.
+ localSet[index] = atomic.SwapUint64(&ds.vCPUs[index], 0)
+ }
+
+ m.mu.RLock()
+ defer m.mu.RUnlock()
+
+ for _, c := range m.vCPUs {
+ index := uint64(c.id) / 64
+ bit := uint64(1) << uint(c.id%64)
+
+ // Call the function if it was set.
+ if localSet[index]&bit != 0 {
+ fn(c)
+ }
+ }
+}
+
+// mark marks the given vCPU as dirty and returns whether it was previously
+// clean. Being previously clean implies that a flush is needed on entry.
+func (ds *dirtySet) mark(c *vCPU) bool {
+ index := uint64(c.id) / 64
+ bit := uint64(1) << uint(c.id%64)
+
+ oldValue := atomic.LoadUint64(&ds.vCPUs[index])
+ if oldValue&bit != 0 {
+ return false // Not clean.
+ }
+
+ // Set the bit unilaterally, and ensure that a flush takes place. Note
+ // that it's possible for races to occur here, but since the flush is
+ // taking place long after these lines there's no race in practice.
+ atomicbitops.OrUint64(&ds.vCPUs[index], bit)
+ return true // Previously clean.
+}
+
// addressSpace is a wrapper for PageTables.
type addressSpace struct {
platform.NoAddressSpaceIO
@@ -43,10 +92,6 @@ type addressSpace struct {
pageTables *pagetables.PageTables
// dirtySet is the set of dirty vCPUs.
- //
- // These are actually vCPU pointers that are stored iff the vCPU is
- // dirty. If the vCPU is not dirty and requires invalidation, then a
- // nil value is stored here instead.
dirtySet dirtySet
// files contains files mapped in the host address space.
@@ -57,11 +102,11 @@ type addressSpace struct {
// invalidate is the implementation for Invalidate.
func (as *addressSpace) invalidate() {
- for i := 0; i < as.dirtySet.size(); i++ {
- if c := as.dirtySet.swap(i, nil); c != nil && c.active.get() == as {
- c.BounceToKernel() // Force a kernel transition.
+ as.dirtySet.forEach(as.machine, func(c *vCPU) {
+ if c.active.get() == as { // If this happens to be active,
+ c.BounceToKernel() // ... force a kernel transition.
}
- }
+ })
}
// Invalidate interrupts all dirty contexts.
@@ -75,11 +120,7 @@ func (as *addressSpace) Invalidate() {
//
// The return value indicates whether a flush is required.
func (as *addressSpace) Touch(c *vCPU) bool {
- if old := as.dirtySet.swap(c.id, c); old == nil {
- return true // Flush is required.
- }
- // Already dirty: no flush required.
- return false
+ return as.dirtySet.mark(c)
}
func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
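A minimal, self-contained sketch of the bit-array bookkeeping the new dirtySet uses: one bit per vCPU id, one uint64 word per 64 ids, with mark() claiming a bit and a swap-then-iterate pass clearing the set. It does not reuse the patch's types, it substitutes a standard-library compare-and-swap loop for the patch's atomicbitops.OrUint64 helper, and the 256-vCPU cap and all names are illustrative only.

package main

import (
	"fmt"
	"sync/atomic"
)

const maxVCPUs = 256 // Illustrative cap; the patch sizes the array by _KVM_NR_VCPUS.

type dirtySet struct {
	words [(maxVCPUs + 63) / 64]uint64
}

// mark sets the bit for id and reports whether it was previously clear,
// i.e. whether the caller needs to flush on entry.
func (ds *dirtySet) mark(id int) bool {
	word, bit := id/64, uint64(1)<<uint(id%64)
	for {
		old := atomic.LoadUint64(&ds.words[word])
		if old&bit != 0 {
			return false // Already dirty; no flush required.
		}
		// CAS loop stands in for atomicbitops.OrUint64 in the real patch.
		if atomic.CompareAndSwapUint64(&ds.words[word], old, old|bit) {
			return true // Previously clean; flush required.
		}
	}
}

// drain atomically clears the set and invokes fn for every id that was
// dirty, mirroring the swap-then-iterate structure of forEach above.
func (ds *dirtySet) drain(fn func(id int)) {
	for w := range ds.words {
		bits := atomic.SwapUint64(&ds.words[w], 0)
		for b := 0; b < 64; b++ {
			if bits&(uint64(1)<<uint(b)) != 0 {
				fn(w*64 + b)
			}
		}
	}
}

func main() {
	var ds dirtySet
	fmt.Println(ds.mark(3)) // true: vCPU 3 was clean, a flush is needed.
	fmt.Println(ds.mark(3)) // false: already marked dirty.
	ds.drain(func(id int) { fmt.Println("invalidate vCPU", id) })
}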
diff --git a/pkg/sentry/platform/kvm/address_space_unsafe.go b/pkg/sentry/platform/kvm/address_space_unsafe.go
deleted file mode 100644
index b6c31ce10..000000000
--- a/pkg/sentry/platform/kvm/address_space_unsafe.go
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2018 Google Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package kvm
-
-import (
- "sync/atomic"
- "unsafe"
-)
-
-// dirtySet tracks vCPUs for invalidation.
-type dirtySet struct {
- vCPUs []unsafe.Pointer
-}
-
-// makeDirtySet makes a new dirtySet.
-func makeDirtySet(size int) dirtySet {
- return dirtySet{
- vCPUs: make([]unsafe.Pointer, size),
- }
-}
-
-// size is the size of the set.
-func (ds *dirtySet) size() int {
- return len(ds.vCPUs)
-}
-
-// swap sets the given index and returns the previous value.
-//
-// The index is typically the id for a non-nil vCPU.
-func (ds *dirtySet) swap(index int, c *vCPU) *vCPU {
- return (*vCPU)(atomic.SwapPointer(&ds.vCPUs[index], unsafe.Pointer(c)))
-}
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index 1a8e16ca0..3ed057881 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -17,7 +17,6 @@ package kvm
import (
"fmt"
- "runtime"
"sync"
"syscall"
@@ -77,7 +76,7 @@ func New() (*KVM, error) {
}
// Create a VM context.
- machine, err := newMachine(int(vm), runtime.NumCPU())
+ machine, err := newMachine(int(vm))
if err != nil {
return nil, err
}
@@ -137,7 +136,6 @@ func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan stru
filemem: k.FileMem,
machine: k.machine,
pageTables: pageTables,
- dirtySet: makeDirtySet(len(k.machine.vCPUs)),
}, nil, nil
}
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index f045345d5..abdc51431 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -46,16 +46,14 @@ type machine struct {
mappingCache sync.Map
// mu protects vCPUs.
- mu sync.Mutex
+ mu sync.RWMutex
// available is notified when vCPUs are available.
available sync.Cond
// vCPUs are the machine vCPUs.
//
- // This is eventually keyed by system TID, but is initially indexed by
- // the negative vCPU id. This is merely an optimization, so while
- // collisions here are not possible, it wouldn't matter anyways.
+ // These are populated dynamically.
vCPUs map[uint64]*vCPU
}
@@ -117,73 +115,65 @@ type vCPU struct {
vCPUArchState
}
+// newVCPU creates and returns a new vCPU.
+//
+// Precondition: mu must be held.
+func (m *machine) newVCPU() *vCPU {
+ id := len(m.vCPUs)
+
+ // Create the vCPU.
+ fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
+ if errno != 0 {
+ panic(fmt.Sprintf("error creating new vCPU: %v", errno))
+ }
+
+ c := &vCPU{
+ id: id,
+ fd: int(fd),
+ machine: m,
+ }
+ c.CPU.Init(&m.kernel)
+ c.CPU.KernelSyscall = bluepillSyscall
+ c.CPU.KernelException = bluepillException
+
+ // Ensure the signal mask is correct.
+ if err := c.setSignalMask(); err != nil {
+ panic(fmt.Sprintf("error setting signal mask: %v", err))
+ }
+
+ // Initialize architecture state.
+ if err := c.initArchState(); err != nil {
+ panic(fmt.Sprintf("error initialization vCPU state: %v", err))
+ }
+
+ // Map the run data.
+ runData, err := mapRunData(int(fd))
+ if err != nil {
+ panic(fmt.Sprintf("error mapping run data: %v", err))
+ }
+ c.runData = runData
+
+ return c // Done.
+}
+
// newMachine returns a new VM context.
-func newMachine(vm int, vCPUs int) (*machine, error) {
+func newMachine(vm int) (*machine, error) {
// Create the machine.
m := &machine{
fd: vm,
vCPUs: make(map[uint64]*vCPU),
}
m.available.L = &m.mu
- if vCPUs > _KVM_NR_VCPUS {
- // Hard cap at KVM's limit.
- vCPUs = _KVM_NR_VCPUS
- }
- if n := 2 * runtime.NumCPU(); vCPUs > n {
- // Cap at twice the number of physical cores. Otherwise we're
- // just wasting memory and thrashing. (There may be scheduling
- // issues when you've got > n active threads.)
- vCPUs = n
- }
m.kernel.Init(ring0.KernelOpts{
PageTables: pagetables.New(newAllocator()),
})
// Initialize architecture state.
- if err := m.initArchState(vCPUs); err != nil {
+ if err := m.initArchState(); err != nil {
m.Destroy()
return nil, err
}
- // Create all the vCPUs.
- for id := 0; id < vCPUs; id++ {
- // Create the vCPU.
- fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(vm), _KVM_CREATE_VCPU, uintptr(id))
- if errno != 0 {
- m.Destroy()
- return nil, fmt.Errorf("error creating VCPU: %v", errno)
- }
- c := &vCPU{
- id: id,
- fd: int(fd),
- machine: m,
- }
- c.CPU.Init(&m.kernel)
- c.CPU.KernelSyscall = bluepillSyscall
- c.CPU.KernelException = bluepillException
- m.vCPUs[uint64(-id)] = c // See above.
-
- // Ensure the signal mask is correct.
- if err := c.setSignalMask(); err != nil {
- m.Destroy()
- return nil, err
- }
-
- // Initialize architecture state.
- if err := c.initArchState(); err != nil {
- m.Destroy()
- return nil, err
- }
-
- // Map the run data.
- runData, err := mapRunData(int(fd))
- if err != nil {
- m.Destroy()
- return nil, err
- }
- c.runData = runData
- }
-
// Apply the physical mappings. Note that these mappings may point to
// guest physical addresses that are not actually available. These
// physical pages are mapped on demand, see kernel_unsafe.go.
@@ -298,15 +288,20 @@ func (m *machine) Destroy() {
func (m *machine) Get() *vCPU {
runtime.LockOSThread()
tid := procid.Current()
- m.mu.Lock()
+ m.mu.RLock()
// Check for an exact match.
if c := m.vCPUs[tid]; c != nil {
c.lock()
- m.mu.Unlock()
+ m.mu.RUnlock()
return c
}
+ // The happy path failed. We now proceed to acquire an exclusive lock
+ // (because the vCPU map may change), and scan all available vCPUs.
+ m.mu.RUnlock()
+ m.mu.Lock()
+
for {
// Scan for an available vCPU.
for origTID, c := range m.vCPUs {
@@ -314,16 +309,21 @@ func (m *machine) Get() *vCPU {
delete(m.vCPUs, origTID)
m.vCPUs[tid] = c
m.mu.Unlock()
-
- // We need to reload thread-local segments as
- // we have origTID != tid and the vCPU state
- // may be stale.
- c.loadSegments()
- atomic.StoreUint64(&c.tid, tid)
+ c.loadSegments(tid)
return c
}
}
+ // Create a new vCPU (maybe).
+ if len(m.vCPUs) < _KVM_NR_VCPUS {
+ c := m.newVCPU()
+ c.lock()
+ m.vCPUs[tid] = c
+ m.mu.Unlock()
+ c.loadSegments(tid)
+ return c
+ }
+
// Scan for something not in user mode.
for origTID, c := range m.vCPUs {
if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) {
@@ -346,10 +346,7 @@ func (m *machine) Get() *vCPU {
delete(m.vCPUs, origTID)
m.vCPUs[tid] = c
m.mu.Unlock()
-
- // See above.
- c.loadSegments()
- atomic.StoreUint64(&c.tid, tid)
+ c.loadSegments(tid)
return c
}
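The reworked Get() above follows a common RWMutex discipline: the exact-tid lookup runs under the read lock, and only on a miss does the code take the exclusive lock that permits mutating or growing the vCPU map. The standalone sketch below illustrates that discipline in isolation, with a plain string map standing in for the vCPU map; it is a pattern sketch under those assumptions, not the machine code itself (in particular, the real slow path steals or creates vCPUs rather than re-checking a cache).

package main

import (
	"fmt"
	"sync"
)

type pool struct {
	mu    sync.RWMutex // Read lock for lookups, write lock for changes.
	items map[uint64]string
	limit int
}

// get returns the entry for key, creating one under the exclusive lock if
// the read-locked fast path misses and the pool is below its limit.
func (p *pool) get(key uint64) (string, bool) {
	// Fast path: the common case is an exact match under the read lock.
	p.mu.RLock()
	if v, ok := p.items[key]; ok {
		p.mu.RUnlock()
		return v, true
	}
	p.mu.RUnlock()

	// Slow path: the map may change, so take the exclusive lock. Another
	// goroutine may have raced in between the two acquisitions, so check
	// again before creating anything.
	p.mu.Lock()
	defer p.mu.Unlock()
	if v, ok := p.items[key]; ok {
		return v, true
	}
	if len(p.items) < p.limit {
		v := fmt.Sprintf("item-%d", key)
		p.items[key] = v
		return v, true
	}
	return "", false // At capacity; Get() instead steals an idle vCPU here.
}

func main() {
	p := &pool{items: make(map[uint64]string), limit: 4}
	fmt.Println(p.get(42)) // Slow path: created.
	fmt.Println(p.get(42)) // Fast path: found under the read lock.
}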
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 52896eefe..9af4f3f3d 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -29,7 +29,7 @@ import (
)
// initArchState initializes architecture-specific state.
-func (m *machine) initArchState(vCPUs int) error {
+func (m *machine) initArchState() error {
// Set the legacy TSS address. This address is covered by the reserved
// range (up to 4GB). In fact, this is a main reason it exists.
if _, _, errno := syscall.RawSyscall(
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index c2bcb3a47..8b9041f13 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -18,6 +18,7 @@ package kvm
import (
"fmt"
+ "sync/atomic"
"syscall"
"unsafe"
@@ -54,7 +55,7 @@ func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) s
// This may be called from within the signal context and throws on error.
//
//go:nosplit
-func (c *vCPU) loadSegments() {
+func (c *vCPU) loadSegments(tid uint64) {
if _, _, errno := syscall.RawSyscall(
syscall.SYS_ARCH_PRCTL,
linux.ARCH_GET_FS,
@@ -69,6 +70,7 @@ func (c *vCPU) loadSegments() {
0); errno != 0 {
throw("getting GS segment")
}
+ atomic.StoreUint64(&c.tid, tid)
}
// setUserRegisters sets user registers in the vCPU.
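With the tid store folded into loadSegments above, the handoff ordering becomes: re-read the thread-local FS/GS bases first, then publish c.tid atomically, so a Get() that matches on tid never adopts a vCPU whose segment state is stale. The sketch below mirrors that ordering for linux/amd64 only; the archGetFS constant and the vcpu struct are defined locally for illustration and are not the patch's types.

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
	"syscall"
	"unsafe"
)

const archGetFS = 0x1003 // Linux ARCH_GET_FS, defined here for the sketch.

type vcpu struct {
	tid    uint64
	fsBase uint64
}

// adopt mirrors loadSegments(tid): refresh the thread-local segment base
// first, and only then publish the owning tid with an atomic store.
func (c *vcpu) adopt(tid uint64) error {
	if _, _, errno := syscall.RawSyscall(
		syscall.SYS_ARCH_PRCTL,
		archGetFS,
		uintptr(unsafe.Pointer(&c.fsBase)),
		0); errno != 0 {
		return errno
	}
	atomic.StoreUint64(&c.tid, tid) // Publish only after the state is fresh.
	return nil
}

func main() {
	// Pin to one OS thread so Gettid and arch_prctl refer to the same thread,
	// as machine.Get does before looking up a vCPU.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	c := &vcpu{}
	if err := c.adopt(uint64(syscall.Gettid())); err != nil {
		panic(err)
	}
	fmt.Printf("tid=%d fsBase=%#x\n", c.tid, c.fsBase)
}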