-rw-r--r-- | pkg/sentry/platform/kvm/BUILD                   |   1
-rw-r--r-- | pkg/sentry/platform/kvm/address_space.go        |  67
-rw-r--r-- | pkg/sentry/platform/kvm/address_space_unsafe.go |  44
-rw-r--r-- | pkg/sentry/platform/kvm/kvm.go                  |   4
-rw-r--r-- | pkg/sentry/platform/kvm/machine.go              | 131
-rw-r--r-- | pkg/sentry/platform/kvm/machine_amd64.go        |   2
-rw-r--r-- | pkg/sentry/platform/kvm/machine_amd64_unsafe.go |   4
7 files changed, 123 insertions, 130 deletions
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 135861368..673393fad 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -27,7 +27,6 @@ go_library(
     name = "kvm",
     srcs = [
         "address_space.go",
-        "address_space_unsafe.go",
         "allocator.go",
         "bluepill.go",
         "bluepill_amd64.go",
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
index f74c98dd0..fbd11ed71 100644
--- a/pkg/sentry/platform/kvm/address_space.go
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -17,13 +17,62 @@ package kvm
 import (
 	"reflect"
 	"sync"
+	"sync/atomic"
 
+	"gvisor.googlesource.com/gvisor/pkg/atomicbitops"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
 )
 
+type vCPUBitArray [(_KVM_NR_VCPUS + 63) / 64]uint64
+
+// dirtySet tracks vCPUs for invalidation.
+type dirtySet struct {
+	vCPUs vCPUBitArray
+}
+
+// forEach iterates over all CPUs in the dirty set.
+func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
+	var localSet vCPUBitArray
+	for index := 0; index < len(ds.vCPUs); index++ {
+		// Clear the dirty set, copy to the local one.
+		localSet[index] = atomic.SwapUint64(&ds.vCPUs[index], 0)
+	}
+
+	m.mu.RLock()
+	defer m.mu.RUnlock()
+
+	for _, c := range m.vCPUs {
+		index := uint64(c.id) / 64
+		bit := uint64(1) << uint(c.id%64)
+
+		// Call the function if it was set.
+		if localSet[index]&bit != 0 {
+			fn(c)
+		}
+	}
+}
+
+// mark marks the given vCPU as dirty and returns whether it was previously
+// clean. Being previously clean implies that a flush is needed on entry.
+func (ds *dirtySet) mark(c *vCPU) bool {
+	index := uint64(c.id) / 64
+	bit := uint64(1) << uint(c.id%64)
+
+	oldValue := atomic.LoadUint64(&ds.vCPUs[index])
+	if oldValue&bit != 0 {
+		return false // Not clean.
+	}
+
+	// Set the bit unilaterally, and ensure that a flush takes place. Note
+	// that it's possible for races to occur here, but since the flush is
+	// taking place long after these lines there's no race in practice.
+	atomicbitops.OrUint64(&ds.vCPUs[index], bit)
+	return true // Previously clean.
+}
+
 // addressSpace is a wrapper for PageTables.
 type addressSpace struct {
 	platform.NoAddressSpaceIO
@@ -43,10 +92,6 @@ type addressSpace struct {
 	pageTables *pagetables.PageTables
 
 	// dirtySet is the set of dirty vCPUs.
-	//
-	// These are actually vCPU pointers that are stored iff the vCPU is
-	// dirty. If the vCPU is not dirty and requires invalidation, then a
-	// nil value is stored here instead.
 	dirtySet dirtySet
 
 	// files contains files mapped in the host address space.
@@ -57,11 +102,11 @@ type addressSpace struct {
 
 // invalidate is the implementation for Invalidate.
 func (as *addressSpace) invalidate() {
-	for i := 0; i < as.dirtySet.size(); i++ {
-		if c := as.dirtySet.swap(i, nil); c != nil && c.active.get() == as {
-			c.BounceToKernel() // Force a kernel transition.
+	as.dirtySet.forEach(as.machine, func(c *vCPU) {
+		if c.active.get() == as { // If this happens to be active,
+			c.BounceToKernel() // ... force a kernel transition.
 		}
-	}
+	})
 }
 
 // Invalidate interrupts all dirty contexts.
@@ -75,11 +120,7 @@ func (as *addressSpace) Invalidate() {
 //
 // The return value indicates whether a flush is required.
 func (as *addressSpace) Touch(c *vCPU) bool {
-	if old := as.dirtySet.swap(c.id, c); old == nil {
-		return true // Flush is required.
-	}
-	// Already dirty: no flush required.
-	return false
+	return as.dirtySet.mark(c)
 }
 
 func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
diff --git a/pkg/sentry/platform/kvm/address_space_unsafe.go b/pkg/sentry/platform/kvm/address_space_unsafe.go
deleted file mode 100644
index b6c31ce10..000000000
--- a/pkg/sentry/platform/kvm/address_space_unsafe.go
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2018 Google Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package kvm
-
-import (
-	"sync/atomic"
-	"unsafe"
-)
-
-// dirtySet tracks vCPUs for invalidation.
-type dirtySet struct {
-	vCPUs []unsafe.Pointer
-}
-
-// makeDirtySet makes a new dirtySet.
-func makeDirtySet(size int) dirtySet {
-	return dirtySet{
-		vCPUs: make([]unsafe.Pointer, size),
-	}
-}
-
-// size is the size of the set.
-func (ds *dirtySet) size() int {
-	return len(ds.vCPUs)
-}
-
-// swap sets the given index and returns the previous value.
-//
-// The index is typically the id for a non-nil vCPU.
-func (ds *dirtySet) swap(index int, c *vCPU) *vCPU {
-	return (*vCPU)(atomic.SwapPointer(&ds.vCPUs[index], unsafe.Pointer(c)))
-}
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index 1a8e16ca0..3ed057881 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -17,7 +17,6 @@ package kvm
 import (
 	"fmt"
-	"runtime"
 	"sync"
 	"syscall"
 
@@ -77,7 +76,7 @@ func New() (*KVM, error) {
 	}
 
 	// Create a VM context.
-	machine, err := newMachine(int(vm), runtime.NumCPU())
+	machine, err := newMachine(int(vm))
 	if err != nil {
 		return nil, err
 	}
@@ -137,7 +136,6 @@ func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan stru
 		filemem:    k.FileMem,
 		machine:    k.machine,
 		pageTables: pageTables,
-		dirtySet:   makeDirtySet(len(k.machine.vCPUs)),
 	}, nil, nil
 }
 
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index f045345d5..abdc51431 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -46,16 +46,14 @@ type machine struct {
 	mappingCache sync.Map
 
 	// mu protects vCPUs.
-	mu sync.Mutex
+	mu sync.RWMutex
 
 	// available is notified when vCPUs are available.
 	available sync.Cond
 
 	// vCPUs are the machine vCPUs.
 	//
-	// This is eventually keyed by system TID, but is initially indexed by
-	// the negative vCPU id. This is merely an optimization, so while
-	// collisions here are not possible, it wouldn't matter anyways.
+	// These are populated dynamically.
 	vCPUs map[uint64]*vCPU
 }
 
@@ -117,73 +115,65 @@ type vCPU struct {
 	vCPUArchState
 }
 
+// newVCPU creates a returns a new vCPU.
+//
+// Precondtion: mu must be held.
+func (m *machine) newVCPU() *vCPU {
+	id := len(m.vCPUs)
+
+	// Create the vCPU.
+	fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
+	if errno != 0 {
+		panic(fmt.Sprintf("error creating new vCPU: %v", errno))
+	}
+
+	c := &vCPU{
+		id:      id,
+		fd:      int(fd),
+		machine: m,
+	}
+	c.CPU.Init(&m.kernel)
+	c.CPU.KernelSyscall = bluepillSyscall
+	c.CPU.KernelException = bluepillException
+
+	// Ensure the signal mask is correct.
+	if err := c.setSignalMask(); err != nil {
+		panic(fmt.Sprintf("error setting signal mask: %v", err))
+	}
+
+	// Initialize architecture state.
+	if err := c.initArchState(); err != nil {
+		panic(fmt.Sprintf("error initialization vCPU state: %v", err))
+	}
+
+	// Map the run data.
+	runData, err := mapRunData(int(fd))
+	if err != nil {
+		panic(fmt.Sprintf("error mapping run data: %v", err))
+	}
+	c.runData = runData
+
+	return c // Done.
+}
+
 // newMachine returns a new VM context.
-func newMachine(vm int, vCPUs int) (*machine, error) {
+func newMachine(vm int) (*machine, error) {
 	// Create the machine.
 	m := &machine{
 		fd:    vm,
 		vCPUs: make(map[uint64]*vCPU),
 	}
 	m.available.L = &m.mu
-	if vCPUs > _KVM_NR_VCPUS {
-		// Hard cap at KVM's limit.
-		vCPUs = _KVM_NR_VCPUS
-	}
-	if n := 2 * runtime.NumCPU(); vCPUs > n {
-		// Cap at twice the number of physical cores. Otherwise we're
-		// just wasting memory and thrashing. (There may be scheduling
-		// issues when you've got > n active threads.)
-		vCPUs = n
-	}
 	m.kernel.Init(ring0.KernelOpts{
 		PageTables: pagetables.New(newAllocator()),
 	})
 
 	// Initialize architecture state.
-	if err := m.initArchState(vCPUs); err != nil {
+	if err := m.initArchState(); err != nil {
 		m.Destroy()
 		return nil, err
 	}
 
-	// Create all the vCPUs.
-	for id := 0; id < vCPUs; id++ {
-		// Create the vCPU.
-		fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(vm), _KVM_CREATE_VCPU, uintptr(id))
-		if errno != 0 {
-			m.Destroy()
-			return nil, fmt.Errorf("error creating VCPU: %v", errno)
-		}
-		c := &vCPU{
-			id:      id,
-			fd:      int(fd),
-			machine: m,
-		}
-		c.CPU.Init(&m.kernel)
-		c.CPU.KernelSyscall = bluepillSyscall
-		c.CPU.KernelException = bluepillException
-		m.vCPUs[uint64(-id)] = c // See above.
-
-		// Ensure the signal mask is correct.
-		if err := c.setSignalMask(); err != nil {
-			m.Destroy()
-			return nil, err
-		}
-
-		// Initialize architecture state.
-		if err := c.initArchState(); err != nil {
-			m.Destroy()
-			return nil, err
-		}
-
-		// Map the run data.
-		runData, err := mapRunData(int(fd))
-		if err != nil {
-			m.Destroy()
-			return nil, err
-		}
-		c.runData = runData
-	}
-
 	// Apply the physical mappings. Note that these mappings may point to
 	// guest physical addresses that are not actually available. These
 	// physical pages are mapped on demand, see kernel_unsafe.go.
@@ -298,15 +288,20 @@ func (m *machine) Destroy() {
 func (m *machine) Get() *vCPU {
 	runtime.LockOSThread()
 	tid := procid.Current()
-	m.mu.Lock()
+	m.mu.RLock()
 
 	// Check for an exact match.
 	if c := m.vCPUs[tid]; c != nil {
 		c.lock()
-		m.mu.Unlock()
+		m.mu.RUnlock()
 		return c
 	}
 
+	// The happy path failed. We now proceed to acquire an exclusive lock
+	// (because the vCPU map may change), and scan all available vCPUs.
+	m.mu.RUnlock()
+	m.mu.Lock()
+
 	for {
 		// Scan for an available vCPU.
 		for origTID, c := range m.vCPUs {
@@ -314,16 +309,21 @@ func (m *machine) Get() *vCPU {
 				delete(m.vCPUs, origTID)
 				m.vCPUs[tid] = c
 				m.mu.Unlock()
-
-				// We need to reload thread-local segments as
-				// we have origTID != tid and the vCPU state
-				// may be stale.
-				c.loadSegments()
-				atomic.StoreUint64(&c.tid, tid)
+				c.loadSegments(tid)
 				return c
 			}
 		}
 
+		// Create a new vCPU (maybe).
+		if len(m.vCPUs) < _KVM_NR_VCPUS {
+			c := m.newVCPU()
+			c.lock()
+			m.vCPUs[tid] = c
+			m.mu.Unlock()
+			c.loadSegments(tid)
+			return c
+		}
+
 		// Scan for something not in user mode.
 		for origTID, c := range m.vCPUs {
 			if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) {
@@ -346,10 +346,7 @@ func (m *machine) Get() *vCPU {
 			delete(m.vCPUs, origTID)
 			m.vCPUs[tid] = c
 			m.mu.Unlock()
-
-			// See above.
-			c.loadSegments()
-			atomic.StoreUint64(&c.tid, tid)
+			c.loadSegments(tid)
 			return c
 		}
 
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 52896eefe..9af4f3f3d 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -29,7 +29,7 @@ import (
 )
 
 // initArchState initializes architecture-specific state.
-func (m *machine) initArchState(vCPUs int) error {
+func (m *machine) initArchState() error {
 	// Set the legacy TSS address. This address is covered by the reserved
 	// range (up to 4GB). In fact, this is a main reason it exists.
 	if _, _, errno := syscall.RawSyscall(
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index c2bcb3a47..8b9041f13 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -18,6 +18,7 @@ package kvm
 import (
 	"fmt"
+	"sync/atomic"
 	"syscall"
 	"unsafe"
 
@@ -54,7 +55,7 @@ func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) s
 // This may be called from within the signal context and throws on error.
 //
 //go:nosplit
-func (c *vCPU) loadSegments() {
+func (c *vCPU) loadSegments(tid uint64) {
 	if _, _, errno := syscall.RawSyscall(
 		syscall.SYS_ARCH_PRCTL,
 		linux.ARCH_GET_FS,
@@ -69,6 +70,7 @@ func (c *vCPU) loadSegments() {
 		0); errno != 0 {
 		throw("getting GS segment")
 	}
+	atomic.StoreUint64(&c.tid, tid)
 }
 
 // setUserRegisters sets user registers in the vCPU.
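
Note: the core of this change is the dirty-set rewrite. The per-address-space slice of vCPU pointers becomes a fixed-size bit array indexed by vCPU id, marked with an atomic OR on Touch and drained with an atomic swap on invalidation. The following is a minimal standalone sketch of that mark/sweep pattern; the names (bitSet, maxVCPUs, sweep) are illustrative only, and a CompareAndSwap loop stands in for the tree's atomicbitops.OrUint64 helper.

package main

import (
	"fmt"
	"math/bits"
	"sync/atomic"
)

const maxVCPUs = 254 // stand-in for _KVM_NR_VCPUS

// bitSet is a fixed-size dirty set; bit i tracks the vCPU with id i.
type bitSet [(maxVCPUs + 63) / 64]uint64

// mark sets the bit for id and reports whether it was previously clear.
// As in dirtySet.mark, "previously clean" means the caller must flush.
func (s *bitSet) mark(id int) bool {
	word, bit := id/64, uint64(1)<<uint(id%64)
	for {
		old := atomic.LoadUint64(&s[word])
		if old&bit != 0 {
			return false // already dirty, no flush needed
		}
		// A CAS loop stands in for atomicbitops.OrUint64 used upstream.
		if atomic.CompareAndSwapUint64(&s[word], old, old|bit) {
			return true // previously clean
		}
	}
}

// sweep drains the set and calls fn for every id that was dirty. Like
// dirtySet.forEach, the atomic swap clears each word while reading it.
func (s *bitSet) sweep(fn func(id int)) {
	for word := range s {
		dirty := atomic.SwapUint64(&s[word], 0)
		for dirty != 0 {
			fn(word*64 + bits.TrailingZeros64(dirty))
			dirty &= dirty - 1 // clear the lowest set bit
		}
	}
}

func main() {
	var s bitSet
	fmt.Println(s.mark(3)) // true: first touch, flush required
	fmt.Println(s.mark(3)) // false: already dirty
	s.sweep(func(id int) { fmt.Println("invalidate vCPU", id) })
	fmt.Println(s.mark(3)) // true again: sweep cleared the set
}

A fixed-size array sized by _KVM_NR_VCPUS needs no per-address-space allocation (makeDirtySet is gone from NewAddressSpace) and keeps working as vCPUs are created dynamically after the address space exists.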
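The other half of the change is that machine.Get now takes a read lock for the common case (the calling thread already has a vCPU bound to its TID) and upgrades to the write lock only when it may have to rebind or create one, with vCPUs created lazily up to _KVM_NR_VCPUS. A simplified sketch of that lookup-then-upgrade pattern follows, using hypothetical pool/entry types and omitting the vCPU stealing and state-CAS handling that the real Get performs.

package main

import (
	"errors"
	"fmt"
	"sync"
)

const maxEntries = 4 // stand-in for _KVM_NR_VCPUS

type entry struct{ id int }

// pool mimics machine: a map keyed by thread id, guarded by an RWMutex.
type pool struct {
	mu      sync.RWMutex
	entries map[uint64]*entry
}

// get returns the entry bound to tid, creating one lazily while capacity
// remains. The read lock covers the common case; the write lock is taken
// only when the map may change, mirroring the new machine.Get.
func (p *pool) get(tid uint64) (*entry, error) {
	p.mu.RLock()
	if e := p.entries[tid]; e != nil {
		p.mu.RUnlock()
		return e, nil // fast path: this thread already owns an entry
	}
	// The happy path failed: drop the read lock and take the write lock,
	// since we may insert into the map below.
	p.mu.RUnlock()
	p.mu.Lock()
	defer p.mu.Unlock()

	if e := p.entries[tid]; e != nil {
		return e, nil // someone bound this tid while we were unlocked
	}
	if len(p.entries) < maxEntries {
		e := &entry{id: len(p.entries)}
		p.entries[tid] = e // created on demand, like machine.newVCPU
		return e, nil
	}
	return nil, errors.New("pool exhausted") // the real code steals or waits here
}

func main() {
	p := &pool{entries: make(map[uint64]*entry)}
	a, _ := p.get(42)
	b, _ := p.get(42)
	fmt.Println(a.id, b.id, a == b) // 0 0 true: same entry via the fast path
}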