author     Adin Scannell <ascannell@google.com>     2018-06-19 16:59:25 -0700
committer  Shentubot <shentubot@google.com>         2018-06-19 17:00:30 -0700
commit     be76cad5bccd4091393e523b57960a4107101ca9 (patch)
tree       1aa74fc94239957b53b7ad6940c4d07e7db63cec /pkg/sentry
parent     aa14a2c1be7f705927e9558f0e46ceca159e23e6 (diff)
Make KVM more scalable by removing CPU cap.
Instead, vCPUs are created dynamically as needed. We also add a relatively
efficient mechanism for stealing vCPUs and for notifying waiters when a
vCPU becomes available via unlock.
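As a rough sketch of that acquisition flow (a simplified illustration, not the
gVisor implementation itself): a read-locked fast path for the calling thread's
existing vCPU, then an exclusive lock to either create a new vCPU while under
the cap or re-bind one from another thread. The type names and the maxVCPUs
constant below are placeholders.

package sketch

import "sync"

const maxVCPUs = 255 // illustrative cap, standing in for KVM's vCPU limit

type vCPU struct{ id int }

type machine struct {
	mu    sync.RWMutex
	vCPUs map[uint64]*vCPU // keyed by thread ID
}

// get returns a vCPU bound to the calling thread, creating or stealing one
// as needed.
func (m *machine) get(tid uint64) *vCPU {
	// Fast path: this thread already has a vCPU; only a read lock is needed.
	m.mu.RLock()
	if c := m.vCPUs[tid]; c != nil {
		m.mu.RUnlock()
		return c
	}
	m.mu.RUnlock()

	// Slow path: the map may change, so take the exclusive lock.
	m.mu.Lock()
	defer m.mu.Unlock()

	// Create a new vCPU while below the cap.
	if len(m.vCPUs) < maxVCPUs {
		c := &vCPU{id: len(m.vCPUs)}
		m.vCPUs[tid] = c
		return c
	}

	// Otherwise re-bind (steal) a vCPU from another thread; the real code
	// first checks that the vCPU is not currently in use.
	for origTID, c := range m.vCPUs {
		delete(m.vCPUs, origTID)
		m.vCPUs[tid] = c
		return c
	}
	return nil // Unreachable when the map is non-empty.
}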
Since the number of vCPUs is no longer fixed at machine creation time,
we make the dirtySet packing more efficient. This has the pleasant side
effect of cutting out the unsafe address space code.
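The packing gives each vCPU ID one bit in a fixed array of words, so the set
stays small no matter how many vCPUs are eventually created. A minimal sketch
of the idea follows, assuming a fixed upper bound on vCPU IDs; it uses a plain
CAS loop in place of the atomicbitops helper, and the names are illustrative
rather than the exact gVisor API.

package sketch

import "sync/atomic"

const maxVCPUs = 255 // illustrative cap; one bit is reserved per possible vCPU ID

// dirtySet packs one "dirty" bit per vCPU ID into a fixed array of words.
type dirtySet struct {
	bits [(maxVCPUs + 63) / 64]uint64
}

// mark sets the bit for id and reports whether it was previously clear
// (i.e. whether a flush is needed on entry).
func (ds *dirtySet) mark(id int) bool {
	word, bit := id/64, uint64(1)<<uint(id%64)
	for {
		old := atomic.LoadUint64(&ds.bits[word])
		if old&bit != 0 {
			return false // Already dirty.
		}
		if atomic.CompareAndSwapUint64(&ds.bits[word], old, old|bit) {
			return true // Previously clean.
		}
	}
}

// drain atomically clears the set and calls fn for every ID that was dirty.
func (ds *dirtySet) drain(fn func(id int)) {
	for word := range ds.bits {
		val := atomic.SwapUint64(&ds.bits[word], 0)
		for bit := 0; bit < 64; bit++ {
			if val&(1<<uint(bit)) != 0 {
				fn(word*64 + bit)
			}
		}
	}
}

Because no per-address-space slice sized to the vCPU count is needed, the
pointer-based unsafe dirty-set implementation can be deleted, as the diff
below shows.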
PiperOrigin-RevId: 201266691
Change-Id: I275c73525a4f38e3714b9ac0fd88731c26adfe66
Diffstat (limited to 'pkg/sentry')
-rw-r--r--  pkg/sentry/platform/kvm/BUILD                       1
-rw-r--r--  pkg/sentry/platform/kvm/address_space.go            67
-rw-r--r--  pkg/sentry/platform/kvm/address_space_unsafe.go     44
-rw-r--r--  pkg/sentry/platform/kvm/kvm.go                       4
-rw-r--r--  pkg/sentry/platform/kvm/machine.go                  131
-rw-r--r--  pkg/sentry/platform/kvm/machine_amd64.go              2
-rw-r--r--  pkg/sentry/platform/kvm/machine_amd64_unsafe.go       4
7 files changed, 123 insertions, 130 deletions
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 135861368..673393fad 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -27,7 +27,6 @@ go_library(
     name = "kvm",
     srcs = [
         "address_space.go",
-        "address_space_unsafe.go",
         "allocator.go",
         "bluepill.go",
         "bluepill_amd64.go",
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
index f74c98dd0..fbd11ed71 100644
--- a/pkg/sentry/platform/kvm/address_space.go
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -17,13 +17,62 @@ package kvm
 import (
 	"reflect"
 	"sync"
+	"sync/atomic"
 
+	"gvisor.googlesource.com/gvisor/pkg/atomicbitops"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/filemem"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
 )
 
+type vCPUBitArray [(_KVM_NR_VCPUS + 63) / 64]uint64
+
+// dirtySet tracks vCPUs for invalidation.
+type dirtySet struct {
+	vCPUs vCPUBitArray
+}
+
+// forEach iterates over all CPUs in the dirty set.
+func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
+	var localSet vCPUBitArray
+	for index := 0; index < len(ds.vCPUs); index++ {
+		// Clear the dirty set, copy to the local one.
+		localSet[index] = atomic.SwapUint64(&ds.vCPUs[index], 0)
+	}
+
+	m.mu.RLock()
+	defer m.mu.RUnlock()
+
+	for _, c := range m.vCPUs {
+		index := uint64(c.id) / 64
+		bit := uint64(1) << uint(c.id%64)
+
+		// Call the function if it was set.
+		if localSet[index]&bit != 0 {
+			fn(c)
+		}
+	}
+}
+
+// mark marks the given vCPU as dirty and returns whether it was previously
+// clean. Being previously clean implies that a flush is needed on entry.
+func (ds *dirtySet) mark(c *vCPU) bool {
+	index := uint64(c.id) / 64
+	bit := uint64(1) << uint(c.id%64)
+
+	oldValue := atomic.LoadUint64(&ds.vCPUs[index])
+	if oldValue&bit != 0 {
+		return false // Not clean.
+	}
+
+	// Set the bit unilaterally, and ensure that a flush takes place. Note
+	// that it's possible for races to occur here, but since the flush is
+	// taking place long after these lines there's no race in practice.
+	atomicbitops.OrUint64(&ds.vCPUs[index], bit)
+	return true // Previously clean.
+}
+
 // addressSpace is a wrapper for PageTables.
 type addressSpace struct {
 	platform.NoAddressSpaceIO
@@ -43,10 +92,6 @@ type addressSpace struct {
 	pageTables *pagetables.PageTables
 
 	// dirtySet is the set of dirty vCPUs.
-	//
-	// These are actually vCPU pointers that are stored iff the vCPU is
-	// dirty. If the vCPU is not dirty and requires invalidation, then a
-	// nil value is stored here instead.
 	dirtySet dirtySet
 
 	// files contains files mapped in the host address space.
@@ -57,11 +102,11 @@ type addressSpace struct {
 
 // invalidate is the implementation for Invalidate.
 func (as *addressSpace) invalidate() {
-	for i := 0; i < as.dirtySet.size(); i++ {
-		if c := as.dirtySet.swap(i, nil); c != nil && c.active.get() == as {
-			c.BounceToKernel() // Force a kernel transition.
+	as.dirtySet.forEach(as.machine, func(c *vCPU) {
+		if c.active.get() == as { // If this happens to be active,
+			c.BounceToKernel() // ... force a kernel transition.
 		}
-	}
+	})
 }
 
 // Invalidate interrupts all dirty contexts.
@@ -75,11 +120,7 @@ func (as *addressSpace) Invalidate() {
 //
 // The return value indicates whether a flush is required.
 func (as *addressSpace) Touch(c *vCPU) bool {
-	if old := as.dirtySet.swap(c.id, c); old == nil {
-		return true // Flush is required.
-	}
-	// Already dirty: no flush required.
-	return false
+	return as.dirtySet.mark(c)
 }
 
 func (as *addressSpace) mapHost(addr usermem.Addr, m hostMapEntry, at usermem.AccessType) (inv bool) {
diff --git a/pkg/sentry/platform/kvm/address_space_unsafe.go b/pkg/sentry/platform/kvm/address_space_unsafe.go
deleted file mode 100644
index b6c31ce10..000000000
--- a/pkg/sentry/platform/kvm/address_space_unsafe.go
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2018 Google Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package kvm
-
-import (
-	"sync/atomic"
-	"unsafe"
-)
-
-// dirtySet tracks vCPUs for invalidation.
-type dirtySet struct {
-	vCPUs []unsafe.Pointer
-}
-
-// makeDirtySet makes a new dirtySet.
-func makeDirtySet(size int) dirtySet {
-	return dirtySet{
-		vCPUs: make([]unsafe.Pointer, size),
-	}
-}
-
-// size is the size of the set.
-func (ds *dirtySet) size() int {
-	return len(ds.vCPUs)
-}
-
-// swap sets the given index and returns the previous value.
-//
-// The index is typically the id for a non-nil vCPU.
-func (ds *dirtySet) swap(index int, c *vCPU) *vCPU {
-	return (*vCPU)(atomic.SwapPointer(&ds.vCPUs[index], unsafe.Pointer(c)))
-}
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index 1a8e16ca0..3ed057881 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -17,7 +17,6 @@ package kvm
 import (
 	"fmt"
-	"runtime"
 	"sync"
 	"syscall"
 
@@ -77,7 +76,7 @@ func New() (*KVM, error) {
 	}
 
 	// Create a VM context.
-	machine, err := newMachine(int(vm), runtime.NumCPU())
+	machine, err := newMachine(int(vm))
 	if err != nil {
 		return nil, err
 	}
@@ -137,7 +136,6 @@ func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan stru
 		filemem:    k.FileMem,
 		machine:    k.machine,
 		pageTables: pageTables,
-		dirtySet:   makeDirtySet(len(k.machine.vCPUs)),
 	}, nil, nil
 }
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index f045345d5..abdc51431 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -46,16 +46,14 @@ type machine struct {
 	mappingCache sync.Map
 
 	// mu protects vCPUs.
-	mu sync.Mutex
+	mu sync.RWMutex
 
 	// available is notified when vCPUs are available.
 	available sync.Cond
 
 	// vCPUs are the machine vCPUs.
 	//
-	// This is eventually keyed by system TID, but is initially indexed by
-	// the negative vCPU id. This is merely an optimization, so while
-	// collisions here are not possible, it wouldn't matter anyways.
+	// These are populated dynamically.
 	vCPUs map[uint64]*vCPU
 }
@@ -117,73 +115,65 @@ type vCPU struct {
 	vCPUArchState
 }
 
+// newVCPU creates a returns a new vCPU.
+//
+// Precondtion: mu must be held.
+func (m *machine) newVCPU() *vCPU {
+	id := len(m.vCPUs)
+
+	// Create the vCPU.
+	fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CREATE_VCPU, uintptr(id))
+	if errno != 0 {
+		panic(fmt.Sprintf("error creating new vCPU: %v", errno))
+	}
+
+	c := &vCPU{
+		id:      id,
+		fd:      int(fd),
+		machine: m,
+	}
+	c.CPU.Init(&m.kernel)
+	c.CPU.KernelSyscall = bluepillSyscall
+	c.CPU.KernelException = bluepillException
+
+	// Ensure the signal mask is correct.
+	if err := c.setSignalMask(); err != nil {
+		panic(fmt.Sprintf("error setting signal mask: %v", err))
+	}
+
+	// Initialize architecture state.
+	if err := c.initArchState(); err != nil {
+		panic(fmt.Sprintf("error initialization vCPU state: %v", err))
+	}
+
+	// Map the run data.
+	runData, err := mapRunData(int(fd))
+	if err != nil {
+		panic(fmt.Sprintf("error mapping run data: %v", err))
+	}
+	c.runData = runData
+
+	return c // Done.
+}
+
 // newMachine returns a new VM context.
-func newMachine(vm int, vCPUs int) (*machine, error) {
+func newMachine(vm int) (*machine, error) {
 	// Create the machine.
 	m := &machine{
 		fd:    vm,
 		vCPUs: make(map[uint64]*vCPU),
 	}
 	m.available.L = &m.mu
-	if vCPUs > _KVM_NR_VCPUS {
-		// Hard cap at KVM's limit.
-		vCPUs = _KVM_NR_VCPUS
-	}
-	if n := 2 * runtime.NumCPU(); vCPUs > n {
-		// Cap at twice the number of physical cores. Otherwise we're
-		// just wasting memory and thrashing. (There may be scheduling
-		// issues when you've got > n active threads.)
-		vCPUs = n
-	}
 	m.kernel.Init(ring0.KernelOpts{
 		PageTables: pagetables.New(newAllocator()),
 	})
 
 	// Initialize architecture state.
-	if err := m.initArchState(vCPUs); err != nil {
+	if err := m.initArchState(); err != nil {
 		m.Destroy()
 		return nil, err
 	}
 
-	// Create all the vCPUs.
-	for id := 0; id < vCPUs; id++ {
-		// Create the vCPU.
-		fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(vm), _KVM_CREATE_VCPU, uintptr(id))
-		if errno != 0 {
-			m.Destroy()
-			return nil, fmt.Errorf("error creating VCPU: %v", errno)
-		}
-		c := &vCPU{
-			id:      id,
-			fd:      int(fd),
-			machine: m,
-		}
-		c.CPU.Init(&m.kernel)
-		c.CPU.KernelSyscall = bluepillSyscall
-		c.CPU.KernelException = bluepillException
-		m.vCPUs[uint64(-id)] = c // See above.
-
-		// Ensure the signal mask is correct.
-		if err := c.setSignalMask(); err != nil {
-			m.Destroy()
-			return nil, err
-		}
-
-		// Initialize architecture state.
-		if err := c.initArchState(); err != nil {
-			m.Destroy()
-			return nil, err
-		}
-
-		// Map the run data.
-		runData, err := mapRunData(int(fd))
-		if err != nil {
-			m.Destroy()
-			return nil, err
-		}
-		c.runData = runData
-	}
-
 	// Apply the physical mappings. Note that these mappings may point to
 	// guest physical addresses that are not actually available. These
 	// physical pages are mapped on demand, see kernel_unsafe.go.
@@ -298,15 +288,20 @@ func (m *machine) Destroy() {
 func (m *machine) Get() *vCPU {
 	runtime.LockOSThread()
 	tid := procid.Current()
-	m.mu.Lock()
+	m.mu.RLock()
 
 	// Check for an exact match.
 	if c := m.vCPUs[tid]; c != nil {
 		c.lock()
-		m.mu.Unlock()
+		m.mu.RUnlock()
 		return c
 	}
 
+	// The happy path failed. We now proceed to acquire an exclusive lock
+	// (because the vCPU map may change), and scan all available vCPUs.
+	m.mu.RUnlock()
+	m.mu.Lock()
+
 	for {
 		// Scan for an available vCPU.
 		for origTID, c := range m.vCPUs {
@@ -314,16 +309,21 @@ func (m *machine) Get() *vCPU {
 				delete(m.vCPUs, origTID)
 				m.vCPUs[tid] = c
 				m.mu.Unlock()
-
-				// We need to reload thread-local segments as
-				// we have origTID != tid and the vCPU state
-				// may be stale.
-				c.loadSegments()
-				atomic.StoreUint64(&c.tid, tid)
+				c.loadSegments(tid)
 				return c
 			}
 		}
 
+		// Create a new vCPU (maybe).
+		if len(m.vCPUs) < _KVM_NR_VCPUS {
+			c := m.newVCPU()
+			c.lock()
+			m.vCPUs[tid] = c
+			m.mu.Unlock()
+			c.loadSegments(tid)
+			return c
+		}
+
 		// Scan for something not in user mode.
 		for origTID, c := range m.vCPUs {
 			if !atomic.CompareAndSwapUint32(&c.state, vCPUGuest, vCPUGuest|vCPUWaiter) {
@@ -346,10 +346,7 @@ func (m *machine) Get() *vCPU {
 		delete(m.vCPUs, origTID)
 		m.vCPUs[tid] = c
 		m.mu.Unlock()
-
-		// See above.
-		c.loadSegments()
-		atomic.StoreUint64(&c.tid, tid)
+		c.loadSegments(tid)
 		return c
 	}
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 52896eefe..9af4f3f3d 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -29,7 +29,7 @@ import (
 )
 
 // initArchState initializes architecture-specific state.
-func (m *machine) initArchState(vCPUs int) error {
+func (m *machine) initArchState() error {
 	// Set the legacy TSS address. This address is covered by the reserved
 	// range (up to 4GB). In fact, this is a main reason it exists.
 	if _, _, errno := syscall.RawSyscall(
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index c2bcb3a47..8b9041f13 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -18,6 +18,7 @@ package kvm
 import (
 	"fmt"
+	"sync/atomic"
 	"syscall"
 	"unsafe"
 
@@ -54,7 +55,7 @@ func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) s
 // This may be called from within the signal context and throws on error.
 //
 //go:nosplit
-func (c *vCPU) loadSegments() {
+func (c *vCPU) loadSegments(tid uint64) {
 	if _, _, errno := syscall.RawSyscall(
 		syscall.SYS_ARCH_PRCTL,
 		linux.ARCH_GET_FS,
@@ -69,6 +70,7 @@ func (c *vCPU) loadSegments() {
 		0); errno != 0 {
 		throw("getting GS segment")
 	}
+	atomic.StoreUint64(&c.tid, tid)
 }
 
 // setUserRegisters sets user registers in the vCPU.