// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build arm64
// +build arm64

package kvm

import (
	"runtime"
	"sync/atomic"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/ring0"
	"gvisor.dev/gvisor/pkg/ring0/pagetables"
	"gvisor.dev/gvisor/pkg/sentry/platform"
)

type machineArchState struct {
	// initialvCPUs is the set of machine vCPUs that have been initialized
	// but are not yet in use.
	initialvCPUs map[int]*vCPU
}

type vCPUArchState struct {
	// PCIDs is the set of PCIDs for this vCPU.
	//
	// This starts above fixedKernelPCID.
	PCIDs *pagetables.PCIDs
}

const (
	// fixedKernelPCID is a fixed kernel PCID used for the kernel page
	// tables. We must start allocating user PCIDs above this in order to
	// avoid any conflict (see below).
	fixedKernelPCID = 1

	// poolPCIDs is the number of PCIDs to record in the database. As this
	// grows, assignment can take longer, since it is a simple linear scan.
	// Beyond a relatively small number, there are likely few performance
	// benefits, since the TLB has likely long since lost any translations
	// from more than a few PCIDs past.
	poolPCIDs = 128
)

func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
	applyPhysicalRegions(func(pr physicalRegion) bool {
		pageTable.Map(
			hostarch.Addr(ring0.KernelStartAddress|pr.virtual),
			pr.length,
			pagetables.MapOpts{AccessType: hostarch.AnyAccess, Global: true},
			pr.physical)

		return true // Keep iterating.
	})
}

// rdonlyRegionsForSetMem returns all read-only physicalRegions.
func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
	var rdonlyRegions []region

	applyVirtualRegions(func(vr virtualRegion) {
		if excludeVirtualRegion(vr) {
			return
		}

		if !vr.accessType.Write && vr.accessType.Read {
			rdonlyRegions = append(rdonlyRegions, vr.region)
		}

		// TODO(gvisor.dev/issue/2686): PROT_NONE should be specially treated.
		// Workaround: treated as rdonly temporarily.
		if !vr.accessType.Write && !vr.accessType.Read && !vr.accessType.Execute {
			rdonlyRegions = append(rdonlyRegions, vr.region)
		}
	})

	for _, r := range rdonlyRegions {
		physical, _, ok := translateToPhysical(r.virtual)
		if !ok {
			continue
		}

		phyRegions = append(phyRegions, physicalRegion{
			region: region{
				virtual: r.virtual,
				length:  r.length,
			},
			physical: physical,
		})
	}

	return phyRegions
}
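
// Illustrative example (added for clarity; the addresses are made up): the
// splitting performed by archPhysicalRegions below can be pictured with
// concrete values. Given a physical region covering virtual
// [0x1000, 0x9000) and a read-only guest region [0x3000, 0x5000), the scan
// emits three separate physical regions:
//
//	[0x1000, 0x3000) // writable
//	[0x3000, 0x5000) // read-only
//	[0x5000, 0x9000) // writable
//
// so that the read-only span can later be installed as its own (presumably
// read-only) KVM memory slot, independent of its writable neighbors.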
// archPhysicalRegions fills readOnlyGuestRegions and allocates separate
// physical regions from them.
func archPhysicalRegions(physicalRegions []physicalRegion) []physicalRegion {
	applyVirtualRegions(func(vr virtualRegion) {
		if excludeVirtualRegion(vr) {
			return // Skip region.
		}
		if !vr.accessType.Write {
			readOnlyGuestRegions = append(readOnlyGuestRegions, vr.region)
		}
	})

	rdRegions := readOnlyGuestRegions[:]

	// Add an unreachable region as a sentinel, so the scan below always
	// has a next read-only region to compare against.
	rdRegions = append(rdRegions, region{
		virtual: 0xffffffffffffffff,
		length:  0,
	})

	var regions []physicalRegion
	addValidRegion := func(r *physicalRegion, virtual, length uintptr) {
		if length == 0 {
			return
		}
		regions = append(regions, physicalRegion{
			region: region{
				virtual: virtual,
				length:  length,
			},
			physical: r.physical + (virtual - r.virtual),
		})
	}
	i := 0
	for _, pr := range physicalRegions {
		start := pr.virtual
		end := pr.virtual + pr.length
		for start < end {
			rdRegion := rdRegions[i]
			rdStart := rdRegion.virtual
			rdEnd := rdRegion.virtual + rdRegion.length
			if rdEnd <= start {
				i++
				continue
			}
			if rdStart > start {
				newEnd := rdStart
				if end < rdStart {
					newEnd = end
				}
				addValidRegion(&pr, start, newEnd-start)
				start = rdStart
				continue
			}
			if rdEnd < end {
				addValidRegion(&pr, start, rdEnd-start)
				start = rdEnd
				continue
			}
			addValidRegion(&pr, start, end-start)
			start = end
		}
	}

	return regions
}

// availableRegionsForSetMem returns all available physicalRegions.
func availableRegionsForSetMem() []physicalRegion {
	var excludedRegions []region
	applyVirtualRegions(func(vr virtualRegion) {
		if !vr.accessType.Write {
			excludedRegions = append(excludedRegions, vr.region)
		}
	})

	// Add an unreachable region.
	excludedRegions = append(excludedRegions, region{
		virtual: 0xffffffffffffffff,
		length:  0,
	})

	var regions []physicalRegion
	addValidRegion := func(r *physicalRegion, virtual, length uintptr) {
		if length == 0 {
			return
		}
		regions = append(regions, physicalRegion{
			region: region{
				virtual: virtual,
				length:  length,
			},
			physical: r.physical + (virtual - r.virtual),
		})
	}
	i := 0
	for _, pr := range physicalRegions {
		start := pr.virtual
		end := pr.virtual + pr.length
		for start < end {
			er := excludedRegions[i]
			excludeEnd := er.virtual + er.length
			excludeStart := er.virtual
			if excludeEnd < start {
				i++
				continue
			}
			if excludeStart < start {
				start = excludeEnd
				i++
				continue
			}
			rend := excludeStart
			if rend > end {
				rend = end
			}
			addValidRegion(&pr, start, rend-start)
			start = excludeEnd
		}
	}

	return regions
}

// nonCanonical generates the signal return for a non-canonical address.
//
//go:nosplit
func nonCanonical(addr uint64, signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
	*info = linux.SignalInfo{
		Signo: signal,
		Code:  linux.SI_KERNEL,
	}
	info.SetAddr(addr) // Include address.
	return hostarch.NoAccess, platform.ErrContextSignal
}

// isInstructionAbort returns true if it is an instruction abort.
//
//go:nosplit
func isInstructionAbort(code uint64) bool {
	value := (code & _ESR_ELx_EC_MASK) >> _ESR_ELx_EC_SHIFT
	return value == _ESR_ELx_EC_IABT_LOW
}

// isWriteFault returns whether it is a write fault.
//
//go:nosplit
func isWriteFault(code uint64) bool {
	if isInstructionAbort(code) {
		return false
	}

	return (code & _ESR_ELx_WNR) != 0
}
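
// Worked example (illustrative, assuming the _ESR_* constants follow the
// architectural ESR_ELx encoding): for code = 0x92000046, the exception
// class in bits [31:26] is 0x24 (data abort from a lower exception level),
// so isInstructionAbort reports false; the WnR bit (bit 6) is set, so
// isWriteFault reports true; and the fault status code in bits [5:0] is
// 0b000110, a level-2 translation fault, which fault() below reports as
// SEGV_MAPERR.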
// fault generates an appropriate fault return.
//
//go:nosplit
func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
	bluepill(c) // Probably no-op, but may not be.

	faultAddr := c.GetFaultAddr()
	code, user := c.ErrorCode()

	if !user {
		// The last fault serviced by this CPU was not a user
		// fault, so we can't reliably trust the faultAddr or
		// the code provided here. We need to re-execute.
		return hostarch.NoAccess, platform.ErrContextInterrupt
	}

	// Reset the pointed SignalInfo.
	*info = linux.SignalInfo{Signo: signal}
	info.SetAddr(uint64(faultAddr))

	ret := code & _ESR_ELx_FSC
	switch ret {
	case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3:
		info.Code = 1 // SEGV_MAPERR.
	case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3:
		info.Code = 2 // SEGV_ACCERR.
	default:
		info.Code = 2 // Treat unrecognized fault status codes as SEGV_ACCERR.
	}

	accessType := hostarch.AccessType{
		Read:    !isWriteFault(uint64(code)),
		Write:   isWriteFault(uint64(code)),
		Execute: isInstructionAbort(uint64(code)),
	}

	return accessType, platform.ErrContextSignal
}

// getMaxVCPU sets m.maxVCPUs to the smaller of the host CPU count and the
// KVM_CAP_MAX_VCPUS limit reported by KVM.
func (m *machine) getMaxVCPU() {
	rmaxVCPUs := runtime.NumCPU()
	smaxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
	// Compare the max vCPU number from the runtime with the one from the
	// syscall, and use the smaller one.
	if errno != 0 {
		m.maxVCPUs = rmaxVCPUs
	} else {
		if rmaxVCPUs < int(smaxVCPUs) {
			m.maxVCPUs = rmaxVCPUs
		} else {
			m.maxVCPUs = int(smaxVCPUs)
		}
	}
}

// getNewVCPU scans initialvCPUs for an available vCPU, returning nil if
// every initialized vCPU is already in use.
func (m *machine) getNewVCPU() *vCPU {
	for CID, c := range m.initialvCPUs {
		if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
			delete(m.initialvCPUs, CID)
			return c
		}
	}

	return nil
}
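
// Usage sketch (illustrative only; the real call sites live elsewhere in
// this package and may differ, and m.newVCPU is an assumed fallback
// constructor): during machine creation getMaxVCPU establishes the
// allocation bound, and a thread that needs a vCPU first tries to reuse an
// initialized-but-idle one before creating a fresh vCPU:
//
//	m.getMaxVCPU() // Bound m.maxVCPUs by NumCPU and KVM_CAP_MAX_VCPUS.
//	...
//	if c := m.getNewVCPU(); c != nil {
//		return c // Reuse an initialized vCPU from initialvCPUs.
//	}
//	return m.newVCPU() // Otherwise allocate a new one.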