// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/syserror" ) // Preconditions: // * mm.mappingMu must be locked for writing. // * opts must be valid as defined by the checks in MMap. func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, hostarch.AddrRange, error) { if opts.MaxPerms != opts.MaxPerms.Effective() { panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms)) } // Find a usable range. addr, err := mm.findAvailableLocked(opts.Length, findAvailableOpts{ Addr: opts.Addr, Fixed: opts.Fixed, Unmap: opts.Unmap, Map32Bit: opts.Map32Bit, }) if err != nil { // Can't force without opts.Unmap and opts.Fixed. if opts.Force && opts.Unmap && opts.Fixed { addr = opts.Addr } else { return vmaIterator{}, hostarch.AddrRange{}, err } } ar, _ := addr.ToRange(opts.Length) // Check against RLIMIT_AS. newUsageAS := mm.usageAS + opts.Length if opts.Unmap { newUsageAS -= uint64(mm.vmas.SpanRange(ar)) } if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { return vmaIterator{}, hostarch.AddrRange{}, syserror.ENOMEM } if opts.MLockMode != memmap.MLockNone { // Check against RLIMIT_MEMLOCK. if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if mlockLimit == 0 { return vmaIterator{}, hostarch.AddrRange{}, linuxerr.EPERM } newLockedAS := mm.lockedAS + opts.Length if opts.Unmap { newLockedAS -= mm.mlockedBytesRangeLocked(ar) } if newLockedAS > mlockLimit { return vmaIterator{}, hostarch.AddrRange{}, linuxerr.EAGAIN } } } // Remove overwritten mappings. This ordering is consistent with Linux: // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), // file->f_op->mmap(). var vgap vmaGapIterator if opts.Unmap { vgap = mm.unmapLocked(ctx, ar) } else { vgap = mm.vmas.FindGap(ar.Start) } // Inform the Mappable, if any, of the new mapping. if opts.Mappable != nil { // The expression for writable is vma.canWriteMappableLocked(), but we // don't yet have a vma. if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset, !opts.Private && opts.MaxPerms.Write); err != nil { return vmaIterator{}, hostarch.AddrRange{}, err } } // Take a reference on opts.MappingIdentity before inserting the vma since // vma merging can drop the reference. if opts.MappingIdentity != nil { opts.MappingIdentity.IncRef() } // Finally insert the vma. v := vma{ mappable: opts.Mappable, off: opts.Offset, realPerms: opts.Perms, effectivePerms: opts.Perms.Effective(), maxPerms: opts.MaxPerms, private: opts.Private, growsDown: opts.GrowsDown, mlockMode: opts.MLockMode, numaPolicy: linux.MPOL_DEFAULT, id: opts.MappingIdentity, hint: opts.Hint, } vseg := mm.vmas.Insert(vgap, ar, v) mm.usageAS += opts.Length if v.isPrivateDataLocked() { mm.dataAS += opts.Length } if opts.MLockMode != memmap.MLockNone { mm.lockedAS += opts.Length } return vseg, ar, nil } type findAvailableOpts struct { // These fields are equivalent to those in memmap.MMapOpts, except that: // // - Addr must be page-aligned. // // - Unmap allows existing guard pages in the returned range. Addr hostarch.Addr Fixed bool Unmap bool Map32Bit bool } // map32Start/End are the bounds to which MAP_32BIT mappings are constrained, // and are equivalent to Linux's MAP32_BASE and MAP32_MAX respectively. const ( map32Start = 0x40000000 map32End = 0x80000000 ) // findAvailableLocked finds an allocatable range. // // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOpts) (hostarch.Addr, error) { if opts.Fixed { opts.Map32Bit = false } allowedAR := mm.applicationAddrRange() if opts.Map32Bit { allowedAR = allowedAR.Intersect(hostarch.AddrRange{map32Start, map32End}) } // Does the provided suggestion work? if ar, ok := opts.Addr.ToRange(length); ok { if allowedAR.IsSupersetOf(ar) { if opts.Unmap { return ar.Start, nil } // Check for the presence of an existing vma or guard page. if vgap := mm.vmas.FindGap(ar.Start); vgap.Ok() && vgap.availableRange().IsSupersetOf(ar) { return ar.Start, nil } } } // Fixed mappings accept only the requested address. if opts.Fixed { return 0, syserror.ENOMEM } // Prefer hugepage alignment if a hugepage or more is requested. alignment := uint64(hostarch.PageSize) if length >= hostarch.HugePageSize { alignment = hostarch.HugePageSize } if opts.Map32Bit { return mm.findLowestAvailableLocked(length, alignment, allowedAR) } if mm.layout.DefaultDirection == arch.MmapBottomUp { return mm.findLowestAvailableLocked(length, alignment, hostarch.AddrRange{mm.layout.BottomUpBase, mm.layout.MaxAddr}) } return mm.findHighestAvailableLocked(length, alignment, hostarch.AddrRange{mm.layout.MinAddr, mm.layout.TopDownBase}) } func (mm *MemoryManager) applicationAddrRange() hostarch.AddrRange { return hostarch.AddrRange{mm.layout.MinAddr, mm.layout.MaxAddr} } // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds hostarch.AddrRange) (hostarch.Addr, error) { for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextLargeEnoughGap(hostarch.Addr(length)) { if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { // Can we shift up to match the alignment? if offset := uint64(gr.Start) % alignment; offset != 0 { if uint64(gr.Length()) >= length+alignment-offset { // Yes, we're aligned. return gr.Start + hostarch.Addr(alignment-offset), nil } } // Either aligned perfectly, or can't align it. return gr.Start, nil } } return 0, syserror.ENOMEM } // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds hostarch.AddrRange) (hostarch.Addr, error) { for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevLargeEnoughGap(hostarch.Addr(length)) { if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { // Can we shift down to match the alignment? start := gr.End - hostarch.Addr(length) if offset := uint64(start) % alignment; offset != 0 { if gr.Start <= start-hostarch.Addr(offset) { // Yes, we're aligned. return start - hostarch.Addr(offset), nil } } // Either aligned perfectly, or can't align it. return start, nil } } return 0, syserror.ENOMEM } // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) mlockedBytesRangeLocked(ar hostarch.AddrRange) uint64 { var total uint64 for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { if vseg.ValuePtr().mlockMode != memmap.MLockNone { total += uint64(vseg.Range().Intersect(ar).Length()) } } return total } // getVMAsLocked ensures that vmas exist for all addresses in ar, and support // access of type (at, ignorePermissions). It returns: // // - An iterator to the vma containing ar.Start. If no vma contains ar.Start, // the iterator is unspecified. // // - An iterator to the gap after the last vma containing an address in ar. If // vmas exist for no addresses in ar, the iterator is to a gap that begins // before ar.Start. // // - An error that is non-nil if vmas exist for only a subset of ar. // // Preconditions: // * mm.mappingMu must be locked for reading; it may be temporarily unlocked. // * ar.Length() != 0. func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } } // Inline mm.vmas.LowerBoundSegment so that we have the preceding gap if // !vbegin.Ok(). vbegin, vgap := mm.vmas.Find(ar.Start) if !vbegin.Ok() { vbegin = vgap.NextSegment() // vseg.Ok() is checked before entering the following loop. } else { vgap = vbegin.PrevGap() } addr := ar.Start vseg := vbegin for vseg.Ok() { // Loop invariants: vgap = vseg.PrevGap(); addr < vseg.End(). vma := vseg.ValuePtr() if addr < vseg.Start() { // TODO(jamieliu): Implement vma.growsDown here. return vbegin, vgap, syserror.EFAULT } perms := vma.effectivePerms if ignorePermissions { perms = vma.maxPerms } if !perms.SupersetOf(at) { return vbegin, vgap, linuxerr.EPERM } addr = vseg.End() vgap = vseg.NextGap() if addr >= ar.End { return vbegin, vgap, nil } vseg = vgap.NextSegment() } // Ran out of vmas before ar.End. return vbegin, vgap, syserror.EFAULT } // getVecVMAsLocked ensures that vmas exist for all addresses in ars, and // support access to type of (at, ignorePermissions). It returns the subset of // ars for which vmas exist. If this is not equal to ars, it returns a non-nil // error explaining why. // // Preconditions: mm.mappingMu must be locked for reading; it may be // temporarily unlocked. // // Postconditions: ars is not mutated. func (mm *MemoryManager) getVecVMAsLocked(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool) (hostarch.AddrRangeSeq, error) { for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { ar := arsit.Head() if ar.Length() == 0 { continue } if _, vend, err := mm.getVMAsLocked(ctx, ar, at, ignorePermissions); err != nil { return truncatedAddrRangeSeq(ars, arsit, vend.Start()), err } } return ars, nil } // vma extension will not shrink the number of unmapped bytes between the start // of a growsDown vma and the end of its predecessor non-growsDown vma below // guardBytes. // // guardBytes is equivalent to Linux's stack_guard_gap after upstream // 1be7107fbe18 "mm: larger stack guard gap, between vmas". const guardBytes = 256 * hostarch.PageSize // unmapLocked unmaps all addresses in ar and returns the resulting gap in // mm.vmas. // // Preconditions: // * mm.mappingMu must be locked for writing. // * ar.Length() != 0. // * ar must be page-aligned. func (mm *MemoryManager) unmapLocked(ctx context.Context, ar hostarch.AddrRange) vmaGapIterator { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } // AddressSpace mappings and pmas must be invalidated before // mm.removeVMAsLocked() => memmap.Mappable.RemoveMapping(). mm.Invalidate(ar, memmap.InvalidateOpts{InvalidatePrivate: true}) return mm.removeVMAsLocked(ctx, ar) } // removeVMAsLocked removes vmas for addresses in ar and returns the resulting // gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients // must do so before calling removeVMAsLocked. // // Preconditions: // * mm.mappingMu must be locked for writing. // * ar.Length() != 0. // * ar must be page-aligned. func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar hostarch.AddrRange) vmaGapIterator { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } vseg, vgap := mm.vmas.Find(ar.Start) if vgap.Ok() { vseg = vgap.NextSegment() } for vseg.Ok() && vseg.Start() < ar.End { vseg = mm.vmas.Isolate(vseg, ar) vmaAR := vseg.Range() vma := vseg.ValuePtr() if vma.mappable != nil { vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked()) } if vma.id != nil { vma.id.DecRef(ctx) } mm.usageAS -= uint64(vmaAR.Length()) if vma.isPrivateDataLocked() { mm.dataAS -= uint64(vmaAR.Length()) } if vma.mlockMode != memmap.MLockNone { mm.lockedAS -= uint64(vmaAR.Length()) } vgap = mm.vmas.Remove(vseg) vseg = vgap.NextSegment() } return vgap } // canWriteMappableLocked returns true if it is possible for vma.mappable to be // written to via this vma, i.e. if it is possible that // vma.mappable.Translate(at.Write=true) may be called as a result of this vma. // This includes via I/O with usermem.IOOpts.IgnorePermissions = true, such as // PTRACE_POKEDATA. // // canWriteMappableLocked is equivalent to Linux's VM_SHARED. // // Preconditions: mm.mappingMu must be locked. func (vma *vma) canWriteMappableLocked() bool { return !vma.private && vma.maxPerms.Write } // isPrivateDataLocked identify the data segments - private, writable, not stack // // Preconditions: mm.mappingMu must be locked. func (vma *vma) isPrivateDataLocked() bool { return vma.realPerms.Write && vma.private && !vma.growsDown } // vmaSetFunctions implements segment.Functions for vmaSet. type vmaSetFunctions struct{} func (vmaSetFunctions) MinKey() hostarch.Addr { return 0 } func (vmaSetFunctions) MaxKey() hostarch.Addr { return ^hostarch.Addr(0) } func (vmaSetFunctions) ClearValue(vma *vma) { vma.mappable = nil vma.id = nil vma.hint = "" } func (vmaSetFunctions) Merge(ar1 hostarch.AddrRange, vma1 vma, ar2 hostarch.AddrRange, vma2 vma) (vma, bool) { if vma1.mappable != vma2.mappable || (vma1.mappable != nil && vma1.off+uint64(ar1.Length()) != vma2.off) || vma1.realPerms != vma2.realPerms || vma1.maxPerms != vma2.maxPerms || vma1.private != vma2.private || vma1.growsDown != vma2.growsDown || vma1.mlockMode != vma2.mlockMode || vma1.numaPolicy != vma2.numaPolicy || vma1.numaNodemask != vma2.numaNodemask || vma1.dontfork != vma2.dontfork || vma1.id != vma2.id || vma1.hint != vma2.hint { return vma{}, false } if vma2.id != nil { vma2.id.DecRef(context.Background()) } return vma1, true } func (vmaSetFunctions) Split(ar hostarch.AddrRange, v vma, split hostarch.Addr) (vma, vma) { v2 := v if v2.mappable != nil { v2.off += uint64(split - ar.Start) } if v2.id != nil { v2.id.IncRef() } return v, v2 } // Preconditions: // * vseg.ValuePtr().mappable != nil. // * vseg.Range().Contains(addr). func (vseg vmaIterator) mappableOffsetAt(addr hostarch.Addr) uint64 { if checkInvariants { if !vseg.Ok() { panic("terminal vma iterator") } if vseg.ValuePtr().mappable == nil { panic("Mappable offset is meaningless for anonymous vma") } if !vseg.Range().Contains(addr) { panic(fmt.Sprintf("addr %v out of bounds %v", addr, vseg.Range())) } } vma := vseg.ValuePtr() vstart := vseg.Start() return vma.off + uint64(addr-vstart) } // Preconditions: vseg.ValuePtr().mappable != nil. func (vseg vmaIterator) mappableRange() memmap.MappableRange { return vseg.mappableRangeOf(vseg.Range()) } // Preconditions: // * vseg.ValuePtr().mappable != nil. // * vseg.Range().IsSupersetOf(ar). // * ar.Length() != 0. func (vseg vmaIterator) mappableRangeOf(ar hostarch.AddrRange) memmap.MappableRange { if checkInvariants { if !vseg.Ok() { panic("terminal vma iterator") } if vseg.ValuePtr().mappable == nil { panic("MappableRange is meaningless for anonymous vma") } if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !vseg.Range().IsSupersetOf(ar) { panic(fmt.Sprintf("ar %v out of bounds %v", ar, vseg.Range())) } } vma := vseg.ValuePtr() vstart := vseg.Start() return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)} } // Preconditions: // * vseg.ValuePtr().mappable != nil. // * vseg.mappableRange().IsSupersetOf(mr). // * mr.Length() != 0. func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) hostarch.AddrRange { if checkInvariants { if !vseg.Ok() { panic("terminal vma iterator") } if vseg.ValuePtr().mappable == nil { panic("MappableRange is meaningless for anonymous vma") } if !mr.WellFormed() || mr.Length() == 0 { panic(fmt.Sprintf("invalid mr: %v", mr)) } if !vseg.mappableRange().IsSupersetOf(mr) { panic(fmt.Sprintf("mr %v out of bounds %v", mr, vseg.mappableRange())) } } vma := vseg.ValuePtr() vstart := vseg.Start() return hostarch.AddrRange{vstart + hostarch.Addr(mr.Start-vma.off), vstart + hostarch.Addr(mr.End-vma.off)} } // seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by // scanning linearly forward from vseg. // // Preconditions: // * mm.mappingMu must be locked. // * addr >= vseg.Start(). func (vseg vmaIterator) seekNextLowerBound(addr hostarch.Addr) vmaIterator { if checkInvariants { if !vseg.Ok() { panic("terminal vma iterator") } if addr < vseg.Start() { panic(fmt.Sprintf("can't seek forward to %#x from %#x", addr, vseg.Start())) } } for vseg.Ok() && addr >= vseg.End() { vseg = vseg.NextSegment() } return vseg } // availableRange returns the subset of vgap.Range() in which new vmas may be // created without MMapOpts.Unmap == true. func (vgap vmaGapIterator) availableRange() hostarch.AddrRange { ar := vgap.Range() next := vgap.NextSegment() if !next.Ok() || !next.ValuePtr().growsDown { return ar } // Exclude guard pages. if ar.Length() < guardBytes { return hostarch.AddrRange{ar.Start, ar.Start} } ar.End -= guardBytes return ar }