Diffstat (limited to 'pkg/sentry/mm/vma.go')
-rw-r--r--  pkg/sentry/mm/vma.go | 568
1 file changed, 568 insertions, 0 deletions
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
new file mode 100644
index 000000000..16d8207e9
--- /dev/null
+++ b/pkg/sentry/mm/vma.go
@@ -0,0 +1,568 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Preconditions: mm.mappingMu must be locked for writing. opts must be valid
+// as defined by the checks in MMap.
+func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, usermem.AddrRange, error) {
+ if opts.MaxPerms != opts.MaxPerms.Effective() {
+ panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms))
+ }
+
+ // Find a usable range.
+ addr, err := mm.findAvailableLocked(opts.Length, findAvailableOpts{
+ Addr: opts.Addr,
+ Fixed: opts.Fixed,
+ Unmap: opts.Unmap,
+ Map32Bit: opts.Map32Bit,
+ })
+ if err != nil {
+ return vmaIterator{}, usermem.AddrRange{}, err
+ }
+ ar, _ := addr.ToRange(opts.Length)
+
+ // Check against RLIMIT_AS.
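+ // Illustrative note: when opts.Unmap is set, only the net increase is
+ // checked; e.g. a fixed mapping of 8 pages that replaces 3 pages of
+ // existing vmas adds 8 pages and then subtracts the 3 already-mapped
+ // pages below.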
+ newUsageAS := mm.usageAS + opts.Length
+ if opts.Unmap {
+ newUsageAS -= uint64(mm.vmas.SpanRange(ar))
+ }
+ if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS {
+ return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM
+ }
+
+ if opts.MLockMode != memmap.MLockNone {
+ // Check against RLIMIT_MEMLOCK.
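+ // Illustrative note: a task holding CAP_IPC_LOCK in the root user
+ // namespace skips this check; otherwise a zero limit fails with EPERM,
+ // and a mapping that would push lockedAS past the limit fails with
+ // EAGAIN.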
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
+ mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
+ if mlockLimit == 0 {
+ return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM
+ }
+ newLockedAS := mm.lockedAS + opts.Length
+ if opts.Unmap {
+ newLockedAS -= mm.mlockedBytesRangeLocked(ar)
+ }
+ if newLockedAS > mlockLimit {
+ return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN
+ }
+ }
+ }
+
+ // Remove overwritten mappings. This ordering is consistent with Linux:
+ // compare Linux's mm/mmap.c:mmap_region() => do_munmap(),
+ // file->f_op->mmap().
+ var vgap vmaGapIterator
+ if opts.Unmap {
+ vgap = mm.unmapLocked(ctx, ar)
+ } else {
+ vgap = mm.vmas.FindGap(ar.Start)
+ }
+
+ // Inform the Mappable, if any, of the new mapping.
+ if opts.Mappable != nil {
+ // The expression for writable is vma.canWriteMappableLocked(), but we
+ // don't yet have a vma.
+ if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset, !opts.Private && opts.MaxPerms.Write); err != nil {
+ return vmaIterator{}, usermem.AddrRange{}, err
+ }
+ }
+
+ // Take a reference on opts.MappingIdentity before inserting the vma since
+ // vma merging can drop the reference.
+ if opts.MappingIdentity != nil {
+ opts.MappingIdentity.IncRef()
+ }
+
+ // Finally insert the vma.
+ v := vma{
+ mappable: opts.Mappable,
+ off: opts.Offset,
+ realPerms: opts.Perms,
+ effectivePerms: opts.Perms.Effective(),
+ maxPerms: opts.MaxPerms,
+ private: opts.Private,
+ growsDown: opts.GrowsDown,
+ mlockMode: opts.MLockMode,
+ numaPolicy: linux.MPOL_DEFAULT,
+ id: opts.MappingIdentity,
+ hint: opts.Hint,
+ }
+
+ vseg := mm.vmas.Insert(vgap, ar, v)
+ mm.usageAS += opts.Length
+ if v.isPrivateDataLocked() {
+ mm.dataAS += opts.Length
+ }
+ if opts.MLockMode != memmap.MLockNone {
+ mm.lockedAS += opts.Length
+ }
+
+ return vseg, ar, nil
+}
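+
+// Illustrative, hypothetical caller sketch: a private anonymous mapping
+// could be created roughly as follows, assuming the argument validation
+// normally performed by MMap has already run:
+//
+//	mm.mappingMu.Lock()
+//	vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+//		Length:   uint64(usermem.PageSize),
+//		Private:  true,
+//		Perms:    usermem.ReadWrite,
+//		MaxPerms: usermem.AnyAccess,
+//	})
+//	mm.mappingMu.Unlock()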
+
+type findAvailableOpts struct {
+ // These fields are equivalent to those in memmap.MMapOpts, except that:
+ //
+ // - Addr must be page-aligned.
+ //
+ // - Unmap allows existing guard pages in the returned range.
+
+ Addr usermem.Addr
+ Fixed bool
+ Unmap bool
+ Map32Bit bool
+}
+
+// map32Start/End are the bounds to which MAP_32BIT mappings are constrained,
+// and are equivalent to Linux's MAP32_BASE and MAP32_MAX respectively.
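+// These bounds correspond to the address range [1 GiB, 2 GiB).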
+const (
+ map32Start = 0x40000000
+ map32End = 0x80000000
+)
+
+// findAvailableLocked finds an allocatable range.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOpts) (usermem.Addr, error) {
+ if opts.Fixed {
+ opts.Map32Bit = false
+ }
+ allowedAR := mm.applicationAddrRange()
+ if opts.Map32Bit {
+ allowedAR = allowedAR.Intersect(usermem.AddrRange{map32Start, map32End})
+ }
+
+ // Does the provided suggestion work?
+ if ar, ok := opts.Addr.ToRange(length); ok {
+ if allowedAR.IsSupersetOf(ar) {
+ if opts.Unmap {
+ return ar.Start, nil
+ }
+ // Check for the presence of an existing vma or guard page.
+ if vgap := mm.vmas.FindGap(ar.Start); vgap.Ok() && vgap.availableRange().IsSupersetOf(ar) {
+ return ar.Start, nil
+ }
+ }
+ }
+
+ // Fixed mappings accept only the requested address.
+ if opts.Fixed {
+ return 0, syserror.ENOMEM
+ }
+
+ // Prefer hugepage alignment if a hugepage or more is requested.
+ alignment := uint64(usermem.PageSize)
+ if length >= usermem.HugePageSize {
+ alignment = usermem.HugePageSize
+ }
+
+ if opts.Map32Bit {
+ return mm.findLowestAvailableLocked(length, alignment, allowedAR)
+ }
+ if mm.layout.DefaultDirection == arch.MmapBottomUp {
+ return mm.findLowestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.BottomUpBase, mm.layout.MaxAddr})
+ }
+ return mm.findHighestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.MinAddr, mm.layout.TopDownBase})
+}
+
+func (mm *MemoryManager) applicationAddrRange() usermem.AddrRange {
+ return usermem.AddrRange{mm.layout.MinAddr, mm.layout.MaxAddr}
+}
+
+// Preconditions: mm.mappingMu must be locked.
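+//
+// Illustrative example: if a gap's usable range starts at 0x201000 and
+// alignment is usermem.HugePageSize (2 MiB, 0x200000), then offset is
+// 0x1000 and the candidate address is shifted up by 0x1ff000 to 0x400000,
+// provided the gap still has room for length bytes after the shift.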
+func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) {
+ for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextLargeEnoughGap(usermem.Addr(length)) {
+ if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length {
+ // Can we shift up to match the alignment?
+ if offset := uint64(gr.Start) % alignment; offset != 0 {
+ if uint64(gr.Length()) >= length+alignment-offset {
+ // Yes; shift up to the next aligned address.
+ return gr.Start + usermem.Addr(alignment-offset), nil
+ }
+ }
+
+ // Either aligned perfectly, or can't align it.
+ return gr.Start, nil
+ }
+ }
+ return 0, syserror.ENOMEM
+}
+
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) {
+ for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevLargeEnoughGap(usermem.Addr(length)) {
+ if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length {
+ // Can we shift down to match the alignment?
+ start := gr.End - usermem.Addr(length)
+ if offset := uint64(start) % alignment; offset != 0 {
+ if gr.Start <= start-usermem.Addr(offset) {
+ // Yes; shift down to the previous aligned address.
+ return start - usermem.Addr(offset), nil
+ }
+ }
+
+ // Either aligned perfectly, or can't align it.
+ return start, nil
+ }
+ }
+ return 0, syserror.ENOMEM
+}
+
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 {
+ var total uint64
+ for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
+ if vseg.ValuePtr().mlockMode != memmap.MLockNone {
+ total += uint64(vseg.Range().Intersect(ar).Length())
+ }
+ }
+ return total
+}
+
+// getVMAsLocked ensures that vmas exist for all addresses in ar, and support
+// access of type (at, ignorePermissions). It returns:
+//
+// - An iterator to the vma containing ar.Start. If no vma contains ar.Start,
+// the iterator is unspecified.
+//
+// - An iterator to the gap after the last vma containing an address in ar. If
+// vmas exist for no addresses in ar, the iterator is to a gap that begins
+// before ar.Start.
+//
+// - An error that is non-nil if vmas exist for only a subset of ar.
+//
+// Preconditions: mm.mappingMu must be locked for reading; it may be
+// temporarily unlocked. ar.Length() != 0.
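+//
+// Illustrative example: if ar covers [A, C) but vmas exist only for [A, B)
+// (and permission checks pass), getVMAsLocked returns an iterator to the
+// vma containing A, an iterator to the gap beginning at B, and
+// syserror.EFAULT.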
+func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ // Inline mm.vmas.LowerBoundSegment so that we have the preceding gap if
+ // !vbegin.Ok().
+ vbegin, vgap := mm.vmas.Find(ar.Start)
+ if !vbegin.Ok() {
+ vbegin = vgap.NextSegment()
+ // vseg.Ok() is checked before entering the following loop.
+ } else {
+ vgap = vbegin.PrevGap()
+ }
+
+ addr := ar.Start
+ vseg := vbegin
+ for vseg.Ok() {
+ // Loop invariants: vgap = vseg.PrevGap(); addr < vseg.End().
+ vma := vseg.ValuePtr()
+ if addr < vseg.Start() {
+ // TODO(jamieliu): Implement vma.growsDown here.
+ return vbegin, vgap, syserror.EFAULT
+ }
+
+ perms := vma.effectivePerms
+ if ignorePermissions {
+ perms = vma.maxPerms
+ }
+ if !perms.SupersetOf(at) {
+ return vbegin, vgap, syserror.EPERM
+ }
+
+ addr = vseg.End()
+ vgap = vseg.NextGap()
+ if addr >= ar.End {
+ return vbegin, vgap, nil
+ }
+ vseg = vgap.NextSegment()
+ }
+
+ // Ran out of vmas before ar.End.
+ return vbegin, vgap, syserror.EFAULT
+}
+
+// getVecVMAsLocked ensures that vmas exist for all addresses in ars, and
+// support access of type (at, ignorePermissions). It returns the subset of
+// ars for which vmas exist. If this is not equal to ars, it returns a non-nil
+// error explaining why.
+//
+// Preconditions: mm.mappingMu must be locked for reading; it may be
+// temporarily unlocked.
+//
+// Postconditions: ars is not mutated.
+func (mm *MemoryManager) getVecVMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool) (usermem.AddrRangeSeq, error) {
+ for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
+ ar := arsit.Head()
+ if ar.Length() == 0 {
+ continue
+ }
+ if _, vend, err := mm.getVMAsLocked(ctx, ar, at, ignorePermissions); err != nil {
+ return truncatedAddrRangeSeq(ars, arsit, vend.Start()), err
+ }
+ }
+ return ars, nil
+}
+
+// vma extension will not shrink the number of unmapped bytes between the start
+// of a growsDown vma and the end of its predecessor non-growsDown vma below
+// guardBytes.
+//
+// guardBytes is equivalent to Linux's stack_guard_gap after upstream
+// 1be7107fbe18 "mm: larger stack guard gap, between vmas".
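+//
+// With the standard 4 KiB page size this is 1 MiB, matching Linux's default
+// stack_guard_gap of 256 pages.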
+const guardBytes = 256 * usermem.PageSize
+
+// unmapLocked unmaps all addresses in ar and returns the resulting gap in
+// mm.vmas.
+//
+// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0.
+// ar must be page-aligned.
+func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ // AddressSpace mappings and pmas must be invalidated before
+ // mm.removeVMAsLocked() => memmap.Mappable.RemoveMapping().
+ mm.Invalidate(ar, memmap.InvalidateOpts{InvalidatePrivate: true})
+ return mm.removeVMAsLocked(ctx, ar)
+}
+
+// removeVMAsLocked removes vmas for addresses in ar and returns the resulting
+// gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients
+// must do so before calling removeVMAsLocked.
+//
+// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. ar
+// must be page-aligned.
+func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator {
+ if checkInvariants {
+ if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ }
+
+ vseg, vgap := mm.vmas.Find(ar.Start)
+ if vgap.Ok() {
+ vseg = vgap.NextSegment()
+ }
+ for vseg.Ok() && vseg.Start() < ar.End {
+ vseg = mm.vmas.Isolate(vseg, ar)
+ vmaAR := vseg.Range()
+ vma := vseg.ValuePtr()
+ if vma.mappable != nil {
+ vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked())
+ }
+ if vma.id != nil {
+ vma.id.DecRef()
+ }
+ mm.usageAS -= uint64(vmaAR.Length())
+ if vma.isPrivateDataLocked() {
+ mm.dataAS -= uint64(vmaAR.Length())
+ }
+ if vma.mlockMode != memmap.MLockNone {
+ mm.lockedAS -= uint64(vmaAR.Length())
+ }
+ vgap = mm.vmas.Remove(vseg)
+ vseg = vgap.NextSegment()
+ }
+ return vgap
+}
+
+// canWriteMappableLocked returns true if it is possible for vma.mappable to be
+// written to via this vma, i.e. if it is possible that
+// vma.mappable.Translate(at.Write=true) may be called as a result of this vma.
+// This includes via I/O with usermem.IOOpts.IgnorePermissions = true, such as
+// PTRACE_POKEDATA.
+//
+// canWriteMappableLocked is equivalent to Linux's VM_SHARED.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (vma *vma) canWriteMappableLocked() bool {
+ return !vma.private && vma.maxPerms.Write
+}
+
+// isPrivateDataLocked returns true if the vma is a private data segment:
+// private, writable, and not growsDown (i.e. not a stack).
+//
+// Preconditions: mm.mappingMu must be locked.
+func (vma *vma) isPrivateDataLocked() bool {
+ return vma.realPerms.Write && vma.private && !vma.growsDown
+}
+
+// vmaSetFunctions implements segment.Functions for vmaSet.
+type vmaSetFunctions struct{}
+
+func (vmaSetFunctions) MinKey() usermem.Addr {
+ return 0
+}
+
+func (vmaSetFunctions) MaxKey() usermem.Addr {
+ return ^usermem.Addr(0)
+}
+
+func (vmaSetFunctions) ClearValue(vma *vma) {
+ vma.mappable = nil
+ vma.id = nil
+ vma.hint = ""
+}
+
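+// Merge coalesces two adjacent vmas only if all of their properties match
+// and, for file-backed vmas, their mappable offsets are contiguous
+// (vma1.off + uint64(ar1.Length()) == vma2.off). The merged vma keeps
+// vma1's values; vma2's MappingIdentity reference is dropped.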
+func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRange, vma2 vma) (vma, bool) {
+ if vma1.mappable != vma2.mappable ||
+ (vma1.mappable != nil && vma1.off+uint64(ar1.Length()) != vma2.off) ||
+ vma1.realPerms != vma2.realPerms ||
+ vma1.maxPerms != vma2.maxPerms ||
+ vma1.private != vma2.private ||
+ vma1.growsDown != vma2.growsDown ||
+ vma1.mlockMode != vma2.mlockMode ||
+ vma1.numaPolicy != vma2.numaPolicy ||
+ vma1.numaNodemask != vma2.numaNodemask ||
+ vma1.dontfork != vma2.dontfork ||
+ vma1.id != vma2.id ||
+ vma1.hint != vma2.hint {
+ return vma{}, false
+ }
+
+ if vma2.id != nil {
+ vma2.id.DecRef()
+ }
+ return vma1, true
+}
+
+func (vmaSetFunctions) Split(ar usermem.AddrRange, v vma, split usermem.Addr) (vma, vma) {
+ v2 := v
+ if v2.mappable != nil {
+ v2.off += uint64(split - ar.Start)
+ }
+ if v2.id != nil {
+ v2.id.IncRef()
+ }
+ return v, v2
+}
+
+// Preconditions: vseg.ValuePtr().mappable != nil. vseg.Range().Contains(addr).
+func (vseg vmaIterator) mappableOffsetAt(addr usermem.Addr) uint64 {
+ if checkInvariants {
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if vseg.ValuePtr().mappable == nil {
+ panic("Mappable offset is meaningless for anonymous vma")
+ }
+ if !vseg.Range().Contains(addr) {
+ panic(fmt.Sprintf("addr %v out of bounds %v", addr, vseg.Range()))
+ }
+ }
+
+ vma := vseg.ValuePtr()
+ vstart := vseg.Start()
+ return vma.off + uint64(addr-vstart)
+}
+
+// Preconditions: vseg.ValuePtr().mappable != nil.
+func (vseg vmaIterator) mappableRange() memmap.MappableRange {
+ return vseg.mappableRangeOf(vseg.Range())
+}
+
+// Preconditions: vseg.ValuePtr().mappable != nil.
+// vseg.Range().IsSupersetOf(ar). ar.Length() != 0.
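+//
+// Illustrative example: for a vma spanning [0x400000, 0x500000) with
+// off = 0x10000, the address range [0x410000, 0x420000) translates to the
+// mappable range [0x20000, 0x30000).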
+func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRange {
+ if checkInvariants {
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if vseg.ValuePtr().mappable == nil {
+ panic("MappableRange is meaningless for anonymous vma")
+ }
+ if !ar.WellFormed() || ar.Length() <= 0 {
+ panic(fmt.Sprintf("invalid ar: %v", ar))
+ }
+ if !vseg.Range().IsSupersetOf(ar) {
+ panic(fmt.Sprintf("ar %v out of bounds %v", ar, vseg.Range()))
+ }
+ }
+
+ vma := vseg.ValuePtr()
+ vstart := vseg.Start()
+ return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)}
+}
+
+// Preconditions: vseg.ValuePtr().mappable != nil.
+// vseg.mappableRange().IsSupersetOf(mr). mr.Length() != 0.
+func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange {
+ if checkInvariants {
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if vseg.ValuePtr().mappable == nil {
+ panic("MappableRange is meaningless for anonymous vma")
+ }
+ if !mr.WellFormed() || mr.Length() <= 0 {
+ panic(fmt.Sprintf("invalid mr: %v", mr))
+ }
+ if !vseg.mappableRange().IsSupersetOf(mr) {
+ panic(fmt.Sprintf("mr %v out of bounds %v", mr, vseg.mappableRange()))
+ }
+ }
+
+ vma := vseg.ValuePtr()
+ vstart := vseg.Start()
+ return usermem.AddrRange{vstart + usermem.Addr(mr.Start-vma.off), vstart + usermem.Addr(mr.End-vma.off)}
+}
+
+// seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by
+// scanning linearly forward from vseg.
+//
+// Preconditions: mm.mappingMu must be locked. addr >= vseg.Start().
+func (vseg vmaIterator) seekNextLowerBound(addr usermem.Addr) vmaIterator {
+ if checkInvariants {
+ if !vseg.Ok() {
+ panic("terminal vma iterator")
+ }
+ if addr < vseg.Start() {
+ panic(fmt.Sprintf("can't seek forward to %#x from %#x", addr, vseg.Start()))
+ }
+ }
+ for vseg.Ok() && addr >= vseg.End() {
+ vseg = vseg.NextSegment()
+ }
+ return vseg
+}
+
+// availableRange returns the subset of vgap.Range() in which new vmas may be
+// created without requiring MMapOpts.Unmap == true.
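+//
+// Illustrative example: a 4 MiB gap followed by a growsDown (stack) vma has
+// a 3 MiB available range, since the trailing guardBytes (1 MiB with 4 KiB
+// pages) are reserved; a gap smaller than guardBytes has an empty available
+// range.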
+func (vgap vmaGapIterator) availableRange() usermem.AddrRange {
+ ar := vgap.Range()
+ next := vgap.NextSegment()
+ if !next.Ok() || !next.ValuePtr().growsDown {
+ return ar
+ }
+ // Exclude guard pages.
+ if ar.Length() < guardBytes {
+ return usermem.AddrRange{ar.Start, ar.Start}
+ }
+ ar.End -= guardBytes
+ return ar
+}