// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/safecopy"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/syserror"
)

// existingPMAsLocked checks that pmas exist for all addresses in ar, and
// support access of type (at, ignorePermissions). If so, it returns an
// iterator to the pma containing ar.Start. Otherwise it returns a terminal
// iterator.
//
// Preconditions:
// * mm.activeMu must be locked.
// * ar.Length() != 0.
func (mm *MemoryManager) existingPMAsLocked(ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	first := mm.pmas.FindSegment(ar.Start)
	pseg := first
	for pseg.Ok() {
		pma := pseg.ValuePtr()
		perms := pma.effectivePerms
		if ignorePermissions {
			perms = pma.maxPerms
		}
		if !perms.SupersetOf(at) {
			return pmaIterator{}
		}
		if needInternalMappings && pma.internalMappings.IsEmpty() {
			return pmaIterator{}
		}

		if ar.End <= pseg.End() {
			return first
		}
		pseg, _ = pseg.NextNonEmpty()
	}

	// Ran out of pmas before reaching ar.End.
	return pmaIterator{}
}

// existingVecPMAsLocked returns true if pmas exist for all addresses in ars,
// and support access of type (at, ignorePermissions).
//
// Preconditions: mm.activeMu must be locked.
func (mm *MemoryManager) existingVecPMAsLocked(ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) bool {
	for ; !ars.IsEmpty(); ars = ars.Tail() {
		if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, needInternalMappings).Ok() {
			return false
		}
	}
	return true
}

// getPMAsLocked ensures that pmas exist for all addresses in ar, and support
// access of type at. It returns:
//
// - An iterator to the pma containing ar.Start. If no pma contains ar.Start,
// the iterator is unspecified.
//
// - An iterator to the gap after the last pma containing an address in ar. If
// pmas exist for no addresses in ar, the iterator is to a gap that begins
// before ar.Start.
//
// - An error that is non-nil if pmas exist for only a subset of ar.
//
// Preconditions:
// * mm.mappingMu must be locked.
// * mm.activeMu must be locked for writing.
// * ar.Length() != 0.
// * vseg.Range().Contains(ar.Start).
// * vmas must exist for all addresses in ar, and support accesses of type at
//   (i.e. permission checks must have been performed against vmas).
func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (pmaIterator, pmaGapIterator, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !vseg.Ok() {
			panic("terminal vma iterator")
		}
		if !vseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
		}
	}

	// Page-align ar so that all AddrRanges are aligned.
	end, ok := ar.End.RoundUp()
	var alignerr error
	if !ok {
		end = ar.End.RoundDown()
		alignerr = syserror.EFAULT
	}
	ar = hostarch.AddrRange{ar.Start.RoundDown(), end}

	pstart, pend, perr := mm.getPMAsInternalLocked(ctx, vseg, ar, at)
	if pend.Start() <= ar.Start {
		return pmaIterator{}, pend, perr
	}
	// getPMAsInternalLocked may not have returned pstart due to iterator
	// invalidation.
	if !pstart.Ok() {
		pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend)
	}
	if perr != nil {
		return pstart, pend, perr
	}
	return pstart, pend, alignerr
}

// getVecPMAsLocked ensures that pmas exist for all addresses in ars, and
// support access of type at. It returns the subset of ars for which pmas
// exist. If this is not equal to ars, it returns a non-nil error explaining
// why.
//
// Preconditions:
// * mm.mappingMu must be locked.
// * mm.activeMu must be locked for writing.
// * vmas must exist for all addresses in ars, and support accesses of type at
//   (i.e. permission checks must have been performed against vmas).
func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType) (hostarch.AddrRangeSeq, error) {
	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
		ar := arsit.Head()
		if ar.Length() == 0 {
			continue
		}
		if checkInvariants {
			if !ar.WellFormed() {
				panic(fmt.Sprintf("invalid ar: %v", ar))
			}
		}

		// Page-align ar so that all AddrRanges are aligned.
		end, ok := ar.End.RoundUp()
		var alignerr error
		if !ok {
			end = ar.End.RoundDown()
			alignerr = syserror.EFAULT
		}
		ar = hostarch.AddrRange{ar.Start.RoundDown(), end}

		_, pend, perr := mm.getPMAsInternalLocked(ctx, mm.vmas.FindSegment(ar.Start), ar, at)
		if perr != nil {
			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr
		}
		if alignerr != nil {
			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr
		}
	}

	return ars, nil
}

// getPMAsInternalLocked is equivalent to getPMAsLocked, with the following
// exceptions:
//
// - getPMAsInternalLocked returns a pmaIterator on a best-effort basis (that
// is, the returned iterator may be terminal, even if a pma that contains
// ar.Start exists). Returning this iterator on a best-effort basis allows
// callers that require it to use it when it's cheaply available, while also
// avoiding the overhead of retrieving it when it's not.
//
// - getPMAsInternalLocked additionally requires that ar is page-aligned.
//
// getPMAsInternalLocked is an implementation helper for getPMAsLocked and
// getVecPMAsLocked; other clients should call one of those instead.
func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (pmaIterator, pmaGapIterator, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !vseg.Ok() {
			panic("terminal vma iterator")
		}
		if !vseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar))
		}
	}

	mf := mm.mfp.MemoryFile()
	// Limit the range we allocate to ar, aligned to privateAllocUnit.
	maskAR := privateAligned(ar)
	didUnmapAS := false
	// The range in which we iterate vmas and pmas is still limited to ar, to
	// ensure that we don't allocate or COW-break a pma we don't need.
	pseg, pgap := mm.pmas.Find(ar.Start)
	pstart := pseg
	for {
		// Get pmas for this vma.
		vsegAR := vseg.Range().Intersect(ar)
		vma := vseg.ValuePtr()
	pmaLoop:
		for {
			switch {
			case pgap.Ok() && pgap.Start() < vsegAR.End:
				// Need a pma here.
				optAR := vseg.Range().Intersect(pgap.Range())
				if checkInvariants {
					if optAR.Length() == 0 {
						panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap))
					}
				}
				if vma.mappable == nil {
					// Private anonymous mappings get pmas by allocating.
					allocAR := optAR.Intersect(maskAR)
					fr, err := mf.Allocate(uint64(allocAR.Length()), usage.Anonymous)
					if err != nil {
						return pstart, pgap, err
					}
					if checkInvariants {
						if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) {
							panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr))
						}
					}
					mm.addRSSLocked(allocAR)
					mm.incPrivateRef(fr)
					mf.IncRef(fr)
					pseg, pgap = mm.pmas.Insert(pgap, allocAR, pma{
						file:           mf,
						off:            fr.Start,
						translatePerms: hostarch.AnyAccess,
						effectivePerms: vma.effectivePerms,
						maxPerms:       vma.maxPerms,
						// Since we just allocated this memory and have the
						// only reference, the new pma does not need
						// copy-on-write.
						private: true,
					}).NextNonEmpty()
					pstart = pmaIterator{} // iterators invalidated
				} else {
					// Other mappings get pmas by translating.
					optMR := vseg.mappableRangeOf(optAR)
					reqAR := optAR.Intersect(ar)
					reqMR := vseg.mappableRangeOf(reqAR)
					perms := at
					if vma.private {
						// This pma will be copy-on-write; don't require write
						// permission, but do require read permission to
						// facilitate the copy.
						//
						// If at.Write is true, we will need to break
						// copy-on-write immediately, which occurs after
						// translation below.
						perms.Read = true
						perms.Write = false
					}
					ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
					if checkInvariants {
						if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
							panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
						}
					}
					// Install a pma for each translation.
					if len(ts) == 0 {
						return pstart, pgap, err
					}
					pstart = pmaIterator{} // iterators invalidated
					for _, t := range ts {
						newpmaAR := vseg.addrRangeOf(t.Source)
						newpma := pma{
							file:           t.File,
							off:            t.Offset,
							translatePerms: t.Perms,
							effectivePerms: vma.effectivePerms.Intersect(t.Perms),
							maxPerms:       vma.maxPerms.Intersect(t.Perms),
						}
						if vma.private {
							newpma.effectivePerms.Write = false
							newpma.maxPerms.Write = false
							newpma.needCOW = true
						}
						mm.addRSSLocked(newpmaAR)
						t.File.IncRef(t.FileRange())
						// This is valid because memmap.Mappable.Translate is
						// required to return Translations in increasing
						// Translation.Source order.
						pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
						pgap = pseg.NextGap()
					}
					// The error returned by Translate is only significant if
					// it occurred before ar.End.
					if err != nil && vseg.addrRangeOf(ts[len(ts)-1].Source).End < ar.End {
						return pstart, pgap, err
					}
					// Rewind pseg to the first pma inserted and continue the
					// loop to check if we need to break copy-on-write.
					pseg, pgap = mm.findOrSeekPrevUpperBoundPMA(vseg.addrRangeOf(ts[0].Source).Start, pgap), pmaGapIterator{}
					continue
				}

			case pseg.Ok() && pseg.Start() < vsegAR.End:
				oldpma := pseg.ValuePtr()
				if at.Write && mm.isPMACopyOnWriteLocked(vseg, pseg) {
					// Break copy-on-write by copying.
					if checkInvariants {
						if !oldpma.maxPerms.Read {
							panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma))
						}
					}
					// The majority of copy-on-write breaks on executable pages
					// come from:
					//
					// - The ELF loader, which must zero out bytes on the last
					// page of each segment after the end of the segment.
					//
					// - gdb's use of ptrace to insert breakpoints.
					//
					// Neither of these cases has enough spatial locality to
					// benefit from copying nearby pages, so if the vma is
					// executable, only copy the pages required.
					var copyAR hostarch.AddrRange
					if vseg.ValuePtr().effectivePerms.Execute {
						copyAR = pseg.Range().Intersect(ar)
					} else {
						copyAR = pseg.Range().Intersect(maskAR)
					}
					// Get internal mappings from the pma to copy from.
					if err := pseg.getInternalMappingsLocked(); err != nil {
						return pstart, pseg.PrevGap(), err
					}
					// Copy contents.
					fr, err := mf.AllocateAndFill(uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)})
					if _, ok := err.(safecopy.BusError); ok {
						// If we got SIGBUS during the copy, deliver SIGBUS to
						// userspace (instead of SIGSEGV) if we're breaking
						// copy-on-write due to application page fault.
						err = &memmap.BusError{err}
					}
					if fr.Length() == 0 {
						return pstart, pseg.PrevGap(), err
					}
					// Unmap all of maskAR, not just copyAR, to minimize host
					// syscalls. AddressSpace mappings must be removed before
					// mm.decPrivateRef().
					if !didUnmapAS {
						mm.unmapASLocked(maskAR)
						didUnmapAS = true
					}
					// Replace the pma with a copy in the part of the address
					// range where copying was successful. This doesn't change
					// RSS.
					copyAR.End = copyAR.Start + hostarch.Addr(fr.Length())
					if copyAR != pseg.Range() {
						pseg = mm.pmas.Isolate(pseg, copyAR)
						pstart = pmaIterator{} // iterators invalidated
					}
					oldpma = pseg.ValuePtr()
					if oldpma.private {
						mm.decPrivateRef(pseg.fileRange())
					}
					oldpma.file.DecRef(pseg.fileRange())
					mm.incPrivateRef(fr)
					mf.IncRef(fr)
					oldpma.file = mf
					oldpma.off = fr.Start
					oldpma.translatePerms = hostarch.AnyAccess
					oldpma.effectivePerms = vma.effectivePerms
					oldpma.maxPerms = vma.maxPerms
					oldpma.needCOW = false
					oldpma.private = true
					oldpma.internalMappings = safemem.BlockSeq{}
					// Try to merge the pma with its neighbors.
					if prev := pseg.PrevSegment(); prev.Ok() {
						if merged := mm.pmas.Merge(prev, pseg); merged.Ok() {
							pseg = merged
							pstart = pmaIterator{} // iterators invalidated
						}
					}
					if next := pseg.NextSegment(); next.Ok() {
						if merged := mm.pmas.Merge(pseg, next); merged.Ok() {
							pseg = merged
							pstart = pmaIterator{} // iterators invalidated
						}
					}
					// The error returned by AllocateAndFill is only
					// significant if it occurred before ar.End.
					if err != nil && pseg.End() < ar.End {
						return pstart, pseg.NextGap(), err
					}
					// Ensure pseg and pgap are correct for the next iteration
					// of the loop.
					pseg, pgap = pseg.NextNonEmpty()
				} else if !oldpma.translatePerms.SupersetOf(at) {
					// Get new pmas (with sufficient permissions) by calling
					// memmap.Mappable.Translate again.
					if checkInvariants {
						if oldpma.private {
							panic(fmt.Sprintf("private pma %v has non-maximal pma.translatePerms: %v", pseg.Range(), oldpma))
						}
					}
					// Allow the entire pma to be replaced.
					optAR := pseg.Range()
					optMR := vseg.mappableRangeOf(optAR)
					reqAR := optAR.Intersect(ar)
					reqMR := vseg.mappableRangeOf(reqAR)
					perms := oldpma.translatePerms.Union(at)
					ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms)
					if checkInvariants {
						if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil {
							panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err))
						}
					}
					// Remove the part of the existing pma covered by new
					// Translations, then insert new pmas. This doesn't change
					// RSS. Note that we don't need to call unmapASLocked: any
					// existing AddressSpace mappings are still valid (though
					// less permissive than the new pmas indicate) until
					// Invalidate is called, and will be replaced by future
					// calls to mapASLocked.
					if len(ts) == 0 {
						return pstart, pseg.PrevGap(), err
					}
					transMR := memmap.MappableRange{ts[0].Source.Start, ts[len(ts)-1].Source.End}
					transAR := vseg.addrRangeOf(transMR)
					pseg = mm.pmas.Isolate(pseg, transAR)
					pseg.ValuePtr().file.DecRef(pseg.fileRange())
					pgap = mm.pmas.Remove(pseg)
					pstart = pmaIterator{} // iterators invalidated
					for _, t := range ts {
						newpmaAR := vseg.addrRangeOf(t.Source)
						newpma := pma{
							file:           t.File,
							off:            t.Offset,
							translatePerms: t.Perms,
							effectivePerms: vma.effectivePerms.Intersect(t.Perms),
							maxPerms:       vma.maxPerms.Intersect(t.Perms),
						}
						if vma.private {
							newpma.effectivePerms.Write = false
							newpma.maxPerms.Write = false
							newpma.needCOW = true
						}
						t.File.IncRef(t.FileRange())
						pseg = mm.pmas.Insert(pgap, newpmaAR, newpma)
						pgap = pseg.NextGap()
					}
					// The error returned by Translate is only significant if
					// it occurred before ar.End.
					if err != nil && pseg.End() < ar.End {
						return pstart, pgap, err
					}
					// Ensure pseg and pgap are correct for the next iteration
					// of the loop.
					if pgap.Range().Length() == 0 {
						pseg, pgap = pgap.NextSegment(), pmaGapIterator{}
					} else {
						pseg = pmaIterator{}
					}
				} else {
					// We have a usable pma; continue.
					pseg, pgap = pseg.NextNonEmpty()
				}

			default:
				break pmaLoop
			}
		}
		// Go to the next vma.
		if ar.End <= vseg.End() {
			if pgap.Ok() {
				return pstart, pgap, nil
			}
			return pstart, pseg.PrevGap(), nil
		}
		vseg = vseg.NextSegment()
	}
}

const (
	// When memory is allocated for a private pma, align the allocated address
	// range to a privateAllocUnit boundary when possible. Larger values of
	// privateAllocUnit may reduce page faults by allowing fewer, larger pmas
	// to be mapped, but may result in larger amounts of wasted memory in the
	// presence of fragmentation. privateAllocUnit must be a power-of-2
	// multiple of hostarch.PageSize.
	privateAllocUnit = hostarch.HugePageSize

	privateAllocMask = privateAllocUnit - 1
)

func privateAligned(ar hostarch.AddrRange) hostarch.AddrRange {
	aligned := hostarch.AddrRange{ar.Start &^ privateAllocMask, ar.End}
	if end := (ar.End + privateAllocMask) &^ privateAllocMask; end >= ar.End {
		aligned.End = end
	}
	if checkInvariants {
		if !aligned.IsSupersetOf(ar) {
			panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar))
		}
	}
	return aligned
}

// isPMACopyOnWriteLocked returns true if the contents of the pma represented
// by pseg must be copied to a new private pma to be written to.
//
// If the pma is a copy-on-write private pma, and holds the only reference on
// the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory
// and update the pma to indicate that it does not require copy-on-write.
//
// Preconditions:
// * vseg.Range().IsSupersetOf(pseg.Range()).
// * mm.mappingMu must be locked.
// * mm.activeMu must be locked for writing.
func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool {
	pma := pseg.ValuePtr()
	if !pma.needCOW {
		return false
	}
	if !pma.private {
		return true
	}
	// If we have the only reference on private memory to be copied, just take
	// ownership of it instead of copying. If we do hold the only reference,
	// additional references can only be taken by mm.Fork(), which is excluded
	// by mm.activeMu, so this isn't racy.
	mm.privateRefs.mu.Lock()
	defer mm.privateRefs.mu.Unlock()
	fr := pseg.fileRange()
	// This check relies on mm.privateRefs.refs being kept fully merged.
	rseg := mm.privateRefs.refs.FindSegment(fr.Start)
	if rseg.Ok() && rseg.Value() == 1 && fr.End <= rseg.End() {
		pma.needCOW = false
		// pma.private => pma.translatePerms == hostarch.AnyAccess
		vma := vseg.ValuePtr()
		pma.effectivePerms = vma.effectivePerms
		pma.maxPerms = vma.maxPerms
		return false
	}
	return true
}

// Invalidate implements memmap.MappingSpace.Invalidate.
func (mm *MemoryManager) Invalidate(ar hostarch.AddrRange, opts memmap.InvalidateOpts) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	mm.activeMu.Lock()
	defer mm.activeMu.Unlock()
	if mm.captureInvalidations {
		mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts})
		return
	}
	mm.invalidateLocked(ar, opts.InvalidatePrivate, true)
}

// invalidateLocked removes pmas and AddressSpace mappings of those pmas for
// addresses in ar.
//
// Preconditions:
// * mm.activeMu must be locked for writing.
// * ar.Length() != 0.
// * ar must be page-aligned.
func (mm *MemoryManager) invalidateLocked(ar hostarch.AddrRange, invalidatePrivate, invalidateShared bool) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	var didUnmapAS bool
	pseg := mm.pmas.LowerBoundSegment(ar.Start)
	for pseg.Ok() && pseg.Start() < ar.End {
		pma := pseg.ValuePtr()
		if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) {
			pseg = mm.pmas.Isolate(pseg, ar)
			pma = pseg.ValuePtr()
			if !didUnmapAS {
				// Unmap all of ar, not just pseg.Range(), to minimize host
				// syscalls. AddressSpace mappings must be removed before
				// mm.decPrivateRef().
				mm.unmapASLocked(ar)
				didUnmapAS = true
			}
			if pma.private {
				mm.decPrivateRef(pseg.fileRange())
			}
			mm.removeRSSLocked(pseg.Range())
			pma.file.DecRef(pseg.fileRange())
			pseg = mm.pmas.Remove(pseg).NextSegment()
		} else {
			pseg = pseg.NextSegment()
		}
	}
}

// Pin returns the memmap.File ranges currently mapped by addresses in ar in
// mm, acquiring a reference on the returned ranges which the caller must
// release by calling Unpin. If not all addresses are mapped, Pin returns a
// non-nil error. Note that Pin may return both a non-empty slice of
// PinnedRanges and a non-nil error.
//
// Pin does not prevent mapped ranges from changing, making it unsuitable for
// most I/O. It should only be used in contexts that would use get_user_pages()
// in the Linux kernel.
//
// Preconditions:
// * ar.Length() != 0.
// * ar must be page-aligned.
func (mm *MemoryManager) Pin(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool) ([]PinnedRange, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	// Ensure that we have usable vmas.
	mm.mappingMu.RLock()
	vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
	if vendaddr := vend.Start(); vendaddr < ar.End {
		if vendaddr <= ar.Start {
			mm.mappingMu.RUnlock()
			return nil, verr
		}
		ar.End = vendaddr
	}

	// Ensure that we have usable pmas.
	mm.activeMu.Lock()
	pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
	mm.mappingMu.RUnlock()
	if pendaddr := pend.Start(); pendaddr < ar.End {
		if pendaddr <= ar.Start {
			mm.activeMu.Unlock()
			return nil, perr
		}
		ar.End = pendaddr
	}

	// Gather pmas.
	var prs []PinnedRange
	for pseg.Ok() && pseg.Start() < ar.End {
		psar := pseg.Range().Intersect(ar)
		f := pseg.ValuePtr().file
		fr := pseg.fileRangeOf(psar)
		f.IncRef(fr)
		prs = append(prs, PinnedRange{
			Source: psar,
			File:   f,
			Offset: fr.Start,
		})
		pseg = pseg.NextSegment()
	}
	mm.activeMu.Unlock()

	// Return the first error in order of progress through ar.
	if perr != nil {
		return prs, perr
	}
	return prs, verr
}

// PinnedRanges are returned by MemoryManager.Pin.
type PinnedRange struct {
	// Source is the corresponding range of addresses.
	Source hostarch.AddrRange

	// File is the mapped file.
	File memmap.File

	// Offset is the offset into File at which this PinnedRange begins.
	Offset uint64
}

// FileRange returns the memmap.File offsets mapped by pr.
func (pr PinnedRange) FileRange() memmap.FileRange {
	return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())}
}

// Unpin releases the reference held by prs.
func Unpin(prs []PinnedRange) {
	for i := range prs {
		prs[i].File.DecRef(prs[i].FileRange())
	}
}

// movePMAsLocked moves all pmas in oldAR to newAR.
//
// Preconditions:
// * mm.activeMu must be locked for writing.
// * oldAR.Length() != 0.
// * oldAR.Length() <= newAR.Length().
// * !oldAR.Overlaps(newAR).
// * mm.pmas.IsEmptyRange(newAR).
// * oldAR and newAR must be page-aligned.
func (mm *MemoryManager) movePMAsLocked(oldAR, newAR hostarch.AddrRange) {
	if checkInvariants {
		if !oldAR.WellFormed() || oldAR.Length() == 0 || !oldAR.IsPageAligned() {
			panic(fmt.Sprintf("invalid oldAR: %v", oldAR))
		}
		if !newAR.WellFormed() || newAR.Length() == 0 || !newAR.IsPageAligned() {
			panic(fmt.Sprintf("invalid newAR: %v", newAR))
		}
		if oldAR.Length() > newAR.Length() {
			panic(fmt.Sprintf("old address range %v may contain pmas that will not fit in new address range %v", oldAR, newAR))
		}
		if oldAR.Overlaps(newAR) {
			panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR))
		}
		// mm.pmas.IsEmptyRange is checked by mm.pmas.Insert.
	}

	type movedPMA struct {
		oldAR hostarch.AddrRange
		pma   pma
	}
	var movedPMAs []movedPMA
	pseg := mm.pmas.LowerBoundSegment(oldAR.Start)
	for pseg.Ok() && pseg.Start() < oldAR.End {
		pseg = mm.pmas.Isolate(pseg, oldAR)
		movedPMAs = append(movedPMAs, movedPMA{
			oldAR: pseg.Range(),
			pma:   pseg.Value(),
		})
		pseg = mm.pmas.Remove(pseg).NextSegment()
		// No RSS change is needed since we're re-inserting the same pmas
		// below.
	}

	off := newAR.Start - oldAR.Start
	pgap := mm.pmas.FindGap(newAR.Start)
	for i := range movedPMAs {
		mpma := &movedPMAs[i]
		pmaNewAR := hostarch.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off}
		pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap()
	}

	mm.unmapASLocked(oldAR)
}

// getPMAInternalMappingsLocked ensures that pmas for all addresses in ar have
// cached internal mappings. It returns:
//
// - An iterator to the gap after the last pma with internal mappings
// containing an address in ar. If internal mappings exist for no addresses in
// ar, the iterator is to a gap that begins before ar.Start.
//
// - An error that is non-nil if internal mappings exist for only a subset of
// ar.
//
// Preconditions:
// * mm.activeMu must be locked for writing.
// * pseg.Range().Contains(ar.Start).
// * pmas must exist for all addresses in ar.
// * ar.Length() != 0.
//
// Postconditions: getPMAInternalMappingsLocked does not invalidate iterators
// into mm.pmas.
func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange) (pmaGapIterator, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !pseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
		}
	}

	for {
		if err := pseg.getInternalMappingsLocked(); err != nil {
			return pseg.PrevGap(), err
		}
		if ar.End <= pseg.End() {
			return pseg.NextGap(), nil
		}
		pseg, _ = pseg.NextNonEmpty()
	}
}

// getVecPMAInternalMappingsLocked ensures that pmas for all addresses in ars
// have cached internal mappings. It returns the subset of ars for which
// internal mappings exist. If this is not equal to ars, it returns a non-nil
// error explaining why.
//
// Preconditions:
// * mm.activeMu must be locked for writing.
// * pmas must exist for all addresses in ar.
//
// Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators
// into mm.pmas.
func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars hostarch.AddrRangeSeq) (hostarch.AddrRangeSeq, error) {
	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
		ar := arsit.Head()
		if ar.Length() == 0 {
			continue
		}
		if pend, err := mm.getPMAInternalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); err != nil {
			return truncatedAddrRangeSeq(ars, arsit, pend.Start()), err
		}
	}
	return ars, nil
}

// internalMappingsLocked returns internal mappings for addresses in ar.
//
// Preconditions:
// * mm.activeMu must be locked.
// * Internal mappings must have been previously established for all addresses
//   in ar.
// * ar.Length() != 0.
// * pseg.Range().Contains(ar.Start).
func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange) safemem.BlockSeq {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !pseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
		}
	}

	if ar.End <= pseg.End() {
		// Since only one pma is involved, we can use pma.internalMappings
		// directly, avoiding a slice allocation.
		offset := uint64(ar.Start - pseg.Start())
		return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length()))
	}

	var ims []safemem.Block
	for {
		pr := pseg.Range().Intersect(ar)
		for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() {
			ims = append(ims, pims.Head())
		}
		if ar.End <= pseg.End() {
			break
		}
		pseg = pseg.NextSegment()
	}
	return safemem.BlockSeqFromSlice(ims)
}

// vecInternalMappingsLocked returns internal mappings for addresses in ars.
//
// Preconditions:
// * mm.activeMu must be locked.
// * Internal mappings must have been previously established for all addresses
//   in ars.
func (mm *MemoryManager) vecInternalMappingsLocked(ars hostarch.AddrRangeSeq) safemem.BlockSeq {
	var ims []safemem.Block
	for ; !ars.IsEmpty(); ars = ars.Tail() {
		ar := ars.Head()
		if ar.Length() == 0 {
			continue
		}
		for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() {
			ims = append(ims, pims.Head())
		}
	}
	return safemem.BlockSeqFromSlice(ims)
}

// incPrivateRef acquires a reference on private pages in fr.
func (mm *MemoryManager) incPrivateRef(fr memmap.FileRange) {
	mm.privateRefs.mu.Lock()
	defer mm.privateRefs.mu.Unlock()
	refSet := &mm.privateRefs.refs
	seg, gap := refSet.Find(fr.Start)
	for {
		switch {
		case seg.Ok() && seg.Start() < fr.End:
			seg = refSet.Isolate(seg, fr)
			seg.SetValue(seg.Value() + 1)
			seg, gap = seg.NextNonEmpty()
		case gap.Ok() && gap.Start() < fr.End:
			seg, gap = refSet.InsertWithoutMerging(gap, gap.Range().Intersect(fr), 1).NextNonEmpty()
		default:
			refSet.MergeAdjacent(fr)
			return
		}
	}
}

// decPrivateRef releases a reference on private pages in fr.
func (mm *MemoryManager) decPrivateRef(fr memmap.FileRange) {
	var freed []memmap.FileRange

	mm.privateRefs.mu.Lock()
	refSet := &mm.privateRefs.refs
	seg := refSet.LowerBoundSegment(fr.Start)
	for seg.Ok() && seg.Start() < fr.End {
		seg = refSet.Isolate(seg, fr)
		if old := seg.Value(); old == 1 {
			freed = append(freed, seg.Range())
			seg = refSet.Remove(seg).NextSegment()
		} else {
			seg.SetValue(old - 1)
			seg = seg.NextSegment()
		}
	}
	refSet.MergeAdjacent(fr)
	mm.privateRefs.mu.Unlock()

	mf := mm.mfp.MemoryFile()
	for _, fr := range freed {
		mf.DecRef(fr)
	}
}

// addRSSLocked updates the current and maximum resident set size of a
// MemoryManager to reflect the insertion of a pma at ar.
//
// Preconditions: mm.activeMu must be locked for writing.
func (mm *MemoryManager) addRSSLocked(ar hostarch.AddrRange) {
	mm.curRSS += uint64(ar.Length())
	if mm.curRSS > mm.maxRSS {
		mm.maxRSS = mm.curRSS
	}
}

// removeRSSLocked updates the current resident set size of a MemoryManager to
// reflect the removal of a pma at ar.
//
// Preconditions: mm.activeMu must be locked for writing.
func (mm *MemoryManager) removeRSSLocked(ar hostarch.AddrRange) {
	mm.curRSS -= uint64(ar.Length())
}

// pmaSetFunctions implements segment.Functions for pmaSet.
type pmaSetFunctions struct{}

func (pmaSetFunctions) MinKey() hostarch.Addr {
	return 0
}

func (pmaSetFunctions) MaxKey() hostarch.Addr {
	return ^hostarch.Addr(0)
}

func (pmaSetFunctions) ClearValue(pma *pma) {
	pma.file = nil
	pma.internalMappings = safemem.BlockSeq{}
}

func (pmaSetFunctions) Merge(ar1 hostarch.AddrRange, pma1 pma, ar2 hostarch.AddrRange, pma2 pma) (pma, bool) {
	if pma1.file != pma2.file ||
		pma1.off+uint64(ar1.Length()) != pma2.off ||
		pma1.translatePerms != pma2.translatePerms ||
		pma1.effectivePerms != pma2.effectivePerms ||
		pma1.maxPerms != pma2.maxPerms ||
		pma1.needCOW != pma2.needCOW ||
		pma1.private != pma2.private {
		return pma{}, false
	}

	// Discard internal mappings instead of trying to merge them, since merging
	// them requires an allocation and getting them again from the
	// memmap.File might not.
	pma1.internalMappings = safemem.BlockSeq{}
	return pma1, true
}

func (pmaSetFunctions) Split(ar hostarch.AddrRange, p pma, split hostarch.Addr) (pma, pma) {
	newlen1 := uint64(split - ar.Start)
	p2 := p
	p2.off += newlen1
	if !p.internalMappings.IsEmpty() {
		p.internalMappings = p.internalMappings.TakeFirst64(newlen1)
		p2.internalMappings = p2.internalMappings.DropFirst64(newlen1)
	}
	return p, p2
}

// findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do
// so by scanning linearly backward from pgap.
//
// Preconditions:
// * mm.activeMu must be locked.
// * addr <= pgap.Start().
func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr hostarch.Addr, pgap pmaGapIterator) pmaIterator {
	if checkInvariants {
		if !pgap.Ok() {
			panic("terminal pma iterator")
		}
		if addr > pgap.Start() {
			panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start()))
		}
	}
	// Optimistically check if pgap.PrevSegment() is the PMA we're looking for,
	// which is the case if findOrSeekPrevUpperBoundPMA is called to find the
	// start of a range containing only a single PMA.
	if pseg := pgap.PrevSegment(); pseg.Start() <= addr {
		return pseg
	}
	return mm.pmas.UpperBoundSegment(addr)
}

// getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is
// non-empty.
//
// Preconditions: mm.activeMu must be locked for writing.
func (pseg pmaIterator) getInternalMappingsLocked() error {
	pma := pseg.ValuePtr()
	if pma.internalMappings.IsEmpty() {
		// This must use maxPerms (instead of perms) because some permission
		// constraints are only visible to vmas; for example, mappings of
		// read-only files have vma.maxPerms.Write unset, but this may not be
		// visible to the memmap.Mappable.
		perms := pma.maxPerms
		// We will never execute application code through an internal mapping.
		perms.Execute = false
		ims, err := pma.file.MapInternal(pseg.fileRange(), perms)
		if err != nil {
			return err
		}
		pma.internalMappings = ims
	}
	return nil
}

func (pseg pmaIterator) fileRange() memmap.FileRange {
	return pseg.fileRangeOf(pseg.Range())
}

// Preconditions:
// * pseg.Range().IsSupersetOf(ar).
// * ar.Length != 0.
func (pseg pmaIterator) fileRangeOf(ar hostarch.AddrRange) memmap.FileRange {
	if checkInvariants {
		if !pseg.Ok() {
			panic("terminal pma iterator")
		}
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !pseg.Range().IsSupersetOf(ar) {
			panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range()))
		}
	}

	pma := pseg.ValuePtr()
	pstart := pseg.Start()
	return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)}
}