Check in gVisor.

PiperOrigin-RevId: 194583126 Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
author: Googler <noreply@google.com> 2018-04-27 10:37:02 -0700
committer: Adin Scannell <ascannell@google.com> 2018-04-28 01:44:26 -0400
commit: d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree: 54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/mm/syscalls.go
parent: f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
1 files changed, 794 insertions, 0 deletions
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
new file mode 100644
index 000000000..0730be65b
--- /dev/null
+++ b/pkg/sentry/mm/syscalls.go
@@ -0,0 +1,794 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+	"fmt"
+	mrand "math/rand"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// HandleUserFault handles an application page fault. sp is the faulting
+// application thread's stack pointer.
+//
+// Preconditions: mm.as != nil.
+func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr usermem.Addr, at usermem.AccessType, sp usermem.Addr) error {
+	ar, ok := addr.RoundDown().ToRange(usermem.PageSize)
+	if !ok {
+		return syserror.EFAULT
+	}
+
+	// Don't bother trying existingPMAsLocked; in most cases, if we did have
+	// existing pmas, we wouldn't have faulted.
+
+	// Ensure that we have a usable vma. Here and below, since we are only
+	// asking for a single page, there is no possibility of partial success,
+	// and any error is immediately fatal.
+	mm.mappingMu.RLock()
+	vseg, _, err := mm.getVMAsLocked(ctx, ar, at, false)
+	if err != nil {
+		mm.mappingMu.RUnlock()
+		return err
+	}
+
+	// Ensure that we have a usable pma.
+	mm.activeMu.Lock()
+	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{
+		breakCOW: at.Write,
+	})
+	mm.mappingMu.RUnlock()
+	if err != nil {
+		mm.activeMu.Unlock()
+		return err
+	}
+
+	// Downgrade to a read-lock on activeMu since we don't need to mutate pmas
+	// anymore.
+	mm.activeMu.DowngradeLock()
+
+	// Map the faulted page into the active AddressSpace.
+	err = mm.mapASLocked(pseg, ar, false)
+	mm.activeMu.RUnlock()
+	return err
+}
+
+// MMap establishes a memory mapping.
+func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error) {
+	if opts.Length == 0 {
+		return 0, syserror.EINVAL
+	}
+	length, ok := usermem.Addr(opts.Length).RoundUp()
+	if !ok {
+		return 0, syserror.ENOMEM
+	}
+	opts.Length = uint64(length)
+
+	if opts.Mappable != nil {
+		// Offset must be aligned.
+		if usermem.Addr(opts.Offset).RoundDown() != usermem.Addr(opts.Offset) {
+			return 0, syserror.EINVAL
+		}
+		// Offset + length must not overflow.
+		if end := opts.Offset + opts.Length; end < opts.Offset {
+			return 0, syserror.ENOMEM
+		}
+	} else {
+		opts.Offset = 0
+		if !opts.Private {
+			if opts.MappingIdentity != nil {
+				return 0, syserror.EINVAL
+			}
+			m, err := NewSharedAnonMappable(opts.Length, platform.FromContext(ctx))
+			if err != nil {
+				return 0, err
+			}
+			opts.MappingIdentity = m
+			opts.Mappable = m
+		}
+	}
+
+	if opts.Addr.RoundDown() != opts.Addr {
+		// MAP_FIXED requires addr to be page-aligned; non-fixed mappings
+		// don't.
+		if opts.Fixed {
+			return 0, syserror.EINVAL
+		}
+		opts.Addr = opts.Addr.RoundDown()
+	}
+
+	if !opts.MaxPerms.SupersetOf(opts.Perms) {
+		return 0, syserror.EACCES
+	}
+	if opts.Unmap && !opts.Fixed {
+		return 0, syserror.EINVAL
+	}
+	if opts.GrowsDown && opts.Mappable != nil {
+		return 0, syserror.EINVAL
+	}
+
+	// Get the new vma.
+	mm.mappingMu.Lock()
+	vseg, ar, err := mm.createVMALocked(ctx, opts)
+	if err != nil {
+		mm.mappingMu.Unlock()
+		return 0, err
+	}
+
+	switch {
+	case opts.Precommit:
+		// Get pmas and map with precommit as requested.
+		mm.populateAndUnlock(ctx, vseg, ar, true)
+
+	case opts.Mappable == nil && length <= privateAllocUnit:
+		// NOTE: Get pmas and map eagerly in the hope
+		// that doing so will save on future page faults. We only do this for
+		// anonymous mappings, since otherwise the cost of
+		// memmap.Mappable.Translate is unknown; and only for small mappings,
+		// to avoid needing to allocate large amounts of memory that we may
+		// subsequently need to checkpoint.
+		mm.populateAndUnlock(ctx, vseg, ar, false)
+
+	default:
+		mm.mappingMu.Unlock()
+	}
+
+	return ar.Start, nil
+}
+
+// Preconditions: mm.mappingMu must be locked for writing.
+//
+// Postconditions: mm.mappingMu will be unlocked.
+func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
+	if !vseg.ValuePtr().effectivePerms.Any() {
+		// Linux doesn't populate inaccessible pages. See
+		// mm/gup.c:populate_vma_page_range.
+		mm.mappingMu.Unlock()
+		return
+	}
+
+	mm.activeMu.Lock()
+
+	// Even if we get a new pma, we can't actually map it if we don't have an
+	// AddressSpace.
+	if mm.as == nil {
+		mm.activeMu.Unlock()
+		mm.mappingMu.Unlock()
+		return
+	}
+
+	// Ensure that we have usable pmas.
+	mm.mappingMu.DowngradeLock()
+	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{})
+	mm.mappingMu.RUnlock()
+	if err != nil {
+		// mm/util.c:vm_mmap_pgoff() ignores the error, if any, from
+		// mm/gup.c:mm_populate(). If it matters, we'll get it again when
+		// userspace actually tries to use the failing page.
+		mm.activeMu.Unlock()
+		return
+	}
+
+	// Downgrade to a read-lock on activeMu since we don't need to mutate pmas
+	// anymore.
+	mm.activeMu.DowngradeLock()
+
+	// As above, errors are silently ignored.
+	mm.mapASLocked(pseg, ar, precommit)
+	mm.activeMu.RUnlock()
+}
+
+// MapStack allocates the initial process stack.
+func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) {
+	// maxStackSize is the maximum supported process stack size in bytes.
+	//
+	// This limit exists because stack growing isn't implemented, so the entire
+	// process stack must be mapped up-front.
+	const maxStackSize = 128 << 20
+
+	stackSize := limits.FromContext(ctx).Get(limits.Stack)
+	r, ok := usermem.Addr(stackSize.Cur).RoundUp()
+	sz := uint64(r)
+	if !ok {
+		// RLIM_INFINITY rounds up to 0.
+		sz = linux.DefaultStackSoftLimit
+	} else if sz > maxStackSize {
+		ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize)
+		sz = maxStackSize
+	} else if sz == 0 {
+		return usermem.AddrRange{}, syserror.ENOMEM
+	}
+	szaddr := usermem.Addr(sz)
+	ctx.Debugf("Allocating stack with size of %v bytes", sz)
+
+	// Determine the stack's desired location. Unlike Linux, address
+	// randomization can't be disabled.
+	stackEnd := mm.layout.MaxAddr - usermem.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown()
+	if stackEnd < szaddr {
+		return usermem.AddrRange{}, syserror.ENOMEM
+	}
+	stackStart := stackEnd - szaddr
+	mm.mappingMu.Lock()
+	defer mm.mappingMu.Unlock()
+	_, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+		Length:    sz,
+		Addr:      stackStart,
+		Perms:     usermem.ReadWrite,
+		MaxPerms:  usermem.AnyAccess,
+		Private:   true,
+		GrowsDown: true,
+		Hint:      "[stack]",
+	})
+	return ar, err
+}
+
+// MUnmap implements the semantics of Linux's munmap(2).
+func (mm *MemoryManager) MUnmap(ctx context.Context, addr usermem.Addr, length uint64) error {
+	if addr != addr.RoundDown() {
+		return syserror.EINVAL
+	}
+	if length == 0 {
+		return syserror.EINVAL
+	}
+	la, ok := usermem.Addr(length).RoundUp()
+	if !ok {
+		return syserror.EINVAL
+	}
+	ar, ok := addr.ToRange(uint64(la))
+	if !ok {
+		return syserror.EINVAL
+	}
+
+	mm.mappingMu.Lock()
+	defer mm.mappingMu.Unlock()
+	mm.unmapLocked(ctx, ar)
+	return nil
+}
+
+// MRemapOpts specifies options to MRemap.
+type MRemapOpts struct {
+	// Move controls whether MRemap moves the remapped mapping to a new address.
+	Move MRemapMoveMode
+
+	// NewAddr is the new address for the remapping. NewAddr is ignored unless
+	// Move is MMRemapMustMove.
+	NewAddr usermem.Addr
+}
+
+// MRemapMoveMode controls MRemap's moving behavior.
+type MRemapMoveMode int
+
+const (
+	// MRemapNoMove prevents MRemap from moving the remapped mapping.
+	MRemapNoMove MRemapMoveMode = iota
+
+	// MRemapMayMove allows MRemap to move the remapped mapping.
+	MRemapMayMove
+
+	// MRemapMustMove requires MRemap to move the remapped mapping to
+	// MRemapOpts.NewAddr, replacing any existing mappings in the remapped
+	// range.
+	MRemapMustMove
+)
+
+// MRemap implements the semantics of Linux's mremap(2).
+func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (usermem.Addr, error) {
+	// "Note that old_address has to be page aligned." - mremap(2)
+	if oldAddr.RoundDown() != oldAddr {
+		return 0, syserror.EINVAL
+	}
+
+	// Linux treats an old_size that rounds up to 0 as 0, which is otherwise a
+	// valid size. However, new_size can't be 0 after rounding.
+	oldSizeAddr, _ := usermem.Addr(oldSize).RoundUp()
+	oldSize = uint64(oldSizeAddr)
+	newSizeAddr, ok := usermem.Addr(newSize).RoundUp()
+	if !ok || newSizeAddr == 0 {
+		return 0, syserror.EINVAL
+	}
+	newSize = uint64(newSizeAddr)
+
+	oldEnd, ok := oldAddr.AddLength(oldSize)
+	if !ok {
+		return 0, syserror.EINVAL
+	}
+
+	mm.mappingMu.Lock()
+	defer mm.mappingMu.Unlock()
+
+	// All cases require that a vma exists at oldAddr.
+	vseg := mm.vmas.FindSegment(oldAddr)
+	if !vseg.Ok() {
+		return 0, syserror.EFAULT
+	}
+
+	if opts.Move != MRemapMustMove {
+		// Handle noops and in-place shrinking. These cases don't care if
+		// [oldAddr, oldEnd) maps to a single vma, or is even mapped at all
+		// (aside from oldAddr).
+		if newSize <= oldSize {
+			if newSize < oldSize {
+				// If oldAddr+oldSize didn't overflow, oldAddr+newSize can't
+				// either.
+				newEnd := oldAddr + usermem.Addr(newSize)
+				mm.unmapLocked(ctx, usermem.AddrRange{newEnd, oldEnd})
+			}
+			return oldAddr, nil
+		}
+
+		// Handle in-place growing.
+
+		// Check that oldEnd maps to the same vma as oldAddr.
+		if vseg.End() < oldEnd {
+			return 0, syserror.EFAULT
+		}
+		// "Grow" the existing vma by creating a new mergeable one.
+		vma := vseg.ValuePtr()
+		var newOffset uint64
+		if vma.mappable != nil {
+			newOffset = vseg.mappableRange().End
+		}
+		_, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+			Length:          newSize - oldSize,
+			MappingIdentity: vma.id,
+			Mappable:        vma.mappable,
+			Offset:          newOffset,
+			Addr:            oldEnd,
+			Fixed:           true,
+			Perms:           vma.realPerms,
+			MaxPerms:        vma.maxPerms,
+			Private:         vma.private,
+			GrowsDown:       vma.growsDown,
+			Hint:            vma.hint,
+		})
+		if err == nil {
+			return oldAddr, nil
+		}
+		// In-place growth failed. In the MRemapMayMove case, fall through to
+		// moving below.
+		if opts.Move == MRemapNoMove {
+			return 0, err
+		}
+	}
+
+	// Handle moving, which is the only remaining case.
+
+	// Find a destination for the move.
+	var newAR usermem.AddrRange
+	switch opts.Move {
+	case MRemapMayMove:
+		newAddr, err := mm.findAvailableLocked(newSize, findAvailableOpts{})
+		if err != nil {
+			return 0, err
+		}
+		newAR, _ = newAddr.ToRange(newSize)
+
+	case MRemapMustMove:
+		newAddr := opts.NewAddr
+		if newAddr.RoundDown() != newAddr {
+			return 0, syserror.EINVAL
+		}
+		var ok bool
+		newAR, ok = newAddr.ToRange(newSize)
+		if !ok {
+			return 0, syserror.EINVAL
+		}
+		if (usermem.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) {
+			return 0, syserror.EINVAL
+		}
+
+		// Unmap any mappings at the destination.
+		mm.unmapLocked(ctx, newAR)
+
+		// If the sizes specify shrinking, unmap everything between the new and
+		// old sizes at the source.
+		if newSize < oldSize {
+			oldNewEnd := oldAddr + usermem.Addr(newSize)
+			mm.unmapLocked(ctx, usermem.AddrRange{oldNewEnd, oldEnd})
+			oldEnd = oldNewEnd
+		}
+
+		// unmapLocked may have invalidated vseg; look it up again.
+		vseg = mm.vmas.FindSegment(oldAddr)
+	}
+
+	oldAR := usermem.AddrRange{oldAddr, oldEnd}
+
+	// In the MRemapMustMove case, these checks happen after unmapping:
+	// mm/mremap.c:mremap_to() => do_munmap(), vma_to_resize().
+
+	// Check that oldEnd maps to the same vma as oldAddr.
+	if vseg.End() < oldEnd {
+		return 0, syserror.EFAULT
+	}
+
+	// Check against RLIMIT_AS.
+	newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
+	if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS {
+		return 0, syserror.ENOMEM
+	}
+
+	if vma := vseg.ValuePtr(); vma.mappable != nil {
+		// Check that offset+length does not overflow.
+		if vma.off+uint64(newAR.Length()) < vma.off {
+			return 0, syserror.EINVAL
+		}
+		// Inform the Mappable, if any, of the copied mapping.
+		if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start)); err != nil {
+			return 0, err
+		}
+	}
+
+	// Remove the existing vma before inserting the new one to minimize
+	// iterator invalidation. We do this directly (instead of calling
+	// removeVMAsLocked) because:
+	//
+	// 1. We can't drop the reference on vma.id, which will be transferred to
+	// the new vma.
+	//
+	// 2. We can't call vma.mappable.RemoveMapping, because pmas are still at
+	// oldAR, so calling RemoveMapping could cause us to miss an invalidation
+	// overlapping oldAR.
+	//
+	// Call vseg.Value() (rather than vseg.ValuePtr()) first to make a copy of
+	// the vma.
+	vseg = mm.vmas.Isolate(vseg, oldAR)
+	vma := vseg.Value()
+	mm.vmas.Remove(vseg)
+
+	// Insert the new vma, transferring the reference on vma.id.
+	mm.vmas.Add(newAR, vma)
+
+	// Move pmas. This is technically optional for non-private pmas, which
+	// could just go through memmap.Mappable.Translate again, but it's required
+	// for private pmas.
+	mm.activeMu.Lock()
+	mm.movePMAsLocked(oldAR, newAR)
+	mm.activeMu.Unlock()
+
+	// Now that pmas have been moved to newAR, we can notify vma.mappable that
+	// oldAR is no longer mapped.
+	if vma.mappable != nil {
+		vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off)
+	}
+
+	return newAR.Start, nil
+}
+
+// MProtect implements the semantics of Linux's mprotect(2).
+func (mm *MemoryManager) MProtect(addr usermem.Addr, length uint64, realPerms usermem.AccessType, growsDown bool) error {
+	if addr.RoundDown() != addr {
+		return syserror.EINVAL
+	}
+	if length == 0 {
+		return nil
+	}
+	rlength, ok := usermem.Addr(length).RoundUp()
+	if !ok {
+		return syserror.ENOMEM
+	}
+	ar, ok := addr.ToRange(uint64(rlength))
+	if !ok {
+		return syserror.ENOMEM
+	}
+	effectivePerms := realPerms.Effective()
+
+	mm.mappingMu.Lock()
+	defer mm.mappingMu.Unlock()
+	// Non-growsDown mprotect requires that all of ar is mapped, and stops at
+	// the first non-empty gap. growsDown mprotect requires that the first vma
+	// be growsDown, but does not require it to extend all the way to ar.Start;
+	// vmas after the first must be contiguous but need not be growsDown, like
+	// the non-growsDown case.
+	vseg := mm.vmas.LowerBoundSegment(ar.Start)
+	if !vseg.Ok() {
+		return syserror.ENOMEM
+	}
+	if growsDown {
+		if !vseg.ValuePtr().growsDown {
+			return syserror.EINVAL
+		}
+		if ar.End <= vseg.Start() {
+			return syserror.ENOMEM
+		}
+		ar.Start = vseg.Start()
+	} else {
+		if ar.Start < vseg.Start() {
+			return syserror.ENOMEM
+		}
+	}
+
+	mm.activeMu.Lock()
+	defer mm.activeMu.Unlock()
+	defer func() {
+		mm.vmas.MergeRange(ar)
+		mm.vmas.MergeAdjacent(ar)
+		mm.pmas.MergeRange(ar)
+		mm.pmas.MergeAdjacent(ar)
+	}()
+	pseg := mm.pmas.LowerBoundSegment(ar.Start)
+	var didUnmapAS bool
+	for {
+		// Check for permission validity before splitting vmas, for consistency
+		// with Linux.
+		if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) {
+			return syserror.EACCES
+		}
+		vseg = mm.vmas.Isolate(vseg, ar)
+
+		// Update vma permissions.
+		vma := vseg.ValuePtr()
+		vma.realPerms = realPerms
+		vma.effectivePerms = effectivePerms
+
+		// Propagate vma permission changes to pmas.
+		for pseg.Ok() && pseg.Start() < vseg.End() {
+			if pseg.Range().Overlaps(vseg.Range()) {
+				pseg = mm.pmas.Isolate(pseg, vseg.Range())
+				if !effectivePerms.SupersetOf(pseg.ValuePtr().vmaEffectivePerms) && !didUnmapAS {
+					// Unmap all of ar, not just vseg.Range(), to minimize host
+					// syscalls.
+					mm.unmapASLocked(ar)
+					didUnmapAS = true
+				}
+				pseg.ValuePtr().vmaEffectivePerms = effectivePerms
+			}
+			pseg = pseg.NextSegment()
+		}
+
+		// Continue to the next vma.
+		if ar.End <= vseg.End() {
+			return nil
+		}
+		vseg, _ = vseg.NextNonEmpty()
+		if !vseg.Ok() {
+			return syserror.ENOMEM
+		}
+	}
+}
+
+// BrkSetup sets mm's brk address to addr and its brk size to 0.
+func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) {
+	mm.mappingMu.Lock()
+	defer mm.mappingMu.Unlock()
+	// Unmap the existing brk.
+	if mm.brk.Length() != 0 {
+		mm.unmapLocked(ctx, mm.brk)
+	}
+	mm.brk = usermem.AddrRange{addr, addr}
+}
+
+// Brk implements the semantics of Linux's brk(2), except that it returns an
+// error on failure.
+func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) {
+	mm.mappingMu.Lock()
+	defer mm.mappingMu.Unlock()
+
+	if addr < mm.brk.Start {
+		return mm.brk.End, syserror.EINVAL
+	}
+
+	// TODO: This enforces RLIMIT_DATA, but is slightly more
+	// permissive than the usual data limit. In particular, this only
+	// limits the size of the heap; a true RLIMIT_DATA limits the size of
+	// heap + data + bss. The segment sizes need to be plumbed from the
+	// loader package to fully enforce RLIMIT_DATA.
+	if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur {
+		return mm.brk.End, syserror.ENOMEM
+	}
+
+	oldbrkpg, _ := mm.brk.End.RoundUp()
+	newbrkpg, ok := addr.RoundUp()
+	if !ok {
+		return mm.brk.End, syserror.EFAULT
+	}
+
+	switch {
+	case newbrkpg < oldbrkpg:
+		mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg})
+
+	case oldbrkpg < newbrkpg:
+		_, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+			Length: uint64(newbrkpg - oldbrkpg),
+			Addr:   oldbrkpg,
+			Fixed:  true,
+			// Compare Linux's
+			// arch/x86/include/asm/page_types.h:VM_DATA_DEFAULT_FLAGS.
+			Perms:    usermem.ReadWrite,
+			MaxPerms: usermem.AnyAccess,
+			Private:  true,
+			Hint:     "[heap]",
+		})
+		if err != nil {
+			return mm.brk.End, err
+		}
+	}
+
+	mm.brk.End = addr
+	return addr, nil
+}
+
+// Decommit implements the semantics of Linux's madvise(MADV_DONTNEED).
+func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
+	ar, ok := addr.ToRange(length)
+	if !ok {
+		return syserror.EINVAL
+	}
+
+	mm.mappingMu.RLock()
+	defer mm.mappingMu.RUnlock()
+	mm.activeMu.Lock()
+	defer mm.activeMu.Unlock()
+
+	// Linux's mm/madvise.c:madvise_dontneed() => mm/memory.c:zap_page_range()
+	// is analogous to our mm.invalidateLocked(ar, true, true). We inline this
+	// here, with the special case that we synchronously decommit
+	// uniquely-owned (non-copy-on-write) pages for private anonymous vma,
+	// which is the common case for MADV_DONTNEED. Invalidating these pmas, and
+	// allowing them to be reallocated when touched again, increases pma
+	// fragmentation, which may significantly reduce performance for
+	// non-vectored I/O implementations. Also, decommitting synchronously
+	// ensures that Decommit immediately reduces host memory usage.
+	var didUnmapAS bool
+	pseg := mm.pmas.LowerBoundSegment(ar.Start)
+	vseg := mm.vmas.LowerBoundSegment(ar.Start)
+	mem := mm.p.Memory()
+	for pseg.Ok() && pseg.Start() < ar.End {
+		pma := pseg.ValuePtr()
+		if pma.private && !mm.isPMACopyOnWriteLocked(pseg) {
+			psegAR := pseg.Range().Intersect(ar)
+			vseg = vseg.seekNextLowerBound(psegAR.Start)
+			if checkInvariants {
+				if !vseg.Ok() {
+					panic(fmt.Sprintf("no vma after %#x", psegAR.Start))
+				}
+				if psegAR.Start < vseg.Start() {
+					panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start()))
+				}
+			}
+			if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil {
+				if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil {
+					pseg = pseg.NextSegment()
+					continue
+				}
+				// If an error occurs, fall through to the general
+				// invalidation case below.
+			}
+		}
+		pseg = mm.pmas.Isolate(pseg, ar)
+		pma = pseg.ValuePtr()
+		if !didUnmapAS {
+			// Unmap all of ar, not just pseg.Range(), to minimize host
+			// syscalls. AddressSpace mappings must be removed before
+			// mm.decPrivateRef().
+			mm.unmapASLocked(ar)
+			didUnmapAS = true
+		}
+		if pma.private {
+			mm.decPrivateRef(pseg.fileRange())
+		}
+		pma.file.DecRef(pseg.fileRange())
+		mm.removeRSSLocked(pseg.Range())
+
+		pseg = mm.pmas.Remove(pseg).NextSegment()
+	}
+
+	// "If there are some parts of the specified address space that are not
+	// mapped, the Linux version of madvise() ignores them and applies the call
+	// to the rest (but returns ENOMEM from the system call, as it should)." -
+	// madvise(2)
+	if mm.vmas.SpanRange(ar) != ar.Length() {
+		return syserror.ENOMEM
+	}
+	return nil
+}
+
+// Sync implements the semantics of Linux's msync(MS_SYNC).
+func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error {
+	ar, ok := addr.ToRange(length)
+	if !ok {
+		return syserror.ENOMEM
+	}
+
+	mm.mappingMu.RLock()
+	// Can't defer mm.mappingMu.RUnlock(); see below.
+	vseg := mm.vmas.LowerBoundSegment(ar.Start)
+	if !vseg.Ok() {
+		mm.mappingMu.RUnlock()
+		return syserror.ENOMEM
+	}
+	var unmapped bool
+	lastEnd := ar.Start
+	for {
+		if !vseg.Ok() {
+			mm.mappingMu.RUnlock()
+			unmapped = true
+			break
+		}
+		if lastEnd < vseg.Start() {
+			unmapped = true
+		}
+		lastEnd = vseg.End()
+		vma := vseg.ValuePtr()
+		// It's only possible to have dirtied the Mappable through a shared
+		// mapping. Don't check if the mapping is writable, because mprotect
+		// may have changed this, and also because Linux doesn't.
+		if id := vma.id; id != nil && vma.mappable != nil && !vma.private {
+			// We can't call memmap.MappingIdentity.Msync while holding
+			// mm.mappingMu since it may take fs locks that precede it in the
+			// lock order.
+			id.IncRef()
+			mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar))
+			mm.mappingMu.RUnlock()
+			err := id.Msync(ctx, mr)
+			id.DecRef()
+			if err != nil {
+				return err
+			}
+			if lastEnd >= ar.End {
+				break
+			}
+			mm.mappingMu.RLock()
+			vseg = mm.vmas.LowerBoundSegment(lastEnd)
+		} else {
+			if lastEnd >= ar.End {
+				mm.mappingMu.RUnlock()
+				break
+			}
+			vseg = vseg.NextSegment()
+		}
+	}
+
+	if unmapped {
+		return syserror.ENOMEM
+	}
+	return nil
+}
+
+// VirtualMemorySize returns the combined length in bytes of all mappings in
+// mm.
+func (mm *MemoryManager) VirtualMemorySize() uint64 {
+	mm.mappingMu.RLock()
+	defer mm.mappingMu.RUnlock()
+	return uint64(mm.usageAS)
+}
+
+// VirtualMemorySizeRange returns the combined length in bytes of all mappings
+// in ar in mm.
+func (mm *MemoryManager) VirtualMemorySizeRange(ar usermem.AddrRange) uint64 {
+	mm.mappingMu.RLock()
+	defer mm.mappingMu.RUnlock()
+	return uint64(mm.vmas.SpanRange(ar))
+}
+
+// ResidentSetSize returns the value advertised as mm's RSS in bytes.
+func (mm *MemoryManager) ResidentSetSize() uint64 {
+	mm.activeMu.RLock()
+	defer mm.activeMu.RUnlock()
+	return uint64(mm.curRSS)
+}
+
+// MaxResidentSetSize returns the value advertised as mm's max RSS in bytes.
+func (mm *MemoryManager) MaxResidentSetSize() uint64 {
+	mm.activeMu.RLock()
+	defer mm.activeMu.RUnlock()
+	return uint64(mm.maxRSS)
+}
author	Googler <noreply@google.com>	2018-04-27 10:37:02 -0700
committer	Adin Scannell <ascannell@google.com>	2018-04-28 01:44:26 -0400
commit	d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree	54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/mm/syscalls.go
parent	f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)