summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/mm/syscalls.go
diff options
context:
space:
mode:
authorGoogler <noreply@google.com>2018-04-27 10:37:02 -0700
committerAdin Scannell <ascannell@google.com>2018-04-28 01:44:26 -0400
commitd02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/mm/syscalls.go
parentf70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
Check in gVisor.
PiperOrigin-RevId: 194583126 Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/sentry/mm/syscalls.go')
-rw-r--r--pkg/sentry/mm/syscalls.go794
1 files changed, 794 insertions, 0 deletions
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
new file mode 100644
index 000000000..0730be65b
--- /dev/null
+++ b/pkg/sentry/mm/syscalls.go
@@ -0,0 +1,794 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+ "fmt"
+ mrand "math/rand"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// HandleUserFault handles an application page fault. sp is the faulting
+// application thread's stack pointer.
+//
+// Preconditions: mm.as != nil.
+func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr usermem.Addr, at usermem.AccessType, sp usermem.Addr) error {
+ ar, ok := addr.RoundDown().ToRange(usermem.PageSize)
+ if !ok {
+ return syserror.EFAULT
+ }
+
+ // Don't bother trying existingPMAsLocked; in most cases, if we did have
+ // existing pmas, we wouldn't have faulted.
+
+ // Ensure that we have a usable vma. Here and below, since we are only
+ // asking for a single page, there is no possibility of partial success,
+ // and any error is immediately fatal.
+ mm.mappingMu.RLock()
+ vseg, _, err := mm.getVMAsLocked(ctx, ar, at, false)
+ if err != nil {
+ mm.mappingMu.RUnlock()
+ return err
+ }
+
+ // Ensure that we have a usable pma.
+ mm.activeMu.Lock()
+ pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{
+ breakCOW: at.Write,
+ })
+ mm.mappingMu.RUnlock()
+ if err != nil {
+ mm.activeMu.Unlock()
+ return err
+ }
+
+ // Downgrade to a read-lock on activeMu since we don't need to mutate pmas
+ // anymore.
+ mm.activeMu.DowngradeLock()
+
+ // Map the faulted page into the active AddressSpace.
+ err = mm.mapASLocked(pseg, ar, false)
+ mm.activeMu.RUnlock()
+ return err
+}
+
+// MMap establishes a memory mapping.
+func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error) {
+ if opts.Length == 0 {
+ return 0, syserror.EINVAL
+ }
+ length, ok := usermem.Addr(opts.Length).RoundUp()
+ if !ok {
+ return 0, syserror.ENOMEM
+ }
+ opts.Length = uint64(length)
+
+ if opts.Mappable != nil {
+ // Offset must be aligned.
+ if usermem.Addr(opts.Offset).RoundDown() != usermem.Addr(opts.Offset) {
+ return 0, syserror.EINVAL
+ }
+ // Offset + length must not overflow.
+ if end := opts.Offset + opts.Length; end < opts.Offset {
+ return 0, syserror.ENOMEM
+ }
+ } else {
+ opts.Offset = 0
+ if !opts.Private {
+ if opts.MappingIdentity != nil {
+ return 0, syserror.EINVAL
+ }
+ m, err := NewSharedAnonMappable(opts.Length, platform.FromContext(ctx))
+ if err != nil {
+ return 0, err
+ }
+ opts.MappingIdentity = m
+ opts.Mappable = m
+ }
+ }
+
+ if opts.Addr.RoundDown() != opts.Addr {
+ // MAP_FIXED requires addr to be page-aligned; non-fixed mappings
+ // don't.
+ if opts.Fixed {
+ return 0, syserror.EINVAL
+ }
+ opts.Addr = opts.Addr.RoundDown()
+ }
+
+ if !opts.MaxPerms.SupersetOf(opts.Perms) {
+ return 0, syserror.EACCES
+ }
+ if opts.Unmap && !opts.Fixed {
+ return 0, syserror.EINVAL
+ }
+ if opts.GrowsDown && opts.Mappable != nil {
+ return 0, syserror.EINVAL
+ }
+
+ // Get the new vma.
+ mm.mappingMu.Lock()
+ vseg, ar, err := mm.createVMALocked(ctx, opts)
+ if err != nil {
+ mm.mappingMu.Unlock()
+ return 0, err
+ }
+
+ switch {
+ case opts.Precommit:
+ // Get pmas and map with precommit as requested.
+ mm.populateAndUnlock(ctx, vseg, ar, true)
+
+ case opts.Mappable == nil && length <= privateAllocUnit:
+ // NOTE: Get pmas and map eagerly in the hope
+ // that doing so will save on future page faults. We only do this for
+ // anonymous mappings, since otherwise the cost of
+ // memmap.Mappable.Translate is unknown; and only for small mappings,
+ // to avoid needing to allocate large amounts of memory that we may
+ // subsequently need to checkpoint.
+ mm.populateAndUnlock(ctx, vseg, ar, false)
+
+ default:
+ mm.mappingMu.Unlock()
+ }
+
+ return ar.Start, nil
+}
+
+// Preconditions: mm.mappingMu must be locked for writing.
+//
+// Postconditions: mm.mappingMu will be unlocked.
+func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
+ if !vseg.ValuePtr().effectivePerms.Any() {
+ // Linux doesn't populate inaccessible pages. See
+ // mm/gup.c:populate_vma_page_range.
+ mm.mappingMu.Unlock()
+ return
+ }
+
+ mm.activeMu.Lock()
+
+ // Even if we get a new pma, we can't actually map it if we don't have an
+ // AddressSpace.
+ if mm.as == nil {
+ mm.activeMu.Unlock()
+ mm.mappingMu.Unlock()
+ return
+ }
+
+ // Ensure that we have usable pmas.
+ mm.mappingMu.DowngradeLock()
+ pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{})
+ mm.mappingMu.RUnlock()
+ if err != nil {
+ // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from
+ // mm/gup.c:mm_populate(). If it matters, we'll get it again when
+ // userspace actually tries to use the failing page.
+ mm.activeMu.Unlock()
+ return
+ }
+
+ // Downgrade to a read-lock on activeMu since we don't need to mutate pmas
+ // anymore.
+ mm.activeMu.DowngradeLock()
+
+ // As above, errors are silently ignored.
+ mm.mapASLocked(pseg, ar, precommit)
+ mm.activeMu.RUnlock()
+}
+
+// MapStack allocates the initial process stack.
+func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) {
+ // maxStackSize is the maximum supported process stack size in bytes.
+ //
+ // This limit exists because stack growing isn't implemented, so the entire
+ // process stack must be mapped up-front.
+ const maxStackSize = 128 << 20
+
+ stackSize := limits.FromContext(ctx).Get(limits.Stack)
+ r, ok := usermem.Addr(stackSize.Cur).RoundUp()
+ sz := uint64(r)
+ if !ok {
+ // RLIM_INFINITY rounds up to 0.
+ sz = linux.DefaultStackSoftLimit
+ } else if sz > maxStackSize {
+ ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize)
+ sz = maxStackSize
+ } else if sz == 0 {
+ return usermem.AddrRange{}, syserror.ENOMEM
+ }
+ szaddr := usermem.Addr(sz)
+ ctx.Debugf("Allocating stack with size of %v bytes", sz)
+
+ // Determine the stack's desired location. Unlike Linux, address
+ // randomization can't be disabled.
+ stackEnd := mm.layout.MaxAddr - usermem.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown()
+ if stackEnd < szaddr {
+ return usermem.AddrRange{}, syserror.ENOMEM
+ }
+ stackStart := stackEnd - szaddr
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ _, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+ Length: sz,
+ Addr: stackStart,
+ Perms: usermem.ReadWrite,
+ MaxPerms: usermem.AnyAccess,
+ Private: true,
+ GrowsDown: true,
+ Hint: "[stack]",
+ })
+ return ar, err
+}
+
+// MUnmap implements the semantics of Linux's munmap(2).
+func (mm *MemoryManager) MUnmap(ctx context.Context, addr usermem.Addr, length uint64) error {
+ if addr != addr.RoundDown() {
+ return syserror.EINVAL
+ }
+ if length == 0 {
+ return syserror.EINVAL
+ }
+ la, ok := usermem.Addr(length).RoundUp()
+ if !ok {
+ return syserror.EINVAL
+ }
+ ar, ok := addr.ToRange(uint64(la))
+ if !ok {
+ return syserror.EINVAL
+ }
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ mm.unmapLocked(ctx, ar)
+ return nil
+}
+
+// MRemapOpts specifies options to MRemap.
+type MRemapOpts struct {
+ // Move controls whether MRemap moves the remapped mapping to a new address.
+ Move MRemapMoveMode
+
+ // NewAddr is the new address for the remapping. NewAddr is ignored unless
+ // Move is MMRemapMustMove.
+ NewAddr usermem.Addr
+}
+
+// MRemapMoveMode controls MRemap's moving behavior.
+type MRemapMoveMode int
+
+const (
+ // MRemapNoMove prevents MRemap from moving the remapped mapping.
+ MRemapNoMove MRemapMoveMode = iota
+
+ // MRemapMayMove allows MRemap to move the remapped mapping.
+ MRemapMayMove
+
+ // MRemapMustMove requires MRemap to move the remapped mapping to
+ // MRemapOpts.NewAddr, replacing any existing mappings in the remapped
+ // range.
+ MRemapMustMove
+)
+
+// MRemap implements the semantics of Linux's mremap(2).
+func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (usermem.Addr, error) {
+ // "Note that old_address has to be page aligned." - mremap(2)
+ if oldAddr.RoundDown() != oldAddr {
+ return 0, syserror.EINVAL
+ }
+
+ // Linux treats an old_size that rounds up to 0 as 0, which is otherwise a
+ // valid size. However, new_size can't be 0 after rounding.
+ oldSizeAddr, _ := usermem.Addr(oldSize).RoundUp()
+ oldSize = uint64(oldSizeAddr)
+ newSizeAddr, ok := usermem.Addr(newSize).RoundUp()
+ if !ok || newSizeAddr == 0 {
+ return 0, syserror.EINVAL
+ }
+ newSize = uint64(newSizeAddr)
+
+ oldEnd, ok := oldAddr.AddLength(oldSize)
+ if !ok {
+ return 0, syserror.EINVAL
+ }
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+
+ // All cases require that a vma exists at oldAddr.
+ vseg := mm.vmas.FindSegment(oldAddr)
+ if !vseg.Ok() {
+ return 0, syserror.EFAULT
+ }
+
+ if opts.Move != MRemapMustMove {
+ // Handle noops and in-place shrinking. These cases don't care if
+ // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all
+ // (aside from oldAddr).
+ if newSize <= oldSize {
+ if newSize < oldSize {
+ // If oldAddr+oldSize didn't overflow, oldAddr+newSize can't
+ // either.
+ newEnd := oldAddr + usermem.Addr(newSize)
+ mm.unmapLocked(ctx, usermem.AddrRange{newEnd, oldEnd})
+ }
+ return oldAddr, nil
+ }
+
+ // Handle in-place growing.
+
+ // Check that oldEnd maps to the same vma as oldAddr.
+ if vseg.End() < oldEnd {
+ return 0, syserror.EFAULT
+ }
+ // "Grow" the existing vma by creating a new mergeable one.
+ vma := vseg.ValuePtr()
+ var newOffset uint64
+ if vma.mappable != nil {
+ newOffset = vseg.mappableRange().End
+ }
+ _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+ Length: newSize - oldSize,
+ MappingIdentity: vma.id,
+ Mappable: vma.mappable,
+ Offset: newOffset,
+ Addr: oldEnd,
+ Fixed: true,
+ Perms: vma.realPerms,
+ MaxPerms: vma.maxPerms,
+ Private: vma.private,
+ GrowsDown: vma.growsDown,
+ Hint: vma.hint,
+ })
+ if err == nil {
+ return oldAddr, nil
+ }
+ // In-place growth failed. In the MRemapMayMove case, fall through to
+ // moving below.
+ if opts.Move == MRemapNoMove {
+ return 0, err
+ }
+ }
+
+ // Handle moving, which is the only remaining case.
+
+ // Find a destination for the move.
+ var newAR usermem.AddrRange
+ switch opts.Move {
+ case MRemapMayMove:
+ newAddr, err := mm.findAvailableLocked(newSize, findAvailableOpts{})
+ if err != nil {
+ return 0, err
+ }
+ newAR, _ = newAddr.ToRange(newSize)
+
+ case MRemapMustMove:
+ newAddr := opts.NewAddr
+ if newAddr.RoundDown() != newAddr {
+ return 0, syserror.EINVAL
+ }
+ var ok bool
+ newAR, ok = newAddr.ToRange(newSize)
+ if !ok {
+ return 0, syserror.EINVAL
+ }
+ if (usermem.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) {
+ return 0, syserror.EINVAL
+ }
+
+ // Unmap any mappings at the destination.
+ mm.unmapLocked(ctx, newAR)
+
+ // If the sizes specify shrinking, unmap everything between the new and
+ // old sizes at the source.
+ if newSize < oldSize {
+ oldNewEnd := oldAddr + usermem.Addr(newSize)
+ mm.unmapLocked(ctx, usermem.AddrRange{oldNewEnd, oldEnd})
+ oldEnd = oldNewEnd
+ }
+
+ // unmapLocked may have invalidated vseg; look it up again.
+ vseg = mm.vmas.FindSegment(oldAddr)
+ }
+
+ oldAR := usermem.AddrRange{oldAddr, oldEnd}
+
+ // In the MRemapMustMove case, these checks happen after unmapping:
+ // mm/mremap.c:mremap_to() => do_munmap(), vma_to_resize().
+
+ // Check that oldEnd maps to the same vma as oldAddr.
+ if vseg.End() < oldEnd {
+ return 0, syserror.EFAULT
+ }
+
+ // Check against RLIMIT_AS.
+ newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
+ if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS {
+ return 0, syserror.ENOMEM
+ }
+
+ if vma := vseg.ValuePtr(); vma.mappable != nil {
+ // Check that offset+length does not overflow.
+ if vma.off+uint64(newAR.Length()) < vma.off {
+ return 0, syserror.EINVAL
+ }
+ // Inform the Mappable, if any, of the copied mapping.
+ if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start)); err != nil {
+ return 0, err
+ }
+ }
+
+ // Remove the existing vma before inserting the new one to minimize
+ // iterator invalidation. We do this directly (instead of calling
+ // removeVMAsLocked) because:
+ //
+ // 1. We can't drop the reference on vma.id, which will be transferred to
+ // the new vma.
+ //
+ // 2. We can't call vma.mappable.RemoveMapping, because pmas are still at
+ // oldAR, so calling RemoveMapping could cause us to miss an invalidation
+ // overlapping oldAR.
+ //
+ // Call vseg.Value() (rather than vseg.ValuePtr()) first to make a copy of
+ // the vma.
+ vseg = mm.vmas.Isolate(vseg, oldAR)
+ vma := vseg.Value()
+ mm.vmas.Remove(vseg)
+
+ // Insert the new vma, transferring the reference on vma.id.
+ mm.vmas.Add(newAR, vma)
+
+ // Move pmas. This is technically optional for non-private pmas, which
+ // could just go through memmap.Mappable.Translate again, but it's required
+ // for private pmas.
+ mm.activeMu.Lock()
+ mm.movePMAsLocked(oldAR, newAR)
+ mm.activeMu.Unlock()
+
+ // Now that pmas have been moved to newAR, we can notify vma.mappable that
+ // oldAR is no longer mapped.
+ if vma.mappable != nil {
+ vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off)
+ }
+
+ return newAR.Start, nil
+}
+
+// MProtect implements the semantics of Linux's mprotect(2).
+func (mm *MemoryManager) MProtect(addr usermem.Addr, length uint64, realPerms usermem.AccessType, growsDown bool) error {
+ if addr.RoundDown() != addr {
+ return syserror.EINVAL
+ }
+ if length == 0 {
+ return nil
+ }
+ rlength, ok := usermem.Addr(length).RoundUp()
+ if !ok {
+ return syserror.ENOMEM
+ }
+ ar, ok := addr.ToRange(uint64(rlength))
+ if !ok {
+ return syserror.ENOMEM
+ }
+ effectivePerms := realPerms.Effective()
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ // Non-growsDown mprotect requires that all of ar is mapped, and stops at
+ // the first non-empty gap. growsDown mprotect requires that the first vma
+ // be growsDown, but does not require it to extend all the way to ar.Start;
+ // vmas after the first must be contiguous but need not be growsDown, like
+ // the non-growsDown case.
+ vseg := mm.vmas.LowerBoundSegment(ar.Start)
+ if !vseg.Ok() {
+ return syserror.ENOMEM
+ }
+ if growsDown {
+ if !vseg.ValuePtr().growsDown {
+ return syserror.EINVAL
+ }
+ if ar.End <= vseg.Start() {
+ return syserror.ENOMEM
+ }
+ ar.Start = vseg.Start()
+ } else {
+ if ar.Start < vseg.Start() {
+ return syserror.ENOMEM
+ }
+ }
+
+ mm.activeMu.Lock()
+ defer mm.activeMu.Unlock()
+ defer func() {
+ mm.vmas.MergeRange(ar)
+ mm.vmas.MergeAdjacent(ar)
+ mm.pmas.MergeRange(ar)
+ mm.pmas.MergeAdjacent(ar)
+ }()
+ pseg := mm.pmas.LowerBoundSegment(ar.Start)
+ var didUnmapAS bool
+ for {
+ // Check for permission validity before splitting vmas, for consistency
+ // with Linux.
+ if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) {
+ return syserror.EACCES
+ }
+ vseg = mm.vmas.Isolate(vseg, ar)
+
+ // Update vma permissions.
+ vma := vseg.ValuePtr()
+ vma.realPerms = realPerms
+ vma.effectivePerms = effectivePerms
+
+ // Propagate vma permission changes to pmas.
+ for pseg.Ok() && pseg.Start() < vseg.End() {
+ if pseg.Range().Overlaps(vseg.Range()) {
+ pseg = mm.pmas.Isolate(pseg, vseg.Range())
+ if !effectivePerms.SupersetOf(pseg.ValuePtr().vmaEffectivePerms) && !didUnmapAS {
+ // Unmap all of ar, not just vseg.Range(), to minimize host
+ // syscalls.
+ mm.unmapASLocked(ar)
+ didUnmapAS = true
+ }
+ pseg.ValuePtr().vmaEffectivePerms = effectivePerms
+ }
+ pseg = pseg.NextSegment()
+ }
+
+ // Continue to the next vma.
+ if ar.End <= vseg.End() {
+ return nil
+ }
+ vseg, _ = vseg.NextNonEmpty()
+ if !vseg.Ok() {
+ return syserror.ENOMEM
+ }
+ }
+}
+
+// BrkSetup sets mm's brk address to addr and its brk size to 0.
+func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) {
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ // Unmap the existing brk.
+ if mm.brk.Length() != 0 {
+ mm.unmapLocked(ctx, mm.brk)
+ }
+ mm.brk = usermem.AddrRange{addr, addr}
+}
+
+// Brk implements the semantics of Linux's brk(2), except that it returns an
+// error on failure.
+func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) {
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+
+ if addr < mm.brk.Start {
+ return mm.brk.End, syserror.EINVAL
+ }
+
+ // TODO: This enforces RLIMIT_DATA, but is slightly more
+ // permissive than the usual data limit. In particular, this only
+ // limits the size of the heap; a true RLIMIT_DATA limits the size of
+ // heap + data + bss. The segment sizes need to be plumbed from the
+ // loader package to fully enforce RLIMIT_DATA.
+ if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur {
+ return mm.brk.End, syserror.ENOMEM
+ }
+
+ oldbrkpg, _ := mm.brk.End.RoundUp()
+ newbrkpg, ok := addr.RoundUp()
+ if !ok {
+ return mm.brk.End, syserror.EFAULT
+ }
+
+ switch {
+ case newbrkpg < oldbrkpg:
+ mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg})
+
+ case oldbrkpg < newbrkpg:
+ _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+ Length: uint64(newbrkpg - oldbrkpg),
+ Addr: oldbrkpg,
+ Fixed: true,
+ // Compare Linux's
+ // arch/x86/include/asm/page_types.h:VM_DATA_DEFAULT_FLAGS.
+ Perms: usermem.ReadWrite,
+ MaxPerms: usermem.AnyAccess,
+ Private: true,
+ Hint: "[heap]",
+ })
+ if err != nil {
+ return mm.brk.End, err
+ }
+ }
+
+ mm.brk.End = addr
+ return addr, nil
+}
+
+// Decommit implements the semantics of Linux's madvise(MADV_DONTNEED).
+func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
+ ar, ok := addr.ToRange(length)
+ if !ok {
+ return syserror.EINVAL
+ }
+
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ mm.activeMu.Lock()
+ defer mm.activeMu.Unlock()
+
+ // Linux's mm/madvise.c:madvise_dontneed() => mm/memory.c:zap_page_range()
+ // is analogous to our mm.invalidateLocked(ar, true, true). We inline this
+ // here, with the special case that we synchronously decommit
+ // uniquely-owned (non-copy-on-write) pages for private anonymous vma,
+ // which is the common case for MADV_DONTNEED. Invalidating these pmas, and
+ // allowing them to be reallocated when touched again, increases pma
+ // fragmentation, which may significantly reduce performance for
+ // non-vectored I/O implementations. Also, decommitting synchronously
+ // ensures that Decommit immediately reduces host memory usage.
+ var didUnmapAS bool
+ pseg := mm.pmas.LowerBoundSegment(ar.Start)
+ vseg := mm.vmas.LowerBoundSegment(ar.Start)
+ mem := mm.p.Memory()
+ for pseg.Ok() && pseg.Start() < ar.End {
+ pma := pseg.ValuePtr()
+ if pma.private && !mm.isPMACopyOnWriteLocked(pseg) {
+ psegAR := pseg.Range().Intersect(ar)
+ vseg = vseg.seekNextLowerBound(psegAR.Start)
+ if checkInvariants {
+ if !vseg.Ok() {
+ panic(fmt.Sprintf("no vma after %#x", psegAR.Start))
+ }
+ if psegAR.Start < vseg.Start() {
+ panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start()))
+ }
+ }
+ if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil {
+ if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil {
+ pseg = pseg.NextSegment()
+ continue
+ }
+ // If an error occurs, fall through to the general
+ // invalidation case below.
+ }
+ }
+ pseg = mm.pmas.Isolate(pseg, ar)
+ pma = pseg.ValuePtr()
+ if !didUnmapAS {
+ // Unmap all of ar, not just pseg.Range(), to minimize host
+ // syscalls. AddressSpace mappings must be removed before
+ // mm.decPrivateRef().
+ mm.unmapASLocked(ar)
+ didUnmapAS = true
+ }
+ if pma.private {
+ mm.decPrivateRef(pseg.fileRange())
+ }
+ pma.file.DecRef(pseg.fileRange())
+ mm.removeRSSLocked(pseg.Range())
+
+ pseg = mm.pmas.Remove(pseg).NextSegment()
+ }
+
+ // "If there are some parts of the specified address space that are not
+ // mapped, the Linux version of madvise() ignores them and applies the call
+ // to the rest (but returns ENOMEM from the system call, as it should)." -
+ // madvise(2)
+ if mm.vmas.SpanRange(ar) != ar.Length() {
+ return syserror.ENOMEM
+ }
+ return nil
+}
+
+// Sync implements the semantics of Linux's msync(MS_SYNC).
+func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error {
+ ar, ok := addr.ToRange(length)
+ if !ok {
+ return syserror.ENOMEM
+ }
+
+ mm.mappingMu.RLock()
+ // Can't defer mm.mappingMu.RUnlock(); see below.
+ vseg := mm.vmas.LowerBoundSegment(ar.Start)
+ if !vseg.Ok() {
+ mm.mappingMu.RUnlock()
+ return syserror.ENOMEM
+ }
+ var unmapped bool
+ lastEnd := ar.Start
+ for {
+ if !vseg.Ok() {
+ mm.mappingMu.RUnlock()
+ unmapped = true
+ break
+ }
+ if lastEnd < vseg.Start() {
+ unmapped = true
+ }
+ lastEnd = vseg.End()
+ vma := vseg.ValuePtr()
+ // It's only possible to have dirtied the Mappable through a shared
+ // mapping. Don't check if the mapping is writable, because mprotect
+ // may have changed this, and also because Linux doesn't.
+ if id := vma.id; id != nil && vma.mappable != nil && !vma.private {
+ // We can't call memmap.MappingIdentity.Msync while holding
+ // mm.mappingMu since it may take fs locks that precede it in the
+ // lock order.
+ id.IncRef()
+ mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar))
+ mm.mappingMu.RUnlock()
+ err := id.Msync(ctx, mr)
+ id.DecRef()
+ if err != nil {
+ return err
+ }
+ if lastEnd >= ar.End {
+ break
+ }
+ mm.mappingMu.RLock()
+ vseg = mm.vmas.LowerBoundSegment(lastEnd)
+ } else {
+ if lastEnd >= ar.End {
+ mm.mappingMu.RUnlock()
+ break
+ }
+ vseg = vseg.NextSegment()
+ }
+ }
+
+ if unmapped {
+ return syserror.ENOMEM
+ }
+ return nil
+}
+
+// VirtualMemorySize returns the combined length in bytes of all mappings in
+// mm.
+func (mm *MemoryManager) VirtualMemorySize() uint64 {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ return uint64(mm.usageAS)
+}
+
+// VirtualMemorySizeRange returns the combined length in bytes of all mappings
+// in ar in mm.
+func (mm *MemoryManager) VirtualMemorySizeRange(ar usermem.AddrRange) uint64 {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ return uint64(mm.vmas.SpanRange(ar))
+}
+
+// ResidentSetSize returns the value advertised as mm's RSS in bytes.
+func (mm *MemoryManager) ResidentSetSize() uint64 {
+ mm.activeMu.RLock()
+ defer mm.activeMu.RUnlock()
+ return uint64(mm.curRSS)
+}
+
+// MaxResidentSetSize returns the value advertised as mm's max RSS in bytes.
+func (mm *MemoryManager) MaxResidentSetSize() uint64 {
+ mm.activeMu.RLock()
+ defer mm.activeMu.RUnlock()
+ return uint64(mm.maxRSS)
+}