Implement mlock(), kind of.

Currently mlock() and friends do nothing whatsoever. However, mlocking is directly application-visible in a number of ways; for example, madvise(MADV_DONTNEED) and msync(MS_INVALIDATE) both fail on mlocked regions. We handle this inconsistently: MADV_DONTNEED is too important to not work, but MS_INVALIDATE is rejected.

Change MM to track mlocked regions in a manner consistent with Linux. It still will not actually pin pages into host physical memory, but:

- mlock() will now cause sentry memory management to precommit mlocked pages.

- MADV_DONTNEED and MS_INVALIDATE will interact with mlocked pages as described above.

PiperOrigin-RevId: 225861605
Change-Id: Iee187204979ac9a4d15d0e037c152c0902c8d0ee
author: Jamie Liu <jamieliu@google.com> 2018-12-17 11:37:38 -0800
committer: Shentubot <shentubot@google.com> 2018-12-17 11:38:59 -0800
commit: 2421006426445a1827422c2dbdd6fc6a47087147 (patch)
tree: 49aa2bc113c208fc117aff8a036866a7260090e5 /pkg/sentry
parent: 54694086dfb02a6f8453f043a44ffd10bb5a7070 (diff)
12 files changed, 559 insertions, 126 deletions
diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go
index ba0b7d4fd..eeca01876 100644
--- a/pkg/sentry/limits/limits.go
+++ b/pkg/sentry/limits/limits.go
@@ -33,7 +33,7 @@ const (
 	Rss
 	ProcessCount
 	NumberOfFiles
-	MemoryPagesLocked
+	MemoryLocked
 	AS
 	Locks
 	SignalsPending
diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go
index 511db6733..295f9c398 100644
--- a/pkg/sentry/limits/linux.go
+++ b/pkg/sentry/limits/linux.go
@@ -30,7 +30,7 @@ var FromLinuxResource = map[int]LimitType{
 	linux.RLIMIT_RSS:        Rss,
 	linux.RLIMIT_NPROC:      ProcessCount,
 	linux.RLIMIT_NOFILE:     NumberOfFiles,
-	linux.RLIMIT_MEMLOCK:    MemoryPagesLocked,
+	linux.RLIMIT_MEMLOCK:    MemoryLocked,
 	linux.RLIMIT_AS:         AS,
 	linux.RLIMIT_LOCKS:      Locks,
 	linux.RLIMIT_SIGPENDING: SignalsPending,
diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go
index 28e2bed9b..cf20b11e3 100644
--- a/pkg/sentry/memmap/memmap.go
+++ b/pkg/sentry/memmap/memmap.go
@@ -243,6 +243,40 @@ type MappingIdentity interface {
 	Msync(ctx context.Context, mr MappableRange) error
 }
 
+// MLockMode specifies the memory locking behavior of a memory mapping.
+type MLockMode int
+
+// Note that the ordering of MLockModes is significant; see
+// mm.MemoryManager.defMLockMode.
+const (
+	// MLockNone specifies that a mapping has no memory locking behavior.
+	//
+	// This must be the zero value for MLockMode.
+	MLockNone MLockMode = iota
+
+	// MLockEager specifies that a mapping is memory-locked, as by mlock() or
+	// similar. Pages in the mapping should be made, and kept, resident in
+	// physical memory as soon as possible.
+	//
+	// As of this writing, MLockEager does not cause memory-locking to be
+	// requested from the host; it only affects the sentry's memory management
+	// behavior.
+	//
+	// MLockEager is analogous to Linux's VM_LOCKED.
+	MLockEager
+
+	// MLockLazy specifies that a mapping is memory-locked, as by mlock() or
+	// similar. Pages in the mapping should be kept resident in physical memory
+	// once they have been made resident due to e.g. a page fault.
+	//
+	// As of this writing, MLockLazy does not cause memory-locking to be
+	// requested from the host; in fact, it has virtually no effect, except for
+	// interactions between mlocked pages and other syscalls.
+	//
+	// MLockLazy is analogous to Linux's VM_LOCKED | VM_LOCKONFAULT.
+	MLockLazy
+)
+
 // MMapOpts specifies a request to create a memory mapping.
 type MMapOpts struct {
 	// Length is the length of the mapping.
@@ -303,6 +337,9 @@ type MMapOpts struct {
 	// mapping (see platform.AddressSpace.MapFile).
 	Precommit bool
 
+	// MLockMode specifies the memory locking behavior of the mapping.
+	MLockMode MLockMode
+
 	// Hint is the name used for the mapping in /proc/[pid]/maps. If Hint is
 	// empty, MappingIdentity.MappedName() will be used instead.
 	//
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index 744e73a39..5a9185e5d 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -106,6 +106,7 @@ go_library(
         "//pkg/sentry/context",
         "//pkg/sentry/fs",
         "//pkg/sentry/fs/proc/seqfile",
+        "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/futex",
         "//pkg/sentry/kernel/shm",
         "//pkg/sentry/limits",
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go
index 7488f7c4a..e7aa24c69 100644
--- a/pkg/sentry/mm/address_space.go
+++ b/pkg/sentry/mm/address_space.go
@@ -149,7 +149,7 @@ func (mm *MemoryManager) Deactivate() {
 // for all addresses in ar should be precommitted.
 //
 // Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0.
-// ar must be page-aligned. pseg.Range().Contains(ar.Start).
+// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start).
 func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error {
 	// By default, map entire pmas at a time, under the assumption that there
 	// is no cost to mapping more of a pma than necessary.
@@ -173,7 +173,9 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre
 		}
 	}
 
-	for {
+	// Since this checks ar.End and not mapAR.End, we will never map a pma that
+	// is not required.
+	for pseg.Ok() && pseg.Start() < ar.End {
 		pma := pseg.ValuePtr()
 		pmaAR := pseg.Range()
 		pmaMapAR := pmaAR.Intersect(mapAR)
@@ -184,13 +186,9 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre
 		if err := pma.file.MapInto(mm.as, pmaMapAR.Start, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil {
 			return err
 		}
-		// Since this checks ar.End and not mapAR.End, we will never map a pma
-		// that is not required.
-		if ar.End <= pmaAR.End {
-			return nil
-		}
 		pseg = pseg.NextSegment()
 	}
+	return nil
 }
 
 // unmapASLocked removes all AddressSpace mappings for addresses in ar.
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index 1613ce11d..a42e32b43 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
 )
@@ -58,13 +59,17 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
 	mm.mappingMu.RLock()
 	defer mm.mappingMu.RUnlock()
 	mm2 := &MemoryManager{
-		p:                    mm.p,
-		haveASIO:             mm.haveASIO,
-		layout:               mm.layout,
-		privateRefs:          mm.privateRefs,
-		users:                1,
-		usageAS:              mm.usageAS,
-		brk:                  mm.brk,
+		p:           mm.p,
+		haveASIO:    mm.haveASIO,
+		layout:      mm.layout,
+		privateRefs: mm.privateRefs,
+		users:       1,
+		brk:         mm.brk,
+		usageAS:     mm.usageAS,
+		// "The child does not inherit its parent's memory locks (mlock(2),
+		// mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is
+		// MLockNone, both of which are zero values. vma.mlockMode is reset
+		// when copied below.
 		captureInvalidations: true,
 		argv:                 mm.argv,
 		envv:                 mm.envv,
@@ -77,7 +82,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
 	// Copy vmas.
 	dstvgap := mm2.vmas.FirstGap()
 	for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() {
-		vma := srcvseg.ValuePtr()
+		vma := srcvseg.Value() // makes a copy of the vma
 		vmaAR := srcvseg.Range()
 		// Inform the Mappable, if any, of the new mapping.
 		if vma.mappable != nil {
@@ -89,7 +94,8 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
 		if vma.id != nil {
 			vma.id.IncRef()
 		}
-		dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, *vma).NextGap()
+		vma.mlockMode = memmap.MLockNone
+		dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap()
 		// We don't need to update mm2.usageAS since we copied it from mm
 		// above.
 	}
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index b1e39e898..c0632d232 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -95,11 +95,6 @@ type MemoryManager struct {
 	// vmas is protected by mappingMu.
 	vmas vmaSet
 
-	// usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks.
-	//
-	// usageAS is protected by mappingMu.
-	usageAS uint64
-
 	// brk is the mm's brk, which is manipulated using the brk(2) system call.
 	// The brk is initially set up by the loader which maps an executable
 	// binary into the mm.
@@ -107,6 +102,23 @@ type MemoryManager struct {
 	// brk is protected by mappingMu.
 	brk usermem.AddrRange
 
+	// usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks.
+	//
+	// usageAS is protected by mappingMu.
+	usageAS uint64
+
+	// lockedAS is the combined size in bytes of all vmas with vma.mlockMode !=
+	// memmap.MLockNone.
+	//
+	// lockedAS is protected by mappingMu.
+	lockedAS uint64
+
+	// defMLockMode is the minimum mlock mode applied to new vmas: MMap uses
+	// the greater of memmap.MMapOpts.MLockMode and defMLockMode.
+	//
+	// defMLockMode is protected by mappingMu.
+	defMLockMode memmap.MLockMode
+
 	// activeMu is loosely analogous to Linux's struct
 	// mm_struct::page_table_lock.
 	activeMu ssync.DowngradableRWMutex `state:"nosave"`
@@ -252,6 +264,8 @@ type vma struct {
 	// metag, none of which we currently support.
 	growsDown bool `state:"manual"`
 
+	mlockMode memmap.MLockMode
+
 	// If id is not nil, it controls the lifecycle of mappable and provides vma
 	// metadata shown in /proc/[pid]/maps, and the vma holds a reference.
 	id memmap.MappingIdentity
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index daaae4da1..383703ec3 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -20,6 +20,7 @@ import (
 
 	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
@@ -128,16 +129,24 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme
 
 	// Get the new vma.
 	mm.mappingMu.Lock()
+	if opts.MLockMode < mm.defMLockMode {
+		opts.MLockMode = mm.defMLockMode
+	}
 	vseg, ar, err := mm.createVMALocked(ctx, opts)
 	if err != nil {
 		mm.mappingMu.Unlock()
 		return 0, err
 	}
 
+	// TODO: In Linux, VM_LOCKONFAULT (which may be set on the new
+	// vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears
+	// to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in
+	// mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() =>
+	// populate_vma_page_range(). Confirm this behavior.
 	switch {
-	case opts.Precommit:
+	case opts.Precommit || opts.MLockMode == memmap.MLockEager:
 		// Get pmas and map with precommit as requested.
-		mm.populateAndUnlock(ctx, vseg, ar, true)
+		mm.populateVMAAndUnlock(ctx, vseg, ar, true)
 
 	case opts.Mappable == nil && length <= privateAllocUnit:
 		// NOTE: Get pmas and map eagerly in the hope
@@ -146,7 +155,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme
 		// memmap.Mappable.Translate is unknown; and only for small mappings,
 		// to avoid needing to allocate large amounts of memory that we may
 		// subsequently need to checkpoint.
-		mm.populateAndUnlock(ctx, vseg, ar, false)
+		mm.populateVMAAndUnlock(ctx, vseg, ar, false)
 
 	default:
 		mm.mappingMu.Unlock()
@@ -155,31 +164,29 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme
 	return ar.Start, nil
 }
 
-// Preconditions: mm.mappingMu must be locked for writing.
+// populateVMA obtains pmas for addresses in ar in the given vma, and maps them
+// into mm.as if it is active.
 //
-// Postconditions: mm.mappingMu will be unlocked.
-func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
+// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar).
+func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
 	if !vseg.ValuePtr().effectivePerms.Any() {
 		// Linux doesn't populate inaccessible pages. See
 		// mm/gup.c:populate_vma_page_range.
-		mm.mappingMu.Unlock()
 		return
 	}
 
 	mm.activeMu.Lock()
+	// Can't defer mm.activeMu.Unlock(); see below.
 
-	// Even if we get a new pma, we can't actually map it if we don't have an
+	// Even if we get new pmas, we can't actually map them if we don't have an
 	// AddressSpace.
 	if mm.as == nil {
 		mm.activeMu.Unlock()
-		mm.mappingMu.Unlock()
 		return
 	}
 
 	// Ensure that we have usable pmas.
-	mm.mappingMu.DowngradeLock()
 	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{})
-	mm.mappingMu.RUnlock()
 	if err != nil {
 		// mm/util.c:vm_mmap_pgoff() ignores the error, if any, from
 		// mm/gup.c:mm_populate(). If it matters, we'll get it again when
@@ -197,6 +204,45 @@ func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator
 	mm.activeMu.RUnlock()
 }
 
+// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally
+// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is
+// preferable to populateVMA since it unlocks mm.mappingMu before performing
+// expensive operations that don't require it to be locked.
+//
+// Preconditions: mm.mappingMu must be locked for writing.
+// vseg.Range().IsSupersetOf(ar).
+//
+// Postconditions: mm.mappingMu will be unlocked.
+func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) {
+	// See populateVMA above for commentary.
+	if !vseg.ValuePtr().effectivePerms.Any() {
+		mm.mappingMu.Unlock()
+		return
+	}
+
+	mm.activeMu.Lock()
+
+	if mm.as == nil {
+		mm.activeMu.Unlock()
+		mm.mappingMu.Unlock()
+		return
+	}
+
+	// mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it
+	// isn't needed at all for mapASLocked.
+	mm.mappingMu.DowngradeLock()
+	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{})
+	mm.mappingMu.RUnlock()
+	if err != nil {
+		mm.activeMu.Unlock()
+		return
+	}
+
+	mm.activeMu.DowngradeLock()
+	mm.mapASLocked(pseg, ar, precommit)
+	mm.activeMu.RUnlock()
+}
+
 // MapStack allocates the initial process stack.
 func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) {
 	// maxStackSize is the maximum supported process stack size in bytes.
@@ -236,6 +282,7 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error
 		MaxPerms:  usermem.AnyAccess,
 		Private:   true,
 		GrowsDown: true,
+		MLockMode: mm.defMLockMode,
 		Hint:      "[stack]",
 	})
 	return ar, err
@@ -334,6 +381,19 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi
 	// occupies at least part of the destination. Thus the NoMove case always
 	// fails and the MayMove case always falls back to copying.
 
+	if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone {
+		// Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall,
+		// mremap in Linux does not check mm/mlock.c:can_do_mlock() and
+		// therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and
+		// !CAP_IPC_LOCK.
+		mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
+		if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
+			if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit {
+				return 0, syserror.EAGAIN
+			}
+		}
+	}
+
 	if opts.Move != MRemapMustMove {
 		// Handle no-ops and in-place shrinking. These cases don't care if
 		// [oldAddr, oldEnd) maps to a single vma, or is even mapped at all
@@ -360,7 +420,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi
 		if vma.mappable != nil {
 			newOffset = vseg.mappableRange().End
 		}
-		_, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+		vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
 			Length:          newSize - oldSize,
 			MappingIdentity: vma.id,
 			Mappable:        vma.mappable,
@@ -371,9 +431,13 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi
 			MaxPerms:        vma.maxPerms,
 			Private:         vma.private,
 			GrowsDown:       vma.growsDown,
+			MLockMode:       vma.mlockMode,
 			Hint:            vma.hint,
 		})
 		if err == nil {
+			if vma.mlockMode == memmap.MLockEager {
+				mm.populateVMA(ctx, vseg, ar, true)
+			}
 			return oldAddr, nil
 		}
 		// In-place growth failed. In the MRemapMayMove case, fall through to
@@ -462,8 +526,14 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi
 		if vma.id != nil {
 			vma.id.IncRef()
 		}
-		mm.vmas.Add(newAR, vma)
+		vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
 		mm.usageAS += uint64(newAR.Length())
+		if vma.mlockMode != memmap.MLockNone {
+			mm.lockedAS += uint64(newAR.Length())
+			if vma.mlockMode == memmap.MLockEager {
+				mm.populateVMA(ctx, vseg, newAR, true)
+			}
+		}
 		return newAR.Start, nil
 	}
 
@@ -485,8 +555,11 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi
 	vseg = mm.vmas.Isolate(vseg, oldAR)
 	vma := vseg.Value()
 	mm.vmas.Remove(vseg)
-	mm.vmas.Add(newAR, vma)
+	vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma)
 	mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length())
+	if vma.mlockMode != memmap.MLockNone {
+		mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length())
+	}
 
 	// Move pmas. This is technically optional for non-private pmas, which
 	// could just go through memmap.Mappable.Translate again, but it's required
@@ -501,6 +574,10 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi
 		vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.isMappableAsWritable())
 	}
 
+	if vma.mlockMode == memmap.MLockEager {
+		mm.populateVMA(ctx, vseg, newAR, true)
+	}
+
 	return newAR.Start, nil
 }
 
@@ -611,9 +688,10 @@ func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) {
 // error on failure.
 func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) {
 	mm.mappingMu.Lock()
-	defer mm.mappingMu.Unlock()
+	// Can't defer mm.mappingMu.Unlock(); see below.
 
 	if addr < mm.brk.Start {
+		mm.mappingMu.Unlock()
 		return mm.brk.End, syserror.EINVAL
 	}
 
@@ -623,21 +701,24 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad
 	// heap + data + bss. The segment sizes need to be plumbed from the
 	// loader package to fully enforce RLIMIT_DATA.
 	if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur {
+		mm.mappingMu.Unlock()
 		return mm.brk.End, syserror.ENOMEM
 	}
 
 	oldbrkpg, _ := mm.brk.End.RoundUp()
 	newbrkpg, ok := addr.RoundUp()
 	if !ok {
+		mm.mappingMu.Unlock()
 		return mm.brk.End, syserror.EFAULT
 	}
 
 	switch {
 	case newbrkpg < oldbrkpg:
 		mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg})
+		mm.mappingMu.Unlock()
 
 	case oldbrkpg < newbrkpg:
-		_, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+		vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
 			Length: uint64(newbrkpg - oldbrkpg),
 			Addr:   oldbrkpg,
 			Fixed:  true,
@@ -646,17 +727,221 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad
 			Perms:    usermem.ReadWrite,
 			MaxPerms: usermem.AnyAccess,
 			Private:  true,
-			Hint:     "[heap]",
+			// Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes
+			// mm->def_flags.
+			MLockMode: mm.defMLockMode,
+			Hint:      "[heap]",
 		})
 		if err != nil {
+			mm.mappingMu.Unlock()
 			return mm.brk.End, err
 		}
+		if mm.defMLockMode == memmap.MLockEager {
+			mm.populateVMAAndUnlock(ctx, vseg, ar, true)
+		} else {
+			mm.mappingMu.Unlock()
+		}
+
+	default:
+		// Nothing to do.
+		mm.mappingMu.Unlock()
 	}
 
 	mm.brk.End = addr
 	return addr, nil
 }
 
+// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(),
+// depending on mode.
+func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error {
+	// Linux allows this to overflow.
+	la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp()
+	ar, ok := addr.RoundDown().ToRange(uint64(la))
+	if !ok {
+		return syserror.EINVAL
+	}
+
+	mm.mappingMu.Lock()
+	// Can't defer mm.mappingMu.Unlock(); see below.
+
+	if mode != memmap.MLockNone {
+		// Check against RLIMIT_MEMLOCK.
+		if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
+			mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
+			if mlockLimit == 0 {
+				mm.mappingMu.Unlock()
+				return syserror.EPERM
+			}
+			if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit {
+				mm.mappingMu.Unlock()
+				return syserror.ENOMEM
+			}
+		}
+	}
+
+	// Check this after RLIMIT_MEMLOCK for consistency with Linux.
+	if ar.Length() == 0 {
+		mm.mappingMu.Unlock()
+		return nil
+	}
+
+	// Apply the new mlock mode to vmas.
+	var unmapped bool
+	vseg := mm.vmas.FindSegment(ar.Start)
+	for {
+		if !vseg.Ok() {
+			unmapped = true
+			break
+		}
+		vseg = mm.vmas.Isolate(vseg, ar)
+		vma := vseg.ValuePtr()
+		prevMode := vma.mlockMode
+		vma.mlockMode = mode
+		if mode != memmap.MLockNone && prevMode == memmap.MLockNone {
+			mm.lockedAS += uint64(vseg.Range().Length())
+		} else if mode == memmap.MLockNone && prevMode != memmap.MLockNone {
+			mm.lockedAS -= uint64(vseg.Range().Length())
+		}
+		if ar.End <= vseg.End() {
+			break
+		}
+		vseg, _ = vseg.NextNonEmpty()
+	}
+	mm.vmas.MergeRange(ar)
+	mm.vmas.MergeAdjacent(ar)
+	if unmapped {
+		mm.mappingMu.Unlock()
+		return syserror.ENOMEM
+	}
+
+	if mode == memmap.MLockEager {
+		// Ensure that we have usable pmas. Since we didn't return ENOMEM
+		// above, ar must be fully covered by vmas, so we can just use
+		// NextSegment below.
+		mm.activeMu.Lock()
+		mm.mappingMu.DowngradeLock()
+		for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
+			if !vseg.ValuePtr().effectivePerms.Any() {
+				// Linux: mm/gup.c:__get_user_pages() returns EFAULT in this
+				// case, which is converted to ENOMEM by mlock.
+				mm.activeMu.Unlock()
+				mm.mappingMu.RUnlock()
+				return syserror.ENOMEM
+			}
+			_, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), pmaOpts{})
+			if err != nil {
+				mm.activeMu.Unlock()
+				mm.mappingMu.RUnlock()
+				// Linux: mm/mlock.c:__mlock_posix_error_return()
+				if err == syserror.EFAULT {
+					return syserror.ENOMEM
+				}
+				if err == syserror.ENOMEM {
+					return syserror.EAGAIN
+				}
+				return err
+			}
+		}
+
+		// Map pmas into the active AddressSpace, if we have one.
+		mm.mappingMu.RUnlock()
+		if mm.as != nil {
+			mm.activeMu.DowngradeLock()
+			err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */)
+			mm.activeMu.RUnlock()
+			if err != nil {
+				return err
+			}
+		} else {
+			mm.activeMu.Unlock()
+		}
+	} else {
+		mm.mappingMu.Unlock()
+	}
+
+	return nil
+}
+
+// MLockAllOpts holds options to MLockAll.
+type MLockAllOpts struct {
+	// If Current is true, change the memory-locking behavior of all mappings
+	// to Mode. If Future is true, upgrade the memory-locking behavior of all
+	// future mappings to Mode. At least one of Current or Future must be true.
+	Current bool
+	Future  bool
+	Mode    memmap.MLockMode
+}
+
+// MLockAll implements the semantics of Linux's mlockall()/munlockall(),
+// depending on opts.
+func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error {
+	if !opts.Current && !opts.Future {
+		return syserror.EINVAL
+	}
+
+	mm.mappingMu.Lock()
+	// Can't defer mm.mappingMu.Unlock(); see below.
+
+	if opts.Current {
+		if opts.Mode != memmap.MLockNone {
+			// Check against RLIMIT_MEMLOCK.
+			if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
+				mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
+				if mlockLimit == 0 {
+					mm.mappingMu.Unlock()
+					return syserror.EPERM
+				}
+				if uint64(mm.vmas.Span()) > mlockLimit {
+					mm.mappingMu.Unlock()
+					return syserror.ENOMEM
+				}
+			}
+		}
+		for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
+			vma := vseg.ValuePtr()
+			prevMode := vma.mlockMode
+			vma.mlockMode = opts.Mode
+			if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone {
+				mm.lockedAS += uint64(vseg.Range().Length())
+			} else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone {
+				mm.lockedAS -= uint64(vseg.Range().Length())
+			}
+		}
+	}
+
+	if opts.Future {
+		mm.defMLockMode = opts.Mode
+	}
+
+	if opts.Current && opts.Mode == memmap.MLockEager {
+		// Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate()
+		// ignores the return value of __mm_populate(), so all errors below are
+		// ignored.
+		//
+		// Try to get usable pmas.
+		mm.activeMu.Lock()
+		mm.mappingMu.DowngradeLock()
+		for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
+			if vseg.ValuePtr().effectivePerms.Any() {
+				mm.getPMAsLocked(ctx, vseg, vseg.Range(), pmaOpts{})
+			}
+		}
+
+		// Map all pmas into the active AddressSpace, if we have one.
+		mm.mappingMu.RUnlock()
+		if mm.as != nil {
+			mm.activeMu.DowngradeLock()
+			mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */)
+			mm.activeMu.RUnlock()
+		} else {
+			mm.activeMu.Unlock()
+		}
+	} else {
+		mm.mappingMu.Unlock()
+	}
+	return nil
+}
+
 // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED).
 func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
 	ar, ok := addr.ToRange(length)
@@ -680,46 +965,49 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
 	// ensures that Decommit immediately reduces host memory usage.
 	var didUnmapAS bool
 	pseg := mm.pmas.LowerBoundSegment(ar.Start)
-	vseg := mm.vmas.LowerBoundSegment(ar.Start)
 	mem := mm.p.Memory()
-	for pseg.Ok() && pseg.Start() < ar.End {
-		pma := pseg.ValuePtr()
-		if pma.private && !mm.isPMACopyOnWriteLocked(pseg) {
-			psegAR := pseg.Range().Intersect(ar)
-			vseg = vseg.seekNextLowerBound(psegAR.Start)
-			if checkInvariants {
-				if !vseg.Ok() {
-					panic(fmt.Sprintf("no vma after %#x", psegAR.Start))
-				}
-				if psegAR.Start < vseg.Start() {
-					panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start()))
-				}
+	for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
+		vma := vseg.ValuePtr()
+		if vma.mlockMode != memmap.MLockNone {
+			return syserror.EINVAL
+		}
+		vsegAR := vseg.Range().Intersect(ar)
+		// pseg should already correspond to either this vma or a later one,
+		// since there can't be a pma without a corresponding vma.
+		if checkInvariants {
+			if pseg.Ok() && pseg.End() <= vsegAR.Start {
+				panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR))
 			}
-			if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil {
-				if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil {
-					pseg = pseg.NextSegment()
-					continue
+		}
+		for pseg.Ok() && pseg.Start() < vsegAR.End {
+			pma := pseg.ValuePtr()
+			if pma.private && !mm.isPMACopyOnWriteLocked(pseg) {
+				psegAR := pseg.Range().Intersect(ar)
+				if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil {
+					if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil {
+						pseg = pseg.NextSegment()
+						continue
+					}
+					// If an error occurs, fall through to the general
+					// invalidation case below.
 				}
-				// If an error occurs, fall through to the general
-				// invalidation case below.
 			}
+			pseg = mm.pmas.Isolate(pseg, vsegAR)
+			pma = pseg.ValuePtr()
+			if !didUnmapAS {
+				// Unmap all of ar, not just pseg.Range(), to minimize host
+				// syscalls. AddressSpace mappings must be removed before
+				// mm.decPrivateRef().
+				mm.unmapASLocked(ar)
+				didUnmapAS = true
+			}
+			if pma.private {
+				mm.decPrivateRef(pseg.fileRange())
+			}
+			pma.file.DecRef(pseg.fileRange())
+			mm.removeRSSLocked(pseg.Range())
+			pseg = mm.pmas.Remove(pseg).NextSegment()
 		}
-		pseg = mm.pmas.Isolate(pseg, ar)
-		pma = pseg.ValuePtr()
-		if !didUnmapAS {
-			// Unmap all of ar, not just pseg.Range(), to minimize host
-			// syscalls. AddressSpace mappings must be removed before
-			// mm.decPrivateRef().
-			mm.unmapASLocked(ar)
-			didUnmapAS = true
-		}
-		if pma.private {
-			mm.decPrivateRef(pseg.fileRange())
-		}
-		pma.file.DecRef(pseg.fileRange())
-		mm.removeRSSLocked(pseg.Range())
-
-		pseg = mm.pmas.Remove(pseg).NextSegment()
 	}
 
 	// "If there are some parts of the specified address space that are not
@@ -732,9 +1020,28 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
 	return nil
 }
 
-// Sync implements the semantics of Linux's msync(MS_SYNC).
-func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error {
-	ar, ok := addr.ToRange(length)
+// MSyncOpts holds options to MSync.
+type MSyncOpts struct {
+	// Sync has the semantics of MS_SYNC.
+	Sync bool
+
+	// Invalidate has the semantics of MS_INVALIDATE.
+	Invalidate bool
+}
+
+// MSync implements the semantics of Linux's msync().
+func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error {
+	if addr != addr.RoundDown() {
+		return syserror.EINVAL
+	}
+	if length == 0 {
+		return nil
+	}
+	la, ok := usermem.Addr(length).RoundUp()
+	if !ok {
+		return syserror.ENOMEM
+	}
+	ar, ok := addr.ToRange(uint64(la))
 	if !ok {
 		return syserror.ENOMEM
 	}
@@ -759,10 +1066,14 @@ func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uin
 		}
 		lastEnd = vseg.End()
 		vma := vseg.ValuePtr()
+		if opts.Invalidate && vma.mlockMode != memmap.MLockNone {
+			mm.mappingMu.RUnlock()
+			return syserror.EBUSY
+		}
 		// It's only possible to have dirtied the Mappable through a shared
 		// mapping. Don't check if the mapping is writable, because mprotect
 		// may have changed this, and also because Linux doesn't.
-		if id := vma.id; id != nil && vma.mappable != nil && !vma.private {
+		if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private {
 			// We can't call memmap.MappingIdentity.Msync while holding
 			// mm.mappingMu since it may take fs locks that precede it in the
 			// lock order.
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
index 5c2c802f6..28ba9f2f5 100644
--- a/pkg/sentry/mm/vma.go
+++ b/pkg/sentry/mm/vma.go
@@ -17,8 +17,10 @@ package mm
 import (
 	"fmt"
 
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
@@ -53,6 +55,23 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp
 		return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM
 	}
 
+	if opts.MLockMode != memmap.MLockNone {
+		// Check against RLIMIT_MEMLOCK.
+		if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) {
+			mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur
+			if mlockLimit == 0 {
+				return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM
+			}
+			newLockedAS := mm.lockedAS + opts.Length
+			if opts.Unmap {
+				newLockedAS -= mm.mlockedBytesRangeLocked(ar)
+			}
+			if newLockedAS > mlockLimit {
+				return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN
+			}
+		}
+	}
+
 	// Remove overwritten mappings. This ordering is consistent with Linux:
 	// compare Linux's mm/mmap.c:mmap_region() => do_munmap(),
 	// file->f_op->mmap().
@@ -85,10 +104,14 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp
 		maxPerms:       opts.MaxPerms,
 		private:        opts.Private,
 		growsDown:      opts.GrowsDown,
+		mlockMode:      opts.MLockMode,
 		id:             opts.MappingIdentity,
 		hint:           opts.Hint,
 	})
 	mm.usageAS += opts.Length
+	if opts.MLockMode != memmap.MLockNone {
+		mm.lockedAS += opts.Length
+	}
 
 	return vseg, ar, nil
 }
@@ -201,6 +224,17 @@ func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bo
 	return 0, syserror.ENOMEM
 }
 
+// mlockedBytesRangeLocked returns the number of bytes of ar that lie in mlocked vmas. Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 {
+	var total uint64
+	for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
+		if vseg.ValuePtr().mlockMode != memmap.MLockNone {
+			total += uint64(vseg.Range().Intersect(ar).Length()) // Count only the part of the vma inside ar.
+		}
+	}
+	return total
+}
+
 // getVMAsLocked ensures that vmas exist for all addresses in ar, and support
 // access of type (at, ignorePermissions). It returns:
 //
@@ -338,6 +372,9 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa
 			vma.id.DecRef()
 		}
 		mm.usageAS -= uint64(vmaAR.Length())
+		if vma.mlockMode != memmap.MLockNone {
+			mm.lockedAS -= uint64(vmaAR.Length())
+		}
 		vgap = mm.vmas.Remove(vseg)
 		vseg = vgap.NextSegment()
 	}
@@ -368,6 +405,7 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa
 		vma1.maxPerms != vma2.maxPerms ||
 		vma1.private != vma2.private ||
 		vma1.growsDown != vma2.growsDown ||
+		vma1.mlockMode != vma2.mlockMode ||
 		vma1.id != vma2.id ||
 		vma1.hint != vma2.hint {
 		return vma{}, false
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 2aab948da..cc5ebb955 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -196,11 +196,11 @@ var AMD64 = &kernel.SyscallTable{
 		145: SchedGetscheduler,
 		146: SchedGetPriorityMax,
 		147: SchedGetPriorityMin,
-		148: syscalls.ErrorWithEvent(syscall.EPERM),      // SchedRrGetInterval,
-		149: syscalls.Error(nil),                         // Mlock, TODO
-		150: syscalls.Error(nil),                         // Munlock, TODO
-		151: syscalls.Error(nil),                         // Mlockall, TODO
-		152: syscalls.Error(nil),                         // Munlockall, TODO
+		148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval,
+		149: Mlock,
+		150: Munlock,
+		151: Mlockall,
+		152: Munlockall,
 		153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup,
 		154: syscalls.Error(syscall.EPERM),               // ModifyLdt,
 		155: syscalls.Error(syscall.EPERM),               // PivotRoot,
@@ -373,8 +373,9 @@ var AMD64 = &kernel.SyscallTable{
 		//     322: Execveat, TODO
 		//     323: Userfaultfd, TODO
 		//     324: Membarrier, TODO
-		// Syscalls after 325 are backports from 4.6.
-		325: syscalls.Error(nil), // Mlock2, TODO
+		325: Mlock2,
+		// Syscalls after 325 are "backports" from versions of Linux after 4.4.
+		//	326: CopyFileRange,
 		327: Preadv2,
 		//	328: Pwritev2,  // Pwritev2, TODO
 	},
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 145f7846c..8732861e0 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -69,6 +69,9 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
 		GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
 		Precommit: linux.MAP_POPULATE&flags != 0,
 	}
+	if linux.MAP_LOCKED&flags != 0 {
+		opts.MLockMode = memmap.MLockEager
+	}
 	defer func() {
 		if opts.MappingIdentity != nil {
 			opts.MappingIdentity.DecRef()
@@ -384,16 +387,6 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	length := args[1].SizeT()
 	flags := args[2].Int()
 
-	if addr != addr.RoundDown() {
-		return 0, nil, syserror.EINVAL
-	}
-	if length == 0 {
-		return 0, nil, nil
-	}
-	la, ok := usermem.Addr(length).RoundUp()
-	if !ok {
-		return 0, nil, syserror.ENOMEM
-	}
 	// "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC,
 	// and may additionally include the MS_INVALIDATE bit. ... However, Linux
 	// permits a call to msync() that specifies neither of these flags, with
@@ -406,39 +399,72 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
 	if sync && flags&linux.MS_ASYNC != 0 {
 		return 0, nil, syserror.EINVAL
 	}
+	err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{
+		Sync:       sync,
+		Invalidate: flags&linux.MS_INVALIDATE != 0,
+	})
+	// MSync calls fsync, so the same interrupt conversion rules apply; see
+	// mm/msync.c and fsync in POSIX.1-2008.
+	return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
+}
+
+// Mlock implements Linux syscall mlock(2) by calling MLock with mode memmap.MLockEager.
+func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].SizeT()
+
+	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager)
+}
 
-	// MS_INVALIDATE "asks to invalidate other mappings of the same file (so
-	// that they can be updated with the fresh values just written)". This is a
-	// no-op given that shared memory exists. However, MS_INVALIDATE can also
-	// be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags,
-	// and a memory lock exists for the specified address range." Given that
-	// mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since
-	// some user program could be using it for synchronization.
-	if flags&linux.MS_INVALIDATE != 0 {
+// Mlock2 implements Linux syscall mlock2(2). MLOCK_ONFAULT selects memmap.MLockLazy instead of memmap.MLockEager; any other flag bit yields EINVAL.
+func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].SizeT()
+	flags := args[2].Int()
+
+	if flags&^(linux.MLOCK_ONFAULT) != 0 {
 		return 0, nil, syserror.EINVAL
 	}
-	// MS_SYNC "requests an update and waits for it to complete."
-	if sync {
-		err := t.MemoryManager().Sync(t, addr, uint64(la))
-		// Sync calls fsync, the same interrupt conversion rules apply, see
-		// mm/msync.c, fsync POSIX.1-2008.
-		return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
-	}
-	// MS_ASYNC "specifies that an update be scheduled, but the call returns
-	// immediately". As long as dirty pages are tracked and eventually written
-	// back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC
-	// is in fact a no-op, since the kernel properly tracks dirty pages and
-	// flushes them to storage as necessary.")
-	//
-	// However: "ENOMEM: The indicated memory (or part of it) was not mapped."
-	// This applies even for MS_ASYNC.
-	ar, ok := addr.ToRange(uint64(la))
-	if !ok {
-		return 0, nil, syserror.ENOMEM
+
+	mode := memmap.MLockEager
+	if flags&linux.MLOCK_ONFAULT != 0 {
+		mode = memmap.MLockLazy
 	}
-	mapped := t.MemoryManager().VirtualMemorySizeRange(ar)
-	if mapped != uint64(la) {
-		return 0, nil, syserror.ENOMEM
+	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode)
+}
+
+// Munlock implements Linux syscall munlock(2) by calling MLock with mode memmap.MLockNone.
+func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	length := args[1].SizeT()
+
+	return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone)
+}
+
+// Mlockall implements Linux syscall mlockall(2). MCL_ONFAULT selects memmap.MLockLazy instead of memmap.MLockEager; unknown flag bits yield EINVAL.
+func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	flags := args[0].Int()
+
+	if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 {
 		return 0, nil, syserror.EINVAL
 	}
-	return 0, nil, nil
+
+	mode := memmap.MLockEager
+	if flags&linux.MCL_ONFAULT != 0 {
+		mode = memmap.MLockLazy
+	}
+	return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
+		Current: flags&linux.MCL_CURRENT != 0,
+		Future:  flags&linux.MCL_FUTURE != 0,
+		Mode:    mode,
+	})
+}
+
+// Munlockall implements Linux syscall munlockall(2) by calling MLockAll with Current and Future set and mode memmap.MLockNone.
+func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{
+		Current: true,
+		Future:  true,
+		Mode:    memmap.MLockNone,
+	})
 }
diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go
index 2f16e1791..b0b216045 100644
--- a/pkg/sentry/syscalls/linux/sys_rlimit.go
+++ b/pkg/sentry/syscalls/linux/sys_rlimit.go
@@ -90,6 +90,7 @@ var setableLimits = map[limits.LimitType]struct{}{
 	limits.CPU:           {},
 	limits.Data:          {},
 	limits.FileSize:      {},
+	limits.MemoryLocked:  {},
 	limits.Stack:         {},
 	// These are not enforced, but we include them here to avoid returning
 	// EPERM, since some apps expect them to succeed.
author	Jamie Liu <jamieliu@google.com>	2018-12-17 11:37:38 -0800
committer	Shentubot <shentubot@google.com>	2018-12-17 11:38:59 -0800
commit	2421006426445a1827422c2dbdd6fc6a47087147 (patch)
tree	49aa2bc113c208fc117aff8a036866a7260090e5 /pkg/sentry
parent	54694086dfb02a6f8453f043a44ffd10bb5a7070 (diff)