diff options
author | Jamie Liu <jamieliu@google.com> | 2018-12-17 11:37:38 -0800 |
---|---|---|
committer | Shentubot <shentubot@google.com> | 2018-12-17 11:38:59 -0800 |
commit | 2421006426445a1827422c2dbdd6fc6a47087147 (patch) | |
tree | 49aa2bc113c208fc117aff8a036866a7260090e5 /pkg/sentry | |
parent | 54694086dfb02a6f8453f043a44ffd10bb5a7070 (diff) |
Implement mlock(), kind of.
Currently mlock() and friends do nothing whatsoever. However, mlocking
is directly application-visible in a number of ways; for example,
madvise(MADV_DONTNEED) and msync(MS_INVALIDATE) both fail on mlocked
regions. We handle this inconsistently: MADV_DONTNEED is too important
to not work, but MS_INVALIDATE is rejected.
Change MM to track mlocked regions in a manner consistent with Linux.
It still will not actually pin pages into host physical memory, but:
- mlock() will now cause sentry memory management to precommit mlocked
pages.
- MADV_DONTNEED and MS_INVALIDATE will now both fail on mlocked regions,
as described above.
PiperOrigin-RevId: 225861605
Change-Id: Iee187204979ac9a4d15d0e037c152c0902c8d0ee
Diffstat (limited to 'pkg/sentry')
-rw-r--r-- | pkg/sentry/limits/limits.go | 2 | ||||
-rw-r--r-- | pkg/sentry/limits/linux.go | 2 | ||||
-rw-r--r-- | pkg/sentry/memmap/memmap.go | 37 | ||||
-rw-r--r-- | pkg/sentry/mm/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/mm/address_space.go | 12 | ||||
-rw-r--r-- | pkg/sentry/mm/lifecycle.go | 24 | ||||
-rw-r--r-- | pkg/sentry/mm/mm.go | 24 | ||||
-rw-r--r-- | pkg/sentry/mm/syscalls.go | 423 | ||||
-rw-r--r-- | pkg/sentry/mm/vma.go | 38 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/linux64.go | 15 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_mmap.go | 106 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_rlimit.go | 1 |
12 files changed, 559 insertions, 126 deletions
diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index ba0b7d4fd..eeca01876 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -33,7 +33,7 @@ const ( Rss ProcessCount NumberOfFiles - MemoryPagesLocked + MemoryLocked AS Locks SignalsPending diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go index 511db6733..295f9c398 100644 --- a/pkg/sentry/limits/linux.go +++ b/pkg/sentry/limits/linux.go @@ -30,7 +30,7 @@ var FromLinuxResource = map[int]LimitType{ linux.RLIMIT_RSS: Rss, linux.RLIMIT_NPROC: ProcessCount, linux.RLIMIT_NOFILE: NumberOfFiles, - linux.RLIMIT_MEMLOCK: MemoryPagesLocked, + linux.RLIMIT_MEMLOCK: MemoryLocked, linux.RLIMIT_AS: AS, linux.RLIMIT_LOCKS: Locks, linux.RLIMIT_SIGPENDING: SignalsPending, diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 28e2bed9b..cf20b11e3 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -243,6 +243,40 @@ type MappingIdentity interface { Msync(ctx context.Context, mr MappableRange) error } +// MLockMode specifies the memory locking behavior of a memory mapping. +type MLockMode int + +// Note that the ordering of MLockModes is significant; see +// mm.MemoryManager.defMLockMode. +const ( + // MLockNone specifies that a mapping has no memory locking behavior. + // + // This must be the zero value for MLockMode. + MLockNone MLockMode = iota + + // MLockEager specifies that a mapping is memory-locked, as by mlock() or + // similar. Pages in the mapping should be made, and kept, resident in + // physical memory as soon as possible. + // + // As of this writing, MLockEager does not cause memory-locking to be + // requested from the host; it only affects the sentry's memory management + // behavior. + // + // MLockEager is analogous to Linux's VM_LOCKED. + MLockEager + + // MLockLazy specifies that a mapping is memory-locked, as by mlock() or + // similar. 
Pages in the mapping should be kept resident in physical memory + // once they have been made resident due to e.g. a page fault. + // + // As of this writing, MLockLazy does not cause memory-locking to be + // requested from the host; in fact, it has virtually no effect, except for + // interactions between mlocked pages and other syscalls. + // + // MLockLazy is analogous to Linux's VM_LOCKED | VM_LOCKONFAULT. + MLockLazy +) + // MMapOpts specifies a request to create a memory mapping. type MMapOpts struct { // Length is the length of the mapping. @@ -303,6 +337,9 @@ type MMapOpts struct { // mapping (see platform.AddressSpace.MapFile). Precommit bool + // MLockMode specifies the memory locking behavior of the mapping. + MLockMode MLockMode + // Hint is the name used for the mapping in /proc/[pid]/maps. If Hint is // empty, MappingIdentity.MappedName() will be used instead. // diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 744e73a39..5a9185e5d 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -106,6 +106,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/futex", "//pkg/sentry/kernel/shm", "//pkg/sentry/limits", diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index 7488f7c4a..e7aa24c69 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -149,7 +149,7 @@ func (mm *MemoryManager) Deactivate() { // for all addresses in ar should be precommitted. // // Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0. -// ar must be page-aligned. pseg.Range().Contains(ar.Start). +// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start). func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { // By default, map entire pmas at a time, under the assumption that there // is no cost to mapping more of a pma than necessary. 
@@ -173,7 +173,9 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre } } - for { + // Since this checks ar.End and not mapAR.End, we will never map a pma that + // is not required. + for pseg.Ok() && pseg.Start() < ar.End { pma := pseg.ValuePtr() pmaAR := pseg.Range() pmaMapAR := pmaAR.Intersect(mapAR) @@ -184,13 +186,9 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre if err := pma.file.MapInto(mm.as, pmaMapAR.Start, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { return err } - // Since this checks ar.End and not mapAR.End, we will never map a pma - // that is not required. - if ar.End <= pmaAR.End { - return nil - } pseg = pseg.NextSegment() } + return nil } // unmapASLocked removes all AddressSpace mappings for addresses in ar. diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 1613ce11d..a42e32b43 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -22,6 +22,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -58,13 +59,17 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() mm2 := &MemoryManager{ - p: mm.p, - haveASIO: mm.haveASIO, - layout: mm.layout, - privateRefs: mm.privateRefs, - users: 1, - usageAS: mm.usageAS, - brk: mm.brk, + p: mm.p, + haveASIO: mm.haveASIO, + layout: mm.layout, + privateRefs: mm.privateRefs, + users: 1, + brk: mm.brk, + usageAS: mm.usageAS, + // "The child does not inherit its parent's memory locks (mlock(2), + // mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is + // MLockNone, both of which are zero values. 
vma.mlockMode is reset + // when copied below. captureInvalidations: true, argv: mm.argv, envv: mm.envv, @@ -77,7 +82,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { // Copy vmas. dstvgap := mm2.vmas.FirstGap() for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { - vma := srcvseg.ValuePtr() + vma := srcvseg.Value() // makes a copy of the vma vmaAR := srcvseg.Range() // Inform the Mappable, if any, of the new mapping. if vma.mappable != nil { @@ -89,7 +94,8 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { if vma.id != nil { vma.id.IncRef() } - dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, *vma).NextGap() + vma.mlockMode = memmap.MLockNone + dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap() // We don't need to update mm2.usageAS since we copied it from mm // above. } diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index b1e39e898..c0632d232 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -95,11 +95,6 @@ type MemoryManager struct { // vmas is protected by mappingMu. vmas vmaSet - // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. - // - // usageAS is protected by mappingMu. - usageAS uint64 - // brk is the mm's brk, which is manipulated using the brk(2) system call. // The brk is initially set up by the loader which maps an executable // binary into the mm. @@ -107,6 +102,23 @@ type MemoryManager struct { // brk is protected by mappingMu. brk usermem.AddrRange + // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. + // + // usageAS is protected by mappingMu. + usageAS uint64 + + // lockedAS is the combined size in bytes of all vmas with vma.mlockMode != + // memmap.MLockNone. + // + // lockedAS is protected by mappingMu. + lockedAS uint64 + + // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or + // defMLockMode is greater. + // + // defMLockMode is protected by mappingMu. 
+ defMLockMode memmap.MLockMode + // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. activeMu ssync.DowngradableRWMutex `state:"nosave"` @@ -252,6 +264,8 @@ type vma struct { // metag, none of which we currently support. growsDown bool `state:"manual"` + mlockMode memmap.MLockMode + // If id is not nil, it controls the lifecycle of mappable and provides vma // metadata shown in /proc/[pid]/maps, and the vma holds a reference. id memmap.MappingIdentity diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index daaae4da1..383703ec3 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -20,6 +20,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" @@ -128,16 +129,24 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // Get the new vma. mm.mappingMu.Lock() + if opts.MLockMode < mm.defMLockMode { + opts.MLockMode = mm.defMLockMode + } vseg, ar, err := mm.createVMALocked(ctx, opts) if err != nil { mm.mappingMu.Unlock() return 0, err } + // TODO: In Linux, VM_LOCKONFAULT (which may be set on the new + // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears + // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in + // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() => + // populate_vma_page_range(). Confirm this behavior. switch { - case opts.Precommit: + case opts.Precommit || opts.MLockMode == memmap.MLockEager: // Get pmas and map with precommit as requested. 
- mm.populateAndUnlock(ctx, vseg, ar, true) + mm.populateVMAAndUnlock(ctx, vseg, ar, true) case opts.Mappable == nil && length <= privateAllocUnit: // NOTE: Get pmas and map eagerly in the hope @@ -146,7 +155,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // memmap.Mappable.Translate is unknown; and only for small mappings, // to avoid needing to allocate large amounts of memory that we may // subsequently need to checkpoint. - mm.populateAndUnlock(ctx, vseg, ar, false) + mm.populateVMAAndUnlock(ctx, vseg, ar, false) default: mm.mappingMu.Unlock() @@ -155,31 +164,29 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme return ar.Start, nil } -// Preconditions: mm.mappingMu must be locked for writing. +// populateVMA obtains pmas for addresses in ar in the given vma, and maps them +// into mm.as if it is active. // -// Postconditions: mm.mappingMu will be unlocked. -func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { +// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar). +func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { if !vseg.ValuePtr().effectivePerms.Any() { // Linux doesn't populate inaccessible pages. See // mm/gup.c:populate_vma_page_range. - mm.mappingMu.Unlock() return } mm.activeMu.Lock() + // Can't defer mm.activeMu.Unlock(); see below. - // Even if we get a new pma, we can't actually map it if we don't have an + // Even if we get new pmas, we can't actually map them if we don't have an // AddressSpace. if mm.as == nil { mm.activeMu.Unlock() - mm.mappingMu.Unlock() return } // Ensure that we have usable pmas. - mm.mappingMu.DowngradeLock() pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) - mm.mappingMu.RUnlock() if err != nil { // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from // mm/gup.c:mm_populate(). 
If it matters, we'll get it again when @@ -197,6 +204,45 @@ func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator mm.activeMu.RUnlock() } +// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally +// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is +// preferable to populateVMA since it unlocks mm.mappingMu before performing +// expensive operations that don't require it to be locked. +// +// Preconditions: mm.mappingMu must be locked for writing. +// vseg.Range().IsSupersetOf(ar). +// +// Postconditions: mm.mappingMu will be unlocked. +func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { + // See populateVMA above for commentary. + if !vseg.ValuePtr().effectivePerms.Any() { + mm.mappingMu.Unlock() + return + } + + mm.activeMu.Lock() + + if mm.as == nil { + mm.activeMu.Unlock() + mm.mappingMu.Unlock() + return + } + + // mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it + // isn't needed at all for mapASLocked. + mm.mappingMu.DowngradeLock() + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) + mm.mappingMu.RUnlock() + if err != nil { + mm.activeMu.Unlock() + return + } + + mm.activeMu.DowngradeLock() + mm.mapASLocked(pseg, ar, precommit) + mm.activeMu.RUnlock() +} + // MapStack allocates the initial process stack. func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) { // maxStackSize is the maximum supported process stack size in bytes. @@ -236,6 +282,7 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error MaxPerms: usermem.AnyAccess, Private: true, GrowsDown: true, + MLockMode: mm.defMLockMode, Hint: "[stack]", }) return ar, err @@ -334,6 +381,19 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi // occupies at least part of the destination. 
Thus the NoMove case always // fails and the MayMove case always falls back to copying. + if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall, + // mremap in Linux does not check mm/mlock.c:can_do_mlock() and + // therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and + // !CAP_IPC_LOCK. + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit { + return 0, syserror.EAGAIN + } + } + } + if opts.Move != MRemapMustMove { // Handle no-ops and in-place shrinking. These cases don't care if // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all @@ -360,7 +420,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.mappable != nil { newOffset = vseg.mappableRange().End } - _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: newSize - oldSize, MappingIdentity: vma.id, Mappable: vma.mappable, @@ -371,9 +431,13 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi MaxPerms: vma.maxPerms, Private: vma.private, GrowsDown: vma.growsDown, + MLockMode: vma.mlockMode, Hint: vma.hint, }) if err == nil { + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, ar, true) + } return oldAddr, nil } // In-place growth failed. 
In the MRemapMayMove case, fall through to @@ -462,8 +526,14 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.id != nil { vma.id.IncRef() } - mm.vmas.Add(newAR, vma) + vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) mm.usageAS += uint64(newAR.Length()) + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS += uint64(newAR.Length()) + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, newAR, true) + } + } return newAR.Start, nil } @@ -485,8 +555,11 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vseg = mm.vmas.Isolate(vseg, oldAR) vma := vseg.Value() mm.vmas.Remove(vseg) - mm.vmas.Add(newAR, vma) + vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + } // Move pmas. This is technically optional for non-private pmas, which // could just go through memmap.Mappable.Translate again, but it's required @@ -501,6 +574,10 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.isMappableAsWritable()) } + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, newAR, true) + } + return newAR.Start, nil } @@ -611,9 +688,10 @@ func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) { // error on failure. func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) { mm.mappingMu.Lock() - defer mm.mappingMu.Unlock() + // Can't defer mm.mappingMu.Unlock(); see below. if addr < mm.brk.Start { + mm.mappingMu.Unlock() return mm.brk.End, syserror.EINVAL } @@ -623,21 +701,24 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad // heap + data + bss. 
The segment sizes need to be plumbed from the // loader package to fully enforce RLIMIT_DATA. if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { + mm.mappingMu.Unlock() return mm.brk.End, syserror.ENOMEM } oldbrkpg, _ := mm.brk.End.RoundUp() newbrkpg, ok := addr.RoundUp() if !ok { + mm.mappingMu.Unlock() return mm.brk.End, syserror.EFAULT } switch { case newbrkpg < oldbrkpg: mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg}) + mm.mappingMu.Unlock() case oldbrkpg < newbrkpg: - _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: uint64(newbrkpg - oldbrkpg), Addr: oldbrkpg, Fixed: true, @@ -646,17 +727,221 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad Perms: usermem.ReadWrite, MaxPerms: usermem.AnyAccess, Private: true, - Hint: "[heap]", + // Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes + // mm->def_flags. + MLockMode: mm.defMLockMode, + Hint: "[heap]", }) if err != nil { + mm.mappingMu.Unlock() return mm.brk.End, err } + if mm.defMLockMode == memmap.MLockEager { + mm.populateVMAAndUnlock(ctx, vseg, ar, true) + } else { + mm.mappingMu.Unlock() + } + + default: + // Nothing to do. + mm.mappingMu.Unlock() } mm.brk.End = addr return addr, nil } +// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(), +// depending on mode. +func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error { + // Linux allows this to overflow. + la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp() + ar, ok := addr.RoundDown().ToRange(uint64(la)) + if !ok { + return syserror.EINVAL + } + + mm.mappingMu.Lock() + // Can't defer mm.mappingMu.Unlock(); see below. + + if mode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. 
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + mm.mappingMu.Unlock() + return syserror.EPERM + } + if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + } + } + + // Check this after RLIMIT_MEMLOCK for consistency with Linux. + if ar.Length() == 0 { + mm.mappingMu.Unlock() + return nil + } + + // Apply the new mlock mode to vmas. + var unmapped bool + vseg := mm.vmas.FindSegment(ar.Start) + for { + if !vseg.Ok() { + unmapped = true + break + } + vseg = mm.vmas.Isolate(vseg, ar) + vma := vseg.ValuePtr() + prevMode := vma.mlockMode + vma.mlockMode = mode + if mode != memmap.MLockNone && prevMode == memmap.MLockNone { + mm.lockedAS += uint64(vseg.Range().Length()) + } else if mode == memmap.MLockNone && prevMode != memmap.MLockNone { + mm.lockedAS -= uint64(vseg.Range().Length()) + } + if ar.End <= vseg.End() { + break + } + vseg, _ = vseg.NextNonEmpty() + } + mm.vmas.MergeRange(ar) + mm.vmas.MergeAdjacent(ar) + if unmapped { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + + if mode == memmap.MLockEager { + // Ensure that we have usable pmas. Since we didn't return ENOMEM + // above, ar must be fully covered by vmas, so we can just use + // NextSegment below. + mm.activeMu.Lock() + mm.mappingMu.DowngradeLock() + for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + if !vseg.ValuePtr().effectivePerms.Any() { + // Linux: mm/gup.c:__get_user_pages() returns EFAULT in this + // case, which is converted to ENOMEM by mlock. 
+ mm.activeMu.Unlock() + mm.mappingMu.RUnlock() + return syserror.ENOMEM + } + _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), pmaOpts{}) + if err != nil { + mm.activeMu.Unlock() + mm.mappingMu.RUnlock() + // Linux: mm/mlock.c:__mlock_posix_error_return() + if err == syserror.EFAULT { + return syserror.ENOMEM + } + if err == syserror.ENOMEM { + return syserror.EAGAIN + } + return err + } + } + + // Map pmas into the active AddressSpace, if we have one. + mm.mappingMu.RUnlock() + if mm.as != nil { + mm.activeMu.DowngradeLock() + err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */) + mm.activeMu.RUnlock() + if err != nil { + return err + } + } else { + mm.activeMu.Unlock() + } + } else { + mm.mappingMu.Unlock() + } + + return nil +} + +// MLockAllOpts holds options to MLockAll. +type MLockAllOpts struct { + // If Current is true, change the memory-locking behavior of all mappings + // to Mode. If Future is true, upgrade the memory-locking behavior of all + // future mappings to Mode. At least one of Current or Future must be true. + Current bool + Future bool + Mode memmap.MLockMode +} + +// MLockAll implements the semantics of Linux's mlockall()/munlockall(), +// depending on opts. +func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error { + if !opts.Current && !opts.Future { + return syserror.EINVAL + } + + mm.mappingMu.Lock() + // Can't defer mm.mappingMu.Unlock(); see below. + + if opts.Current { + if opts.Mode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. 
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + mm.mappingMu.Unlock() + return syserror.EPERM + } + if uint64(mm.vmas.Span()) > mlockLimit { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + } + } + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + vma := vseg.ValuePtr() + prevMode := vma.mlockMode + vma.mlockMode = opts.Mode + if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone { + mm.lockedAS += uint64(vseg.Range().Length()) + } else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone { + mm.lockedAS -= uint64(vseg.Range().Length()) + } + } + } + + if opts.Future { + mm.defMLockMode = opts.Mode + } + + if opts.Current && opts.Mode == memmap.MLockEager { + // Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate() + // ignores the return value of __mm_populate(), so all errors below are + // ignored. + // + // Try to get usable pmas. + mm.activeMu.Lock() + mm.mappingMu.DowngradeLock() + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + if vseg.ValuePtr().effectivePerms.Any() { + mm.getPMAsLocked(ctx, vseg, vseg.Range(), pmaOpts{}) + } + } + + // Map all pmas into the active AddressSpace, if we have one. + mm.mappingMu.RUnlock() + if mm.as != nil { + mm.activeMu.DowngradeLock() + mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */) + mm.activeMu.RUnlock() + } else { + mm.activeMu.Unlock() + } + } else { + mm.mappingMu.Unlock() + } + return nil +} + // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). 
func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { ar, ok := addr.ToRange(length) @@ -680,46 +965,49 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { // ensures that Decommit immediately reduces host memory usage. var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) - vseg := mm.vmas.LowerBoundSegment(ar.Start) mem := mm.p.Memory() - for pseg.Ok() && pseg.Start() < ar.End { - pma := pseg.ValuePtr() - if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { - psegAR := pseg.Range().Intersect(ar) - vseg = vseg.seekNextLowerBound(psegAR.Start) - if checkInvariants { - if !vseg.Ok() { - panic(fmt.Sprintf("no vma after %#x", psegAR.Start)) - } - if psegAR.Start < vseg.Start() { - panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start())) - } + for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + vma := vseg.ValuePtr() + if vma.mlockMode != memmap.MLockNone { + return syserror.EINVAL + } + vsegAR := vseg.Range().Intersect(ar) + // pseg should already correspond to either this vma or a later one, + // since there can't be a pma without a corresponding vma. + if checkInvariants { + if pseg.Ok() && pseg.End() <= vsegAR.Start { + panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR)) } - if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil { - if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { - pseg = pseg.NextSegment() - continue + } + for pseg.Ok() && pseg.Start() < vsegAR.End { + pma := pseg.ValuePtr() + if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { + psegAR := pseg.Range().Intersect(ar) + if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil { + if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { + pseg = pseg.NextSegment() + continue + } + // If an error occurs, fall through to the general + // invalidation case below. 
} - // If an error occurs, fall through to the general - // invalidation case below. } + pseg = mm.pmas.Isolate(pseg, vsegAR) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + pma.file.DecRef(pseg.fileRange()) + mm.removeRSSLocked(pseg.Range()) + pseg = mm.pmas.Remove(pseg).NextSegment() } - pseg = mm.pmas.Isolate(pseg, ar) - pma = pseg.ValuePtr() - if !didUnmapAS { - // Unmap all of ar, not just pseg.Range(), to minimize host - // syscalls. AddressSpace mappings must be removed before - // mm.decPrivateRef(). - mm.unmapASLocked(ar) - didUnmapAS = true - } - if pma.private { - mm.decPrivateRef(pseg.fileRange()) - } - pma.file.DecRef(pseg.fileRange()) - mm.removeRSSLocked(pseg.Range()) - - pseg = mm.pmas.Remove(pseg).NextSegment() } // "If there are some parts of the specified address space that are not @@ -732,9 +1020,28 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { return nil } -// Sync implements the semantics of Linux's msync(MS_SYNC). -func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error { - ar, ok := addr.ToRange(length) +// MSyncOpts holds options to MSync. +type MSyncOpts struct { + // Sync has the semantics of MS_SYNC. + Sync bool + + // Invalidate has the semantics of MS_INVALIDATE. + Invalidate bool +} + +// MSync implements the semantics of Linux's msync(). 
+func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error { + if addr != addr.RoundDown() { + return syserror.EINVAL + } + if length == 0 { + return nil + } + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return syserror.ENOMEM + } + ar, ok := addr.ToRange(uint64(la)) if !ok { return syserror.ENOMEM } @@ -759,10 +1066,14 @@ func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uin } lastEnd = vseg.End() vma := vseg.ValuePtr() + if opts.Invalidate && vma.mlockMode != memmap.MLockNone { + mm.mappingMu.RUnlock() + return syserror.EBUSY + } // It's only possible to have dirtied the Mappable through a shared // mapping. Don't check if the mapping is writable, because mprotect // may have changed this, and also because Linux doesn't. - if id := vma.id; id != nil && vma.mappable != nil && !vma.private { + if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private { // We can't call memmap.MappingIdentity.Msync while holding // mm.mappingMu since it may take fs locks that precede it in the // lock order. diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 5c2c802f6..28ba9f2f5 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -17,8 +17,10 @@ package mm import ( "fmt" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -53,6 +55,23 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM } + if opts.MLockMode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. 
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM + } + newLockedAS := mm.lockedAS + opts.Length + if opts.Unmap { + newLockedAS -= mm.mlockedBytesRangeLocked(ar) + } + if newLockedAS > mlockLimit { + return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN + } + } + } + // Remove overwritten mappings. This ordering is consistent with Linux: // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), // file->f_op->mmap(). @@ -85,10 +104,14 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp maxPerms: opts.MaxPerms, private: opts.Private, growsDown: opts.GrowsDown, + mlockMode: opts.MLockMode, id: opts.MappingIdentity, hint: opts.Hint, }) mm.usageAS += opts.Length + if opts.MLockMode != memmap.MLockNone { + mm.lockedAS += opts.Length + } return vseg, ar, nil } @@ -201,6 +224,17 @@ func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bo return 0, syserror.ENOMEM } +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 { + var total uint64 + for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + if vseg.ValuePtr().mlockMode != memmap.MLockNone { + total += uint64(vseg.Range().Intersect(ar).Length()) + } + } + return total +} + // getVMAsLocked ensures that vmas exist for all addresses in ar, and support // access of type (at, ignorePermissions). 
It returns: // @@ -338,6 +372,9 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vma.id.DecRef() } mm.usageAS -= uint64(vmaAR.Length()) + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS -= uint64(vmaAR.Length()) + } vgap = mm.vmas.Remove(vseg) vseg = vgap.NextSegment() } @@ -368,6 +405,7 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa vma1.maxPerms != vma2.maxPerms || vma1.private != vma2.private || vma1.growsDown != vma2.growsDown || + vma1.mlockMode != vma2.mlockMode || vma1.id != vma2.id || vma1.hint != vma2.hint { return vma{}, false diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index 2aab948da..cc5ebb955 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -196,11 +196,11 @@ var AMD64 = &kernel.SyscallTable{ 145: SchedGetscheduler, 146: SchedGetPriorityMax, 147: SchedGetPriorityMin, - 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, - 149: syscalls.Error(nil), // Mlock, TODO - 150: syscalls.Error(nil), // Munlock, TODO - 151: syscalls.Error(nil), // Mlockall, TODO - 152: syscalls.Error(nil), // Munlockall, TODO + 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, + 149: Mlock, + 150: Munlock, + 151: Mlockall, + 152: Munlockall, 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup, 154: syscalls.Error(syscall.EPERM), // ModifyLdt, 155: syscalls.Error(syscall.EPERM), // PivotRoot, @@ -373,8 +373,9 @@ var AMD64 = &kernel.SyscallTable{ // 322: Execveat, TODO // 323: Userfaultfd, TODO // 324: Membarrier, TODO - // Syscalls after 325 are backports from 4.6. - 325: syscalls.Error(nil), // Mlock2, TODO + 325: Mlock2, + // Syscalls after 325 are "backports" from versions of Linux after 4.4. 
+ // 326: CopyFileRange, 327: Preadv2, // 328: Pwritev2, // Pwritev2, TODO }, diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 145f7846c..8732861e0 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -69,6 +69,9 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC GrowsDown: linux.MAP_GROWSDOWN&flags != 0, Precommit: linux.MAP_POPULATE&flags != 0, } + if linux.MAP_LOCKED&flags != 0 { + opts.MLockMode = memmap.MLockEager + } defer func() { if opts.MappingIdentity != nil { opts.MappingIdentity.DecRef() @@ -384,16 +387,6 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall length := args[1].SizeT() flags := args[2].Int() - if addr != addr.RoundDown() { - return 0, nil, syserror.EINVAL - } - if length == 0 { - return 0, nil, nil - } - la, ok := usermem.Addr(length).RoundUp() - if !ok { - return 0, nil, syserror.ENOMEM - } // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, // and may additionally include the MS_INVALIDATE bit. ... However, Linux // permits a call to msync() that specifies neither of these flags, with @@ -406,39 +399,72 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if sync && flags&linux.MS_ASYNC != 0 { return 0, nil, syserror.EINVAL } + err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{ + Sync: sync, + Invalidate: flags&linux.MS_INVALIDATE != 0, + }) + // MSync calls fsync, the same interrupt conversion rules apply, see + // mm/msync.c, fsync POSIX.1-2008. + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) +} + +// Mlock implements linux syscall mlock(2). 
+func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager) +} - // MS_INVALIDATE "asks to invalidate other mappings of the same file (so - // that they can be updated with the fresh values just written)". This is a - // no-op given that shared memory exists. However, MS_INVALIDATE can also - // be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags, - // and a memory lock exists for the specified address range." Given that - // mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since - // some user program could be using it for synchronization. - if flags&linux.MS_INVALIDATE != 0 { +// Mlock2 implements linux syscall mlock2(2). +func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + flags := args[2].Int() + + if flags&^(linux.MLOCK_ONFAULT) != 0 { return 0, nil, syserror.EINVAL } - // MS_SYNC "requests an update and waits for it to complete." - if sync { - err := t.MemoryManager().Sync(t, addr, uint64(la)) - // Sync calls fsync, the same interrupt conversion rules apply, see - // mm/msync.c, fsync POSIX.1-2008. - return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) - } - // MS_ASYNC "specifies that an update be scheduled, but the call returns - // immediately". As long as dirty pages are tracked and eventually written - // back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC - // is in fact a no-op, since the kernel properly tracks dirty pages and - // flushes them to storage as necessary.") - // - // However: "ENOMEM: The indicated memory (or part of it) was not mapped." - // This applies even for MS_ASYNC. 
- ar, ok := addr.ToRange(uint64(la)) - if !ok { - return 0, nil, syserror.ENOMEM + + mode := memmap.MLockEager + if flags&linux.MLOCK_ONFAULT != 0 { + mode = memmap.MLockLazy } - mapped := t.MemoryManager().VirtualMemorySizeRange(ar) - if mapped != uint64(la) { - return 0, nil, syserror.ENOMEM + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode) +} + +// Munlock implements linux syscall munlock(2). +func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].SizeT() + + return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone) +} + +// Mlockall implements linux syscall mlockall(2). +func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + + if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 { + return 0, nil, syserror.EINVAL } - return 0, nil, nil + + mode := memmap.MLockEager + if flags&linux.MCL_ONFAULT != 0 { + mode = memmap.MLockLazy + } + return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ + Current: flags&linux.MCL_CURRENT != 0, + Future: flags&linux.MCL_FUTURE != 0, + Mode: mode, + }) +} + +// Munlockall implements linux syscall munlockall(2). 
+func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ + Current: true, + Future: true, + Mode: memmap.MLockNone, + }) } diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index 2f16e1791..b0b216045 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -90,6 +90,7 @@ var setableLimits = map[limits.LimitType]struct{}{ limits.CPU: {}, limits.Data: {}, limits.FileSize: {}, + limits.MemoryLocked: {}, limits.Stack: {}, // These are not enforced, but we include them here to avoid returning // EPERM, since some apps expect them to succeed. |