diff options
Diffstat (limited to 'pkg')
-rw-r--r-- | pkg/abi/linux/limits.go | 2 | ||||
-rw-r--r-- | pkg/abi/linux/mm.go | 12 | ||||
-rw-r--r-- | pkg/sentry/limits/limits.go | 2 | ||||
-rw-r--r-- | pkg/sentry/limits/linux.go | 2 | ||||
-rw-r--r-- | pkg/sentry/memmap/memmap.go | 37 | ||||
-rw-r--r-- | pkg/sentry/mm/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/mm/address_space.go | 12 | ||||
-rw-r--r-- | pkg/sentry/mm/lifecycle.go | 24 | ||||
-rw-r--r-- | pkg/sentry/mm/mm.go | 24 | ||||
-rw-r--r-- | pkg/sentry/mm/syscalls.go | 423 | ||||
-rw-r--r-- | pkg/sentry/mm/vma.go | 38 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/linux64.go | 15 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_mmap.go | 106 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_rlimit.go | 1 |
14 files changed, 127 insertions, 572 deletions
diff --git a/pkg/abi/linux/limits.go b/pkg/abi/linux/limits.go index e0aa5b31d..b2e51b9bd 100644 --- a/pkg/abi/linux/limits.go +++ b/pkg/abi/linux/limits.go @@ -60,7 +60,7 @@ const ( DefaultNofileHardLimit = 4096 // DefaultMemlockLimit is called MLOCK_LIMIT in Linux. - DefaultMemlockLimit = 64 * 1024 + DefaultMemlockLimit = 64 * 1094 // DefaultMsgqueueLimit is called MQ_BYTES_MAX in Linux. DefaultMsgqueueLimit = 819200 diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go index eda8d9788..3fcdf8235 100644 --- a/pkg/abi/linux/mm.go +++ b/pkg/abi/linux/mm.go @@ -49,18 +49,6 @@ const ( MREMAP_FIXED = 1 << 1 ) -// Flags for mlock2(2). -const ( - MLOCK_ONFAULT = 0x01 -) - -// Flags for mlockall(2). -const ( - MCL_CURRENT = 1 - MCL_FUTURE = 2 - MCL_ONFAULT = 4 -) - // Advice for madvise(2). const ( MADV_NORMAL = 0 diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index eeca01876..ba0b7d4fd 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -33,7 +33,7 @@ const ( Rss ProcessCount NumberOfFiles - MemoryLocked + MemoryPagesLocked AS Locks SignalsPending diff --git a/pkg/sentry/limits/linux.go b/pkg/sentry/limits/linux.go index 295f9c398..511db6733 100644 --- a/pkg/sentry/limits/linux.go +++ b/pkg/sentry/limits/linux.go @@ -30,7 +30,7 @@ var FromLinuxResource = map[int]LimitType{ linux.RLIMIT_RSS: Rss, linux.RLIMIT_NPROC: ProcessCount, linux.RLIMIT_NOFILE: NumberOfFiles, - linux.RLIMIT_MEMLOCK: MemoryLocked, + linux.RLIMIT_MEMLOCK: MemoryPagesLocked, linux.RLIMIT_AS: AS, linux.RLIMIT_LOCKS: Locks, linux.RLIMIT_SIGPENDING: SignalsPending, diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index cf20b11e3..28e2bed9b 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -243,40 +243,6 @@ type MappingIdentity interface { Msync(ctx context.Context, mr MappableRange) error } -// MLockMode specifies the memory locking behavior of a memory mapping. -type MLockMode int - -// Note that the ordering of MLockModes is significant; see -// mm.MemoryManager.defMLockMode. -const ( - // MLockNone specifies that a mapping has no memory locking behavior. - // - // This must be the zero value for MLockMode. - MLockNone MLockMode = iota - - // MLockEager specifies that a mapping is memory-locked, as by mlock() or - // similar. Pages in the mapping should be made, and kept, resident in - // physical memory as soon as possible. - // - // As of this writing, MLockEager does not cause memory-locking to be - // requested from the host; it only affects the sentry's memory management - // behavior. - // - // MLockEager is analogous to Linux's VM_LOCKED. - MLockEager - - // MLockLazy specifies that a mapping is memory-locked, as by mlock() or - // similar. Pages in the mapping should be kept resident in physical memory - // once they have been made resident due to e.g. a page fault. - // - // As of this writing, MLockLazy does not cause memory-locking to be - // requested from the host; in fact, it has virtually no effect, except for - // interactions between mlocked pages and other syscalls. - // - // MLockLazy is analogous to Linux's VM_LOCKED | VM_LOCKONFAULT. - MLockLazy -) - // MMapOpts specifies a request to create a memory mapping. type MMapOpts struct { // Length is the length of the mapping. @@ -337,9 +303,6 @@ type MMapOpts struct { // mapping (see platform.AddressSpace.MapFile). Precommit bool - // MLockMode specifies the memory locking behavior of the mapping. - MLockMode MLockMode - // Hint is the name used for the mapping in /proc/[pid]/maps. If Hint is // empty, MappingIdentity.MappedName() will be used instead. // diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 5a9185e5d..744e73a39 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -106,7 +106,6 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", - "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/futex", "//pkg/sentry/kernel/shm", "//pkg/sentry/limits", diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index e7aa24c69..7488f7c4a 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -149,7 +149,7 @@ func (mm *MemoryManager) Deactivate() { // for all addresses in ar should be precommitted. // // Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0. -// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start). +// ar must be page-aligned. pseg.Range().Contains(ar.Start). func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { // By default, map entire pmas at a time, under the assumption that there // is no cost to mapping more of a pma than necessary. @@ -173,9 +173,7 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre } } - // Since this checks ar.End and not mapAR.End, we will never map a pma that - // is not required. - for pseg.Ok() && pseg.Start() < ar.End { + for { pma := pseg.ValuePtr() pmaAR := pseg.Range() pmaMapAR := pmaAR.Intersect(mapAR) @@ -186,9 +184,13 @@ func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, pre if err := pma.file.MapInto(mm.as, pmaMapAR.Start, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { return err } + // Since this checks ar.End and not mapAR.End, we will never map a pma + // that is not required. + if ar.End <= pmaAR.End { + return nil + } pseg = pseg.NextSegment() } - return nil } // unmapASLocked removes all AddressSpace mappings for addresses in ar. diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index a42e32b43..1613ce11d 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -22,7 +22,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" - "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) @@ -59,17 +58,13 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() mm2 := &MemoryManager{ - p: mm.p, - haveASIO: mm.haveASIO, - layout: mm.layout, - privateRefs: mm.privateRefs, - users: 1, - brk: mm.brk, - usageAS: mm.usageAS, - // "The child does not inherit its parent's memory locks (mlock(2), - // mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is - // MLockNone, both of which are zero values. vma.mlockMode is reset - // when copied below. + p: mm.p, + haveASIO: mm.haveASIO, + layout: mm.layout, + privateRefs: mm.privateRefs, + users: 1, + usageAS: mm.usageAS, + brk: mm.brk, captureInvalidations: true, argv: mm.argv, envv: mm.envv, @@ -82,7 +77,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { // Copy vmas. dstvgap := mm2.vmas.FirstGap() for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { - vma := srcvseg.Value() // makes a copy of the vma + vma := srcvseg.ValuePtr() vmaAR := srcvseg.Range() // Inform the Mappable, if any, of the new mapping. if vma.mappable != nil { @@ -94,8 +89,7 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { if vma.id != nil { vma.id.IncRef() } - vma.mlockMode = memmap.MLockNone - dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap() + dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, *vma).NextGap() // We don't need to update mm2.usageAS since we copied it from mm // above. } diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index c0632d232..b1e39e898 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -95,29 +95,17 @@ type MemoryManager struct { // vmas is protected by mappingMu. vmas vmaSet - // brk is the mm's brk, which is manipulated using the brk(2) system call. - // The brk is initially set up by the loader which maps an executable - // binary into the mm. - // - // brk is protected by mappingMu. - brk usermem.AddrRange - // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. // // usageAS is protected by mappingMu. usageAS uint64 - // lockedAS is the combined size in bytes of all vmas with vma.mlockMode != - // memmap.MLockNone. - // - // lockedAS is protected by mappingMu. - lockedAS uint64 - - // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or - // defMLockMode is greater. + // brk is the mm's brk, which is manipulated using the brk(2) system call. + // The brk is initially set up by the loader which maps an executable + // binary into the mm. // - // defMLockMode is protected by mappingMu. - defMLockMode memmap.MLockMode + // brk is protected by mappingMu. + brk usermem.AddrRange // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. @@ -264,8 +252,6 @@ type vma struct { // metag, none of which we currently support. growsDown bool `state:"manual"` - mlockMode memmap.MLockMode - // If id is not nil, it controls the lifecycle of mappable and provides vma // metadata shown in /proc/[pid]/maps, and the vma holds a reference. id memmap.MappingIdentity diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 383703ec3..daaae4da1 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -20,7 +20,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" @@ -129,24 +128,16 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // Get the new vma. mm.mappingMu.Lock() - if opts.MLockMode < mm.defMLockMode { - opts.MLockMode = mm.defMLockMode - } vseg, ar, err := mm.createVMALocked(ctx, opts) if err != nil { mm.mappingMu.Unlock() return 0, err } - // TODO: In Linux, VM_LOCKONFAULT (which may be set on the new - // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears - // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in - // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() => - // populate_vma_page_range(). Confirm this behavior. switch { - case opts.Precommit || opts.MLockMode == memmap.MLockEager: + case opts.Precommit: // Get pmas and map with precommit as requested. - mm.populateVMAAndUnlock(ctx, vseg, ar, true) + mm.populateAndUnlock(ctx, vseg, ar, true) case opts.Mappable == nil && length <= privateAllocUnit: // NOTE: Get pmas and map eagerly in the hope @@ -155,7 +146,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme // memmap.Mappable.Translate is unknown; and only for small mappings, // to avoid needing to allocate large amounts of memory that we may // subsequently need to checkpoint. - mm.populateVMAAndUnlock(ctx, vseg, ar, false) + mm.populateAndUnlock(ctx, vseg, ar, false) default: mm.mappingMu.Unlock() @@ -164,29 +155,31 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme return ar.Start, nil } -// populateVMA obtains pmas for addresses in ar in the given vma, and maps them -// into mm.as if it is active. +// Preconditions: mm.mappingMu must be locked for writing. // -// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar). -func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { +// Postconditions: mm.mappingMu will be unlocked. +func (mm *MemoryManager) populateAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { if !vseg.ValuePtr().effectivePerms.Any() { // Linux doesn't populate inaccessible pages. See // mm/gup.c:populate_vma_page_range. + mm.mappingMu.Unlock() return } mm.activeMu.Lock() - // Can't defer mm.activeMu.Unlock(); see below. - // Even if we get new pmas, we can't actually map them if we don't have an + // Even if we get a new pma, we can't actually map it if we don't have an // AddressSpace. if mm.as == nil { mm.activeMu.Unlock() + mm.mappingMu.Unlock() return } // Ensure that we have usable pmas. + mm.mappingMu.DowngradeLock() pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) + mm.mappingMu.RUnlock() if err != nil { // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from // mm/gup.c:mm_populate(). If it matters, we'll get it again when @@ -204,45 +197,6 @@ func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar u mm.activeMu.RUnlock() } -// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally -// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is -// preferable to populateVMA since it unlocks mm.mappingMu before performing -// expensive operations that don't require it to be locked. -// -// Preconditions: mm.mappingMu must be locked for writing. -// vseg.Range().IsSupersetOf(ar). -// -// Postconditions: mm.mappingMu will be unlocked. -func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { - // See populateVMA above for commentary. - if !vseg.ValuePtr().effectivePerms.Any() { - mm.mappingMu.Unlock() - return - } - - mm.activeMu.Lock() - - if mm.as == nil { - mm.activeMu.Unlock() - mm.mappingMu.Unlock() - return - } - - // mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it - // isn't needed at all for mapASLocked. - mm.mappingMu.DowngradeLock() - pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, pmaOpts{}) - mm.mappingMu.RUnlock() - if err != nil { - mm.activeMu.Unlock() - return - } - - mm.activeMu.DowngradeLock() - mm.mapASLocked(pseg, ar, precommit) - mm.activeMu.RUnlock() -} - // MapStack allocates the initial process stack. func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) { // maxStackSize is the maximum supported process stack size in bytes. @@ -282,7 +236,6 @@ func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error MaxPerms: usermem.AnyAccess, Private: true, GrowsDown: true, - MLockMode: mm.defMLockMode, Hint: "[stack]", }) return ar, err @@ -381,19 +334,6 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi // occupies at least part of the destination. Thus the NoMove case always // fails and the MayMove case always falls back to copying. - if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall, - // mremap in Linux does not check mm/mlock.c:can_do_mlock() and - // therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and - // !CAP_IPC_LOCK. - mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit { - return 0, syserror.EAGAIN - } - } - } - if opts.Move != MRemapMustMove { // Handle no-ops and in-place shrinking. These cases don't care if // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all @@ -420,7 +360,7 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.mappable != nil { newOffset = vseg.mappableRange().End } - vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: newSize - oldSize, MappingIdentity: vma.id, Mappable: vma.mappable, @@ -431,13 +371,9 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi MaxPerms: vma.maxPerms, Private: vma.private, GrowsDown: vma.growsDown, - MLockMode: vma.mlockMode, Hint: vma.hint, }) if err == nil { - if vma.mlockMode == memmap.MLockEager { - mm.populateVMA(ctx, vseg, ar, true) - } return oldAddr, nil } // In-place growth failed. In the MRemapMayMove case, fall through to @@ -526,14 +462,8 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi if vma.id != nil { vma.id.IncRef() } - vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) + mm.vmas.Add(newAR, vma) mm.usageAS += uint64(newAR.Length()) - if vma.mlockMode != memmap.MLockNone { - mm.lockedAS += uint64(newAR.Length()) - if vma.mlockMode == memmap.MLockEager { - mm.populateVMA(ctx, vseg, newAR, true) - } - } return newAR.Start, nil } @@ -555,11 +485,8 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vseg = mm.vmas.Isolate(vseg, oldAR) vma := vseg.Value() mm.vmas.Remove(vseg) - vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) + mm.vmas.Add(newAR, vma) mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) - if vma.mlockMode != memmap.MLockNone { - mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length()) - } // Move pmas. This is technically optional for non-private pmas, which // could just go through memmap.Mappable.Translate again, but it's required @@ -574,10 +501,6 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.isMappableAsWritable()) } - if vma.mlockMode == memmap.MLockEager { - mm.populateVMA(ctx, vseg, newAR, true) - } - return newAR.Start, nil } @@ -688,10 +611,9 @@ func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) { // error on failure. func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) { mm.mappingMu.Lock() - // Can't defer mm.mappingMu.Unlock(); see below. + defer mm.mappingMu.Unlock() if addr < mm.brk.Start { - mm.mappingMu.Unlock() return mm.brk.End, syserror.EINVAL } @@ -701,24 +623,21 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad // heap + data + bss. The segment sizes need to be plumbed from the // loader package to fully enforce RLIMIT_DATA. if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { - mm.mappingMu.Unlock() return mm.brk.End, syserror.ENOMEM } oldbrkpg, _ := mm.brk.End.RoundUp() newbrkpg, ok := addr.RoundUp() if !ok { - mm.mappingMu.Unlock() return mm.brk.End, syserror.EFAULT } switch { case newbrkpg < oldbrkpg: mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg}) - mm.mappingMu.Unlock() case oldbrkpg < newbrkpg: - vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + _, _, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: uint64(newbrkpg - oldbrkpg), Addr: oldbrkpg, Fixed: true, @@ -727,221 +646,17 @@ func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Ad Perms: usermem.ReadWrite, MaxPerms: usermem.AnyAccess, Private: true, - // Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes - // mm->def_flags. - MLockMode: mm.defMLockMode, - Hint: "[heap]", + Hint: "[heap]", }) if err != nil { - mm.mappingMu.Unlock() return mm.brk.End, err } - if mm.defMLockMode == memmap.MLockEager { - mm.populateVMAAndUnlock(ctx, vseg, ar, true) - } else { - mm.mappingMu.Unlock() - } - - default: - // Nothing to do. - mm.mappingMu.Unlock() } mm.brk.End = addr return addr, nil } -// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(), -// depending on mode. -func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error { - // Linux allows this to overflow. - la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp() - ar, ok := addr.RoundDown().ToRange(uint64(la)) - if !ok { - return syserror.EINVAL - } - - mm.mappingMu.Lock() - // Can't defer mm.mappingMu.Unlock(); see below. - - if mode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. - if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if mlockLimit == 0 { - mm.mappingMu.Unlock() - return syserror.EPERM - } - if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit { - mm.mappingMu.Unlock() - return syserror.ENOMEM - } - } - } - - // Check this after RLIMIT_MEMLOCK for consistency with Linux. - if ar.Length() == 0 { - mm.mappingMu.Unlock() - return nil - } - - // Apply the new mlock mode to vmas. - var unmapped bool - vseg := mm.vmas.FindSegment(ar.Start) - for { - if !vseg.Ok() { - unmapped = true - break - } - vseg = mm.vmas.Isolate(vseg, ar) - vma := vseg.ValuePtr() - prevMode := vma.mlockMode - vma.mlockMode = mode - if mode != memmap.MLockNone && prevMode == memmap.MLockNone { - mm.lockedAS += uint64(vseg.Range().Length()) - } else if mode == memmap.MLockNone && prevMode != memmap.MLockNone { - mm.lockedAS -= uint64(vseg.Range().Length()) - } - if ar.End <= vseg.End() { - break - } - vseg, _ = vseg.NextNonEmpty() - } - mm.vmas.MergeRange(ar) - mm.vmas.MergeAdjacent(ar) - if unmapped { - mm.mappingMu.Unlock() - return syserror.ENOMEM - } - - if mode == memmap.MLockEager { - // Ensure that we have usable pmas. Since we didn't return ENOMEM - // above, ar must be fully covered by vmas, so we can just use - // NextSegment below. - mm.activeMu.Lock() - mm.mappingMu.DowngradeLock() - for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { - if !vseg.ValuePtr().effectivePerms.Any() { - // Linux: mm/gup.c:__get_user_pages() returns EFAULT in this - // case, which is converted to ENOMEM by mlock. - mm.activeMu.Unlock() - mm.mappingMu.RUnlock() - return syserror.ENOMEM - } - _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), pmaOpts{}) - if err != nil { - mm.activeMu.Unlock() - mm.mappingMu.RUnlock() - // Linux: mm/mlock.c:__mlock_posix_error_return() - if err == syserror.EFAULT { - return syserror.ENOMEM - } - if err == syserror.ENOMEM { - return syserror.EAGAIN - } - return err - } - } - - // Map pmas into the active AddressSpace, if we have one. - mm.mappingMu.RUnlock() - if mm.as != nil { - mm.activeMu.DowngradeLock() - err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */) - mm.activeMu.RUnlock() - if err != nil { - return err - } - } else { - mm.activeMu.Unlock() - } - } else { - mm.mappingMu.Unlock() - } - - return nil -} - -// MLockAllOpts holds options to MLockAll. -type MLockAllOpts struct { - // If Current is true, change the memory-locking behavior of all mappings - // to Mode. If Future is true, upgrade the memory-locking behavior of all - // future mappings to Mode. At least one of Current or Future must be true. - Current bool - Future bool - Mode memmap.MLockMode -} - -// MLockAll implements the semantics of Linux's mlockall()/munlockall(), -// depending on opts. -func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error { - if !opts.Current && !opts.Future { - return syserror.EINVAL - } - - mm.mappingMu.Lock() - // Can't defer mm.mappingMu.Unlock(); see below. - - if opts.Current { - if opts.Mode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. - if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if mlockLimit == 0 { - mm.mappingMu.Unlock() - return syserror.EPERM - } - if uint64(mm.vmas.Span()) > mlockLimit { - mm.mappingMu.Unlock() - return syserror.ENOMEM - } - } - } - for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { - vma := vseg.ValuePtr() - prevMode := vma.mlockMode - vma.mlockMode = opts.Mode - if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone { - mm.lockedAS += uint64(vseg.Range().Length()) - } else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone { - mm.lockedAS -= uint64(vseg.Range().Length()) - } - } - } - - if opts.Future { - mm.defMLockMode = opts.Mode - } - - if opts.Current && opts.Mode == memmap.MLockEager { - // Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate() - // ignores the return value of __mm_populate(), so all errors below are - // ignored. - // - // Try to get usable pmas. - mm.activeMu.Lock() - mm.mappingMu.DowngradeLock() - for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { - if vseg.ValuePtr().effectivePerms.Any() { - mm.getPMAsLocked(ctx, vseg, vseg.Range(), pmaOpts{}) - } - } - - // Map all pmas into the active AddressSpace, if we have one. - mm.mappingMu.RUnlock() - if mm.as != nil { - mm.activeMu.DowngradeLock() - mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */) - mm.activeMu.RUnlock() - } else { - mm.activeMu.Unlock() - } - } else { - mm.mappingMu.Unlock() - } - return nil -} - // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { ar, ok := addr.ToRange(length) @@ -965,49 +680,46 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { // ensures that Decommit immediately reduces host memory usage. var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) + vseg := mm.vmas.LowerBoundSegment(ar.Start) mem := mm.p.Memory() - for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { - vma := vseg.ValuePtr() - if vma.mlockMode != memmap.MLockNone { - return syserror.EINVAL - } - vsegAR := vseg.Range().Intersect(ar) - // pseg should already correspond to either this vma or a later one, - // since there can't be a pma without a corresponding vma. - if checkInvariants { - if pseg.Ok() && pseg.End() <= vsegAR.Start { - panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR)) - } - } - for pseg.Ok() && pseg.Start() < vsegAR.End { - pma := pseg.ValuePtr() - if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { - psegAR := pseg.Range().Intersect(ar) - if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil { - if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { - pseg = pseg.NextSegment() - continue - } - // If an error occurs, fall through to the general - // invalidation case below. + for pseg.Ok() && pseg.Start() < ar.End { + pma := pseg.ValuePtr() + if pma.private && !mm.isPMACopyOnWriteLocked(pseg) { + psegAR := pseg.Range().Intersect(ar) + vseg = vseg.seekNextLowerBound(psegAR.Start) + if checkInvariants { + if !vseg.Ok() { + panic(fmt.Sprintf("no vma after %#x", psegAR.Start)) + } + if psegAR.Start < vseg.Start() { + panic(fmt.Sprintf("no vma in [%#x, %#x)", psegAR.Start, vseg.Start())) } } - pseg = mm.pmas.Isolate(pseg, vsegAR) - pma = pseg.ValuePtr() - if !didUnmapAS { - // Unmap all of ar, not just pseg.Range(), to minimize host - // syscalls. AddressSpace mappings must be removed before - // mm.decPrivateRef(). - mm.unmapASLocked(ar) - didUnmapAS = true - } - if pma.private { - mm.decPrivateRef(pseg.fileRange()) + if vseg.Range().IsSupersetOf(psegAR) && vseg.ValuePtr().mappable == nil { + if err := mem.Decommit(pseg.fileRangeOf(psegAR)); err == nil { + pseg = pseg.NextSegment() + continue + } + // If an error occurs, fall through to the general + // invalidation case below. } - pma.file.DecRef(pseg.fileRange()) - mm.removeRSSLocked(pseg.Range()) - pseg = mm.pmas.Remove(pseg).NextSegment() } + pseg = mm.pmas.Isolate(pseg, ar) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + pma.file.DecRef(pseg.fileRange()) + mm.removeRSSLocked(pseg.Range()) + + pseg = mm.pmas.Remove(pseg).NextSegment() } // "If there are some parts of the specified address space that are not @@ -1020,28 +732,9 @@ func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { return nil } -// MSyncOpts holds options to MSync. -type MSyncOpts struct { - // Sync has the semantics of MS_SYNC. - Sync bool - - // Invalidate has the semantics of MS_INVALIDATE. - Invalidate bool -} - -// MSync implements the semantics of Linux's msync(). -func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error { - if addr != addr.RoundDown() { - return syserror.EINVAL - } - if length == 0 { - return nil - } - la, ok := usermem.Addr(length).RoundUp() - if !ok { - return syserror.ENOMEM - } - ar, ok := addr.ToRange(uint64(la)) +// Sync implements the semantics of Linux's msync(MS_SYNC). +func (mm *MemoryManager) Sync(ctx context.Context, addr usermem.Addr, length uint64) error { + ar, ok := addr.ToRange(length) if !ok { return syserror.ENOMEM } @@ -1066,14 +759,10 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length ui } lastEnd = vseg.End() vma := vseg.ValuePtr() - if opts.Invalidate && vma.mlockMode != memmap.MLockNone { - mm.mappingMu.RUnlock() - return syserror.EBUSY - } // It's only possible to have dirtied the Mappable through a shared // mapping. Don't check if the mapping is writable, because mprotect // may have changed this, and also because Linux doesn't. - if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private { + if id := vma.id; id != nil && vma.mappable != nil && !vma.private { // We can't call memmap.MappingIdentity.Msync while holding // mm.mappingMu since it may take fs locks that precede it in the // lock order. diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 28ba9f2f5..5c2c802f6 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -17,10 +17,8 @@ package mm import ( "fmt" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" - "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" @@ -55,23 +53,6 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM } - if opts.MLockMode != memmap.MLockNone { - // Check against RLIMIT_MEMLOCK. - if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { - mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur - if mlockLimit == 0 { - return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM - } - newLockedAS := mm.lockedAS + opts.Length - if opts.Unmap { - newLockedAS -= mm.mlockedBytesRangeLocked(ar) - } - if newLockedAS > mlockLimit { - return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN - } - } - } - // Remove overwritten mappings. This ordering is consistent with Linux: // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), // file->f_op->mmap(). @@ -104,14 +85,10 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp maxPerms: opts.MaxPerms, private: opts.Private, growsDown: opts.GrowsDown, - mlockMode: opts.MLockMode, id: opts.MappingIdentity, hint: opts.Hint, }) mm.usageAS += opts.Length - if opts.MLockMode != memmap.MLockNone { - mm.lockedAS += opts.Length - } return vseg, ar, nil } @@ -224,17 +201,6 @@ func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bo return 0, syserror.ENOMEM } -// Preconditions: mm.mappingMu must be locked. -func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 { - var total uint64 - for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { - if vseg.ValuePtr().mlockMode != memmap.MLockNone { - total += uint64(vseg.Range().Intersect(ar).Length()) - } - } - return total -} - // getVMAsLocked ensures that vmas exist for all addresses in ar, and support // access of type (at, ignorePermissions). It returns: // @@ -372,9 +338,6 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vma.id.DecRef() } mm.usageAS -= uint64(vmaAR.Length()) - if vma.mlockMode != memmap.MLockNone { - mm.lockedAS -= uint64(vmaAR.Length()) - } vgap = mm.vmas.Remove(vseg) vseg = vgap.NextSegment() } @@ -405,7 +368,6 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa vma1.maxPerms != vma2.maxPerms || vma1.private != vma2.private || vma1.growsDown != vma2.growsDown || - vma1.mlockMode != vma2.mlockMode || vma1.id != vma2.id || vma1.hint != vma2.hint { return vma{}, false diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index e855590e6..7a5c93f9b 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -196,11 +196,11 @@ var AMD64 = &kernel.SyscallTable{ 145: SchedGetscheduler, 146: SchedGetPriorityMax, 147: SchedGetPriorityMin, - 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, - 149: Mlock, - 150: Munlock, - 151: Mlockall, - 152: Munlockall, + 148: syscalls.ErrorWithEvent(syscall.EPERM), // SchedRrGetInterval, + 149: syscalls.Error(nil), // Mlock, TODO + 150: syscalls.Error(nil), // Munlock, TODO + 151: syscalls.Error(nil), // Mlockall, TODO + 152: syscalls.Error(nil), // Munlockall, TODO 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG), // Vhangup, 154: syscalls.Error(syscall.EPERM), // ModifyLdt, 155: syscalls.Error(syscall.EPERM), // PivotRoot, @@ -373,9 +373,8 @@ var AMD64 = &kernel.SyscallTable{ // 322: Execveat, TODO // 323: Userfaultfd, TODO // 324: Membarrier, TODO - 325: Mlock2, - // Syscalls after 325 are "backports" from versions of Linux after 4.4. - // 326: CopyFileRange, + // Syscalls after 325 are backports from 4.6. + 325: syscalls.Error(nil), // Mlock2, TODO 327: Preadv2, 328: Pwritev2, }, diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 8732861e0..145f7846c 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -69,9 +69,6 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC GrowsDown: linux.MAP_GROWSDOWN&flags != 0, Precommit: linux.MAP_POPULATE&flags != 0, } - if linux.MAP_LOCKED&flags != 0 { - opts.MLockMode = memmap.MLockEager - } defer func() { if opts.MappingIdentity != nil { opts.MappingIdentity.DecRef() @@ -387,6 +384,16 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall length := args[1].SizeT() flags := args[2].Int() + if addr != addr.RoundDown() { + return 0, nil, syserror.EINVAL + } + if length == 0 { + return 0, nil, nil + } + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return 0, nil, syserror.ENOMEM + } // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, // and may additionally include the MS_INVALIDATE bit. ... However, Linux // permits a call to msync() that specifies neither of these flags, with @@ -399,72 +406,39 @@ func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if sync && flags&linux.MS_ASYNC != 0 { return 0, nil, syserror.EINVAL } - err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{ - Sync: sync, - Invalidate: flags&linux.MS_INVALIDATE != 0, - }) - // MSync calls fsync, the same interrupt conversion rules apply, see - // mm/msync.c, fsync POSIX.1-2008. - return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) -} - -// Mlock implements linux syscall mlock(2). -func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - addr := args[0].Pointer() - length := args[1].SizeT() - - return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager) -} -// Mlock2 implements linux syscall mlock2(2). -func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - addr := args[0].Pointer() - length := args[1].SizeT() - flags := args[2].Int() - - if flags&^(linux.MLOCK_ONFAULT) != 0 { + // MS_INVALIDATE "asks to invalidate other mappings of the same file (so + // that they can be updated with the fresh values just written)". This is a + // no-op given that shared memory exists. However, MS_INVALIDATE can also + // be used to detect mlocks: "EBUSY: MS_INVALIDATE was specified in flags, + // and a memory lock exists for the specified address range." Given that + // mlock is stubbed out, it's unsafe to pass MS_INVALIDATE silently since + // some user program could be using it for synchronization. + if flags&linux.MS_INVALIDATE != 0 { return 0, nil, syserror.EINVAL } - - mode := memmap.MLockEager - if flags&linux.MLOCK_ONFAULT != 0 { - mode = memmap.MLockLazy - } - return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode) -} - -// Munlock implements linux syscall munlock(2). -func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - addr := args[0].Pointer() - length := args[1].SizeT() - - return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone) -} - -// Mlockall implements linux syscall mlockall(2). -func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - flags := args[0].Int() - - if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 { - return 0, nil, syserror.EINVAL + // MS_SYNC "requests an update and waits for it to complete." + if sync { + err := t.MemoryManager().Sync(t, addr, uint64(la)) + // Sync calls fsync, the same interrupt conversion rules apply, see + // mm/msync.c, fsync POSIX.1-2008. + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + // MS_ASYNC "specifies that an update be scheduled, but the call returns + // immediately". As long as dirty pages are tracked and eventually written + // back, this is a no-op. (Correspondingly: "Since Linux 2.6.19, MS_ASYNC + // is in fact a no-op, since the kernel properly tracks dirty pages and + // flushes them to storage as necessary.") + // + // However: "ENOMEM: The indicated memory (or part of it) was not mapped." + // This applies even for MS_ASYNC. + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return 0, nil, syserror.ENOMEM } - - mode := memmap.MLockEager - if flags&linux.MCL_ONFAULT != 0 { - mode = memmap.MLockLazy + mapped := t.MemoryManager().VirtualMemorySizeRange(ar) + if mapped != uint64(la) { + return 0, nil, syserror.ENOMEM } - return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ - Current: flags&linux.MCL_CURRENT != 0, - Future: flags&linux.MCL_FUTURE != 0, - Mode: mode, - }) -} - -// Munlockall implements linux syscall munlockall(2). -func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ - Current: true, - Future: true, - Mode: memmap.MLockNone, - }) + return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index b0b216045..2f16e1791 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -90,7 +90,6 @@ var setableLimits = map[limits.LimitType]struct{}{ limits.CPU: {}, limits.Data: {}, limits.FileSize: {}, - limits.MemoryLocked: {}, limits.Stack: {}, // These are not enforced, but we include them here to avoid returning // EPERM, since some apps expect them to succeed. |