// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package mm provides a memory management subsystem. See README.md for a // detailed overview. // // Lock order: // // fs locks, except for memmap.Mappable locks // mm.MemoryManager.metadataMu // mm.MemoryManager.mappingMu // Locks taken by memmap.Mappable methods other than Translate // mm.MemoryManager.activeMu // Locks taken by memmap.Mappable.Translate // mm.privateRefs.mu // platform.AddressSpace locks // platform.File locks // mm.aioManager.mu // mm.AIOContext.mu // // Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in // multiple mm.MemoryManagers, as it does so in a well-defined order (forked // child first). package mm import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) // MemoryManager implements a virtual address space. // // +stateify savable type MemoryManager struct { // p and mfp are immutable. p platform.Platform mfp pgalloc.MemoryFileProvider // haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from // eliminating an indirect call in the hot I/O path, this makes // MemoryManager.asioEnabled() a leaf function, allowing it to be inlined. // // haveASIO is immutable. haveASIO bool `state:"nosave"` // layout is the memory layout. // // layout is set by the binary loader before the MemoryManager can be used. layout arch.MmapLayout // privateRefs stores reference counts for private memory (memory whose // ownership is shared by one or more pmas instead of being owned by a // memmap.Mappable). // // privateRefs is immutable. privateRefs *privateRefs // users is the number of dependencies on the mappings in the MemoryManager. // When the number of references in users reaches zero, all mappings are // unmapped. // // users is accessed using atomic memory operations. users int32 // mappingMu is analogous to Linux's struct mm_struct::mmap_sem. mappingMu sync.RWMutex `state:"nosave"` // vmas stores virtual memory areas. Since vmas are stored by value, // clients should usually use vmaIterator.ValuePtr() instead of // vmaIterator.Value() to get a pointer to the vma rather than a copy. // // Invariants: vmas are always page-aligned. // // vmas is protected by mappingMu. vmas vmaSet // brk is the mm's brk, which is manipulated using the brk(2) system call. // The brk is initially set up by the loader which maps an executable // binary into the mm. // // brk is protected by mappingMu. brk usermem.AddrRange // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. // // usageAS is protected by mappingMu. usageAS uint64 // lockedAS is the combined size in bytes of all vmas with vma.mlockMode != // memmap.MLockNone. // // lockedAS is protected by mappingMu. lockedAS uint64 // dataAS is the size of private data segments, like mm_struct->data_vm. // It means the vma which is private, writable, not stack. // // dataAS is protected by mappingMu. dataAS uint64 // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or // defMLockMode is greater. // // defMLockMode is protected by mappingMu. defMLockMode memmap.MLockMode // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. activeMu sync.RWMutex `state:"nosave"` // pmas stores platform mapping areas used to implement vmas. Since pmas // are stored by value, clients should usually use pmaIterator.ValuePtr() // instead of pmaIterator.Value() to get a pointer to the pma rather than // a copy. // // Inserting or removing segments from pmas should happen along with a // call to mm.insertRSS or mm.removeRSS. // // Invariants: pmas are always page-aligned. If a pma exists for a given // address, a vma must also exist for that address. // // pmas is protected by activeMu. pmas pmaSet // curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is // reported as the MemoryManager's RSS. // // maxRSS should be modified only via insertRSS and removeRSS, not // directly. // // maxRSS is protected by activeMu. curRSS uint64 // maxRSS is the maximum resident set size in bytes of a MemoryManager. // It is tracked as the application adds and removes mappings to pmas. // // maxRSS should be modified only via insertRSS, not directly. // // maxRSS is protected by activeMu. maxRSS uint64 // as is the platform.AddressSpace that pmas are mapped into. active is the // number of contexts that require as to be non-nil; if active == 0, as may // be nil. // // as is protected by activeMu. active is manipulated with atomic memory // operations; transitions to and from zero are additionally protected by // activeMu. (This is because such transitions may need to be atomic with // changes to as.) as platform.AddressSpace `state:"nosave"` active int32 `state:"zerovalue"` // unmapAllOnActivate indicates that the next Activate call should activate // an empty AddressSpace. // // This is used to ensure that an AddressSpace cached in // NewAddressSpace is not used after some change in the MemoryManager // or VMAs has made that AddressSpace stale. // // unmapAllOnActivate is protected by activeMu. It must only be set when // there is no active or cached AddressSpace. If as != nil, then // invalidations should be propagated immediately. unmapAllOnActivate bool `state:"nosave"` // If captureInvalidations is true, calls to MM.Invalidate() are recorded // in capturedInvalidations rather than being applied immediately to pmas. // This is to avoid a race condition in MM.Fork(); see that function for // details. // // Both captureInvalidations and capturedInvalidations are protected by // activeMu. Neither need to be saved since captureInvalidations is only // enabled during MM.Fork(), during which saving can't occur. captureInvalidations bool `state:"zerovalue"` capturedInvalidations []invalidateArgs `state:"nosave"` metadataMu sync.Mutex `state:"nosave"` // argv is the application argv. This is set up by the loader and may be // modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No // requirements apply to argv; we do not require that argv.WellFormed(). // // argv is protected by metadataMu. argv usermem.AddrRange // envv is the application envv. This is set up by the loader and may be // modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No // requirements apply to envv; we do not require that envv.WellFormed(). // // envv is protected by metadataMu. envv usermem.AddrRange // auxv is the ELF's auxiliary vector. // // auxv is protected by metadataMu. auxv arch.Auxv // executable is the executable for this MemoryManager. If executable // is not nil, it holds a reference on the Dirent. // // executable is protected by metadataMu. executable fsbridge.File // dumpability describes if and how this MemoryManager may be dumped to // userspace. // // dumpability is protected by metadataMu. dumpability Dumpability // aioManager keeps track of AIOContexts used for async IOs. AIOManager // must be cloned when CLONE_VM is used. aioManager aioManager // sleepForActivation indicates whether the task should report to be sleeping // before trying to activate the address space. When set to true, delays in // activation are not reported as stuck tasks by the watchdog. sleepForActivation bool } // vma represents a virtual memory area. // // +stateify savable type vma struct { // mappable is the virtual memory object mapped by this vma. If mappable is // nil, the vma represents a private anonymous mapping. mappable memmap.Mappable // off is the offset into mappable at which this vma begins. If mappable is // nil, off is meaningless. off uint64 // To speedup VMA save/restore, we group and save the following booleans // as a single integer. // realPerms are the memory permissions on this vma, as defined by the // application. realPerms usermem.AccessType `state:".(int)"` // effectivePerms are the memory permissions on this vma which are // actually used to control access. // // Invariant: effectivePerms == realPerms.Effective(). effectivePerms usermem.AccessType `state:"manual"` // maxPerms limits the set of permissions that may ever apply to this // memory, as well as accesses for which usermem.IOOpts.IgnorePermissions // is true (e.g. ptrace(PTRACE_POKEDATA)). // // Invariant: maxPerms == maxPerms.Effective(). maxPerms usermem.AccessType `state:"manual"` // private is true if this is a MAP_PRIVATE mapping, such that writes to // the mapping are propagated to a copy. private bool `state:"manual"` // growsDown is true if the mapping may be automatically extended downward // under certain conditions. If growsDown is true, mappable must be nil. // // There is currently no corresponding growsUp flag; in Linux, the only // architectures that can have VM_GROWSUP mappings are ia64, parisc, and // metag, none of which we currently support. growsDown bool `state:"manual"` // dontfork is the MADV_DONTFORK setting for this vma configured by madvise(). dontfork bool mlockMode memmap.MLockMode // numaPolicy is the NUMA policy for this vma set by mbind(). numaPolicy int32 // numaNodemask is the NUMA nodemask for this vma set by mbind(). numaNodemask uint64 // If id is not nil, it controls the lifecycle of mappable and provides vma // metadata shown in /proc/[pid]/maps, and the vma holds a reference. id memmap.MappingIdentity // If hint is non-empty, it is a description of the vma printed in // /proc/[pid]/maps. hint takes priority over id.MappedName(). hint string } const ( vmaRealPermsRead = 1 << iota vmaRealPermsWrite vmaRealPermsExecute vmaEffectivePermsRead vmaEffectivePermsWrite vmaEffectivePermsExecute vmaMaxPermsRead vmaMaxPermsWrite vmaMaxPermsExecute vmaPrivate vmaGrowsDown ) func (v *vma) saveRealPerms() int { var b int if v.realPerms.Read { b |= vmaRealPermsRead } if v.realPerms.Write { b |= vmaRealPermsWrite } if v.realPerms.Execute { b |= vmaRealPermsExecute } if v.effectivePerms.Read { b |= vmaEffectivePermsRead } if v.effectivePerms.Write { b |= vmaEffectivePermsWrite } if v.effectivePerms.Execute { b |= vmaEffectivePermsExecute } if v.maxPerms.Read { b |= vmaMaxPermsRead } if v.maxPerms.Write { b |= vmaMaxPermsWrite } if v.maxPerms.Execute { b |= vmaMaxPermsExecute } if v.private { b |= vmaPrivate } if v.growsDown { b |= vmaGrowsDown } return b } func (v *vma) loadRealPerms(b int) { if b&vmaRealPermsRead > 0 { v.realPerms.Read = true } if b&vmaRealPermsWrite > 0 { v.realPerms.Write = true } if b&vmaRealPermsExecute > 0 { v.realPerms.Execute = true } if b&vmaEffectivePermsRead > 0 { v.effectivePerms.Read = true } if b&vmaEffectivePermsWrite > 0 { v.effectivePerms.Write = true } if b&vmaEffectivePermsExecute > 0 { v.effectivePerms.Execute = true } if b&vmaMaxPermsRead > 0 { v.maxPerms.Read = true } if b&vmaMaxPermsWrite > 0 { v.maxPerms.Write = true } if b&vmaMaxPermsExecute > 0 { v.maxPerms.Execute = true } if b&vmaPrivate > 0 { v.private = true } if b&vmaGrowsDown > 0 { v.growsDown = true } } // pma represents a platform mapping area. // // +stateify savable type pma struct { // file is the file mapped by this pma. Only pmas for which file == // MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to // the corresponding file range while they exist. file platform.File `state:"nosave"` // off is the offset into file at which this pma begins. // // Note that pmas do *not* hold references on offsets in file! If private // is true, MemoryManager.privateRefs holds the reference instead. If // private is false, the corresponding memmap.Mappable holds the reference // instead (per memmap.Mappable.Translate requirement). off uint64 // translatePerms is the permissions returned by memmap.Mappable.Translate. // If private is true, translatePerms is usermem.AnyAccess. translatePerms usermem.AccessType // effectivePerms is the permissions allowed for non-ignorePermissions // accesses. maxPerms is the permissions allowed for ignorePermissions // accesses. These are vma.effectivePerms and vma.maxPerms respectively, // masked by pma.translatePerms and with Write disallowed if pma.needCOW is // true. // // These are stored in the pma so that the IO implementation can avoid // iterating mm.vmas when pmas already exist. effectivePerms usermem.AccessType maxPerms usermem.AccessType // needCOW is true if writes to the mapping must be propagated to a copy. needCOW bool // private is true if this pma represents private memory. // // If private is true, file must be MemoryManager.mfp.MemoryFile(), the pma // holds a reference on the mapped memory that is tracked in privateRefs, // and calls to Invalidate for which // memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma. // // If private is false, this pma caches a translation from the // corresponding vma's memmap.Mappable.Translate. private bool // If internalMappings is not empty, it is the cached return value of // file.MapInternal for the platform.FileRange mapped by this pma. internalMappings safemem.BlockSeq `state:"nosave"` } // +stateify savable type privateRefs struct { mu sync.Mutex `state:"nosave"` // refs maps offsets into MemoryManager.mfp.MemoryFile() to the number of // pmas (or, equivalently, MemoryManagers) that share ownership of the // memory at that offset. refs fileRefcountSet } type invalidateArgs struct { ar usermem.AddrRange opts memmap.InvalidateOpts } // fileRefcountSetFunctions implements segment.Functions for fileRefcountSet. type fileRefcountSetFunctions struct{} func (fileRefcountSetFunctions) MinKey() uint64 { return 0 } func (fileRefcountSetFunctions) MaxKey() uint64 { return ^uint64(0) } func (fileRefcountSetFunctions) ClearValue(_ *int32) { } func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) { return rc1, rc1 == rc2 } func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) { return rc, rc }