Diffstat (limited to 'pkg/sentry/mm/lifecycle.go')
-rw-r--r-- | pkg/sentry/mm/lifecycle.go | 234
1 file changed, 234 insertions, 0 deletions
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
new file mode 100644
index 000000000..7a65a62a2
--- /dev/null
+++ b/pkg/sentry/mm/lifecycle.go
@@ -0,0 +1,234 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.googlesource.com/gvisor/pkg/atomicbitops"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
+func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *MemoryManager {
+	return &MemoryManager{
+		p:           p,
+		mfp:         mfp,
+		haveASIO:    p.SupportsAddressSpaceIO(),
+		privateRefs: &privateRefs{},
+		users:       1,
+		auxv:        arch.Auxv{},
+		aioManager:  aioManager{contexts: make(map[uint64]*AIOContext)},
+	}
+}
+
+// SetMmapLayout initializes mm's layout from the given arch.Context.
+//
+// Preconditions: mm contains no mappings and is not used concurrently.
+func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arch.MmapLayout, error) {
+	layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r)
+	if err != nil {
+		return arch.MmapLayout{}, err
+	}
+	mm.layout = layout
+	return layout, nil
+}
+
+// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or
+// clone() (without CLONE_VM).
+func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
+	mm.metadataMu.Lock()
+	defer mm.metadataMu.Unlock()
+	mm.mappingMu.RLock()
+	defer mm.mappingMu.RUnlock()
+	mm2 := &MemoryManager{
+		p:           mm.p,
+		mfp:         mm.mfp,
+		haveASIO:    mm.haveASIO,
+		layout:      mm.layout,
+		privateRefs: mm.privateRefs,
+		users:       1,
+		brk:         mm.brk,
+		usageAS:     mm.usageAS,
+		dataAS:      mm.dataAS,
+		// "The child does not inherit its parent's memory locks (mlock(2),
+		// mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is
+		// MLockNone, both of which are zero values. vma.mlockMode is reset
+		// when copied below.
+		captureInvalidations: true,
+		argv:                 mm.argv,
+		envv:                 mm.envv,
+		auxv:                 append(arch.Auxv(nil), mm.auxv...),
+		// IncRef'd below, once we know that there isn't an error.
+		executable: mm.executable,
+		aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
+	}
+
+	// Copy vmas.
+	dstvgap := mm2.vmas.FirstGap()
+	for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() {
+		vma := srcvseg.Value() // makes a copy of the vma
+		vmaAR := srcvseg.Range()
+		// Inform the Mappable, if any, of the new mapping.
+		if vma.mappable != nil {
+			if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil {
+				mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange())
+				return nil, err
+			}
+		}
+		if vma.id != nil {
+			vma.id.IncRef()
+		}
+		vma.mlockMode = memmap.MLockNone
+		dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap()
+		// We don't need to update mm2.usageAS since we copied it from mm
+		// above.
+	}
+
+	// Copy pmas. We have to lock mm.activeMu for writing to make existing
+	// private pmas copy-on-write. We also have to lock mm2.activeMu since
+	// after copying vmas above, memmap.Mappables may call mm2.Invalidate. We
+	// only copy private pmas, since in the common case where fork(2) is
+	// immediately followed by execve(2), copying non-private pmas that can be
+	// regenerated by calling memmap.Mappable.Translate is a waste of time.
+	// (Linux does the same; compare kernel/fork.c:dup_mmap() =>
+	// mm/memory.c:copy_page_range().)
+	mm2.activeMu.Lock()
+	defer mm2.activeMu.Unlock()
+	mm.activeMu.Lock()
+	defer mm.activeMu.Unlock()
+	dstpgap := mm2.pmas.FirstGap()
+	var unmapAR usermem.AddrRange
+	for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() {
+		pma := srcpseg.ValuePtr()
+		if !pma.private {
+			continue
+		}
+		if !pma.needCOW {
+			pma.needCOW = true
+			if pma.effectivePerms.Write {
+				// We don't want to unmap the whole address space, even though
+				// doing so would reduce calls to unmapASLocked(), because mm
+				// will most likely continue to be used after the fork, so
+				// unmapping pmas unnecessarily will result in extra page
+				// faults. But we do want to merge consecutive AddrRanges
+				// across pma boundaries.
+				if unmapAR.End == srcpseg.Start() {
+					unmapAR.End = srcpseg.End()
+				} else {
+					if unmapAR.Length() != 0 {
+						mm.unmapASLocked(unmapAR)
+					}
+					unmapAR = srcpseg.Range()
+				}
+				pma.effectivePerms.Write = false
+			}
+			pma.maxPerms.Write = false
+		}
+		fr := srcpseg.fileRange()
+		mm2.incPrivateRef(fr)
+		srcpseg.ValuePtr().file.IncRef(fr)
+		addrRange := srcpseg.Range()
+		mm2.addRSSLocked(addrRange)
+		dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap()
+	}
+	if unmapAR.Length() != 0 {
+		mm.unmapASLocked(unmapAR)
+	}
+
+	// Between when we call memmap.Mappable.AddMapping while copying vmas and
+	// when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are
+	// ineffective because the pmas they invalidate haven't yet been copied,
+	// possibly allowing mm2 to get invalidated translations:
+	//
+	// Invalidating Mappable            mm.Fork
+	// ---------------------            -------
+	//
+	// mm2.Invalidate()
+	//                                  mm.activeMu.Lock()
+	// mm.Invalidate() /* blocks */
+	//                                  mm2.activeMu.Lock()
+	//                                  (mm copies invalidated pma to mm2)
+	//
+	// This would technically be both safe (since we only copy private pmas,
+	// which will still hold a reference on their memory) and consistent with
+	// Linux, but we avoid it anyway by setting mm2.captureInvalidations during
+	// construction, causing calls to mm2.Invalidate() to be captured in
+	// mm2.capturedInvalidations, to be replayed after pmas are copied - i.e.
+	// here.
+	mm2.captureInvalidations = false
+	for _, invArgs := range mm2.capturedInvalidations {
+		mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true)
+	}
+	mm2.capturedInvalidations = nil
+
+	if mm2.executable != nil {
+		mm2.executable.IncRef()
+	}
+	return mm2, nil
+}
+
+// IncUsers increments mm's user count and returns true. If the user count is
+// already 0, IncUsers does nothing and returns false.
+func (mm *MemoryManager) IncUsers() bool {
+	return atomicbitops.IncUnlessZeroInt32(&mm.users)
+}
+
+// DecUsers decrements mm's user count. If the user count reaches 0, all
+// mappings in mm are unmapped.
+func (mm *MemoryManager) DecUsers(ctx context.Context) {
+	if users := atomic.AddInt32(&mm.users, -1); users > 0 {
+		return
+	} else if users < 0 {
+		panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users))
+	}
+
+	mm.aioManager.destroy()
+
+	mm.metadataMu.Lock()
+	exe := mm.executable
+	mm.executable = nil
+	mm.metadataMu.Unlock()
+	if exe != nil {
+		exe.DecRef()
+	}
+
+	mm.activeMu.Lock()
+	// Sanity check.
+	if atomic.LoadInt32(&mm.active) != 0 {
+		panic("active address space lost?")
+	}
+	// Make sure the AddressSpace is returned.
+	if mm.as != nil {
+		mm.as.Release()
+		mm.as = nil
+	}
+	mm.activeMu.Unlock()
+
+	mm.mappingMu.Lock()
+	defer mm.mappingMu.Unlock()
+	// If mm is being dropped before mm.SetMmapLayout was called,
+	// mm.applicationAddrRange() will be empty.
+	if ar := mm.applicationAddrRange(); ar.Length() != 0 {
+		mm.unmapLocked(ctx, ar)
+	}
+}
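The long comment in Fork describes a capture-and-replay scheme: while the new MemoryManager is still being constructed, invalidations aimed at it are recorded in mm2.capturedInvalidations and applied only after the pmas have been copied. The following is a minimal, standalone sketch of that idea, not part of the commit and not gVisor code; the names (copyTarget, addrRange, finishCopy) are invented for illustration.

package main

import (
	"fmt"
	"sync"
)

// addrRange stands in for an address range such as usermem.AddrRange.
type addrRange struct{ start, end uintptr }

// copyTarget stands in for the object being built in two phases, like the
// MemoryManager created by Fork.
type copyTarget struct {
	mu                    sync.Mutex
	captureInvalidations  bool
	capturedInvalidations []addrRange
}

// invalidate either records ar for later replay (while the copy is still
// being built) or applies it immediately (normal operation).
func (t *copyTarget) invalidate(ar addrRange) {
	t.mu.Lock()
	defer t.mu.Unlock()
	if t.captureInvalidations {
		t.capturedInvalidations = append(t.capturedInvalidations, ar)
		return
	}
	fmt.Printf("invalidating [%#x, %#x)\n", ar.start, ar.end)
}

// finishCopy switches to normal operation and replays everything that was
// captured while the copy was in progress.
func (t *copyTarget) finishCopy() {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.captureInvalidations = false
	for _, ar := range t.capturedInvalidations {
		fmt.Printf("replaying invalidation of [%#x, %#x)\n", ar.start, ar.end)
	}
	t.capturedInvalidations = nil
}

func main() {
	t := &copyTarget{captureInvalidations: true}
	t.invalidate(addrRange{start: 0x1000, end: 0x2000}) // captured, not yet applied
	t.finishCopy()                                      // replayed here
	t.invalidate(addrRange{start: 0x3000, end: 0x4000}) // applied immediately
}

The point of the pattern is that invalidations arriving during construction are neither lost nor applied to state that has not been copied yet; they are simply deferred until the copy is complete, which is what mm2.captureInvalidations and the replay loop at the end of Fork accomplish.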
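IncUsers and DecUsers implement a common reference-counting pattern: an increment that refuses to resurrect an object whose count has already dropped to zero (via atomicbitops.IncUnlessZeroInt32), paired with a decrement that triggers teardown exactly once. A standalone sketch of that pattern, again not part of the commit and using purely illustrative names (incUnlessZero, decAndMaybeDestroy), could look like this:

package main

import (
	"fmt"
	"sync/atomic"
)

// incUnlessZero atomically increments *val and returns true, unless *val is
// already 0, in which case it returns false and leaves *val unchanged. This
// prevents reviving an object whose last user has already dropped it.
func incUnlessZero(val *int32) bool {
	for {
		v := atomic.LoadInt32(val)
		if v == 0 {
			return false
		}
		if atomic.CompareAndSwapInt32(val, v, v+1) {
			return true
		}
	}
}

// decAndMaybeDestroy decrements *val and reports whether the caller should
// tear the object down (the count reached exactly 0). A negative count
// indicates a bug, mirroring the panic in DecUsers.
func decAndMaybeDestroy(val *int32) bool {
	switch v := atomic.AddInt32(val, -1); {
	case v > 0:
		return false
	case v == 0:
		return true
	default:
		panic(fmt.Sprintf("invalid user count: %d", v))
	}
}

func main() {
	users := int32(1)                       // a new object starts with 1 user, like NewMemoryManager
	fmt.Println(incUnlessZero(&users))      // true: 1 -> 2
	fmt.Println(decAndMaybeDestroy(&users)) // false: 2 -> 1
	fmt.Println(decAndMaybeDestroy(&users)) // true: 1 -> 0, tear down
	fmt.Println(incUnlessZero(&users))      // false: cannot revive at 0
}

Returning false from the increment lets a racing caller distinguish "this object is already being torn down" from a successful grab, which is why IncUsers returns a bool instead of unconditionally incrementing.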