 pkg/sentry/context/contexttest/contexttest.go |   2
 pkg/sentry/fs/fsutil/dirty_set.go             |  22
 pkg/sentry/fs/fsutil/inode_cached.go          |  78
 pkg/sentry/fs/fsutil/inode_cached_test.go     |   8
 pkg/sentry/kernel/kernel.go                   |   5
 pkg/sentry/pgalloc/BUILD                      |  27
 pkg/sentry/pgalloc/pgalloc.go                 | 504
 pkg/sentry/pgalloc/save_restore.go            |  14
 runsc/boot/loader.go                          |   2
 9 files changed, 511 insertions, 151 deletions
diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go index a42038711..210a235d2 100644 --- a/pkg/sentry/context/contexttest/contexttest.go +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -44,7 +44,7 @@ func Context(tb testing.TB) context.Context { tb.Fatalf("error creating application memory file: %v", err) } memfile := os.NewFile(uintptr(memfd), memfileName) - mf, err := pgalloc.NewMemoryFile(memfile) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) if err != nil { memfile.Close() tb.Fatalf("error creating pgalloc.MemoryFile: %v", err) diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index 9cd196d7d..f1451d77a 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -107,6 +107,7 @@ func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) { var changedAny bool defer func() { if changedAny { + // Merge segments split by Isolate to reduce cost of iteration. ds.MergeRange(mr) } }() @@ -132,6 +133,26 @@ func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) { } } +// AllowClean allows MarkClean to mark offsets in mr as not dirty, ending the +// effect of a previous call to KeepDirty. (It does not itself mark those +// offsets as not dirty.) +func (ds *DirtySet) AllowClean(mr memmap.MappableRange) { + var changedAny bool + defer func() { + if changedAny { + // Merge segments split by Isolate to reduce cost of iteration. + ds.MergeRange(mr) + } + }() + for seg := ds.LowerBoundSegment(mr.Start); seg.Ok() && seg.Start() < mr.End; seg = seg.NextSegment() { + if seg.Value().Keep { + changedAny = true + seg = ds.Isolate(seg, mr) + seg.ValuePtr().Keep = false + } + } +} + // SyncDirty passes pages in the range mr that are stored in cache and // identified as dirty to writeAt, updating dirty to reflect successful writes. // If writeAt returns a successful partial write, SyncDirty will call it @@ -142,6 +163,7 @@ func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet var changedDirty bool defer func() { if changedDirty { + // Merge segments split by Isolate to reduce cost of iteration. dirty.MergeRange(mr) } }() diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 919d2534c..76644e69d 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -175,11 +175,22 @@ func (c *CachingInodeOperations) Release() { defer c.mapsMu.Unlock() c.dataMu.Lock() defer c.dataMu.Unlock() - // The cache should be empty (something has gone terribly wrong if we're - // releasing an inode that is still memory-mapped). - if !c.mappings.IsEmpty() || !c.cache.IsEmpty() || !c.dirty.IsEmpty() { - panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s\ncache contents:\n%s\ndirty segments:\n%s", &c.mappings, &c.cache, &c.dirty)) + + // Something has gone terribly wrong if we're releasing an inode that is + // still memory-mapped. + if !c.mappings.IsEmpty() { + panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s", &c.mappings)) + } + + // Drop any cached pages that are still awaiting MemoryFile eviction. (This + // means that MemoryFile no longer needs to evict them.) 
+ mf := c.mfp.MemoryFile() + mf.MarkAllUnevictable(c) + if err := SyncDirtyAll(context.Background(), &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { + panic(fmt.Sprintf("Failed to writeback cached data: %v", err)) } + c.cache.DropAll(mf) + c.dirty.RemoveAll() } // UnstableAttr implements fs.InodeOperations.UnstableAttr. @@ -679,6 +690,13 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error return done, nil } +// useHostPageCache returns true if c uses c.backingFile.FD() for all file I/O +// and memory mappings, and false if c.cache may contain data cached from +// c.backingFile. +func (c *CachingInodeOperations) useHostPageCache() bool { + return !c.forcePageCache && c.backingFile.FD() >= 0 +} + // AddMapping implements memmap.Mappable.AddMapping. func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { // Hot path. Avoid defers. @@ -689,7 +707,15 @@ func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.Mappi for _, r := range mapped { c.hostFileMapper.IncRefOn(r) } - if !usage.IncrementalMappedAccounting && !c.forcePageCache && c.backingFile.FD() >= 0 { + if !c.useHostPageCache() { + // c.Evict() will refuse to evict memory-mapped pages, so tell the + // MemoryFile to not bother trying. + mf := c.mfp.MemoryFile() + for _, r := range mapped { + mf.MarkUnevictable(c, pgalloc.EvictableRange{r.Start, r.End}) + } + } + if c.useHostPageCache() && !usage.IncrementalMappedAccounting { for _, r := range mapped { usage.MemoryAccounting.Inc(r.Length(), usage.Mapped) } @@ -706,7 +732,7 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma for _, r := range unmapped { c.hostFileMapper.DecRefOn(r) } - if !c.forcePageCache && c.backingFile.FD() >= 0 { + if c.useHostPageCache() { if !usage.IncrementalMappedAccounting { for _, r := range unmapped { usage.MemoryAccounting.Dec(r.Length(), usage.Mapped) @@ -716,17 +742,16 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma return } - // Writeback dirty mapped memory now that there are no longer any - // mappings that reference it. This is our naive memory eviction - // strategy. + // Pages that are no longer referenced by any application memory mappings + // are now considered unused; allow MemoryFile to evict them when + // necessary. mf := c.mfp.MemoryFile() c.dataMu.Lock() for _, r := range unmapped { - if err := SyncDirty(ctx, r, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { - log.Warningf("Failed to writeback cached data %v: %v", r, err) - } - c.cache.Drop(r, mf) - c.dirty.KeepClean(r) + // Since these pages are no longer mapped, they are no longer + // concurrently dirtyable by a writable memory mapping. + c.dirty.AllowClean(r) + mf.MarkEvictable(c, pgalloc.EvictableRange{r.Start, r.End}) } c.dataMu.Unlock() c.mapsMu.Unlock() @@ -740,7 +765,7 @@ func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.Mapp // Translate implements memmap.Mappable.Translate. func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { // Hot path. Avoid defer. 
- if !c.forcePageCache && c.backingFile.FD() >= 0 { + if c.useHostPageCache() { return []memmap.Translation{ { Source: optional, @@ -853,6 +878,29 @@ func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error return nil } +// Evict implements pgalloc.EvictableMemoryUser.Evict. +func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.EvictableRange) { + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + c.dataMu.Lock() + defer c.dataMu.Unlock() + + mr := memmap.MappableRange{er.Start, er.End} + mf := c.mfp.MemoryFile() + // Only allow pages that are no longer memory-mapped to be evicted. + for mgap := c.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { + mgapMR := mgap.Range().Intersect(mr) + if mgapMR.Length() == 0 { + continue + } + if err := SyncDirty(ctx, mgapMR, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { + log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) + } + c.cache.Drop(mgapMR, mf) + c.dirty.KeepClean(mgapMR) + } +} + // IncRef implements platform.File.IncRef. This is used when we directly map an // underlying host fd and CachingInodeOperations is used as the platform.File // during translation. diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index 661ec41f6..3f10efc12 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -311,12 +311,10 @@ func TestRead(t *testing.T) { t.Errorf("Read back bytes %v, want %v", rbuf, buf) } - // Delete the memory mapping and expect it to cause the cached page to be - // uncached. + // Delete the memory mapping before iops.Release(). The cached page will + // either be evicted by ctx's pgalloc.MemoryFile, or dropped by + // iops.Release(). iops.RemoveMapping(ctx, ms, ar, usermem.PageSize, true) - if cached := iops.cache.Span(); cached != 0 { - t.Fatalf("Span got %d, want 0", cached) - } } func TestWrite(t *testing.T) { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 0468dd678..91889b573 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -303,7 +303,12 @@ func (k *Kernel) SaveTo(w io.Writer) error { k.pauseTimeLocked() defer k.resumeTimeLocked() + // Evict all evictable MemoryFile allocations. + k.mf.FlushEvictions() + // Flush write operations on open files so data reaches backing storage. + // This must come after k.mf.FlushEvictions() since eviction may cause file + // writes. 
if err := k.tasks.flushWritesToFiles(ctx); err != nil { return err } diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 7efa55c20..8a8a0e4e4 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -4,6 +4,31 @@ load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( + name = "evictable_range", + out = "evictable_range.go", + package = "pgalloc", + prefix = "Evictable", + template = "//pkg/segment:generic_range", + types = { + "T": "uint64", + }, +) + +go_template_instance( + name = "evictable_range_set", + out = "evictable_range_set.go", + package = "pgalloc", + prefix = "evictableRange", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "EvictableRange", + "Value": "evictableRangeSetValue", + "Functions": "evictableRangeSetFunctions", + }, +) + +go_template_instance( name = "usage_set", out = "usage_set.go", consts = { @@ -27,6 +52,8 @@ go_library( name = "pgalloc", srcs = [ "context.go", + "evictable_range.go", + "evictable_range_set.go", "pgalloc.go", "pgalloc_unsafe.go", "save_restore.go", diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 411dafa07..9c1313f6f 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -31,6 +31,7 @@ import ( "time" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -41,6 +42,9 @@ import ( // MemoryFile is a platform.File whose pages may be allocated to arbitrary // users. type MemoryFile struct { + // opts holds options passed to NewMemoryFile. opts is immutable. + opts MemoryFileOpts + // MemoryFile owns a single backing file, which is modeled as follows: // // Each page in the file can be committed or uncommitted. A page is @@ -115,6 +119,24 @@ type MemoryFile struct { // fileSize is protected by mu. fileSize int64 + // Pages from the backing file are mapped into the local address space on + // the granularity of large pieces called chunks. mappings is a []uintptr + // that stores, for each chunk, the start address of a mapping of that + // chunk in the current process' address space, or 0 if no such mapping + // exists. Once a chunk is mapped, it is never remapped or unmapped until + // the MemoryFile is destroyed. + // + // Mutating the mappings slice or its contents requires both holding + // mappingsMu and using atomic memory operations. (The slice is mutated + // whenever the file is expanded. Per the above, the only permitted + // mutation of the slice's contents is the assignment of a mapping to a + // chunk that was previously unmapped.) Reading the slice or its contents + // only requires *either* holding mappingsMu or using atomic memory + // operations. This allows MemoryFile.MapInternal to avoid locking in the + // common case where chunk mappings already exist. + mappingsMu sync.Mutex + mappings atomic.Value + // destroyed is set by Destroy to instruct the reclaimer goroutine to // release resources and exit. destroyed is protected by mu. destroyed bool @@ -133,26 +155,44 @@ type MemoryFile struct { // transitions from false to true. reclaimCond sync.Cond - // Pages from the backing file are mapped into the local address space on - // the granularity of large pieces called chunks. 
mappings is a []uintptr - // that stores, for each chunk, the start address of a mapping of that - // chunk in the current process' address space, or 0 if no such mapping - // exists. Once a chunk is mapped, it is never remapped or unmapped until - // the MemoryFile is destroyed. + // evictable maps EvictableMemoryUsers to eviction state. // - // Mutating the mappings slice or its contents requires both holding - // mappingsMu and using atomic memory operations. (The slice is mutated - // whenever the file is expanded. Per the above, the only permitted - // mutation of the slice's contents is the assignment of a mapping to a - // chunk that was previously unmapped.) Reading the slice or its contents - // only requires *either* holding mappingsMu or using atomic memory - // operations. This allows MemoryFile.MapInternal to avoid locking in the - // common case where chunk mappings already exist. - mappingsMu sync.Mutex - mappings atomic.Value + // evictable is protected by mu. + evictable map[EvictableMemoryUser]*evictableMemoryUserInfo + + // evictionWG counts the number of goroutines currently performing evictions. + evictionWG sync.WaitGroup +} + +// MemoryFileOpts provides options to NewMemoryFile. +type MemoryFileOpts struct { + // DelayedEviction controls the extent to which the MemoryFile may delay + // eviction of evictable allocations. + DelayedEviction DelayedEvictionType } -// usage tracks usage information. +// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction. +type DelayedEvictionType int + +const ( + // DelayedEvictionDefault has unspecified behavior. + DelayedEvictionDefault DelayedEvictionType = iota + + // DelayedEvictionDisabled requires that evictable allocations are evicted + // as soon as possible. + DelayedEvictionDisabled + + // DelayedEvictionEnabled requests that the MemoryFile delay eviction of + // evictable allocations until doing so is considered necessary to avoid + // performance degradation due to host memory pressure, or OOM kills. + // + // As of this writing, DelayedEvictionEnabled delays evictions until the + // reclaimer goroutine is out of work (pages to reclaim), then evicts all + // pending evictable allocations immediately. + DelayedEvictionEnabled +) + +// usageInfo tracks usage information. // // +stateify savable type usageInfo struct { @@ -166,6 +206,46 @@ type usageInfo struct { refs uint64 } +// An EvictableMemoryUser represents a user of MemoryFile-allocated memory that +// may be asked to deallocate that memory in the presence of memory pressure. +type EvictableMemoryUser interface { + // Evict requests that the EvictableMemoryUser deallocate memory used by + // er, which was registered as evictable by a previous call to + // MemoryFile.MarkEvictable. + // + // Evict is not required to deallocate memory. In particular, since pgalloc + // must call Evict without holding locks to avoid circular lock ordering, + // it is possible that the passed range has already been marked as + // unevictable by a racing call to MemoryFile.MarkUnevictable. + // Implementations of EvictableMemoryUser must detect such races and handle + // them by making Evict have no effect on unevictable ranges. + // + // After a call to Evict, the MemoryFile will consider the evicted range + // unevictable (i.e. it will not call Evict on the same range again) until + // informed otherwise by a subsequent call to MarkEvictable. 
+ Evict(ctx context.Context, er EvictableRange) +} + +// An EvictableRange represents a range of uint64 offsets in an +// EvictableMemoryUser. +// +// In practice, most EvictableMemoryUsers will probably be implementations of +// memmap.Mappable, and EvictableRange therefore corresponds to +// memmap.MappableRange. However, this package cannot depend on the memmap +// package, since doing so would create a circular dependency. +// +// type EvictableRange <generated using go_generics> + +// evictableMemoryUserInfo is the value type of MemoryFile.evictable. +type evictableMemoryUserInfo struct { + // ranges tracks all evictable ranges for the given user. + ranges evictableRangeSet + + // If evicting is true, there is a goroutine currently evicting all + // evictable ranges for this user. + evicting bool +} + const ( chunkShift = 24 chunkSize = 1 << chunkShift // 16 MB @@ -180,7 +260,15 @@ const ( // NewMemoryFile creates a MemoryFile backed by the given file. If // NewMemoryFile succeeds, ownership of file is transferred to the returned // MemoryFile. -func NewMemoryFile(file *os.File) (*MemoryFile, error) { +func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) { + switch opts.DelayedEviction { + case DelayedEvictionDefault: + opts.DelayedEviction = DelayedEvictionEnabled + case DelayedEvictionDisabled, DelayedEvictionEnabled: + default: + return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction) + } + // Truncate the file to 0 bytes first to ensure that it's empty. if err := file.Truncate(0); err != nil { return nil, err @@ -189,14 +277,16 @@ func NewMemoryFile(file *os.File) (*MemoryFile, error) { return nil, err } f := &MemoryFile{ + opts: opts, fileSize: initialSize, file: file, // No pages are reclaimable. DecRef will always be able to // decrease minReclaimablePage from this point. minReclaimablePage: maxPage, + evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo), } - f.reclaimCond.L = &f.mu f.mappings.Store(make([]uintptr, initialSize/chunkSize)) + f.reclaimCond.L = &f.mu go f.runReclaim() // S/R-SAFE: f.mu // The Linux kernel contains an optional feature called "Integrity @@ -434,113 +524,6 @@ func (f *MemoryFile) markDecommitted(fr platform.FileRange) { f.usage.MergeRange(fr) } -// runReclaim implements the reclaimer goroutine, which continuously decommits -// reclaimable pages in order to reduce memory usage and make them available -// for allocation. -func (f *MemoryFile) runReclaim() { - for { - fr, ok := f.findReclaimable() - if !ok { - break - } - - if err := f.Decommit(fr); err != nil { - log.Warningf("Reclaim failed to decommit %v: %v", fr, err) - // Zero the pages manually. This won't reduce memory usage, but at - // least ensures that the pages will be zero when reallocated. - f.forEachMappingSlice(fr, func(bs []byte) { - for i := range bs { - bs[i] = 0 - } - }) - // Pretend the pages were decommitted even though they weren't, - // since the memory accounting implementation has no idea how to - // deal with this. - f.markDecommitted(fr) - } - f.markReclaimed(fr) - } - // We only get here if findReclaimable finds f.destroyed set and returns - // false. - f.mu.Lock() - defer f.mu.Unlock() - if !f.destroyed { - panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") - } - f.file.Close() - // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd - // that has possibly been reassigned. 
- f.file = nil - mappings := f.mappings.Load().([]uintptr) - for i, m := range mappings { - if m != 0 { - _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0) - if errno != 0 { - log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno) - } - } - } - // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) - f.mappings.Store([]uintptr{}) -} - -func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { - f.mu.Lock() - defer f.mu.Unlock() - for { - for { - if f.destroyed { - return platform.FileRange{}, false - } - if f.reclaimable { - break - } - f.reclaimCond.Wait() - } - // Allocate returns the first usable range in offset order and is - // currently a linear scan, so reclaiming from the beginning of the - // file minimizes the expected latency of Allocate. - for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { - if seg.ValuePtr().refs == 0 { - f.minReclaimablePage = seg.End() - return seg.Range(), true - } - } - // No pages are reclaimable. - f.reclaimable = false - f.minReclaimablePage = maxPage - } -} - -func (f *MemoryFile) markReclaimed(fr platform.FileRange) { - f.mu.Lock() - defer f.mu.Unlock() - seg := f.usage.FindSegment(fr.Start) - // All of fr should be mapped to a single uncommitted reclaimable segment - // accounted to System. - if !seg.Ok() { - panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) - } - if !seg.Range().IsSupersetOf(fr) { - panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) - } - if got, want := seg.Value(), (usageInfo{ - kind: usage.System, - knownCommitted: false, - refs: 0, - }); got != want { - panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) - } - // Deallocate reclaimed pages. Even though all of seg is reclaimable, the - // caller of markReclaimed may not have decommitted it, so we can only mark - // fr as reclaimed. - f.usage.Remove(f.usage.Isolate(seg, fr)) - if fr.Start < f.minUnallocatedPage { - // We've deallocated at least one lower page. - f.minUnallocatedPage = fr.Start - } -} - // IncRef implements platform.File.IncRef. func (f *MemoryFile) IncRef(fr platform.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { @@ -677,9 +660,82 @@ func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) { return mappings, m, nil } -// FD implements platform.File.FD. -func (f *MemoryFile) FD() int { - return int(f.file.Fd()) +// MarkEvictable allows f to request memory deallocation by calling +// user.Evict(er) in the future. +// +// Redundantly marking an already-evictable range as evictable has no effect. +func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + info = &evictableMemoryUserInfo{} + f.evictable[user] = info + } + gap := info.ranges.LowerBoundGap(er.Start) + for gap.Ok() && gap.Start() < er.End { + gapER := gap.Range().Intersect(er) + if gapER.Length() == 0 { + gap = gap.NextGap() + continue + } + gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap() + } + if !info.evicting { + switch f.opts.DelayedEviction { + case DelayedEvictionDisabled: + // Kick off eviction immediately. 
+ f.startEvictionGoroutineLocked(user, info) + case DelayedEvictionEnabled: + // Ensure that the reclaimer goroutine is running, so that it can + // start eviction when necessary. + f.reclaimCond.Signal() + } + } +} + +// MarkUnevictable informs f that user no longer considers er to be evictable, +// so the MemoryFile should no longer call user.Evict(er). Note that, per +// EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be +// called even after MarkUnevictable returns due to race conditions, and +// implementations of EvictableMemoryUser must handle this possibility. +// +// Redundantly marking an already-unevictable range as unevictable has no +// effect. +func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + return + } + seg := info.ranges.LowerBoundSegment(er.Start) + for seg.Ok() && seg.Start() < er.End { + seg = info.ranges.Isolate(seg, er) + seg = info.ranges.Remove(seg).NextSegment() + } + // We can only remove info if there's no eviction goroutine running on its + // behalf. + if !info.evicting && info.ranges.IsEmpty() { + delete(f.evictable, user) + } +} + +// MarkAllUnevictable informs f that user no longer considers any offsets to be +// evictable. It otherwise has the same semantics as MarkUnevictable. +func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + return + } + info.ranges.RemoveAll() + // We can only remove info if there's no eviction goroutine running on its + // behalf. + if !info.evicting { + delete(f.evictable, user) + } } // UpdateUsage ensures that the memory usage statistics in @@ -889,6 +945,11 @@ func (f *MemoryFile) File() *os.File { return f.file } +// FD implements platform.File.FD. +func (f *MemoryFile) FD() int { + return int(f.file.Fd()) +} + // String implements fmt.Stringer.String. // // Note that because f.String locks f.mu, calling f.String internally @@ -900,6 +961,167 @@ func (f *MemoryFile) String() string { return f.usage.String() } +// runReclaim implements the reclaimer goroutine, which continuously decommits +// reclaimable pages in order to reduce memory usage and make them available +// for allocation. +func (f *MemoryFile) runReclaim() { + for { + fr, ok := f.findReclaimable() + if !ok { + break + } + + if err := f.Decommit(fr); err != nil { + log.Warningf("Reclaim failed to decommit %v: %v", fr, err) + // Zero the pages manually. This won't reduce memory usage, but at + // least ensures that the pages will be zero when reallocated. + f.forEachMappingSlice(fr, func(bs []byte) { + for i := range bs { + bs[i] = 0 + } + }) + // Pretend the pages were decommitted even though they weren't, + // since the memory accounting implementation has no idea how to + // deal with this. + f.markDecommitted(fr) + } + f.markReclaimed(fr) + } + // We only get here if findReclaimable finds f.destroyed set and returns + // false. + f.mu.Lock() + defer f.mu.Unlock() + if !f.destroyed { + panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") + } + f.file.Close() + // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd + // that has possibly been reassigned. 
+ f.file = nil + f.mappingsMu.Lock() + defer f.mappingsMu.Unlock() + mappings := f.mappings.Load().([]uintptr) + for i, m := range mappings { + if m != 0 { + _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0) + if errno != 0 { + log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno) + } + } + } + // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) + f.mappings.Store([]uintptr{}) +} + +func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { + f.mu.Lock() + defer f.mu.Unlock() + for { + for { + if f.destroyed { + return platform.FileRange{}, false + } + if f.reclaimable { + break + } + if f.opts.DelayedEviction == DelayedEvictionEnabled { + // No work to do. Evict any pending evictable allocations to + // get more reclaimable pages before going to sleep. + f.startEvictionsLocked() + } + f.reclaimCond.Wait() + } + // Allocate returns the first usable range in offset order and is + // currently a linear scan, so reclaiming from the beginning of the + // file minimizes the expected latency of Allocate. + for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { + if seg.ValuePtr().refs == 0 { + f.minReclaimablePage = seg.End() + return seg.Range(), true + } + } + // No pages are reclaimable. + f.reclaimable = false + f.minReclaimablePage = maxPage + } +} + +func (f *MemoryFile) markReclaimed(fr platform.FileRange) { + f.mu.Lock() + defer f.mu.Unlock() + seg := f.usage.FindSegment(fr.Start) + // All of fr should be mapped to a single uncommitted reclaimable segment + // accounted to System. + if !seg.Ok() { + panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) + } + if !seg.Range().IsSupersetOf(fr) { + panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) + } + if got, want := seg.Value(), (usageInfo{ + kind: usage.System, + knownCommitted: false, + refs: 0, + }); got != want { + panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) + } + // Deallocate reclaimed pages. Even though all of seg is reclaimable, the + // caller of markReclaimed may not have decommitted it, so we can only mark + // fr as reclaimed. + f.usage.Remove(f.usage.Isolate(seg, fr)) + if fr.Start < f.minUnallocatedPage { + // We've deallocated at least one lower page. + f.minUnallocatedPage = fr.Start + } +} + +// Preconditions: f.mu must be locked. +func (f *MemoryFile) startEvictionsLocked() { + for user, info := range f.evictable { + // Don't start multiple goroutines to evict the same user's + // allocations. + if !info.evicting { + f.startEvictionGoroutineLocked(user, info) + } + } +} + +// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be +// locked. +func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) { + info.evicting = true + f.evictionWG.Add(1) + go func() { // S/R-SAFE: f.evictionWG + defer f.evictionWG.Done() + for { + f.mu.Lock() + info, ok := f.evictable[user] + if !ok { + // This shouldn't happen: only this goroutine is permitted + // to delete this entry. 
+ f.mu.Unlock() + panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user)) + } + if info.ranges.IsEmpty() { + delete(f.evictable, user) + f.mu.Unlock() + return + } + // Evict from the end of info.ranges, under the assumption that + // if ranges in user start being used again (and are + // consequently marked unevictable), such uses are more likely + // to start from the beginning of user. + seg := info.ranges.LastSegment() + er := seg.Range() + info.ranges.Remove(seg) + // user.Evict() must be called without holding f.mu to avoid + // circular lock ordering. + f.mu.Unlock() + user.Evict(context.Background(), er) + } + }() +} + type usageSetFunctions struct{} func (usageSetFunctions) MinKey() uint64 { @@ -920,3 +1142,27 @@ func (usageSetFunctions) Merge(_ platform.FileRange, val1 usageInfo, _ platform. func (usageSetFunctions) Split(_ platform.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { return val, val } + +// evictableRangeSetValue is the value type of evictableRangeSet. +type evictableRangeSetValue struct{} + +type evictableRangeSetFunctions struct{} + +func (evictableRangeSetFunctions) MinKey() uint64 { + return 0 +} + +func (evictableRangeSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) { +} + +func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) { + return evictableRangeSetValue{}, true +} + +func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) { + return evictableRangeSetValue{}, evictableRangeSetValue{} +} diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go index cf169af55..9534d1aed 100644 --- a/pkg/sentry/pgalloc/save_restore.go +++ b/pkg/sentry/pgalloc/save_restore.go @@ -28,6 +28,15 @@ import ( "gvisor.googlesource.com/gvisor/pkg/state" ) +// FlushEvictions blocks until f has finished evicting all evictable +// allocations. +func (f *MemoryFile) FlushEvictions() { + f.mu.Lock() + f.startEvictionsLocked() + f.mu.Unlock() + f.evictionWG.Wait() +} + // SaveTo writes f's state to the given stream. func (f *MemoryFile) SaveTo(w io.Writer) error { // Wait for reclaim. @@ -40,6 +49,11 @@ func (f *MemoryFile) SaveTo(w io.Writer) error { f.mu.Lock() } + // Ensure that there are no pending evictions. + if len(f.evictable) != 0 { + panic(fmt.Sprintf("evictions still pending for %d users; call FlushEvictions before SaveTo", len(f.evictable))) + } + // Ensure that all pages that contain data have knownCommitted set, since // we only store knownCommitted pages below. zeroPage := make([]byte, usermem.PageSize) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0b5be0a42..05122a6a8 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -424,7 +424,7 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return nil, fmt.Errorf("error creating memfd: %v", err) } memfile := os.NewFile(uintptr(memfd), memfileName) - mf, err := pgalloc.NewMemoryFile(memfile) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) if err != nil { memfile.Close() return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) |
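
The new DirtySet.AllowClean in dirty_set.go is the inverse of KeepDirty: it clears the Keep flag so that later writeback can mark the range clean, without itself cleaning anything. A minimal self-contained sketch of what the Keep bit buys, using simplified stand-in types (dirtyInfo, markClean, and allowClean are illustrative names, not the generated segment-set API):

package main

import "fmt"

// dirtyInfo mirrors the value carried by fsutil.DirtySet segments (names
// simplified): Dirty ranges need writeback; Keep pins a range dirty so that
// writeback cannot mark it clean while a writable memory mapping may still
// modify the pages without informing the cache.
type dirtyInfo struct {
	Dirty bool
	Keep  bool
}

// markClean models the effect of a successful writeback on one segment: it
// cannot clean a range pinned by Keep.
func markClean(d *dirtyInfo) {
	if !d.Keep {
		d.Dirty = false
	}
}

// allowClean models DirtySet.AllowClean: it clears Keep but, as the new doc
// comment says, does not itself mark the range clean.
func allowClean(d *dirtyInfo) { d.Keep = false }

func main() {
	d := dirtyInfo{Dirty: true, Keep: true} // a mapped, written-to range
	markClean(&d)
	fmt.Println(d.Dirty) // true: Keep pinned it dirty
	allowClean(&d)       // the last mapping was removed
	markClean(&d)
	fmt.Println(d.Dirty) // false: writeback may now mark it clean
}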
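
The EvictableMemoryUser contract requires Evict to tolerate racing with MarkUnevictable, because the MemoryFile must call Evict without holding its own mutex. The following self-contained sketch models both sides of that protocol with toy types (Range, user, and registry are hypothetical stand-ins, not pgalloc APIs); it also mirrors the eviction goroutine's choice to evict from the end of a user's ranges first:

package main

import (
	"fmt"
	"sync"
)

// Range is a half-open [Start, End) range of byte offsets, standing in for
// pgalloc.EvictableRange.
type Range struct{ Start, End uint64 }

// user models an EvictableMemoryUser with a page cache keyed by range.
type user struct {
	mu        sync.Mutex
	evictable map[Range]bool
	cached    map[Range][]byte
}

// Evict follows the contract documented in the diff: the registry calls it
// without holding the registry lock, so a racing MarkUnevictable may already
// have reclaimed er. Evict therefore re-checks under u.mu and has no effect
// on ranges that are no longer evictable.
func (u *user) Evict(er Range) {
	u.mu.Lock()
	defer u.mu.Unlock()
	if !u.evictable[er] {
		return // lost the race with MarkUnevictable
	}
	delete(u.evictable, er)
	delete(u.cached, er) // a real cache would write back dirty pages first
}

// registry models the MemoryFile side of the protocol.
type registry struct {
	mu      sync.Mutex
	pending map[*user][]Range // sorted by offset
}

// evictAll mirrors the loop in startEvictionGoroutineLocked: pop a range,
// drop the lock, then call Evict, since user.Evict must be called without
// holding registry locks to avoid circular lock ordering.
func (r *registry) evictAll(u *user) {
	for {
		r.mu.Lock()
		ranges := r.pending[u]
		if len(ranges) == 0 {
			delete(r.pending, u)
			r.mu.Unlock()
			return
		}
		// Evict from the end: if the user starts touching its ranges
		// again, such use is more likely to start from the beginning.
		er := ranges[len(ranges)-1]
		r.pending[u] = ranges[:len(ranges)-1]
		r.mu.Unlock()
		u.Evict(er)
	}
}

func main() {
	u := &user{
		evictable: map[Range]bool{{0, 4096}: true, {4096, 8192}: true},
		cached:    map[Range][]byte{{0, 4096}: {}, {4096, 8192}: {}},
	}
	r := &registry{pending: map[*user][]Range{u: {{0, 4096}, {4096, 8192}}}}
	r.evictAll(u)
	fmt.Println("cached ranges left:", len(u.cached)) // 0
}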
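
CachingInodeOperations.Evict only drops pages that have no remaining application mapping, which it finds by walking the gaps of its mappings set within the evictable range. A toy version of that gap walk over a sorted slice (rng and unmappedSubranges are illustrative names, not the generated set's LowerBoundGap API):

package main

import "fmt"

// rng is a half-open [start, end) range of offsets.
type rng struct{ start, end uint64 }

// unmappedSubranges returns the parts of er not covered by any mapping in
// maps (sorted and non-overlapping). It mirrors the gap walk in
// CachingInodeOperations.Evict: only pages with no remaining memory mapping
// may be synced and dropped.
func unmappedSubranges(er rng, maps []rng) []rng {
	var out []rng
	cur := er.start // lowest offset not yet known to be mapped
	for _, m := range maps {
		if m.end <= cur {
			continue // mapping lies entirely before the cursor
		}
		if m.start >= er.end {
			break // mapping lies entirely after er
		}
		if m.start > cur {
			out = append(out, rng{cur, m.start}) // gap before this mapping
		}
		cur = m.end
	}
	if cur < er.end {
		out = append(out, rng{cur, er.end}) // trailing gap
	}
	return out
}

func main() {
	er := rng{10, 50}
	maps := []rng{{0, 15}, {20, 30}, {60, 70}}
	fmt.Println(unmappedSubranges(er, maps)) // [{15 20} {30 50}]
}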
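
Both callers updated by this change pass an empty MemoryFileOpts, which NewMemoryFile normalizes to DelayedEvictionEnabled. A caller that wants evictable allocations evicted as soon as they are marked would opt out explicitly; a sketch against the API as introduced in this diff (newEagerMemoryFile is a hypothetical helper, not part of the change):

package boot

import (
	"fmt"
	"os"

	"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
)

// newEagerMemoryFile is a sketch of createMemoryFile with delayed eviction
// disabled. memfile is an open host file (e.g. a memfd); its ownership
// transfers to the MemoryFile on success.
func newEagerMemoryFile(memfile *os.File) (*pgalloc.MemoryFile, error) {
	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{
		// Evict as soon as ranges are marked evictable, rather than
		// waiting for the reclaimer goroutine to run out of work.
		DelayedEviction: pgalloc.DelayedEvictionDisabled,
	})
	if err != nil {
		memfile.Close()
		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
	}
	return mf, nil
}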
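
The ordering comment added to Kernel.SaveTo is load-bearing: eviction writes dirty cached pages back to their files, so FlushEvictions must run before open-file writes are flushed, and pgalloc's SaveTo now refuses to proceed while evictions are pending. A toy model of that ordering (toyMemoryFile, flushWrites, and checkpoint are stand-ins; the real SaveTo panics rather than returning an error):

package main

import "fmt"

// toyMemoryFile stands in for pgalloc.MemoryFile; only the save ordering is
// modeled.
type toyMemoryFile struct{ pendingEvictions int }

// FlushEvictions evicts everything up front. Eviction writes dirty cached
// pages back to their backing files, i.e. it causes file writes.
func (f *toyMemoryFile) FlushEvictions() { f.pendingEvictions = 0 }

// SaveTo refuses to run with evictions still pending, mirroring the new
// check in pgalloc's SaveTo.
func (f *toyMemoryFile) SaveTo() error {
	if f.pendingEvictions != 0 {
		return fmt.Errorf("evictions still pending for %d users; call FlushEvictions before SaveTo", f.pendingEvictions)
	}
	return nil
}

// flushWrites stands in for tasks.flushWritesToFiles.
func flushWrites() error { return nil }

// checkpoint shows the ordering from Kernel.SaveTo: evict first (which may
// write files), then flush file writes, then save.
func checkpoint(f *toyMemoryFile) error {
	f.FlushEvictions()
	if err := flushWrites(); err != nil {
		return err
	}
	return f.SaveTo()
}

func main() {
	fmt.Println(checkpoint(&toyMemoryFile{pendingEvictions: 3})) // <nil>
}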