author    | Jamie Liu <jamieliu@google.com>  | 2019-04-30 13:55:41 -0700
committer | Shentubot <shentubot@google.com> | 2019-04-30 13:56:41 -0700
commit    | 8bfb83d0acdea553082b897d3fd0ad1c1580eaa9
tree      | a06cd3e3295c1397cce8a234bc15b795b30eb92e
parent    | 81ecd8b6eab7457b331762626f8c210fec3504e6
Implement async MemoryFile eviction, and use it in CachingInodeOperations.
This feature allows MemoryFile to delay eviction of "optional"
allocations, such as unused cached file pages.
Note that this incidentally makes CachingInodeOperations writeback
asynchronous, in the sense that it doesn't occur until eviction; this is
necessary because between when a cached page becomes evictable and when
it's evicted, file writes (via CachingInodeOperations.Write) may dirty
the page.
As currently implemented, this feature won't meaningfully impact
steady-state memory usage or caching; the reclaimer goroutine will
schedule eviction as soon as it runs out of other work to do. Future CLs
will increase caching by adding constraints on when eviction is scheduled.
PiperOrigin-RevId: 246014822
Change-Id: Ia85feb25a2de92a48359eb84434b6ec6f9bea2cb
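
The registration protocol this CL adds to pgalloc (the `EvictableMemoryUser` interface plus `MemoryFile.MarkEvictable` / `MarkUnevictable`) is what `CachingInodeOperations` uses in the diff below, but any holder of regenerable MemoryFile-backed memory could follow the same pattern. The following is a minimal sketch of such a user; `pageBuffer`, `retain`, and `release` are hypothetical names and are not part of this change — only the `pgalloc` and `context` identifiers come from the diff.

```go
package example

import (
	"sync"

	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
)

// pageBuffer is a hypothetical cache of optional, regenerable pages backed by
// a pgalloc.MemoryFile. It only illustrates the registration protocol that
// CachingInodeOperations follows in this CL.
type pageBuffer struct {
	mu sync.Mutex
	mf *pgalloc.MemoryFile
	// Mapping from buffer offsets to allocations in mf elided.
}

// retain marks [start, end) as in use, so the MemoryFile must not evict it.
// (Compare CachingInodeOperations.AddMapping.)
func (b *pageBuffer) retain(start, end uint64) {
	b.mf.MarkUnevictable(b, pgalloc.EvictableRange{Start: start, End: end})
}

// release marks [start, end) as unused; the MemoryFile may now reclaim it by
// calling b.Evict at a time of its choosing. (Compare
// CachingInodeOperations.RemoveMapping.)
func (b *pageBuffer) release(start, end uint64) {
	b.mf.MarkEvictable(b, pgalloc.EvictableRange{Start: start, End: end})
}

// Evict implements pgalloc.EvictableMemoryUser.Evict. pgalloc calls it
// without holding its own locks, so it may race with retain; offsets that
// have been re-marked unevictable must simply be skipped.
func (b *pageBuffer) Evict(ctx context.Context, er pgalloc.EvictableRange) {
	b.mu.Lock()
	defer b.mu.Unlock()
	// Write back and drop any still-evictable pages in er (elided),
	// mirroring CachingInodeOperations.Evict below.
}
```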
-rw-r--r-- | pkg/sentry/context/contexttest/contexttest.go |   2
-rw-r--r-- | pkg/sentry/fs/fsutil/dirty_set.go             |  22
-rw-r--r-- | pkg/sentry/fs/fsutil/inode_cached.go          |  78
-rw-r--r-- | pkg/sentry/fs/fsutil/inode_cached_test.go     |   8
-rw-r--r-- | pkg/sentry/kernel/kernel.go                   |   5
-rw-r--r-- | pkg/sentry/pgalloc/BUILD                      |  27
-rw-r--r-- | pkg/sentry/pgalloc/pgalloc.go                 | 504
-rw-r--r-- | pkg/sentry/pgalloc/save_restore.go            |  14
-rw-r--r-- | runsc/boot/loader.go                          |   2

9 files changed, 511 insertions(+), 151 deletions(-)
diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go index a42038711..210a235d2 100644 --- a/pkg/sentry/context/contexttest/contexttest.go +++ b/pkg/sentry/context/contexttest/contexttest.go @@ -44,7 +44,7 @@ func Context(tb testing.TB) context.Context { tb.Fatalf("error creating application memory file: %v", err) } memfile := os.NewFile(uintptr(memfd), memfileName) - mf, err := pgalloc.NewMemoryFile(memfile) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) if err != nil { memfile.Close() tb.Fatalf("error creating pgalloc.MemoryFile: %v", err) diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index 9cd196d7d..f1451d77a 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -107,6 +107,7 @@ func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) { var changedAny bool defer func() { if changedAny { + // Merge segments split by Isolate to reduce cost of iteration. ds.MergeRange(mr) } }() @@ -132,6 +133,26 @@ func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) { } } +// AllowClean allows MarkClean to mark offsets in mr as not dirty, ending the +// effect of a previous call to KeepDirty. (It does not itself mark those +// offsets as not dirty.) +func (ds *DirtySet) AllowClean(mr memmap.MappableRange) { + var changedAny bool + defer func() { + if changedAny { + // Merge segments split by Isolate to reduce cost of iteration. + ds.MergeRange(mr) + } + }() + for seg := ds.LowerBoundSegment(mr.Start); seg.Ok() && seg.Start() < mr.End; seg = seg.NextSegment() { + if seg.Value().Keep { + changedAny = true + seg = ds.Isolate(seg, mr) + seg.ValuePtr().Keep = false + } + } +} + // SyncDirty passes pages in the range mr that are stored in cache and // identified as dirty to writeAt, updating dirty to reflect successful writes. // If writeAt returns a successful partial write, SyncDirty will call it @@ -142,6 +163,7 @@ func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet var changedDirty bool defer func() { if changedDirty { + // Merge segments split by Isolate to reduce cost of iteration. dirty.MergeRange(mr) } }() diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 919d2534c..76644e69d 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -175,11 +175,22 @@ func (c *CachingInodeOperations) Release() { defer c.mapsMu.Unlock() c.dataMu.Lock() defer c.dataMu.Unlock() - // The cache should be empty (something has gone terribly wrong if we're - // releasing an inode that is still memory-mapped). - if !c.mappings.IsEmpty() || !c.cache.IsEmpty() || !c.dirty.IsEmpty() { - panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s\ncache contents:\n%s\ndirty segments:\n%s", &c.mappings, &c.cache, &c.dirty)) + + // Something has gone terribly wrong if we're releasing an inode that is + // still memory-mapped. + if !c.mappings.IsEmpty() { + panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s", &c.mappings)) + } + + // Drop any cached pages that are still awaiting MemoryFile eviction. (This + // means that MemoryFile no longer needs to evict them.) 
+ mf := c.mfp.MemoryFile() + mf.MarkAllUnevictable(c) + if err := SyncDirtyAll(context.Background(), &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { + panic(fmt.Sprintf("Failed to writeback cached data: %v", err)) } + c.cache.DropAll(mf) + c.dirty.RemoveAll() } // UnstableAttr implements fs.InodeOperations.UnstableAttr. @@ -679,6 +690,13 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error return done, nil } +// useHostPageCache returns true if c uses c.backingFile.FD() for all file I/O +// and memory mappings, and false if c.cache may contain data cached from +// c.backingFile. +func (c *CachingInodeOperations) useHostPageCache() bool { + return !c.forcePageCache && c.backingFile.FD() >= 0 +} + // AddMapping implements memmap.Mappable.AddMapping. func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { // Hot path. Avoid defers. @@ -689,7 +707,15 @@ func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.Mappi for _, r := range mapped { c.hostFileMapper.IncRefOn(r) } - if !usage.IncrementalMappedAccounting && !c.forcePageCache && c.backingFile.FD() >= 0 { + if !c.useHostPageCache() { + // c.Evict() will refuse to evict memory-mapped pages, so tell the + // MemoryFile to not bother trying. + mf := c.mfp.MemoryFile() + for _, r := range mapped { + mf.MarkUnevictable(c, pgalloc.EvictableRange{r.Start, r.End}) + } + } + if c.useHostPageCache() && !usage.IncrementalMappedAccounting { for _, r := range mapped { usage.MemoryAccounting.Inc(r.Length(), usage.Mapped) } @@ -706,7 +732,7 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma for _, r := range unmapped { c.hostFileMapper.DecRefOn(r) } - if !c.forcePageCache && c.backingFile.FD() >= 0 { + if c.useHostPageCache() { if !usage.IncrementalMappedAccounting { for _, r := range unmapped { usage.MemoryAccounting.Dec(r.Length(), usage.Mapped) @@ -716,17 +742,16 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma return } - // Writeback dirty mapped memory now that there are no longer any - // mappings that reference it. This is our naive memory eviction - // strategy. + // Pages that are no longer referenced by any application memory mappings + // are now considered unused; allow MemoryFile to evict them when + // necessary. mf := c.mfp.MemoryFile() c.dataMu.Lock() for _, r := range unmapped { - if err := SyncDirty(ctx, r, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { - log.Warningf("Failed to writeback cached data %v: %v", r, err) - } - c.cache.Drop(r, mf) - c.dirty.KeepClean(r) + // Since these pages are no longer mapped, they are no longer + // concurrently dirtyable by a writable memory mapping. + c.dirty.AllowClean(r) + mf.MarkEvictable(c, pgalloc.EvictableRange{r.Start, r.End}) } c.dataMu.Unlock() c.mapsMu.Unlock() @@ -740,7 +765,7 @@ func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.Mapp // Translate implements memmap.Mappable.Translate. func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { // Hot path. Avoid defer. 
- if !c.forcePageCache && c.backingFile.FD() >= 0 { + if c.useHostPageCache() { return []memmap.Translation{ { Source: optional, @@ -853,6 +878,29 @@ func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error return nil } +// Evict implements pgalloc.EvictableMemoryUser.Evict. +func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.EvictableRange) { + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + c.dataMu.Lock() + defer c.dataMu.Unlock() + + mr := memmap.MappableRange{er.Start, er.End} + mf := c.mfp.MemoryFile() + // Only allow pages that are no longer memory-mapped to be evicted. + for mgap := c.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { + mgapMR := mgap.Range().Intersect(mr) + if mgapMR.Length() == 0 { + continue + } + if err := SyncDirty(ctx, mgapMR, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { + log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) + } + c.cache.Drop(mgapMR, mf) + c.dirty.KeepClean(mgapMR) + } +} + // IncRef implements platform.File.IncRef. This is used when we directly map an // underlying host fd and CachingInodeOperations is used as the platform.File // during translation. diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index 661ec41f6..3f10efc12 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -311,12 +311,10 @@ func TestRead(t *testing.T) { t.Errorf("Read back bytes %v, want %v", rbuf, buf) } - // Delete the memory mapping and expect it to cause the cached page to be - // uncached. + // Delete the memory mapping before iops.Release(). The cached page will + // either be evicted by ctx's pgalloc.MemoryFile, or dropped by + // iops.Release(). iops.RemoveMapping(ctx, ms, ar, usermem.PageSize, true) - if cached := iops.cache.Span(); cached != 0 { - t.Fatalf("Span got %d, want 0", cached) - } } func TestWrite(t *testing.T) { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 0468dd678..91889b573 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -303,7 +303,12 @@ func (k *Kernel) SaveTo(w io.Writer) error { k.pauseTimeLocked() defer k.resumeTimeLocked() + // Evict all evictable MemoryFile allocations. + k.mf.FlushEvictions() + // Flush write operations on open files so data reaches backing storage. + // This must come after k.mf.FlushEvictions() since eviction may cause file + // writes. 
if err := k.tasks.flushWritesToFiles(ctx); err != nil { return err } diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 7efa55c20..8a8a0e4e4 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -4,6 +4,31 @@ load("//tools/go_generics:defs.bzl", "go_template_instance") load("//tools/go_stateify:defs.bzl", "go_library", "go_test") go_template_instance( + name = "evictable_range", + out = "evictable_range.go", + package = "pgalloc", + prefix = "Evictable", + template = "//pkg/segment:generic_range", + types = { + "T": "uint64", + }, +) + +go_template_instance( + name = "evictable_range_set", + out = "evictable_range_set.go", + package = "pgalloc", + prefix = "evictableRange", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "EvictableRange", + "Value": "evictableRangeSetValue", + "Functions": "evictableRangeSetFunctions", + }, +) + +go_template_instance( name = "usage_set", out = "usage_set.go", consts = { @@ -27,6 +52,8 @@ go_library( name = "pgalloc", srcs = [ "context.go", + "evictable_range.go", + "evictable_range_set.go", "pgalloc.go", "pgalloc_unsafe.go", "save_restore.go", diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 411dafa07..9c1313f6f 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -31,6 +31,7 @@ import ( "time" "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/platform" "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" @@ -41,6 +42,9 @@ import ( // MemoryFile is a platform.File whose pages may be allocated to arbitrary // users. type MemoryFile struct { + // opts holds options passed to NewMemoryFile. opts is immutable. + opts MemoryFileOpts + // MemoryFile owns a single backing file, which is modeled as follows: // // Each page in the file can be committed or uncommitted. A page is @@ -115,6 +119,24 @@ type MemoryFile struct { // fileSize is protected by mu. fileSize int64 + // Pages from the backing file are mapped into the local address space on + // the granularity of large pieces called chunks. mappings is a []uintptr + // that stores, for each chunk, the start address of a mapping of that + // chunk in the current process' address space, or 0 if no such mapping + // exists. Once a chunk is mapped, it is never remapped or unmapped until + // the MemoryFile is destroyed. + // + // Mutating the mappings slice or its contents requires both holding + // mappingsMu and using atomic memory operations. (The slice is mutated + // whenever the file is expanded. Per the above, the only permitted + // mutation of the slice's contents is the assignment of a mapping to a + // chunk that was previously unmapped.) Reading the slice or its contents + // only requires *either* holding mappingsMu or using atomic memory + // operations. This allows MemoryFile.MapInternal to avoid locking in the + // common case where chunk mappings already exist. + mappingsMu sync.Mutex + mappings atomic.Value + // destroyed is set by Destroy to instruct the reclaimer goroutine to // release resources and exit. destroyed is protected by mu. destroyed bool @@ -133,26 +155,44 @@ type MemoryFile struct { // transitions from false to true. reclaimCond sync.Cond - // Pages from the backing file are mapped into the local address space on - // the granularity of large pieces called chunks. 
mappings is a []uintptr - // that stores, for each chunk, the start address of a mapping of that - // chunk in the current process' address space, or 0 if no such mapping - // exists. Once a chunk is mapped, it is never remapped or unmapped until - // the MemoryFile is destroyed. + // evictable maps EvictableMemoryUsers to eviction state. // - // Mutating the mappings slice or its contents requires both holding - // mappingsMu and using atomic memory operations. (The slice is mutated - // whenever the file is expanded. Per the above, the only permitted - // mutation of the slice's contents is the assignment of a mapping to a - // chunk that was previously unmapped.) Reading the slice or its contents - // only requires *either* holding mappingsMu or using atomic memory - // operations. This allows MemoryFile.MapInternal to avoid locking in the - // common case where chunk mappings already exist. - mappingsMu sync.Mutex - mappings atomic.Value + // evictable is protected by mu. + evictable map[EvictableMemoryUser]*evictableMemoryUserInfo + + // evictionWG counts the number of goroutines currently performing evictions. + evictionWG sync.WaitGroup +} + +// MemoryFileOpts provides options to NewMemoryFile. +type MemoryFileOpts struct { + // DelayedEviction controls the extent to which the MemoryFile may delay + // eviction of evictable allocations. + DelayedEviction DelayedEvictionType } -// usage tracks usage information. +// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction. +type DelayedEvictionType int + +const ( + // DelayedEvictionDefault has unspecified behavior. + DelayedEvictionDefault DelayedEvictionType = iota + + // DelayedEvictionDisabled requires that evictable allocations are evicted + // as soon as possible. + DelayedEvictionDisabled + + // DelayedEvictionEnabled requests that the MemoryFile delay eviction of + // evictable allocations until doing so is considered necessary to avoid + // performance degradation due to host memory pressure, or OOM kills. + // + // As of this writing, DelayedEvictionEnabled delays evictions until the + // reclaimer goroutine is out of work (pages to reclaim), then evicts all + // pending evictable allocations immediately. + DelayedEvictionEnabled +) + +// usageInfo tracks usage information. // // +stateify savable type usageInfo struct { @@ -166,6 +206,46 @@ type usageInfo struct { refs uint64 } +// An EvictableMemoryUser represents a user of MemoryFile-allocated memory that +// may be asked to deallocate that memory in the presence of memory pressure. +type EvictableMemoryUser interface { + // Evict requests that the EvictableMemoryUser deallocate memory used by + // er, which was registered as evictable by a previous call to + // MemoryFile.MarkEvictable. + // + // Evict is not required to deallocate memory. In particular, since pgalloc + // must call Evict without holding locks to avoid circular lock ordering, + // it is possible that the passed range has already been marked as + // unevictable by a racing call to MemoryFile.MarkUnevictable. + // Implementations of EvictableMemoryUser must detect such races and handle + // them by making Evict have no effect on unevictable ranges. + // + // After a call to Evict, the MemoryFile will consider the evicted range + // unevictable (i.e. it will not call Evict on the same range again) until + // informed otherwise by a subsequent call to MarkEvictable. 
+ Evict(ctx context.Context, er EvictableRange) +} + +// An EvictableRange represents a range of uint64 offsets in an +// EvictableMemoryUser. +// +// In practice, most EvictableMemoryUsers will probably be implementations of +// memmap.Mappable, and EvictableRange therefore corresponds to +// memmap.MappableRange. However, this package cannot depend on the memmap +// package, since doing so would create a circular dependency. +// +// type EvictableRange <generated using go_generics> + +// evictableMemoryUserInfo is the value type of MemoryFile.evictable. +type evictableMemoryUserInfo struct { + // ranges tracks all evictable ranges for the given user. + ranges evictableRangeSet + + // If evicting is true, there is a goroutine currently evicting all + // evictable ranges for this user. + evicting bool +} + const ( chunkShift = 24 chunkSize = 1 << chunkShift // 16 MB @@ -180,7 +260,15 @@ const ( // NewMemoryFile creates a MemoryFile backed by the given file. If // NewMemoryFile succeeds, ownership of file is transferred to the returned // MemoryFile. -func NewMemoryFile(file *os.File) (*MemoryFile, error) { +func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) { + switch opts.DelayedEviction { + case DelayedEvictionDefault: + opts.DelayedEviction = DelayedEvictionEnabled + case DelayedEvictionDisabled, DelayedEvictionEnabled: + default: + return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction) + } + // Truncate the file to 0 bytes first to ensure that it's empty. if err := file.Truncate(0); err != nil { return nil, err @@ -189,14 +277,16 @@ func NewMemoryFile(file *os.File) (*MemoryFile, error) { return nil, err } f := &MemoryFile{ + opts: opts, fileSize: initialSize, file: file, // No pages are reclaimable. DecRef will always be able to // decrease minReclaimablePage from this point. minReclaimablePage: maxPage, + evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo), } - f.reclaimCond.L = &f.mu f.mappings.Store(make([]uintptr, initialSize/chunkSize)) + f.reclaimCond.L = &f.mu go f.runReclaim() // S/R-SAFE: f.mu // The Linux kernel contains an optional feature called "Integrity @@ -434,113 +524,6 @@ func (f *MemoryFile) markDecommitted(fr platform.FileRange) { f.usage.MergeRange(fr) } -// runReclaim implements the reclaimer goroutine, which continuously decommits -// reclaimable pages in order to reduce memory usage and make them available -// for allocation. -func (f *MemoryFile) runReclaim() { - for { - fr, ok := f.findReclaimable() - if !ok { - break - } - - if err := f.Decommit(fr); err != nil { - log.Warningf("Reclaim failed to decommit %v: %v", fr, err) - // Zero the pages manually. This won't reduce memory usage, but at - // least ensures that the pages will be zero when reallocated. - f.forEachMappingSlice(fr, func(bs []byte) { - for i := range bs { - bs[i] = 0 - } - }) - // Pretend the pages were decommitted even though they weren't, - // since the memory accounting implementation has no idea how to - // deal with this. - f.markDecommitted(fr) - } - f.markReclaimed(fr) - } - // We only get here if findReclaimable finds f.destroyed set and returns - // false. - f.mu.Lock() - defer f.mu.Unlock() - if !f.destroyed { - panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") - } - f.file.Close() - // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd - // that has possibly been reassigned. 
- f.file = nil - mappings := f.mappings.Load().([]uintptr) - for i, m := range mappings { - if m != 0 { - _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0) - if errno != 0 { - log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno) - } - } - } - // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) - f.mappings.Store([]uintptr{}) -} - -func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { - f.mu.Lock() - defer f.mu.Unlock() - for { - for { - if f.destroyed { - return platform.FileRange{}, false - } - if f.reclaimable { - break - } - f.reclaimCond.Wait() - } - // Allocate returns the first usable range in offset order and is - // currently a linear scan, so reclaiming from the beginning of the - // file minimizes the expected latency of Allocate. - for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { - if seg.ValuePtr().refs == 0 { - f.minReclaimablePage = seg.End() - return seg.Range(), true - } - } - // No pages are reclaimable. - f.reclaimable = false - f.minReclaimablePage = maxPage - } -} - -func (f *MemoryFile) markReclaimed(fr platform.FileRange) { - f.mu.Lock() - defer f.mu.Unlock() - seg := f.usage.FindSegment(fr.Start) - // All of fr should be mapped to a single uncommitted reclaimable segment - // accounted to System. - if !seg.Ok() { - panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) - } - if !seg.Range().IsSupersetOf(fr) { - panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) - } - if got, want := seg.Value(), (usageInfo{ - kind: usage.System, - knownCommitted: false, - refs: 0, - }); got != want { - panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) - } - // Deallocate reclaimed pages. Even though all of seg is reclaimable, the - // caller of markReclaimed may not have decommitted it, so we can only mark - // fr as reclaimed. - f.usage.Remove(f.usage.Isolate(seg, fr)) - if fr.Start < f.minUnallocatedPage { - // We've deallocated at least one lower page. - f.minUnallocatedPage = fr.Start - } -} - // IncRef implements platform.File.IncRef. func (f *MemoryFile) IncRef(fr platform.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { @@ -677,9 +660,82 @@ func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) { return mappings, m, nil } -// FD implements platform.File.FD. -func (f *MemoryFile) FD() int { - return int(f.file.Fd()) +// MarkEvictable allows f to request memory deallocation by calling +// user.Evict(er) in the future. +// +// Redundantly marking an already-evictable range as evictable has no effect. +func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + info = &evictableMemoryUserInfo{} + f.evictable[user] = info + } + gap := info.ranges.LowerBoundGap(er.Start) + for gap.Ok() && gap.Start() < er.End { + gapER := gap.Range().Intersect(er) + if gapER.Length() == 0 { + gap = gap.NextGap() + continue + } + gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap() + } + if !info.evicting { + switch f.opts.DelayedEviction { + case DelayedEvictionDisabled: + // Kick off eviction immediately. 
+ f.startEvictionGoroutineLocked(user, info) + case DelayedEvictionEnabled: + // Ensure that the reclaimer goroutine is running, so that it can + // start eviction when necessary. + f.reclaimCond.Signal() + } + } +} + +// MarkUnevictable informs f that user no longer considers er to be evictable, +// so the MemoryFile should no longer call user.Evict(er). Note that, per +// EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be +// called even after MarkUnevictable returns due to race conditions, and +// implementations of EvictableMemoryUser must handle this possibility. +// +// Redundantly marking an already-unevictable range as unevictable has no +// effect. +func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + return + } + seg := info.ranges.LowerBoundSegment(er.Start) + for seg.Ok() && seg.Start() < er.End { + seg = info.ranges.Isolate(seg, er) + seg = info.ranges.Remove(seg).NextSegment() + } + // We can only remove info if there's no eviction goroutine running on its + // behalf. + if !info.evicting && info.ranges.IsEmpty() { + delete(f.evictable, user) + } +} + +// MarkAllUnevictable informs f that user no longer considers any offsets to be +// evictable. It otherwise has the same semantics as MarkUnevictable. +func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + return + } + info.ranges.RemoveAll() + // We can only remove info if there's no eviction goroutine running on its + // behalf. + if !info.evicting { + delete(f.evictable, user) + } } // UpdateUsage ensures that the memory usage statistics in @@ -889,6 +945,11 @@ func (f *MemoryFile) File() *os.File { return f.file } +// FD implements platform.File.FD. +func (f *MemoryFile) FD() int { + return int(f.file.Fd()) +} + // String implements fmt.Stringer.String. // // Note that because f.String locks f.mu, calling f.String internally @@ -900,6 +961,167 @@ func (f *MemoryFile) String() string { return f.usage.String() } +// runReclaim implements the reclaimer goroutine, which continuously decommits +// reclaimable pages in order to reduce memory usage and make them available +// for allocation. +func (f *MemoryFile) runReclaim() { + for { + fr, ok := f.findReclaimable() + if !ok { + break + } + + if err := f.Decommit(fr); err != nil { + log.Warningf("Reclaim failed to decommit %v: %v", fr, err) + // Zero the pages manually. This won't reduce memory usage, but at + // least ensures that the pages will be zero when reallocated. + f.forEachMappingSlice(fr, func(bs []byte) { + for i := range bs { + bs[i] = 0 + } + }) + // Pretend the pages were decommitted even though they weren't, + // since the memory accounting implementation has no idea how to + // deal with this. + f.markDecommitted(fr) + } + f.markReclaimed(fr) + } + // We only get here if findReclaimable finds f.destroyed set and returns + // false. + f.mu.Lock() + defer f.mu.Unlock() + if !f.destroyed { + panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") + } + f.file.Close() + // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd + // that has possibly been reassigned. 
+ f.file = nil + f.mappingsMu.Lock() + defer f.mappingsMu.Unlock() + mappings := f.mappings.Load().([]uintptr) + for i, m := range mappings { + if m != 0 { + _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0) + if errno != 0 { + log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno) + } + } + } + // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) + f.mappings.Store([]uintptr{}) +} + +func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { + f.mu.Lock() + defer f.mu.Unlock() + for { + for { + if f.destroyed { + return platform.FileRange{}, false + } + if f.reclaimable { + break + } + if f.opts.DelayedEviction == DelayedEvictionEnabled { + // No work to do. Evict any pending evictable allocations to + // get more reclaimable pages before going to sleep. + f.startEvictionsLocked() + } + f.reclaimCond.Wait() + } + // Allocate returns the first usable range in offset order and is + // currently a linear scan, so reclaiming from the beginning of the + // file minimizes the expected latency of Allocate. + for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { + if seg.ValuePtr().refs == 0 { + f.minReclaimablePage = seg.End() + return seg.Range(), true + } + } + // No pages are reclaimable. + f.reclaimable = false + f.minReclaimablePage = maxPage + } +} + +func (f *MemoryFile) markReclaimed(fr platform.FileRange) { + f.mu.Lock() + defer f.mu.Unlock() + seg := f.usage.FindSegment(fr.Start) + // All of fr should be mapped to a single uncommitted reclaimable segment + // accounted to System. + if !seg.Ok() { + panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) + } + if !seg.Range().IsSupersetOf(fr) { + panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) + } + if got, want := seg.Value(), (usageInfo{ + kind: usage.System, + knownCommitted: false, + refs: 0, + }); got != want { + panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) + } + // Deallocate reclaimed pages. Even though all of seg is reclaimable, the + // caller of markReclaimed may not have decommitted it, so we can only mark + // fr as reclaimed. + f.usage.Remove(f.usage.Isolate(seg, fr)) + if fr.Start < f.minUnallocatedPage { + // We've deallocated at least one lower page. + f.minUnallocatedPage = fr.Start + } +} + +// Preconditions: f.mu must be locked. +func (f *MemoryFile) startEvictionsLocked() { + for user, info := range f.evictable { + // Don't start multiple goroutines to evict the same user's + // allocations. + if !info.evicting { + f.startEvictionGoroutineLocked(user, info) + } + } +} + +// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be +// locked. +func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) { + info.evicting = true + f.evictionWG.Add(1) + go func() { // S/R-SAFE: f.evictionWG + defer f.evictionWG.Done() + for { + f.mu.Lock() + info, ok := f.evictable[user] + if !ok { + // This shouldn't happen: only this goroutine is permitted + // to delete this entry. 
+ f.mu.Unlock() + panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user)) + } + if info.ranges.IsEmpty() { + delete(f.evictable, user) + f.mu.Unlock() + return + } + // Evict from the end of info.ranges, under the assumption that + // if ranges in user start being used again (and are + // consequently marked unevictable), such uses are more likely + // to start from the beginning of user. + seg := info.ranges.LastSegment() + er := seg.Range() + info.ranges.Remove(seg) + // user.Evict() must be called without holding f.mu to avoid + // circular lock ordering. + f.mu.Unlock() + user.Evict(context.Background(), er) + } + }() +} + type usageSetFunctions struct{} func (usageSetFunctions) MinKey() uint64 { @@ -920,3 +1142,27 @@ func (usageSetFunctions) Merge(_ platform.FileRange, val1 usageInfo, _ platform. func (usageSetFunctions) Split(_ platform.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { return val, val } + +// evictableRangeSetValue is the value type of evictableRangeSet. +type evictableRangeSetValue struct{} + +type evictableRangeSetFunctions struct{} + +func (evictableRangeSetFunctions) MinKey() uint64 { + return 0 +} + +func (evictableRangeSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) { +} + +func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) { + return evictableRangeSetValue{}, true +} + +func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) { + return evictableRangeSetValue{}, evictableRangeSetValue{} +} diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go index cf169af55..9534d1aed 100644 --- a/pkg/sentry/pgalloc/save_restore.go +++ b/pkg/sentry/pgalloc/save_restore.go @@ -28,6 +28,15 @@ import ( "gvisor.googlesource.com/gvisor/pkg/state" ) +// FlushEvictions blocks until f has finished evicting all evictable +// allocations. +func (f *MemoryFile) FlushEvictions() { + f.mu.Lock() + f.startEvictionsLocked() + f.mu.Unlock() + f.evictionWG.Wait() +} + // SaveTo writes f's state to the given stream. func (f *MemoryFile) SaveTo(w io.Writer) error { // Wait for reclaim. @@ -40,6 +49,11 @@ func (f *MemoryFile) SaveTo(w io.Writer) error { f.mu.Lock() } + // Ensure that there are no pending evictions. + if len(f.evictable) != 0 { + panic(fmt.Sprintf("evictions still pending for %d users; call FlushEvictions before SaveTo", len(f.evictable))) + } + // Ensure that all pages that contain data have knownCommitted set, since // we only store knownCommitted pages below. zeroPage := make([]byte, usermem.PageSize) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0b5be0a42..05122a6a8 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -424,7 +424,7 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) { return nil, fmt.Errorf("error creating memfd: %v", err) } memfile := os.NewFile(uintptr(memfd), memfileName) - mf, err := pgalloc.NewMemoryFile(memfile) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) if err != nil { memfile.Close() return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) |
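
For callers, the visible API changes are the extra `MemoryFileOpts` argument to `pgalloc.NewMemoryFile` (a zero value selects `DelayedEvictionDefault`, which this CL maps to `DelayedEvictionEnabled`) and the requirement that pending evictions be flushed before `SaveTo`, as `kernel.SaveTo` now does. The sketch below shows both; `newMemoryFile` and `saveMemoryFile` are hypothetical helper names, not functions added by this CL.

```go
package example

import (
	"fmt"
	"io"
	"os"

	"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
)

// newMemoryFile constructs a MemoryFile with the updated signature. Passing
// DelayedEvictionDisabled restores the pre-CL behavior of evicting ranges as
// soon as possible instead of waiting for the reclaimer to run out of work.
func newMemoryFile(memfile *os.File, delayEviction bool) (*pgalloc.MemoryFile, error) {
	opts := pgalloc.MemoryFileOpts{} // DelayedEvictionDefault == DelayedEvictionEnabled.
	if !delayEviction {
		opts.DelayedEviction = pgalloc.DelayedEvictionDisabled
	}
	mf, err := pgalloc.NewMemoryFile(memfile, opts)
	if err != nil {
		memfile.Close()
		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
	}
	return mf, nil
}

// saveMemoryFile shows the save-time ordering this CL enforces: evictable
// allocations must be flushed (which may trigger writeback) before SaveTo,
// which panics if evictions are still pending.
func saveMemoryFile(mf *pgalloc.MemoryFile, w io.Writer) error {
	mf.FlushEvictions()
	return mf.SaveTo(w)
}
```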