-rw-r--r--  pkg/sentry/context/contexttest/contexttest.go  |   2
-rw-r--r--  pkg/sentry/fs/fsutil/dirty_set.go              |  22
-rw-r--r--  pkg/sentry/fs/fsutil/inode_cached.go           |  78
-rw-r--r--  pkg/sentry/fs/fsutil/inode_cached_test.go      |   8
-rw-r--r--  pkg/sentry/kernel/kernel.go                    |   5
-rw-r--r--  pkg/sentry/pgalloc/BUILD                       |  27
-rw-r--r--  pkg/sentry/pgalloc/pgalloc.go                  | 504
-rw-r--r--  pkg/sentry/pgalloc/save_restore.go             |  14
-rw-r--r--  runsc/boot/loader.go                           |   2
9 files changed, 511 insertions, 151 deletions
diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go
index a42038711..210a235d2 100644
--- a/pkg/sentry/context/contexttest/contexttest.go
+++ b/pkg/sentry/context/contexttest/contexttest.go
@@ -44,7 +44,7 @@ func Context(tb testing.TB) context.Context {
tb.Fatalf("error creating application memory file: %v", err)
}
memfile := os.NewFile(uintptr(memfd), memfileName)
- mf, err := pgalloc.NewMemoryFile(memfile)
+ mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
if err != nil {
memfile.Close()
tb.Fatalf("error creating pgalloc.MemoryFile: %v", err)
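The NewMemoryFile signature change above is mechanical for existing callers, which simply pass the zero MemoryFileOpts (as contexttest here and loader.go below do). A minimal sketch of a caller that instead opts out of delayed eviction follows; the helper name and the temp-file backing are illustrative only, not part of this change:

// Sketch only: NewMemoryFile accepts any empty *os.File as backing storage;
// real callers use a memfd as above.
package sketch

import (
	"fmt"
	"io/ioutil"

	"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
)

func newTestMemoryFile() (*pgalloc.MemoryFile, error) {
	backing, err := ioutil.TempFile("", "memory-file")
	if err != nil {
		return nil, fmt.Errorf("error creating backing file: %v", err)
	}
	// The zero MemoryFileOpts selects DelayedEvictionDefault, which
	// NewMemoryFile resolves to DelayedEvictionEnabled. Setting
	// DelayedEvictionDisabled makes evictable allocations be evicted as soon
	// as they are marked.
	mf, err := pgalloc.NewMemoryFile(backing, pgalloc.MemoryFileOpts{
		DelayedEviction: pgalloc.DelayedEvictionDisabled,
	})
	if err != nil {
		backing.Close()
		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
	}
	return mf, nil
}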
diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go
index 9cd196d7d..f1451d77a 100644
--- a/pkg/sentry/fs/fsutil/dirty_set.go
+++ b/pkg/sentry/fs/fsutil/dirty_set.go
@@ -107,6 +107,7 @@ func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) {
var changedAny bool
defer func() {
if changedAny {
+ // Merge segments split by Isolate to reduce cost of iteration.
ds.MergeRange(mr)
}
}()
@@ -132,6 +133,26 @@ func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) {
}
}
+// AllowClean allows MarkClean to mark offsets in mr as not dirty, ending the
+// effect of a previous call to KeepDirty. (It does not itself mark those
+// offsets as not dirty.)
+func (ds *DirtySet) AllowClean(mr memmap.MappableRange) {
+ var changedAny bool
+ defer func() {
+ if changedAny {
+ // Merge segments split by Isolate to reduce cost of iteration.
+ ds.MergeRange(mr)
+ }
+ }()
+ for seg := ds.LowerBoundSegment(mr.Start); seg.Ok() && seg.Start() < mr.End; seg = seg.NextSegment() {
+ if seg.Value().Keep {
+ changedAny = true
+ seg = ds.Isolate(seg, mr)
+ seg.ValuePtr().Keep = false
+ }
+ }
+}
+
// SyncDirty passes pages in the range mr that are stored in cache and
// identified as dirty to writeAt, updating dirty to reflect successful writes.
// If writeAt returns a successful partial write, SyncDirty will call it
@@ -142,6 +163,7 @@ func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet
var changedDirty bool
defer func() {
if changedDirty {
+ // Merge segments split by Isolate to reduce cost of iteration.
dirty.MergeRange(mr)
}
}()
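AllowClean is the new counterpart to KeepDirty: it ends the pinning effect without itself clearing the dirty state. A small illustrative sketch (not part of this change) of how the two are intended to pair around a writable mapping's lifetime:

package sketch

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
)

func pinWhileMapped(dirty *fsutil.DirtySet, mr memmap.MappableRange) {
	// While a writable mapping of mr exists, keep it pinned dirty so that a
	// concurrent writeback cannot race with application writes through the
	// mapping.
	dirty.KeepDirty(mr)
}

func unpinAfterUnmap(dirty *fsutil.DirtySet, mr memmap.MappableRange) {
	// The last writable mapping of mr is gone: end the effect of KeepDirty.
	// This does not mark mr clean by itself; a later successful SyncDirty
	// over mr (or a KeepClean when the cache is dropped) may do so.
	dirty.AllowClean(mr)
}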
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index 919d2534c..76644e69d 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -175,11 +175,22 @@ func (c *CachingInodeOperations) Release() {
defer c.mapsMu.Unlock()
c.dataMu.Lock()
defer c.dataMu.Unlock()
- // The cache should be empty (something has gone terribly wrong if we're
- // releasing an inode that is still memory-mapped).
- if !c.mappings.IsEmpty() || !c.cache.IsEmpty() || !c.dirty.IsEmpty() {
- panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s\ncache contents:\n%s\ndirty segments:\n%s", &c.mappings, &c.cache, &c.dirty))
+
+ // Something has gone terribly wrong if we're releasing an inode that is
+ // still memory-mapped.
+ if !c.mappings.IsEmpty() {
+ panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s", &c.mappings))
+ }
+
+ // Drop any cached pages that are still awaiting MemoryFile eviction. (This
+ // means that MemoryFile no longer needs to evict them.)
+ mf := c.mfp.MemoryFile()
+ mf.MarkAllUnevictable(c)
+ if err := SyncDirtyAll(context.Background(), &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
+ panic(fmt.Sprintf("Failed to writeback cached data: %v", err))
}
+ c.cache.DropAll(mf)
+ c.dirty.RemoveAll()
}
// UnstableAttr implements fs.InodeOperations.UnstableAttr.
@@ -679,6 +690,13 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error
return done, nil
}
+// useHostPageCache returns true if c uses c.backingFile.FD() for all file I/O
+// and memory mappings, and false if c.cache may contain data cached from
+// c.backingFile.
+func (c *CachingInodeOperations) useHostPageCache() bool {
+ return !c.forcePageCache && c.backingFile.FD() >= 0
+}
+
// AddMapping implements memmap.Mappable.AddMapping.
func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
// Hot path. Avoid defers.
@@ -689,7 +707,15 @@ func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.Mappi
for _, r := range mapped {
c.hostFileMapper.IncRefOn(r)
}
- if !usage.IncrementalMappedAccounting && !c.forcePageCache && c.backingFile.FD() >= 0 {
+ if !c.useHostPageCache() {
+ // c.Evict() will refuse to evict memory-mapped pages, so tell the
+ // MemoryFile to not bother trying.
+ mf := c.mfp.MemoryFile()
+ for _, r := range mapped {
+ mf.MarkUnevictable(c, pgalloc.EvictableRange{r.Start, r.End})
+ }
+ }
+ if c.useHostPageCache() && !usage.IncrementalMappedAccounting {
for _, r := range mapped {
usage.MemoryAccounting.Inc(r.Length(), usage.Mapped)
}
@@ -706,7 +732,7 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma
for _, r := range unmapped {
c.hostFileMapper.DecRefOn(r)
}
- if !c.forcePageCache && c.backingFile.FD() >= 0 {
+ if c.useHostPageCache() {
if !usage.IncrementalMappedAccounting {
for _, r := range unmapped {
usage.MemoryAccounting.Dec(r.Length(), usage.Mapped)
@@ -716,17 +742,16 @@ func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.Ma
return
}
- // Writeback dirty mapped memory now that there are no longer any
- // mappings that reference it. This is our naive memory eviction
- // strategy.
+ // Pages that are no longer referenced by any application memory mappings
+ // are now considered unused; allow MemoryFile to evict them when
+ // necessary.
mf := c.mfp.MemoryFile()
c.dataMu.Lock()
for _, r := range unmapped {
- if err := SyncDirty(ctx, r, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
- log.Warningf("Failed to writeback cached data %v: %v", r, err)
- }
- c.cache.Drop(r, mf)
- c.dirty.KeepClean(r)
+ // Since these pages are no longer mapped, they are no longer
+ // concurrently dirtyable by a writable memory mapping.
+ c.dirty.AllowClean(r)
+ mf.MarkEvictable(c, pgalloc.EvictableRange{r.Start, r.End})
}
c.dataMu.Unlock()
c.mapsMu.Unlock()
@@ -740,7 +765,7 @@ func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.Mapp
// Translate implements memmap.Mappable.Translate.
func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
// Hot path. Avoid defer.
- if !c.forcePageCache && c.backingFile.FD() >= 0 {
+ if c.useHostPageCache() {
return []memmap.Translation{
{
Source: optional,
@@ -853,6 +878,29 @@ func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error
return nil
}
+// Evict implements pgalloc.EvictableMemoryUser.Evict.
+func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.EvictableRange) {
+ c.mapsMu.Lock()
+ defer c.mapsMu.Unlock()
+ c.dataMu.Lock()
+ defer c.dataMu.Unlock()
+
+ mr := memmap.MappableRange{er.Start, er.End}
+ mf := c.mfp.MemoryFile()
+ // Only allow pages that are no longer memory-mapped to be evicted.
+ for mgap := c.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
+ mgapMR := mgap.Range().Intersect(mr)
+ if mgapMR.Length() == 0 {
+ continue
+ }
+ if err := SyncDirty(ctx, mgapMR, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
+ log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
+ }
+ c.cache.Drop(mgapMR, mf)
+ c.dirty.KeepClean(mgapMR)
+ }
+}
+
// IncRef implements platform.File.IncRef. This is used when we directly map an
// underlying host fd and CachingInodeOperations is used as the platform.File
// during translation.
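Taken together, the changes above replace eager writeback on unmap with deferred, MemoryFile-driven eviction for inodes that do not use the host page cache. A condensed, hypothetical caller sketch of that lifecycle (the helper is illustrative, not part of this change):

package sketch

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// mapUnmapCycle exercises the methods above. AddMapping marks the mapped
// offsets unevictable, so the MemoryFile will not call Evict on them while
// they are mapped; RemoveMapping re-marks them evictable instead of writing
// them back eagerly, leaving eviction timing to the MemoryFile.
func mapUnmapCycle(ctx context.Context, c *fsutil.CachingInodeOperations, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error {
	if err := c.AddMapping(ctx, ms, ar, offset, true); err != nil {
		return err
	}
	// ... the application reads and writes through the mapping; dirty pages
	// stay pinned dirty for as long as the writable mapping exists ...
	c.RemoveMapping(ctx, ms, ar, offset, true)
	// c.Evict may now run at any time for these offsets, writing back dirty
	// pages and dropping them from the cache, unless they are mapped again.
	return nil
}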
diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go
index 661ec41f6..3f10efc12 100644
--- a/pkg/sentry/fs/fsutil/inode_cached_test.go
+++ b/pkg/sentry/fs/fsutil/inode_cached_test.go
@@ -311,12 +311,10 @@ func TestRead(t *testing.T) {
t.Errorf("Read back bytes %v, want %v", rbuf, buf)
}
- // Delete the memory mapping and expect it to cause the cached page to be
- // uncached.
+ // Delete the memory mapping before iops.Release(). The cached page will
+ // either be evicted by ctx's pgalloc.MemoryFile, or dropped by
+ // iops.Release().
iops.RemoveMapping(ctx, ms, ar, usermem.PageSize, true)
- if cached := iops.cache.Span(); cached != 0 {
- t.Fatalf("Span got %d, want 0", cached)
- }
}
func TestWrite(t *testing.T) {
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 0468dd678..91889b573 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -303,7 +303,12 @@ func (k *Kernel) SaveTo(w io.Writer) error {
k.pauseTimeLocked()
defer k.resumeTimeLocked()
+ // Evict all evictable MemoryFile allocations.
+ k.mf.FlushEvictions()
+
// Flush write operations on open files so data reaches backing storage.
+ // This must come after k.mf.FlushEvictions() since eviction may cause file
+ // writes.
if err := k.tasks.flushWritesToFiles(ctx); err != nil {
return err
}
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index 7efa55c20..8a8a0e4e4 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -4,6 +4,31 @@ load("//tools/go_generics:defs.bzl", "go_template_instance")
load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
go_template_instance(
+ name = "evictable_range",
+ out = "evictable_range.go",
+ package = "pgalloc",
+ prefix = "Evictable",
+ template = "//pkg/segment:generic_range",
+ types = {
+ "T": "uint64",
+ },
+)
+
+go_template_instance(
+ name = "evictable_range_set",
+ out = "evictable_range_set.go",
+ package = "pgalloc",
+ prefix = "evictableRange",
+ template = "//pkg/segment:generic_set",
+ types = {
+ "Key": "uint64",
+ "Range": "EvictableRange",
+ "Value": "evictableRangeSetValue",
+ "Functions": "evictableRangeSetFunctions",
+ },
+)
+
+go_template_instance(
name = "usage_set",
out = "usage_set.go",
consts = {
@@ -27,6 +52,8 @@ go_library(
name = "pgalloc",
srcs = [
"context.go",
+ "evictable_range.go",
+ "evictable_range_set.go",
"pgalloc.go",
"pgalloc_unsafe.go",
"save_restore.go",
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 411dafa07..9c1313f6f 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -31,6 +31,7 @@ import (
"time"
"gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
@@ -41,6 +42,9 @@ import (
// MemoryFile is a platform.File whose pages may be allocated to arbitrary
// users.
type MemoryFile struct {
+ // opts holds options passed to NewMemoryFile. opts is immutable.
+ opts MemoryFileOpts
+
// MemoryFile owns a single backing file, which is modeled as follows:
//
// Each page in the file can be committed or uncommitted. A page is
@@ -115,6 +119,24 @@ type MemoryFile struct {
// fileSize is protected by mu.
fileSize int64
+ // Pages from the backing file are mapped into the local address space on
+ // the granularity of large pieces called chunks. mappings is a []uintptr
+ // that stores, for each chunk, the start address of a mapping of that
+ // chunk in the current process' address space, or 0 if no such mapping
+ // exists. Once a chunk is mapped, it is never remapped or unmapped until
+ // the MemoryFile is destroyed.
+ //
+ // Mutating the mappings slice or its contents requires both holding
+ // mappingsMu and using atomic memory operations. (The slice is mutated
+ // whenever the file is expanded. Per the above, the only permitted
+ // mutation of the slice's contents is the assignment of a mapping to a
+ // chunk that was previously unmapped.) Reading the slice or its contents
+ // only requires *either* holding mappingsMu or using atomic memory
+ // operations. This allows MemoryFile.MapInternal to avoid locking in the
+ // common case where chunk mappings already exist.
+ mappingsMu sync.Mutex
+ mappings atomic.Value
+
// destroyed is set by Destroy to instruct the reclaimer goroutine to
// release resources and exit. destroyed is protected by mu.
destroyed bool
@@ -133,26 +155,44 @@ type MemoryFile struct {
// transitions from false to true.
reclaimCond sync.Cond
- // Pages from the backing file are mapped into the local address space on
- // the granularity of large pieces called chunks. mappings is a []uintptr
- // that stores, for each chunk, the start address of a mapping of that
- // chunk in the current process' address space, or 0 if no such mapping
- // exists. Once a chunk is mapped, it is never remapped or unmapped until
- // the MemoryFile is destroyed.
+ // evictable maps EvictableMemoryUsers to eviction state.
//
- // Mutating the mappings slice or its contents requires both holding
- // mappingsMu and using atomic memory operations. (The slice is mutated
- // whenever the file is expanded. Per the above, the only permitted
- // mutation of the slice's contents is the assignment of a mapping to a
- // chunk that was previously unmapped.) Reading the slice or its contents
- // only requires *either* holding mappingsMu or using atomic memory
- // operations. This allows MemoryFile.MapInternal to avoid locking in the
- // common case where chunk mappings already exist.
- mappingsMu sync.Mutex
- mappings atomic.Value
+ // evictable is protected by mu.
+ evictable map[EvictableMemoryUser]*evictableMemoryUserInfo
+
+ // evictionWG counts the number of goroutines currently performing evictions.
+ evictionWG sync.WaitGroup
+}
+
+// MemoryFileOpts provides options to NewMemoryFile.
+type MemoryFileOpts struct {
+ // DelayedEviction controls the extent to which the MemoryFile may delay
+ // eviction of evictable allocations.
+ DelayedEviction DelayedEvictionType
}
-// usage tracks usage information.
+// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction.
+type DelayedEvictionType int
+
+const (
+ // DelayedEvictionDefault has unspecified behavior.
+ DelayedEvictionDefault DelayedEvictionType = iota
+
+ // DelayedEvictionDisabled requires that evictable allocations are evicted
+ // as soon as possible.
+ DelayedEvictionDisabled
+
+ // DelayedEvictionEnabled requests that the MemoryFile delay eviction of
+ // evictable allocations until doing so is considered necessary to avoid
+ // performance degradation due to host memory pressure, or OOM kills.
+ //
+ // As of this writing, DelayedEvictionEnabled delays evictions until the
+ // reclaimer goroutine is out of work (pages to reclaim), then evicts all
+ // pending evictable allocations immediately.
+ DelayedEvictionEnabled
+)
+
+// usageInfo tracks usage information.
//
// +stateify savable
type usageInfo struct {
@@ -166,6 +206,46 @@ type usageInfo struct {
refs uint64
}
+// An EvictableMemoryUser represents a user of MemoryFile-allocated memory that
+// may be asked to deallocate that memory in the presence of memory pressure.
+type EvictableMemoryUser interface {
+ // Evict requests that the EvictableMemoryUser deallocate memory used by
+ // er, which was registered as evictable by a previous call to
+ // MemoryFile.MarkEvictable.
+ //
+ // Evict is not required to deallocate memory. In particular, since pgalloc
+ // must call Evict without holding locks to avoid circular lock ordering,
+ // it is possible that the passed range has already been marked as
+ // unevictable by a racing call to MemoryFile.MarkUnevictable.
+ // Implementations of EvictableMemoryUser must detect such races and handle
+ // them by making Evict have no effect on unevictable ranges.
+ //
+ // After a call to Evict, the MemoryFile will consider the evicted range
+ // unevictable (i.e. it will not call Evict on the same range again) until
+ // informed otherwise by a subsequent call to MarkEvictable.
+ Evict(ctx context.Context, er EvictableRange)
+}
+
+// An EvictableRange represents a range of uint64 offsets in an
+// EvictableMemoryUser.
+//
+// In practice, most EvictableMemoryUsers will probably be implementations of
+// memmap.Mappable, and EvictableRange therefore corresponds to
+// memmap.MappableRange. However, this package cannot depend on the memmap
+// package, since doing so would create a circular dependency.
+//
+// type EvictableRange <generated using go_generics>
+
+// evictableMemoryUserInfo is the value type of MemoryFile.evictable.
+type evictableMemoryUserInfo struct {
+ // ranges tracks all evictable ranges for the given user.
+ ranges evictableRangeSet
+
+ // If evicting is true, there is a goroutine currently evicting all
+ // evictable ranges for this user.
+ evicting bool
+}
+
const (
chunkShift = 24
chunkSize = 1 << chunkShift // 16 MB
@@ -180,7 +260,15 @@ const (
// NewMemoryFile creates a MemoryFile backed by the given file. If
// NewMemoryFile succeeds, ownership of file is transferred to the returned
// MemoryFile.
-func NewMemoryFile(file *os.File) (*MemoryFile, error) {
+func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
+ switch opts.DelayedEviction {
+ case DelayedEvictionDefault:
+ opts.DelayedEviction = DelayedEvictionEnabled
+ case DelayedEvictionDisabled, DelayedEvictionEnabled:
+ default:
+ return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction)
+ }
+
// Truncate the file to 0 bytes first to ensure that it's empty.
if err := file.Truncate(0); err != nil {
return nil, err
@@ -189,14 +277,16 @@ func NewMemoryFile(file *os.File) (*MemoryFile, error) {
return nil, err
}
f := &MemoryFile{
+ opts: opts,
fileSize: initialSize,
file: file,
// No pages are reclaimable. DecRef will always be able to
// decrease minReclaimablePage from this point.
minReclaimablePage: maxPage,
+ evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
}
- f.reclaimCond.L = &f.mu
f.mappings.Store(make([]uintptr, initialSize/chunkSize))
+ f.reclaimCond.L = &f.mu
go f.runReclaim() // S/R-SAFE: f.mu
// The Linux kernel contains an optional feature called "Integrity
@@ -434,113 +524,6 @@ func (f *MemoryFile) markDecommitted(fr platform.FileRange) {
f.usage.MergeRange(fr)
}
-// runReclaim implements the reclaimer goroutine, which continuously decommits
-// reclaimable pages in order to reduce memory usage and make them available
-// for allocation.
-func (f *MemoryFile) runReclaim() {
- for {
- fr, ok := f.findReclaimable()
- if !ok {
- break
- }
-
- if err := f.Decommit(fr); err != nil {
- log.Warningf("Reclaim failed to decommit %v: %v", fr, err)
- // Zero the pages manually. This won't reduce memory usage, but at
- // least ensures that the pages will be zero when reallocated.
- f.forEachMappingSlice(fr, func(bs []byte) {
- for i := range bs {
- bs[i] = 0
- }
- })
- // Pretend the pages were decommitted even though they weren't,
- // since the memory accounting implementation has no idea how to
- // deal with this.
- f.markDecommitted(fr)
- }
- f.markReclaimed(fr)
- }
- // We only get here if findReclaimable finds f.destroyed set and returns
- // false.
- f.mu.Lock()
- defer f.mu.Unlock()
- if !f.destroyed {
- panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set")
- }
- f.file.Close()
- // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd
- // that has possibly been reassigned.
- f.file = nil
- mappings := f.mappings.Load().([]uintptr)
- for i, m := range mappings {
- if m != 0 {
- _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0)
- if errno != 0 {
- log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno)
- }
- }
- }
- // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.)
- f.mappings.Store([]uintptr{})
-}
-
-func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
- f.mu.Lock()
- defer f.mu.Unlock()
- for {
- for {
- if f.destroyed {
- return platform.FileRange{}, false
- }
- if f.reclaimable {
- break
- }
- f.reclaimCond.Wait()
- }
- // Allocate returns the first usable range in offset order and is
- // currently a linear scan, so reclaiming from the beginning of the
- // file minimizes the expected latency of Allocate.
- for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() {
- if seg.ValuePtr().refs == 0 {
- f.minReclaimablePage = seg.End()
- return seg.Range(), true
- }
- }
- // No pages are reclaimable.
- f.reclaimable = false
- f.minReclaimablePage = maxPage
- }
-}
-
-func (f *MemoryFile) markReclaimed(fr platform.FileRange) {
- f.mu.Lock()
- defer f.mu.Unlock()
- seg := f.usage.FindSegment(fr.Start)
- // All of fr should be mapped to a single uncommitted reclaimable segment
- // accounted to System.
- if !seg.Ok() {
- panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
- }
- if !seg.Range().IsSupersetOf(fr) {
- panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage))
- }
- if got, want := seg.Value(), (usageInfo{
- kind: usage.System,
- knownCommitted: false,
- refs: 0,
- }); got != want {
- panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
- }
- // Deallocate reclaimed pages. Even though all of seg is reclaimable, the
- // caller of markReclaimed may not have decommitted it, so we can only mark
- // fr as reclaimed.
- f.usage.Remove(f.usage.Isolate(seg, fr))
- if fr.Start < f.minUnallocatedPage {
- // We've deallocated at least one lower page.
- f.minUnallocatedPage = fr.Start
- }
-}
-
// IncRef implements platform.File.IncRef.
func (f *MemoryFile) IncRef(fr platform.FileRange) {
if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
@@ -677,9 +660,82 @@ func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) {
return mappings, m, nil
}
-// FD implements platform.File.FD.
-func (f *MemoryFile) FD() int {
- return int(f.file.Fd())
+// MarkEvictable allows f to request memory deallocation by calling
+// user.Evict(er) in the future.
+//
+// Redundantly marking an already-evictable range as evictable has no effect.
+func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ info, ok := f.evictable[user]
+ if !ok {
+ info = &evictableMemoryUserInfo{}
+ f.evictable[user] = info
+ }
+ gap := info.ranges.LowerBoundGap(er.Start)
+ for gap.Ok() && gap.Start() < er.End {
+ gapER := gap.Range().Intersect(er)
+ if gapER.Length() == 0 {
+ gap = gap.NextGap()
+ continue
+ }
+ gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap()
+ }
+ if !info.evicting {
+ switch f.opts.DelayedEviction {
+ case DelayedEvictionDisabled:
+ // Kick off eviction immediately.
+ f.startEvictionGoroutineLocked(user, info)
+ case DelayedEvictionEnabled:
+ // Ensure that the reclaimer goroutine is running, so that it can
+ // start eviction when necessary.
+ f.reclaimCond.Signal()
+ }
+ }
+}
+
+// MarkUnevictable informs f that user no longer considers er to be evictable,
+// so the MemoryFile should no longer call user.Evict(er). Note that, per
+// EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be
+// called even after MarkUnevictable returns due to race conditions, and
+// implementations of EvictableMemoryUser must handle this possibility.
+//
+// Redundantly marking an already-unevictable range as unevictable has no
+// effect.
+func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ info, ok := f.evictable[user]
+ if !ok {
+ return
+ }
+ seg := info.ranges.LowerBoundSegment(er.Start)
+ for seg.Ok() && seg.Start() < er.End {
+ seg = info.ranges.Isolate(seg, er)
+ seg = info.ranges.Remove(seg).NextSegment()
+ }
+ // We can only remove info if there's no eviction goroutine running on its
+ // behalf.
+ if !info.evicting && info.ranges.IsEmpty() {
+ delete(f.evictable, user)
+ }
+}
+
+// MarkAllUnevictable informs f that user no longer considers any offsets to be
+// evictable. It otherwise has the same semantics as MarkUnevictable.
+func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ info, ok := f.evictable[user]
+ if !ok {
+ return
+ }
+ info.ranges.RemoveAll()
+ // We can only remove info if there's no eviction goroutine running on its
+ // behalf.
+ if !info.evicting {
+ delete(f.evictable, user)
+ }
}
// UpdateUsage ensures that the memory usage statistics in
@@ -889,6 +945,11 @@ func (f *MemoryFile) File() *os.File {
return f.file
}
+// FD implements platform.File.FD.
+func (f *MemoryFile) FD() int {
+ return int(f.file.Fd())
+}
+
// String implements fmt.Stringer.String.
//
// Note that because f.String locks f.mu, calling f.String internally
@@ -900,6 +961,167 @@ func (f *MemoryFile) String() string {
return f.usage.String()
}
+// runReclaim implements the reclaimer goroutine, which continuously decommits
+// reclaimable pages in order to reduce memory usage and make them available
+// for allocation.
+func (f *MemoryFile) runReclaim() {
+ for {
+ fr, ok := f.findReclaimable()
+ if !ok {
+ break
+ }
+
+ if err := f.Decommit(fr); err != nil {
+ log.Warningf("Reclaim failed to decommit %v: %v", fr, err)
+ // Zero the pages manually. This won't reduce memory usage, but at
+ // least ensures that the pages will be zero when reallocated.
+ f.forEachMappingSlice(fr, func(bs []byte) {
+ for i := range bs {
+ bs[i] = 0
+ }
+ })
+ // Pretend the pages were decommitted even though they weren't,
+ // since the memory accounting implementation has no idea how to
+ // deal with this.
+ f.markDecommitted(fr)
+ }
+ f.markReclaimed(fr)
+ }
+ // We only get here if findReclaimable finds f.destroyed set and returns
+ // false.
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ if !f.destroyed {
+ panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set")
+ }
+ f.file.Close()
+ // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd
+ // that has possibly been reassigned.
+ f.file = nil
+ f.mappingsMu.Lock()
+ defer f.mappingsMu.Unlock()
+ mappings := f.mappings.Load().([]uintptr)
+ for i, m := range mappings {
+ if m != 0 {
+ _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0)
+ if errno != 0 {
+ log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno)
+ }
+ }
+ }
+ // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.)
+ f.mappings.Store([]uintptr{})
+}
+
+func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ for {
+ for {
+ if f.destroyed {
+ return platform.FileRange{}, false
+ }
+ if f.reclaimable {
+ break
+ }
+ if f.opts.DelayedEviction == DelayedEvictionEnabled {
+ // No work to do. Evict any pending evictable allocations to
+ // get more reclaimable pages before going to sleep.
+ f.startEvictionsLocked()
+ }
+ f.reclaimCond.Wait()
+ }
+ // Allocate returns the first usable range in offset order and is
+ // currently a linear scan, so reclaiming from the beginning of the
+ // file minimizes the expected latency of Allocate.
+ for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() {
+ if seg.ValuePtr().refs == 0 {
+ f.minReclaimablePage = seg.End()
+ return seg.Range(), true
+ }
+ }
+ // No pages are reclaimable.
+ f.reclaimable = false
+ f.minReclaimablePage = maxPage
+ }
+}
+
+func (f *MemoryFile) markReclaimed(fr platform.FileRange) {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ seg := f.usage.FindSegment(fr.Start)
+ // All of fr should be mapped to a single uncommitted reclaimable segment
+ // accounted to System.
+ if !seg.Ok() {
+ panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
+ }
+ if !seg.Range().IsSupersetOf(fr) {
+ panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage))
+ }
+ if got, want := seg.Value(), (usageInfo{
+ kind: usage.System,
+ knownCommitted: false,
+ refs: 0,
+ }); got != want {
+ panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
+ }
+ // Deallocate reclaimed pages. Even though all of seg is reclaimable, the
+ // caller of markReclaimed may not have decommitted it, so we can only mark
+ // fr as reclaimed.
+ f.usage.Remove(f.usage.Isolate(seg, fr))
+ if fr.Start < f.minUnallocatedPage {
+ // We've deallocated at least one lower page.
+ f.minUnallocatedPage = fr.Start
+ }
+}
+
+// Preconditions: f.mu must be locked.
+func (f *MemoryFile) startEvictionsLocked() {
+ for user, info := range f.evictable {
+ // Don't start multiple goroutines to evict the same user's
+ // allocations.
+ if !info.evicting {
+ f.startEvictionGoroutineLocked(user, info)
+ }
+ }
+}
+
+// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be
+// locked.
+func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) {
+ info.evicting = true
+ f.evictionWG.Add(1)
+ go func() { // S/R-SAFE: f.evictionWG
+ defer f.evictionWG.Done()
+ for {
+ f.mu.Lock()
+ info, ok := f.evictable[user]
+ if !ok {
+ // This shouldn't happen: only this goroutine is permitted
+ // to delete this entry.
+ f.mu.Unlock()
+ panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user))
+ }
+ if info.ranges.IsEmpty() {
+ delete(f.evictable, user)
+ f.mu.Unlock()
+ return
+ }
+ // Evict from the end of info.ranges, under the assumption that
+ // if ranges in user start being used again (and are
+ // consequently marked unevictable), such uses are more likely
+ // to start from the beginning of user.
+ seg := info.ranges.LastSegment()
+ er := seg.Range()
+ info.ranges.Remove(seg)
+ // user.Evict() must be called without holding f.mu to avoid
+ // circular lock ordering.
+ f.mu.Unlock()
+ user.Evict(context.Background(), er)
+ }
+ }()
+}
+
type usageSetFunctions struct{}
func (usageSetFunctions) MinKey() uint64 {
@@ -920,3 +1142,27 @@ func (usageSetFunctions) Merge(_ platform.FileRange, val1 usageInfo, _ platform.
func (usageSetFunctions) Split(_ platform.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) {
return val, val
}
+
+// evictableRangeSetValue is the value type of evictableRangeSet.
+type evictableRangeSetValue struct{}
+
+type evictableRangeSetFunctions struct{}
+
+func (evictableRangeSetFunctions) MinKey() uint64 {
+ return 0
+}
+
+func (evictableRangeSetFunctions) MaxKey() uint64 {
+ return math.MaxUint64
+}
+
+func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) {
+}
+
+func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) {
+ return evictableRangeSetValue{}, true
+}
+
+func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) {
+ return evictableRangeSetValue{}, evictableRangeSetValue{}
+}
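The new MarkEvictable/MarkUnevictable/Evict contract is easiest to see from a user's side. A minimal sketch follows, using a hypothetical discardableBuffer type (not part of this change) that lets the MemoryFile reclaim a regenerable allocation under memory pressure:

package sketch

import (
	"sync"

	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
)

// discardableBuffer holds a single MemoryFile allocation whose contents can
// be regenerated, so the MemoryFile is allowed to evict it under pressure.
type discardableBuffer struct {
	mu sync.Mutex
	mf *pgalloc.MemoryFile
	fr platform.FileRange // zero if no allocation is currently held
}

func newDiscardableBuffer(mf *pgalloc.MemoryFile, length uint64) (*discardableBuffer, error) {
	fr, err := mf.Allocate(length, usage.Anonymous)
	if err != nil {
		return nil, err
	}
	b := &discardableBuffer{mf: mf, fr: fr}
	// Tell the MemoryFile it may reclaim these pages by calling b.Evict.
	// With DelayedEvictionEnabled this happens only once the reclaimer runs
	// out of other work; with DelayedEvictionDisabled it happens immediately.
	mf.MarkEvictable(b, pgalloc.EvictableRange{Start: 0, End: length})
	return b, nil
}

// Evict implements pgalloc.EvictableMemoryUser.Evict. Per the interface
// contract it may be called after a racing MarkUnevictable, so it re-checks
// its own state under b.mu and treats a stale request as a no-op.
func (b *discardableBuffer) Evict(ctx context.Context, er pgalloc.EvictableRange) {
	b.mu.Lock()
	defer b.mu.Unlock()
	if b.fr.Length() == 0 {
		return // already released, or marked unevictable again
	}
	b.mf.DecRef(b.fr)
	b.fr = platform.FileRange{}
}

// release drops the allocation voluntarily and tells the MemoryFile that it
// no longer needs to evict anything on this buffer's behalf.
func (b *discardableBuffer) release() {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.mf.MarkAllUnevictable(b)
	if b.fr.Length() != 0 {
		b.mf.DecRef(b.fr)
		b.fr = platform.FileRange{}
	}
}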
diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go
index cf169af55..9534d1aed 100644
--- a/pkg/sentry/pgalloc/save_restore.go
+++ b/pkg/sentry/pgalloc/save_restore.go
@@ -28,6 +28,15 @@ import (
"gvisor.googlesource.com/gvisor/pkg/state"
)
+// FlushEvictions blocks until f has finished evicting all evictable
+// allocations.
+func (f *MemoryFile) FlushEvictions() {
+ f.mu.Lock()
+ f.startEvictionsLocked()
+ f.mu.Unlock()
+ f.evictionWG.Wait()
+}
+
// SaveTo writes f's state to the given stream.
func (f *MemoryFile) SaveTo(w io.Writer) error {
// Wait for reclaim.
@@ -40,6 +49,11 @@ func (f *MemoryFile) SaveTo(w io.Writer) error {
f.mu.Lock()
}
+ // Ensure that there are no pending evictions.
+ if len(f.evictable) != 0 {
+ panic(fmt.Sprintf("evictions still pending for %d users; call FlushEvictions before SaveTo", len(f.evictable)))
+ }
+
// Ensure that all pages that contain data have knownCommitted set, since
// we only store knownCommitted pages below.
zeroPage := make([]byte, usermem.PageSize)
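FlushEvictions is the synchronization point that SaveTo now relies on. A hypothetical checkpoint helper (sketch only) showing the required ordering:

package sketch

import (
	"io"

	"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
)

func saveMemoryFile(mf *pgalloc.MemoryFile, w io.Writer) error {
	// Run and wait for all pending evictions first; SaveTo panics if any
	// evictable ranges are still registered. Since eviction can write cached
	// data back to files, file writes should be flushed only after this
	// returns (as kernel.SaveTo does above).
	mf.FlushEvictions()
	return mf.SaveTo(w)
}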
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 0b5be0a42..05122a6a8 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -424,7 +424,7 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
return nil, fmt.Errorf("error creating memfd: %v", err)
}
memfile := os.NewFile(uintptr(memfd), memfileName)
- mf, err := pgalloc.NewMemoryFile(memfile)
+ mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
if err != nil {
memfile.Close()
return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)