Merge 216da0b7 (automated)

author: gVisor bot <gvisor-bot@google.com> 2019-06-02 06:44:55 +0000
committer: gVisor bot <gvisor-bot@google.com> 2019-06-02 06:44:55 +0000
commit: ceb0d792f328d1fc0692197d8856a43c3936a571 (patch)
tree: 83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/pgalloc
parent: deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff)
parent: 216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff)
8 files changed, 4237 insertions, 0 deletions
diff --git a/pkg/sentry/pgalloc/context.go b/pkg/sentry/pgalloc/context.go
new file mode 100644
index 000000000..cb9809b1f
--- /dev/null
+++ b/pkg/sentry/pgalloc/context.go
@@ -0,0 +1,48 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pgalloc
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+)
+
+// contextID is the type of this package's context.Context.Value keys.
+type contextID int
+
+const (
+	// CtxMemoryFile is the Context.Value key under which a *MemoryFile is stored.
+	CtxMemoryFile contextID = iota
+
+	// CtxMemoryFileProvider is the Context.Value key for a MemoryFileProvider.
+	CtxMemoryFileProvider
+)
+
+// MemoryFileFromContext returns ctx's MemoryFile, or nil if it has none.
+func MemoryFileFromContext(ctx context.Context) *MemoryFile {
+	val := ctx.Value(CtxMemoryFile)
+	if val == nil {
+		return nil
+	}
+	return val.(*MemoryFile)
+}
+
+// MemoryFileProviderFromContext returns ctx's MemoryFileProvider, or nil if it has none.
+func MemoryFileProviderFromContext(ctx context.Context) MemoryFileProvider {
+	val := ctx.Value(CtxMemoryFileProvider)
+	if val == nil {
+		return nil
+	}
+	return val.(MemoryFileProvider)
+}
diff --git a/pkg/sentry/pgalloc/evictable_range.go b/pkg/sentry/pgalloc/evictable_range.go
new file mode 100755
index 000000000..10ce2ff44
--- /dev/null
+++ b/pkg/sentry/pgalloc/evictable_range.go
@@ -0,0 +1,62 @@
+package pgalloc
+
+// An EvictableRange represents the contiguous range of uint64 keys [Start, End).
+//
+// +stateify savable
+type EvictableRange struct {
+	// Start is the inclusive start of the range.
+	Start uint64
+
+	// End is the exclusive end of the range.
+	End uint64
+}
+
+// WellFormed reports whether r's bounds are ordered, i.e. r.Start <= r.End.
+// Every other EvictableRange method requires a well-formed receiver.
+func (r EvictableRange) WellFormed() bool {
+	return r.End >= r.Start
+}
+
+// Length returns r.End - r.Start; the unsigned subtraction wraps if r is not well-formed.
+func (r EvictableRange) Length() uint64 {
+	return r.End - r.Start
+}
+
+// Contains reports whether x lies in the half-open interval [r.Start, r.End).
+func (r EvictableRange) Contains(x uint64) bool {
+	return x >= r.Start && x < r.End
+}
+
+// Overlaps reports whether r and r2 overlap.
+func (r EvictableRange) Overlaps(r2 EvictableRange) bool {
+	return r2.Start < r.End && r.Start < r2.End
+}
+
+// IsSupersetOf reports whether r2 lies entirely within r, i.e. r is a superset
+// of r2.
+func (r EvictableRange) IsSupersetOf(r2 EvictableRange) bool {
+	return r2.Start >= r.Start && r2.End <= r.End
+}
+
+// Intersect returns the intersection of r and r2. If they do not overlap, the
+// result has unspecified bounds but Length() == 0.
+func (r EvictableRange) Intersect(r2 EvictableRange) EvictableRange {
+	lo, hi := r.Start, r.End
+	if r2.Start > lo {
+		lo = r2.Start
+	}
+	if r2.End < hi {
+		hi = r2.End
+	}
+	if hi < lo {
+		hi = lo
+	}
+	return EvictableRange{Start: lo, End: hi}
+}
+
+// CanSplitAt reports whether a segment spanning r may be split at x, i.e.
+// whether x lies strictly inside r so that both halves have non-zero length.
+// Equivalent to r.Contains(x) && r.Start < x.
+func (r EvictableRange) CanSplitAt(x uint64) bool {
+	return r.Start < x && x < r.End
+}
diff --git a/pkg/sentry/pgalloc/evictable_range_set.go b/pkg/sentry/pgalloc/evictable_range_set.go
new file mode 100755
index 000000000..a4dcb1663
--- /dev/null
+++ b/pkg/sentry/pgalloc/evictable_range_set.go
@@ -0,0 +1,1270 @@
+package pgalloc
+
+import (
+	"bytes"
+	"fmt"
+)
+
+const (
+	// evictableRangeminDegree is the minimum degree of an internal node in an
+	// evictableRangeSet B-tree.
+	//
+	// - Any non-root node has at least minDegree-1 segments.
+	//
+	// - Any non-root internal (non-leaf) node has at least minDegree children.
+	//
+	// - The root node may have fewer than minDegree-1 segments, but it may
+	// only have 0 segments if the tree is empty.
+	// Our implementation requires minDegree >= 3. Higher values of minDegree
+	// usually improve performance, but increase memory usage for small sets.
+	evictableRangeminDegree = 3
+
+	// evictableRangemaxDegree sizes a node: maxDegree children, maxDegree-1 segments.
+	evictableRangemaxDegree = 2 * evictableRangeminDegree
+)
+
+// An evictableRangeSet is a mapping of segments with non-overlapping
+// EvictableRange keys. The zero value is an empty set. Values are not safely
+// movable nor copyable. evictableRangeSet is thread-compatible.
+//
+// +stateify savable
+type evictableRangeSet struct {
+	root evictableRangenode `state:".(*evictableRangeSegmentDataSlices)"`
+}
+
+// IsEmpty reports whether the set contains no segments (the root node is empty).
+func (s *evictableRangeSet) IsEmpty() bool {
+	return s.root.nrSegments == 0
+}
+
+// IsEmptyRange returns true iff no segments in the set overlap r. It is
+// semantically equivalent to s.SpanRange(r) == 0, but may be more efficient.
+// IsEmptyRange panics if r is not well-formed (r.Start > r.End).
+func (s *evictableRangeSet) IsEmptyRange(r EvictableRange) bool {
+	switch {
+	case !r.WellFormed():
+		panic(fmt.Sprintf("invalid range %v", r))
+	case r.Length() == 0:
+		return true
+	}
+	_, gap := s.Find(r.Start)
+	if !gap.Ok() {
+		return false
+	}
+	return r.End <= gap.End()
+}
+
+// Span returns the sum of the lengths of all segments in the set.
+func (s *evictableRangeSet) Span() uint64 {
+	total := uint64(0)
+	for it := s.FirstSegment(); it.Ok(); it = it.NextSegment() {
+		total += it.Range().Length()
+	}
+	return total
+}
+
+// SpanRange returns the total size of the intersection of segments in the set
+// with r. It panics if r is not well-formed (r.Start > r.End).
+func (s *evictableRangeSet) SpanRange(r EvictableRange) uint64 {
+	switch {
+	case !r.WellFormed():
+		panic(fmt.Sprintf("invalid range %v", r))
+	case r.Length() == 0:
+		return 0
+	}
+	var sz uint64
+	for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() {
+		sz += seg.Range().Intersect(r).Length()
+	}
+	return sz
+}
+
+// FirstSegment returns an iterator to the first segment in the set, or a
+// terminal iterator if the set is empty.
+func (s *evictableRangeSet) FirstSegment() evictableRangeIterator {
+	if s.IsEmpty() {
+		return evictableRangeIterator{}
+	}
+	return s.root.firstSegment()
+}
+
+// LastSegment returns an iterator to the last segment in the set, or a
+// terminal iterator if the set is empty.
+func (s *evictableRangeSet) LastSegment() evictableRangeIterator {
+	if s.IsEmpty() {
+		return evictableRangeIterator{}
+	}
+	return s.root.lastSegment()
+}
+
+// FirstGap returns the gap before every segment: index 0 of the leftmost leaf.
+func (s *evictableRangeSet) FirstGap() evictableRangeGapIterator {
+	leaf := &s.root
+	for leaf.hasChildren {
+		leaf = leaf.children[0]
+	}
+	return evictableRangeGapIterator{node: leaf, index: 0}
+}
+
+// LastGap returns the gap after every segment: the final index of the rightmost leaf.
+func (s *evictableRangeSet) LastGap() evictableRangeGapIterator {
+	leaf := &s.root
+	for leaf.hasChildren {
+		leaf = leaf.children[leaf.nrSegments]
+	}
+	return evictableRangeGapIterator{node: leaf, index: leaf.nrSegments}
+}
+
+// Find looks up key. If a segment contains key, Find returns a non-terminal
+// Iterator to it and a terminal GapIterator; otherwise it returns a terminal
+// Iterator and a non-terminal GapIterator for the gap containing key. Exactly
+// one of the two results is ever non-terminal.
+func (s *evictableRangeSet) Find(key uint64) (evictableRangeIterator, evictableRangeGapIterator) {
+	n := &s.root
+	for {
+		// Binary search n's keys for the segment containing key.
+		lo, hi := 0, n.nrSegments
+		for lo < hi {
+			mid := lo + (hi-lo)/2
+			switch k := n.keys[mid]; {
+			case key >= k.End:
+				lo = mid + 1
+			case key < k.Start:
+				hi = mid
+			default:
+				return evictableRangeIterator{n, mid}, evictableRangeGapIterator{}
+			}
+		}
+		// No segment in n contains key; in a leaf, key lies in gap lo.
+		if !n.hasChildren {
+			return evictableRangeIterator{}, evictableRangeGapIterator{n, lo}
+		}
+		// Otherwise continue the search in the child between keys lo-1 and lo.
+		n = n.children[lo]
+	}
+}
+
+// FindSegment returns the segment whose range contains key, or a terminal
+// iterator if there is none. It is Find with the gap result discarded.
+func (s *evictableRangeSet) FindSegment(key uint64) evictableRangeIterator {
+	seg, _ := s.Find(key)
+	return seg
+}
+
+// LowerBoundSegment returns the segment with the lowest range that contains a
+// key greater than or equal to min, or a terminal iterator if there is no such
+// segment.
+func (s *evictableRangeSet) LowerBoundSegment(min uint64) evictableRangeIterator {
+	seg, gap := s.Find(min)
+	if !seg.Ok() {
+		seg = gap.NextSegment()
+	}
+	return seg
+}
+
+// UpperBoundSegment returns the segment with the highest range that contains a
+// key less than or equal to max, or a terminal iterator if there is no such
+// segment.
+func (s *evictableRangeSet) UpperBoundSegment(max uint64) evictableRangeIterator {
+	seg, gap := s.Find(max)
+	if !seg.Ok() {
+		seg = gap.PrevSegment()
+	}
+	return seg
+}
+
+// FindGap returns the gap containing the given key. If no such gap exists
+// (i.e. the set contains a segment containing that key), FindGap returns a
+// terminal iterator. It is Find with the segment result discarded.
+func (s *evictableRangeSet) FindGap(key uint64) evictableRangeGapIterator {
+	_, gap := s.Find(key)
+	return gap
+}
+
+// LowerBoundGap returns the gap containing min or, if min lies inside a
+// segment, the gap immediately following that segment.
+func (s *evictableRangeSet) LowerBoundGap(min uint64) evictableRangeGapIterator {
+	seg, gap := s.Find(min)
+	if !gap.Ok() {
+		gap = seg.NextGap()
+	}
+	return gap
+}
+
+// UpperBoundGap returns the gap containing max or, if max lies inside a
+// segment, the gap immediately preceding that segment.
+func (s *evictableRangeSet) UpperBoundGap(max uint64) evictableRangeGapIterator {
+	seg, gap := s.Find(max)
+	if !gap.Ok() {
+		gap = seg.PrevGap()
+	}
+	return gap
+}
+
+// Add inserts the given segment into the set, merging it with adjacent
+// segments where possible, and returns true; all existing iterators are then
+// invalidated. If the segment would overlap an existing segment, Add returns
+// false. Add panics if r is empty or not well-formed (r.Start > r.End).
+func (s *evictableRangeSet) Add(r EvictableRange, val evictableRangeSetValue) bool {
+	if !r.WellFormed() || r.Length() == 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	gap := s.FindGap(r.Start)
+	if !gap.Ok() {
+		return false
+	}
+	if r.End > gap.End() {
+		return false
+	}
+	s.Insert(gap, r, val)
+	return true
+}
+
+// AddWithoutMerging inserts the given segment into the set and returns true;
+// all existing iterators are then invalidated. If the segment would overlap an
+// existing segment, AddWithoutMerging does nothing and returns false. It
+// panics if r is empty or not well-formed (r.Start > r.End).
+func (s *evictableRangeSet) AddWithoutMerging(r EvictableRange, val evictableRangeSetValue) bool {
+	if !r.WellFormed() || r.Length() == 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	gap := s.FindGap(r.Start)
+	if !gap.Ok() {
+		return false
+	}
+	if r.End > gap.End() {
+		return false
+	}
+	s.InsertWithoutMergingUnchecked(gap, r, val)
+	return true
+}
+
+// Insert inserts the given segment into the given gap. If the new segment can
+// be merged with adjacent segments, Insert will do so. Insert returns an
+// iterator to the segment containing the inserted value (which may have been
+// merged with other values). All existing iterators (including gap, but not
+// including the returned iterator) are invalidated.
+//
+// Insert panics if r is empty or not well-formed (r.Start > r.End), or if r
+// overlaps the segment preceding or following gap.
+//
+// Insert is semantically equivalent to InsertWithoutMerging followed by Merge,
+// but may be more efficient. There is no unchecked variant of Insert since it
+// must retrieve and inspect gap's predecessor and successor regardless.
+func (s *evictableRangeSet) Insert(gap evictableRangeGapIterator, r EvictableRange, val evictableRangeSetValue) evictableRangeIterator {
+	if !r.WellFormed() || r.Length() == 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	prev, next := gap.PrevSegment(), gap.NextSegment()
+	if prev.Ok() && prev.End() > r.Start {
+		panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range()))
+	}
+	if next.Ok() && next.Start() < r.End {
+		panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range()))
+	}
+	if prev.Ok() && prev.End() == r.Start {
+		if mval, ok := (evictableRangeSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok {
+			prev.SetEndUnchecked(r.End)
+			prev.SetValue(mval)
+			if next.Ok() && next.Start() == r.End {
+				val = mval
+				if mval, ok := (evictableRangeSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok {
+					prev.SetEndUnchecked(next.End())
+					prev.SetValue(mval)
+					return s.Remove(next).PrevSegment()
+				}
+			}
+			return prev
+		}
+	}
+	if next.Ok() && next.Start() == r.End {
+		if mval, ok := (evictableRangeSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok {
+			next.SetStartUnchecked(r.Start)
+			next.SetValue(mval)
+			return next
+		}
+	}
+	return s.InsertWithoutMergingUnchecked(gap, r, val)
+}
+
+// InsertWithoutMerging inserts the given segment into the given gap and
+// returns an iterator to the inserted segment. All existing iterators
+// (including gap, but not including the returned iterator) are invalidated.
+//
+// InsertWithoutMerging panics if r is empty or not well-formed
+// (r.Start > r.End), or if r does not lie entirely within gap's range.
+func (s *evictableRangeSet) InsertWithoutMerging(gap evictableRangeGapIterator, r EvictableRange, val evictableRangeSetValue) evictableRangeIterator {
+	if !r.WellFormed() || r.Length() == 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	if gr := gap.Range(); !gr.IsSupersetOf(r) {
+		panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr))
+	}
+	return s.InsertWithoutMergingUnchecked(gap, r, val)
+}
+
+// InsertWithoutMergingUnchecked inserts the given segment into the given gap,
+// without validating r, and returns an iterator to the inserted segment. All
+// existing iterators (including gap, but not the returned one) are invalidated.
+//
+// Preconditions: r.Start >= gap.Start(); r.End <= gap.End().
+func (s *evictableRangeSet) InsertWithoutMergingUnchecked(gap evictableRangeGapIterator, r EvictableRange, val evictableRangeSetValue) evictableRangeIterator {
+	gap = gap.node.rebalanceBeforeInsert(gap)
+	copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments])
+	copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments])
+	gap.node.keys[gap.index] = r
+	gap.node.values[gap.index] = val
+	gap.node.nrSegments++
+	return evictableRangeIterator{gap.node, gap.index}
+}
+
+// Remove removes the given segment and returns an iterator to the vacated gap.
+// All existing iterators (including seg, but not including the returned
+// iterator) are invalidated.
+func (s *evictableRangeSet) Remove(seg evictableRangeIterator) evictableRangeGapIterator {
+	// Only a leaf can lose a segment directly; handle internal nodes first.
+	if seg.node.hasChildren {
+		// Use seg's predecessor as a stand-in for the removal.
+		victim := seg.PrevSegment()
+		// Move the predecessor into seg's slot, then remove its old copy.
+		seg.SetRangeUnchecked(victim.Range())
+		seg.SetValue(victim.Value())
+		return s.Remove(victim).NextGap()
+	}
+	copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments])
+	copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments])
+	evictableRangeSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1])
+	seg.node.nrSegments--
+	return seg.node.rebalanceAfterRemove(evictableRangeGapIterator{seg.node, seg.index})
+}
+
+// RemoveAll removes all segments from the set by resetting the root to an
+// empty node. All existing iterators are invalidated.
+func (s *evictableRangeSet) RemoveAll() {
+	s.root = evictableRangenode{}
+}
+
+// RemoveRange removes all segments in the given range, first splitting any
+// segment that straddles r.Start or r.End so that only the part inside r is
+// removed. An iterator to the newly formed gap is returned, and all existing
+// iterators are invalidated.
+func (s *evictableRangeSet) RemoveRange(r EvictableRange) evictableRangeGapIterator {
+	seg, gap := s.Find(r.Start)
+	if seg.Ok() {
+		gap = s.Remove(s.Isolate(seg, r))
+	}
+	for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() {
+		gap = s.Remove(s.Isolate(seg, r))
+	}
+	return gap
+}
+
+// Merge attempts to merge two neighboring segments. On success it returns an
+// iterator to the merged segment and all existing iterators are invalidated;
+// otherwise it returns a terminal iterator.
+//
+// Merge panics if first is not the immediate predecessor of second.
+func (s *evictableRangeSet) Merge(first, second evictableRangeIterator) evictableRangeIterator {
+	if next := first.NextSegment(); next != second {
+		panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range()))
+	}
+	return s.MergeUnchecked(first, second)
+}
+
+// MergeUnchecked attempts to merge two neighboring segments. On success it
+// returns an iterator to the merged segment and all existing iterators are
+// invalidated; otherwise it returns a terminal iterator.
+//
+// Precondition: first is the predecessor of second: first.NextSegment() ==
+// second, first == second.PrevSegment().
+func (s *evictableRangeSet) MergeUnchecked(first, second evictableRangeIterator) evictableRangeIterator {
+	if first.End() != second.Start() {
+		return evictableRangeIterator{}
+	}
+	mval, ok := (evictableRangeSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value())
+	if !ok {
+		return evictableRangeIterator{}
+	}
+	first.SetEndUnchecked(second.End())
+	first.SetValue(mval)
+	return s.Remove(second).PrevSegment()
+}
+
+// MergeAll walks the set in order and attempts to merge every pair of
+// neighboring segments, continuing from the merged segment after each success.
+// All existing iterators are invalidated.
+func (s *evictableRangeSet) MergeAll() {
+	cur := s.FirstSegment()
+	if !cur.Ok() {
+		return
+	}
+	for next := cur.NextSegment(); next.Ok(); next = cur.NextSegment() {
+		if merged := s.MergeUnchecked(cur, next); merged.Ok() {
+			cur = merged
+		} else {
+			cur = next
+		}
+	}
+}
+
+// MergeRange attempts to merge all adjacent segments that contain a key in r,
+// starting from the first segment at or after r.Start. All existing iterators
+// are invalidated.
+func (s *evictableRangeSet) MergeRange(r EvictableRange) {
+	cur := s.LowerBoundSegment(r.Start)
+	if !cur.Ok() {
+		return
+	}
+	for next := cur.NextSegment(); next.Ok() && next.Range().Start < r.End; next = cur.NextSegment() {
+		if merged := s.MergeUnchecked(cur, next); merged.Ok() {
+			cur = merged
+		} else {
+			cur = next
+		}
+	}
+}
+
+// MergeAdjacent attempts to merge the segment containing r.Start with its
+// predecessor, and the segment containing r.End-1 with its successor. The
+// second lookup happens after the first merge, so it sees the updated tree.
+// Note that r.End-1 wraps around to the maximum uint64 if r.End is 0.
+func (s *evictableRangeSet) MergeAdjacent(r EvictableRange) {
+	if first := s.FindSegment(r.Start); first.Ok() {
+		if prev := first.PrevSegment(); prev.Ok() {
+			s.Merge(prev, first)
+		}
+	}
+	if last := s.FindSegment(r.End - 1); last.Ok() {
+		if next := last.NextSegment(); next.Ok() {
+			s.Merge(last, next)
+		}
+	}
+}
+
+// Split divides seg in two at split and returns iterators to the resulting
+// lower and upper segments, in that order. All existing iterators (including
+// seg, but not including the returned iterators) are invalidated.
+//
+// Split panics if seg cannot be split at split: that is, if split is at the
+// start or end of seg's range (one half would have zero length) or falls
+// outside seg's range altogether. Use SplitUnchecked to skip this validation
+// when the precondition is already known to hold.
+func (s *evictableRangeSet) Split(seg evictableRangeIterator, split uint64) (evictableRangeIterator, evictableRangeIterator) {
+	if rng := seg.Range(); !rng.CanSplitAt(split) {
+		panic(fmt.Sprintf("can't split %v at %v", rng, split))
+	}
+	return s.SplitUnchecked(seg, split)
+}
+
+// SplitUnchecked splits the given segment at the given key and returns
+// iterators to the two resulting segments. All existing iterators (including
+// seg, but not including the returned iterators) are invalidated.
+//
+// Preconditions: seg.Start() < split < seg.End().
+func (s *evictableRangeSet) SplitUnchecked(seg evictableRangeIterator, split uint64) (evictableRangeIterator, evictableRangeIterator) {
+	val1, val2 := (evictableRangeSetFunctions{}).Split(seg.Range(), seg.Value(), split)
+	end2 := seg.End()
+	seg.SetEndUnchecked(split)
+	seg.SetValue(val1)
+	seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), EvictableRange{split, end2}, val2)
+	// The insertion invalidated seg, so recover the first half as seg2's predecessor.
+	return seg2.PrevSegment(), seg2
+}
+
+// SplitAt splits the segment straddling split, if any, and reports whether it
+// did so. If a segment was split, all existing iterators are invalidated.
+func (s *evictableRangeSet) SplitAt(split uint64) bool {
+	seg := s.FindSegment(split)
+	if !seg.Ok() || !seg.Range().CanSplitAt(split) {
+		return false
+	}
+	s.SplitUnchecked(seg, split)
+	return true
+}
+
+// Isolate ensures that seg's range does not escape r, splitting seg at r.Start
+// and then at r.End where either falls strictly inside it, and returns an
+// iterator to the bounded segment. All existing iterators (including seg, but
+// not including the returned iterator) are invalidated.
+func (s *evictableRangeSet) Isolate(seg evictableRangeIterator, r EvictableRange) evictableRangeIterator {
+	if rng := seg.Range(); rng.CanSplitAt(r.Start) {
+		_, seg = s.SplitUnchecked(seg, r.Start)
+	}
+	if rng := seg.Range(); rng.CanSplitAt(r.End) {
+		seg, _ = s.SplitUnchecked(seg, r.End)
+	}
+	return seg
+}
+
+// ApplyContiguous applies fn to a contiguous run of segments starting at
+// r.Start, isolating each segment to r before fn sees it. It stops at the
+// first non-empty gap and returns that gap. If fn was applied up to r.End, or
+// no segments remain, a terminal gap is returned. All existing iterators are
+// invalidated.
+//
+// N.B. fn must not invalidate the iterator it is passed.
+func (s *evictableRangeSet) ApplyContiguous(r EvictableRange, fn func(seg evictableRangeIterator)) evictableRangeGapIterator {
+	seg, gap := s.Find(r.Start)
+	if !seg.Ok() {
+		return gap
+	}
+	for {
+		seg = s.Isolate(seg, r)
+		fn(seg)
+		if seg.End() >= r.End {
+			return evictableRangeGapIterator{}
+		}
+		gap = seg.NextGap()
+		if !gap.IsEmpty() {
+			return gap
+		}
+		seg = gap.NextSegment()
+		if !seg.Ok() {
+			// The gap is empty but nothing follows it: no segments remain for fn.
+			return evictableRangeGapIterator{}
+		}
+	}
+}
+
+// evictableRangenode is a node in the B-tree backing an evictableRangeSet.
+//
+// +stateify savable
+type evictableRangenode struct {
+	// An internal binary tree node looks like:
+	//
+	//   K
+	//  / \
+	// Cl Cr
+	//
+	// where all keys in the subtree rooted by Cl (the left subtree) are less
+	// than K (the key of the parent node), and all keys in the subtree rooted
+	// by Cr (the right subtree) are greater than K.
+	//
+	// An internal B-tree node's indexes work out to look like:
+	//
+	//   K0 K1 K2  ...   Kn-1
+	//  / \/ \/ \  ...  /  \
+	// C0 C1 C2 C3 ... Cn-1 Cn
+	//
+	// where n is nrSegments.
+	nrSegments int
+
+	// parent points to this node's parent, or is nil if this node is the root.
+	parent *evictableRangenode
+
+	// parentIndex is the index of this node in parent.children.
+	parentIndex int
+
+	// Flag for internal nodes that is technically redundant with "children[0]
+	// != nil", but is stored in the first cache line. "hasChildren" rather
+	// than "isLeaf" because false must be the correct value for an empty root.
+	hasChildren bool
+
+	// keys and values are separate arrays to maximize locality when scanning keys.
+	keys     [evictableRangemaxDegree - 1]EvictableRange
+	values   [evictableRangemaxDegree - 1]evictableRangeSetValue
+	children [evictableRangemaxDegree]*evictableRangenode
+}
+
+// firstSegment returns an iterator to index 0 of the leftmost leaf beneath n.
+//
+// Preconditions: n.nrSegments != 0.
+func (n *evictableRangenode) firstSegment() evictableRangeIterator {
+	for n.hasChildren {
+		n = n.children[0]
+	}
+	return evictableRangeIterator{node: n, index: 0}
+}
+
+// lastSegment returns an iterator to the final index of the rightmost leaf beneath n.
+//
+// Preconditions: n.nrSegments != 0.
+func (n *evictableRangenode) lastSegment() evictableRangeIterator {
+	for n.hasChildren {
+		n = n.children[n.nrSegments]
+	}
+	return evictableRangeIterator{node: n, index: n.nrSegments - 1}
+}
+
+func (n *evictableRangenode) prevSibling() *evictableRangenode {
+	if p := n.parent; p != nil && n.parentIndex != 0 {
+		return p.children[n.parentIndex-1] // child just left of n
+	}
+	return nil // n is the root or its parent's first child
+}
+
+func (n *evictableRangenode) nextSibling() *evictableRangenode {
+	if p := n.parent; p != nil && n.parentIndex != p.nrSegments {
+		return p.children[n.parentIndex+1] // child just right of n
+	}
+	return nil // n is the root or its parent's last child
+}
+
+// rebalanceBeforeInsert splits n and its ancestors (ancestors first) if they
+// are full, as required for insertion, and returns an updated iterator to the
+// position represented by gap.
+func (n *evictableRangenode) rebalanceBeforeInsert(gap evictableRangeGapIterator) evictableRangeGapIterator {
+	if n.parent != nil {
+		gap = n.parent.rebalanceBeforeInsert(gap)
+	}
+	if n.nrSegments < evictableRangemaxDegree-1 {
+		return gap
+	}
+	if n.parent == nil {
+		// Full root: keep only the median segment in n; move the rest into two new children.
+		left := &evictableRangenode{
+			nrSegments:  evictableRangeminDegree - 1,
+			parent:      n,
+			parentIndex: 0,
+			hasChildren: n.hasChildren,
+		}
+		right := &evictableRangenode{
+			nrSegments:  evictableRangeminDegree - 1,
+			parent:      n,
+			parentIndex: 1,
+			hasChildren: n.hasChildren,
+		}
+		copy(left.keys[:evictableRangeminDegree-1], n.keys[:evictableRangeminDegree-1])
+		copy(left.values[:evictableRangeminDegree-1], n.values[:evictableRangeminDegree-1])
+		copy(right.keys[:evictableRangeminDegree-1], n.keys[evictableRangeminDegree:])
+		copy(right.values[:evictableRangeminDegree-1], n.values[evictableRangeminDegree:])
+		n.keys[0], n.values[0] = n.keys[evictableRangeminDegree-1], n.values[evictableRangeminDegree-1]
+		evictableRangezeroValueSlice(n.values[1:])
+		if n.hasChildren {
+			copy(left.children[:evictableRangeminDegree], n.children[:evictableRangeminDegree])
+			copy(right.children[:evictableRangeminDegree], n.children[evictableRangeminDegree:])
+			evictableRangezeroNodeSlice(n.children[2:])
+			for i := 0; i < evictableRangeminDegree; i++ {
+				left.children[i].parent = left
+				left.children[i].parentIndex = i
+				right.children[i].parent = right
+				right.children[i].parentIndex = i
+			}
+		}
+		n.nrSegments = 1
+		n.hasChildren = true
+		n.children[0] = left
+		n.children[1] = right
+		if gap.node != n {
+			return gap
+		}
+		if gap.index < evictableRangeminDegree {
+			return evictableRangeGapIterator{left, gap.index}
+		}
+		return evictableRangeGapIterator{right, gap.index - evictableRangeminDegree}
+	}
+	// Full non-root: move the median segment up into the parent and the upper half into a new sibling.
+	copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments])
+	copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments])
+	n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[evictableRangeminDegree-1], n.values[evictableRangeminDegree-1]
+	copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1])
+	for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ {
+		n.parent.children[i].parentIndex = i
+	}
+	sibling := &evictableRangenode{
+		nrSegments:  evictableRangeminDegree - 1,
+		parent:      n.parent,
+		parentIndex: n.parentIndex + 1,
+		hasChildren: n.hasChildren,
+	}
+	n.parent.children[n.parentIndex+1] = sibling
+	n.parent.nrSegments++
+	copy(sibling.keys[:evictableRangeminDegree-1], n.keys[evictableRangeminDegree:])
+	copy(sibling.values[:evictableRangeminDegree-1], n.values[evictableRangeminDegree:])
+	evictableRangezeroValueSlice(n.values[evictableRangeminDegree-1:])
+	if n.hasChildren {
+		copy(sibling.children[:evictableRangeminDegree], n.children[evictableRangeminDegree:])
+		evictableRangezeroNodeSlice(n.children[evictableRangeminDegree:])
+		for i := 0; i < evictableRangeminDegree; i++ {
+			sibling.children[i].parent = sibling
+			sibling.children[i].parentIndex = i
+		}
+	}
+	n.nrSegments = evictableRangeminDegree - 1
+	// Gaps that were in n's upper half now live in sibling; translate gap accordingly.
+	if gap.node != n {
+		return gap
+	}
+	if gap.index < evictableRangeminDegree {
+		return gap
+	}
+	return evictableRangeGapIterator{sibling, gap.index - evictableRangeminDegree}
+}
+
+// rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient
+// (contain fewer segments than required by B-tree invariants), as required for
+// removal, and returns an updated iterator to the position represented by gap.
+//
+// Precondition: n is the only node in the tree that may currently violate a
+// B-tree invariant.
+func (n *evictableRangenode) rebalanceAfterRemove(gap evictableRangeGapIterator) evictableRangeGapIterator {
+	for {
+		if n.nrSegments >= evictableRangeminDegree-1 {
+			return gap
+		}
+		if n.parent == nil {
+
+			return gap
+		}
+
+		if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= evictableRangeminDegree {
+			copy(n.keys[1:], n.keys[:n.nrSegments])
+			copy(n.values[1:], n.values[:n.nrSegments])
+			n.keys[0] = n.parent.keys[n.parentIndex-1]
+			n.values[0] = n.parent.values[n.parentIndex-1]
+			n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1]
+			n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1]
+			evictableRangeSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
+			if n.hasChildren {
+				copy(n.children[1:], n.children[:n.nrSegments+1])
+				n.children[0] = sibling.children[sibling.nrSegments]
+				sibling.children[sibling.nrSegments] = nil
+				n.children[0].parent = n
+				n.children[0].parentIndex = 0
+				for i := 1; i < n.nrSegments+2; i++ {
+					n.children[i].parentIndex = i
+				}
+			}
+			n.nrSegments++
+			sibling.nrSegments--
+			if gap.node == sibling && gap.index == sibling.nrSegments {
+				return evictableRangeGapIterator{n, 0}
+			}
+			if gap.node == n {
+				return evictableRangeGapIterator{n, gap.index + 1}
+			}
+			return gap
+		}
+		if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= evictableRangeminDegree {
+			n.keys[n.nrSegments] = n.parent.keys[n.parentIndex]
+			n.values[n.nrSegments] = n.parent.values[n.parentIndex]
+			n.parent.keys[n.parentIndex] = sibling.keys[0]
+			n.parent.values[n.parentIndex] = sibling.values[0]
+			copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:])
+			copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:])
+			evictableRangeSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
+			if n.hasChildren {
+				n.children[n.nrSegments+1] = sibling.children[0]
+				copy(sibling.children[:sibling.nrSegments], sibling.children[1:])
+				sibling.children[sibling.nrSegments] = nil
+				n.children[n.nrSegments+1].parent = n
+				n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1
+				for i := 0; i < sibling.nrSegments; i++ {
+					sibling.children[i].parentIndex = i
+				}
+			}
+			n.nrSegments++
+			sibling.nrSegments--
+			if gap.node == sibling {
+				if gap.index == 0 {
+					return evictableRangeGapIterator{n, n.nrSegments}
+				}
+				return evictableRangeGapIterator{sibling, gap.index - 1}
+			}
+			return gap
+		}
+
+		p := n.parent
+		if p.nrSegments == 1 {
+
+			left, right := p.children[0], p.children[1]
+			p.nrSegments = left.nrSegments + right.nrSegments + 1
+			p.hasChildren = left.hasChildren
+			p.keys[left.nrSegments] = p.keys[0]
+			p.values[left.nrSegments] = p.values[0]
+			copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments])
+			copy(p.values[:left.nrSegments], left.values[:left.nrSegments])
+			copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
+			copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments])
+			if left.hasChildren {
+				copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1])
+				copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
+				for i := 0; i < p.nrSegments+1; i++ {
+					p.children[i].parent = p
+					p.children[i].parentIndex = i
+				}
+			} else {
+				p.children[0] = nil
+				p.children[1] = nil
+			}
+			if gap.node == left {
+				return evictableRangeGapIterator{p, gap.index}
+			}
+			if gap.node == right {
+				return evictableRangeGapIterator{p, gap.index + left.nrSegments + 1}
+			}
+			return gap
+		}
+		// Merge n and either sibling, along with the segment separating the
+		// two, into whichever of the two nodes comes first. This is the
+		// reverse of the non-root splitting case in
+		// node.rebalanceBeforeInsert.
+		var left, right *evictableRangenode
+		if n.parentIndex > 0 {
+			left = n.prevSibling()
+			right = n
+		} else {
+			left = n
+			right = n.nextSibling()
+		}
+
+		if gap.node == right {
+			gap = evictableRangeGapIterator{left, gap.index + left.nrSegments + 1}
+		}
+		left.keys[left.nrSegments] = p.keys[left.parentIndex]
+		left.values[left.nrSegments] = p.values[left.parentIndex]
+		copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
+		copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments])
+		if left.hasChildren {
+			copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
+			for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ {
+				left.children[i].parent = left
+				left.children[i].parentIndex = i
+			}
+		}
+		left.nrSegments += right.nrSegments + 1
+		copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments])
+		copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments])
+		evictableRangeSetFunctions{}.ClearValue(&p.values[p.nrSegments-1])
+		copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1])
+		for i := 0; i < p.nrSegments; i++ {
+			p.children[i].parentIndex = i
+		}
+		p.children[p.nrSegments] = nil
+		p.nrSegments--
+
+		n = p
+	}
+}
+
+// An Iterator is conceptually one of:
+//
+// - A pointer to a segment in a set; or
+//
+// - A terminal iterator, which is a sentinel indicating that the end of
+// iteration has been reached.
+//
+// Iterators are copyable values and are meaningfully equality-comparable. The
+// zero value of Iterator is a terminal iterator.
+//
+// Unless otherwise specified, any mutation of a set invalidates all existing
+// iterators into the set.
+type evictableRangeIterator struct {
+	// node is the node containing the iterated segment. If the iterator is
+	// terminal, node is nil.
+	node *evictableRangenode
+
+	// index is the index of the segment in node.keys/values. It is only
+	// meaningful when node is non-nil.
+	index int
+}
+
+// Ok returns true if the iterator is not terminal, i.e. if it refers to a
+// node. All other methods are only valid for non-terminal iterators.
+func (seg evictableRangeIterator) Ok() bool {
+	return seg.node != nil
+}
+
+// Range returns the iterated segment's range key. The key is returned by
+// value, so modifying the result does not affect the set.
+//
+// Preconditions: seg.Ok().
+func (seg evictableRangeIterator) Range() EvictableRange {
+	return seg.node.keys[seg.index]
+}
+
+// Start is equivalent to Range().Start, but should be preferred if only the
+// start of the range is needed.
+//
+// Preconditions: seg.Ok().
+func (seg evictableRangeIterator) Start() uint64 {
+	return seg.node.keys[seg.index].Start
+}
+
+// End is equivalent to Range().End, but should be preferred if only the end of
+// the range is needed.
+//
+// Preconditions: seg.Ok().
+func (seg evictableRangeIterator) End() uint64 {
+	return seg.node.keys[seg.index].End
+}
+
+// SetRangeUnchecked mutates the iterated segment's range key. This operation
+// does not invalidate any iterators.
+//
+// No validation is performed here; SetRange is the checked variant and panics
+// when the preconditions below are violated.
+//
+// Preconditions:
+//
+// - r.Length() > 0.
+//
+// - The new range must not overlap an existing one: If seg.NextSegment().Ok(),
+// then r.End <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then
+// r.Start >= seg.PrevSegment().End().
+func (seg evictableRangeIterator) SetRangeUnchecked(r EvictableRange) {
+	seg.node.keys[seg.index] = r
+}
+
+// SetRange replaces the iterated segment's range key with r after validating
+// it. SetRange panics if r.Length() <= 0, if r starts before the end of the
+// preceding segment, or if r ends after the start of the following segment.
+// This operation does not invalidate any iterators.
+func (seg evictableRangeIterator) SetRange(r EvictableRange) {
+	if r.Length() <= 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	// Check the left neighbor first, then the right one.
+	before := seg.PrevSegment()
+	if before.Ok() && r.Start < before.End() {
+		panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, before.Range()))
+	}
+	after := seg.NextSegment()
+	if after.Ok() && r.End > after.Start() {
+		panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, after.Range()))
+	}
+	// All preconditions of the unchecked variant now hold.
+	seg.SetRangeUnchecked(r)
+}
+
+// SetStartUnchecked mutates the iterated segment's start. This operation does
+// not invalidate any iterators.
+//
+// No validation is performed here; SetStart is the checked variant and panics
+// when the preconditions below are violated.
+//
+// Preconditions: The new start must be valid: start < seg.End(); if
+// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End().
+func (seg evictableRangeIterator) SetStartUnchecked(start uint64) {
+	seg.node.keys[seg.index].Start = start
+}
+
+// SetStart moves the start of the iterated segment to start after validating
+// it. SetStart panics if start >= seg.End() (the range would become invalid)
+// or if start lies before the end of the preceding segment (the segments
+// would overlap). This operation does not invalidate any iterators.
+func (seg evictableRangeIterator) SetStart(start uint64) {
+	if start >= seg.End() {
+		panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range()))
+	}
+	before := seg.PrevSegment()
+	if before.Ok() && start < before.End() {
+		panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), before.Range()))
+	}
+	// All preconditions of the unchecked variant now hold.
+	seg.SetStartUnchecked(start)
+}
+
+// SetEndUnchecked mutates the iterated segment's end. This operation does not
+// invalidate any iterators.
+//
+// No validation is performed here; SetEnd is the checked variant and panics
+// when the preconditions below are violated.
+//
+// Preconditions: The new end must be valid: end > seg.Start(); if
+// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start().
+func (seg evictableRangeIterator) SetEndUnchecked(end uint64) {
+	seg.node.keys[seg.index].End = end
+}
+
+// SetEnd moves the end of the iterated segment to end after validating it.
+// SetEnd panics if end <= seg.Start() (the range would become invalid) or if
+// end lies past the start of the following segment (the segments would
+// overlap). This operation does not invalidate any iterators.
+func (seg evictableRangeIterator) SetEnd(end uint64) {
+	if end <= seg.Start() {
+		panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range()))
+	}
+	after := seg.NextSegment()
+	if after.Ok() && end > after.Start() {
+		panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), after.Range()))
+	}
+	// All preconditions of the unchecked variant now hold.
+	seg.SetEndUnchecked(end)
+}
+
+// Value returns a copy of the iterated segment's value. Changes made to the
+// returned copy are not reflected in the set; use ValuePtr or SetValue to
+// modify the stored value.
+func (seg evictableRangeIterator) Value() evictableRangeSetValue {
+	return seg.node.values[seg.index]
+}
+
+// ValuePtr returns a pointer to the iterated segment's value. The pointer
+// refers to the value's slot inside the node, so writes through it modify the
+// set in place. The pointer is invalidated if the iterator is invalidated.
+// This operation does not invalidate any iterators.
+func (seg evictableRangeIterator) ValuePtr() *evictableRangeSetValue {
+	return &seg.node.values[seg.index]
+}
+
+// SetValue mutates the iterated segment's value by overwriting it with val.
+// This operation does not invalidate any iterators.
+func (seg evictableRangeIterator) SetValue(val evictableRangeSetValue) {
+	seg.node.values[seg.index] = val
+}
+
+// PrevSegment returns an iterator to the segment that precedes the iterated
+// one, or a terminal iterator when the iterated segment has no predecessor.
+func (seg evictableRangeIterator) PrevSegment() evictableRangeIterator {
+	n := seg.node
+	switch {
+	case n.hasChildren:
+		// The predecessor is the last segment under the child to the left of
+		// this segment.
+		return n.children[seg.index].lastSegment()
+	case seg.index > 0:
+		// The predecessor is the previous segment in the same node.
+		return evictableRangeIterator{n, seg.index - 1}
+	case n.parent == nil:
+		// First segment of a childless root: nothing precedes it.
+		return evictableRangeIterator{}
+	default:
+		// First segment of a childless non-root node: search upwards.
+		return evictableRangesegmentBeforePosition(n.parent, n.parentIndex)
+	}
+}
+
+// NextSegment returns an iterator to the segment that follows the iterated
+// one, or a terminal iterator when the iterated segment has no successor.
+func (seg evictableRangeIterator) NextSegment() evictableRangeIterator {
+	n := seg.node
+	switch {
+	case n.hasChildren:
+		// The successor is the first segment under the child to the right of
+		// this segment.
+		return n.children[seg.index+1].firstSegment()
+	case seg.index < n.nrSegments-1:
+		// The successor is the next segment in the same node.
+		return evictableRangeIterator{n, seg.index + 1}
+	case n.parent == nil:
+		// Last segment of a childless root: nothing follows it.
+		return evictableRangeIterator{}
+	default:
+		// Last segment of a childless non-root node: search upwards.
+		return evictableRangesegmentAfterPosition(n.parent, n.parentIndex)
+	}
+}
+
+// PrevGap returns an iterator to the gap that immediately precedes the
+// iterated segment.
+func (seg evictableRangeIterator) PrevGap() evictableRangeGapIterator {
+	if !seg.node.hasChildren {
+		// In a childless node the gap before segment i has index i.
+		return evictableRangeGapIterator{seg.node, seg.index}
+	}
+	// Otherwise the gap is the one following the last segment under the child
+	// to the left of this segment.
+	return seg.node.children[seg.index].lastSegment().NextGap()
+}
+
+// NextGap returns an iterator to the gap that immediately follows the
+// iterated segment.
+func (seg evictableRangeIterator) NextGap() evictableRangeGapIterator {
+	if !seg.node.hasChildren {
+		// In a childless node the gap after segment i has index i+1.
+		return evictableRangeGapIterator{seg.node, seg.index + 1}
+	}
+	// Otherwise the gap is the one preceding the first segment under the
+	// child to the right of this segment.
+	return seg.node.children[seg.index+1].firstSegment().PrevGap()
+}
+
+// PrevNonEmpty returns whichever non-empty thing lies directly before the
+// iterated segment: the gap before it when that gap has non-zero length, or
+// otherwise the segment before that gap. If seg.Start() ==
+// Functions.MinKey(), PrevNonEmpty will return two terminal iterators.
+// Otherwise, exactly one of the iterators returned by PrevNonEmpty will be
+// non-terminal.
+func (seg evictableRangeIterator) PrevNonEmpty() (evictableRangeIterator, evictableRangeGapIterator) {
+	gap := seg.PrevGap()
+	if gap.IsEmpty() {
+		// Zero-length gap: report the segment on its far side instead (which
+		// is terminal if there is none).
+		return gap.PrevSegment(), evictableRangeGapIterator{}
+	}
+	return evictableRangeIterator{}, gap
+}
+
+// NextNonEmpty returns whichever non-empty thing lies directly after the
+// iterated segment: the gap after it when that gap has non-zero length, or
+// otherwise the segment after that gap. If seg.End() ==
+// Functions.MaxKey(), NextNonEmpty will return two terminal iterators.
+// Otherwise, exactly one of the iterators returned by NextNonEmpty will be
+// non-terminal.
+func (seg evictableRangeIterator) NextNonEmpty() (evictableRangeIterator, evictableRangeGapIterator) {
+	gap := seg.NextGap()
+	if gap.IsEmpty() {
+		// Zero-length gap: report the segment on its far side instead (which
+		// is terminal if there is none).
+		return gap.NextSegment(), evictableRangeGapIterator{}
+	}
+	return evictableRangeIterator{}, gap
+}
+
+// A GapIterator is conceptually one of:
+//
+// - A pointer to a position between two segments, before the first segment, or
+// after the last segment in a set, called a *gap*; or
+//
+// - A terminal iterator, which is a sentinel indicating that the end of
+// iteration has been reached.
+//
+// Note that the gap between two adjacent segments exists (iterators to it are
+// non-terminal), but has a length of zero. GapIterator.IsEmpty returns true
+// for such gaps. An empty set contains a single gap, spanning the entire range
+// of the set's keys.
+//
+// GapIterators are copyable values and are meaningfully equality-comparable.
+// The zero value of GapIterator is a terminal iterator.
+//
+// Unless otherwise specified, any mutation of a set invalidates all existing
+// iterators into the set.
+type evictableRangeGapIterator struct {
+	// The representation of a GapIterator is identical to that of an Iterator,
+	// except that index corresponds to positions between segments in the same
+	// way as for node.children (see comment for node.nrSegments).
+	//
+	// node is nil if and only if the iterator is terminal (see Ok).
+	node  *evictableRangenode
+	index int
+}
+
+// Ok returns true if the iterator is not terminal, i.e. if it refers to a
+// node. All other methods are only valid for non-terminal iterators.
+func (gap evictableRangeGapIterator) Ok() bool {
+	return gap.node != nil
+}
+
+// Range returns the range covered by the iterated gap, built from the gap's
+// Start and End.
+func (gap evictableRangeGapIterator) Range() EvictableRange {
+	start, end := gap.Start(), gap.End()
+	return EvictableRange{start, end}
+}
+
+// Start is equivalent to Range().Start, but should be preferred if only the
+// start of the range is needed. The gap starts where the preceding segment
+// ends, or at Functions.MinKey() when no segment precedes the gap.
+func (gap evictableRangeGapIterator) Start() uint64 {
+	before := gap.PrevSegment()
+	if !before.Ok() {
+		return evictableRangeSetFunctions{}.MinKey()
+	}
+	return before.End()
+}
+
+// End is equivalent to Range().End, but should be preferred if only the end of
+// the range is needed. The gap ends where the following segment starts, or at
+// Functions.MaxKey() when no segment follows the gap.
+func (gap evictableRangeGapIterator) End() uint64 {
+	after := gap.NextSegment()
+	if !after.Ok() {
+		return evictableRangeSetFunctions{}.MaxKey()
+	}
+	return after.Start()
+}
+
+// IsEmpty returns true if the iterated gap is empty (that is, the "gap" is
+// between two adjacent segments.) It is computed from Range(), so it looks up
+// both neighboring segments.
+func (gap evictableRangeGapIterator) IsEmpty() bool {
+	return gap.Range().Length() == 0
+}
+
+// PrevSegment returns the segment immediately before the iterated gap. If no
+// such segment exists, PrevSegment returns a terminal iterator.
+//
+// The lookup is delegated to segmentBeforePosition, starting from the gap's
+// own (node, index) position.
+func (gap evictableRangeGapIterator) PrevSegment() evictableRangeIterator {
+	return evictableRangesegmentBeforePosition(gap.node, gap.index)
+}
+
+// NextSegment returns the segment immediately after the iterated gap. If no
+// such segment exists, NextSegment returns a terminal iterator.
+//
+// The lookup is delegated to segmentAfterPosition, starting from the gap's
+// own (node, index) position.
+func (gap evictableRangeGapIterator) NextSegment() evictableRangeIterator {
+	return evictableRangesegmentAfterPosition(gap.node, gap.index)
+}
+
+// PrevGap returns the gap preceding the iterated one, which is the gap before
+// the segment that precedes this gap. If there is no preceding segment, there
+// is no preceding gap either and PrevGap returns a terminal iterator.
+func (gap evictableRangeGapIterator) PrevGap() evictableRangeGapIterator {
+	if before := gap.PrevSegment(); before.Ok() {
+		return before.PrevGap()
+	}
+	return evictableRangeGapIterator{}
+}
+
+// NextGap returns the gap following the iterated one, which is the gap after
+// the segment that follows this gap. If there is no following segment, there
+// is no following gap either and NextGap returns a terminal iterator.
+func (gap evictableRangeGapIterator) NextGap() evictableRangeGapIterator {
+	if after := gap.NextSegment(); after.Ok() {
+		return after.NextGap()
+	}
+	return evictableRangeGapIterator{}
+}
+
+// segmentBeforePosition returns the segment that precedes the position
+// n.children[i] (whether or not a child is actually stored there). It returns
+// a terminal iterator if no segment precedes that position.
+func evictableRangesegmentBeforePosition(n *evictableRangenode, i int) evictableRangeIterator {
+	// Position 0 has no segment to its left inside n, so move to n's own
+	// position in its parent and retry; give up once the root is reached.
+	for ; i == 0; n, i = n.parent, n.parentIndex {
+		if n.parent == nil {
+			return evictableRangeIterator{}
+		}
+	}
+	// Segment i-1 sits directly to the left of position i.
+	return evictableRangeIterator{n, i - 1}
+}
+
+// segmentAfterPosition returns the segment that follows the position
+// n.children[i] (whether or not a child is actually stored there). It returns
+// a terminal iterator if no segment follows that position.
+func evictableRangesegmentAfterPosition(n *evictableRangenode, i int) evictableRangeIterator {
+	// Position nrSegments has no segment to its right inside n, so move to
+	// n's own position in its parent and retry; give up once the root is
+	// reached.
+	for ; i == n.nrSegments; n, i = n.parent, n.parentIndex {
+		if n.parent == nil {
+			return evictableRangeIterator{}
+		}
+	}
+	// Segment i sits directly to the right of position i.
+	return evictableRangeIterator{n, i}
+}
+
+// zeroValueSlice calls Functions.ClearValue on every element of slice.
+func evictableRangezeroValueSlice(slice []evictableRangeSetValue) {
+	for i := 0; i < len(slice); i++ {
+		evictableRangeSetFunctions{}.ClearValue(&slice[i])
+	}
+}
+
+// zeroNodeSlice sets every element of slice to nil.
+func evictableRangezeroNodeSlice(slice []*evictableRangenode) {
+	for i := 0; i < len(slice); i++ {
+		slice[i] = nil
+	}
+}
+
+// String stringifies a Set for debugging. The output is the debug dump of the
+// set's root node (see node.String).
+func (s *evictableRangeSet) String() string {
+	return s.root.String()
+}
+
+// String stringifies a node (and all of its children) for debugging.
+func (n *evictableRangenode) String() string {
+	buf := new(bytes.Buffer)
+	// An empty prefix marks n as the top of the dump.
+	n.writeDebugString(buf, "")
+	return buf.String()
+}
+
+// writeDebugString appends a human-readable dump of n and its descendants to
+// buf. Every line written for n starts with prefix; children are dumped with
+// the prefix extended by their index in n.children, and each child's dump is
+// emitted before the segment that follows it, so segments appear in key order.
+//
+// Besides the segments themselves, the dump contains WARNING lines for
+// structural inconsistencies it can detect locally: a hasChildren flag that
+// disagrees with the presence of children[0], and any child (including the
+// right-most one at index nrSegments) whose parent/parentIndex linkage does
+// not point back at its slot in n.
+func (n *evictableRangenode) writeDebugString(buf *bytes.Buffer, prefix string) {
+	if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) {
+		buf.WriteString(prefix)
+		fmt.Fprintf(buf, "WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)
+	}
+	// A node with nrSegments segments has nrSegments+1 child positions, so
+	// iterate one past the last segment to cover the right-most child too.
+	for i := 0; i <= n.nrSegments; i++ {
+		if child := n.children[i]; child != nil {
+			cprefix := fmt.Sprintf("%s- % 3d ", prefix, i)
+			if child.parent != n || child.parentIndex != i {
+				buf.WriteString(cprefix)
+				fmt.Fprintf(buf, "WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)
+			}
+			child.writeDebugString(buf, cprefix)
+		}
+		if i < n.nrSegments {
+			buf.WriteString(prefix)
+			fmt.Fprintf(buf, "- % 3d: %v => %v\n", i, n.keys[i], n.values[i])
+		}
+	}
+}
+
+// SegmentDataSlices represents segments from a set as slices of start, end, and
+// values. SegmentDataSlices is primarily used as an intermediate representation
+// for save/restore and the layout here is optimized for that.
+//
+// The three slices are parallel: element i of each slice describes the same
+// segment, [Start[i], End[i]) => Values[i].
+//
+// +stateify savable
+type evictableRangeSegmentDataSlices struct {
+	Start  []uint64
+	End    []uint64
+	Values []evictableRangeSetValue
+}
+
+// ExportSortedSlices returns a copy of all segments in the given set, in
+// ascending key order, as parallel slices of starts, ends and values.
+func (s *evictableRangeSet) ExportSortedSlices() *evictableRangeSegmentDataSlices {
+	sds := new(evictableRangeSegmentDataSlices)
+	seg := s.FirstSegment()
+	for seg.Ok() {
+		sds.Start = append(sds.Start, seg.Start())
+		sds.End = append(sds.End, seg.End())
+		sds.Values = append(sds.Values, seg.Value())
+		seg = seg.NextSegment()
+	}
+	// Clamp each slice's capacity to its length.
+	nStart, nEnd, nValues := len(sds.Start), len(sds.End), len(sds.Values)
+	sds.Start = sds.Start[:nStart:nStart]
+	sds.End = sds.End[:nEnd:nEnd]
+	sds.Values = sds.Values[:nValues:nValues]
+	return sds
+}
+
+// ImportSortedSlices initializes the given set from the given slices.
+//
+// Each index i of sds describes one segment, [sds.Start[i], sds.End[i]) =>
+// sds.Values[i]; segments are inserted in slice order without merging.
+//
+// ImportSortedSlices returns an error, rather than panicking on an
+// out-of-range index, if s is not empty, if the three slices in sds do not
+// have the same length, or if a segment does not fit in the gap following the
+// previously inserted segment (it overlaps or is out of order). Segments
+// inserted before the offending one remain in s when an error is returned.
+//
+// Preconditions: s must be empty. sds must represent a valid set (the segments
+// in sds must have valid lengths that do not overlap). The segments in sds
+// must be sorted in ascending key order.
+func (s *evictableRangeSet) ImportSortedSlices(sds *evictableRangeSegmentDataSlices) error {
+	if !s.IsEmpty() {
+		return fmt.Errorf("cannot import into non-empty set %v", s)
+	}
+	// The slices are indexed in lockstep below; reject ragged input up front
+	// instead of panicking part-way through the import.
+	if n := len(sds.Start); len(sds.End) != n || len(sds.Values) != n {
+		return fmt.Errorf("mismatched segment data slice lengths: %d starts, %d ends, %d values", len(sds.Start), len(sds.End), len(sds.Values))
+	}
+	gap := s.FirstGap()
+	for i := range sds.Start {
+		r := EvictableRange{sds.Start[i], sds.End[i]}
+		if !gap.Range().IsSupersetOf(r) {
+			return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i])
+		}
+		// Continue from the gap after the new segment so that the next
+		// segment must lie entirely to its right.
+		gap = s.InsertWithoutMerging(gap, r, sds.Values[i]).NextGap()
+	}
+	return nil
+}
+// saveRoot returns the set's segments in the flattened form produced by
+// ExportSortedSlices. NOTE(review): presumably called by generated
+// save/restore code as the counterpart of loadRoot; confirm against the
+// stateify output.
+func (s *evictableRangeSet) saveRoot() *evictableRangeSegmentDataSlices {
+	return s.ExportSortedSlices()
+}
+
+// loadRoot repopulates s from sds via ImportSortedSlices and panics with the
+// returned error if the import fails.
+func (s *evictableRangeSet) loadRoot(sds *evictableRangeSegmentDataSlices) {
+	err := s.ImportSortedSlices(sds)
+	if err != nil {
+		panic(err)
+	}
+}
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
new file mode 100644
index 000000000..2b9924ad7
--- /dev/null
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -0,0 +1,1187 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pgalloc contains the page allocator subsystem, which manages memory
+// that may be mapped into application address spaces.
+//
+// Lock order:
+//
+// pgalloc.MemoryFile.mu
+//   pgalloc.MemoryFile.mappingsMu
+package pgalloc
+
+import (
+	"fmt"
+	"math"
+	"os"
+	"sync"
+	"sync/atomic"
+	"syscall"
+	"time"
+
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// MemoryFile is a platform.File whose pages may be allocated to arbitrary
+// users.
+type MemoryFile struct {
+	// opts holds options passed to NewMemoryFile. opts is immutable.
+	opts MemoryFileOpts
+
+	// MemoryFile owns a single backing file, which is modeled as follows:
+	//
+	// Each page in the file can be committed or uncommitted. A page is
+	// committed if the host kernel is spending resources to store its contents
+	// and uncommitted otherwise. This definition includes pages that the host
+	// kernel has swapped; this is intentional, to ensure that accounting does
+	// not change even if host kernel swapping behavior changes, and that
+	// memory used by pseudo-swap mechanisms like zswap is still accounted.
+	//
+	// The initial contents of uncommitted pages are implicitly zero bytes. A
+	// read or write to the contents of an uncommitted page causes it to be
+	// committed. This is the only event that can cause an uncommitted page to
+	// be committed.
+	//
+	// fallocate(FALLOC_FL_PUNCH_HOLE) (MemoryFile.Decommit) causes committed
+	// pages to be uncommitted. This is the only event that can cause a
+	// committed page to be uncommitted.
+	//
+	// Memory accounting is based on identifying the set of committed pages.
+	// Since we do not have direct access to the MMU, tracking reads and writes
+	// to uncommitted pages to detect commitment would introduce additional
+	// page faults, which would be prohibitively expensive. Instead, we query
+	// the host kernel to determine which pages are committed.
+
+	// file is the backing file. The file pointer is immutable.
+	file *os.File
+
+	// mu protects the fields below that are documented as such. Per the
+	// package lock order, mu is acquired before mappingsMu.
+	mu sync.Mutex
+
+	// usage maps each page in the file to metadata for that page. Pages for
+	// which no segment exists in usage are both unallocated (not in use) and
+	// uncommitted.
+	//
+	// Since usage stores usageInfo objects by value, clients should usually
+	// use usageIterator.ValuePtr() instead of usageIterator.Value() to get a
+	// pointer to the usageInfo rather than a copy.
+	//
+	// usage must be kept maximally merged (that is, there should never be two
+	// adjacent segments with the same values). At least markReclaimed depends
+	// on this property.
+	//
+	// usage is protected by mu.
+	usage usageSet
+
+	// The UpdateUsage function scans all segments with knownCommitted set
+	// to false, sees which pages are committed and creates corresponding
+	// segments with knownCommitted set to true.
+	//
+	// In order to avoid unnecessary scans, usageExpected tracks the total
+	// file blocks expected. This is used to elide the scan when this
+	// matches the underlying file blocks.
+	//
+	// To track swapped pages, usageSwapped tracks the discrepancy between
+	// what is observed in core and what is reported by the file. When
+	// usageSwapped is non-zero, a sweep will be performed at least every
+	// second. The start of the last sweep is recorded in usageLast.
+	//
+	// All usage attributes are protected by mu.
+	usageExpected uint64
+	usageSwapped  uint64
+	usageLast     time.Time
+
+	// minUnallocatedPage is the minimum page that may be unallocated.
+	// i.e., there are no unallocated pages below minUnallocatedPage.
+	//
+	// minUnallocatedPage is protected by mu.
+	minUnallocatedPage uint64
+
+	// fileSize is the size of the backing memory file in bytes. fileSize is
+	// always a power-of-two multiple of chunkSize.
+	//
+	// fileSize is protected by mu.
+	fileSize int64
+
+	// Pages from the backing file are mapped into the local address space on
+	// the granularity of large pieces called chunks. mappings is a []uintptr
+	// that stores, for each chunk, the start address of a mapping of that
+	// chunk in the current process' address space, or 0 if no such mapping
+	// exists. Once a chunk is mapped, it is never remapped or unmapped until
+	// the MemoryFile is destroyed.
+	//
+	// Mutating the mappings slice or its contents requires both holding
+	// mappingsMu and using atomic memory operations. (The slice is mutated
+	// whenever the file is expanded. Per the above, the only permitted
+	// mutation of the slice's contents is the assignment of a mapping to a
+	// chunk that was previously unmapped.) Reading the slice or its contents
+	// only requires *either* holding mappingsMu or using atomic memory
+	// operations. This allows MemoryFile.MapInternal to avoid locking in the
+	// common case where chunk mappings already exist.
+	mappingsMu sync.Mutex
+	mappings   atomic.Value
+
+	// destroyed is set by Destroy to instruct the reclaimer goroutine to
+	// release resources and exit. destroyed is protected by mu.
+	destroyed bool
+
+	// reclaimable is true if usage may contain reclaimable pages. reclaimable
+	// is protected by mu.
+	reclaimable bool
+
+	// minReclaimablePage is the minimum page that may be reclaimable.
+	// i.e., all reclaimable pages are >= minReclaimablePage.
+	//
+	// minReclaimablePage is protected by mu.
+	minReclaimablePage uint64
+
+	// reclaimCond is signaled (with mu locked) when reclaimable or destroyed
+	// transitions from false to true.
+	reclaimCond sync.Cond
+
+	// evictable maps EvictableMemoryUsers to eviction state.
+	//
+	// evictable is protected by mu.
+	evictable map[EvictableMemoryUser]*evictableMemoryUserInfo
+
+	// evictionWG counts the number of goroutines currently performing evictions.
+	evictionWG sync.WaitGroup
+}
+
+// MemoryFileOpts provides options to NewMemoryFile.
+type MemoryFileOpts struct {
+	// DelayedEviction controls the extent to which the MemoryFile may delay
+	// eviction of evictable allocations. The zero value,
+	// DelayedEvictionDefault, is accepted by NewMemoryFile.
+	DelayedEviction DelayedEvictionType
+}
+
+// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction. The
+// valid values are the DelayedEviction* constants; NewMemoryFile returns an
+// error for any other value.
+type DelayedEvictionType int
+
+const (
+	// DelayedEvictionDefault has unspecified behavior. As of this writing,
+	// NewMemoryFile treats it as DelayedEvictionEnabled.
+	DelayedEvictionDefault DelayedEvictionType = iota
+
+	// DelayedEvictionDisabled requires that evictable allocations are evicted
+	// as soon as possible.
+	DelayedEvictionDisabled
+
+	// DelayedEvictionEnabled requests that the MemoryFile delay eviction of
+	// evictable allocations until doing so is considered necessary to avoid
+	// performance degradation due to host memory pressure, or OOM kills.
+	//
+	// As of this writing, DelayedEvictionEnabled delays evictions until the
+	// reclaimer goroutine is out of work (pages to reclaim), then evicts all
+	// pending evictable allocations immediately.
+	DelayedEvictionEnabled
+
+	// DelayedEvictionManual requires that evictable allocations are only
+	// evicted when MemoryFile.StartEvictions() is called. This is extremely
+	// dangerous outside of tests.
+	DelayedEvictionManual
+)
+
+// usageInfo tracks usage information. It is the value type of
+// MemoryFile.usage.
+//
+// +stateify savable
+type usageInfo struct {
+	// kind is the usage kind.
+	kind usage.MemoryKind
+
+	// knownCommitted is true if the tracked region is definitely committed.
+	// (If it is false, the tracked region may or may not be committed.)
+	knownCommitted bool
+
+	// refs is the number of references held on the tracked region.
+	// MemoryFile.Allocate creates regions with refs == 1 (the reference held
+	// by the caller).
+	refs uint64
+}
+
+// An EvictableMemoryUser represents a user of MemoryFile-allocated memory that
+// may be asked to deallocate that memory in the presence of memory pressure.
+//
+// EvictableMemoryUser values are used as keys of the MemoryFile.evictable
+// map, so the dynamic type of an implementation must be usable as a Go map
+// key (i.e. comparable).
+type EvictableMemoryUser interface {
+	// Evict requests that the EvictableMemoryUser deallocate memory used by
+	// er, which was registered as evictable by a previous call to
+	// MemoryFile.MarkEvictable.
+	//
+	// Evict is not required to deallocate memory. In particular, since pgalloc
+	// must call Evict without holding locks to avoid circular lock ordering,
+	// it is possible that the passed range has already been marked as
+	// unevictable by a racing call to MemoryFile.MarkUnevictable.
+	// Implementations of EvictableMemoryUser must detect such races and handle
+	// them by making Evict have no effect on unevictable ranges.
+	//
+	// After a call to Evict, the MemoryFile will consider the evicted range
+	// unevictable (i.e. it will not call Evict on the same range again) until
+	// informed otherwise by a subsequent call to MarkEvictable.
+	Evict(ctx context.Context, er EvictableRange)
+}
+
+// An EvictableRange represents a range of uint64 offsets in an
+// EvictableMemoryUser.
+//
+// In practice, most EvictableMemoryUsers will probably be implementations of
+// memmap.Mappable, and EvictableRange therefore corresponds to
+// memmap.MappableRange. However, this package cannot depend on the memmap
+// package, since doing so would create a circular dependency.
+//
+// type EvictableRange <generated using go_generics>
+
+// evictableMemoryUserInfo is the value type of MemoryFile.evictable. The map
+// stores pointers to it, so updates through a looked-up entry are visible to
+// all holders of that entry.
+type evictableMemoryUserInfo struct {
+	// ranges tracks all evictable ranges for the given user.
+	ranges evictableRangeSet
+
+	// If evicting is true, there is a goroutine currently evicting all
+	// evictable ranges for this user.
+	evicting bool
+}
+
+const (
+	// chunkShift is log2(chunkSize); shifting a file offset right by
+	// chunkShift yields the number of whole chunks below it.
+	chunkShift = 24
+	chunkSize  = 1 << chunkShift // 16 MB
+	// chunkMask selects the low chunkShift bits of a value.
+	chunkMask = chunkSize - 1
+
+	// initialSize is the size that NewMemoryFile truncates the backing file
+	// to: exactly one chunk.
+	initialSize = chunkSize
+
+	// maxPage is the highest 64-bit page, i.e. math.MaxUint64 rounded down to
+	// a multiple of usermem.PageSize.
+	maxPage = math.MaxUint64 &^ (usermem.PageSize - 1)
+)
+
+// NewMemoryFile creates a MemoryFile backed by the given file. If
+// NewMemoryFile succeeds, ownership of file is transferred to the returned
+// MemoryFile.
+//
+// The file is first truncated to zero bytes and then extended to initialSize,
+// so any previous contents are discarded. An opts.DelayedEviction of
+// DelayedEvictionDefault is replaced by DelayedEvictionEnabled; values other
+// than the declared DelayedEviction* constants are rejected with an error.
+// On success the reclaimer goroutine (runReclaim) has been started.
+func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
+	switch opts.DelayedEviction {
+	case DelayedEvictionDisabled, DelayedEvictionEnabled, DelayedEvictionManual:
+		// An explicitly selected policy is used as-is.
+	case DelayedEvictionDefault:
+		opts.DelayedEviction = DelayedEvictionEnabled
+	default:
+		return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction)
+	}
+
+	// Truncate the file to 0 bytes first to ensure that it's empty, then grow
+	// it to its initial size.
+	for _, size := range []int64{0, initialSize} {
+		if err := file.Truncate(size); err != nil {
+			return nil, err
+		}
+	}
+
+	f := &MemoryFile{
+		opts:      opts,
+		file:      file,
+		fileSize:  initialSize,
+		evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
+		// No pages are reclaimable. DecRef will always be able to
+		// decrease minReclaimablePage from this point.
+		minReclaimablePage: maxPage,
+	}
+	f.mappings.Store(make([]uintptr, initialSize/chunkSize))
+	f.reclaimCond.L = &f.mu
+	go f.runReclaim() // S/R-SAFE: f.mu
+
+	// The Linux kernel contains an optional feature called "Integrity
+	// Measurement Architecture" (IMA). If IMA is enabled, it will checksum
+	// binaries the first time they are mapped PROT_EXEC. This is bad news for
+	// executable pages mapped from our backing file, which can grow to
+	// terabytes in (sparse) size. If IMA attempts to checksum a file that
+	// large, it will allocate all of the sparse pages and quickly exhaust all
+	// memory.
+	//
+	// Work around IMA by immediately creating a temporary PROT_EXEC mapping,
+	// while the backing file is still small. IMA will ignore any future
+	// mappings.
+	m, _, errno := syscall.Syscall6(syscall.SYS_MMAP, 0, usermem.PageSize, syscall.PROT_EXEC, syscall.MAP_SHARED, file.Fd(), 0)
+	if errno != 0 {
+		// This isn't fatal (IMA may not even be in use). Log the error, but
+		// don't return it. There is no mapping to tear down in this case.
+		log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno)
+		return f, nil
+	}
+	// The mapping has served its purpose as soon as it exists; remove it.
+	if _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, usermem.PageSize, 0); errno != 0 {
+		panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno))
+	}
+	return f, nil
+}
+
+// Destroy releases all resources used by f.
+//
+// Destroy itself only sets f.destroyed and signals f.reclaimCond, both while
+// holding f.mu; per the documentation of those fields, this instructs the
+// reclaimer goroutine to release resources and exit.
+//
+// Preconditions: All pages allocated by f have been freed.
+//
+// Postconditions: None of f's methods may be called after Destroy.
+func (f *MemoryFile) Destroy() {
+	f.mu.Lock()
+	f.destroyed = true
+	f.reclaimCond.Signal()
+	f.mu.Unlock()
+}
+
+// Allocate returns a range of initially-zeroed pages of the given length with
+// the given accounting kind and a single reference held by the caller. When
+// the last reference on an allocated page is released, ownership of the page
+// is returned to the MemoryFile, allowing it to be returned by a future call
+// to Allocate.
+//
+// Allocate returns syserror.ENOMEM if the selected range would overflow a
+// (signed 64-bit) file offset or if doubling the backing file size would
+// overflow. If the backing file must grow, any error from Truncate is
+// returned unchanged.
+//
+// Preconditions: length must be page-aligned and non-zero; Allocate panics
+// otherwise.
+func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) {
+	if length == 0 || length%usermem.PageSize != 0 {
+		panic(fmt.Sprintf("invalid allocation length: %#x", length))
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	// Align hugepage-and-larger allocations on hugepage boundaries to try
+	// to take advantage of hugetmpfs.
+	alignment := uint64(usermem.PageSize)
+	if length >= usermem.HugePageSize {
+		alignment = usermem.HugePageSize
+	}
+
+	start, minUnallocatedPage := findUnallocatedRange(&f.usage, f.minUnallocatedPage, length, alignment)
+	end := start + length
+	// File offsets are int64s. Since length must be strictly positive, end
+	// cannot legitimately be 0.
+	if end < start || int64(end) <= 0 {
+		return platform.FileRange{}, syserror.ENOMEM
+	}
+
+	// Expand the file if needed. Double the file size on each expansion;
+	// uncommitted pages have effectively no cost.
+	fileSize := f.fileSize
+	for int64(end) > fileSize {
+		if fileSize >= 2*fileSize {
+			// fileSize overflow.
+			return platform.FileRange{}, syserror.ENOMEM
+		}
+		fileSize *= 2
+	}
+	if fileSize > f.fileSize {
+		if err := f.file.Truncate(fileSize); err != nil {
+			return platform.FileRange{}, err
+		}
+		f.fileSize = fileSize
+		// Publish a larger mappings slice that covers the new chunks. The
+		// existing entries are copied so already-established chunk mappings
+		// remain usable.
+		f.mappingsMu.Lock()
+		oldMappings := f.mappings.Load().([]uintptr)
+		newMappings := make([]uintptr, fileSize>>chunkShift)
+		copy(newMappings, oldMappings)
+		f.mappings.Store(newMappings)
+		f.mappingsMu.Unlock()
+	}
+
+	// Mark selected pages as in use.
+	fr := platform.FileRange{Start: start, End: end}
+	if !f.usage.Add(fr, usageInfo{
+		kind: kind,
+		refs: 1,
+	}) {
+		panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage))
+	}
+
+	if minUnallocatedPage < start {
+		f.minUnallocatedPage = minUnallocatedPage
+	} else {
+		// start was the first unallocated page. The next must be
+		// somewhere beyond end.
+		f.minUnallocatedPage = end
+	}
+
+	return fr, nil
+}
+
+// findUnallocatedRange scans usage, beginning at offset start, for a gap that
+// can hold length bytes. It returns two offsets: the start of the chosen
+// range, and the offset of the first unallocated page found at or after the
+// original start (regardless of whether that page begins a large enough gap).
+//
+// alignment is applied as a mask (alignment-1) when advancing past a segment,
+// so it is presumably expected to be a power of two; the caller in this file
+// passes usermem.PageSize or usermem.HugePageSize. The initial value of start
+// is used as-is and is not rounded up.
+//
+// The returned range start is not checked for overflow here.
+func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uint64, uint64) {
+	// Only searched until the first page is found.
+	firstPage := start
+	foundFirstPage := false
+	alignMask := alignment - 1
+	for seg := usage.LowerBoundSegment(start); seg.Ok(); seg = seg.NextSegment() {
+		r := seg.Range()
+
+		// A segment that begins after firstPage means firstPage itself lies
+		// in a gap, so the first unallocated page has been located.
+		if !foundFirstPage && r.Start > firstPage {
+			foundFirstPage = true
+		}
+
+		if start >= r.End {
+			// start was rounded up to an alignment boundary from the end
+			// of a previous segment and is now beyond r.End.
+			continue
+		}
+		// This segment represents allocated or reclaimable pages; only the
+		// range from start to the segment's beginning is allocatable, and the
+		// next allocatable range begins after the segment.
+		if r.Start > start && r.Start-start >= length {
+			break
+		}
+		// The gap before this segment is too small (or absent); resume the
+		// search at the first aligned offset at or after the segment's end.
+		start = (r.End + alignMask) &^ alignMask
+		if !foundFirstPage {
+			firstPage = r.End
+		}
+	}
+	return start, firstPage
+}
+
+// AllocateAndFill allocates memory of the given kind and fills it by calling
+// r.ReadToBlocks() repeatedly until either length bytes are read or a non-nil
+// error is returned. It returns the memory filled by r, truncated down to the
+// nearest page. If this is shorter than length bytes due to an error returned
+// by r.ReadToBlocks(), it returns that error.
+//
+// If allocation or mapping fails, an empty FileRange is returned together
+// with the error and no memory remains allocated.
+//
+// Preconditions: length > 0. length must be page-aligned.
+func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (platform.FileRange, error) {
+	fr, err := f.Allocate(length, kind)
+	if err != nil {
+		return platform.FileRange{}, err
+	}
+	dsts, err := f.MapInternal(fr, usermem.Write)
+	if err != nil {
+		// Drop the reference taken by Allocate so the pages are not leaked.
+		f.DecRef(fr)
+		return platform.FileRange{}, err
+	}
+	n, err := safemem.ReadFullToBlocks(r, dsts)
+	// Only whole pages are kept; a partially filled trailing page is freed.
+	un := uint64(usermem.Addr(n).RoundDown())
+	if un < length {
+		// Free unused memory and update fr to contain only the memory that is
+		// still allocated.
+		f.DecRef(platform.FileRange{Start: fr.Start + un, End: fr.End})
+		fr.End = fr.Start + un
+	}
+	return fr, err
+}
+
+// fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h.
+//
+// Decommit passes both flags ORed together: per fallocate(2),
+// FALLOC_FL_PUNCH_HOLE must be combined with FALLOC_FL_KEEP_SIZE.
+const (
+	_FALLOC_FL_KEEP_SIZE  = 1
+	_FALLOC_FL_PUNCH_HOLE = 2
+)
+
+// Decommit releases resources associated with maintaining the contents of the
+// given pages. If Decommit succeeds, future accesses of the decommitted pages
+// will read zeroes.
+//
+// Preconditions: fr.Length() > 0.
+func (f *MemoryFile) Decommit(fr platform.FileRange) error {
+	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
+		panic(fmt.Sprintf("invalid range: %v", fr))
+	}
+
+	// "After a successful call, subsequent reads from this range will
+	// return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with
+	// FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2)
+	err := syscall.Fallocate(
+		int(f.file.Fd()),
+		_FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE,
+		int64(fr.Start),
+		int64(fr.Length()))
+	if err != nil {
+		return err
+	}
+	f.markDecommitted(fr)
+	return nil
+}
+
+// markDecommitted clears the knownCommitted flag on every segment in fr and
+// removes the corresponding bytes from both usage.MemoryAccounting and
+// f.usageExpected. It panics if any part of fr is unallocated.
+func (f *MemoryFile) markDecommitted(fr platform.FileRange) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	uncommit := func(seg usageIterator) {
+		info := seg.ValuePtr()
+		if !info.knownCommitted {
+			return
+		}
+		// Drop the usageExpected appropriately.
+		n := seg.Range().Length()
+		usage.MemoryAccounting.Dec(n, info.kind)
+		f.usageExpected -= n
+		info.knownCommitted = false
+	}
+	if gap := f.usage.ApplyContiguous(fr, uncommit); gap.Ok() {
+		panic(fmt.Sprintf("Decommit(%v): attempted to decommit unallocated pages %v:\n%v", fr, gap.Range(), &f.usage))
+	}
+	// The knownCommitted attribute may have changed anywhere in fr, so merge
+	// across the entire range to keep the usage tree minimal.
+	f.usage.MergeRange(fr)
+}
+
+// IncRef implements platform.File.IncRef.
+//
+// It adds one reference to every page in fr. It panics if fr is malformed,
+// empty or not page-aligned, or if any part of fr is unallocated.
+func (f *MemoryFile) IncRef(fr platform.FileRange) {
+	switch {
+	case !fr.WellFormed(), fr.Length() == 0, fr.Start%usermem.PageSize != 0, fr.End%usermem.PageSize != 0:
+		panic(fmt.Sprintf("invalid range: %v", fr))
+	}
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	addRef := func(seg usageIterator) {
+		seg.ValuePtr().refs++
+	}
+	if gap := f.usage.ApplyContiguous(fr, addRef); gap.Ok() {
+		panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage))
+	}
+
+	f.usage.MergeAdjacent(fr)
+}
+
+// DecRef implements platform.File.DecRef.
+//
+// It drops one reference from each usage segment overlapping fr. Segments
+// whose reference count reaches zero are reclassified as usage.System and the
+// reclaimer goroutine is signaled so that it can reclaim them.
+//
+// DecRef panics if fr is malformed, empty or not page-aligned, or if a
+// visited segment already has zero references.
+//
+// NOTE(review): unlike IncRef, this loop does not check for unallocated gaps
+// inside fr; it appears that such gaps are skipped silently - confirm that
+// this is intended.
+func (f *MemoryFile) DecRef(fr platform.FileRange) {
+	if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 {
+		panic(fmt.Sprintf("invalid range: %v", fr))
+	}
+
+	// freed records whether any segment dropped to zero references.
+	var freed bool
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	for seg := f.usage.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; seg = seg.NextSegment() {
+		// Split the segment so that only the part inside fr is modified.
+		seg = f.usage.Isolate(seg, fr)
+		val := seg.ValuePtr()
+		if val.refs == 0 {
+			panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage))
+		}
+		val.refs--
+		if val.refs == 0 {
+			freed = true
+			// Reclassify memory as System, until it's freed by the reclaim
+			// goroutine.
+			if val.knownCommitted {
+				usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind)
+			}
+			val.kind = usage.System
+		}
+	}
+	f.usage.MergeAdjacent(fr)
+
+	if freed {
+		if fr.Start < f.minReclaimablePage {
+			// We've freed at least one lower page.
+			f.minReclaimablePage = fr.Start
+		}
+		f.reclaimable = true
+		f.reclaimCond.Signal()
+	}
+}
+
+// MapInternal implements platform.File.MapInternal.
+//
+// It returns a BlockSeq holding one block per chunk that overlaps fr.
+// Executable access is refused with syserror.EACCES. MapInternal panics if fr
+// is malformed or empty.
+func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+	if !fr.WellFormed() || fr.Length() == 0 {
+		panic(fmt.Sprintf("invalid range: %v", fr))
+	}
+	if at.Execute {
+		return safemem.BlockSeq{}, syserror.EACCES
+	}
+
+	firstChunk := fr.Start >> chunkShift
+	endChunk := (fr.End + chunkMask) >> chunkShift
+	if nchunks := endChunk - firstChunk; nchunks != 1 {
+		blocks := make([]safemem.Block, 0, nchunks)
+		err := f.forEachMappingSlice(fr, func(bs []byte) {
+			blocks = append(blocks, safemem.BlockFromSafeSlice(bs))
+		})
+		return safemem.BlockSeqFromSlice(blocks), err
+	}
+
+	// fr lies within a single chunk: avoid an unnecessary slice allocation.
+	var single safemem.BlockSeq
+	err := f.forEachMappingSlice(fr, func(bs []byte) {
+		single = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs))
+	})
+	return single, err
+}
+
+// forEachMappingSlice invokes fn on a sequence of byte slices that
+// collectively map all bytes in fr.
+//
+// fn is called once per chunk overlapping fr, in increasing offset order,
+// with the slice trimmed to the part of the chunk that lies inside fr. Chunks
+// that are not yet mapped are mapped on demand via getChunkMapping; if that
+// fails, iteration stops and the error is returned (fn has already been
+// called for any earlier chunks).
+func (f *MemoryFile) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error {
+	mappings := f.mappings.Load().([]uintptr)
+	for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize {
+		chunk := int(chunkStart >> chunkShift)
+		m := atomic.LoadUintptr(&mappings[chunk])
+		if m == 0 {
+			// Not mapped yet. getChunkMapping also returns the current
+			// mappings slice, which replaces our local copy.
+			var err error
+			mappings, m, err = f.getChunkMapping(chunk)
+			if err != nil {
+				return err
+			}
+		}
+		// Clamp the slice to [fr.Start, fr.End) within this chunk.
+		startOff := uint64(0)
+		if chunkStart < fr.Start {
+			startOff = fr.Start - chunkStart
+		}
+		endOff := uint64(chunkSize)
+		if chunkStart+chunkSize > fr.End {
+			endOff = fr.End - chunkStart
+		}
+		fn(unsafeSlice(m, chunkSize)[startOff:endOff])
+	}
+	return nil
+}
+
+// getChunkMapping returns the address at which the given chunk of the backing
+// file is mapped, creating a shared read/write mapping of chunkSize bytes if
+// none exists yet. It also returns the mappings slice that is current while
+// f.mappingsMu is held. If mmap fails it returns (nil, 0, errno).
+func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) {
+	f.mappingsMu.Lock()
+	defer f.mappingsMu.Unlock()
+	// Another thread may have replaced f.mappings altogether due to file
+	// expansion.
+	mappings := f.mappings.Load().([]uintptr)
+	// Another thread may have already mapped the chunk.
+	if m := mappings[chunk]; m != 0 {
+		return mappings, m, nil
+	}
+	m, _, errno := syscall.Syscall6(
+		syscall.SYS_MMAP,
+		0,
+		chunkSize,
+		syscall.PROT_READ|syscall.PROT_WRITE,
+		syscall.MAP_SHARED,
+		f.file.Fd(),
+		uintptr(chunk<<chunkShift))
+	if errno != 0 {
+		return nil, 0, errno
+	}
+	// Stored atomically because forEachMappingSlice reads entries with
+	// atomic.LoadUintptr without holding f.mappingsMu.
+	atomic.StoreUintptr(&mappings[chunk], m)
+	return mappings, m, nil
+}
+
+// MarkEvictable records er as evictable on behalf of user, which permits f to
+// request memory deallocation by calling user.Evict(er) in the future.
+//
+// Redundantly marking an already-evictable range as evictable has no effect.
+func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	info, known := f.evictable[user]
+	if !known {
+		info = &evictableMemoryUserInfo{}
+		f.evictable[user] = info
+	}
+
+	// Fill in every part of er that is not already recorded as evictable.
+	for gap := info.ranges.LowerBoundGap(er.Start); gap.Ok() && gap.Start() < er.End; {
+		overlap := gap.Range().Intersect(er)
+		if overlap.Length() != 0 {
+			gap = info.ranges.Insert(gap, overlap, evictableRangeSetValue{}).NextGap()
+		} else {
+			gap = gap.NextGap()
+		}
+	}
+
+	if info.evicting {
+		return
+	}
+	if mode := f.opts.DelayedEviction; mode == DelayedEvictionDisabled {
+		// Kick off eviction immediately.
+		f.startEvictionGoroutineLocked(user, info)
+	} else if mode == DelayedEvictionEnabled {
+		// Ensure that the reclaimer goroutine is running, so that it can
+		// start eviction when necessary.
+		f.reclaimCond.Signal()
+	}
+}
+
+// MarkUnevictable tells f that user no longer considers er to be evictable,
+// so the MemoryFile should no longer call user.Evict(er). Note that, per
+// EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be
+// called even after MarkUnevictable returns due to race conditions, and
+// implementations of EvictableMemoryUser must handle this possibility.
+//
+// Redundantly marking an already-unevictable range as unevictable has no
+// effect.
+func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	info, known := f.evictable[user]
+	if !known {
+		return
+	}
+	for seg := info.ranges.LowerBoundSegment(er.Start); seg.Ok() && seg.Start() < er.End; {
+		isolated := info.ranges.Isolate(seg, er)
+		seg = info.ranges.Remove(isolated).NextSegment()
+	}
+	// The entry must stay while an eviction goroutine is running on its
+	// behalf, and while any evictable range remains.
+	if info.evicting || !info.ranges.IsEmpty() {
+		return
+	}
+	delete(f.evictable, user)
+}
+
+// MarkAllUnevictable tells f that user no longer considers any offsets to be
+// evictable. It otherwise has the same semantics as MarkUnevictable.
+func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	info, known := f.evictable[user]
+	if !known {
+		return
+	}
+	info.ranges.RemoveAll()
+	if info.evicting {
+		// An eviction goroutine is running on behalf of user, so the entry
+		// must stay in the map.
+		return
+	}
+	delete(f.evictable, user)
+}
+
+// UpdateUsage brings the memory usage statistics in usage.MemoryAccounting up
+// to date. The full scan is skipped when the backing file's reported usage
+// already matches what the usage tree accounts for, and is rate-limited to
+// once per second when the only difference is previously observed swap.
+func (f *MemoryFile) UpdateUsage() error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	currentUsage, err := f.TotalUsage()
+	if err != nil {
+		return err
+	}
+
+	switch {
+	case currentUsage == f.usageExpected && f.usageSwapped == 0:
+		// The underlying usage matches what the usage tree already
+		// represents, so the tree is known to be accurate.
+		log.Debugf("UpdateUsage: skipped with usageSwapped=0.")
+		return nil
+	case currentUsage == f.usageExpected+f.usageSwapped && time.Now().Before(f.usageLast.Add(time.Second)):
+		// Usage matches once swap is included; only rescan if at least a
+		// second has passed since the previous scan.
+		log.Debugf("UpdateUsage: skipped with usageSwapped!=0.")
+		return nil
+	}
+
+	f.usageLast = time.Now()
+	err = f.updateUsageLocked(currentUsage, mincore)
+	log.Debugf("UpdateUsage: currentUsage=%d, usageExpected=%d, usageSwapped=%d.",
+		currentUsage, f.usageExpected, f.usageSwapped)
+	log.Debugf("UpdateUsage: took %v.", time.Since(f.usageLast))
+	return err
+}
+
+// updateUsageLocked attempts to detect commitment of previously-uncommitted
+// pages by invoking checkCommitted, which is a function that, for each page i
+// in bs, sets committed[i] to 1 if the page is committed and 0 otherwise.
+//
+// Every run of newly committed pages is split into its own usage segment,
+// marked knownCommitted, and added to usage.MemoryAccounting and
+// f.usageExpected. On return (including on error) the swap estimate
+// f.usageSwapped is recomputed from currentUsage, which is the caller's
+// measurement of the backing file's total usage.
+//
+// The first error from checkCommitted or from mapping a chunk aborts the scan
+// and is returned.
+//
+// Precondition: f.mu must be held.
+func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(bs []byte, committed []byte) error) error {
+	// Track if anything changed to elide the merge. In the common case, we
+	// expect all segments to be committed and no merge to occur.
+	changedAny := false
+	defer func() {
+		if changedAny {
+			f.usage.MergeAll()
+		}
+
+		// Adjust the swap usage to reflect reality.
+		if f.usageExpected < currentUsage {
+			// Since no pages may be marked decommitted while we hold mu, we
+			// know that usage may have only increased since we got the last
+			// current usage. Therefore, if usageExpected is still short of
+			// currentUsage, we must assume that the difference is in pages
+			// that have been swapped.
+			newUsageSwapped := currentUsage - f.usageExpected
+			if f.usageSwapped < newUsageSwapped {
+				usage.MemoryAccounting.Inc(newUsageSwapped-f.usageSwapped, usage.System)
+			} else {
+				usage.MemoryAccounting.Dec(f.usageSwapped-newUsageSwapped, usage.System)
+			}
+			f.usageSwapped = newUsageSwapped
+		} else if f.usageSwapped != 0 {
+			// We have more usage accounted for than the file itself.
+			// That's fine, we probably caught a race where pages were
+			// being committed while the above loop was running. Just
+			// report the higher number that we found and ignore swap.
+			usage.MemoryAccounting.Dec(f.usageSwapped, usage.System)
+			f.usageSwapped = 0
+		}
+	}()
+
+	// Reused mincore buffer, will generally be <= 4096 bytes.
+	var buf []byte
+
+	// Iterate over all usage data. There will only be usage segments
+	// present when there is an associated reference.
+	for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+		// val is a copy of the segment's value taken before any splitting
+		// below; its kind is used for accounting of the committed runs.
+		val := seg.Value()
+
+		// Already known to be committed; ignore.
+		if val.knownCommitted {
+			continue
+		}
+
+		// Assume that reclaimable pages (that aren't already known to be
+		// committed) are not committed. This isn't necessarily true, even
+		// after the reclaimer does Decommit(), because the kernel may
+		// subsequently back the hugepage-sized region containing the
+		// decommitted page with a hugepage. However, it's consistent with our
+		// treatment of unallocated pages, which have the same property.
+		if val.refs == 0 {
+			continue
+		}
+
+		// Get the range for this segment. As we touch slices, the
+		// Start value will be walked along.
+		r := seg.Range()
+
+		// checkErr carries a checkCommitted failure out of the callback,
+		// which has no return value of its own.
+		var checkErr error
+		err := f.forEachMappingSlice(r, func(s []byte) {
+			if checkErr != nil {
+				return
+			}
+
+			// Ensure that we have sufficient buffer for the call
+			// (one byte per page). The length of each slice must
+			// be page-aligned.
+			bufLen := len(s) / usermem.PageSize
+			if len(buf) < bufLen {
+				buf = make([]byte, bufLen)
+			}
+
+			// Query for new pages in core.
+			if err := checkCommitted(s, buf); err != nil {
+				checkErr = err
+				return
+			}
+
+			// Scan each page and switch out segments.
+			populatedRun := false
+			populatedRunStart := 0
+			for i := 0; i <= bufLen; i++ {
+				// We run past the end of the slice here to
+				// simplify the logic and only set populated if
+				// we're still looking at elements.
+				populated := false
+				if i < bufLen {
+					// Only the low bit of each entry is examined.
+					populated = buf[i]&0x1 != 0
+				}
+
+				switch {
+				case populated == populatedRun:
+					// Keep the run going.
+					continue
+				case populated && !populatedRun:
+					// Begin the run.
+					populatedRun = true
+					populatedRunStart = i
+					// Keep going.
+					continue
+				case !populated && populatedRun:
+					// Finish the run by changing this segment.
+					runRange := platform.FileRange{
+						Start: r.Start + uint64(populatedRunStart*usermem.PageSize),
+						End:   r.Start + uint64(i*usermem.PageSize),
+					}
+					seg = f.usage.Isolate(seg, runRange)
+					seg.ValuePtr().knownCommitted = true
+					// Advance the segment only if we still
+					// have work to do in the context of
+					// the original segment from the for
+					// loop. Otherwise, the for loop itself
+					// will advance the segment
+					// appropriately.
+					if runRange.End != r.End {
+						seg = seg.NextSegment()
+					}
+					amount := runRange.Length()
+					usage.MemoryAccounting.Inc(amount, val.kind)
+					f.usageExpected += amount
+					changedAny = true
+					populatedRun = false
+				}
+			}
+
+			// Advance r.Start.
+			r.Start += uint64(len(s))
+		})
+		if checkErr != nil {
+			return checkErr
+		}
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// TotalUsage returns an aggregate usage for all memory statistics except
+// Mapped (which is external to MemoryFile). This is generally much cheaper
+// than UpdateUsage, but will not provide a fine-grained breakdown.
+//
+// The value is derived from fstat(2) on the backing file; any error from
+// fstat is returned with a usage of 0.
+func (f *MemoryFile) TotalUsage() (uint64, error) {
+	var st syscall.Stat_t
+	err := syscall.Fstat(int(f.file.Fd()), &st)
+	if err != nil {
+		return 0, err
+	}
+	// stat(2) always reports the allocated block count in units of 512
+	// bytes. This includes pages in the page cache and swapped pages.
+	return uint64(st.Blocks * 512), nil
+}
+
+// TotalSize reports the current size of the backing file in bytes, which is
+// an upper bound on the amount of memory that can currently be allocated from
+// the MemoryFile. The value returned by TotalSize is permitted to change.
+func (f *MemoryFile) TotalSize() uint64 {
+	f.mu.Lock()
+	size := uint64(f.fileSize)
+	f.mu.Unlock()
+	return size
+}
+
+// File returns the backing file. f.file is read without holding f.mu.
+func (f *MemoryFile) File() *os.File {
+	return f.file
+}
+
+// FD implements platform.File.FD.
+//
+// It returns the file descriptor of the backing file, converted to int.
+func (f *MemoryFile) FD() int {
+	return int(f.file.Fd())
+}
+
+// String implements fmt.Stringer.String.
+//
+// It returns the string form of f's usage set.
+//
+// Note that because f.String locks f.mu, calling f.String internally
+// (including indirectly through the fmt package) risks recursive locking.
+// Within the pgalloc package, use f.usage directly instead.
+func (f *MemoryFile) String() string {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	return f.usage.String()
+}
+
+// runReclaim implements the reclaimer goroutine, which continuously decommits
+// reclaimable pages in order to reduce memory usage and make them available
+// for allocation.
+//
+// The loop ends only when findReclaimable reports that f has been destroyed.
+// runReclaim then closes the backing file, unmaps every chunk mapping and
+// replaces f.mappings with an empty slice.
+func (f *MemoryFile) runReclaim() {
+	for {
+		fr, ok := f.findReclaimable()
+		if !ok {
+			break
+		}
+
+		if err := f.Decommit(fr); err != nil {
+			log.Warningf("Reclaim failed to decommit %v: %v", fr, err)
+			// Zero the pages manually. This won't reduce memory usage, but at
+			// least ensures that the pages will be zero when reallocated.
+			if zeroErr := f.forEachMappingSlice(fr, func(bs []byte) {
+				for i := range bs {
+					bs[i] = 0
+				}
+			}); zeroErr != nil {
+				// forEachMappingSlice stops at the first chunk it cannot
+				// map, so part of fr may not have been zeroed. Surface this
+				// instead of dropping the error silently.
+				log.Warningf("Reclaim failed to zero %v: %v", fr, zeroErr)
+			}
+			// Pretend the pages were decommitted even though they weren't,
+			// since the memory accounting implementation has no idea how to
+			// deal with this.
+			f.markDecommitted(fr)
+		}
+		f.markReclaimed(fr)
+	}
+	// We only get here if findReclaimable finds f.destroyed set and returns
+	// false.
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	if !f.destroyed {
+		panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set")
+	}
+	f.file.Close()
+	// Ensure that any attempts to use f.file.Fd() fail instead of getting a fd
+	// that has possibly been reassigned.
+	f.file = nil
+	f.mappingsMu.Lock()
+	defer f.mappingsMu.Unlock()
+	mappings := f.mappings.Load().([]uintptr)
+	for i, m := range mappings {
+		if m != 0 {
+			_, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0)
+			if errno != 0 {
+				log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno)
+			}
+		}
+	}
+	// Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.)
+	f.mappings.Store([]uintptr{})
+}
+
+// findReclaimable blocks until there is a reclaimable range or f has been
+// destroyed. On success it returns the range of the first segment at or after
+// f.minReclaimablePage that has no references, and true; it also advances
+// f.minReclaimablePage to the end of that segment. It returns false only when
+// f.destroyed is set.
+func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	for {
+		// Wait until reclaimable pages are reported or f is destroyed.
+		for {
+			if f.destroyed {
+				return platform.FileRange{}, false
+			}
+			if f.reclaimable {
+				break
+			}
+			if f.opts.DelayedEviction == DelayedEvictionEnabled {
+				// No work to do. Evict any pending evictable allocations to
+				// get more reclaimable pages before going to sleep.
+				f.startEvictionsLocked()
+			}
+			f.reclaimCond.Wait()
+		}
+		// Allocate returns the first usable range in offset order and is
+		// currently a linear scan, so reclaiming from the beginning of the
+		// file minimizes the expected latency of Allocate.
+		for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() {
+			if seg.ValuePtr().refs == 0 {
+				f.minReclaimablePage = seg.End()
+				return seg.Range(), true
+			}
+		}
+		// No pages are reclaimable. Reset the hints and go back to waiting.
+		f.reclaimable = false
+		f.minReclaimablePage = maxPage
+	}
+}
+
+// markReclaimed removes fr from the usage set, making its pages available to
+// Allocate again, and lowers f.minUnallocatedPage if fr starts below it.
+//
+// It panics unless all of fr lies within a single segment that is uncommitted,
+// has no references and is accounted to usage.System.
+func (f *MemoryFile) markReclaimed(fr platform.FileRange) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	seg := f.usage.FindSegment(fr.Start)
+	switch {
+	case !seg.Ok():
+		panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage))
+	case !seg.Range().IsSupersetOf(fr):
+		panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage))
+	}
+
+	want := usageInfo{
+		kind:           usage.System,
+		knownCommitted: false,
+		refs:           0,
+	}
+	if got := seg.Value(); got != want {
+		panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage))
+	}
+
+	// Even though all of seg is reclaimable, the caller may not have
+	// decommitted all of it, so only fr itself is deallocated.
+	isolated := f.usage.Isolate(seg, fr)
+	f.usage.Remove(isolated)
+	if f.minUnallocatedPage > fr.Start {
+		// We've deallocated at least one lower page.
+		f.minUnallocatedPage = fr.Start
+	}
+}
+
+// StartEvictions requests that f evict all evictable allocations. It does not
+// wait for eviction to complete; for this, see MemoryFile.WaitForEvictions.
+//
+// It acquires f.mu and delegates to startEvictionsLocked.
+func (f *MemoryFile) StartEvictions() {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.startEvictionsLocked()
+}
+
+// startEvictionsLocked starts an eviction goroutine for every user in
+// f.evictable that does not already have one running.
+//
+// Preconditions: f.mu must be locked.
+func (f *MemoryFile) startEvictionsLocked() {
+	for user, info := range f.evictable {
+		if info.evicting {
+			// Don't start multiple goroutines to evict the same user's
+			// allocations.
+			continue
+		}
+		f.startEvictionGoroutineLocked(user, info)
+	}
+}
+
+// startEvictionGoroutineLocked marks info as evicting and starts a goroutine
+// that repeatedly removes the last range from user's evictable set and calls
+// user.Evict on it. The goroutine deletes user's entry from f.evictable and
+// exits once no ranges remain. It is tracked by f.evictionWG.
+//
+// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be
+// locked.
+func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) {
+	info.evicting = true
+	f.evictionWG.Add(1)
+	go func() { // S/R-SAFE: f.evictionWG
+		defer f.evictionWG.Done()
+		for {
+			f.mu.Lock()
+			// Re-read the entry under the lock on each iteration; this
+			// shadows the outer info parameter.
+			info, ok := f.evictable[user]
+			if !ok {
+				// This shouldn't happen: only this goroutine is permitted
+				// to delete this entry.
+				f.mu.Unlock()
+				panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user))
+			}
+			if info.ranges.IsEmpty() {
+				delete(f.evictable, user)
+				f.mu.Unlock()
+				return
+			}
+			// Evict from the end of info.ranges, under the assumption that
+			// if ranges in user start being used again (and are
+			// consequently marked unevictable), such uses are more likely
+			// to start from the beginning of user.
+			seg := info.ranges.LastSegment()
+			er := seg.Range()
+			info.ranges.Remove(seg)
+			// user.Evict() must be called without holding f.mu to avoid
+			// circular lock ordering.
+			f.mu.Unlock()
+			user.Evict(context.Background(), er)
+		}
+	}()
+}
+
+// WaitForEvictions blocks until f is no longer evicting any evictable
+// allocations, i.e. until every eviction goroutine tracked by f.evictionWG
+// has finished. It does not take f.mu.
+func (f *MemoryFile) WaitForEvictions() {
+	f.evictionWG.Wait()
+}
+
+// usageSetFunctions provides key bounds and value operations over usageInfo
+// values keyed by platform.FileRange; presumably it parameterizes the
+// generated usageSet type (confirm against the set's declaration).
+type usageSetFunctions struct{}
+
+// MinKey returns the smallest possible key, 0.
+func (usageSetFunctions) MinKey() uint64 {
+	return 0
+}
+
+// MaxKey returns the largest possible key, math.MaxUint64.
+func (usageSetFunctions) MaxKey() uint64 {
+	return math.MaxUint64
+}
+
+// ClearValue is a no-op.
+func (usageSetFunctions) ClearValue(val *usageInfo) {
+}
+
+// Merge reports whether two values may be merged, which is the case only when
+// they are equal; the first value is returned as the merged value.
+func (usageSetFunctions) Merge(_ platform.FileRange, val1 usageInfo, _ platform.FileRange, val2 usageInfo) (usageInfo, bool) {
+	return val1, val1 == val2
+}
+
+// Split returns val unchanged for both halves of a split.
+func (usageSetFunctions) Split(_ platform.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) {
+	return val, val
+}
+
+// evictableRangeSetValue is the value type of evictableRangeSet. It is empty,
+// so it carries no per-range data.
+type evictableRangeSetValue struct{}
+
+// evictableRangeSetFunctions provides key bounds and value operations over
+// evictableRangeSetValue values keyed by EvictableRange; presumably it
+// parameterizes the generated evictableRangeSet type (confirm against the
+// set's declaration).
+type evictableRangeSetFunctions struct{}
+
+// MinKey returns the smallest possible key, 0.
+func (evictableRangeSetFunctions) MinKey() uint64 {
+	return 0
+}
+
+// MaxKey returns the largest possible key, math.MaxUint64.
+func (evictableRangeSetFunctions) MaxKey() uint64 {
+	return math.MaxUint64
+}
+
+// ClearValue is a no-op.
+func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) {
+}
+
+// Merge always permits merging, since the values carry no data.
+func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) {
+	return evictableRangeSetValue{}, true
+}
+
+// Split returns an empty value for both halves of a split.
+func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) {
+	return evictableRangeSetValue{}, evictableRangeSetValue{}
+}
diff --git a/pkg/sentry/pgalloc/pgalloc_state_autogen.go b/pkg/sentry/pgalloc/pgalloc_state_autogen.go
new file mode 100755
index 000000000..36a5aafa1
--- /dev/null
+++ b/pkg/sentry/pgalloc/pgalloc_state_autogen.go
@@ -0,0 +1,146 @@
+// automatically generated by stateify.
+
+package pgalloc
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+// beforeSave is an empty hook called at the start of save.
+func (x *EvictableRange) beforeSave() {}
+// save stores the Start and End fields of x in m.
+func (x *EvictableRange) save(m state.Map) {
+	x.beforeSave()
+	m.Save("Start", &x.Start)
+	m.Save("End", &x.End)
+}
+
+// afterLoad is an empty hook.
+func (x *EvictableRange) afterLoad() {}
+// load restores the Start and End fields of x from m.
+func (x *EvictableRange) load(m state.Map) {
+	m.Load("Start", &x.Start)
+	m.Load("End", &x.End)
+}
+
+// beforeSave is an empty hook called at the start of save.
+func (x *evictableRangeSet) beforeSave() {}
+// save stores the result of x.saveRoot() in m under the key "root".
+func (x *evictableRangeSet) save(m state.Map) {
+	x.beforeSave()
+	var root *evictableRangeSegmentDataSlices = x.saveRoot()
+	m.SaveValue("root", root)
+}
+
+// afterLoad is an empty hook.
+func (x *evictableRangeSet) afterLoad() {}
+// load reads the "root" value from m and passes it to x.loadRoot.
+func (x *evictableRangeSet) load(m state.Map) {
+	m.LoadValue("root", new(*evictableRangeSegmentDataSlices), func(y interface{}) { x.loadRoot(y.(*evictableRangeSegmentDataSlices)) })
+}
+
+// beforeSave is an empty hook called at the start of save.
+func (x *evictableRangenode) beforeSave() {}
+// save stores every field of x in m, keyed by field name.
+func (x *evictableRangenode) save(m state.Map) {
+	x.beforeSave()
+	m.Save("nrSegments", &x.nrSegments)
+	m.Save("parent", &x.parent)
+	m.Save("parentIndex", &x.parentIndex)
+	m.Save("hasChildren", &x.hasChildren)
+	m.Save("keys", &x.keys)
+	m.Save("values", &x.values)
+	m.Save("children", &x.children)
+}
+
+// afterLoad is an empty hook.
+func (x *evictableRangenode) afterLoad() {}
+// load restores the fields written by save from m.
+func (x *evictableRangenode) load(m state.Map) {
+	m.Load("nrSegments", &x.nrSegments)
+	m.Load("parent", &x.parent)
+	m.Load("parentIndex", &x.parentIndex)
+	m.Load("hasChildren", &x.hasChildren)
+	m.Load("keys", &x.keys)
+	m.Load("values", &x.values)
+	m.Load("children", &x.children)
+}
+
+// beforeSave is an empty hook called at the start of save.
+func (x *evictableRangeSegmentDataSlices) beforeSave() {}
+// save stores the Start, End and Values fields of x in m.
+func (x *evictableRangeSegmentDataSlices) save(m state.Map) {
+	x.beforeSave()
+	m.Save("Start", &x.Start)
+	m.Save("End", &x.End)
+	m.Save("Values", &x.Values)
+}
+
+// afterLoad is an empty hook.
+func (x *evictableRangeSegmentDataSlices) afterLoad() {}
+// load restores the Start, End and Values fields of x from m.
+func (x *evictableRangeSegmentDataSlices) load(m state.Map) {
+	m.Load("Start", &x.Start)
+	m.Load("End", &x.End)
+	m.Load("Values", &x.Values)
+}
+
+// beforeSave is an empty hook called at the start of save.
+func (x *usageInfo) beforeSave() {}
+// save stores the kind, knownCommitted and refs fields of x in m.
+func (x *usageInfo) save(m state.Map) {
+	x.beforeSave()
+	m.Save("kind", &x.kind)
+	m.Save("knownCommitted", &x.knownCommitted)
+	m.Save("refs", &x.refs)
+}
+
+// afterLoad is an empty hook.
+func (x *usageInfo) afterLoad() {}
+// load restores the kind, knownCommitted and refs fields of x from m.
+func (x *usageInfo) load(m state.Map) {
+	m.Load("kind", &x.kind)
+	m.Load("knownCommitted", &x.knownCommitted)
+	m.Load("refs", &x.refs)
+}
+
+// beforeSave is an empty hook called at the start of save.
+func (x *usageSet) beforeSave() {}
+// save stores the result of x.saveRoot() in m under the key "root".
+func (x *usageSet) save(m state.Map) {
+	x.beforeSave()
+	var root *usageSegmentDataSlices = x.saveRoot()
+	m.SaveValue("root", root)
+}
+
+// afterLoad is an empty hook.
+func (x *usageSet) afterLoad() {}
+// load reads the "root" value from m and passes it to x.loadRoot.
+func (x *usageSet) load(m state.Map) {
+	m.LoadValue("root", new(*usageSegmentDataSlices), func(y interface{}) { x.loadRoot(y.(*usageSegmentDataSlices)) })
+}
+
+// beforeSave is an empty hook called at the start of save.
+func (x *usagenode) beforeSave() {}
+// save stores every field of x in m, keyed by field name.
+func (x *usagenode) save(m state.Map) {
+	x.beforeSave()
+	m.Save("nrSegments", &x.nrSegments)
+	m.Save("parent", &x.parent)
+	m.Save("parentIndex", &x.parentIndex)
+	m.Save("hasChildren", &x.hasChildren)
+	m.Save("keys", &x.keys)
+	m.Save("values", &x.values)
+	m.Save("children", &x.children)
+}
+
+// afterLoad is an empty hook.
+func (x *usagenode) afterLoad() {}
+// load restores the fields written by save from m.
+func (x *usagenode) load(m state.Map) {
+	m.Load("nrSegments", &x.nrSegments)
+	m.Load("parent", &x.parent)
+	m.Load("parentIndex", &x.parentIndex)
+	m.Load("hasChildren", &x.hasChildren)
+	m.Load("keys", &x.keys)
+	m.Load("values", &x.values)
+	m.Load("children", &x.children)
+}
+
+// beforeSave is an empty hook called at the start of save.
+func (x *usageSegmentDataSlices) beforeSave() {}
+// save stores the Start, End and Values fields of x in m.
+func (x *usageSegmentDataSlices) save(m state.Map) {
+	x.beforeSave()
+	m.Save("Start", &x.Start)
+	m.Save("End", &x.End)
+	m.Save("Values", &x.Values)
+}
+
+// afterLoad is an empty hook.
+func (x *usageSegmentDataSlices) afterLoad() {}
+// load restores the Start, End and Values fields of x from m.
+func (x *usageSegmentDataSlices) load(m state.Map) {
+	m.Load("Start", &x.Start)
+	m.Load("End", &x.End)
+	m.Load("Values", &x.Values)
+}
+
+// init registers each saveable type in this file with the state package,
+// under its package-qualified name, together with its save and load methods.
+func init() {
+	state.Register("pgalloc.EvictableRange", (*EvictableRange)(nil), state.Fns{Save: (*EvictableRange).save, Load: (*EvictableRange).load})
+	state.Register("pgalloc.evictableRangeSet", (*evictableRangeSet)(nil), state.Fns{Save: (*evictableRangeSet).save, Load: (*evictableRangeSet).load})
+	state.Register("pgalloc.evictableRangenode", (*evictableRangenode)(nil), state.Fns{Save: (*evictableRangenode).save, Load: (*evictableRangenode).load})
+	state.Register("pgalloc.evictableRangeSegmentDataSlices", (*evictableRangeSegmentDataSlices)(nil), state.Fns{Save: (*evictableRangeSegmentDataSlices).save, Load: (*evictableRangeSegmentDataSlices).load})
+	state.Register("pgalloc.usageInfo", (*usageInfo)(nil), state.Fns{Save: (*usageInfo).save, Load: (*usageInfo).load})
+	state.Register("pgalloc.usageSet", (*usageSet)(nil), state.Fns{Save: (*usageSet).save, Load: (*usageSet).load})
+	state.Register("pgalloc.usagenode", (*usagenode)(nil), state.Fns{Save: (*usagenode).save, Load: (*usagenode).load})
+	state.Register("pgalloc.usageSegmentDataSlices", (*usageSegmentDataSlices)(nil), state.Fns{Save: (*usageSegmentDataSlices).save, Load: (*usageSegmentDataSlices).load})
+}
diff --git a/pkg/sentry/pgalloc/pgalloc_unsafe.go b/pkg/sentry/pgalloc/pgalloc_unsafe.go
new file mode 100644
index 000000000..a4b5d581c
--- /dev/null
+++ b/pkg/sentry/pgalloc/pgalloc_unsafe.go
@@ -0,0 +1,40 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pgalloc
+
+import (
+	"reflect"
+	"syscall"
+	"unsafe"
+)
+
+// unsafeSlice returns a byte slice whose header points at addr, with both its
+// length and its capacity set to length. Nothing is allocated or copied.
+func unsafeSlice(addr uintptr, length int) (slice []byte) {
+	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
+	hdr.Data, hdr.Len, hdr.Cap = addr, length, length
+	return slice
+}
+
+// mincore issues the SYS_MINCORE system call with the address of s[0], len(s)
+// and the address of buf[0]. It returns the errno if that is non-zero, else nil.
+func mincore(s []byte, buf []byte) error {
+	start, vec := unsafe.Pointer(&s[0]), unsafe.Pointer(&buf[0])
+	_, _, errno := syscall.RawSyscall(syscall.SYS_MINCORE, uintptr(start), uintptr(len(s)), uintptr(vec))
+	if errno == 0 {
+		return nil
+	}
+	return errno
+}
diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go
new file mode 100644
index 000000000..d4ba384b1
--- /dev/null
+++ b/pkg/sentry/pgalloc/save_restore.go
@@ -0,0 +1,210 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pgalloc
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"runtime"
+	"sync/atomic"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+// SaveTo writes f's file size, usage set and committed page contents to w.
+func (f *MemoryFile) SaveTo(w io.Writer) error {
+	// Wait for reclaim: keep signaling reclaimCond and yielding until it clears.
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	for f.reclaimable {
+		f.reclaimCond.Signal()
+		f.mu.Unlock()
+		runtime.Gosched()
+		f.mu.Lock()
+	}
+
+	// Ensure that there are no pending evictions; saving with any is a caller bug.
+	if len(f.evictable) != 0 {
+		panic(fmt.Sprintf("evictions still pending for %d users; call StartEvictions and WaitForEvictions before SaveTo", len(f.evictable)))
+	}
+
+	// Ensure that all pages that contain data have knownCommitted set, since
+	// we only store knownCommitted pages below.
+	zeroPage := make([]byte, usermem.PageSize)
+	err := f.updateUsageLocked(0, func(bs []byte, committed []byte) error {
+		for pgoff := 0; pgoff < len(bs); pgoff += usermem.PageSize {
+			i := pgoff / usermem.PageSize
+			pg := bs[pgoff : pgoff+usermem.PageSize]
+			if !bytes.Equal(pg, zeroPage) {
+				committed[i] = 1
+				continue
+			}
+			committed[i] = 0
+			// Reading the page caused it to be committed; decommit it to
+			// reduce memory usage.
+			//
+			// "MADV_REMOVE [...] Free up a given range of pages and its
+			// associated backing store. This is equivalent to punching a hole
+			// in the corresponding byte range of the backing store (see
+			// fallocate(2))." - madvise(2)
+			if err := syscall.Madvise(pg, syscall.MADV_REMOVE); err != nil {
+				// This doesn't impact the correctness of saved memory, it
+				// just means that we're incrementally more likely to OOM.
+				// Complain, but don't abort saving.
+				log.Warningf("Decommitting page %p while saving failed: %v", pg, err)
+			}
+		}
+		return nil
+	})
+	if err != nil {
+		return err
+	}
+
+	// Save metadata: the file size first, then the usage set.
+	if err := state.Save(w, &f.fileSize, nil); err != nil {
+		return err
+	}
+	if err := state.Save(w, &f.usage, nil); err != nil {
+		return err
+	}
+
+	// Dump out committed pages, one header-prefixed record per segment.
+	for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+		if !seg.Value().knownCommitted {
+			continue
+		}
+		// Write a header to distinguish from objects.
+		if err := state.WriteHeader(w, uint64(seg.Range().Length()), false); err != nil {
+			return err
+		}
+		// Write out data; after the first write error, later slices are skipped.
+		var ioErr error
+		err := f.forEachMappingSlice(seg.Range(), func(s []byte) {
+			if ioErr != nil {
+				return
+			}
+			_, ioErr = w.Write(s)
+		})
+		if ioErr != nil {
+			return ioErr
+		}
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// LoadFrom restores f's file size, usage set and committed pages from r.
+func (f *MemoryFile) LoadFrom(r io.Reader) error {
+	// Load metadata: file size first, then (after resizing the file) the usage set.
+	if err := state.Load(r, &f.fileSize, nil); err != nil {
+		return err
+	}
+	if err := f.file.Truncate(f.fileSize); err != nil {
+		return err
+	}
+	newMappings := make([]uintptr, f.fileSize>>chunkShift)
+	f.mappings.Store(newMappings)
+	if err := state.Load(r, &f.usage, nil); err != nil {
+		return err
+	}
+
+	// Try to map committed chunks concurrently: For any given chunk, either
+	// this loop or the following one will mmap the chunk first and cache it in
+	// f.mappings for the other, but this loop is likely to run ahead of the
+	// other since it doesn't do any work between mmaps. The rest of this
+	// function doesn't mutate f.usage, so it's safe to iterate concurrently.
+	mapperDone := make(chan struct{})
+	mapperCanceled := int32(0)
+	go func() { // S/R-SAFE: see comment
+		defer func() { close(mapperDone) }()
+		for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+			if atomic.LoadInt32(&mapperCanceled) != 0 {
+				return
+			}
+			if seg.Value().knownCommitted {
+				f.forEachMappingSlice(seg.Range(), func(s []byte) {})
+			}
+		}
+	}()
+	defer func() {
+		atomic.StoreInt32(&mapperCanceled, 1)
+		<-mapperDone
+	}()
+
+	// Load committed pages, one header-prefixed record per committed segment.
+	for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+		if !seg.Value().knownCommitted {
+			continue
+		}
+		// Verify header: it must be a non-object record of the segment's length.
+		length, object, err := state.ReadHeader(r)
+		if err != nil {
+			return err
+		}
+		if object {
+			// Not expected.
+			return fmt.Errorf("unexpected object")
+		}
+		if expected := uint64(seg.Range().Length()); length != expected {
+			// Size mismatch.
+			return fmt.Errorf("mismatched segment: expected %d, got %d", expected, length)
+		}
+		// Read data; after the first read error, later slices are skipped.
+		var ioErr error
+		err = f.forEachMappingSlice(seg.Range(), func(s []byte) {
+			if ioErr != nil {
+				return
+			}
+			_, ioErr = io.ReadFull(r, s)
+		})
+		if ioErr != nil {
+			return ioErr
+		}
+		if err != nil {
+			return err
+		}
+
+		// Update accounting for restored pages. We need to do this here since
+		// these segments are marked as "known committed", and will be skipped
+		// over on accounting scans.
+		usage.MemoryAccounting.Inc(seg.End()-seg.Start(), seg.Value().kind)
+	}
+
+	return nil
+}
+
+// MemoryFileProvider provides the MemoryFile method.
+//
+// This type exists to work around a save/restore defect. The only object in a
+// saved object graph that S/R allows to be replaced at time of restore is the
+// starting point of the restore, kernel.Kernel. However, the MemoryFile
+// changes between save and restore as well, so objects that need persistent
+// access to the MemoryFile must instead store a pointer to the Kernel and call
+// Kernel.MemoryFile() as required. In most cases, depending on the kernel
+// package directly would create a package dependency loop, so the stored
+// pointer must instead be a MemoryFileProvider interface object. Correspondingly,
+// kernel.Kernel is the only implementation of this interface.
+type MemoryFileProvider interface {
+	// MemoryFile returns the Kernel MemoryFile.
+	MemoryFile() *MemoryFile
+}
diff --git a/pkg/sentry/pgalloc/usage_set.go b/pkg/sentry/pgalloc/usage_set.go
new file mode 100755
index 000000000..8ef4952eb
--- /dev/null
+++ b/pkg/sentry/pgalloc/usage_set.go
@@ -0,0 +1,1274 @@
+package pgalloc
+
+import (
+	__generics_imported0 "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+)
+
+import (
+	"bytes"
+	"fmt"
+)
+
+const (
+	// usageminDegree is the minimum degree of an internal node in a Set B-tree.
+	//
+	// - Any non-root node has at least usageminDegree-1 segments.
+	//
+	// - Any non-root internal (non-leaf) node has at least usageminDegree children.
+	//
+	// - The root node may have fewer than usageminDegree-1 segments, but it may
+	// only have 0 segments if the tree is empty.
+	//
+	// Our implementation requires usageminDegree >= 3. Higher values
+	// usually improve performance, but increase memory usage for small sets.
+	usageminDegree = 10
+	// usagemaxDegree bounds node size: at most usagemaxDegree children and usagemaxDegree-1 segments.
+	usagemaxDegree = 2 * usageminDegree
+)
+
+// A usageSet is a mapping of segments with non-overlapping Range keys. The
+// zero value for a usageSet is an empty set. usageSet values are not safely
+// movable nor copyable. usageSet is thread-compatible.
+//
+// +stateify savable
+type usageSet struct {
+	root usagenode `state:".(*usageSegmentDataSlices)"` // root node, stored inline; saved via usageSegmentDataSlices per the tag
+}
+
+// IsEmpty reports whether the set holds no segments at all.
+func (s *usageSet) IsEmpty() bool {
+	root := &s.root
+	return root.nrSegments == 0
+}
+
+// IsEmptyRange reports whether no segment in the set overlaps r. The result
+// matches s.SpanRange(r) == 0 but may be cheaper to compute. A zero-length r
+// is always considered empty.
+func (s *usageSet) IsEmptyRange(r __generics_imported0.FileRange) bool {
+	length := r.Length()
+	if length < 0 {
+		panic(fmt.Sprintf("invalid range %v", r))
+	}
+	if length == 0 {
+		return true
+	}
+	// r is empty only if r.Start falls in a gap that reaches at least r.End.
+	if _, gap := s.Find(r.Start); gap.Ok() {
+		return r.End <= gap.End()
+	}
+	return false
+}
+
+// Span sums the lengths of every segment in the set.
+func (s *usageSet) Span() uint64 {
+	var total uint64
+	seg := s.FirstSegment()
+	for seg.Ok() {
+		total += seg.Range().Length()
+		seg = seg.NextSegment()
+	}
+	return total
+}
+
+// SpanRange sums, over every segment in the set, the length of that segment's
+// intersection with r.
+func (s *usageSet) SpanRange(r __generics_imported0.FileRange) uint64 {
+	length := r.Length()
+	if length < 0 {
+		panic(fmt.Sprintf("invalid range %v", r))
+	}
+	if length == 0 {
+		return 0
+	}
+	var total uint64
+	seg := s.LowerBoundSegment(r.Start)
+	for seg.Ok() && seg.Start() < r.End {
+		total += seg.Range().Intersect(r).Length()
+		seg = seg.NextSegment()
+	}
+	return total
+}
+
+// FirstSegment returns an iterator to the first segment in the set, or a
+// terminal iterator when the set is empty.
+func (s *usageSet) FirstSegment() usageIterator {
+	if root := &s.root; root.nrSegments != 0 {
+		return root.firstSegment()
+	}
+	return usageIterator{}
+}
+
+// LastSegment returns an iterator to the last segment in the set, or a
+// terminal iterator when the set is empty.
+func (s *usageSet) LastSegment() usageIterator {
+	if root := &s.root; root.nrSegments != 0 {
+		return root.lastSegment()
+	}
+	return usageIterator{}
+}
+
+// FirstGap returns the first gap in the set: index 0 of the node reached by
+// following children[0] from the root until a node without children.
+func (s *usageSet) FirstGap() usageGapIterator {
+	node := &s.root
+	for node.hasChildren {
+		node = node.children[0]
+	}
+	return usageGapIterator{node: node, index: 0}
+}
+
+// LastGap returns the last gap in the set: index nrSegments of the node
+// reached by following the last child from the root until a node without
+// children.
+func (s *usageSet) LastGap() usageGapIterator {
+	node := &s.root
+	for node.hasChildren {
+		node = node.children[node.nrSegments]
+	}
+	return usageGapIterator{node: node, index: node.nrSegments}
+}
+
+// Find locates the segment or gap whose range contains key. Exactly one of
+// the two results is non-terminal: the Iterator if a segment contains key,
+// otherwise the GapIterator.
+func (s *usageSet) Find(key uint64) (usageIterator, usageGapIterator) {
+	node := &s.root
+	for {
+		// Binary search node's keys for a range containing key.
+		lo, hi := 0, node.nrSegments
+		for lo < hi {
+			mid := lo + (hi-lo)/2
+			r := node.keys[mid]
+			switch {
+			case key >= r.End:
+				lo = mid + 1
+			case key >= r.Start:
+				return usageIterator{node: node, index: mid}, usageGapIterator{}
+			default:
+				hi = mid
+			}
+		}
+		// No segment in this node contains key; lo is the gap/child index
+		// at which key would fall.
+		if !node.hasChildren {
+			return usageIterator{}, usageGapIterator{node: node, index: lo}
+		}
+		node = node.children[lo]
+	}
+}
+
+// FindSegment returns an iterator to the segment containing key, or a
+// terminal iterator if no segment contains it.
+func (s *usageSet) FindSegment(key uint64) usageIterator {
+	found, _ := s.Find(key)
+	return found
+}
+
+// LowerBoundSegment returns the segment with the lowest range that contains
+// a key greater than or equal to min, or a terminal iterator if there is no
+// such segment.
+func (s *usageSet) LowerBoundSegment(min uint64) usageIterator {
+	seg, gap := s.Find(min)
+	if !seg.Ok() {
+		return gap.NextSegment()
+	}
+	return seg
+}
+
+// UpperBoundSegment returns the segment with the highest range that contains
+// a key less than or equal to max, or a terminal iterator if there is no such
+// segment.
+func (s *usageSet) UpperBoundSegment(max uint64) usageIterator {
+	seg, gap := s.Find(max)
+	if !seg.Ok() {
+		return gap.PrevSegment()
+	}
+	return seg
+}
+
+// FindGap returns the gap containing key. If key lies inside a segment, so
+// that no gap contains it, FindGap returns a terminal iterator.
+func (s *usageSet) FindGap(key uint64) usageGapIterator {
+	_, found := s.Find(key)
+	return found
+}
+
+// LowerBoundGap returns the gap with the lowest range that is greater than or
+// equal to min.
+func (s *usageSet) LowerBoundGap(min uint64) usageGapIterator {
+	seg, gap := s.Find(min)
+	if !gap.Ok() {
+		return seg.NextGap()
+	}
+	return gap
+}
+
+// UpperBoundGap returns the gap with the highest range that is less than or
+// equal to max.
+func (s *usageSet) UpperBoundGap(max uint64) usageGapIterator {
+	seg, gap := s.Find(max)
+	if !gap.Ok() {
+		return seg.PrevGap()
+	}
+	return gap
+}
+
+// Add inserts the segment (r, val) into the set, merging it with adjacent
+// segments where possible, and returns true. If r would overlap an existing
+// segment, Add leaves the set untouched and returns false. A successful Add
+// invalidates all existing iterators. Add panics if r.Length() <= 0.
+func (s *usageSet) Add(r __generics_imported0.FileRange, val usageInfo) bool {
+	if r.Length() <= 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	// r fits only if r.Start is in a gap that extends at least to r.End.
+	gap := s.FindGap(r.Start)
+	if !gap.Ok() || r.End > gap.End() {
+		return false
+	}
+	s.Insert(gap, r, val)
+	return true
+}
+
+// AddWithoutMerging inserts the segment (r, val) into the set without
+// attempting any merge and returns true. If r would overlap an existing
+// segment, it does nothing and returns false. A successful call invalidates
+// all existing iterators. AddWithoutMerging panics if r.Length() <= 0.
+func (s *usageSet) AddWithoutMerging(r __generics_imported0.FileRange, val usageInfo) bool {
+	if r.Length() <= 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	// r fits only if r.Start is in a gap that extends at least to r.End.
+	gap := s.FindGap(r.Start)
+	if !gap.Ok() || r.End > gap.End() {
+		return false
+	}
+	s.InsertWithoutMergingUnchecked(gap, r, val)
+	return true
+}
+
+// Insert inserts the given segment into the given gap. If the new segment can
+// be merged with adjacent segments, Insert will do so. Insert returns an
+// iterator to the segment containing the inserted value (which may have been
+// merged with other values). All existing iterators (including gap, but not
+// including the returned iterator) are invalidated.
+//
+// If the gap cannot accommodate the segment, or if r is invalid, Insert panics.
+//
+// Insert is semantically equivalent to an InsertWithoutMerging followed by a
+// Merge, but may be more efficient. Note that there is no unchecked variant of
+// Insert since Insert must retrieve and inspect gap's predecessor and
+// successor segments regardless.
+func (s *usageSet) Insert(gap usageGapIterator, r __generics_imported0.FileRange, val usageInfo) usageIterator {
+	if r.Length() <= 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	prev, next := gap.PrevSegment(), gap.NextSegment()
+	if prev.Ok() && prev.End() > r.Start {
+		panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range()))
+	}
+	if next.Ok() && next.Start() < r.End {
+		panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range()))
+	}
+	if prev.Ok() && prev.End() == r.Start { // r abuts prev: try to fold r into prev.
+		if mval, ok := (usageSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok {
+			prev.SetEndUnchecked(r.End)
+			prev.SetValue(mval)
+			if next.Ok() && next.Start() == r.End { // The grown prev now abuts next: try to fold next in too.
+				val = mval
+				if mval, ok := (usageSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok {
+					prev.SetEndUnchecked(next.End())
+					prev.SetValue(mval)
+					return s.Remove(next).PrevSegment()
+				}
+			}
+			return prev
+		}
+	}
+	if next.Ok() && next.Start() == r.End { // r abuts next only (or the prev merge was refused): try to fold r into next.
+		if mval, ok := (usageSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok {
+			next.SetStartUnchecked(r.Start)
+			next.SetValue(mval)
+			return next
+		}
+	}
+	return s.InsertWithoutMergingUnchecked(gap, r, val) // No merge happened: insert r as its own segment.
+}
+
+// InsertWithoutMerging places the segment (r, val) into gap and returns an
+// iterator to the new segment. All existing iterators (including gap, but
+// not the returned iterator) are invalidated.
+//
+// InsertWithoutMerging panics if r.Length() <= 0 or if gap's range is not a
+// superset of r.
+func (s *usageSet) InsertWithoutMerging(gap usageGapIterator, r __generics_imported0.FileRange, val usageInfo) usageIterator {
+	if r.Length() <= 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	gr := gap.Range()
+	if !gr.IsSupersetOf(r) {
+		panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr))
+	}
+	return s.InsertWithoutMergingUnchecked(gap, r, val)
+}
+
+// InsertWithoutMergingUnchecked places the segment (r, val) into gap without
+// validating r against the gap, and returns an iterator to the new segment.
+// All existing iterators (including gap, but not the returned iterator) are
+// invalidated.
+//
+// Preconditions: r.Start >= gap.Start(); r.End <= gap.End().
+func (s *usageSet) InsertWithoutMergingUnchecked(gap usageGapIterator, r __generics_imported0.FileRange, val usageInfo) usageIterator {
+	// Make room first; rebalancing may move the gap to a different node/index.
+	gap = gap.node.rebalanceBeforeInsert(gap)
+	n, i := gap.node, gap.index
+	// Shift the keys and values at and after i up by one slot.
+	copy(n.keys[i+1:], n.keys[i:n.nrSegments])
+	copy(n.values[i+1:], n.values[i:n.nrSegments])
+	n.keys[i], n.values[i] = r, val
+	n.nrSegments++
+	return usageIterator{node: n, index: i}
+}
+
+// Remove deletes the segment at seg and returns an iterator to the gap left
+// behind. All existing iterators (including seg, but not the returned
+// iterator) are invalidated.
+func (s *usageSet) Remove(seg usageIterator) usageGapIterator {
+	if seg.node.hasChildren {
+		// seg lives in a node with children: overwrite it with its
+		// predecessor segment, then remove that predecessor instead.
+		victim := seg.PrevSegment()
+		seg.SetRangeUnchecked(victim.Range())
+		seg.SetValue(victim.Value())
+		return s.Remove(victim).NextGap()
+	}
+	n, i := seg.node, seg.index
+	// Close the hole at i by shifting later keys and values down one slot,
+	// then clear the now-unused last value slot.
+	copy(n.keys[i:], n.keys[i+1:n.nrSegments])
+	copy(n.values[i:], n.values[i+1:n.nrSegments])
+	usageSetFunctions{}.ClearValue(&n.values[n.nrSegments-1])
+	n.nrSegments--
+	return n.rebalanceAfterRemove(usageGapIterator{node: n, index: i})
+}
+
+// RemoveAll removes all segments from the set. All existing iterators are
+// invalidated.
+func (s *usageSet) RemoveAll() {
+	s.root = usagenode{}
+}
+
+// RemoveRange removes every part of every segment that lies within r,
+// splitting segments that straddle r's bounds. It returns an iterator to the
+// resulting gap; all existing iterators are invalidated.
+func (s *usageSet) RemoveRange(r __generics_imported0.FileRange) usageGapIterator {
+	seg, gap := s.Find(r.Start)
+	if seg.Ok() {
+		gap = s.Remove(s.Isolate(seg, r))
+	}
+	// Keep removing the segment after the gap while it starts before r.End.
+	for {
+		seg = gap.NextSegment()
+		if !seg.Ok() || seg.Start() >= r.End {
+			break
+		}
+		gap = s.Remove(s.Isolate(seg, r))
+	}
+	return gap
+}
+
+// Merge tries to merge two neighboring segments. On success it returns an
+// iterator to the merged segment and invalidates all existing iterators; on
+// failure it returns a terminal iterator.
+//
+// Merge panics if first is not the immediate predecessor of second.
+func (s *usageSet) Merge(first, second usageIterator) usageIterator {
+	if first.NextSegment() == second {
+		return s.MergeUnchecked(first, second)
+	}
+	panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range()))
+}
+
+// MergeUnchecked tries to merge two neighboring segments without verifying
+// that they are neighbors. On success it returns an iterator to the merged
+// segment and invalidates all existing iterators; on failure it returns a
+// terminal iterator.
+//
+// Precondition: first is the predecessor of second: first.NextSegment() ==
+// second, first == second.PrevSegment().
+func (s *usageSet) MergeUnchecked(first, second usageIterator) usageIterator {
+	// Only segments that touch can merge.
+	if first.End() != second.Start() {
+		return usageIterator{}
+	}
+	mval, ok := (usageSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value())
+	if !ok {
+		return usageIterator{}
+	}
+	// Grow first over second's range, then drop second. Remove returns the
+	// vacated gap, whose previous segment is the merged one.
+	first.SetEndUnchecked(second.End())
+	first.SetValue(mval)
+	return s.Remove(second).PrevSegment()
+}
+
+// MergeAll walks the set from the first segment and tries to merge each pair
+// of adjacent segments. All existing iterators are invalidated.
+func (s *usageSet) MergeAll() {
+	seg := s.FirstSegment()
+	if !seg.Ok() {
+		return
+	}
+	for next := seg.NextSegment(); next.Ok(); next = seg.NextSegment() {
+		merged := s.MergeUnchecked(seg, next)
+		if merged.Ok() {
+			// Continue from the merged segment so it can absorb more.
+			seg = merged
+			continue
+		}
+		seg = next
+	}
+}
+
+// MergeRange tries to merge all adjacent segments that contain a key in r.
+// All existing iterators are invalidated.
+func (s *usageSet) MergeRange(r __generics_imported0.FileRange) {
+	seg := s.LowerBoundSegment(r.Start)
+	if !seg.Ok() {
+		return
+	}
+	for next := seg.NextSegment(); next.Ok() && next.Range().Start < r.End; next = seg.NextSegment() {
+		merged := s.MergeUnchecked(seg, next)
+		if merged.Ok() {
+			// Continue from the merged segment so it can absorb more.
+			seg = merged
+			continue
+		}
+		seg = next
+	}
+}
+
+// MergeAdjacent tries to merge the segment containing r.Start with its
+// predecessor, and then the segment containing r.End-1 with its successor.
+func (s *usageSet) MergeAdjacent(r __generics_imported0.FileRange) {
+	if first := s.FindSegment(r.Start); first.Ok() {
+		if prev := first.PrevSegment(); prev.Ok() {
+			s.Merge(prev, first)
+		}
+	}
+	// Look up the trailing segment only after the leading merge, since a
+	// merge invalidates iterators.
+	if last := s.FindSegment(r.End - 1); last.Ok() {
+		if next := last.NextSegment(); next.Ok() {
+			s.Merge(last, next)
+		}
+	}
+}
+
+// Split divides seg at the key split and returns iterators to the two
+// resulting segments. All existing iterators (including seg, but not the
+// returned iterators) are invalidated.
+//
+// Split panics if seg's range cannot be split at split, i.e. when splitting
+// would leave a zero-length segment or split lies outside seg's range.
+func (s *usageSet) Split(seg usageIterator, split uint64) (usageIterator, usageIterator) {
+	if seg.Range().CanSplitAt(split) {
+		return s.SplitUnchecked(seg, split)
+	}
+	panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split))
+}
+
+// SplitUnchecked divides seg at the key split without validating split, and
+// returns iterators to the two resulting segments. All existing iterators
+// (including seg, but not the returned iterators) are invalidated.
+//
+// Preconditions: seg.Start() < split < seg.End().
+func (s *usageSet) SplitUnchecked(seg usageIterator, split uint64) (usageIterator, usageIterator) {
+	lowVal, highVal := (usageSetFunctions{}).Split(seg.Range(), seg.Value(), split)
+	origEnd := seg.End()
+	// Shrink seg to the lower half, then insert the upper half right after it.
+	seg.SetEndUnchecked(split)
+	seg.SetValue(lowVal)
+	upper := __generics_imported0.FileRange{Start: split, End: origEnd}
+	high := s.InsertWithoutMergingUnchecked(seg.NextGap(), upper, highVal)
+	// The insertion invalidated seg, so re-derive the lower half from high.
+	return high.PrevSegment(), high
+}
+
+// SplitAt splits the segment straddling split, if there is one, and reports
+// whether a split happened. When it returns true, all existing iterators are
+// invalidated.
+func (s *usageSet) SplitAt(split uint64) bool {
+	seg := s.FindSegment(split)
+	if !seg.Ok() || !seg.Range().CanSplitAt(split) {
+		return false
+	}
+	s.SplitUnchecked(seg, split)
+	return true
+}
+
+// Isolate trims seg so that its range does not extend outside r, splitting
+// at r.Start and/or r.End where needed, and returns an iterator to the
+// bounded segment. All existing iterators (including seg, but not the
+// returned iterator) are invalidated.
+func (s *usageSet) Isolate(seg usageIterator, r __generics_imported0.FileRange) usageIterator {
+	bounded := seg
+	// Cut off any part before r.Start, keeping the upper piece.
+	if bounded.Range().CanSplitAt(r.Start) {
+		_, bounded = s.SplitUnchecked(bounded, r.Start)
+	}
+	// Cut off any part at or after r.End, keeping the lower piece.
+	if bounded.Range().CanSplitAt(r.End) {
+		bounded, _ = s.SplitUnchecked(bounded, r.End)
+	}
+	return bounded
+}
+
+// ApplyContiguous calls fn on each segment of a contiguous run starting at
+// r.Start, isolating each segment to r first. It stops at the first
+// non-empty gap and returns that gap. If fn was applied across all of r, or
+// the run ends with no further segment, a terminal gap is returned. All
+// existing iterators are invalidated.
+//
+// N.B. The Iterator must not be invalidated by the function.
+func (s *usageSet) ApplyContiguous(r __generics_imported0.FileRange, fn func(seg usageIterator)) usageGapIterator {
+	seg, gap := s.Find(r.Start)
+	if !seg.Ok() {
+		// r.Start itself lies in a gap.
+		return gap
+	}
+	for seg.Ok() {
+		seg = s.Isolate(seg, r)
+		fn(seg)
+		if seg.End() >= r.End {
+			break
+		}
+		gap = seg.NextGap()
+		if !gap.IsEmpty() {
+			return gap
+		}
+		seg = gap.NextSegment()
+	}
+	return usageGapIterator{}
+}
+
+// +stateify savable
+type usagenode struct { // usagenode is one node of the usageSet B-tree; usageSet embeds the root by value.
+	// An internal binary tree node looks like:
+	//
+	//   K
+	//  / \
+	// Cl Cr
+	//
+	// where all keys in the subtree rooted by Cl (the left subtree) are less
+	// than K (the key of the parent node), and all keys in the subtree rooted
+	// by Cr (the right subtree) are greater than K.
+	//
+	// An internal B-tree node's indexes work out to look like:
+	//
+	//   K0 K1 K2  ...   Kn-1
+	//  / \/ \/ \  ...  /  \
+	// C0 C1 C2 C3 ... Cn-1 Cn
+	//
+	// where n is nrSegments.
+	nrSegments int
+
+	// parent is a pointer to this node's parent. If this node is root, parent
+	// is nil.
+	parent *usagenode
+
+	// parentIndex is the index of this node in parent.children.
+	parentIndex int
+
+	// Flag for internal nodes that is technically redundant with "children[0]
+	// != nil", but is stored in the first cache line. "hasChildren" rather
+	// than "isLeaf" because false must be the correct value for an empty root.
+	hasChildren bool
+
+	// Nodes store keys and values in separate arrays to maximize locality in
+	// the common case (scanning keys for lookup).
+	keys     [usagemaxDegree - 1]__generics_imported0.FileRange
+	values   [usagemaxDegree - 1]usageInfo
+	children [usagemaxDegree]*usagenode
+}
+
+// firstSegment returns an iterator to the first segment in the subtree
+// rooted at n, found by descending through children[0].
+//
+// Preconditions: n.nrSegments != 0.
+func (n *usagenode) firstSegment() usageIterator {
+	node := n
+	for node.hasChildren {
+		node = node.children[0]
+	}
+	return usageIterator{node: node, index: 0}
+}
+
+// lastSegment returns an iterator to the last segment in the subtree rooted
+// at n, found by descending through each node's last child.
+//
+// Preconditions: n.nrSegments != 0.
+func (n *usagenode) lastSegment() usageIterator {
+	node := n
+	for node.hasChildren {
+		node = node.children[node.nrSegments]
+	}
+	return usageIterator{node: node, index: node.nrSegments - 1}
+}
+
+// prevSibling returns the child immediately before n in n's parent, or nil
+// if n has no parent or is the parent's first child.
+func (n *usagenode) prevSibling() *usagenode {
+	if p := n.parent; p != nil && n.parentIndex != 0 {
+		return p.children[n.parentIndex-1]
+	}
+	return nil
+}
+
+// nextSibling returns the child immediately after n in n's parent, or nil if
+// n has no parent or is the parent's last child.
+func (n *usagenode) nextSibling() *usagenode {
+	if p := n.parent; p != nil && n.parentIndex != p.nrSegments {
+		return p.children[n.parentIndex+1]
+	}
+	return nil
+}
+
+// rebalanceBeforeInsert splits n and its ancestors (ancestors first) if they
+// are full, i.e. hold usagemaxDegree-1 segments, as required for insertion,
+// and returns an updated iterator to the position represented by gap.
+func (n *usagenode) rebalanceBeforeInsert(gap usageGapIterator) usageGapIterator {
+	if n.parent != nil {
+		gap = n.parent.rebalanceBeforeInsert(gap)
+	}
+	if n.nrSegments < usagemaxDegree-1 {
+		return gap
+	}
+	if n.parent == nil {
+		// n is a full root: split it in place into new children left and right, keeping only the median in n.
+		left := &usagenode{
+			nrSegments:  usageminDegree - 1,
+			parent:      n,
+			parentIndex: 0,
+			hasChildren: n.hasChildren,
+		}
+		right := &usagenode{
+			nrSegments:  usageminDegree - 1,
+			parent:      n,
+			parentIndex: 1,
+			hasChildren: n.hasChildren,
+		}
+		copy(left.keys[:usageminDegree-1], n.keys[:usageminDegree-1])
+		copy(left.values[:usageminDegree-1], n.values[:usageminDegree-1])
+		copy(right.keys[:usageminDegree-1], n.keys[usageminDegree:])
+		copy(right.values[:usageminDegree-1], n.values[usageminDegree:])
+		n.keys[0], n.values[0] = n.keys[usageminDegree-1], n.values[usageminDegree-1]
+		usagezeroValueSlice(n.values[1:])
+		if n.hasChildren {
+			copy(left.children[:usageminDegree], n.children[:usageminDegree])
+			copy(right.children[:usageminDegree], n.children[usageminDegree:])
+			usagezeroNodeSlice(n.children[2:])
+			for i := 0; i < usageminDegree; i++ {
+				left.children[i].parent = left
+				left.children[i].parentIndex = i
+				right.children[i].parent = right
+				right.children[i].parentIndex = i
+			}
+		}
+		n.nrSegments = 1
+		n.hasChildren = true
+		n.children[0] = left
+		n.children[1] = right
+		if gap.node != n {
+			return gap
+		}
+		if gap.index < usageminDegree {
+			return usageGapIterator{left, gap.index}
+		}
+		return usageGapIterator{right, gap.index - usageminDegree}
+	}
+	// n is a full non-root node: push its median into the parent and move its upper half into a new sibling.
+	copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments])
+	copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments])
+	n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[usageminDegree-1], n.values[usageminDegree-1]
+	copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1])
+	for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ {
+		n.parent.children[i].parentIndex = i
+	}
+	sibling := &usagenode{
+		nrSegments:  usageminDegree - 1,
+		parent:      n.parent,
+		parentIndex: n.parentIndex + 1,
+		hasChildren: n.hasChildren,
+	}
+	n.parent.children[n.parentIndex+1] = sibling
+	n.parent.nrSegments++
+	copy(sibling.keys[:usageminDegree-1], n.keys[usageminDegree:])
+	copy(sibling.values[:usageminDegree-1], n.values[usageminDegree:])
+	usagezeroValueSlice(n.values[usageminDegree-1:])
+	if n.hasChildren {
+		copy(sibling.children[:usageminDegree], n.children[usageminDegree:])
+		usagezeroNodeSlice(n.children[usageminDegree:])
+		for i := 0; i < usageminDegree; i++ {
+			sibling.children[i].parent = sibling
+			sibling.children[i].parentIndex = i
+		}
+	}
+	n.nrSegments = usageminDegree - 1
+	// Translate gap if it pointed into n: indices >= usageminDegree now belong to sibling.
+	if gap.node != n {
+		return gap
+	}
+	if gap.index < usageminDegree {
+		return gap
+	}
+	return usageGapIterator{sibling, gap.index - usageminDegree}
+}
+
+// rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient
+// (contain fewer than usageminDegree-1 segments), as required after a removal,
+// and returns an updated iterator to the position represented by gap.
+//
+// Precondition: n is the only node in the tree that may currently violate a
+// B-tree invariant.
+func (n *usagenode) rebalanceAfterRemove(gap usageGapIterator) usageGapIterator {
+	for {
+		if n.nrSegments >= usageminDegree-1 { // n is not deficient; nothing left to fix
+			return gap
+		}
+		if n.parent == nil {
+			// n has no parent, so there is no sibling to borrow from or merge with.
+			return gap
+		}
+
+		if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= usageminDegree { // borrow from the previous sibling, rotating through the parent
+			copy(n.keys[1:], n.keys[:n.nrSegments])
+			copy(n.values[1:], n.values[:n.nrSegments])
+			n.keys[0] = n.parent.keys[n.parentIndex-1]
+			n.values[0] = n.parent.values[n.parentIndex-1]
+			n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1]
+			n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1]
+			usageSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
+			if n.hasChildren {
+				copy(n.children[1:], n.children[:n.nrSegments+1])
+				n.children[0] = sibling.children[sibling.nrSegments]
+				sibling.children[sibling.nrSegments] = nil
+				n.children[0].parent = n
+				n.children[0].parentIndex = 0
+				for i := 1; i < n.nrSegments+2; i++ {
+					n.children[i].parentIndex = i
+				}
+			}
+			n.nrSegments++
+			sibling.nrSegments--
+			if gap.node == sibling && gap.index == sibling.nrSegments { // NOTE(review): nrSegments was already decremented above; confirm this is the intended index
+				return usageGapIterator{n, 0}
+			}
+			if gap.node == n {
+				return usageGapIterator{n, gap.index + 1}
+			}
+			return gap
+		}
+		if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= usageminDegree { // borrow from the next sibling, rotating through the parent
+			n.keys[n.nrSegments] = n.parent.keys[n.parentIndex]
+			n.values[n.nrSegments] = n.parent.values[n.parentIndex]
+			n.parent.keys[n.parentIndex] = sibling.keys[0]
+			n.parent.values[n.parentIndex] = sibling.values[0]
+			copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:])
+			copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:])
+			usageSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
+			if n.hasChildren {
+				n.children[n.nrSegments+1] = sibling.children[0]
+				copy(sibling.children[:sibling.nrSegments], sibling.children[1:])
+				sibling.children[sibling.nrSegments] = nil
+				n.children[n.nrSegments+1].parent = n
+				n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1
+				for i := 0; i < sibling.nrSegments; i++ {
+					sibling.children[i].parentIndex = i
+				}
+			}
+			n.nrSegments++
+			sibling.nrSegments--
+			if gap.node == sibling {
+				if gap.index == 0 {
+					return usageGapIterator{n, n.nrSegments}
+				}
+				return usageGapIterator{sibling, gap.index - 1}
+			}
+			return gap
+		}
+
+		p := n.parent
+		if p.nrSegments == 1 {
+			// Fold both children of p, and the single segment separating them, into p itself.
+			left, right := p.children[0], p.children[1]
+			p.nrSegments = left.nrSegments + right.nrSegments + 1
+			p.hasChildren = left.hasChildren
+			p.keys[left.nrSegments] = p.keys[0]
+			p.values[left.nrSegments] = p.values[0]
+			copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments])
+			copy(p.values[:left.nrSegments], left.values[:left.nrSegments])
+			copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
+			copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments])
+			if left.hasChildren {
+				copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1])
+				copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
+				for i := 0; i < p.nrSegments+1; i++ {
+					p.children[i].parent = p
+					p.children[i].parentIndex = i
+				}
+			} else {
+				p.children[0] = nil
+				p.children[1] = nil
+			}
+			if gap.node == left {
+				return usageGapIterator{p, gap.index}
+			}
+			if gap.node == right {
+				return usageGapIterator{p, gap.index + left.nrSegments + 1}
+			}
+			return gap
+		}
+		// Merge n and either sibling, along with the segment separating the
+		// two, into whichever of the two nodes comes first. This is the
+		// reverse of the non-root splitting case in
+		// node.rebalanceBeforeInsert.
+		var left, right *usagenode
+		if n.parentIndex > 0 {
+			left = n.prevSibling()
+			right = n
+		} else {
+			left = n
+			right = n.nextSibling()
+		}
+		// Relocate gap first: left.nrSegments must still hold its pre-merge value here.
+		if gap.node == right {
+			gap = usageGapIterator{left, gap.index + left.nrSegments + 1}
+		}
+		left.keys[left.nrSegments] = p.keys[left.parentIndex]
+		left.values[left.nrSegments] = p.values[left.parentIndex]
+		copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
+		copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments])
+		if left.hasChildren {
+			copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
+			for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ {
+				left.children[i].parent = left
+				left.children[i].parentIndex = i
+			}
+		}
+		left.nrSegments += right.nrSegments + 1
+		copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments])
+		copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments])
+		usageSetFunctions{}.ClearValue(&p.values[p.nrSegments-1])
+		copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1])
+		for i := 0; i < p.nrSegments; i++ {
+			p.children[i].parentIndex = i
+		}
+		p.children[p.nrSegments] = nil
+		p.nrSegments--
+
+		n = p // p just lost a segment and may now be deficient; re-check it
+	}
+}
+
+// An Iterator is conceptually one of:
+//
+// - A pointer to a segment in a set; or
+//
+// - A terminal iterator, which is a sentinel indicating that the end of
+// iteration has been reached.
+//
+// Iterators are copyable values and are meaningfully equality-comparable. The
+// zero value of Iterator is a terminal iterator.
+//
+// Unless otherwise specified, any mutation of a set invalidates all existing
+// iterators into the set.
+type usageIterator struct {
+	// node is the node containing the iterated segment. If the iterator is
+	// terminal, node is nil.
+	node *usagenode
+
+	// index is the index of the segment in node.keys/values.
+	index int
+}
+
+// Ok reports whether seg points at a segment, i.e. whether it is not a
+// terminal iterator. Every other method requires a non-terminal iterator.
+func (seg usageIterator) Ok() bool {
+	terminal := seg.node == nil
+	return !terminal
+}
+
+// Range returns the range key of the segment that seg points at.
+func (seg usageIterator) Range() __generics_imported0.FileRange {
+	n, i := seg.node, seg.index
+	return n.keys[i]
+}
+
+// Start returns the same value as Range().Start. Prefer it when only the start
+// of the range is needed.
+func (seg usageIterator) Start() uint64 {
+	n, i := seg.node, seg.index
+	return n.keys[i].Start
+}
+
+// End returns the same value as Range().End. Prefer it when only the end of
+// the range is needed.
+func (seg usageIterator) End() uint64 {
+	n, i := seg.node, seg.index
+	return n.keys[i].End
+}
+
+// SetRangeUnchecked overwrites the range key of the iterated segment with r
+// without validating it. No iterators are invalidated.
+//
+// Preconditions:
+//
+// - r.Length() > 0.
+//
+// - r must not overlap a neighboring segment: if seg.NextSegment().Ok(), then
+// r.End <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then
+// r.Start >= seg.PrevSegment().End().
+func (seg usageIterator) SetRangeUnchecked(r __generics_imported0.FileRange) {
+	n := seg.node
+	n.keys[seg.index] = r
+}
+
+// SetRange overwrites the range key of the iterated segment with r. It panics
+// if r is invalid or if r would overlap the preceding or following segment.
+// No iterators are invalidated.
+func (seg usageIterator) SetRange(r __generics_imported0.FileRange) {
+	if r.Length() <= 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	prev := seg.PrevSegment()
+	if prev.Ok() && prev.End() > r.Start {
+		panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range()))
+	}
+	next := seg.NextSegment()
+	if next.Ok() && next.Start() < r.End {
+		panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range()))
+	}
+	// All checks passed; store the key directly.
+	seg.node.keys[seg.index] = r
+}
+
+// SetStartUnchecked overwrites the start of the iterated segment without
+// validating it. No iterators are invalidated.
+//
+// Preconditions: start < seg.End(); if seg.PrevSegment().Ok(), then
+// start >= seg.PrevSegment().End().
+func (seg usageIterator) SetStartUnchecked(start uint64) {
+	n := seg.node
+	n.keys[seg.index].Start = start
+}
+
+// SetStart overwrites the start of the iterated segment. It panics if the new
+// start would leave the segment with an invalid range or make it overlap the
+// preceding segment. No iterators are invalidated.
+func (seg usageIterator) SetStart(start uint64) {
+	if end := seg.End(); start >= end {
+		panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range()))
+	}
+	prev := seg.PrevSegment()
+	if prev.Ok() && prev.End() > start {
+		panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range()))
+	}
+	// All checks passed; store the new start directly.
+	seg.node.keys[seg.index].Start = start
+}
+
+// SetEndUnchecked overwrites the end of the iterated segment without
+// validating it. No iterators are invalidated.
+//
+// Preconditions: end > seg.Start(); if seg.NextSegment().Ok(), then
+// end <= seg.NextSegment().Start().
+func (seg usageIterator) SetEndUnchecked(end uint64) {
+	n := seg.node
+	n.keys[seg.index].End = end
+}
+
+// SetEnd overwrites the end of the iterated segment. It panics if the new end
+// would leave the segment with an invalid range or make it overlap the
+// following segment. No iterators are invalidated.
+func (seg usageIterator) SetEnd(end uint64) {
+	if start := seg.Start(); end <= start {
+		panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range()))
+	}
+	next := seg.NextSegment()
+	if next.Ok() && next.Start() < end {
+		panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range()))
+	}
+	// All checks passed; store the new end directly.
+	seg.node.keys[seg.index].End = end
+}
+
+// Value returns a copy of the value stored in the iterated segment.
+func (seg usageIterator) Value() usageInfo {
+	val := seg.node.values[seg.index]
+	return val
+}
+
+// ValuePtr returns a pointer to the value stored in the iterated segment. The
+// pointer stops being valid when the iterator is invalidated. Calling ValuePtr
+// does not itself invalidate any iterators.
+func (seg usageIterator) ValuePtr() *usageInfo {
+	n := seg.node
+	return &n.values[seg.index]
+}
+
+// SetValue overwrites the value stored in the iterated segment with val. No
+// iterators are invalidated.
+func (seg usageIterator) SetValue(val usageInfo) {
+	n := seg.node
+	n.values[seg.index] = val
+}
+
+// PrevSegment returns an iterator to the segment immediately preceding the
+// iterated one, or a terminal iterator if the iterated segment is the first.
+func (seg usageIterator) PrevSegment() usageIterator {
+	n := seg.node
+	switch {
+	case n.hasChildren:
+		// The predecessor is the last segment of the child subtree to the left.
+		return n.children[seg.index].lastSegment()
+	case seg.index > 0:
+		return usageIterator{node: n, index: seg.index - 1}
+	case n.parent == nil:
+		return usageIterator{}
+	default:
+		return usagesegmentBeforePosition(n.parent, n.parentIndex)
+	}
+}
+
+// NextSegment returns an iterator to the segment immediately following the
+// iterated one, or a terminal iterator if the iterated segment is the last.
+func (seg usageIterator) NextSegment() usageIterator {
+	n := seg.node
+	switch {
+	case n.hasChildren:
+		// The successor is the first segment of the child subtree to the right.
+		return n.children[seg.index+1].firstSegment()
+	case seg.index < n.nrSegments-1:
+		return usageIterator{node: n, index: seg.index + 1}
+	case n.parent == nil:
+		return usageIterator{}
+	default:
+		return usagesegmentAfterPosition(n.parent, n.parentIndex)
+	}
+}
+
+// PrevGap returns an iterator to the gap immediately before the iterated
+// segment.
+func (seg usageIterator) PrevGap() usageGapIterator {
+	if !seg.node.hasChildren {
+		return usageGapIterator{node: seg.node, index: seg.index}
+	}
+	// In a node with children, the gap before seg is the gap that follows the
+	// last segment of the child subtree to seg's left.
+	last := seg.node.children[seg.index].lastSegment()
+	return last.NextGap()
+}
+
+// NextGap returns an iterator to the gap immediately after the iterated
+// segment.
+func (seg usageIterator) NextGap() usageGapIterator {
+	if !seg.node.hasChildren {
+		return usageGapIterator{node: seg.node, index: seg.index + 1}
+	}
+	// In a node with children, the gap after seg is the gap that precedes the
+	// first segment of the child subtree to seg's right.
+	first := seg.node.children[seg.index+1].firstSegment()
+	return first.PrevGap()
+}
+
+// PrevNonEmpty returns the preceding segment if it is adjacent to the iterated
+// segment, and otherwise the (non-empty) gap before the iterated segment. If
+// seg.Start() == Functions.MinKey(), both returned iterators are terminal;
+// in every other case exactly one of them is non-terminal.
+func (seg usageIterator) PrevNonEmpty() (usageIterator, usageGapIterator) {
+	gap := seg.PrevGap()
+	if gap.IsEmpty() {
+		return gap.PrevSegment(), usageGapIterator{}
+	}
+	return usageIterator{}, gap
+}
+
+// NextNonEmpty returns the following segment if it is adjacent to the iterated
+// segment, and otherwise the (non-empty) gap after the iterated segment. If
+// seg.End() == Functions.MaxKey(), both returned iterators are terminal; in
+// every other case exactly one of them is non-terminal.
+func (seg usageIterator) NextNonEmpty() (usageIterator, usageGapIterator) {
+	gap := seg.NextGap()
+	if gap.IsEmpty() {
+		return gap.NextSegment(), usageGapIterator{}
+	}
+	return usageIterator{}, gap
+}
+
+// A GapIterator is conceptually one of:
+//
+// - A pointer to a position between two segments, before the first segment, or
+// after the last segment in a set, called a *gap*; or
+//
+// - A terminal iterator, which is a sentinel indicating that the end of
+// iteration has been reached.
+//
+// Note that the gap between two adjacent segments exists (iterators to it are
+// non-terminal), but has a length of zero. GapIterator.IsEmpty returns true
+// for such gaps. An empty set contains a single gap, spanning the entire range
+// of the set's keys.
+//
+// GapIterators are copyable values and are meaningfully equality-comparable.
+// The zero value of GapIterator is a terminal iterator.
+//
+// Unless otherwise specified, any mutation of a set invalidates all existing
+// iterators into the set.
+type usageGapIterator struct {
+	// The representation of a GapIterator is identical to that of an Iterator,
+	// except that index corresponds to positions between segments in the same
+	// way as for node.children (see comment for node.nrSegments).
+	node  *usagenode // nil if the iterator is terminal
+	index int        // position between segments within node
+}
+
+// Ok reports whether gap points at a gap, i.e. whether it is not a terminal
+// iterator. Every other method requires a non-terminal iterator.
+func (gap usageGapIterator) Ok() bool {
+	terminal := gap.node == nil
+	return !terminal
+}
+
+// Range returns the range covered by the iterated gap, from gap.Start() to
+// gap.End().
+func (gap usageGapIterator) Range() __generics_imported0.FileRange {
+	return __generics_imported0.FileRange{
+		Start: gap.Start(),
+		End:   gap.End(),
+	}
+}
+
+// Start returns the same value as Range().Start. Prefer it when only the start
+// of the range is needed. The gap starts where the preceding segment ends, or
+// at the minimum key if no segment precedes it.
+func (gap usageGapIterator) Start() uint64 {
+	prev := gap.PrevSegment()
+	if !prev.Ok() {
+		return usageSetFunctions{}.MinKey()
+	}
+	return prev.End()
+}
+
+// End returns the same value as Range().End. Prefer it when only the end of
+// the range is needed. The gap ends where the following segment starts, or at
+// the maximum key if no segment follows it.
+func (gap usageGapIterator) End() uint64 {
+	next := gap.NextSegment()
+	if !next.Ok() {
+		return usageSetFunctions{}.MaxKey()
+	}
+	return next.Start()
+}
+
+// IsEmpty reports whether the iterated gap has zero length, which is the case
+// for the "gap" between two adjacent segments.
+func (gap usageGapIterator) IsEmpty() bool {
+	r := gap.Range()
+	return r.Length() == 0
+}
+
+// PrevSegment returns an iterator to the segment immediately before the
+// iterated gap, or a terminal iterator if there is none.
+func (gap usageGapIterator) PrevSegment() usageIterator {
+	n, i := gap.node, gap.index
+	return usagesegmentBeforePosition(n, i)
+}
+
+// NextSegment returns an iterator to the segment immediately after the
+// iterated gap, or a terminal iterator if there is none.
+func (gap usageGapIterator) NextSegment() usageIterator {
+	n, i := gap.node, gap.index
+	return usagesegmentAfterPosition(n, i)
+}
+
+// PrevGap returns an iterator to the gap preceding the iterated one, reached
+// by stepping over the segment in between. It returns a terminal iterator if
+// no segment (and hence no gap) precedes the iterated gap.
+func (gap usageGapIterator) PrevGap() usageGapIterator {
+	if seg := gap.PrevSegment(); seg.Ok() {
+		return seg.PrevGap()
+	}
+	return usageGapIterator{}
+}
+
+// NextGap returns an iterator to the gap following the iterated one, reached
+// by stepping over the segment in between. It returns a terminal iterator if
+// no segment (and hence no gap) follows the iterated gap.
+func (gap usageGapIterator) NextGap() usageGapIterator {
+	if seg := gap.NextSegment(); seg.Ok() {
+		return seg.NextGap()
+	}
+	return usageGapIterator{}
+}
+
+// segmentBeforePosition returns the segment that precedes the position given
+// by n.children[i], whether or not a child is actually present there. It walks
+// up through parents while the position is the leftmost one in its node, and
+// returns a terminal iterator if it runs out of parents.
+func usagesegmentBeforePosition(n *usagenode, i int) usageIterator {
+	for {
+		if i != 0 {
+			return usageIterator{node: n, index: i - 1}
+		}
+		if n.parent == nil {
+			return usageIterator{}
+		}
+		n, i = n.parent, n.parentIndex
+	}
+}
+
+// segmentAfterPosition returns the segment that follows the position given by
+// n.children[i], whether or not a child is actually present there. It walks up
+// through parents while the position is the rightmost one in its node, and
+// returns a terminal iterator if it runs out of parents.
+func usagesegmentAfterPosition(n *usagenode, i int) usageIterator {
+	for {
+		if i != n.nrSegments {
+			return usageIterator{node: n, index: i}
+		}
+		if n.parent == nil {
+			return usageIterator{}
+		}
+		n, i = n.parent, n.parentIndex
+	}
+}
+
+// usagezeroValueSlice calls usageSetFunctions.ClearValue on every element of
+// slice, in index order.
+func usagezeroValueSlice(slice []usageInfo) {
+	for i := 0; i < len(slice); i++ {
+		usageSetFunctions{}.ClearValue(&slice[i])
+	}
+}
+
+// usagezeroNodeSlice sets every element of slice to nil.
+func usagezeroNodeSlice(slice []*usagenode) {
+	for i := 0; i < len(slice); i++ {
+		slice[i] = nil
+	}
+}
+
+// String returns a debugging representation of the set, produced by
+// stringifying its root node.
+func (s *usageSet) String() string {
+	str := s.root.String()
+	return str
+}
+
+// String stringifes a node (and all of its children) for debugging.
+func (n *usagenode) String() string {
+	var buf bytes.Buffer
+	n.writeDebugString(&buf, "")
+	return buf.String()
+}
+
+// writeDebugString appends a human-readable dump of n and its descendants to
+// buf. Each segment of n is written on its own line as "index: key => value",
+// preceded by prefix; child i is dumped before segment i, and the last child
+// after the last segment, with the child's index appended to the prefix.
+//
+// Inconsistencies are reported inline as WARNING lines instead of panicking,
+// so that a damaged tree can still be dumped: a hasChildren flag that
+// disagrees with the presence of a first child, and any child (including the
+// last one) whose parent/parentIndex linkage does not point back at n.
+func (n *usagenode) writeDebugString(buf *bytes.Buffer, prefix string) {
+	if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) {
+		buf.WriteString(prefix)
+		fmt.Fprintf(buf, "WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)
+	}
+	// A node with nrSegments segments has up to nrSegments+1 children, so the
+	// loop runs one position past the last segment to cover the final child.
+	for i := 0; i <= n.nrSegments; i++ {
+		if child := n.children[i]; child != nil {
+			cprefix := fmt.Sprintf("%s- % 3d ", prefix, i)
+			if child.parent != n || child.parentIndex != i {
+				buf.WriteString(cprefix)
+				fmt.Fprintf(buf, "WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)
+			}
+			child.writeDebugString(buf, cprefix)
+		}
+		if i == n.nrSegments {
+			// No segment follows the last child.
+			break
+		}
+		buf.WriteString(prefix)
+		fmt.Fprintf(buf, "- % 3d: %v => %v\n", i, n.keys[i], n.values[i])
+	}
+}
+
+// SegmentDataSlices represents segments from a set as slices of start, end, and
+// values. SegmentDataSlices is primarily used as an intermediate representation
+// for save/restore and the layout here is optimized for that.
+//
+// +stateify savable
+type usageSegmentDataSlices struct {
+	Start  []uint64    // Start[i] is the start of the i-th segment's range
+	End    []uint64    // End[i] is the end of the i-th segment's range
+	Values []usageInfo // Values[i] is the value of the i-th segment
+}
+
+// ExportSortedSlice returns a copy of all segments in the given set, in ascending
+// key order.
+func (s *usageSet) ExportSortedSlices() *usageSegmentDataSlices {
+	var sds usageSegmentDataSlices
+	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+		sds.Start = append(sds.Start, seg.Start())
+		sds.End = append(sds.End, seg.End())
+		sds.Values = append(sds.Values, seg.Value())
+	}
+	sds.Start = sds.Start[:len(sds.Start):len(sds.Start)]
+	sds.End = sds.End[:len(sds.End):len(sds.End)]
+	sds.Values = sds.Values[:len(sds.Values):len(sds.Values)]
+	return &sds
+}
+
+// ImportSortedSlices initializes the given set from the given slices,
+// inserting one segment per index of sds.Start without merging.
+//
+// Preconditions: s must be empty. sds must represent a valid set (the segments
+// in sds must have valid lengths that do not overlap). The segments in sds
+// must be sorted in ascending key order.
+//
+// ImportSortedSlices returns an error, rather than panicking, if s is not
+// empty, if sds.End or sds.Values has fewer elements than sds.Start, if a
+// segment's start is not strictly below its end, or if a segment does not fit
+// in the gap following the previously imported segment. Segments imported
+// before the failing one remain in s.
+func (s *usageSet) ImportSortedSlices(sds *usageSegmentDataSlices) error {
+	if !s.IsEmpty() {
+		return fmt.Errorf("cannot import into non-empty set %v", s)
+	}
+	// Every start needs a matching end and value; checking up front turns an
+	// index-out-of-range panic in the loop below into an error.
+	if n := len(sds.Start); len(sds.End) < n || len(sds.Values) < n {
+		return fmt.Errorf("segment data is truncated: %d starts, %d ends, %d values", n, len(sds.End), len(sds.Values))
+	}
+	gap := s.FirstGap()
+	for i := range sds.Start {
+		// Reject empty or inverted ranges explicitly, since this function
+		// reports precondition violations as errors.
+		if sds.Start[i] >= sds.End[i] {
+			return fmt.Errorf("segment has an invalid range: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i])
+		}
+		r := __generics_imported0.FileRange{Start: sds.Start[i], End: sds.End[i]}
+		if !gap.Range().IsSupersetOf(r) {
+			return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i])
+		}
+		gap = s.InsertWithoutMerging(gap, r, sds.Values[i]).NextGap()
+	}
+	return nil
+}
+// saveRoot returns the set's segments in the form produced by
+// ExportSortedSlices.
+func (s *usageSet) saveRoot() *usageSegmentDataSlices {
+	sds := s.ExportSortedSlices()
+	return sds
+}
+
+// loadRoot populates the set from sds via ImportSortedSlices and panics with
+// the returned error if the import fails.
+func (s *usageSet) loadRoot(sds *usageSegmentDataSlices) {
+	err := s.ImportSortedSlices(sds)
+	if err != nil {
+		panic(err)
+	}
+}
author	gVisor bot <gvisor-bot@google.com>	2019-06-02 06:44:55 +0000
committer	gVisor bot <gvisor-bot@google.com>	2019-06-02 06:44:55 +0000
commit	ceb0d792f328d1fc0692197d8856a43c3936a571 (patch)
tree	83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/pgalloc
parent	deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff)
parent	216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff)