diff options
Diffstat (limited to 'pkg/sentry/pgalloc')
-rw-r--r-- | pkg/sentry/pgalloc/context.go | 48 | ||||
-rwxr-xr-x | pkg/sentry/pgalloc/evictable_range.go | 62 | ||||
-rwxr-xr-x | pkg/sentry/pgalloc/evictable_range_set.go | 1270 | ||||
-rw-r--r-- | pkg/sentry/pgalloc/pgalloc.go | 1187 | ||||
-rwxr-xr-x | pkg/sentry/pgalloc/pgalloc_state_autogen.go | 146 | ||||
-rw-r--r-- | pkg/sentry/pgalloc/pgalloc_unsafe.go | 40 | ||||
-rw-r--r-- | pkg/sentry/pgalloc/save_restore.go | 210 | ||||
-rwxr-xr-x | pkg/sentry/pgalloc/usage_set.go | 1274 |
8 files changed, 4237 insertions, 0 deletions
diff --git a/pkg/sentry/pgalloc/context.go b/pkg/sentry/pgalloc/context.go new file mode 100644 index 000000000..cb9809b1f --- /dev/null +++ b/pkg/sentry/pgalloc/context.go @@ -0,0 +1,48 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pgalloc + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextID is this package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxMemoryFile is a Context.Value key for a MemoryFile. + CtxMemoryFile contextID = iota + + // CtxMemoryFileProvider is a Context.Value key for a MemoryFileProvider. + CtxMemoryFileProvider +) + +// MemoryFileFromContext returns the MemoryFile used by ctx, or nil if no such +// MemoryFile exists. +func MemoryFileFromContext(ctx context.Context) *MemoryFile { + if v := ctx.Value(CtxMemoryFile); v != nil { + return v.(*MemoryFile) + } + return nil +} + +// MemoryFileProviderFromContext returns the MemoryFileProvider used by ctx, or nil if no such +// MemoryFileProvider exists. +func MemoryFileProviderFromContext(ctx context.Context) MemoryFileProvider { + if v := ctx.Value(CtxMemoryFileProvider); v != nil { + return v.(MemoryFileProvider) + } + return nil +} diff --git a/pkg/sentry/pgalloc/evictable_range.go b/pkg/sentry/pgalloc/evictable_range.go new file mode 100755 index 000000000..10ce2ff44 --- /dev/null +++ b/pkg/sentry/pgalloc/evictable_range.go @@ -0,0 +1,62 @@ +package pgalloc + +// A Range represents a contiguous range of T. +// +// +stateify savable +type EvictableRange struct { + // Start is the inclusive start of the range. + Start uint64 + + // End is the exclusive end of the range. + End uint64 +} + +// WellFormed returns true if r.Start <= r.End. All other methods on a Range +// require that the Range is well-formed. +func (r EvictableRange) WellFormed() bool { + return r.Start <= r.End +} + +// Length returns the length of the range. +func (r EvictableRange) Length() uint64 { + return r.End - r.Start +} + +// Contains returns true if r contains x. +func (r EvictableRange) Contains(x uint64) bool { + return r.Start <= x && x < r.End +} + +// Overlaps returns true if r and r2 overlap. +func (r EvictableRange) Overlaps(r2 EvictableRange) bool { + return r.Start < r2.End && r2.Start < r.End +} + +// IsSupersetOf returns true if r is a superset of r2; that is, the range r2 is +// contained within r. +func (r EvictableRange) IsSupersetOf(r2 EvictableRange) bool { + return r.Start <= r2.Start && r.End >= r2.End +} + +// Intersect returns a range consisting of the intersection between r and r2. +// If r and r2 do not overlap, Intersect returns a range with unspecified +// bounds, but for which Length() == 0. +func (r EvictableRange) Intersect(r2 EvictableRange) EvictableRange { + if r.Start < r2.Start { + r.Start = r2.Start + } + if r.End > r2.End { + r.End = r2.End + } + if r.End < r.Start { + r.End = r.Start + } + return r +} + +// CanSplitAt returns true if it is legal to split a segment spanning the range +// r at x; that is, splitting at x would produce two ranges, both of which have +// non-zero length. +func (r EvictableRange) CanSplitAt(x uint64) bool { + return r.Contains(x) && r.Start < x +} diff --git a/pkg/sentry/pgalloc/evictable_range_set.go b/pkg/sentry/pgalloc/evictable_range_set.go new file mode 100755 index 000000000..a4dcb1663 --- /dev/null +++ b/pkg/sentry/pgalloc/evictable_range_set.go @@ -0,0 +1,1270 @@ +package pgalloc + +import ( + "bytes" + "fmt" +) + +const ( + // minDegree is the minimum degree of an internal node in a Set B-tree. + // + // - Any non-root node has at least minDegree-1 segments. + // + // - Any non-root internal (non-leaf) node has at least minDegree children. + // + // - The root node may have fewer than minDegree-1 segments, but it may + // only have 0 segments if the tree is empty. + // + // Our implementation requires minDegree >= 3. Higher values of minDegree + // usually improve performance, but increase memory usage for small sets. + evictableRangeminDegree = 3 + + evictableRangemaxDegree = 2 * evictableRangeminDegree +) + +// A Set is a mapping of segments with non-overlapping Range keys. The zero +// value for a Set is an empty set. Set values are not safely movable nor +// copyable. Set is thread-compatible. +// +// +stateify savable +type evictableRangeSet struct { + root evictableRangenode `state:".(*evictableRangeSegmentDataSlices)"` +} + +// IsEmpty returns true if the set contains no segments. +func (s *evictableRangeSet) IsEmpty() bool { + return s.root.nrSegments == 0 +} + +// IsEmptyRange returns true iff no segments in the set overlap the given +// range. This is semantically equivalent to s.SpanRange(r) == 0, but may be +// more efficient. +func (s *evictableRangeSet) IsEmptyRange(r EvictableRange) bool { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return true + } + _, gap := s.Find(r.Start) + if !gap.Ok() { + return false + } + return r.End <= gap.End() +} + +// Span returns the total size of all segments in the set. +func (s *evictableRangeSet) Span() uint64 { + var sz uint64 + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sz += seg.Range().Length() + } + return sz +} + +// SpanRange returns the total size of the intersection of segments in the set +// with the given range. +func (s *evictableRangeSet) SpanRange(r EvictableRange) uint64 { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return 0 + } + var sz uint64 + for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { + sz += seg.Range().Intersect(r).Length() + } + return sz +} + +// FirstSegment returns the first segment in the set. If the set is empty, +// FirstSegment returns a terminal iterator. +func (s *evictableRangeSet) FirstSegment() evictableRangeIterator { + if s.root.nrSegments == 0 { + return evictableRangeIterator{} + } + return s.root.firstSegment() +} + +// LastSegment returns the last segment in the set. If the set is empty, +// LastSegment returns a terminal iterator. +func (s *evictableRangeSet) LastSegment() evictableRangeIterator { + if s.root.nrSegments == 0 { + return evictableRangeIterator{} + } + return s.root.lastSegment() +} + +// FirstGap returns the first gap in the set. +func (s *evictableRangeSet) FirstGap() evictableRangeGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[0] + } + return evictableRangeGapIterator{n, 0} +} + +// LastGap returns the last gap in the set. +func (s *evictableRangeSet) LastGap() evictableRangeGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[n.nrSegments] + } + return evictableRangeGapIterator{n, n.nrSegments} +} + +// Find returns the segment or gap whose range contains the given key. If a +// segment is found, the returned Iterator is non-terminal and the +// returned GapIterator is terminal. Otherwise, the returned Iterator is +// terminal and the returned GapIterator is non-terminal. +func (s *evictableRangeSet) Find(key uint64) (evictableRangeIterator, evictableRangeGapIterator) { + n := &s.root + for { + + lower := 0 + upper := n.nrSegments + for lower < upper { + i := lower + (upper-lower)/2 + if r := n.keys[i]; key < r.End { + if key >= r.Start { + return evictableRangeIterator{n, i}, evictableRangeGapIterator{} + } + upper = i + } else { + lower = i + 1 + } + } + i := lower + if !n.hasChildren { + return evictableRangeIterator{}, evictableRangeGapIterator{n, i} + } + n = n.children[i] + } +} + +// FindSegment returns the segment whose range contains the given key. If no +// such segment exists, FindSegment returns a terminal iterator. +func (s *evictableRangeSet) FindSegment(key uint64) evictableRangeIterator { + seg, _ := s.Find(key) + return seg +} + +// LowerBoundSegment returns the segment with the lowest range that contains a +// key greater than or equal to min. If no such segment exists, +// LowerBoundSegment returns a terminal iterator. +func (s *evictableRangeSet) LowerBoundSegment(min uint64) evictableRangeIterator { + seg, gap := s.Find(min) + if seg.Ok() { + return seg + } + return gap.NextSegment() +} + +// UpperBoundSegment returns the segment with the highest range that contains a +// key less than or equal to max. If no such segment exists, UpperBoundSegment +// returns a terminal iterator. +func (s *evictableRangeSet) UpperBoundSegment(max uint64) evictableRangeIterator { + seg, gap := s.Find(max) + if seg.Ok() { + return seg + } + return gap.PrevSegment() +} + +// FindGap returns the gap containing the given key. If no such gap exists +// (i.e. the set contains a segment containing that key), FindGap returns a +// terminal iterator. +func (s *evictableRangeSet) FindGap(key uint64) evictableRangeGapIterator { + _, gap := s.Find(key) + return gap +} + +// LowerBoundGap returns the gap with the lowest range that is greater than or +// equal to min. +func (s *evictableRangeSet) LowerBoundGap(min uint64) evictableRangeGapIterator { + seg, gap := s.Find(min) + if gap.Ok() { + return gap + } + return seg.NextGap() +} + +// UpperBoundGap returns the gap with the highest range that is less than or +// equal to max. +func (s *evictableRangeSet) UpperBoundGap(max uint64) evictableRangeGapIterator { + seg, gap := s.Find(max) + if gap.Ok() { + return gap + } + return seg.PrevGap() +} + +// Add inserts the given segment into the set and returns true. If the new +// segment can be merged with adjacent segments, Add will do so. If the new +// segment would overlap an existing segment, Add returns false. If Add +// succeeds, all existing iterators are invalidated. +func (s *evictableRangeSet) Add(r EvictableRange, val evictableRangeSetValue) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.Insert(gap, r, val) + return true +} + +// AddWithoutMerging inserts the given segment into the set and returns true. +// If it would overlap an existing segment, AddWithoutMerging does nothing and +// returns false. If AddWithoutMerging succeeds, all existing iterators are +// invalidated. +func (s *evictableRangeSet) AddWithoutMerging(r EvictableRange, val evictableRangeSetValue) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.InsertWithoutMergingUnchecked(gap, r, val) + return true +} + +// Insert inserts the given segment into the given gap. If the new segment can +// be merged with adjacent segments, Insert will do so. Insert returns an +// iterator to the segment containing the inserted value (which may have been +// merged with other values). All existing iterators (including gap, but not +// including the returned iterator) are invalidated. +// +// If the gap cannot accommodate the segment, or if r is invalid, Insert panics. +// +// Insert is semantically equivalent to a InsertWithoutMerging followed by a +// Merge, but may be more efficient. Note that there is no unchecked variant of +// Insert since Insert must retrieve and inspect gap's predecessor and +// successor segments regardless. +func (s *evictableRangeSet) Insert(gap evictableRangeGapIterator, r EvictableRange, val evictableRangeSetValue) evictableRangeIterator { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + prev, next := gap.PrevSegment(), gap.NextSegment() + if prev.Ok() && prev.End() > r.Start { + panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) + } + if next.Ok() && next.Start() < r.End { + panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) + } + if prev.Ok() && prev.End() == r.Start { + if mval, ok := (evictableRangeSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { + prev.SetEndUnchecked(r.End) + prev.SetValue(mval) + if next.Ok() && next.Start() == r.End { + val = mval + if mval, ok := (evictableRangeSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { + prev.SetEndUnchecked(next.End()) + prev.SetValue(mval) + return s.Remove(next).PrevSegment() + } + } + return prev + } + } + if next.Ok() && next.Start() == r.End { + if mval, ok := (evictableRangeSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { + next.SetStartUnchecked(r.Start) + next.SetValue(mval) + return next + } + } + return s.InsertWithoutMergingUnchecked(gap, r, val) +} + +// InsertWithoutMerging inserts the given segment into the given gap and +// returns an iterator to the inserted segment. All existing iterators +// (including gap, but not including the returned iterator) are invalidated. +// +// If the gap cannot accommodate the segment, or if r is invalid, +// InsertWithoutMerging panics. +func (s *evictableRangeSet) InsertWithoutMerging(gap evictableRangeGapIterator, r EvictableRange, val evictableRangeSetValue) evictableRangeIterator { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + if gr := gap.Range(); !gr.IsSupersetOf(r) { + panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) + } + return s.InsertWithoutMergingUnchecked(gap, r, val) +} + +// InsertWithoutMergingUnchecked inserts the given segment into the given gap +// and returns an iterator to the inserted segment. All existing iterators +// (including gap, but not including the returned iterator) are invalidated. +// +// Preconditions: r.Start >= gap.Start(); r.End <= gap.End(). +func (s *evictableRangeSet) InsertWithoutMergingUnchecked(gap evictableRangeGapIterator, r EvictableRange, val evictableRangeSetValue) evictableRangeIterator { + gap = gap.node.rebalanceBeforeInsert(gap) + copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) + copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) + gap.node.keys[gap.index] = r + gap.node.values[gap.index] = val + gap.node.nrSegments++ + return evictableRangeIterator{gap.node, gap.index} +} + +// Remove removes the given segment and returns an iterator to the vacated gap. +// All existing iterators (including seg, but not including the returned +// iterator) are invalidated. +func (s *evictableRangeSet) Remove(seg evictableRangeIterator) evictableRangeGapIterator { + + if seg.node.hasChildren { + + victim := seg.PrevSegment() + + seg.SetRangeUnchecked(victim.Range()) + seg.SetValue(victim.Value()) + return s.Remove(victim).NextGap() + } + copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) + copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) + evictableRangeSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) + seg.node.nrSegments-- + return seg.node.rebalanceAfterRemove(evictableRangeGapIterator{seg.node, seg.index}) +} + +// RemoveAll removes all segments from the set. All existing iterators are +// invalidated. +func (s *evictableRangeSet) RemoveAll() { + s.root = evictableRangenode{} +} + +// RemoveRange removes all segments in the given range. An iterator to the +// newly formed gap is returned, and all existing iterators are invalidated. +func (s *evictableRangeSet) RemoveRange(r EvictableRange) evictableRangeGapIterator { + seg, gap := s.Find(r.Start) + if seg.Ok() { + seg = s.Isolate(seg, r) + gap = s.Remove(seg) + } + for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { + seg = s.Isolate(seg, r) + gap = s.Remove(seg) + } + return gap +} + +// Merge attempts to merge two neighboring segments. If successful, Merge +// returns an iterator to the merged segment, and all existing iterators are +// invalidated. Otherwise, Merge returns a terminal iterator. +// +// If first is not the predecessor of second, Merge panics. +func (s *evictableRangeSet) Merge(first, second evictableRangeIterator) evictableRangeIterator { + if first.NextSegment() != second { + panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) + } + return s.MergeUnchecked(first, second) +} + +// MergeUnchecked attempts to merge two neighboring segments. If successful, +// MergeUnchecked returns an iterator to the merged segment, and all existing +// iterators are invalidated. Otherwise, MergeUnchecked returns a terminal +// iterator. +// +// Precondition: first is the predecessor of second: first.NextSegment() == +// second, first == second.PrevSegment(). +func (s *evictableRangeSet) MergeUnchecked(first, second evictableRangeIterator) evictableRangeIterator { + if first.End() == second.Start() { + if mval, ok := (evictableRangeSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { + + first.SetEndUnchecked(second.End()) + first.SetValue(mval) + return s.Remove(second).PrevSegment() + } + } + return evictableRangeIterator{} +} + +// MergeAll attempts to merge all adjacent segments in the set. All existing +// iterators are invalidated. +func (s *evictableRangeSet) MergeAll() { + seg := s.FirstSegment() + if !seg.Ok() { + return + } + next := seg.NextSegment() + for next.Ok() { + if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { + seg, next = mseg, mseg.NextSegment() + } else { + seg, next = next, next.NextSegment() + } + } +} + +// MergeRange attempts to merge all adjacent segments that contain a key in the +// specific range. All existing iterators are invalidated. +func (s *evictableRangeSet) MergeRange(r EvictableRange) { + seg := s.LowerBoundSegment(r.Start) + if !seg.Ok() { + return + } + next := seg.NextSegment() + for next.Ok() && next.Range().Start < r.End { + if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { + seg, next = mseg, mseg.NextSegment() + } else { + seg, next = next, next.NextSegment() + } + } +} + +// MergeAdjacent attempts to merge the segment containing r.Start with its +// predecessor, and the segment containing r.End-1 with its successor. +func (s *evictableRangeSet) MergeAdjacent(r EvictableRange) { + first := s.FindSegment(r.Start) + if first.Ok() { + if prev := first.PrevSegment(); prev.Ok() { + s.Merge(prev, first) + } + } + last := s.FindSegment(r.End - 1) + if last.Ok() { + if next := last.NextSegment(); next.Ok() { + s.Merge(last, next) + } + } +} + +// Split splits the given segment at the given key and returns iterators to the +// two resulting segments. All existing iterators (including seg, but not +// including the returned iterators) are invalidated. +// +// If the segment cannot be split at split (because split is at the start or +// end of the segment's range, so splitting would produce a segment with zero +// length, or because split falls outside the segment's range altogether), +// Split panics. +func (s *evictableRangeSet) Split(seg evictableRangeIterator, split uint64) (evictableRangeIterator, evictableRangeIterator) { + if !seg.Range().CanSplitAt(split) { + panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) + } + return s.SplitUnchecked(seg, split) +} + +// SplitUnchecked splits the given segment at the given key and returns +// iterators to the two resulting segments. All existing iterators (including +// seg, but not including the returned iterators) are invalidated. +// +// Preconditions: seg.Start() < key < seg.End(). +func (s *evictableRangeSet) SplitUnchecked(seg evictableRangeIterator, split uint64) (evictableRangeIterator, evictableRangeIterator) { + val1, val2 := (evictableRangeSetFunctions{}).Split(seg.Range(), seg.Value(), split) + end2 := seg.End() + seg.SetEndUnchecked(split) + seg.SetValue(val1) + seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), EvictableRange{split, end2}, val2) + + return seg2.PrevSegment(), seg2 +} + +// SplitAt splits the segment straddling split, if one exists. SplitAt returns +// true if a segment was split and false otherwise. If SplitAt splits a +// segment, all existing iterators are invalidated. +func (s *evictableRangeSet) SplitAt(split uint64) bool { + if seg := s.FindSegment(split); seg.Ok() && seg.Range().CanSplitAt(split) { + s.SplitUnchecked(seg, split) + return true + } + return false +} + +// Isolate ensures that the given segment's range does not escape r by +// splitting at r.Start and r.End if necessary, and returns an updated iterator +// to the bounded segment. All existing iterators (including seg, but not +// including the returned iterators) are invalidated. +func (s *evictableRangeSet) Isolate(seg evictableRangeIterator, r EvictableRange) evictableRangeIterator { + if seg.Range().CanSplitAt(r.Start) { + _, seg = s.SplitUnchecked(seg, r.Start) + } + if seg.Range().CanSplitAt(r.End) { + seg, _ = s.SplitUnchecked(seg, r.End) + } + return seg +} + +// ApplyContiguous applies a function to a contiguous range of segments, +// splitting if necessary. The function is applied until the first gap is +// encountered, at which point the gap is returned. If the function is applied +// across the entire range, a terminal gap is returned. All existing iterators +// are invalidated. +// +// N.B. The Iterator must not be invalidated by the function. +func (s *evictableRangeSet) ApplyContiguous(r EvictableRange, fn func(seg evictableRangeIterator)) evictableRangeGapIterator { + seg, gap := s.Find(r.Start) + if !seg.Ok() { + return gap + } + for { + seg = s.Isolate(seg, r) + fn(seg) + if seg.End() >= r.End { + return evictableRangeGapIterator{} + } + gap = seg.NextGap() + if !gap.IsEmpty() { + return gap + } + seg = gap.NextSegment() + if !seg.Ok() { + + return evictableRangeGapIterator{} + } + } +} + +// +stateify savable +type evictableRangenode struct { + // An internal binary tree node looks like: + // + // K + // / \ + // Cl Cr + // + // where all keys in the subtree rooted by Cl (the left subtree) are less + // than K (the key of the parent node), and all keys in the subtree rooted + // by Cr (the right subtree) are greater than K. + // + // An internal B-tree node's indexes work out to look like: + // + // K0 K1 K2 ... Kn-1 + // / \/ \/ \ ... / \ + // C0 C1 C2 C3 ... Cn-1 Cn + // + // where n is nrSegments. + nrSegments int + + // parent is a pointer to this node's parent. If this node is root, parent + // is nil. + parent *evictableRangenode + + // parentIndex is the index of this node in parent.children. + parentIndex int + + // Flag for internal nodes that is technically redundant with "children[0] + // != nil", but is stored in the first cache line. "hasChildren" rather + // than "isLeaf" because false must be the correct value for an empty root. + hasChildren bool + + // Nodes store keys and values in separate arrays to maximize locality in + // the common case (scanning keys for lookup). + keys [evictableRangemaxDegree - 1]EvictableRange + values [evictableRangemaxDegree - 1]evictableRangeSetValue + children [evictableRangemaxDegree]*evictableRangenode +} + +// firstSegment returns the first segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *evictableRangenode) firstSegment() evictableRangeIterator { + for n.hasChildren { + n = n.children[0] + } + return evictableRangeIterator{n, 0} +} + +// lastSegment returns the last segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *evictableRangenode) lastSegment() evictableRangeIterator { + for n.hasChildren { + n = n.children[n.nrSegments] + } + return evictableRangeIterator{n, n.nrSegments - 1} +} + +func (n *evictableRangenode) prevSibling() *evictableRangenode { + if n.parent == nil || n.parentIndex == 0 { + return nil + } + return n.parent.children[n.parentIndex-1] +} + +func (n *evictableRangenode) nextSibling() *evictableRangenode { + if n.parent == nil || n.parentIndex == n.parent.nrSegments { + return nil + } + return n.parent.children[n.parentIndex+1] +} + +// rebalanceBeforeInsert splits n and its ancestors if they are full, as +// required for insertion, and returns an updated iterator to the position +// represented by gap. +func (n *evictableRangenode) rebalanceBeforeInsert(gap evictableRangeGapIterator) evictableRangeGapIterator { + if n.parent != nil { + gap = n.parent.rebalanceBeforeInsert(gap) + } + if n.nrSegments < evictableRangemaxDegree-1 { + return gap + } + if n.parent == nil { + + left := &evictableRangenode{ + nrSegments: evictableRangeminDegree - 1, + parent: n, + parentIndex: 0, + hasChildren: n.hasChildren, + } + right := &evictableRangenode{ + nrSegments: evictableRangeminDegree - 1, + parent: n, + parentIndex: 1, + hasChildren: n.hasChildren, + } + copy(left.keys[:evictableRangeminDegree-1], n.keys[:evictableRangeminDegree-1]) + copy(left.values[:evictableRangeminDegree-1], n.values[:evictableRangeminDegree-1]) + copy(right.keys[:evictableRangeminDegree-1], n.keys[evictableRangeminDegree:]) + copy(right.values[:evictableRangeminDegree-1], n.values[evictableRangeminDegree:]) + n.keys[0], n.values[0] = n.keys[evictableRangeminDegree-1], n.values[evictableRangeminDegree-1] + evictableRangezeroValueSlice(n.values[1:]) + if n.hasChildren { + copy(left.children[:evictableRangeminDegree], n.children[:evictableRangeminDegree]) + copy(right.children[:evictableRangeminDegree], n.children[evictableRangeminDegree:]) + evictableRangezeroNodeSlice(n.children[2:]) + for i := 0; i < evictableRangeminDegree; i++ { + left.children[i].parent = left + left.children[i].parentIndex = i + right.children[i].parent = right + right.children[i].parentIndex = i + } + } + n.nrSegments = 1 + n.hasChildren = true + n.children[0] = left + n.children[1] = right + if gap.node != n { + return gap + } + if gap.index < evictableRangeminDegree { + return evictableRangeGapIterator{left, gap.index} + } + return evictableRangeGapIterator{right, gap.index - evictableRangeminDegree} + } + + copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) + copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) + n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[evictableRangeminDegree-1], n.values[evictableRangeminDegree-1] + copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) + for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { + n.parent.children[i].parentIndex = i + } + sibling := &evictableRangenode{ + nrSegments: evictableRangeminDegree - 1, + parent: n.parent, + parentIndex: n.parentIndex + 1, + hasChildren: n.hasChildren, + } + n.parent.children[n.parentIndex+1] = sibling + n.parent.nrSegments++ + copy(sibling.keys[:evictableRangeminDegree-1], n.keys[evictableRangeminDegree:]) + copy(sibling.values[:evictableRangeminDegree-1], n.values[evictableRangeminDegree:]) + evictableRangezeroValueSlice(n.values[evictableRangeminDegree-1:]) + if n.hasChildren { + copy(sibling.children[:evictableRangeminDegree], n.children[evictableRangeminDegree:]) + evictableRangezeroNodeSlice(n.children[evictableRangeminDegree:]) + for i := 0; i < evictableRangeminDegree; i++ { + sibling.children[i].parent = sibling + sibling.children[i].parentIndex = i + } + } + n.nrSegments = evictableRangeminDegree - 1 + + if gap.node != n { + return gap + } + if gap.index < evictableRangeminDegree { + return gap + } + return evictableRangeGapIterator{sibling, gap.index - evictableRangeminDegree} +} + +// rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient +// (contain fewer segments than required by B-tree invariants), as required for +// removal, and returns an updated iterator to the position represented by gap. +// +// Precondition: n is the only node in the tree that may currently violate a +// B-tree invariant. +func (n *evictableRangenode) rebalanceAfterRemove(gap evictableRangeGapIterator) evictableRangeGapIterator { + for { + if n.nrSegments >= evictableRangeminDegree-1 { + return gap + } + if n.parent == nil { + + return gap + } + + if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= evictableRangeminDegree { + copy(n.keys[1:], n.keys[:n.nrSegments]) + copy(n.values[1:], n.values[:n.nrSegments]) + n.keys[0] = n.parent.keys[n.parentIndex-1] + n.values[0] = n.parent.values[n.parentIndex-1] + n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] + n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] + evictableRangeSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) + if n.hasChildren { + copy(n.children[1:], n.children[:n.nrSegments+1]) + n.children[0] = sibling.children[sibling.nrSegments] + sibling.children[sibling.nrSegments] = nil + n.children[0].parent = n + n.children[0].parentIndex = 0 + for i := 1; i < n.nrSegments+2; i++ { + n.children[i].parentIndex = i + } + } + n.nrSegments++ + sibling.nrSegments-- + if gap.node == sibling && gap.index == sibling.nrSegments { + return evictableRangeGapIterator{n, 0} + } + if gap.node == n { + return evictableRangeGapIterator{n, gap.index + 1} + } + return gap + } + if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= evictableRangeminDegree { + n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] + n.values[n.nrSegments] = n.parent.values[n.parentIndex] + n.parent.keys[n.parentIndex] = sibling.keys[0] + n.parent.values[n.parentIndex] = sibling.values[0] + copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) + copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) + evictableRangeSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) + if n.hasChildren { + n.children[n.nrSegments+1] = sibling.children[0] + copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) + sibling.children[sibling.nrSegments] = nil + n.children[n.nrSegments+1].parent = n + n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 + for i := 0; i < sibling.nrSegments; i++ { + sibling.children[i].parentIndex = i + } + } + n.nrSegments++ + sibling.nrSegments-- + if gap.node == sibling { + if gap.index == 0 { + return evictableRangeGapIterator{n, n.nrSegments} + } + return evictableRangeGapIterator{sibling, gap.index - 1} + } + return gap + } + + p := n.parent + if p.nrSegments == 1 { + + left, right := p.children[0], p.children[1] + p.nrSegments = left.nrSegments + right.nrSegments + 1 + p.hasChildren = left.hasChildren + p.keys[left.nrSegments] = p.keys[0] + p.values[left.nrSegments] = p.values[0] + copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) + copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) + copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) + copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) + if left.hasChildren { + copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) + copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) + for i := 0; i < p.nrSegments+1; i++ { + p.children[i].parent = p + p.children[i].parentIndex = i + } + } else { + p.children[0] = nil + p.children[1] = nil + } + if gap.node == left { + return evictableRangeGapIterator{p, gap.index} + } + if gap.node == right { + return evictableRangeGapIterator{p, gap.index + left.nrSegments + 1} + } + return gap + } + // Merge n and either sibling, along with the segment separating the + // two, into whichever of the two nodes comes first. This is the + // reverse of the non-root splitting case in + // node.rebalanceBeforeInsert. + var left, right *evictableRangenode + if n.parentIndex > 0 { + left = n.prevSibling() + right = n + } else { + left = n + right = n.nextSibling() + } + + if gap.node == right { + gap = evictableRangeGapIterator{left, gap.index + left.nrSegments + 1} + } + left.keys[left.nrSegments] = p.keys[left.parentIndex] + left.values[left.nrSegments] = p.values[left.parentIndex] + copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) + copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) + if left.hasChildren { + copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) + for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { + left.children[i].parent = left + left.children[i].parentIndex = i + } + } + left.nrSegments += right.nrSegments + 1 + copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) + copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) + evictableRangeSetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) + copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) + for i := 0; i < p.nrSegments; i++ { + p.children[i].parentIndex = i + } + p.children[p.nrSegments] = nil + p.nrSegments-- + + n = p + } +} + +// A Iterator is conceptually one of: +// +// - A pointer to a segment in a set; or +// +// - A terminal iterator, which is a sentinel indicating that the end of +// iteration has been reached. +// +// Iterators are copyable values and are meaningfully equality-comparable. The +// zero value of Iterator is a terminal iterator. +// +// Unless otherwise specified, any mutation of a set invalidates all existing +// iterators into the set. +type evictableRangeIterator struct { + // node is the node containing the iterated segment. If the iterator is + // terminal, node is nil. + node *evictableRangenode + + // index is the index of the segment in node.keys/values. + index int +} + +// Ok returns true if the iterator is not terminal. All other methods are only +// valid for non-terminal iterators. +func (seg evictableRangeIterator) Ok() bool { + return seg.node != nil +} + +// Range returns the iterated segment's range key. +func (seg evictableRangeIterator) Range() EvictableRange { + return seg.node.keys[seg.index] +} + +// Start is equivalent to Range().Start, but should be preferred if only the +// start of the range is needed. +func (seg evictableRangeIterator) Start() uint64 { + return seg.node.keys[seg.index].Start +} + +// End is equivalent to Range().End, but should be preferred if only the end of +// the range is needed. +func (seg evictableRangeIterator) End() uint64 { + return seg.node.keys[seg.index].End +} + +// SetRangeUnchecked mutates the iterated segment's range key. This operation +// does not invalidate any iterators. +// +// Preconditions: +// +// - r.Length() > 0. +// +// - The new range must not overlap an existing one: If seg.NextSegment().Ok(), +// then r.end <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then +// r.start >= seg.PrevSegment().End(). +func (seg evictableRangeIterator) SetRangeUnchecked(r EvictableRange) { + seg.node.keys[seg.index] = r +} + +// SetRange mutates the iterated segment's range key. If the new range would +// cause the iterated segment to overlap another segment, or if the new range +// is invalid, SetRange panics. This operation does not invalidate any +// iterators. +func (seg evictableRangeIterator) SetRange(r EvictableRange) { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { + panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) + } + if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { + panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) + } + seg.SetRangeUnchecked(r) +} + +// SetStartUnchecked mutates the iterated segment's start. This operation does +// not invalidate any iterators. +// +// Preconditions: The new start must be valid: start < seg.End(); if +// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). +func (seg evictableRangeIterator) SetStartUnchecked(start uint64) { + seg.node.keys[seg.index].Start = start +} + +// SetStart mutates the iterated segment's start. If the new start value would +// cause the iterated segment to overlap another segment, or would result in an +// invalid range, SetStart panics. This operation does not invalidate any +// iterators. +func (seg evictableRangeIterator) SetStart(start uint64) { + if start >= seg.End() { + panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) + } + if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { + panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) + } + seg.SetStartUnchecked(start) +} + +// SetEndUnchecked mutates the iterated segment's end. This operation does not +// invalidate any iterators. +// +// Preconditions: The new end must be valid: end > seg.Start(); if +// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). +func (seg evictableRangeIterator) SetEndUnchecked(end uint64) { + seg.node.keys[seg.index].End = end +} + +// SetEnd mutates the iterated segment's end. If the new end value would cause +// the iterated segment to overlap another segment, or would result in an +// invalid range, SetEnd panics. This operation does not invalidate any +// iterators. +func (seg evictableRangeIterator) SetEnd(end uint64) { + if end <= seg.Start() { + panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) + } + if next := seg.NextSegment(); next.Ok() && end > next.Start() { + panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) + } + seg.SetEndUnchecked(end) +} + +// Value returns a copy of the iterated segment's value. +func (seg evictableRangeIterator) Value() evictableRangeSetValue { + return seg.node.values[seg.index] +} + +// ValuePtr returns a pointer to the iterated segment's value. The pointer is +// invalidated if the iterator is invalidated. This operation does not +// invalidate any iterators. +func (seg evictableRangeIterator) ValuePtr() *evictableRangeSetValue { + return &seg.node.values[seg.index] +} + +// SetValue mutates the iterated segment's value. This operation does not +// invalidate any iterators. +func (seg evictableRangeIterator) SetValue(val evictableRangeSetValue) { + seg.node.values[seg.index] = val +} + +// PrevSegment returns the iterated segment's predecessor. If there is no +// preceding segment, PrevSegment returns a terminal iterator. +func (seg evictableRangeIterator) PrevSegment() evictableRangeIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index].lastSegment() + } + if seg.index > 0 { + return evictableRangeIterator{seg.node, seg.index - 1} + } + if seg.node.parent == nil { + return evictableRangeIterator{} + } + return evictableRangesegmentBeforePosition(seg.node.parent, seg.node.parentIndex) +} + +// NextSegment returns the iterated segment's successor. If there is no +// succeeding segment, NextSegment returns a terminal iterator. +func (seg evictableRangeIterator) NextSegment() evictableRangeIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment() + } + if seg.index < seg.node.nrSegments-1 { + return evictableRangeIterator{seg.node, seg.index + 1} + } + if seg.node.parent == nil { + return evictableRangeIterator{} + } + return evictableRangesegmentAfterPosition(seg.node.parent, seg.node.parentIndex) +} + +// PrevGap returns the gap immediately before the iterated segment. +func (seg evictableRangeIterator) PrevGap() evictableRangeGapIterator { + if seg.node.hasChildren { + + return seg.node.children[seg.index].lastSegment().NextGap() + } + return evictableRangeGapIterator{seg.node, seg.index} +} + +// NextGap returns the gap immediately after the iterated segment. +func (seg evictableRangeIterator) NextGap() evictableRangeGapIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment().PrevGap() + } + return evictableRangeGapIterator{seg.node, seg.index + 1} +} + +// PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, +// or the gap before the iterated segment otherwise. If seg.Start() == +// Functions.MinKey(), PrevNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by PrevNonEmpty will be +// non-terminal. +func (seg evictableRangeIterator) PrevNonEmpty() (evictableRangeIterator, evictableRangeGapIterator) { + gap := seg.PrevGap() + if gap.Range().Length() != 0 { + return evictableRangeIterator{}, gap + } + return gap.PrevSegment(), evictableRangeGapIterator{} +} + +// NextNonEmpty returns the iterated segment's successor if it is adjacent, or +// the gap after the iterated segment otherwise. If seg.End() == +// Functions.MaxKey(), NextNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by NextNonEmpty will be +// non-terminal. +func (seg evictableRangeIterator) NextNonEmpty() (evictableRangeIterator, evictableRangeGapIterator) { + gap := seg.NextGap() + if gap.Range().Length() != 0 { + return evictableRangeIterator{}, gap + } + return gap.NextSegment(), evictableRangeGapIterator{} +} + +// A GapIterator is conceptually one of: +// +// - A pointer to a position between two segments, before the first segment, or +// after the last segment in a set, called a *gap*; or +// +// - A terminal iterator, which is a sentinel indicating that the end of +// iteration has been reached. +// +// Note that the gap between two adjacent segments exists (iterators to it are +// non-terminal), but has a length of zero. GapIterator.IsEmpty returns true +// for such gaps. An empty set contains a single gap, spanning the entire range +// of the set's keys. +// +// GapIterators are copyable values and are meaningfully equality-comparable. +// The zero value of GapIterator is a terminal iterator. +// +// Unless otherwise specified, any mutation of a set invalidates all existing +// iterators into the set. +type evictableRangeGapIterator struct { + // The representation of a GapIterator is identical to that of an Iterator, + // except that index corresponds to positions between segments in the same + // way as for node.children (see comment for node.nrSegments). + node *evictableRangenode + index int +} + +// Ok returns true if the iterator is not terminal. All other methods are only +// valid for non-terminal iterators. +func (gap evictableRangeGapIterator) Ok() bool { + return gap.node != nil +} + +// Range returns the range spanned by the iterated gap. +func (gap evictableRangeGapIterator) Range() EvictableRange { + return EvictableRange{gap.Start(), gap.End()} +} + +// Start is equivalent to Range().Start, but should be preferred if only the +// start of the range is needed. +func (gap evictableRangeGapIterator) Start() uint64 { + if ps := gap.PrevSegment(); ps.Ok() { + return ps.End() + } + return evictableRangeSetFunctions{}.MinKey() +} + +// End is equivalent to Range().End, but should be preferred if only the end of +// the range is needed. +func (gap evictableRangeGapIterator) End() uint64 { + if ns := gap.NextSegment(); ns.Ok() { + return ns.Start() + } + return evictableRangeSetFunctions{}.MaxKey() +} + +// IsEmpty returns true if the iterated gap is empty (that is, the "gap" is +// between two adjacent segments.) +func (gap evictableRangeGapIterator) IsEmpty() bool { + return gap.Range().Length() == 0 +} + +// PrevSegment returns the segment immediately before the iterated gap. If no +// such segment exists, PrevSegment returns a terminal iterator. +func (gap evictableRangeGapIterator) PrevSegment() evictableRangeIterator { + return evictableRangesegmentBeforePosition(gap.node, gap.index) +} + +// NextSegment returns the segment immediately after the iterated gap. If no +// such segment exists, NextSegment returns a terminal iterator. +func (gap evictableRangeGapIterator) NextSegment() evictableRangeIterator { + return evictableRangesegmentAfterPosition(gap.node, gap.index) +} + +// PrevGap returns the iterated gap's predecessor. If no such gap exists, +// PrevGap returns a terminal iterator. +func (gap evictableRangeGapIterator) PrevGap() evictableRangeGapIterator { + seg := gap.PrevSegment() + if !seg.Ok() { + return evictableRangeGapIterator{} + } + return seg.PrevGap() +} + +// NextGap returns the iterated gap's successor. If no such gap exists, NextGap +// returns a terminal iterator. +func (gap evictableRangeGapIterator) NextGap() evictableRangeGapIterator { + seg := gap.NextSegment() + if !seg.Ok() { + return evictableRangeGapIterator{} + } + return seg.NextGap() +} + +// segmentBeforePosition returns the predecessor segment of the position given +// by n.children[i], which may or may not contain a child. If no such segment +// exists, segmentBeforePosition returns a terminal iterator. +func evictableRangesegmentBeforePosition(n *evictableRangenode, i int) evictableRangeIterator { + for i == 0 { + if n.parent == nil { + return evictableRangeIterator{} + } + n, i = n.parent, n.parentIndex + } + return evictableRangeIterator{n, i - 1} +} + +// segmentAfterPosition returns the successor segment of the position given by +// n.children[i], which may or may not contain a child. If no such segment +// exists, segmentAfterPosition returns a terminal iterator. +func evictableRangesegmentAfterPosition(n *evictableRangenode, i int) evictableRangeIterator { + for i == n.nrSegments { + if n.parent == nil { + return evictableRangeIterator{} + } + n, i = n.parent, n.parentIndex + } + return evictableRangeIterator{n, i} +} + +func evictableRangezeroValueSlice(slice []evictableRangeSetValue) { + + for i := range slice { + evictableRangeSetFunctions{}.ClearValue(&slice[i]) + } +} + +func evictableRangezeroNodeSlice(slice []*evictableRangenode) { + for i := range slice { + slice[i] = nil + } +} + +// String stringifies a Set for debugging. +func (s *evictableRangeSet) String() string { + return s.root.String() +} + +// String stringifes a node (and all of its children) for debugging. +func (n *evictableRangenode) String() string { + var buf bytes.Buffer + n.writeDebugString(&buf, "") + return buf.String() +} + +func (n *evictableRangenode) writeDebugString(buf *bytes.Buffer, prefix string) { + if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { + buf.WriteString(prefix) + buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) + } + for i := 0; i < n.nrSegments; i++ { + if child := n.children[i]; child != nil { + cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) + if child.parent != n || child.parentIndex != i { + buf.WriteString(cprefix) + buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) + } + child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) + } + buf.WriteString(prefix) + buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) + } + if child := n.children[n.nrSegments]; child != nil { + child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) + } +} + +// SegmentDataSlices represents segments from a set as slices of start, end, and +// values. SegmentDataSlices is primarily used as an intermediate representation +// for save/restore and the layout here is optimized for that. +// +// +stateify savable +type evictableRangeSegmentDataSlices struct { + Start []uint64 + End []uint64 + Values []evictableRangeSetValue +} + +// ExportSortedSlice returns a copy of all segments in the given set, in ascending +// key order. +func (s *evictableRangeSet) ExportSortedSlices() *evictableRangeSegmentDataSlices { + var sds evictableRangeSegmentDataSlices + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sds.Start = append(sds.Start, seg.Start()) + sds.End = append(sds.End, seg.End()) + sds.Values = append(sds.Values, seg.Value()) + } + sds.Start = sds.Start[:len(sds.Start):len(sds.Start)] + sds.End = sds.End[:len(sds.End):len(sds.End)] + sds.Values = sds.Values[:len(sds.Values):len(sds.Values)] + return &sds +} + +// ImportSortedSlice initializes the given set from the given slice. +// +// Preconditions: s must be empty. sds must represent a valid set (the segments +// in sds must have valid lengths that do not overlap). The segments in sds +// must be sorted in ascending key order. +func (s *evictableRangeSet) ImportSortedSlices(sds *evictableRangeSegmentDataSlices) error { + if !s.IsEmpty() { + return fmt.Errorf("cannot import into non-empty set %v", s) + } + gap := s.FirstGap() + for i := range sds.Start { + r := EvictableRange{sds.Start[i], sds.End[i]} + if !gap.Range().IsSupersetOf(r) { + return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i]) + } + gap = s.InsertWithoutMerging(gap, r, sds.Values[i]).NextGap() + } + return nil +} +func (s *evictableRangeSet) saveRoot() *evictableRangeSegmentDataSlices { + return s.ExportSortedSlices() +} + +func (s *evictableRangeSet) loadRoot(sds *evictableRangeSegmentDataSlices) { + if err := s.ImportSortedSlices(sds); err != nil { + panic(err) + } +} diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go new file mode 100644 index 000000000..2b9924ad7 --- /dev/null +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -0,0 +1,1187 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pgalloc contains the page allocator subsystem, which manages memory +// that may be mapped into application address spaces. +// +// Lock order: +// +// pgalloc.MemoryFile.mu +// pgalloc.MemoryFile.mappingsMu +package pgalloc + +import ( + "fmt" + "math" + "os" + "sync" + "sync/atomic" + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// MemoryFile is a platform.File whose pages may be allocated to arbitrary +// users. +type MemoryFile struct { + // opts holds options passed to NewMemoryFile. opts is immutable. + opts MemoryFileOpts + + // MemoryFile owns a single backing file, which is modeled as follows: + // + // Each page in the file can be committed or uncommitted. A page is + // committed if the host kernel is spending resources to store its contents + // and uncommitted otherwise. This definition includes pages that the host + // kernel has swapped; this is intentional, to ensure that accounting does + // not change even if host kernel swapping behavior changes, and that + // memory used by pseudo-swap mechanisms like zswap is still accounted. + // + // The initial contents of uncommitted pages are implicitly zero bytes. A + // read or write to the contents of an uncommitted page causes it to be + // committed. This is the only event that can cause a uncommitted page to + // be committed. + // + // fallocate(FALLOC_FL_PUNCH_HOLE) (MemoryFile.Decommit) causes committed + // pages to be uncommitted. This is the only event that can cause a + // committed page to be uncommitted. + // + // Memory accounting is based on identifying the set of committed pages. + // Since we do not have direct access to the MMU, tracking reads and writes + // to uncommitted pages to detect commitment would introduce additional + // page faults, which would be prohibitively expensive. Instead, we query + // the host kernel to determine which pages are committed. + + // file is the backing file. The file pointer is immutable. + file *os.File + + mu sync.Mutex + + // usage maps each page in the file to metadata for that page. Pages for + // which no segment exists in usage are both unallocated (not in use) and + // uncommitted. + // + // Since usage stores usageInfo objects by value, clients should usually + // use usageIterator.ValuePtr() instead of usageIterator.Value() to get a + // pointer to the usageInfo rather than a copy. + // + // usage must be kept maximally merged (that is, there should never be two + // adjacent segments with the same values). At least markReclaimed depends + // on this property. + // + // usage is protected by mu. + usage usageSet + + // The UpdateUsage function scans all segments with knownCommitted set + // to false, sees which pages are committed and creates corresponding + // segments with knownCommitted set to true. + // + // In order to avoid unnecessary scans, usageExpected tracks the total + // file blocks expected. This is used to elide the scan when this + // matches the underlying file blocks. + // + // To track swapped pages, usageSwapped tracks the discrepency between + // what is observed in core and what is reported by the file. When + // usageSwapped is non-zero, a sweep will be performed at least every + // second. The start of the last sweep is recorded in usageLast. + // + // All usage attributes are all protected by mu. + usageExpected uint64 + usageSwapped uint64 + usageLast time.Time + + // minUnallocatedPage is the minimum page that may be unallocated. + // i.e., there are no unallocated pages below minUnallocatedPage. + // + // minUnallocatedPage is protected by mu. + minUnallocatedPage uint64 + + // fileSize is the size of the backing memory file in bytes. fileSize is + // always a power-of-two multiple of chunkSize. + // + // fileSize is protected by mu. + fileSize int64 + + // Pages from the backing file are mapped into the local address space on + // the granularity of large pieces called chunks. mappings is a []uintptr + // that stores, for each chunk, the start address of a mapping of that + // chunk in the current process' address space, or 0 if no such mapping + // exists. Once a chunk is mapped, it is never remapped or unmapped until + // the MemoryFile is destroyed. + // + // Mutating the mappings slice or its contents requires both holding + // mappingsMu and using atomic memory operations. (The slice is mutated + // whenever the file is expanded. Per the above, the only permitted + // mutation of the slice's contents is the assignment of a mapping to a + // chunk that was previously unmapped.) Reading the slice or its contents + // only requires *either* holding mappingsMu or using atomic memory + // operations. This allows MemoryFile.MapInternal to avoid locking in the + // common case where chunk mappings already exist. + mappingsMu sync.Mutex + mappings atomic.Value + + // destroyed is set by Destroy to instruct the reclaimer goroutine to + // release resources and exit. destroyed is protected by mu. + destroyed bool + + // reclaimable is true if usage may contain reclaimable pages. reclaimable + // is protected by mu. + reclaimable bool + + // minReclaimablePage is the minimum page that may be reclaimable. + // i.e., all reclaimable pages are >= minReclaimablePage. + // + // minReclaimablePage is protected by mu. + minReclaimablePage uint64 + + // reclaimCond is signaled (with mu locked) when reclaimable or destroyed + // transitions from false to true. + reclaimCond sync.Cond + + // evictable maps EvictableMemoryUsers to eviction state. + // + // evictable is protected by mu. + evictable map[EvictableMemoryUser]*evictableMemoryUserInfo + + // evictionWG counts the number of goroutines currently performing evictions. + evictionWG sync.WaitGroup +} + +// MemoryFileOpts provides options to NewMemoryFile. +type MemoryFileOpts struct { + // DelayedEviction controls the extent to which the MemoryFile may delay + // eviction of evictable allocations. + DelayedEviction DelayedEvictionType +} + +// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction. +type DelayedEvictionType int + +const ( + // DelayedEvictionDefault has unspecified behavior. + DelayedEvictionDefault DelayedEvictionType = iota + + // DelayedEvictionDisabled requires that evictable allocations are evicted + // as soon as possible. + DelayedEvictionDisabled + + // DelayedEvictionEnabled requests that the MemoryFile delay eviction of + // evictable allocations until doing so is considered necessary to avoid + // performance degradation due to host memory pressure, or OOM kills. + // + // As of this writing, DelayedEvictionEnabled delays evictions until the + // reclaimer goroutine is out of work (pages to reclaim), then evicts all + // pending evictable allocations immediately. + DelayedEvictionEnabled + + // DelayedEvictionManual requires that evictable allocations are only + // evicted when MemoryFile.StartEvictions() is called. This is extremely + // dangerous outside of tests. + DelayedEvictionManual +) + +// usageInfo tracks usage information. +// +// +stateify savable +type usageInfo struct { + // kind is the usage kind. + kind usage.MemoryKind + + // knownCommitted is true if the tracked region is definitely committed. + // (If it is false, the tracked region may or may not be committed.) + knownCommitted bool + + refs uint64 +} + +// An EvictableMemoryUser represents a user of MemoryFile-allocated memory that +// may be asked to deallocate that memory in the presence of memory pressure. +type EvictableMemoryUser interface { + // Evict requests that the EvictableMemoryUser deallocate memory used by + // er, which was registered as evictable by a previous call to + // MemoryFile.MarkEvictable. + // + // Evict is not required to deallocate memory. In particular, since pgalloc + // must call Evict without holding locks to avoid circular lock ordering, + // it is possible that the passed range has already been marked as + // unevictable by a racing call to MemoryFile.MarkUnevictable. + // Implementations of EvictableMemoryUser must detect such races and handle + // them by making Evict have no effect on unevictable ranges. + // + // After a call to Evict, the MemoryFile will consider the evicted range + // unevictable (i.e. it will not call Evict on the same range again) until + // informed otherwise by a subsequent call to MarkEvictable. + Evict(ctx context.Context, er EvictableRange) +} + +// An EvictableRange represents a range of uint64 offsets in an +// EvictableMemoryUser. +// +// In practice, most EvictableMemoryUsers will probably be implementations of +// memmap.Mappable, and EvictableRange therefore corresponds to +// memmap.MappableRange. However, this package cannot depend on the memmap +// package, since doing so would create a circular dependency. +// +// type EvictableRange <generated using go_generics> + +// evictableMemoryUserInfo is the value type of MemoryFile.evictable. +type evictableMemoryUserInfo struct { + // ranges tracks all evictable ranges for the given user. + ranges evictableRangeSet + + // If evicting is true, there is a goroutine currently evicting all + // evictable ranges for this user. + evicting bool +} + +const ( + chunkShift = 24 + chunkSize = 1 << chunkShift // 16 MB + chunkMask = chunkSize - 1 + + initialSize = chunkSize + + // maxPage is the highest 64-bit page. + maxPage = math.MaxUint64 &^ (usermem.PageSize - 1) +) + +// NewMemoryFile creates a MemoryFile backed by the given file. If +// NewMemoryFile succeeds, ownership of file is transferred to the returned +// MemoryFile. +func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) { + switch opts.DelayedEviction { + case DelayedEvictionDefault: + opts.DelayedEviction = DelayedEvictionEnabled + case DelayedEvictionDisabled, DelayedEvictionEnabled, DelayedEvictionManual: + default: + return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction) + } + + // Truncate the file to 0 bytes first to ensure that it's empty. + if err := file.Truncate(0); err != nil { + return nil, err + } + if err := file.Truncate(initialSize); err != nil { + return nil, err + } + f := &MemoryFile{ + opts: opts, + fileSize: initialSize, + file: file, + // No pages are reclaimable. DecRef will always be able to + // decrease minReclaimablePage from this point. + minReclaimablePage: maxPage, + evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo), + } + f.mappings.Store(make([]uintptr, initialSize/chunkSize)) + f.reclaimCond.L = &f.mu + go f.runReclaim() // S/R-SAFE: f.mu + + // The Linux kernel contains an optional feature called "Integrity + // Measurement Architecture" (IMA). If IMA is enabled, it will checksum + // binaries the first time they are mapped PROT_EXEC. This is bad news for + // executable pages mapped from our backing file, which can grow to + // terabytes in (sparse) size. If IMA attempts to checksum a file that + // large, it will allocate all of the sparse pages and quickly exhaust all + // memory. + // + // Work around IMA by immediately creating a temporary PROT_EXEC mapping, + // while the backing file is still small. IMA will ignore any future + // mappings. + m, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + 0, + usermem.PageSize, + syscall.PROT_EXEC, + syscall.MAP_SHARED, + file.Fd(), + 0) + if errno != 0 { + // This isn't fatal (IMA may not even be in use). Log the error, but + // don't return it. + log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno) + } else { + if _, _, errno := syscall.Syscall( + syscall.SYS_MUNMAP, + m, + usermem.PageSize, + 0); errno != 0 { + panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno)) + } + } + + return f, nil +} + +// Destroy releases all resources used by f. +// +// Preconditions: All pages allocated by f have been freed. +// +// Postconditions: None of f's methods may be called after Destroy. +func (f *MemoryFile) Destroy() { + f.mu.Lock() + defer f.mu.Unlock() + f.destroyed = true + f.reclaimCond.Signal() +} + +// Allocate returns a range of initially-zeroed pages of the given length with +// the given accounting kind and a single reference held by the caller. When +// the last reference on an allocated page is released, ownership of the page +// is returned to the MemoryFile, allowing it to be returned by a future call +// to Allocate. +// +// Preconditions: length must be page-aligned and non-zero. +func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) { + if length == 0 || length%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid allocation length: %#x", length)) + } + + f.mu.Lock() + defer f.mu.Unlock() + + // Align hugepage-and-larger allocations on hugepage boundaries to try + // to take advantage of hugetmpfs. + alignment := uint64(usermem.PageSize) + if length >= usermem.HugePageSize { + alignment = usermem.HugePageSize + } + + start, minUnallocatedPage := findUnallocatedRange(&f.usage, f.minUnallocatedPage, length, alignment) + end := start + length + // File offsets are int64s. Since length must be strictly positive, end + // cannot legitimately be 0. + if end < start || int64(end) <= 0 { + return platform.FileRange{}, syserror.ENOMEM + } + + // Expand the file if needed. Double the file size on each expansion; + // uncommitted pages have effectively no cost. + fileSize := f.fileSize + for int64(end) > fileSize { + if fileSize >= 2*fileSize { + // fileSize overflow. + return platform.FileRange{}, syserror.ENOMEM + } + fileSize *= 2 + } + if fileSize > f.fileSize { + if err := f.file.Truncate(fileSize); err != nil { + return platform.FileRange{}, err + } + f.fileSize = fileSize + f.mappingsMu.Lock() + oldMappings := f.mappings.Load().([]uintptr) + newMappings := make([]uintptr, fileSize>>chunkShift) + copy(newMappings, oldMappings) + f.mappings.Store(newMappings) + f.mappingsMu.Unlock() + } + + // Mark selected pages as in use. + fr := platform.FileRange{start, end} + if !f.usage.Add(fr, usageInfo{ + kind: kind, + refs: 1, + }) { + panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage)) + } + + if minUnallocatedPage < start { + f.minUnallocatedPage = minUnallocatedPage + } else { + // start was the first unallocated page. The next must be + // somewhere beyond end. + f.minUnallocatedPage = end + } + + return fr, nil +} + +// findUnallocatedRange returns the first unallocated page in usage of the +// specified length and alignment beginning at page start and the first single +// unallocated page. +func findUnallocatedRange(usage *usageSet, start, length, alignment uint64) (uint64, uint64) { + // Only searched until the first page is found. + firstPage := start + foundFirstPage := false + alignMask := alignment - 1 + for seg := usage.LowerBoundSegment(start); seg.Ok(); seg = seg.NextSegment() { + r := seg.Range() + + if !foundFirstPage && r.Start > firstPage { + foundFirstPage = true + } + + if start >= r.End { + // start was rounded up to an alignment boundary from the end + // of a previous segment and is now beyond r.End. + continue + } + // This segment represents allocated or reclaimable pages; only the + // range from start to the segment's beginning is allocatable, and the + // next allocatable range begins after the segment. + if r.Start > start && r.Start-start >= length { + break + } + start = (r.End + alignMask) &^ alignMask + if !foundFirstPage { + firstPage = r.End + } + } + return start, firstPage +} + +// AllocateAndFill allocates memory of the given kind and fills it by calling +// r.ReadToBlocks() repeatedly until either length bytes are read or a non-nil +// error is returned. It returns the memory filled by r, truncated down to the +// nearest page. If this is shorter than length bytes due to an error returned +// by r.ReadToBlocks(), it returns that error. +// +// Preconditions: length > 0. length must be page-aligned. +func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (platform.FileRange, error) { + fr, err := f.Allocate(length, kind) + if err != nil { + return platform.FileRange{}, err + } + dsts, err := f.MapInternal(fr, usermem.Write) + if err != nil { + f.DecRef(fr) + return platform.FileRange{}, err + } + n, err := safemem.ReadFullToBlocks(r, dsts) + un := uint64(usermem.Addr(n).RoundDown()) + if un < length { + // Free unused memory and update fr to contain only the memory that is + // still allocated. + f.DecRef(platform.FileRange{fr.Start + un, fr.End}) + fr.End = fr.Start + un + } + return fr, err +} + +// fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h. +const ( + _FALLOC_FL_KEEP_SIZE = 1 + _FALLOC_FL_PUNCH_HOLE = 2 +) + +// Decommit releases resources associated with maintaining the contents of the +// given pages. If Decommit succeeds, future accesses of the decommitted pages +// will read zeroes. +// +// Preconditions: fr.Length() > 0. +func (f *MemoryFile) Decommit(fr platform.FileRange) error { + if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + + // "After a successful call, subsequent reads from this range will + // return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with + // FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2) + err := syscall.Fallocate( + int(f.file.Fd()), + _FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE, + int64(fr.Start), + int64(fr.Length())) + if err != nil { + return err + } + f.markDecommitted(fr) + return nil +} + +func (f *MemoryFile) markDecommitted(fr platform.FileRange) { + f.mu.Lock() + defer f.mu.Unlock() + // Since we're changing the knownCommitted attribute, we need to merge + // across the entire range to ensure that the usage tree is minimal. + gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { + val := seg.ValuePtr() + if val.knownCommitted { + // Drop the usageExpected appropriately. + amount := seg.Range().Length() + usage.MemoryAccounting.Dec(amount, val.kind) + f.usageExpected -= amount + val.knownCommitted = false + } + }) + if gap.Ok() { + panic(fmt.Sprintf("Decommit(%v): attempted to decommit unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) + } + f.usage.MergeRange(fr) +} + +// IncRef implements platform.File.IncRef. +func (f *MemoryFile) IncRef(fr platform.FileRange) { + if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + + f.mu.Lock() + defer f.mu.Unlock() + + gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) { + seg.ValuePtr().refs++ + }) + if gap.Ok() { + panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage)) + } + + f.usage.MergeAdjacent(fr) +} + +// DecRef implements platform.File.DecRef. +func (f *MemoryFile) DecRef(fr platform.FileRange) { + if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + + var freed bool + + f.mu.Lock() + defer f.mu.Unlock() + + for seg := f.usage.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; seg = seg.NextSegment() { + seg = f.usage.Isolate(seg, fr) + val := seg.ValuePtr() + if val.refs == 0 { + panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage)) + } + val.refs-- + if val.refs == 0 { + freed = true + // Reclassify memory as System, until it's freed by the reclaim + // goroutine. + if val.knownCommitted { + usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind) + } + val.kind = usage.System + } + } + f.usage.MergeAdjacent(fr) + + if freed { + if fr.Start < f.minReclaimablePage { + // We've freed at least one lower page. + f.minReclaimablePage = fr.Start + } + f.reclaimable = true + f.reclaimCond.Signal() + } +} + +// MapInternal implements platform.File.MapInternal. +func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + if !fr.WellFormed() || fr.Length() == 0 { + panic(fmt.Sprintf("invalid range: %v", fr)) + } + if at.Execute { + return safemem.BlockSeq{}, syserror.EACCES + } + + chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) + if chunks == 1 { + // Avoid an unnecessary slice allocation. + var seq safemem.BlockSeq + err := f.forEachMappingSlice(fr, func(bs []byte) { + seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs)) + }) + return seq, err + } + blocks := make([]safemem.Block, 0, chunks) + err := f.forEachMappingSlice(fr, func(bs []byte) { + blocks = append(blocks, safemem.BlockFromSafeSlice(bs)) + }) + return safemem.BlockSeqFromSlice(blocks), err +} + +// forEachMappingSlice invokes fn on a sequence of byte slices that +// collectively map all bytes in fr. +func (f *MemoryFile) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error { + mappings := f.mappings.Load().([]uintptr) + for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { + chunk := int(chunkStart >> chunkShift) + m := atomic.LoadUintptr(&mappings[chunk]) + if m == 0 { + var err error + mappings, m, err = f.getChunkMapping(chunk) + if err != nil { + return err + } + } + startOff := uint64(0) + if chunkStart < fr.Start { + startOff = fr.Start - chunkStart + } + endOff := uint64(chunkSize) + if chunkStart+chunkSize > fr.End { + endOff = fr.End - chunkStart + } + fn(unsafeSlice(m, chunkSize)[startOff:endOff]) + } + return nil +} + +func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) { + f.mappingsMu.Lock() + defer f.mappingsMu.Unlock() + // Another thread may have replaced f.mappings altogether due to file + // expansion. + mappings := f.mappings.Load().([]uintptr) + // Another thread may have already mapped the chunk. + if m := mappings[chunk]; m != 0 { + return mappings, m, nil + } + m, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + 0, + chunkSize, + syscall.PROT_READ|syscall.PROT_WRITE, + syscall.MAP_SHARED, + f.file.Fd(), + uintptr(chunk<<chunkShift)) + if errno != 0 { + return nil, 0, errno + } + atomic.StoreUintptr(&mappings[chunk], m) + return mappings, m, nil +} + +// MarkEvictable allows f to request memory deallocation by calling +// user.Evict(er) in the future. +// +// Redundantly marking an already-evictable range as evictable has no effect. +func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + info = &evictableMemoryUserInfo{} + f.evictable[user] = info + } + gap := info.ranges.LowerBoundGap(er.Start) + for gap.Ok() && gap.Start() < er.End { + gapER := gap.Range().Intersect(er) + if gapER.Length() == 0 { + gap = gap.NextGap() + continue + } + gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap() + } + if !info.evicting { + switch f.opts.DelayedEviction { + case DelayedEvictionDisabled: + // Kick off eviction immediately. + f.startEvictionGoroutineLocked(user, info) + case DelayedEvictionEnabled: + // Ensure that the reclaimer goroutine is running, so that it can + // start eviction when necessary. + f.reclaimCond.Signal() + } + } +} + +// MarkUnevictable informs f that user no longer considers er to be evictable, +// so the MemoryFile should no longer call user.Evict(er). Note that, per +// EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be +// called even after MarkUnevictable returns due to race conditions, and +// implementations of EvictableMemoryUser must handle this possibility. +// +// Redundantly marking an already-unevictable range as unevictable has no +// effect. +func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + return + } + seg := info.ranges.LowerBoundSegment(er.Start) + for seg.Ok() && seg.Start() < er.End { + seg = info.ranges.Isolate(seg, er) + seg = info.ranges.Remove(seg).NextSegment() + } + // We can only remove info if there's no eviction goroutine running on its + // behalf. + if !info.evicting && info.ranges.IsEmpty() { + delete(f.evictable, user) + } +} + +// MarkAllUnevictable informs f that user no longer considers any offsets to be +// evictable. It otherwise has the same semantics as MarkUnevictable. +func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) { + f.mu.Lock() + defer f.mu.Unlock() + info, ok := f.evictable[user] + if !ok { + return + } + info.ranges.RemoveAll() + // We can only remove info if there's no eviction goroutine running on its + // behalf. + if !info.evicting { + delete(f.evictable, user) + } +} + +// UpdateUsage ensures that the memory usage statistics in +// usage.MemoryAccounting are up to date. +func (f *MemoryFile) UpdateUsage() error { + f.mu.Lock() + defer f.mu.Unlock() + + // If the underlying usage matches where the usage tree already + // represents, then we can just avoid the entire scan (we know it's + // accurate). + currentUsage, err := f.TotalUsage() + if err != nil { + return err + } + if currentUsage == f.usageExpected && f.usageSwapped == 0 { + log.Debugf("UpdateUsage: skipped with usageSwapped=0.") + return nil + } + // If the current usage matches the expected but there's swap + // accounting, then ensure a scan takes place at least every second + // (when requested). + if currentUsage == f.usageExpected+f.usageSwapped && time.Now().Before(f.usageLast.Add(time.Second)) { + log.Debugf("UpdateUsage: skipped with usageSwapped!=0.") + return nil + } + + f.usageLast = time.Now() + err = f.updateUsageLocked(currentUsage, mincore) + log.Debugf("UpdateUsage: currentUsage=%d, usageExpected=%d, usageSwapped=%d.", + currentUsage, f.usageExpected, f.usageSwapped) + log.Debugf("UpdateUsage: took %v.", time.Since(f.usageLast)) + return err +} + +// updateUsageLocked attempts to detect commitment of previous-uncommitted +// pages by invoking checkCommitted, which is a function that, for each page i +// in bs, sets committed[i] to 1 if the page is committed and 0 otherwise. +// +// Precondition: f.mu must be held. +func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(bs []byte, committed []byte) error) error { + // Track if anything changed to elide the merge. In the common case, we + // expect all segments to be committed and no merge to occur. + changedAny := false + defer func() { + if changedAny { + f.usage.MergeAll() + } + + // Adjust the swap usage to reflect reality. + if f.usageExpected < currentUsage { + // Since no pages may be marked decommitted while we hold mu, we + // know that usage may have only increased since we got the last + // current usage. Therefore, if usageExpected is still short of + // currentUsage, we must assume that the difference is in pages + // that have been swapped. + newUsageSwapped := currentUsage - f.usageExpected + if f.usageSwapped < newUsageSwapped { + usage.MemoryAccounting.Inc(newUsageSwapped-f.usageSwapped, usage.System) + } else { + usage.MemoryAccounting.Dec(f.usageSwapped-newUsageSwapped, usage.System) + } + f.usageSwapped = newUsageSwapped + } else if f.usageSwapped != 0 { + // We have more usage accounted for than the file itself. + // That's fine, we probably caught a race where pages were + // being committed while the above loop was running. Just + // report the higher number that we found and ignore swap. + usage.MemoryAccounting.Dec(f.usageSwapped, usage.System) + f.usageSwapped = 0 + } + }() + + // Reused mincore buffer, will generally be <= 4096 bytes. + var buf []byte + + // Iterate over all usage data. There will only be usage segments + // present when there is an associated reference. + for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + val := seg.Value() + + // Already known to be committed; ignore. + if val.knownCommitted { + continue + } + + // Assume that reclaimable pages (that aren't already known to be + // committed) are not committed. This isn't necessarily true, even + // after the reclaimer does Decommit(), because the kernel may + // subsequently back the hugepage-sized region containing the + // decommitted page with a hugepage. However, it's consistent with our + // treatment of unallocated pages, which have the same property. + if val.refs == 0 { + continue + } + + // Get the range for this segment. As we touch slices, the + // Start value will be walked along. + r := seg.Range() + + var checkErr error + err := f.forEachMappingSlice(r, func(s []byte) { + if checkErr != nil { + return + } + + // Ensure that we have sufficient buffer for the call + // (one byte per page). The length of each slice must + // be page-aligned. + bufLen := len(s) / usermem.PageSize + if len(buf) < bufLen { + buf = make([]byte, bufLen) + } + + // Query for new pages in core. + if err := checkCommitted(s, buf); err != nil { + checkErr = err + return + } + + // Scan each page and switch out segments. + populatedRun := false + populatedRunStart := 0 + for i := 0; i <= bufLen; i++ { + // We run past the end of the slice here to + // simplify the logic and only set populated if + // we're still looking at elements. + populated := false + if i < bufLen { + populated = buf[i]&0x1 != 0 + } + + switch { + case populated == populatedRun: + // Keep the run going. + continue + case populated && !populatedRun: + // Begin the run. + populatedRun = true + populatedRunStart = i + // Keep going. + continue + case !populated && populatedRun: + // Finish the run by changing this segment. + runRange := platform.FileRange{ + Start: r.Start + uint64(populatedRunStart*usermem.PageSize), + End: r.Start + uint64(i*usermem.PageSize), + } + seg = f.usage.Isolate(seg, runRange) + seg.ValuePtr().knownCommitted = true + // Advance the segment only if we still + // have work to do in the context of + // the original segment from the for + // loop. Otherwise, the for loop itself + // will advance the segment + // appropriately. + if runRange.End != r.End { + seg = seg.NextSegment() + } + amount := runRange.Length() + usage.MemoryAccounting.Inc(amount, val.kind) + f.usageExpected += amount + changedAny = true + populatedRun = false + } + } + + // Advance r.Start. + r.Start += uint64(len(s)) + }) + if checkErr != nil { + return checkErr + } + if err != nil { + return err + } + } + + return nil +} + +// TotalUsage returns an aggregate usage for all memory statistics except +// Mapped (which is external to MemoryFile). This is generally much cheaper +// than UpdateUsage, but will not provide a fine-grained breakdown. +func (f *MemoryFile) TotalUsage() (uint64, error) { + // Stat the underlying file to discover the underlying usage. stat(2) + // always reports the allocated block count in units of 512 bytes. This + // includes pages in the page cache and swapped pages. + var stat syscall.Stat_t + if err := syscall.Fstat(int(f.file.Fd()), &stat); err != nil { + return 0, err + } + return uint64(stat.Blocks * 512), nil +} + +// TotalSize returns the current size of the backing file in bytes, which is an +// upper bound on the amount of memory that can currently be allocated from the +// MemoryFile. The value returned by TotalSize is permitted to change. +func (f *MemoryFile) TotalSize() uint64 { + f.mu.Lock() + defer f.mu.Unlock() + return uint64(f.fileSize) +} + +// File returns the backing file. +func (f *MemoryFile) File() *os.File { + return f.file +} + +// FD implements platform.File.FD. +func (f *MemoryFile) FD() int { + return int(f.file.Fd()) +} + +// String implements fmt.Stringer.String. +// +// Note that because f.String locks f.mu, calling f.String internally +// (including indirectly through the fmt package) risks recursive locking. +// Within the pgalloc package, use f.usage directly instead. +func (f *MemoryFile) String() string { + f.mu.Lock() + defer f.mu.Unlock() + return f.usage.String() +} + +// runReclaim implements the reclaimer goroutine, which continuously decommits +// reclaimable pages in order to reduce memory usage and make them available +// for allocation. +func (f *MemoryFile) runReclaim() { + for { + fr, ok := f.findReclaimable() + if !ok { + break + } + + if err := f.Decommit(fr); err != nil { + log.Warningf("Reclaim failed to decommit %v: %v", fr, err) + // Zero the pages manually. This won't reduce memory usage, but at + // least ensures that the pages will be zero when reallocated. + f.forEachMappingSlice(fr, func(bs []byte) { + for i := range bs { + bs[i] = 0 + } + }) + // Pretend the pages were decommitted even though they weren't, + // since the memory accounting implementation has no idea how to + // deal with this. + f.markDecommitted(fr) + } + f.markReclaimed(fr) + } + // We only get here if findReclaimable finds f.destroyed set and returns + // false. + f.mu.Lock() + defer f.mu.Unlock() + if !f.destroyed { + panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") + } + f.file.Close() + // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd + // that has possibly been reassigned. + f.file = nil + f.mappingsMu.Lock() + defer f.mappingsMu.Unlock() + mappings := f.mappings.Load().([]uintptr) + for i, m := range mappings { + if m != 0 { + _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m, chunkSize, 0) + if errno != 0 { + log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno) + } + } + } + // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) + f.mappings.Store([]uintptr{}) +} + +func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { + f.mu.Lock() + defer f.mu.Unlock() + for { + for { + if f.destroyed { + return platform.FileRange{}, false + } + if f.reclaimable { + break + } + if f.opts.DelayedEviction == DelayedEvictionEnabled { + // No work to do. Evict any pending evictable allocations to + // get more reclaimable pages before going to sleep. + f.startEvictionsLocked() + } + f.reclaimCond.Wait() + } + // Allocate returns the first usable range in offset order and is + // currently a linear scan, so reclaiming from the beginning of the + // file minimizes the expected latency of Allocate. + for seg := f.usage.LowerBoundSegment(f.minReclaimablePage); seg.Ok(); seg = seg.NextSegment() { + if seg.ValuePtr().refs == 0 { + f.minReclaimablePage = seg.End() + return seg.Range(), true + } + } + // No pages are reclaimable. + f.reclaimable = false + f.minReclaimablePage = maxPage + } +} + +func (f *MemoryFile) markReclaimed(fr platform.FileRange) { + f.mu.Lock() + defer f.mu.Unlock() + seg := f.usage.FindSegment(fr.Start) + // All of fr should be mapped to a single uncommitted reclaimable segment + // accounted to System. + if !seg.Ok() { + panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) + } + if !seg.Range().IsSupersetOf(fr) { + panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) + } + if got, want := seg.Value(), (usageInfo{ + kind: usage.System, + knownCommitted: false, + refs: 0, + }); got != want { + panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) + } + // Deallocate reclaimed pages. Even though all of seg is reclaimable, the + // caller of markReclaimed may not have decommitted it, so we can only mark + // fr as reclaimed. + f.usage.Remove(f.usage.Isolate(seg, fr)) + if fr.Start < f.minUnallocatedPage { + // We've deallocated at least one lower page. + f.minUnallocatedPage = fr.Start + } +} + +// StartEvictions requests that f evict all evictable allocations. It does not +// wait for eviction to complete; for this, see MemoryFile.WaitForEvictions. +func (f *MemoryFile) StartEvictions() { + f.mu.Lock() + defer f.mu.Unlock() + f.startEvictionsLocked() +} + +// Preconditions: f.mu must be locked. +func (f *MemoryFile) startEvictionsLocked() { + for user, info := range f.evictable { + // Don't start multiple goroutines to evict the same user's + // allocations. + if !info.evicting { + f.startEvictionGoroutineLocked(user, info) + } + } +} + +// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be +// locked. +func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) { + info.evicting = true + f.evictionWG.Add(1) + go func() { // S/R-SAFE: f.evictionWG + defer f.evictionWG.Done() + for { + f.mu.Lock() + info, ok := f.evictable[user] + if !ok { + // This shouldn't happen: only this goroutine is permitted + // to delete this entry. + f.mu.Unlock() + panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user)) + } + if info.ranges.IsEmpty() { + delete(f.evictable, user) + f.mu.Unlock() + return + } + // Evict from the end of info.ranges, under the assumption that + // if ranges in user start being used again (and are + // consequently marked unevictable), such uses are more likely + // to start from the beginning of user. + seg := info.ranges.LastSegment() + er := seg.Range() + info.ranges.Remove(seg) + // user.Evict() must be called without holding f.mu to avoid + // circular lock ordering. + f.mu.Unlock() + user.Evict(context.Background(), er) + } + }() +} + +// WaitForEvictions blocks until f is no longer evicting any evictable +// allocations. +func (f *MemoryFile) WaitForEvictions() { + f.evictionWG.Wait() +} + +type usageSetFunctions struct{} + +func (usageSetFunctions) MinKey() uint64 { + return 0 +} + +func (usageSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +func (usageSetFunctions) ClearValue(val *usageInfo) { +} + +func (usageSetFunctions) Merge(_ platform.FileRange, val1 usageInfo, _ platform.FileRange, val2 usageInfo) (usageInfo, bool) { + return val1, val1 == val2 +} + +func (usageSetFunctions) Split(_ platform.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { + return val, val +} + +// evictableRangeSetValue is the value type of evictableRangeSet. +type evictableRangeSetValue struct{} + +type evictableRangeSetFunctions struct{} + +func (evictableRangeSetFunctions) MinKey() uint64 { + return 0 +} + +func (evictableRangeSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) { +} + +func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) { + return evictableRangeSetValue{}, true +} + +func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) { + return evictableRangeSetValue{}, evictableRangeSetValue{} +} diff --git a/pkg/sentry/pgalloc/pgalloc_state_autogen.go b/pkg/sentry/pgalloc/pgalloc_state_autogen.go new file mode 100755 index 000000000..36a5aafa1 --- /dev/null +++ b/pkg/sentry/pgalloc/pgalloc_state_autogen.go @@ -0,0 +1,146 @@ +// automatically generated by stateify. + +package pgalloc + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" +) + +func (x *EvictableRange) beforeSave() {} +func (x *EvictableRange) save(m state.Map) { + x.beforeSave() + m.Save("Start", &x.Start) + m.Save("End", &x.End) +} + +func (x *EvictableRange) afterLoad() {} +func (x *EvictableRange) load(m state.Map) { + m.Load("Start", &x.Start) + m.Load("End", &x.End) +} + +func (x *evictableRangeSet) beforeSave() {} +func (x *evictableRangeSet) save(m state.Map) { + x.beforeSave() + var root *evictableRangeSegmentDataSlices = x.saveRoot() + m.SaveValue("root", root) +} + +func (x *evictableRangeSet) afterLoad() {} +func (x *evictableRangeSet) load(m state.Map) { + m.LoadValue("root", new(*evictableRangeSegmentDataSlices), func(y interface{}) { x.loadRoot(y.(*evictableRangeSegmentDataSlices)) }) +} + +func (x *evictableRangenode) beforeSave() {} +func (x *evictableRangenode) save(m state.Map) { + x.beforeSave() + m.Save("nrSegments", &x.nrSegments) + m.Save("parent", &x.parent) + m.Save("parentIndex", &x.parentIndex) + m.Save("hasChildren", &x.hasChildren) + m.Save("keys", &x.keys) + m.Save("values", &x.values) + m.Save("children", &x.children) +} + +func (x *evictableRangenode) afterLoad() {} +func (x *evictableRangenode) load(m state.Map) { + m.Load("nrSegments", &x.nrSegments) + m.Load("parent", &x.parent) + m.Load("parentIndex", &x.parentIndex) + m.Load("hasChildren", &x.hasChildren) + m.Load("keys", &x.keys) + m.Load("values", &x.values) + m.Load("children", &x.children) +} + +func (x *evictableRangeSegmentDataSlices) beforeSave() {} +func (x *evictableRangeSegmentDataSlices) save(m state.Map) { + x.beforeSave() + m.Save("Start", &x.Start) + m.Save("End", &x.End) + m.Save("Values", &x.Values) +} + +func (x *evictableRangeSegmentDataSlices) afterLoad() {} +func (x *evictableRangeSegmentDataSlices) load(m state.Map) { + m.Load("Start", &x.Start) + m.Load("End", &x.End) + m.Load("Values", &x.Values) +} + +func (x *usageInfo) beforeSave() {} +func (x *usageInfo) save(m state.Map) { + x.beforeSave() + m.Save("kind", &x.kind) + m.Save("knownCommitted", &x.knownCommitted) + m.Save("refs", &x.refs) +} + +func (x *usageInfo) afterLoad() {} +func (x *usageInfo) load(m state.Map) { + m.Load("kind", &x.kind) + m.Load("knownCommitted", &x.knownCommitted) + m.Load("refs", &x.refs) +} + +func (x *usageSet) beforeSave() {} +func (x *usageSet) save(m state.Map) { + x.beforeSave() + var root *usageSegmentDataSlices = x.saveRoot() + m.SaveValue("root", root) +} + +func (x *usageSet) afterLoad() {} +func (x *usageSet) load(m state.Map) { + m.LoadValue("root", new(*usageSegmentDataSlices), func(y interface{}) { x.loadRoot(y.(*usageSegmentDataSlices)) }) +} + +func (x *usagenode) beforeSave() {} +func (x *usagenode) save(m state.Map) { + x.beforeSave() + m.Save("nrSegments", &x.nrSegments) + m.Save("parent", &x.parent) + m.Save("parentIndex", &x.parentIndex) + m.Save("hasChildren", &x.hasChildren) + m.Save("keys", &x.keys) + m.Save("values", &x.values) + m.Save("children", &x.children) +} + +func (x *usagenode) afterLoad() {} +func (x *usagenode) load(m state.Map) { + m.Load("nrSegments", &x.nrSegments) + m.Load("parent", &x.parent) + m.Load("parentIndex", &x.parentIndex) + m.Load("hasChildren", &x.hasChildren) + m.Load("keys", &x.keys) + m.Load("values", &x.values) + m.Load("children", &x.children) +} + +func (x *usageSegmentDataSlices) beforeSave() {} +func (x *usageSegmentDataSlices) save(m state.Map) { + x.beforeSave() + m.Save("Start", &x.Start) + m.Save("End", &x.End) + m.Save("Values", &x.Values) +} + +func (x *usageSegmentDataSlices) afterLoad() {} +func (x *usageSegmentDataSlices) load(m state.Map) { + m.Load("Start", &x.Start) + m.Load("End", &x.End) + m.Load("Values", &x.Values) +} + +func init() { + state.Register("pgalloc.EvictableRange", (*EvictableRange)(nil), state.Fns{Save: (*EvictableRange).save, Load: (*EvictableRange).load}) + state.Register("pgalloc.evictableRangeSet", (*evictableRangeSet)(nil), state.Fns{Save: (*evictableRangeSet).save, Load: (*evictableRangeSet).load}) + state.Register("pgalloc.evictableRangenode", (*evictableRangenode)(nil), state.Fns{Save: (*evictableRangenode).save, Load: (*evictableRangenode).load}) + state.Register("pgalloc.evictableRangeSegmentDataSlices", (*evictableRangeSegmentDataSlices)(nil), state.Fns{Save: (*evictableRangeSegmentDataSlices).save, Load: (*evictableRangeSegmentDataSlices).load}) + state.Register("pgalloc.usageInfo", (*usageInfo)(nil), state.Fns{Save: (*usageInfo).save, Load: (*usageInfo).load}) + state.Register("pgalloc.usageSet", (*usageSet)(nil), state.Fns{Save: (*usageSet).save, Load: (*usageSet).load}) + state.Register("pgalloc.usagenode", (*usagenode)(nil), state.Fns{Save: (*usagenode).save, Load: (*usagenode).load}) + state.Register("pgalloc.usageSegmentDataSlices", (*usageSegmentDataSlices)(nil), state.Fns{Save: (*usageSegmentDataSlices).save, Load: (*usageSegmentDataSlices).load}) +} diff --git a/pkg/sentry/pgalloc/pgalloc_unsafe.go b/pkg/sentry/pgalloc/pgalloc_unsafe.go new file mode 100644 index 000000000..a4b5d581c --- /dev/null +++ b/pkg/sentry/pgalloc/pgalloc_unsafe.go @@ -0,0 +1,40 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pgalloc + +import ( + "reflect" + "syscall" + "unsafe" +) + +func unsafeSlice(addr uintptr, length int) (slice []byte) { + sh := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) + sh.Data = addr + sh.Len = length + sh.Cap = length + return +} + +func mincore(s []byte, buf []byte) error { + if _, _, errno := syscall.RawSyscall( + syscall.SYS_MINCORE, + uintptr(unsafe.Pointer(&s[0])), + uintptr(len(s)), + uintptr(unsafe.Pointer(&buf[0]))); errno != 0 { + return errno + } + return nil +} diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go new file mode 100644 index 000000000..d4ba384b1 --- /dev/null +++ b/pkg/sentry/pgalloc/save_restore.go @@ -0,0 +1,210 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pgalloc + +import ( + "bytes" + "fmt" + "io" + "runtime" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/state" +) + +// SaveTo writes f's state to the given stream. +func (f *MemoryFile) SaveTo(w io.Writer) error { + // Wait for reclaim. + f.mu.Lock() + defer f.mu.Unlock() + for f.reclaimable { + f.reclaimCond.Signal() + f.mu.Unlock() + runtime.Gosched() + f.mu.Lock() + } + + // Ensure that there are no pending evictions. + if len(f.evictable) != 0 { + panic(fmt.Sprintf("evictions still pending for %d users; call StartEvictions and WaitForEvictions before SaveTo", len(f.evictable))) + } + + // Ensure that all pages that contain data have knownCommitted set, since + // we only store knownCommitted pages below. + zeroPage := make([]byte, usermem.PageSize) + err := f.updateUsageLocked(0, func(bs []byte, committed []byte) error { + for pgoff := 0; pgoff < len(bs); pgoff += usermem.PageSize { + i := pgoff / usermem.PageSize + pg := bs[pgoff : pgoff+usermem.PageSize] + if !bytes.Equal(pg, zeroPage) { + committed[i] = 1 + continue + } + committed[i] = 0 + // Reading the page caused it to be committed; decommit it to + // reduce memory usage. + // + // "MADV_REMOVE [...] Free up a given range of pages and its + // associated backing store. This is equivalent to punching a hole + // in the corresponding byte range of the backing store (see + // fallocate(2))." - madvise(2) + if err := syscall.Madvise(pg, syscall.MADV_REMOVE); err != nil { + // This doesn't impact the correctness of saved memory, it + // just means that we're incrementally more likely to OOM. + // Complain, but don't abort saving. + log.Warningf("Decommitting page %p while saving failed: %v", pg, err) + } + } + return nil + }) + if err != nil { + return err + } + + // Save metadata. + if err := state.Save(w, &f.fileSize, nil); err != nil { + return err + } + if err := state.Save(w, &f.usage, nil); err != nil { + return err + } + + // Dump out committed pages. + for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + if !seg.Value().knownCommitted { + continue + } + // Write a header to distinguish from objects. + if err := state.WriteHeader(w, uint64(seg.Range().Length()), false); err != nil { + return err + } + // Write out data. + var ioErr error + err := f.forEachMappingSlice(seg.Range(), func(s []byte) { + if ioErr != nil { + return + } + _, ioErr = w.Write(s) + }) + if ioErr != nil { + return ioErr + } + if err != nil { + return err + } + } + + return nil +} + +// LoadFrom loads MemoryFile state from the given stream. +func (f *MemoryFile) LoadFrom(r io.Reader) error { + // Load metadata. + if err := state.Load(r, &f.fileSize, nil); err != nil { + return err + } + if err := f.file.Truncate(f.fileSize); err != nil { + return err + } + newMappings := make([]uintptr, f.fileSize>>chunkShift) + f.mappings.Store(newMappings) + if err := state.Load(r, &f.usage, nil); err != nil { + return err + } + + // Try to map committed chunks concurrently: For any given chunk, either + // this loop or the following one will mmap the chunk first and cache it in + // f.mappings for the other, but this loop is likely to run ahead of the + // other since it doesn't do any work between mmaps. The rest of this + // function doesn't mutate f.usage, so it's safe to iterate concurrently. + mapperDone := make(chan struct{}) + mapperCanceled := int32(0) + go func() { // S/R-SAFE: see comment + defer func() { close(mapperDone) }() + for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + if atomic.LoadInt32(&mapperCanceled) != 0 { + return + } + if seg.Value().knownCommitted { + f.forEachMappingSlice(seg.Range(), func(s []byte) {}) + } + } + }() + defer func() { + atomic.StoreInt32(&mapperCanceled, 1) + <-mapperDone + }() + + // Load committed pages. + for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + if !seg.Value().knownCommitted { + continue + } + // Verify header. + length, object, err := state.ReadHeader(r) + if err != nil { + return err + } + if object { + // Not expected. + return fmt.Errorf("unexpected object") + } + if expected := uint64(seg.Range().Length()); length != expected { + // Size mismatch. + return fmt.Errorf("mismatched segment: expected %d, got %d", expected, length) + } + // Read data. + var ioErr error + err = f.forEachMappingSlice(seg.Range(), func(s []byte) { + if ioErr != nil { + return + } + _, ioErr = io.ReadFull(r, s) + }) + if ioErr != nil { + return ioErr + } + if err != nil { + return err + } + + // Update accounting for restored pages. We need to do this here since + // these segments are marked as "known committed", and will be skipped + // over on accounting scans. + usage.MemoryAccounting.Inc(seg.End()-seg.Start(), seg.Value().kind) + } + + return nil +} + +// MemoryFileProvider provides the MemoryFile method. +// +// This type exists to work around a save/restore defect. The only object in a +// saved object graph that S/R allows to be replaced at time of restore is the +// starting point of the restore, kernel.Kernel. However, the MemoryFile +// changes between save and restore as well, so objects that need persistent +// access to the MemoryFile must instead store a pointer to the Kernel and call +// Kernel.MemoryFile() as required. In most cases, depending on the kernel +// package directly would create a package dependency loop, so the stored +// pointer must instead be a MemoryProvider interface object. Correspondingly, +// kernel.Kernel is the only implementation of this interface. +type MemoryFileProvider interface { + // MemoryFile returns the Kernel MemoryFile. + MemoryFile() *MemoryFile +} diff --git a/pkg/sentry/pgalloc/usage_set.go b/pkg/sentry/pgalloc/usage_set.go new file mode 100755 index 000000000..8ef4952eb --- /dev/null +++ b/pkg/sentry/pgalloc/usage_set.go @@ -0,0 +1,1274 @@ +package pgalloc + +import ( + __generics_imported0 "gvisor.googlesource.com/gvisor/pkg/sentry/platform" +) + +import ( + "bytes" + "fmt" +) + +const ( + // minDegree is the minimum degree of an internal node in a Set B-tree. + // + // - Any non-root node has at least minDegree-1 segments. + // + // - Any non-root internal (non-leaf) node has at least minDegree children. + // + // - The root node may have fewer than minDegree-1 segments, but it may + // only have 0 segments if the tree is empty. + // + // Our implementation requires minDegree >= 3. Higher values of minDegree + // usually improve performance, but increase memory usage for small sets. + usageminDegree = 10 + + usagemaxDegree = 2 * usageminDegree +) + +// A Set is a mapping of segments with non-overlapping Range keys. The zero +// value for a Set is an empty set. Set values are not safely movable nor +// copyable. Set is thread-compatible. +// +// +stateify savable +type usageSet struct { + root usagenode `state:".(*usageSegmentDataSlices)"` +} + +// IsEmpty returns true if the set contains no segments. +func (s *usageSet) IsEmpty() bool { + return s.root.nrSegments == 0 +} + +// IsEmptyRange returns true iff no segments in the set overlap the given +// range. This is semantically equivalent to s.SpanRange(r) == 0, but may be +// more efficient. +func (s *usageSet) IsEmptyRange(r __generics_imported0.FileRange) bool { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return true + } + _, gap := s.Find(r.Start) + if !gap.Ok() { + return false + } + return r.End <= gap.End() +} + +// Span returns the total size of all segments in the set. +func (s *usageSet) Span() uint64 { + var sz uint64 + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sz += seg.Range().Length() + } + return sz +} + +// SpanRange returns the total size of the intersection of segments in the set +// with the given range. +func (s *usageSet) SpanRange(r __generics_imported0.FileRange) uint64 { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return 0 + } + var sz uint64 + for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { + sz += seg.Range().Intersect(r).Length() + } + return sz +} + +// FirstSegment returns the first segment in the set. If the set is empty, +// FirstSegment returns a terminal iterator. +func (s *usageSet) FirstSegment() usageIterator { + if s.root.nrSegments == 0 { + return usageIterator{} + } + return s.root.firstSegment() +} + +// LastSegment returns the last segment in the set. If the set is empty, +// LastSegment returns a terminal iterator. +func (s *usageSet) LastSegment() usageIterator { + if s.root.nrSegments == 0 { + return usageIterator{} + } + return s.root.lastSegment() +} + +// FirstGap returns the first gap in the set. +func (s *usageSet) FirstGap() usageGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[0] + } + return usageGapIterator{n, 0} +} + +// LastGap returns the last gap in the set. +func (s *usageSet) LastGap() usageGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[n.nrSegments] + } + return usageGapIterator{n, n.nrSegments} +} + +// Find returns the segment or gap whose range contains the given key. If a +// segment is found, the returned Iterator is non-terminal and the +// returned GapIterator is terminal. Otherwise, the returned Iterator is +// terminal and the returned GapIterator is non-terminal. +func (s *usageSet) Find(key uint64) (usageIterator, usageGapIterator) { + n := &s.root + for { + + lower := 0 + upper := n.nrSegments + for lower < upper { + i := lower + (upper-lower)/2 + if r := n.keys[i]; key < r.End { + if key >= r.Start { + return usageIterator{n, i}, usageGapIterator{} + } + upper = i + } else { + lower = i + 1 + } + } + i := lower + if !n.hasChildren { + return usageIterator{}, usageGapIterator{n, i} + } + n = n.children[i] + } +} + +// FindSegment returns the segment whose range contains the given key. If no +// such segment exists, FindSegment returns a terminal iterator. +func (s *usageSet) FindSegment(key uint64) usageIterator { + seg, _ := s.Find(key) + return seg +} + +// LowerBoundSegment returns the segment with the lowest range that contains a +// key greater than or equal to min. If no such segment exists, +// LowerBoundSegment returns a terminal iterator. +func (s *usageSet) LowerBoundSegment(min uint64) usageIterator { + seg, gap := s.Find(min) + if seg.Ok() { + return seg + } + return gap.NextSegment() +} + +// UpperBoundSegment returns the segment with the highest range that contains a +// key less than or equal to max. If no such segment exists, UpperBoundSegment +// returns a terminal iterator. +func (s *usageSet) UpperBoundSegment(max uint64) usageIterator { + seg, gap := s.Find(max) + if seg.Ok() { + return seg + } + return gap.PrevSegment() +} + +// FindGap returns the gap containing the given key. If no such gap exists +// (i.e. the set contains a segment containing that key), FindGap returns a +// terminal iterator. +func (s *usageSet) FindGap(key uint64) usageGapIterator { + _, gap := s.Find(key) + return gap +} + +// LowerBoundGap returns the gap with the lowest range that is greater than or +// equal to min. +func (s *usageSet) LowerBoundGap(min uint64) usageGapIterator { + seg, gap := s.Find(min) + if gap.Ok() { + return gap + } + return seg.NextGap() +} + +// UpperBoundGap returns the gap with the highest range that is less than or +// equal to max. +func (s *usageSet) UpperBoundGap(max uint64) usageGapIterator { + seg, gap := s.Find(max) + if gap.Ok() { + return gap + } + return seg.PrevGap() +} + +// Add inserts the given segment into the set and returns true. If the new +// segment can be merged with adjacent segments, Add will do so. If the new +// segment would overlap an existing segment, Add returns false. If Add +// succeeds, all existing iterators are invalidated. +func (s *usageSet) Add(r __generics_imported0.FileRange, val usageInfo) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.Insert(gap, r, val) + return true +} + +// AddWithoutMerging inserts the given segment into the set and returns true. +// If it would overlap an existing segment, AddWithoutMerging does nothing and +// returns false. If AddWithoutMerging succeeds, all existing iterators are +// invalidated. +func (s *usageSet) AddWithoutMerging(r __generics_imported0.FileRange, val usageInfo) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.InsertWithoutMergingUnchecked(gap, r, val) + return true +} + +// Insert inserts the given segment into the given gap. If the new segment can +// be merged with adjacent segments, Insert will do so. Insert returns an +// iterator to the segment containing the inserted value (which may have been +// merged with other values). All existing iterators (including gap, but not +// including the returned iterator) are invalidated. +// +// If the gap cannot accommodate the segment, or if r is invalid, Insert panics. +// +// Insert is semantically equivalent to a InsertWithoutMerging followed by a +// Merge, but may be more efficient. Note that there is no unchecked variant of +// Insert since Insert must retrieve and inspect gap's predecessor and +// successor segments regardless. +func (s *usageSet) Insert(gap usageGapIterator, r __generics_imported0.FileRange, val usageInfo) usageIterator { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + prev, next := gap.PrevSegment(), gap.NextSegment() + if prev.Ok() && prev.End() > r.Start { + panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) + } + if next.Ok() && next.Start() < r.End { + panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) + } + if prev.Ok() && prev.End() == r.Start { + if mval, ok := (usageSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { + prev.SetEndUnchecked(r.End) + prev.SetValue(mval) + if next.Ok() && next.Start() == r.End { + val = mval + if mval, ok := (usageSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { + prev.SetEndUnchecked(next.End()) + prev.SetValue(mval) + return s.Remove(next).PrevSegment() + } + } + return prev + } + } + if next.Ok() && next.Start() == r.End { + if mval, ok := (usageSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { + next.SetStartUnchecked(r.Start) + next.SetValue(mval) + return next + } + } + return s.InsertWithoutMergingUnchecked(gap, r, val) +} + +// InsertWithoutMerging inserts the given segment into the given gap and +// returns an iterator to the inserted segment. All existing iterators +// (including gap, but not including the returned iterator) are invalidated. +// +// If the gap cannot accommodate the segment, or if r is invalid, +// InsertWithoutMerging panics. +func (s *usageSet) InsertWithoutMerging(gap usageGapIterator, r __generics_imported0.FileRange, val usageInfo) usageIterator { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + if gr := gap.Range(); !gr.IsSupersetOf(r) { + panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) + } + return s.InsertWithoutMergingUnchecked(gap, r, val) +} + +// InsertWithoutMergingUnchecked inserts the given segment into the given gap +// and returns an iterator to the inserted segment. All existing iterators +// (including gap, but not including the returned iterator) are invalidated. +// +// Preconditions: r.Start >= gap.Start(); r.End <= gap.End(). +func (s *usageSet) InsertWithoutMergingUnchecked(gap usageGapIterator, r __generics_imported0.FileRange, val usageInfo) usageIterator { + gap = gap.node.rebalanceBeforeInsert(gap) + copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) + copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) + gap.node.keys[gap.index] = r + gap.node.values[gap.index] = val + gap.node.nrSegments++ + return usageIterator{gap.node, gap.index} +} + +// Remove removes the given segment and returns an iterator to the vacated gap. +// All existing iterators (including seg, but not including the returned +// iterator) are invalidated. +func (s *usageSet) Remove(seg usageIterator) usageGapIterator { + + if seg.node.hasChildren { + + victim := seg.PrevSegment() + + seg.SetRangeUnchecked(victim.Range()) + seg.SetValue(victim.Value()) + return s.Remove(victim).NextGap() + } + copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) + copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) + usageSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) + seg.node.nrSegments-- + return seg.node.rebalanceAfterRemove(usageGapIterator{seg.node, seg.index}) +} + +// RemoveAll removes all segments from the set. All existing iterators are +// invalidated. +func (s *usageSet) RemoveAll() { + s.root = usagenode{} +} + +// RemoveRange removes all segments in the given range. An iterator to the +// newly formed gap is returned, and all existing iterators are invalidated. +func (s *usageSet) RemoveRange(r __generics_imported0.FileRange) usageGapIterator { + seg, gap := s.Find(r.Start) + if seg.Ok() { + seg = s.Isolate(seg, r) + gap = s.Remove(seg) + } + for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { + seg = s.Isolate(seg, r) + gap = s.Remove(seg) + } + return gap +} + +// Merge attempts to merge two neighboring segments. If successful, Merge +// returns an iterator to the merged segment, and all existing iterators are +// invalidated. Otherwise, Merge returns a terminal iterator. +// +// If first is not the predecessor of second, Merge panics. +func (s *usageSet) Merge(first, second usageIterator) usageIterator { + if first.NextSegment() != second { + panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) + } + return s.MergeUnchecked(first, second) +} + +// MergeUnchecked attempts to merge two neighboring segments. If successful, +// MergeUnchecked returns an iterator to the merged segment, and all existing +// iterators are invalidated. Otherwise, MergeUnchecked returns a terminal +// iterator. +// +// Precondition: first is the predecessor of second: first.NextSegment() == +// second, first == second.PrevSegment(). +func (s *usageSet) MergeUnchecked(first, second usageIterator) usageIterator { + if first.End() == second.Start() { + if mval, ok := (usageSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { + + first.SetEndUnchecked(second.End()) + first.SetValue(mval) + return s.Remove(second).PrevSegment() + } + } + return usageIterator{} +} + +// MergeAll attempts to merge all adjacent segments in the set. All existing +// iterators are invalidated. +func (s *usageSet) MergeAll() { + seg := s.FirstSegment() + if !seg.Ok() { + return + } + next := seg.NextSegment() + for next.Ok() { + if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { + seg, next = mseg, mseg.NextSegment() + } else { + seg, next = next, next.NextSegment() + } + } +} + +// MergeRange attempts to merge all adjacent segments that contain a key in the +// specific range. All existing iterators are invalidated. +func (s *usageSet) MergeRange(r __generics_imported0.FileRange) { + seg := s.LowerBoundSegment(r.Start) + if !seg.Ok() { + return + } + next := seg.NextSegment() + for next.Ok() && next.Range().Start < r.End { + if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { + seg, next = mseg, mseg.NextSegment() + } else { + seg, next = next, next.NextSegment() + } + } +} + +// MergeAdjacent attempts to merge the segment containing r.Start with its +// predecessor, and the segment containing r.End-1 with its successor. +func (s *usageSet) MergeAdjacent(r __generics_imported0.FileRange) { + first := s.FindSegment(r.Start) + if first.Ok() { + if prev := first.PrevSegment(); prev.Ok() { + s.Merge(prev, first) + } + } + last := s.FindSegment(r.End - 1) + if last.Ok() { + if next := last.NextSegment(); next.Ok() { + s.Merge(last, next) + } + } +} + +// Split splits the given segment at the given key and returns iterators to the +// two resulting segments. All existing iterators (including seg, but not +// including the returned iterators) are invalidated. +// +// If the segment cannot be split at split (because split is at the start or +// end of the segment's range, so splitting would produce a segment with zero +// length, or because split falls outside the segment's range altogether), +// Split panics. +func (s *usageSet) Split(seg usageIterator, split uint64) (usageIterator, usageIterator) { + if !seg.Range().CanSplitAt(split) { + panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) + } + return s.SplitUnchecked(seg, split) +} + +// SplitUnchecked splits the given segment at the given key and returns +// iterators to the two resulting segments. All existing iterators (including +// seg, but not including the returned iterators) are invalidated. +// +// Preconditions: seg.Start() < key < seg.End(). +func (s *usageSet) SplitUnchecked(seg usageIterator, split uint64) (usageIterator, usageIterator) { + val1, val2 := (usageSetFunctions{}).Split(seg.Range(), seg.Value(), split) + end2 := seg.End() + seg.SetEndUnchecked(split) + seg.SetValue(val1) + seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.FileRange{split, end2}, val2) + + return seg2.PrevSegment(), seg2 +} + +// SplitAt splits the segment straddling split, if one exists. SplitAt returns +// true if a segment was split and false otherwise. If SplitAt splits a +// segment, all existing iterators are invalidated. +func (s *usageSet) SplitAt(split uint64) bool { + if seg := s.FindSegment(split); seg.Ok() && seg.Range().CanSplitAt(split) { + s.SplitUnchecked(seg, split) + return true + } + return false +} + +// Isolate ensures that the given segment's range does not escape r by +// splitting at r.Start and r.End if necessary, and returns an updated iterator +// to the bounded segment. All existing iterators (including seg, but not +// including the returned iterators) are invalidated. +func (s *usageSet) Isolate(seg usageIterator, r __generics_imported0.FileRange) usageIterator { + if seg.Range().CanSplitAt(r.Start) { + _, seg = s.SplitUnchecked(seg, r.Start) + } + if seg.Range().CanSplitAt(r.End) { + seg, _ = s.SplitUnchecked(seg, r.End) + } + return seg +} + +// ApplyContiguous applies a function to a contiguous range of segments, +// splitting if necessary. The function is applied until the first gap is +// encountered, at which point the gap is returned. If the function is applied +// across the entire range, a terminal gap is returned. All existing iterators +// are invalidated. +// +// N.B. The Iterator must not be invalidated by the function. +func (s *usageSet) ApplyContiguous(r __generics_imported0.FileRange, fn func(seg usageIterator)) usageGapIterator { + seg, gap := s.Find(r.Start) + if !seg.Ok() { + return gap + } + for { + seg = s.Isolate(seg, r) + fn(seg) + if seg.End() >= r.End { + return usageGapIterator{} + } + gap = seg.NextGap() + if !gap.IsEmpty() { + return gap + } + seg = gap.NextSegment() + if !seg.Ok() { + + return usageGapIterator{} + } + } +} + +// +stateify savable +type usagenode struct { + // An internal binary tree node looks like: + // + // K + // / \ + // Cl Cr + // + // where all keys in the subtree rooted by Cl (the left subtree) are less + // than K (the key of the parent node), and all keys in the subtree rooted + // by Cr (the right subtree) are greater than K. + // + // An internal B-tree node's indexes work out to look like: + // + // K0 K1 K2 ... Kn-1 + // / \/ \/ \ ... / \ + // C0 C1 C2 C3 ... Cn-1 Cn + // + // where n is nrSegments. + nrSegments int + + // parent is a pointer to this node's parent. If this node is root, parent + // is nil. + parent *usagenode + + // parentIndex is the index of this node in parent.children. + parentIndex int + + // Flag for internal nodes that is technically redundant with "children[0] + // != nil", but is stored in the first cache line. "hasChildren" rather + // than "isLeaf" because false must be the correct value for an empty root. + hasChildren bool + + // Nodes store keys and values in separate arrays to maximize locality in + // the common case (scanning keys for lookup). + keys [usagemaxDegree - 1]__generics_imported0.FileRange + values [usagemaxDegree - 1]usageInfo + children [usagemaxDegree]*usagenode +} + +// firstSegment returns the first segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *usagenode) firstSegment() usageIterator { + for n.hasChildren { + n = n.children[0] + } + return usageIterator{n, 0} +} + +// lastSegment returns the last segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *usagenode) lastSegment() usageIterator { + for n.hasChildren { + n = n.children[n.nrSegments] + } + return usageIterator{n, n.nrSegments - 1} +} + +func (n *usagenode) prevSibling() *usagenode { + if n.parent == nil || n.parentIndex == 0 { + return nil + } + return n.parent.children[n.parentIndex-1] +} + +func (n *usagenode) nextSibling() *usagenode { + if n.parent == nil || n.parentIndex == n.parent.nrSegments { + return nil + } + return n.parent.children[n.parentIndex+1] +} + +// rebalanceBeforeInsert splits n and its ancestors if they are full, as +// required for insertion, and returns an updated iterator to the position +// represented by gap. +func (n *usagenode) rebalanceBeforeInsert(gap usageGapIterator) usageGapIterator { + if n.parent != nil { + gap = n.parent.rebalanceBeforeInsert(gap) + } + if n.nrSegments < usagemaxDegree-1 { + return gap + } + if n.parent == nil { + + left := &usagenode{ + nrSegments: usageminDegree - 1, + parent: n, + parentIndex: 0, + hasChildren: n.hasChildren, + } + right := &usagenode{ + nrSegments: usageminDegree - 1, + parent: n, + parentIndex: 1, + hasChildren: n.hasChildren, + } + copy(left.keys[:usageminDegree-1], n.keys[:usageminDegree-1]) + copy(left.values[:usageminDegree-1], n.values[:usageminDegree-1]) + copy(right.keys[:usageminDegree-1], n.keys[usageminDegree:]) + copy(right.values[:usageminDegree-1], n.values[usageminDegree:]) + n.keys[0], n.values[0] = n.keys[usageminDegree-1], n.values[usageminDegree-1] + usagezeroValueSlice(n.values[1:]) + if n.hasChildren { + copy(left.children[:usageminDegree], n.children[:usageminDegree]) + copy(right.children[:usageminDegree], n.children[usageminDegree:]) + usagezeroNodeSlice(n.children[2:]) + for i := 0; i < usageminDegree; i++ { + left.children[i].parent = left + left.children[i].parentIndex = i + right.children[i].parent = right + right.children[i].parentIndex = i + } + } + n.nrSegments = 1 + n.hasChildren = true + n.children[0] = left + n.children[1] = right + if gap.node != n { + return gap + } + if gap.index < usageminDegree { + return usageGapIterator{left, gap.index} + } + return usageGapIterator{right, gap.index - usageminDegree} + } + + copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) + copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) + n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[usageminDegree-1], n.values[usageminDegree-1] + copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) + for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { + n.parent.children[i].parentIndex = i + } + sibling := &usagenode{ + nrSegments: usageminDegree - 1, + parent: n.parent, + parentIndex: n.parentIndex + 1, + hasChildren: n.hasChildren, + } + n.parent.children[n.parentIndex+1] = sibling + n.parent.nrSegments++ + copy(sibling.keys[:usageminDegree-1], n.keys[usageminDegree:]) + copy(sibling.values[:usageminDegree-1], n.values[usageminDegree:]) + usagezeroValueSlice(n.values[usageminDegree-1:]) + if n.hasChildren { + copy(sibling.children[:usageminDegree], n.children[usageminDegree:]) + usagezeroNodeSlice(n.children[usageminDegree:]) + for i := 0; i < usageminDegree; i++ { + sibling.children[i].parent = sibling + sibling.children[i].parentIndex = i + } + } + n.nrSegments = usageminDegree - 1 + + if gap.node != n { + return gap + } + if gap.index < usageminDegree { + return gap + } + return usageGapIterator{sibling, gap.index - usageminDegree} +} + +// rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient +// (contain fewer segments than required by B-tree invariants), as required for +// removal, and returns an updated iterator to the position represented by gap. +// +// Precondition: n is the only node in the tree that may currently violate a +// B-tree invariant. +func (n *usagenode) rebalanceAfterRemove(gap usageGapIterator) usageGapIterator { + for { + if n.nrSegments >= usageminDegree-1 { + return gap + } + if n.parent == nil { + + return gap + } + + if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= usageminDegree { + copy(n.keys[1:], n.keys[:n.nrSegments]) + copy(n.values[1:], n.values[:n.nrSegments]) + n.keys[0] = n.parent.keys[n.parentIndex-1] + n.values[0] = n.parent.values[n.parentIndex-1] + n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] + n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] + usageSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) + if n.hasChildren { + copy(n.children[1:], n.children[:n.nrSegments+1]) + n.children[0] = sibling.children[sibling.nrSegments] + sibling.children[sibling.nrSegments] = nil + n.children[0].parent = n + n.children[0].parentIndex = 0 + for i := 1; i < n.nrSegments+2; i++ { + n.children[i].parentIndex = i + } + } + n.nrSegments++ + sibling.nrSegments-- + if gap.node == sibling && gap.index == sibling.nrSegments { + return usageGapIterator{n, 0} + } + if gap.node == n { + return usageGapIterator{n, gap.index + 1} + } + return gap + } + if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= usageminDegree { + n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] + n.values[n.nrSegments] = n.parent.values[n.parentIndex] + n.parent.keys[n.parentIndex] = sibling.keys[0] + n.parent.values[n.parentIndex] = sibling.values[0] + copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) + copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) + usageSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) + if n.hasChildren { + n.children[n.nrSegments+1] = sibling.children[0] + copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) + sibling.children[sibling.nrSegments] = nil + n.children[n.nrSegments+1].parent = n + n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 + for i := 0; i < sibling.nrSegments; i++ { + sibling.children[i].parentIndex = i + } + } + n.nrSegments++ + sibling.nrSegments-- + if gap.node == sibling { + if gap.index == 0 { + return usageGapIterator{n, n.nrSegments} + } + return usageGapIterator{sibling, gap.index - 1} + } + return gap + } + + p := n.parent + if p.nrSegments == 1 { + + left, right := p.children[0], p.children[1] + p.nrSegments = left.nrSegments + right.nrSegments + 1 + p.hasChildren = left.hasChildren + p.keys[left.nrSegments] = p.keys[0] + p.values[left.nrSegments] = p.values[0] + copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) + copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) + copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) + copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) + if left.hasChildren { + copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) + copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) + for i := 0; i < p.nrSegments+1; i++ { + p.children[i].parent = p + p.children[i].parentIndex = i + } + } else { + p.children[0] = nil + p.children[1] = nil + } + if gap.node == left { + return usageGapIterator{p, gap.index} + } + if gap.node == right { + return usageGapIterator{p, gap.index + left.nrSegments + 1} + } + return gap + } + // Merge n and either sibling, along with the segment separating the + // two, into whichever of the two nodes comes first. This is the + // reverse of the non-root splitting case in + // node.rebalanceBeforeInsert. + var left, right *usagenode + if n.parentIndex > 0 { + left = n.prevSibling() + right = n + } else { + left = n + right = n.nextSibling() + } + + if gap.node == right { + gap = usageGapIterator{left, gap.index + left.nrSegments + 1} + } + left.keys[left.nrSegments] = p.keys[left.parentIndex] + left.values[left.nrSegments] = p.values[left.parentIndex] + copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) + copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) + if left.hasChildren { + copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) + for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { + left.children[i].parent = left + left.children[i].parentIndex = i + } + } + left.nrSegments += right.nrSegments + 1 + copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) + copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) + usageSetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) + copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) + for i := 0; i < p.nrSegments; i++ { + p.children[i].parentIndex = i + } + p.children[p.nrSegments] = nil + p.nrSegments-- + + n = p + } +} + +// A Iterator is conceptually one of: +// +// - A pointer to a segment in a set; or +// +// - A terminal iterator, which is a sentinel indicating that the end of +// iteration has been reached. +// +// Iterators are copyable values and are meaningfully equality-comparable. The +// zero value of Iterator is a terminal iterator. +// +// Unless otherwise specified, any mutation of a set invalidates all existing +// iterators into the set. +type usageIterator struct { + // node is the node containing the iterated segment. If the iterator is + // terminal, node is nil. + node *usagenode + + // index is the index of the segment in node.keys/values. + index int +} + +// Ok returns true if the iterator is not terminal. All other methods are only +// valid for non-terminal iterators. +func (seg usageIterator) Ok() bool { + return seg.node != nil +} + +// Range returns the iterated segment's range key. +func (seg usageIterator) Range() __generics_imported0.FileRange { + return seg.node.keys[seg.index] +} + +// Start is equivalent to Range().Start, but should be preferred if only the +// start of the range is needed. +func (seg usageIterator) Start() uint64 { + return seg.node.keys[seg.index].Start +} + +// End is equivalent to Range().End, but should be preferred if only the end of +// the range is needed. +func (seg usageIterator) End() uint64 { + return seg.node.keys[seg.index].End +} + +// SetRangeUnchecked mutates the iterated segment's range key. This operation +// does not invalidate any iterators. +// +// Preconditions: +// +// - r.Length() > 0. +// +// - The new range must not overlap an existing one: If seg.NextSegment().Ok(), +// then r.end <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then +// r.start >= seg.PrevSegment().End(). +func (seg usageIterator) SetRangeUnchecked(r __generics_imported0.FileRange) { + seg.node.keys[seg.index] = r +} + +// SetRange mutates the iterated segment's range key. If the new range would +// cause the iterated segment to overlap another segment, or if the new range +// is invalid, SetRange panics. This operation does not invalidate any +// iterators. +func (seg usageIterator) SetRange(r __generics_imported0.FileRange) { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { + panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) + } + if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { + panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) + } + seg.SetRangeUnchecked(r) +} + +// SetStartUnchecked mutates the iterated segment's start. This operation does +// not invalidate any iterators. +// +// Preconditions: The new start must be valid: start < seg.End(); if +// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). +func (seg usageIterator) SetStartUnchecked(start uint64) { + seg.node.keys[seg.index].Start = start +} + +// SetStart mutates the iterated segment's start. If the new start value would +// cause the iterated segment to overlap another segment, or would result in an +// invalid range, SetStart panics. This operation does not invalidate any +// iterators. +func (seg usageIterator) SetStart(start uint64) { + if start >= seg.End() { + panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) + } + if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { + panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) + } + seg.SetStartUnchecked(start) +} + +// SetEndUnchecked mutates the iterated segment's end. This operation does not +// invalidate any iterators. +// +// Preconditions: The new end must be valid: end > seg.Start(); if +// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). +func (seg usageIterator) SetEndUnchecked(end uint64) { + seg.node.keys[seg.index].End = end +} + +// SetEnd mutates the iterated segment's end. If the new end value would cause +// the iterated segment to overlap another segment, or would result in an +// invalid range, SetEnd panics. This operation does not invalidate any +// iterators. +func (seg usageIterator) SetEnd(end uint64) { + if end <= seg.Start() { + panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) + } + if next := seg.NextSegment(); next.Ok() && end > next.Start() { + panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) + } + seg.SetEndUnchecked(end) +} + +// Value returns a copy of the iterated segment's value. +func (seg usageIterator) Value() usageInfo { + return seg.node.values[seg.index] +} + +// ValuePtr returns a pointer to the iterated segment's value. The pointer is +// invalidated if the iterator is invalidated. This operation does not +// invalidate any iterators. +func (seg usageIterator) ValuePtr() *usageInfo { + return &seg.node.values[seg.index] +} + +// SetValue mutates the iterated segment's value. This operation does not +// invalidate any iterators. +func (seg usageIterator) SetValue(val usageInfo) { + seg.node.values[seg.index] = val +} + +// PrevSegment returns the iterated segment's predecessor. If there is no +// preceding segment, PrevSegment returns a terminal iterator. +func (seg usageIterator) PrevSegment() usageIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index].lastSegment() + } + if seg.index > 0 { + return usageIterator{seg.node, seg.index - 1} + } + if seg.node.parent == nil { + return usageIterator{} + } + return usagesegmentBeforePosition(seg.node.parent, seg.node.parentIndex) +} + +// NextSegment returns the iterated segment's successor. If there is no +// succeeding segment, NextSegment returns a terminal iterator. +func (seg usageIterator) NextSegment() usageIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment() + } + if seg.index < seg.node.nrSegments-1 { + return usageIterator{seg.node, seg.index + 1} + } + if seg.node.parent == nil { + return usageIterator{} + } + return usagesegmentAfterPosition(seg.node.parent, seg.node.parentIndex) +} + +// PrevGap returns the gap immediately before the iterated segment. +func (seg usageIterator) PrevGap() usageGapIterator { + if seg.node.hasChildren { + + return seg.node.children[seg.index].lastSegment().NextGap() + } + return usageGapIterator{seg.node, seg.index} +} + +// NextGap returns the gap immediately after the iterated segment. +func (seg usageIterator) NextGap() usageGapIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment().PrevGap() + } + return usageGapIterator{seg.node, seg.index + 1} +} + +// PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, +// or the gap before the iterated segment otherwise. If seg.Start() == +// Functions.MinKey(), PrevNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by PrevNonEmpty will be +// non-terminal. +func (seg usageIterator) PrevNonEmpty() (usageIterator, usageGapIterator) { + gap := seg.PrevGap() + if gap.Range().Length() != 0 { + return usageIterator{}, gap + } + return gap.PrevSegment(), usageGapIterator{} +} + +// NextNonEmpty returns the iterated segment's successor if it is adjacent, or +// the gap after the iterated segment otherwise. If seg.End() == +// Functions.MaxKey(), NextNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by NextNonEmpty will be +// non-terminal. +func (seg usageIterator) NextNonEmpty() (usageIterator, usageGapIterator) { + gap := seg.NextGap() + if gap.Range().Length() != 0 { + return usageIterator{}, gap + } + return gap.NextSegment(), usageGapIterator{} +} + +// A GapIterator is conceptually one of: +// +// - A pointer to a position between two segments, before the first segment, or +// after the last segment in a set, called a *gap*; or +// +// - A terminal iterator, which is a sentinel indicating that the end of +// iteration has been reached. +// +// Note that the gap between two adjacent segments exists (iterators to it are +// non-terminal), but has a length of zero. GapIterator.IsEmpty returns true +// for such gaps. An empty set contains a single gap, spanning the entire range +// of the set's keys. +// +// GapIterators are copyable values and are meaningfully equality-comparable. +// The zero value of GapIterator is a terminal iterator. +// +// Unless otherwise specified, any mutation of a set invalidates all existing +// iterators into the set. +type usageGapIterator struct { + // The representation of a GapIterator is identical to that of an Iterator, + // except that index corresponds to positions between segments in the same + // way as for node.children (see comment for node.nrSegments). + node *usagenode + index int +} + +// Ok returns true if the iterator is not terminal. All other methods are only +// valid for non-terminal iterators. +func (gap usageGapIterator) Ok() bool { + return gap.node != nil +} + +// Range returns the range spanned by the iterated gap. +func (gap usageGapIterator) Range() __generics_imported0.FileRange { + return __generics_imported0.FileRange{gap.Start(), gap.End()} +} + +// Start is equivalent to Range().Start, but should be preferred if only the +// start of the range is needed. +func (gap usageGapIterator) Start() uint64 { + if ps := gap.PrevSegment(); ps.Ok() { + return ps.End() + } + return usageSetFunctions{}.MinKey() +} + +// End is equivalent to Range().End, but should be preferred if only the end of +// the range is needed. +func (gap usageGapIterator) End() uint64 { + if ns := gap.NextSegment(); ns.Ok() { + return ns.Start() + } + return usageSetFunctions{}.MaxKey() +} + +// IsEmpty returns true if the iterated gap is empty (that is, the "gap" is +// between two adjacent segments.) +func (gap usageGapIterator) IsEmpty() bool { + return gap.Range().Length() == 0 +} + +// PrevSegment returns the segment immediately before the iterated gap. If no +// such segment exists, PrevSegment returns a terminal iterator. +func (gap usageGapIterator) PrevSegment() usageIterator { + return usagesegmentBeforePosition(gap.node, gap.index) +} + +// NextSegment returns the segment immediately after the iterated gap. If no +// such segment exists, NextSegment returns a terminal iterator. +func (gap usageGapIterator) NextSegment() usageIterator { + return usagesegmentAfterPosition(gap.node, gap.index) +} + +// PrevGap returns the iterated gap's predecessor. If no such gap exists, +// PrevGap returns a terminal iterator. +func (gap usageGapIterator) PrevGap() usageGapIterator { + seg := gap.PrevSegment() + if !seg.Ok() { + return usageGapIterator{} + } + return seg.PrevGap() +} + +// NextGap returns the iterated gap's successor. If no such gap exists, NextGap +// returns a terminal iterator. +func (gap usageGapIterator) NextGap() usageGapIterator { + seg := gap.NextSegment() + if !seg.Ok() { + return usageGapIterator{} + } + return seg.NextGap() +} + +// segmentBeforePosition returns the predecessor segment of the position given +// by n.children[i], which may or may not contain a child. If no such segment +// exists, segmentBeforePosition returns a terminal iterator. +func usagesegmentBeforePosition(n *usagenode, i int) usageIterator { + for i == 0 { + if n.parent == nil { + return usageIterator{} + } + n, i = n.parent, n.parentIndex + } + return usageIterator{n, i - 1} +} + +// segmentAfterPosition returns the successor segment of the position given by +// n.children[i], which may or may not contain a child. If no such segment +// exists, segmentAfterPosition returns a terminal iterator. +func usagesegmentAfterPosition(n *usagenode, i int) usageIterator { + for i == n.nrSegments { + if n.parent == nil { + return usageIterator{} + } + n, i = n.parent, n.parentIndex + } + return usageIterator{n, i} +} + +func usagezeroValueSlice(slice []usageInfo) { + + for i := range slice { + usageSetFunctions{}.ClearValue(&slice[i]) + } +} + +func usagezeroNodeSlice(slice []*usagenode) { + for i := range slice { + slice[i] = nil + } +} + +// String stringifies a Set for debugging. +func (s *usageSet) String() string { + return s.root.String() +} + +// String stringifes a node (and all of its children) for debugging. +func (n *usagenode) String() string { + var buf bytes.Buffer + n.writeDebugString(&buf, "") + return buf.String() +} + +func (n *usagenode) writeDebugString(buf *bytes.Buffer, prefix string) { + if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { + buf.WriteString(prefix) + buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) + } + for i := 0; i < n.nrSegments; i++ { + if child := n.children[i]; child != nil { + cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) + if child.parent != n || child.parentIndex != i { + buf.WriteString(cprefix) + buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) + } + child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) + } + buf.WriteString(prefix) + buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) + } + if child := n.children[n.nrSegments]; child != nil { + child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) + } +} + +// SegmentDataSlices represents segments from a set as slices of start, end, and +// values. SegmentDataSlices is primarily used as an intermediate representation +// for save/restore and the layout here is optimized for that. +// +// +stateify savable +type usageSegmentDataSlices struct { + Start []uint64 + End []uint64 + Values []usageInfo +} + +// ExportSortedSlice returns a copy of all segments in the given set, in ascending +// key order. +func (s *usageSet) ExportSortedSlices() *usageSegmentDataSlices { + var sds usageSegmentDataSlices + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sds.Start = append(sds.Start, seg.Start()) + sds.End = append(sds.End, seg.End()) + sds.Values = append(sds.Values, seg.Value()) + } + sds.Start = sds.Start[:len(sds.Start):len(sds.Start)] + sds.End = sds.End[:len(sds.End):len(sds.End)] + sds.Values = sds.Values[:len(sds.Values):len(sds.Values)] + return &sds +} + +// ImportSortedSlice initializes the given set from the given slice. +// +// Preconditions: s must be empty. sds must represent a valid set (the segments +// in sds must have valid lengths that do not overlap). The segments in sds +// must be sorted in ascending key order. +func (s *usageSet) ImportSortedSlices(sds *usageSegmentDataSlices) error { + if !s.IsEmpty() { + return fmt.Errorf("cannot import into non-empty set %v", s) + } + gap := s.FirstGap() + for i := range sds.Start { + r := __generics_imported0.FileRange{sds.Start[i], sds.End[i]} + if !gap.Range().IsSupersetOf(r) { + return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i]) + } + gap = s.InsertWithoutMerging(gap, r, sds.Values[i]).NextGap() + } + return nil +} +func (s *usageSet) saveRoot() *usageSegmentDataSlices { + return s.ExportSortedSlices() +} + +func (s *usageSet) loadRoot(sds *usageSegmentDataSlices) { + if err := s.ImportSortedSlices(sds); err != nil { + panic(err) + } +} |