diff options
author | gVisor bot <gvisor-bot@google.com> | 2019-06-02 06:44:55 +0000 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2019-06-02 06:44:55 +0000 |
commit | ceb0d792f328d1fc0692197d8856a43c3936a571 (patch) | |
tree | 83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/fs/fsutil | |
parent | deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff) | |
parent | 216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff) |
Merge 216da0b7 (automated)
Diffstat (limited to 'pkg/sentry/fs/fsutil')
-rw-r--r-- | pkg/sentry/fs/fsutil/dirty_set.go | 237 | ||||
-rwxr-xr-x | pkg/sentry/fs/fsutil/dirty_set_impl.go | 1274 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/file.go | 394 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/file_range_set.go | 209 | ||||
-rwxr-xr-x | pkg/sentry/fs/fsutil/file_range_set_impl.go | 1274 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/frame_ref_set.go | 50 | ||||
-rwxr-xr-x | pkg/sentry/fs/fsutil/frame_ref_set_impl.go | 1274 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/fsutil.go | 24 | ||||
-rwxr-xr-x | pkg/sentry/fs/fsutil/fsutil_state_autogen.go | 349 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/host_file_mapper.go | 211 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/host_file_mapper_state.go | 20 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go | 27 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/host_mappable.go | 197 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/inode.go | 503 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/inode_cached.go | 1004 |
15 files changed, 7047 insertions, 0 deletions
diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go new file mode 100644 index 000000000..f1451d77a --- /dev/null +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -0,0 +1,237 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// DirtySet maps offsets into a memmap.Mappable to DirtyInfo. It is used to +// implement Mappables that cache data from another source. +// +// type DirtySet <generated by go_generics> + +// DirtyInfo is the value type of DirtySet, and represents information about a +// Mappable offset that is dirty (the cached data for that offset is newer than +// its source). +// +// +stateify savable +type DirtyInfo struct { + // Keep is true if the represented offset is concurrently writable, such + // that writing the data for that offset back to the source does not + // guarantee that the offset is clean (since it may be concurrently + // rewritten after the writeback). + Keep bool +} + +// dirtySetFunctions implements segment.Functions for DirtySet. +type dirtySetFunctions struct{} + +// MinKey implements segment.Functions.MinKey. 
+func (dirtySetFunctions) MinKey() uint64 { + return 0 +} + +// MaxKey implements segment.Functions.MaxKey. +func (dirtySetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +// ClearValue implements segment.Functions.ClearValue. +func (dirtySetFunctions) ClearValue(val *DirtyInfo) { +} + +// Merge implements segment.Functions.Merge. +func (dirtySetFunctions) Merge(_ memmap.MappableRange, val1 DirtyInfo, _ memmap.MappableRange, val2 DirtyInfo) (DirtyInfo, bool) { + if val1 != val2 { + return DirtyInfo{}, false + } + return val1, true +} + +// Split implements segment.Functions.Split. +func (dirtySetFunctions) Split(_ memmap.MappableRange, val DirtyInfo, _ uint64) (DirtyInfo, DirtyInfo) { + return val, val +} + +// MarkClean marks all offsets in mr as not dirty, except for those to which +// KeepDirty has been applied. +func (ds *DirtySet) MarkClean(mr memmap.MappableRange) { + seg := ds.LowerBoundSegment(mr.Start) + for seg.Ok() && seg.Start() < mr.End { + if seg.Value().Keep { + seg = seg.NextSegment() + continue + } + seg = ds.Isolate(seg, mr) + seg = ds.Remove(seg).NextSegment() + } +} + +// KeepClean marks all offsets in mr as not dirty, even those that were +// previously kept dirty by KeepDirty. +func (ds *DirtySet) KeepClean(mr memmap.MappableRange) { + ds.RemoveRange(mr) +} + +// MarkDirty marks all offsets in mr as dirty. +func (ds *DirtySet) MarkDirty(mr memmap.MappableRange) { + ds.setDirty(mr, false) +} + +// KeepDirty marks all offsets in mr as dirty and prevents them from being +// marked as clean by MarkClean. +func (ds *DirtySet) KeepDirty(mr memmap.MappableRange) { + ds.setDirty(mr, true) +} + +func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) { + var changedAny bool + defer func() { + if changedAny { + // Merge segments split by Isolate to reduce cost of iteration. 
+ ds.MergeRange(mr) + } + }() + seg, gap := ds.Find(mr.Start) + for { + switch { + case seg.Ok() && seg.Start() < mr.End: + if keep && !seg.Value().Keep { + changedAny = true + seg = ds.Isolate(seg, mr) + seg.ValuePtr().Keep = true + } + seg, gap = seg.NextNonEmpty() + + case gap.Ok() && gap.Start() < mr.End: + changedAny = true + seg = ds.Insert(gap, gap.Range().Intersect(mr), DirtyInfo{keep}) + seg, gap = seg.NextNonEmpty() + + default: + return + } + } +} + +// AllowClean allows MarkClean to mark offsets in mr as not dirty, ending the +// effect of a previous call to KeepDirty. (It does not itself mark those +// offsets as not dirty.) +func (ds *DirtySet) AllowClean(mr memmap.MappableRange) { + var changedAny bool + defer func() { + if changedAny { + // Merge segments split by Isolate to reduce cost of iteration. + ds.MergeRange(mr) + } + }() + for seg := ds.LowerBoundSegment(mr.Start); seg.Ok() && seg.Start() < mr.End; seg = seg.NextSegment() { + if seg.Value().Keep { + changedAny = true + seg = ds.Isolate(seg, mr) + seg.ValuePtr().Keep = false + } + } +} + +// SyncDirty passes pages in the range mr that are stored in cache and +// identified as dirty to writeAt, updating dirty to reflect successful writes. +// If writeAt returns a successful partial write, SyncDirty will call it +// repeatedly until all bytes have been written. max is the true size of the +// cached object; offsets beyond max will not be passed to writeAt, even if +// they are marked dirty. +func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { + var changedDirty bool + defer func() { + if changedDirty { + // Merge segments split by Isolate to reduce cost of iteration. 
+ dirty.MergeRange(mr) + } + }() + dseg := dirty.LowerBoundSegment(mr.Start) + for dseg.Ok() && dseg.Start() < mr.End { + var dr memmap.MappableRange + if dseg.Value().Keep { + dr = dseg.Range().Intersect(mr) + } else { + changedDirty = true + dseg = dirty.Isolate(dseg, mr) + dr = dseg.Range() + } + if err := syncDirtyRange(ctx, dr, cache, max, mem, writeAt); err != nil { + return err + } + if dseg.Value().Keep { + dseg = dseg.NextSegment() + } else { + dseg = dirty.Remove(dseg).NextSegment() + } + } + return nil +} + +// SyncDirtyAll passes all pages stored in cache identified as dirty to +// writeAt, updating dirty to reflect successful writes. If writeAt returns a +// successful partial write, SyncDirtyAll will call it repeatedly until all +// bytes have been written. max is the true size of the cached object; offsets +// beyond max will not be passed to writeAt, even if they are marked dirty. +func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { + dseg := dirty.FirstSegment() + for dseg.Ok() { + if err := syncDirtyRange(ctx, dseg.Range(), cache, max, mem, writeAt); err != nil { + return err + } + if dseg.Value().Keep { + dseg = dseg.NextSegment() + } else { + dseg = dirty.Remove(dseg).NextSegment() + } + } + return nil +} + +// Preconditions: mr must be page-aligned. 
+func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { + for cseg := cache.LowerBoundSegment(mr.Start); cseg.Ok() && cseg.Start() < mr.End; cseg = cseg.NextSegment() { + wbr := cseg.Range().Intersect(mr) + if max < wbr.Start { + break + } + ims, err := mem.MapInternal(cseg.FileRangeOf(wbr), usermem.Read) + if err != nil { + return err + } + if max < wbr.End { + ims = ims.TakeFirst64(max - wbr.Start) + } + offset := wbr.Start + for !ims.IsEmpty() { + n, err := writeAt(ctx, ims, offset) + if err != nil { + return err + } + offset += n + ims = ims.DropFirst64(n) + } + } + return nil +} diff --git a/pkg/sentry/fs/fsutil/dirty_set_impl.go b/pkg/sentry/fs/fsutil/dirty_set_impl.go new file mode 100755 index 000000000..5f25068a1 --- /dev/null +++ b/pkg/sentry/fs/fsutil/dirty_set_impl.go @@ -0,0 +1,1274 @@ +package fsutil + +import ( + __generics_imported0 "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" +) + +import ( + "bytes" + "fmt" +) + +const ( + // minDegree is the minimum degree of an internal node in a Set B-tree. + // + // - Any non-root node has at least minDegree-1 segments. + // + // - Any non-root internal (non-leaf) node has at least minDegree children. + // + // - The root node may have fewer than minDegree-1 segments, but it may + // only have 0 segments if the tree is empty. + // + // Our implementation requires minDegree >= 3. Higher values of minDegree + // usually improve performance, but increase memory usage for small sets. + DirtyminDegree = 3 + + DirtymaxDegree = 2 * DirtyminDegree +) + +// A Set is a mapping of segments with non-overlapping Range keys. The zero +// value for a Set is an empty set. Set values are not safely movable nor +// copyable. Set is thread-compatible. 
+// +// +stateify savable +type DirtySet struct { + root Dirtynode `state:".(*DirtySegmentDataSlices)"` +} + +// IsEmpty returns true if the set contains no segments. +func (s *DirtySet) IsEmpty() bool { + return s.root.nrSegments == 0 +} + +// IsEmptyRange returns true iff no segments in the set overlap the given +// range. This is semantically equivalent to s.SpanRange(r) == 0, but may be +// more efficient. +func (s *DirtySet) IsEmptyRange(r __generics_imported0.MappableRange) bool { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return true + } + _, gap := s.Find(r.Start) + if !gap.Ok() { + return false + } + return r.End <= gap.End() +} + +// Span returns the total size of all segments in the set. +func (s *DirtySet) Span() uint64 { + var sz uint64 + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sz += seg.Range().Length() + } + return sz +} + +// SpanRange returns the total size of the intersection of segments in the set +// with the given range. +func (s *DirtySet) SpanRange(r __generics_imported0.MappableRange) uint64 { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return 0 + } + var sz uint64 + for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { + sz += seg.Range().Intersect(r).Length() + } + return sz +} + +// FirstSegment returns the first segment in the set. If the set is empty, +// FirstSegment returns a terminal iterator. +func (s *DirtySet) FirstSegment() DirtyIterator { + if s.root.nrSegments == 0 { + return DirtyIterator{} + } + return s.root.firstSegment() +} + +// LastSegment returns the last segment in the set. If the set is empty, +// LastSegment returns a terminal iterator. +func (s *DirtySet) LastSegment() DirtyIterator { + if s.root.nrSegments == 0 { + return DirtyIterator{} + } + return s.root.lastSegment() +} + +// FirstGap returns the first gap in the set. 
+func (s *DirtySet) FirstGap() DirtyGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[0] + } + return DirtyGapIterator{n, 0} +} + +// LastGap returns the last gap in the set. +func (s *DirtySet) LastGap() DirtyGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[n.nrSegments] + } + return DirtyGapIterator{n, n.nrSegments} +} + +// Find returns the segment or gap whose range contains the given key. If a +// segment is found, the returned Iterator is non-terminal and the +// returned GapIterator is terminal. Otherwise, the returned Iterator is +// terminal and the returned GapIterator is non-terminal. +func (s *DirtySet) Find(key uint64) (DirtyIterator, DirtyGapIterator) { + n := &s.root + for { + + lower := 0 + upper := n.nrSegments + for lower < upper { + i := lower + (upper-lower)/2 + if r := n.keys[i]; key < r.End { + if key >= r.Start { + return DirtyIterator{n, i}, DirtyGapIterator{} + } + upper = i + } else { + lower = i + 1 + } + } + i := lower + if !n.hasChildren { + return DirtyIterator{}, DirtyGapIterator{n, i} + } + n = n.children[i] + } +} + +// FindSegment returns the segment whose range contains the given key. If no +// such segment exists, FindSegment returns a terminal iterator. +func (s *DirtySet) FindSegment(key uint64) DirtyIterator { + seg, _ := s.Find(key) + return seg +} + +// LowerBoundSegment returns the segment with the lowest range that contains a +// key greater than or equal to min. If no such segment exists, +// LowerBoundSegment returns a terminal iterator. +func (s *DirtySet) LowerBoundSegment(min uint64) DirtyIterator { + seg, gap := s.Find(min) + if seg.Ok() { + return seg + } + return gap.NextSegment() +} + +// UpperBoundSegment returns the segment with the highest range that contains a +// key less than or equal to max. If no such segment exists, UpperBoundSegment +// returns a terminal iterator. 
+func (s *DirtySet) UpperBoundSegment(max uint64) DirtyIterator { + seg, gap := s.Find(max) + if seg.Ok() { + return seg + } + return gap.PrevSegment() +} + +// FindGap returns the gap containing the given key. If no such gap exists +// (i.e. the set contains a segment containing that key), FindGap returns a +// terminal iterator. +func (s *DirtySet) FindGap(key uint64) DirtyGapIterator { + _, gap := s.Find(key) + return gap +} + +// LowerBoundGap returns the gap with the lowest range that is greater than or +// equal to min. +func (s *DirtySet) LowerBoundGap(min uint64) DirtyGapIterator { + seg, gap := s.Find(min) + if gap.Ok() { + return gap + } + return seg.NextGap() +} + +// UpperBoundGap returns the gap with the highest range that is less than or +// equal to max. +func (s *DirtySet) UpperBoundGap(max uint64) DirtyGapIterator { + seg, gap := s.Find(max) + if gap.Ok() { + return gap + } + return seg.PrevGap() +} + +// Add inserts the given segment into the set and returns true. If the new +// segment can be merged with adjacent segments, Add will do so. If the new +// segment would overlap an existing segment, Add returns false. If Add +// succeeds, all existing iterators are invalidated. +func (s *DirtySet) Add(r __generics_imported0.MappableRange, val DirtyInfo) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.Insert(gap, r, val) + return true +} + +// AddWithoutMerging inserts the given segment into the set and returns true. +// If it would overlap an existing segment, AddWithoutMerging does nothing and +// returns false. If AddWithoutMerging succeeds, all existing iterators are +// invalidated. 
+func (s *DirtySet) AddWithoutMerging(r __generics_imported0.MappableRange, val DirtyInfo) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.InsertWithoutMergingUnchecked(gap, r, val) + return true +} + +// Insert inserts the given segment into the given gap. If the new segment can +// be merged with adjacent segments, Insert will do so. Insert returns an +// iterator to the segment containing the inserted value (which may have been +// merged with other values). All existing iterators (including gap, but not +// including the returned iterator) are invalidated. +// +// If the gap cannot accommodate the segment, or if r is invalid, Insert panics. +// +// Insert is semantically equivalent to a InsertWithoutMerging followed by a +// Merge, but may be more efficient. Note that there is no unchecked variant of +// Insert since Insert must retrieve and inspect gap's predecessor and +// successor segments regardless. 
+func (s *DirtySet) Insert(gap DirtyGapIterator, r __generics_imported0.MappableRange, val DirtyInfo) DirtyIterator { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + prev, next := gap.PrevSegment(), gap.NextSegment() + if prev.Ok() && prev.End() > r.Start { + panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) + } + if next.Ok() && next.Start() < r.End { + panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) + } + if prev.Ok() && prev.End() == r.Start { + if mval, ok := (dirtySetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { + prev.SetEndUnchecked(r.End) + prev.SetValue(mval) + if next.Ok() && next.Start() == r.End { + val = mval + if mval, ok := (dirtySetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { + prev.SetEndUnchecked(next.End()) + prev.SetValue(mval) + return s.Remove(next).PrevSegment() + } + } + return prev + } + } + if next.Ok() && next.Start() == r.End { + if mval, ok := (dirtySetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { + next.SetStartUnchecked(r.Start) + next.SetValue(mval) + return next + } + } + return s.InsertWithoutMergingUnchecked(gap, r, val) +} + +// InsertWithoutMerging inserts the given segment into the given gap and +// returns an iterator to the inserted segment. All existing iterators +// (including gap, but not including the returned iterator) are invalidated. +// +// If the gap cannot accommodate the segment, or if r is invalid, +// InsertWithoutMerging panics. 
+func (s *DirtySet) InsertWithoutMerging(gap DirtyGapIterator, r __generics_imported0.MappableRange, val DirtyInfo) DirtyIterator { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + if gr := gap.Range(); !gr.IsSupersetOf(r) { + panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) + } + return s.InsertWithoutMergingUnchecked(gap, r, val) +} + +// InsertWithoutMergingUnchecked inserts the given segment into the given gap +// and returns an iterator to the inserted segment. All existing iterators +// (including gap, but not including the returned iterator) are invalidated. +// +// Preconditions: r.Start >= gap.Start(); r.End <= gap.End(). +func (s *DirtySet) InsertWithoutMergingUnchecked(gap DirtyGapIterator, r __generics_imported0.MappableRange, val DirtyInfo) DirtyIterator { + gap = gap.node.rebalanceBeforeInsert(gap) + copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) + copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) + gap.node.keys[gap.index] = r + gap.node.values[gap.index] = val + gap.node.nrSegments++ + return DirtyIterator{gap.node, gap.index} +} + +// Remove removes the given segment and returns an iterator to the vacated gap. +// All existing iterators (including seg, but not including the returned +// iterator) are invalidated. 
+func (s *DirtySet) Remove(seg DirtyIterator) DirtyGapIterator { + + if seg.node.hasChildren { + + victim := seg.PrevSegment() + + seg.SetRangeUnchecked(victim.Range()) + seg.SetValue(victim.Value()) + return s.Remove(victim).NextGap() + } + copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) + copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) + dirtySetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) + seg.node.nrSegments-- + return seg.node.rebalanceAfterRemove(DirtyGapIterator{seg.node, seg.index}) +} + +// RemoveAll removes all segments from the set. All existing iterators are +// invalidated. +func (s *DirtySet) RemoveAll() { + s.root = Dirtynode{} +} + +// RemoveRange removes all segments in the given range. An iterator to the +// newly formed gap is returned, and all existing iterators are invalidated. +func (s *DirtySet) RemoveRange(r __generics_imported0.MappableRange) DirtyGapIterator { + seg, gap := s.Find(r.Start) + if seg.Ok() { + seg = s.Isolate(seg, r) + gap = s.Remove(seg) + } + for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { + seg = s.Isolate(seg, r) + gap = s.Remove(seg) + } + return gap +} + +// Merge attempts to merge two neighboring segments. If successful, Merge +// returns an iterator to the merged segment, and all existing iterators are +// invalidated. Otherwise, Merge returns a terminal iterator. +// +// If first is not the predecessor of second, Merge panics. +func (s *DirtySet) Merge(first, second DirtyIterator) DirtyIterator { + if first.NextSegment() != second { + panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) + } + return s.MergeUnchecked(first, second) +} + +// MergeUnchecked attempts to merge two neighboring segments. If successful, +// MergeUnchecked returns an iterator to the merged segment, and all existing +// iterators are invalidated. 
Otherwise, MergeUnchecked returns a terminal +// iterator. +// +// Precondition: first is the predecessor of second: first.NextSegment() == +// second, first == second.PrevSegment(). +func (s *DirtySet) MergeUnchecked(first, second DirtyIterator) DirtyIterator { + if first.End() == second.Start() { + if mval, ok := (dirtySetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { + + first.SetEndUnchecked(second.End()) + first.SetValue(mval) + return s.Remove(second).PrevSegment() + } + } + return DirtyIterator{} +} + +// MergeAll attempts to merge all adjacent segments in the set. All existing +// iterators are invalidated. +func (s *DirtySet) MergeAll() { + seg := s.FirstSegment() + if !seg.Ok() { + return + } + next := seg.NextSegment() + for next.Ok() { + if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { + seg, next = mseg, mseg.NextSegment() + } else { + seg, next = next, next.NextSegment() + } + } +} + +// MergeRange attempts to merge all adjacent segments that contain a key in the +// specific range. All existing iterators are invalidated. +func (s *DirtySet) MergeRange(r __generics_imported0.MappableRange) { + seg := s.LowerBoundSegment(r.Start) + if !seg.Ok() { + return + } + next := seg.NextSegment() + for next.Ok() && next.Range().Start < r.End { + if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { + seg, next = mseg, mseg.NextSegment() + } else { + seg, next = next, next.NextSegment() + } + } +} + +// MergeAdjacent attempts to merge the segment containing r.Start with its +// predecessor, and the segment containing r.End-1 with its successor. 
+func (s *DirtySet) MergeAdjacent(r __generics_imported0.MappableRange) { + first := s.FindSegment(r.Start) + if first.Ok() { + if prev := first.PrevSegment(); prev.Ok() { + s.Merge(prev, first) + } + } + last := s.FindSegment(r.End - 1) + if last.Ok() { + if next := last.NextSegment(); next.Ok() { + s.Merge(last, next) + } + } +} + +// Split splits the given segment at the given key and returns iterators to the +// two resulting segments. All existing iterators (including seg, but not +// including the returned iterators) are invalidated. +// +// If the segment cannot be split at split (because split is at the start or +// end of the segment's range, so splitting would produce a segment with zero +// length, or because split falls outside the segment's range altogether), +// Split panics. +func (s *DirtySet) Split(seg DirtyIterator, split uint64) (DirtyIterator, DirtyIterator) { + if !seg.Range().CanSplitAt(split) { + panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) + } + return s.SplitUnchecked(seg, split) +} + +// SplitUnchecked splits the given segment at the given key and returns +// iterators to the two resulting segments. All existing iterators (including +// seg, but not including the returned iterators) are invalidated. +// +// Preconditions: seg.Start() < key < seg.End(). +func (s *DirtySet) SplitUnchecked(seg DirtyIterator, split uint64) (DirtyIterator, DirtyIterator) { + val1, val2 := (dirtySetFunctions{}).Split(seg.Range(), seg.Value(), split) + end2 := seg.End() + seg.SetEndUnchecked(split) + seg.SetValue(val1) + seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.MappableRange{split, end2}, val2) + + return seg2.PrevSegment(), seg2 +} + +// SplitAt splits the segment straddling split, if one exists. SplitAt returns +// true if a segment was split and false otherwise. If SplitAt splits a +// segment, all existing iterators are invalidated. 
+func (s *DirtySet) SplitAt(split uint64) bool { + if seg := s.FindSegment(split); seg.Ok() && seg.Range().CanSplitAt(split) { + s.SplitUnchecked(seg, split) + return true + } + return false +} + +// Isolate ensures that the given segment's range does not escape r by +// splitting at r.Start and r.End if necessary, and returns an updated iterator +// to the bounded segment. All existing iterators (including seg, but not +// including the returned iterators) are invalidated. +func (s *DirtySet) Isolate(seg DirtyIterator, r __generics_imported0.MappableRange) DirtyIterator { + if seg.Range().CanSplitAt(r.Start) { + _, seg = s.SplitUnchecked(seg, r.Start) + } + if seg.Range().CanSplitAt(r.End) { + seg, _ = s.SplitUnchecked(seg, r.End) + } + return seg +} + +// ApplyContiguous applies a function to a contiguous range of segments, +// splitting if necessary. The function is applied until the first gap is +// encountered, at which point the gap is returned. If the function is applied +// across the entire range, a terminal gap is returned. All existing iterators +// are invalidated. +// +// N.B. The Iterator must not be invalidated by the function. +func (s *DirtySet) ApplyContiguous(r __generics_imported0.MappableRange, fn func(seg DirtyIterator)) DirtyGapIterator { + seg, gap := s.Find(r.Start) + if !seg.Ok() { + return gap + } + for { + seg = s.Isolate(seg, r) + fn(seg) + if seg.End() >= r.End { + return DirtyGapIterator{} + } + gap = seg.NextGap() + if !gap.IsEmpty() { + return gap + } + seg = gap.NextSegment() + if !seg.Ok() { + + return DirtyGapIterator{} + } + } +} + +// +stateify savable +type Dirtynode struct { + // An internal binary tree node looks like: + // + // K + // / \ + // Cl Cr + // + // where all keys in the subtree rooted by Cl (the left subtree) are less + // than K (the key of the parent node), and all keys in the subtree rooted + // by Cr (the right subtree) are greater than K. 
+ // + // An internal B-tree node's indexes work out to look like: + // + // K0 K1 K2 ... Kn-1 + // / \/ \/ \ ... / \ + // C0 C1 C2 C3 ... Cn-1 Cn + // + // where n is nrSegments. + nrSegments int + + // parent is a pointer to this node's parent. If this node is root, parent + // is nil. + parent *Dirtynode + + // parentIndex is the index of this node in parent.children. + parentIndex int + + // Flag for internal nodes that is technically redundant with "children[0] + // != nil", but is stored in the first cache line. "hasChildren" rather + // than "isLeaf" because false must be the correct value for an empty root. + hasChildren bool + + // Nodes store keys and values in separate arrays to maximize locality in + // the common case (scanning keys for lookup). + keys [DirtymaxDegree - 1]__generics_imported0.MappableRange + values [DirtymaxDegree - 1]DirtyInfo + children [DirtymaxDegree]*Dirtynode +} + +// firstSegment returns the first segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *Dirtynode) firstSegment() DirtyIterator { + for n.hasChildren { + n = n.children[0] + } + return DirtyIterator{n, 0} +} + +// lastSegment returns the last segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *Dirtynode) lastSegment() DirtyIterator { + for n.hasChildren { + n = n.children[n.nrSegments] + } + return DirtyIterator{n, n.nrSegments - 1} +} + +func (n *Dirtynode) prevSibling() *Dirtynode { + if n.parent == nil || n.parentIndex == 0 { + return nil + } + return n.parent.children[n.parentIndex-1] +} + +func (n *Dirtynode) nextSibling() *Dirtynode { + if n.parent == nil || n.parentIndex == n.parent.nrSegments { + return nil + } + return n.parent.children[n.parentIndex+1] +} + +// rebalanceBeforeInsert splits n and its ancestors if they are full, as +// required for insertion, and returns an updated iterator to the position +// represented by gap. 
+func (n *Dirtynode) rebalanceBeforeInsert(gap DirtyGapIterator) DirtyGapIterator { + if n.parent != nil { + gap = n.parent.rebalanceBeforeInsert(gap) + } + if n.nrSegments < DirtymaxDegree-1 { + return gap + } + if n.parent == nil { + + left := &Dirtynode{ + nrSegments: DirtyminDegree - 1, + parent: n, + parentIndex: 0, + hasChildren: n.hasChildren, + } + right := &Dirtynode{ + nrSegments: DirtyminDegree - 1, + parent: n, + parentIndex: 1, + hasChildren: n.hasChildren, + } + copy(left.keys[:DirtyminDegree-1], n.keys[:DirtyminDegree-1]) + copy(left.values[:DirtyminDegree-1], n.values[:DirtyminDegree-1]) + copy(right.keys[:DirtyminDegree-1], n.keys[DirtyminDegree:]) + copy(right.values[:DirtyminDegree-1], n.values[DirtyminDegree:]) + n.keys[0], n.values[0] = n.keys[DirtyminDegree-1], n.values[DirtyminDegree-1] + DirtyzeroValueSlice(n.values[1:]) + if n.hasChildren { + copy(left.children[:DirtyminDegree], n.children[:DirtyminDegree]) + copy(right.children[:DirtyminDegree], n.children[DirtyminDegree:]) + DirtyzeroNodeSlice(n.children[2:]) + for i := 0; i < DirtyminDegree; i++ { + left.children[i].parent = left + left.children[i].parentIndex = i + right.children[i].parent = right + right.children[i].parentIndex = i + } + } + n.nrSegments = 1 + n.hasChildren = true + n.children[0] = left + n.children[1] = right + if gap.node != n { + return gap + } + if gap.index < DirtyminDegree { + return DirtyGapIterator{left, gap.index} + } + return DirtyGapIterator{right, gap.index - DirtyminDegree} + } + + copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) + copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) + n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[DirtyminDegree-1], n.values[DirtyminDegree-1] + copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) + for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { + 
n.parent.children[i].parentIndex = i + } + sibling := &Dirtynode{ + nrSegments: DirtyminDegree - 1, + parent: n.parent, + parentIndex: n.parentIndex + 1, + hasChildren: n.hasChildren, + } + n.parent.children[n.parentIndex+1] = sibling + n.parent.nrSegments++ + copy(sibling.keys[:DirtyminDegree-1], n.keys[DirtyminDegree:]) + copy(sibling.values[:DirtyminDegree-1], n.values[DirtyminDegree:]) + DirtyzeroValueSlice(n.values[DirtyminDegree-1:]) + if n.hasChildren { + copy(sibling.children[:DirtyminDegree], n.children[DirtyminDegree:]) + DirtyzeroNodeSlice(n.children[DirtyminDegree:]) + for i := 0; i < DirtyminDegree; i++ { + sibling.children[i].parent = sibling + sibling.children[i].parentIndex = i + } + } + n.nrSegments = DirtyminDegree - 1 + + if gap.node != n { + return gap + } + if gap.index < DirtyminDegree { + return gap + } + return DirtyGapIterator{sibling, gap.index - DirtyminDegree} +} + +// rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient +// (contain fewer segments than required by B-tree invariants), as required for +// removal, and returns an updated iterator to the position represented by gap. +// +// Precondition: n is the only node in the tree that may currently violate a +// B-tree invariant. 
+func (n *Dirtynode) rebalanceAfterRemove(gap DirtyGapIterator) DirtyGapIterator { + for { + if n.nrSegments >= DirtyminDegree-1 { + return gap + } + if n.parent == nil { + + return gap + } + + if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= DirtyminDegree { + copy(n.keys[1:], n.keys[:n.nrSegments]) + copy(n.values[1:], n.values[:n.nrSegments]) + n.keys[0] = n.parent.keys[n.parentIndex-1] + n.values[0] = n.parent.values[n.parentIndex-1] + n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] + n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] + dirtySetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) + if n.hasChildren { + copy(n.children[1:], n.children[:n.nrSegments+1]) + n.children[0] = sibling.children[sibling.nrSegments] + sibling.children[sibling.nrSegments] = nil + n.children[0].parent = n + n.children[0].parentIndex = 0 + for i := 1; i < n.nrSegments+2; i++ { + n.children[i].parentIndex = i + } + } + n.nrSegments++ + sibling.nrSegments-- + if gap.node == sibling && gap.index == sibling.nrSegments { + return DirtyGapIterator{n, 0} + } + if gap.node == n { + return DirtyGapIterator{n, gap.index + 1} + } + return gap + } + if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= DirtyminDegree { + n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] + n.values[n.nrSegments] = n.parent.values[n.parentIndex] + n.parent.keys[n.parentIndex] = sibling.keys[0] + n.parent.values[n.parentIndex] = sibling.values[0] + copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) + copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) + dirtySetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) + if n.hasChildren { + n.children[n.nrSegments+1] = sibling.children[0] + copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) + sibling.children[sibling.nrSegments] = nil + n.children[n.nrSegments+1].parent = n + n.children[n.nrSegments+1].parentIndex = 
n.nrSegments + 1 + for i := 0; i < sibling.nrSegments; i++ { + sibling.children[i].parentIndex = i + } + } + n.nrSegments++ + sibling.nrSegments-- + if gap.node == sibling { + if gap.index == 0 { + return DirtyGapIterator{n, n.nrSegments} + } + return DirtyGapIterator{sibling, gap.index - 1} + } + return gap + } + + p := n.parent + if p.nrSegments == 1 { + + left, right := p.children[0], p.children[1] + p.nrSegments = left.nrSegments + right.nrSegments + 1 + p.hasChildren = left.hasChildren + p.keys[left.nrSegments] = p.keys[0] + p.values[left.nrSegments] = p.values[0] + copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) + copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) + copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) + copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) + if left.hasChildren { + copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) + copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) + for i := 0; i < p.nrSegments+1; i++ { + p.children[i].parent = p + p.children[i].parentIndex = i + } + } else { + p.children[0] = nil + p.children[1] = nil + } + if gap.node == left { + return DirtyGapIterator{p, gap.index} + } + if gap.node == right { + return DirtyGapIterator{p, gap.index + left.nrSegments + 1} + } + return gap + } + // Merge n and either sibling, along with the segment separating the + // two, into whichever of the two nodes comes first. This is the + // reverse of the non-root splitting case in + // node.rebalanceBeforeInsert. 
+ var left, right *Dirtynode + if n.parentIndex > 0 { + left = n.prevSibling() + right = n + } else { + left = n + right = n.nextSibling() + } + + if gap.node == right { + gap = DirtyGapIterator{left, gap.index + left.nrSegments + 1} + } + left.keys[left.nrSegments] = p.keys[left.parentIndex] + left.values[left.nrSegments] = p.values[left.parentIndex] + copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) + copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) + if left.hasChildren { + copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) + for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { + left.children[i].parent = left + left.children[i].parentIndex = i + } + } + left.nrSegments += right.nrSegments + 1 + copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) + copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) + dirtySetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) + copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) + for i := 0; i < p.nrSegments; i++ { + p.children[i].parentIndex = i + } + p.children[p.nrSegments] = nil + p.nrSegments-- + + n = p + } +} + +// A Iterator is conceptually one of: +// +// - A pointer to a segment in a set; or +// +// - A terminal iterator, which is a sentinel indicating that the end of +// iteration has been reached. +// +// Iterators are copyable values and are meaningfully equality-comparable. The +// zero value of Iterator is a terminal iterator. +// +// Unless otherwise specified, any mutation of a set invalidates all existing +// iterators into the set. +type DirtyIterator struct { + // node is the node containing the iterated segment. If the iterator is + // terminal, node is nil. + node *Dirtynode + + // index is the index of the segment in node.keys/values. + index int +} + +// Ok returns true if the iterator is not terminal. 
All other methods are only +// valid for non-terminal iterators. +func (seg DirtyIterator) Ok() bool { + return seg.node != nil +} + +// Range returns the iterated segment's range key. +func (seg DirtyIterator) Range() __generics_imported0.MappableRange { + return seg.node.keys[seg.index] +} + +// Start is equivalent to Range().Start, but should be preferred if only the +// start of the range is needed. +func (seg DirtyIterator) Start() uint64 { + return seg.node.keys[seg.index].Start +} + +// End is equivalent to Range().End, but should be preferred if only the end of +// the range is needed. +func (seg DirtyIterator) End() uint64 { + return seg.node.keys[seg.index].End +} + +// SetRangeUnchecked mutates the iterated segment's range key. This operation +// does not invalidate any iterators. +// +// Preconditions: +// +// - r.Length() > 0. +// +// - The new range must not overlap an existing one: If seg.NextSegment().Ok(), +// then r.end <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then +// r.start >= seg.PrevSegment().End(). +func (seg DirtyIterator) SetRangeUnchecked(r __generics_imported0.MappableRange) { + seg.node.keys[seg.index] = r +} + +// SetRange mutates the iterated segment's range key. If the new range would +// cause the iterated segment to overlap another segment, or if the new range +// is invalid, SetRange panics. This operation does not invalidate any +// iterators. +func (seg DirtyIterator) SetRange(r __generics_imported0.MappableRange) { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { + panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) + } + if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { + panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) + } + seg.SetRangeUnchecked(r) +} + +// SetStartUnchecked mutates the iterated segment's start. 
This operation does +// not invalidate any iterators. +// +// Preconditions: The new start must be valid: start < seg.End(); if +// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). +func (seg DirtyIterator) SetStartUnchecked(start uint64) { + seg.node.keys[seg.index].Start = start +} + +// SetStart mutates the iterated segment's start. If the new start value would +// cause the iterated segment to overlap another segment, or would result in an +// invalid range, SetStart panics. This operation does not invalidate any +// iterators. +func (seg DirtyIterator) SetStart(start uint64) { + if start >= seg.End() { + panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) + } + if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { + panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) + } + seg.SetStartUnchecked(start) +} + +// SetEndUnchecked mutates the iterated segment's end. This operation does not +// invalidate any iterators. +// +// Preconditions: The new end must be valid: end > seg.Start(); if +// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). +func (seg DirtyIterator) SetEndUnchecked(end uint64) { + seg.node.keys[seg.index].End = end +} + +// SetEnd mutates the iterated segment's end. If the new end value would cause +// the iterated segment to overlap another segment, or would result in an +// invalid range, SetEnd panics. This operation does not invalidate any +// iterators. +func (seg DirtyIterator) SetEnd(end uint64) { + if end <= seg.Start() { + panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) + } + if next := seg.NextSegment(); next.Ok() && end > next.Start() { + panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) + } + seg.SetEndUnchecked(end) +} + +// Value returns a copy of the iterated segment's value. 
+func (seg DirtyIterator) Value() DirtyInfo { + return seg.node.values[seg.index] +} + +// ValuePtr returns a pointer to the iterated segment's value. The pointer is +// invalidated if the iterator is invalidated. This operation does not +// invalidate any iterators. +func (seg DirtyIterator) ValuePtr() *DirtyInfo { + return &seg.node.values[seg.index] +} + +// SetValue mutates the iterated segment's value. This operation does not +// invalidate any iterators. +func (seg DirtyIterator) SetValue(val DirtyInfo) { + seg.node.values[seg.index] = val +} + +// PrevSegment returns the iterated segment's predecessor. If there is no +// preceding segment, PrevSegment returns a terminal iterator. +func (seg DirtyIterator) PrevSegment() DirtyIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index].lastSegment() + } + if seg.index > 0 { + return DirtyIterator{seg.node, seg.index - 1} + } + if seg.node.parent == nil { + return DirtyIterator{} + } + return DirtysegmentBeforePosition(seg.node.parent, seg.node.parentIndex) +} + +// NextSegment returns the iterated segment's successor. If there is no +// succeeding segment, NextSegment returns a terminal iterator. +func (seg DirtyIterator) NextSegment() DirtyIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment() + } + if seg.index < seg.node.nrSegments-1 { + return DirtyIterator{seg.node, seg.index + 1} + } + if seg.node.parent == nil { + return DirtyIterator{} + } + return DirtysegmentAfterPosition(seg.node.parent, seg.node.parentIndex) +} + +// PrevGap returns the gap immediately before the iterated segment. +func (seg DirtyIterator) PrevGap() DirtyGapIterator { + if seg.node.hasChildren { + + return seg.node.children[seg.index].lastSegment().NextGap() + } + return DirtyGapIterator{seg.node, seg.index} +} + +// NextGap returns the gap immediately after the iterated segment. 
+func (seg DirtyIterator) NextGap() DirtyGapIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment().PrevGap() + } + return DirtyGapIterator{seg.node, seg.index + 1} +} + +// PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, +// or the gap before the iterated segment otherwise. If seg.Start() == +// Functions.MinKey(), PrevNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by PrevNonEmpty will be +// non-terminal. +func (seg DirtyIterator) PrevNonEmpty() (DirtyIterator, DirtyGapIterator) { + gap := seg.PrevGap() + if gap.Range().Length() != 0 { + return DirtyIterator{}, gap + } + return gap.PrevSegment(), DirtyGapIterator{} +} + +// NextNonEmpty returns the iterated segment's successor if it is adjacent, or +// the gap after the iterated segment otherwise. If seg.End() == +// Functions.MaxKey(), NextNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by NextNonEmpty will be +// non-terminal. +func (seg DirtyIterator) NextNonEmpty() (DirtyIterator, DirtyGapIterator) { + gap := seg.NextGap() + if gap.Range().Length() != 0 { + return DirtyIterator{}, gap + } + return gap.NextSegment(), DirtyGapIterator{} +} + +// A GapIterator is conceptually one of: +// +// - A pointer to a position between two segments, before the first segment, or +// after the last segment in a set, called a *gap*; or +// +// - A terminal iterator, which is a sentinel indicating that the end of +// iteration has been reached. +// +// Note that the gap between two adjacent segments exists (iterators to it are +// non-terminal), but has a length of zero. GapIterator.IsEmpty returns true +// for such gaps. An empty set contains a single gap, spanning the entire range +// of the set's keys. +// +// GapIterators are copyable values and are meaningfully equality-comparable. +// The zero value of GapIterator is a terminal iterator. 
+// +// Unless otherwise specified, any mutation of a set invalidates all existing +// iterators into the set. +type DirtyGapIterator struct { + // The representation of a GapIterator is identical to that of an Iterator, + // except that index corresponds to positions between segments in the same + // way as for node.children (see comment for node.nrSegments). + node *Dirtynode + index int +} + +// Ok returns true if the iterator is not terminal. All other methods are only +// valid for non-terminal iterators. +func (gap DirtyGapIterator) Ok() bool { + return gap.node != nil +} + +// Range returns the range spanned by the iterated gap. +func (gap DirtyGapIterator) Range() __generics_imported0.MappableRange { + return __generics_imported0.MappableRange{gap.Start(), gap.End()} +} + +// Start is equivalent to Range().Start, but should be preferred if only the +// start of the range is needed. +func (gap DirtyGapIterator) Start() uint64 { + if ps := gap.PrevSegment(); ps.Ok() { + return ps.End() + } + return dirtySetFunctions{}.MinKey() +} + +// End is equivalent to Range().End, but should be preferred if only the end of +// the range is needed. +func (gap DirtyGapIterator) End() uint64 { + if ns := gap.NextSegment(); ns.Ok() { + return ns.Start() + } + return dirtySetFunctions{}.MaxKey() +} + +// IsEmpty returns true if the iterated gap is empty (that is, the "gap" is +// between two adjacent segments.) +func (gap DirtyGapIterator) IsEmpty() bool { + return gap.Range().Length() == 0 +} + +// PrevSegment returns the segment immediately before the iterated gap. If no +// such segment exists, PrevSegment returns a terminal iterator. +func (gap DirtyGapIterator) PrevSegment() DirtyIterator { + return DirtysegmentBeforePosition(gap.node, gap.index) +} + +// NextSegment returns the segment immediately after the iterated gap. If no +// such segment exists, NextSegment returns a terminal iterator. 
+func (gap DirtyGapIterator) NextSegment() DirtyIterator { + return DirtysegmentAfterPosition(gap.node, gap.index) +} + +// PrevGap returns the iterated gap's predecessor. If no such gap exists, +// PrevGap returns a terminal iterator. +func (gap DirtyGapIterator) PrevGap() DirtyGapIterator { + seg := gap.PrevSegment() + if !seg.Ok() { + return DirtyGapIterator{} + } + return seg.PrevGap() +} + +// NextGap returns the iterated gap's successor. If no such gap exists, NextGap +// returns a terminal iterator. +func (gap DirtyGapIterator) NextGap() DirtyGapIterator { + seg := gap.NextSegment() + if !seg.Ok() { + return DirtyGapIterator{} + } + return seg.NextGap() +} + +// segmentBeforePosition returns the predecessor segment of the position given +// by n.children[i], which may or may not contain a child. If no such segment +// exists, segmentBeforePosition returns a terminal iterator. +func DirtysegmentBeforePosition(n *Dirtynode, i int) DirtyIterator { + for i == 0 { + if n.parent == nil { + return DirtyIterator{} + } + n, i = n.parent, n.parentIndex + } + return DirtyIterator{n, i - 1} +} + +// segmentAfterPosition returns the successor segment of the position given by +// n.children[i], which may or may not contain a child. If no such segment +// exists, segmentAfterPosition returns a terminal iterator. +func DirtysegmentAfterPosition(n *Dirtynode, i int) DirtyIterator { + for i == n.nrSegments { + if n.parent == nil { + return DirtyIterator{} + } + n, i = n.parent, n.parentIndex + } + return DirtyIterator{n, i} +} + +func DirtyzeroValueSlice(slice []DirtyInfo) { + + for i := range slice { + dirtySetFunctions{}.ClearValue(&slice[i]) + } +} + +func DirtyzeroNodeSlice(slice []*Dirtynode) { + for i := range slice { + slice[i] = nil + } +} + +// String stringifies a Set for debugging. +func (s *DirtySet) String() string { + return s.root.String() +} + +// String stringifes a node (and all of its children) for debugging. 
+func (n *Dirtynode) String() string { + var buf bytes.Buffer + n.writeDebugString(&buf, "") + return buf.String() +} + +func (n *Dirtynode) writeDebugString(buf *bytes.Buffer, prefix string) { + if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { + buf.WriteString(prefix) + buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) + } + for i := 0; i < n.nrSegments; i++ { + if child := n.children[i]; child != nil { + cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) + if child.parent != n || child.parentIndex != i { + buf.WriteString(cprefix) + buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) + } + child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) + } + buf.WriteString(prefix) + buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) + } + if child := n.children[n.nrSegments]; child != nil { + child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) + } +} + +// SegmentDataSlices represents segments from a set as slices of start, end, and +// values. SegmentDataSlices is primarily used as an intermediate representation +// for save/restore and the layout here is optimized for that. +// +// +stateify savable +type DirtySegmentDataSlices struct { + Start []uint64 + End []uint64 + Values []DirtyInfo +} + +// ExportSortedSlice returns a copy of all segments in the given set, in ascending +// key order. 
+func (s *DirtySet) ExportSortedSlices() *DirtySegmentDataSlices { + var sds DirtySegmentDataSlices + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sds.Start = append(sds.Start, seg.Start()) + sds.End = append(sds.End, seg.End()) + sds.Values = append(sds.Values, seg.Value()) + } + sds.Start = sds.Start[:len(sds.Start):len(sds.Start)] + sds.End = sds.End[:len(sds.End):len(sds.End)] + sds.Values = sds.Values[:len(sds.Values):len(sds.Values)] + return &sds +} + +// ImportSortedSlice initializes the given set from the given slice. +// +// Preconditions: s must be empty. sds must represent a valid set (the segments +// in sds must have valid lengths that do not overlap). The segments in sds +// must be sorted in ascending key order. +func (s *DirtySet) ImportSortedSlices(sds *DirtySegmentDataSlices) error { + if !s.IsEmpty() { + return fmt.Errorf("cannot import into non-empty set %v", s) + } + gap := s.FirstGap() + for i := range sds.Start { + r := __generics_imported0.MappableRange{sds.Start[i], sds.End[i]} + if !gap.Range().IsSupersetOf(r) { + return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i]) + } + gap = s.InsertWithoutMerging(gap, r, sds.Values[i]).NextGap() + } + return nil +} +func (s *DirtySet) saveRoot() *DirtySegmentDataSlices { + return s.ExportSortedSlices() +} + +func (s *DirtySet) loadRoot(sds *DirtySegmentDataSlices) { + if err := s.ImportSortedSlices(sds); err != nil { + panic(err) + } +} diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go new file mode 100644 index 000000000..9381963d0 --- /dev/null +++ b/pkg/sentry/fs/fsutil/file.go @@ -0,0 +1,394 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// FileNoopRelease implements fs.FileOperations.Release for files that have no +// resources to release. +type FileNoopRelease struct{} + +// Release is a no-op. +func (FileNoopRelease) Release() {} + +// SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor +// is not nil and the seek was on a directory, the cursor will be updated. +// +// Currently only seeking to 0 on a directory is supported. +// +// FIXME(b/33075855): Lift directory seeking limitations. +func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64, dirCursor *string) (int64, error) { + inode := file.Dirent.Inode + current := file.Offset() + + // Does the Inode represents a non-seekable type? + if fs.IsPipe(inode.StableAttr) || fs.IsSocket(inode.StableAttr) { + return current, syserror.ESPIPE + } + + // Does the Inode represent a character device? + if fs.IsCharDevice(inode.StableAttr) { + // Ignore seek requests. + // + // FIXME(b/34716638): This preserves existing + // behavior but is not universally correct. + return 0, nil + } + + // Otherwise compute the new offset. 
+ switch whence { + case fs.SeekSet: + switch inode.StableAttr.Type { + case fs.RegularFile, fs.SpecialFile, fs.BlockDevice: + if offset < 0 { + return current, syserror.EINVAL + } + return offset, nil + case fs.Directory, fs.SpecialDirectory: + if offset != 0 { + return current, syserror.EINVAL + } + // SEEK_SET to 0 moves the directory "cursor" to the beginning. + if dirCursor != nil { + *dirCursor = "" + } + return 0, nil + default: + return current, syserror.EINVAL + } + case fs.SeekCurrent: + switch inode.StableAttr.Type { + case fs.RegularFile, fs.SpecialFile, fs.BlockDevice: + if current+offset < 0 { + return current, syserror.EINVAL + } + return current + offset, nil + case fs.Directory, fs.SpecialDirectory: + if offset != 0 { + return current, syserror.EINVAL + } + return current, nil + default: + return current, syserror.EINVAL + } + case fs.SeekEnd: + switch inode.StableAttr.Type { + case fs.RegularFile, fs.BlockDevice: + // Allow the file to determine the end. + uattr, err := inode.UnstableAttr(ctx) + if err != nil { + return current, err + } + sz := uattr.Size + if sz+offset < 0 { + return current, syserror.EINVAL + } + return sz + offset, nil + // FIXME(b/34778850): This is not universally correct. + // Remove SpecialDirectory. + case fs.SpecialDirectory: + if offset != 0 { + return current, syserror.EINVAL + } + // SEEK_END to 0 moves the directory "cursor" to the end. + // + // FIXME(b/35442290): The ensures that after the seek, + // reading on the directory will get EOF. But it is not + // correct in general because the directory can grow in + // size; attempting to read those new entries will be + // futile (EOF will always be the result). + return fs.FileMaxOffset, nil + default: + return current, syserror.EINVAL + } + } + + // Not a valid seek request. + return current, syserror.EINVAL +} + +// FileGenericSeek implements fs.FileOperations.Seek for files that use a +// generic seek implementation. 
type FileGenericSeek struct{}

// Seek implements fs.FileOperations.Seek by delegating to SeekWithDirCursor
// with a nil directory cursor.
func (FileGenericSeek) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) {
	return SeekWithDirCursor(ctx, file, whence, offset, nil)
}

// FileZeroSeek implements fs.FileOperations.Seek for files that maintain a
// constant zero-value offset and require a no-op Seek.
type FileZeroSeek struct{}

// Seek implements fs.FileOperations.Seek. It ignores its arguments and always
// returns offset 0 with a nil error.
func (FileZeroSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) {
	return 0, nil
}

// FileNoSeek implements fs.FileOperations.Seek to return EINVAL.
type FileNoSeek struct{}

// Seek implements fs.FileOperations.Seek. It always fails with
// syserror.EINVAL.
func (FileNoSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) {
	return 0, syserror.EINVAL
}

// FilePipeSeek implements fs.FileOperations.Seek and can be used for files
// that behave like pipes (seeking is not supported).
type FilePipeSeek struct{}

// Seek implements fs.FileOperations.Seek. It always fails with
// syserror.ESPIPE.
func (FilePipeSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) {
	return 0, syserror.ESPIPE
}

// FileNotDirReaddir implements fs.FileOperations.Readdir for non-directories.
type FileNotDirReaddir struct{}

// Readdir implements fs.FileOperations.Readdir. It always fails with
// syserror.ENOTDIR.
func (FileNotDirReaddir) Readdir(context.Context, *fs.File, fs.DentrySerializer) (int64, error) {
	return 0, syserror.ENOTDIR
}

// FileNoFsync implements fs.FileOperations.Fsync for files that don't support
// syncing.
type FileNoFsync struct{}

// Fsync implements fs.FileOperations.Fsync. It always fails with
// syserror.EINVAL.
func (FileNoFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error {
	return syserror.EINVAL
}

// FileNoopFsync implements fs.FileOperations.Fsync for files that don't need
// to be synced.
type FileNoopFsync struct{}

// Fsync implements fs.FileOperations.Fsync.
func (FileNoopFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error {
	return nil
}

// FileNoopFlush implements fs.FileOperations.Flush as a no-op.
type FileNoopFlush struct{}

// Flush implements fs.FileOperations.Flush. It always returns nil.
func (FileNoopFlush) Flush(context.Context, *fs.File) error {
	return nil
}

// FileNoMMap implements fs.FileOperations.ConfigureMMap for files that cannot
// be memory mapped.
type FileNoMMap struct{}

// ConfigureMMap implements fs.FileOperations.ConfigureMMap. It always fails
// with syserror.ENODEV.
func (FileNoMMap) ConfigureMMap(context.Context, *fs.File, *memmap.MMapOpts) error {
	return syserror.ENODEV
}

// GenericConfigureMMap implements fs.FileOperations.ConfigureMMap for most
// filesystems that support memory mapping.
//
// It stores m as opts.Mappable and file as opts.MappingIdentity, and takes a
// reference on file via IncRef. It always returns nil.
func GenericConfigureMMap(file *fs.File, m memmap.Mappable, opts *memmap.MMapOpts) error {
	opts.Mappable = m
	opts.MappingIdentity = file
	// NOTE(review): presumably this reference is held on behalf of
	// opts.MappingIdentity and released by its consumer; confirm against the
	// callers.
	file.IncRef()
	return nil
}

// FileNoIoctl implements fs.FileOperations.Ioctl for files that don't
// implement the ioctl syscall.
type FileNoIoctl struct{}

// Ioctl implements fs.FileOperations.Ioctl. It always fails with
// syserror.ENOTTY.
func (FileNoIoctl) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	return 0, syserror.ENOTTY
}

// FileNoSplice implements fs.FileOperations.ReadFrom and
// fs.FileOperations.WriteTo for files that don't support splice.
type FileNoSplice struct{}

// WriteTo implements fs.FileOperations.WriteTo. It always fails with
// syserror.ENOSYS.
func (FileNoSplice) WriteTo(context.Context, *fs.File, *fs.File, fs.SpliceOpts) (int64, error) {
	return 0, syserror.ENOSYS
}

// ReadFrom implements fs.FileOperations.ReadFrom. It always fails with
// syserror.ENOSYS.
func (FileNoSplice) ReadFrom(context.Context, *fs.File, *fs.File, fs.SpliceOpts) (int64, error) {
	return 0, syserror.ENOSYS
}

// DirFileOperations implements most of fs.FileOperations for directories,
// except for Readdir and UnstableAttr which the embedding type must implement.
type DirFileOperations struct {
	waiter.AlwaysReady
	FileGenericSeek
	FileNoIoctl
	FileNoMMap
	FileNoopFlush
	FileNoopFsync
	FileNoopRelease
	FileNoSplice
}

// Read implements fs.FileOperations.Read. It always fails with
// syserror.EISDIR.
func (*DirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
	return 0, syserror.EISDIR
}

// Write implements fs.FileOperations.Write. It always fails with
// syserror.EISDIR.
func (*DirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
	return 0, syserror.EISDIR
}

// StaticDirFileOperations implements fs.FileOperations for directories with
// static children.
//
// +stateify savable
type StaticDirFileOperations struct {
	DirFileOperations        `state:"nosave"`
	FileUseInodeUnstableAttr `state:"nosave"`

	// dentryMap is a SortedDentryMap used to implement Readdir.
	dentryMap *fs.SortedDentryMap

	// dirCursor contains the name of the last directory entry that was
	// serialized.
	dirCursor string
}

// NewStaticDirFileOperations returns a new StaticDirFileOperations that will
// iterate the given dentry map.
func NewStaticDirFileOperations(dentries *fs.SortedDentryMap) *StaticDirFileOperations {
	return &StaticDirFileOperations{
		dentryMap: dentries,
	}
}

// IterateDir implements DirIterator.IterateDir. It passes dirCtx and the
// stored dentry map to fs.GenericReaddir and returns offset plus the count
// that call reports, together with its error.
func (sdfo *StaticDirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) {
	n, err := fs.GenericReaddir(dirCtx, sdfo.dentryMap)
	return offset + n, err
}

// Readdir implements fs.FileOperations.Readdir.
+func (sdfo *StaticDirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + if root != nil { + defer root.DecRef() + } + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &sdfo.dirCursor, + } + return fs.DirentReaddir(ctx, file.Dirent, sdfo, root, dirCtx, file.Offset()) +} + +// NoReadWriteFile is a file that does not support reading or writing. +// +// +stateify savable +type NoReadWriteFile struct { + waiter.AlwaysReady `state:"nosave"` + FileGenericSeek `state:"nosave"` + FileNoIoctl `state:"nosave"` + FileNoMMap `state:"nosave"` + FileNoopFsync `state:"nosave"` + FileNoopFlush `state:"nosave"` + FileNoopRelease `state:"nosave"` + FileNoRead `state:"nosave"` + FileNoWrite `state:"nosave"` + FileNotDirReaddir `state:"nosave"` + FileUseInodeUnstableAttr `state:"nosave"` + FileNoSplice `state:"nosave"` +} + +var _ fs.FileOperations = (*NoReadWriteFile)(nil) + +// FileStaticContentReader is a helper to implement fs.FileOperations.Read with +// static content. +// +// +stateify savable +type FileStaticContentReader struct { + // content is immutable. + content []byte +} + +// NewFileStaticContentReader initializes a FileStaticContentReader with the +// given content. +func NewFileStaticContentReader(b []byte) FileStaticContentReader { + return FileStaticContentReader{ + content: b, + } +} + +// Read implements fs.FileOperations.Read. +func (scr *FileStaticContentReader) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if offset >= int64(len(scr.content)) { + return 0, nil + } + n, err := dst.CopyOut(ctx, scr.content[offset:]) + return int64(n), err +} + +// FileNoopWrite implements fs.FileOperations.Write as a noop. +type FileNoopWrite struct{} + +// Write implements fs.FileOperations.Write. 
+func (FileNoopWrite) Write(_ context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + return src.NumBytes(), nil +} + +// FileNoRead implements fs.FileOperations.Read to return EINVAL. +type FileNoRead struct{} + +// Read implements fs.FileOperations.Read. +func (FileNoRead) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EINVAL +} + +// FileNoWrite implements fs.FileOperations.Write to return EINVAL. +type FileNoWrite struct{} + +// Write implements fs.FileOperations.Write. +func (FileNoWrite) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EINVAL +} + +// FileNoopRead implement fs.FileOperations.Read as a noop. +type FileNoopRead struct{} + +// Read implements fs.FileOperations.Read. +func (FileNoopRead) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, nil +} + +// FileUseInodeUnstableAttr implements fs.FileOperations.UnstableAttr by calling +// InodeOperations.UnstableAttr. +type FileUseInodeUnstableAttr struct{} + +// UnstableAttr implements fs.FileOperations.UnstableAttr. +func (FileUseInodeUnstableAttr) UnstableAttr(ctx context.Context, file *fs.File) (fs.UnstableAttr, error) { + return file.Dirent.Inode.UnstableAttr(ctx) +} diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go new file mode 100644 index 000000000..b5ac6c71c --- /dev/null +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -0,0 +1,209 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "fmt" + "io" + "math" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// FileRangeSet maps offsets into a memmap.Mappable to offsets into a +// platform.File. It is used to implement Mappables that store data in +// sparsely-allocated memory. +// +// type FileRangeSet <generated by go_generics> + +// fileRangeSetFunctions implements segment.Functions for FileRangeSet. +type fileRangeSetFunctions struct{} + +// MinKey implements segment.Functions.MinKey. +func (fileRangeSetFunctions) MinKey() uint64 { + return 0 +} + +// MaxKey implements segment.Functions.MaxKey. +func (fileRangeSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +// ClearValue implements segment.Functions.ClearValue. +func (fileRangeSetFunctions) ClearValue(_ *uint64) { +} + +// Merge implements segment.Functions.Merge. +func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) { + if frstart1+mr1.Length() != frstart2 { + return 0, false + } + return frstart1, true +} + +// Split implements segment.Functions.Split. 
+func (fileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) { + return frstart, frstart + (split - mr.Start) +} + +// FileRange returns the FileRange mapped by seg. +func (seg FileRangeIterator) FileRange() platform.FileRange { + return seg.FileRangeOf(seg.Range()) +} + +// FileRangeOf returns the FileRange mapped by mr. +// +// Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0. +func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileRange { + frstart := seg.Value() + (mr.Start - seg.Start()) + return platform.FileRange{frstart, frstart + mr.Length()} +} + +// Fill attempts to ensure that all memmap.Mappable offsets in required are +// mapped to a platform.File offset, by allocating from mf with the given +// memory usage kind and invoking readAt to store data into memory. (If readAt +// returns a successful partial read, Fill will call it repeatedly until all +// bytes have been read.) EOF is handled consistently with the requirements of +// mmap(2): bytes after EOF on the same page are zeroed; pages after EOF are +// invalid. +// +// Fill may read offsets outside of required, but will never read offsets +// outside of optional. It returns a non-nil error if any error occurs, even +// if the error only affects offsets in optional, but not in required. +// +// Preconditions: required.Length() > 0. optional.IsSupersetOf(required). +// required and optional must be page-aligned. +func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error { + gap := frs.LowerBoundGap(required.Start) + for gap.Ok() && gap.Start() < required.End { + if gap.Range().Length() == 0 { + gap = gap.NextGap() + continue + } + gr := gap.Range().Intersect(optional) + + // Read data into the gap. 
+ fr, err := mf.AllocateAndFill(gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { + var done uint64 + for !dsts.IsEmpty() { + n, err := readAt(ctx, dsts, gr.Start+done) + done += n + dsts = dsts.DropFirst64(n) + if err != nil { + if err == io.EOF { + // MemoryFile.AllocateAndFill truncates down to a page + // boundary, but FileRangeSet.Fill is supposed to + // zero-fill to the end of the page in this case. + donepgaddr, ok := usermem.Addr(done).RoundUp() + if donepg := uint64(donepgaddr); ok && donepg != done { + dsts.DropFirst64(donepg - done) + done = donepg + if dsts.IsEmpty() { + return done, nil + } + } + } + return done, err + } + } + return done, nil + })) + + // Store anything we managed to read into the cache. + if done := fr.Length(); done != 0 { + gr.End = gr.Start + done + gap = frs.Insert(gap, gr, fr.Start).NextGap() + } + + if err != nil { + return err + } + } + return nil +} + +// Drop removes segments for memmap.Mappable offsets in mr, freeing the +// corresponding platform.FileRanges. +// +// Preconditions: mr must be page-aligned. +func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) { + seg := frs.LowerBoundSegment(mr.Start) + for seg.Ok() && seg.Start() < mr.End { + seg = frs.Isolate(seg, mr) + mf.DecRef(seg.FileRange()) + seg = frs.Remove(seg).NextSegment() + } +} + +// DropAll removes all segments in mr, freeing the corresponding +// platform.FileRanges. +func (frs *FileRangeSet) DropAll(mf *pgalloc.MemoryFile) { + for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + mf.DecRef(seg.FileRange()) + } + frs.RemoveAll() +} + +// Truncate updates frs to reflect Mappable truncation to the given length: +// bytes after the new EOF on the same page are zeroed, and pages after the new +// EOF are freed. 
+func (frs *FileRangeSet) Truncate(end uint64, mf *pgalloc.MemoryFile) { + pgendaddr, ok := usermem.Addr(end).RoundUp() + if ok { + pgend := uint64(pgendaddr) + + // Free truncated pages. + frs.SplitAt(pgend) + seg := frs.LowerBoundSegment(pgend) + for seg.Ok() { + mf.DecRef(seg.FileRange()) + seg = frs.Remove(seg).NextSegment() + } + + if end == pgend { + return + } + } + + // Here we know end < end.RoundUp(). If the new EOF lands in the + // middle of a page that we have, zero out its contents beyond the new + // length. + seg := frs.FindSegment(end) + if seg.Ok() { + fr := seg.FileRange() + fr.Start += end - seg.Start() + ims, err := mf.MapInternal(fr, usermem.Write) + if err != nil { + // There's no good recourse from here. This means + // that we can't keep cached memory consistent with + // the new end of file. The caller may have already + // updated the file size on their backing file system. + // + // We don't want to risk blindly continuing onward, + // so in the extremely rare cases this does happen, + // we abandon ship. + panic(fmt.Sprintf("Failed to map %v: %v", fr, err)) + } + if _, err := safemem.ZeroSeq(ims); err != nil { + panic(fmt.Sprintf("Zeroing %v failed: %v", fr, err)) + } + } +} diff --git a/pkg/sentry/fs/fsutil/file_range_set_impl.go b/pkg/sentry/fs/fsutil/file_range_set_impl.go new file mode 100755 index 000000000..a0ab61628 --- /dev/null +++ b/pkg/sentry/fs/fsutil/file_range_set_impl.go @@ -0,0 +1,1274 @@ +package fsutil + +import ( + __generics_imported0 "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" +) + +import ( + "bytes" + "fmt" +) + +const ( + // minDegree is the minimum degree of an internal node in a Set B-tree. + // + // - Any non-root node has at least minDegree-1 segments. + // + // - Any non-root internal (non-leaf) node has at least minDegree children. + // + // - The root node may have fewer than minDegree-1 segments, but it may + // only have 0 segments if the tree is empty. 
+ // + // Our implementation requires minDegree >= 3. Higher values of minDegree + // usually improve performance, but increase memory usage for small sets. + FileRangeminDegree = 3 + + FileRangemaxDegree = 2 * FileRangeminDegree +) + +// A Set is a mapping of segments with non-overlapping Range keys. The zero +// value for a Set is an empty set. Set values are not safely movable nor +// copyable. Set is thread-compatible. +// +// +stateify savable +type FileRangeSet struct { + root FileRangenode `state:".(*FileRangeSegmentDataSlices)"` +} + +// IsEmpty returns true if the set contains no segments. +func (s *FileRangeSet) IsEmpty() bool { + return s.root.nrSegments == 0 +} + +// IsEmptyRange returns true iff no segments in the set overlap the given +// range. This is semantically equivalent to s.SpanRange(r) == 0, but may be +// more efficient. +func (s *FileRangeSet) IsEmptyRange(r __generics_imported0.MappableRange) bool { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return true + } + _, gap := s.Find(r.Start) + if !gap.Ok() { + return false + } + return r.End <= gap.End() +} + +// Span returns the total size of all segments in the set. +func (s *FileRangeSet) Span() uint64 { + var sz uint64 + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sz += seg.Range().Length() + } + return sz +} + +// SpanRange returns the total size of the intersection of segments in the set +// with the given range. +func (s *FileRangeSet) SpanRange(r __generics_imported0.MappableRange) uint64 { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return 0 + } + var sz uint64 + for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { + sz += seg.Range().Intersect(r).Length() + } + return sz +} + +// FirstSegment returns the first segment in the set. If the set is empty, +// FirstSegment returns a terminal iterator. 
+func (s *FileRangeSet) FirstSegment() FileRangeIterator { + if s.root.nrSegments == 0 { + return FileRangeIterator{} + } + return s.root.firstSegment() +} + +// LastSegment returns the last segment in the set. If the set is empty, +// LastSegment returns a terminal iterator. +func (s *FileRangeSet) LastSegment() FileRangeIterator { + if s.root.nrSegments == 0 { + return FileRangeIterator{} + } + return s.root.lastSegment() +} + +// FirstGap returns the first gap in the set. +func (s *FileRangeSet) FirstGap() FileRangeGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[0] + } + return FileRangeGapIterator{n, 0} +} + +// LastGap returns the last gap in the set. +func (s *FileRangeSet) LastGap() FileRangeGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[n.nrSegments] + } + return FileRangeGapIterator{n, n.nrSegments} +} + +// Find returns the segment or gap whose range contains the given key. If a +// segment is found, the returned Iterator is non-terminal and the +// returned GapIterator is terminal. Otherwise, the returned Iterator is +// terminal and the returned GapIterator is non-terminal. +func (s *FileRangeSet) Find(key uint64) (FileRangeIterator, FileRangeGapIterator) { + n := &s.root + for { + + lower := 0 + upper := n.nrSegments + for lower < upper { + i := lower + (upper-lower)/2 + if r := n.keys[i]; key < r.End { + if key >= r.Start { + return FileRangeIterator{n, i}, FileRangeGapIterator{} + } + upper = i + } else { + lower = i + 1 + } + } + i := lower + if !n.hasChildren { + return FileRangeIterator{}, FileRangeGapIterator{n, i} + } + n = n.children[i] + } +} + +// FindSegment returns the segment whose range contains the given key. If no +// such segment exists, FindSegment returns a terminal iterator. 
+func (s *FileRangeSet) FindSegment(key uint64) FileRangeIterator { + seg, _ := s.Find(key) + return seg +} + +// LowerBoundSegment returns the segment with the lowest range that contains a +// key greater than or equal to min. If no such segment exists, +// LowerBoundSegment returns a terminal iterator. +func (s *FileRangeSet) LowerBoundSegment(min uint64) FileRangeIterator { + seg, gap := s.Find(min) + if seg.Ok() { + return seg + } + return gap.NextSegment() +} + +// UpperBoundSegment returns the segment with the highest range that contains a +// key less than or equal to max. If no such segment exists, UpperBoundSegment +// returns a terminal iterator. +func (s *FileRangeSet) UpperBoundSegment(max uint64) FileRangeIterator { + seg, gap := s.Find(max) + if seg.Ok() { + return seg + } + return gap.PrevSegment() +} + +// FindGap returns the gap containing the given key. If no such gap exists +// (i.e. the set contains a segment containing that key), FindGap returns a +// terminal iterator. +func (s *FileRangeSet) FindGap(key uint64) FileRangeGapIterator { + _, gap := s.Find(key) + return gap +} + +// LowerBoundGap returns the gap with the lowest range that is greater than or +// equal to min. +func (s *FileRangeSet) LowerBoundGap(min uint64) FileRangeGapIterator { + seg, gap := s.Find(min) + if gap.Ok() { + return gap + } + return seg.NextGap() +} + +// UpperBoundGap returns the gap with the highest range that is less than or +// equal to max. +func (s *FileRangeSet) UpperBoundGap(max uint64) FileRangeGapIterator { + seg, gap := s.Find(max) + if gap.Ok() { + return gap + } + return seg.PrevGap() +} + +// Add inserts the given segment into the set and returns true. If the new +// segment can be merged with adjacent segments, Add will do so. If the new +// segment would overlap an existing segment, Add returns false. If Add +// succeeds, all existing iterators are invalidated. 
+func (s *FileRangeSet) Add(r __generics_imported0.MappableRange, val uint64) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.Insert(gap, r, val) + return true +} + +// AddWithoutMerging inserts the given segment into the set and returns true. +// If it would overlap an existing segment, AddWithoutMerging does nothing and +// returns false. If AddWithoutMerging succeeds, all existing iterators are +// invalidated. +func (s *FileRangeSet) AddWithoutMerging(r __generics_imported0.MappableRange, val uint64) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.InsertWithoutMergingUnchecked(gap, r, val) + return true +} + +// Insert inserts the given segment into the given gap. If the new segment can +// be merged with adjacent segments, Insert will do so. Insert returns an +// iterator to the segment containing the inserted value (which may have been +// merged with other values). All existing iterators (including gap, but not +// including the returned iterator) are invalidated. +// +// If the gap cannot accommodate the segment, or if r is invalid, Insert panics. +// +// Insert is semantically equivalent to a InsertWithoutMerging followed by a +// Merge, but may be more efficient. Note that there is no unchecked variant of +// Insert since Insert must retrieve and inspect gap's predecessor and +// successor segments regardless. 
+func (s *FileRangeSet) Insert(gap FileRangeGapIterator, r __generics_imported0.MappableRange, val uint64) FileRangeIterator { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + prev, next := gap.PrevSegment(), gap.NextSegment() + if prev.Ok() && prev.End() > r.Start { + panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) + } + if next.Ok() && next.Start() < r.End { + panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) + } + if prev.Ok() && prev.End() == r.Start { + if mval, ok := (fileRangeSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { + prev.SetEndUnchecked(r.End) + prev.SetValue(mval) + if next.Ok() && next.Start() == r.End { + val = mval + if mval, ok := (fileRangeSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { + prev.SetEndUnchecked(next.End()) + prev.SetValue(mval) + return s.Remove(next).PrevSegment() + } + } + return prev + } + } + if next.Ok() && next.Start() == r.End { + if mval, ok := (fileRangeSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { + next.SetStartUnchecked(r.Start) + next.SetValue(mval) + return next + } + } + return s.InsertWithoutMergingUnchecked(gap, r, val) +} + +// InsertWithoutMerging inserts the given segment into the given gap and +// returns an iterator to the inserted segment. All existing iterators +// (including gap, but not including the returned iterator) are invalidated. +// +// If the gap cannot accommodate the segment, or if r is invalid, +// InsertWithoutMerging panics. 
+func (s *FileRangeSet) InsertWithoutMerging(gap FileRangeGapIterator, r __generics_imported0.MappableRange, val uint64) FileRangeIterator { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + if gr := gap.Range(); !gr.IsSupersetOf(r) { + panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) + } + return s.InsertWithoutMergingUnchecked(gap, r, val) +} + +// InsertWithoutMergingUnchecked inserts the given segment into the given gap +// and returns an iterator to the inserted segment. All existing iterators +// (including gap, but not including the returned iterator) are invalidated. +// +// Preconditions: r.Start >= gap.Start(); r.End <= gap.End(). +func (s *FileRangeSet) InsertWithoutMergingUnchecked(gap FileRangeGapIterator, r __generics_imported0.MappableRange, val uint64) FileRangeIterator { + gap = gap.node.rebalanceBeforeInsert(gap) + copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) + copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) + gap.node.keys[gap.index] = r + gap.node.values[gap.index] = val + gap.node.nrSegments++ + return FileRangeIterator{gap.node, gap.index} +} + +// Remove removes the given segment and returns an iterator to the vacated gap. +// All existing iterators (including seg, but not including the returned +// iterator) are invalidated. 
+func (s *FileRangeSet) Remove(seg FileRangeIterator) FileRangeGapIterator { + + if seg.node.hasChildren { + + victim := seg.PrevSegment() + + seg.SetRangeUnchecked(victim.Range()) + seg.SetValue(victim.Value()) + return s.Remove(victim).NextGap() + } + copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) + copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) + fileRangeSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) + seg.node.nrSegments-- + return seg.node.rebalanceAfterRemove(FileRangeGapIterator{seg.node, seg.index}) +} + +// RemoveAll removes all segments from the set. All existing iterators are +// invalidated. +func (s *FileRangeSet) RemoveAll() { + s.root = FileRangenode{} +} + +// RemoveRange removes all segments in the given range. An iterator to the +// newly formed gap is returned, and all existing iterators are invalidated. +func (s *FileRangeSet) RemoveRange(r __generics_imported0.MappableRange) FileRangeGapIterator { + seg, gap := s.Find(r.Start) + if seg.Ok() { + seg = s.Isolate(seg, r) + gap = s.Remove(seg) + } + for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { + seg = s.Isolate(seg, r) + gap = s.Remove(seg) + } + return gap +} + +// Merge attempts to merge two neighboring segments. If successful, Merge +// returns an iterator to the merged segment, and all existing iterators are +// invalidated. Otherwise, Merge returns a terminal iterator. +// +// If first is not the predecessor of second, Merge panics. +func (s *FileRangeSet) Merge(first, second FileRangeIterator) FileRangeIterator { + if first.NextSegment() != second { + panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) + } + return s.MergeUnchecked(first, second) +} + +// MergeUnchecked attempts to merge two neighboring segments. 
If successful, +// MergeUnchecked returns an iterator to the merged segment, and all existing +// iterators are invalidated. Otherwise, MergeUnchecked returns a terminal +// iterator. +// +// Precondition: first is the predecessor of second: first.NextSegment() == +// second, first == second.PrevSegment(). +func (s *FileRangeSet) MergeUnchecked(first, second FileRangeIterator) FileRangeIterator { + if first.End() == second.Start() { + if mval, ok := (fileRangeSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { + + first.SetEndUnchecked(second.End()) + first.SetValue(mval) + return s.Remove(second).PrevSegment() + } + } + return FileRangeIterator{} +} + +// MergeAll attempts to merge all adjacent segments in the set. All existing +// iterators are invalidated. +func (s *FileRangeSet) MergeAll() { + seg := s.FirstSegment() + if !seg.Ok() { + return + } + next := seg.NextSegment() + for next.Ok() { + if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { + seg, next = mseg, mseg.NextSegment() + } else { + seg, next = next, next.NextSegment() + } + } +} + +// MergeRange attempts to merge all adjacent segments that contain a key in the +// specific range. All existing iterators are invalidated. +func (s *FileRangeSet) MergeRange(r __generics_imported0.MappableRange) { + seg := s.LowerBoundSegment(r.Start) + if !seg.Ok() { + return + } + next := seg.NextSegment() + for next.Ok() && next.Range().Start < r.End { + if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { + seg, next = mseg, mseg.NextSegment() + } else { + seg, next = next, next.NextSegment() + } + } +} + +// MergeAdjacent attempts to merge the segment containing r.Start with its +// predecessor, and the segment containing r.End-1 with its successor. 
+func (s *FileRangeSet) MergeAdjacent(r __generics_imported0.MappableRange) { + first := s.FindSegment(r.Start) + if first.Ok() { + if prev := first.PrevSegment(); prev.Ok() { + s.Merge(prev, first) + } + } + last := s.FindSegment(r.End - 1) + if last.Ok() { + if next := last.NextSegment(); next.Ok() { + s.Merge(last, next) + } + } +} + +// Split splits the given segment at the given key and returns iterators to the +// two resulting segments. All existing iterators (including seg, but not +// including the returned iterators) are invalidated. +// +// If the segment cannot be split at split (because split is at the start or +// end of the segment's range, so splitting would produce a segment with zero +// length, or because split falls outside the segment's range altogether), +// Split panics. +func (s *FileRangeSet) Split(seg FileRangeIterator, split uint64) (FileRangeIterator, FileRangeIterator) { + if !seg.Range().CanSplitAt(split) { + panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) + } + return s.SplitUnchecked(seg, split) +} + +// SplitUnchecked splits the given segment at the given key and returns +// iterators to the two resulting segments. All existing iterators (including +// seg, but not including the returned iterators) are invalidated. +// +// Preconditions: seg.Start() < key < seg.End(). +func (s *FileRangeSet) SplitUnchecked(seg FileRangeIterator, split uint64) (FileRangeIterator, FileRangeIterator) { + val1, val2 := (fileRangeSetFunctions{}).Split(seg.Range(), seg.Value(), split) + end2 := seg.End() + seg.SetEndUnchecked(split) + seg.SetValue(val1) + seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.MappableRange{split, end2}, val2) + + return seg2.PrevSegment(), seg2 +} + +// SplitAt splits the segment straddling split, if one exists. SplitAt returns +// true if a segment was split and false otherwise. If SplitAt splits a +// segment, all existing iterators are invalidated. 
+func (s *FileRangeSet) SplitAt(split uint64) bool { + if seg := s.FindSegment(split); seg.Ok() && seg.Range().CanSplitAt(split) { + s.SplitUnchecked(seg, split) + return true + } + return false +} + +// Isolate ensures that the given segment's range does not escape r by +// splitting at r.Start and r.End if necessary, and returns an updated iterator +// to the bounded segment. All existing iterators (including seg, but not +// including the returned iterators) are invalidated. +func (s *FileRangeSet) Isolate(seg FileRangeIterator, r __generics_imported0.MappableRange) FileRangeIterator { + if seg.Range().CanSplitAt(r.Start) { + _, seg = s.SplitUnchecked(seg, r.Start) + } + if seg.Range().CanSplitAt(r.End) { + seg, _ = s.SplitUnchecked(seg, r.End) + } + return seg +} + +// ApplyContiguous applies a function to a contiguous range of segments, +// splitting if necessary. The function is applied until the first gap is +// encountered, at which point the gap is returned. If the function is applied +// across the entire range, a terminal gap is returned. All existing iterators +// are invalidated. +// +// N.B. The Iterator must not be invalidated by the function. +func (s *FileRangeSet) ApplyContiguous(r __generics_imported0.MappableRange, fn func(seg FileRangeIterator)) FileRangeGapIterator { + seg, gap := s.Find(r.Start) + if !seg.Ok() { + return gap + } + for { + seg = s.Isolate(seg, r) + fn(seg) + if seg.End() >= r.End { + return FileRangeGapIterator{} + } + gap = seg.NextGap() + if !gap.IsEmpty() { + return gap + } + seg = gap.NextSegment() + if !seg.Ok() { + + return FileRangeGapIterator{} + } + } +} + +// +stateify savable +type FileRangenode struct { + // An internal binary tree node looks like: + // + // K + // / \ + // Cl Cr + // + // where all keys in the subtree rooted by Cl (the left subtree) are less + // than K (the key of the parent node), and all keys in the subtree rooted + // by Cr (the right subtree) are greater than K. 
+ // + // An internal B-tree node's indexes work out to look like: + // + // K0 K1 K2 ... Kn-1 + // / \/ \/ \ ... / \ + // C0 C1 C2 C3 ... Cn-1 Cn + // + // where n is nrSegments. + nrSegments int + + // parent is a pointer to this node's parent. If this node is root, parent + // is nil. + parent *FileRangenode + + // parentIndex is the index of this node in parent.children. + parentIndex int + + // Flag for internal nodes that is technically redundant with "children[0] + // != nil", but is stored in the first cache line. "hasChildren" rather + // than "isLeaf" because false must be the correct value for an empty root. + hasChildren bool + + // Nodes store keys and values in separate arrays to maximize locality in + // the common case (scanning keys for lookup). + keys [FileRangemaxDegree - 1]__generics_imported0.MappableRange + values [FileRangemaxDegree - 1]uint64 + children [FileRangemaxDegree]*FileRangenode +} + +// firstSegment returns the first segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *FileRangenode) firstSegment() FileRangeIterator { + for n.hasChildren { + n = n.children[0] + } + return FileRangeIterator{n, 0} +} + +// lastSegment returns the last segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *FileRangenode) lastSegment() FileRangeIterator { + for n.hasChildren { + n = n.children[n.nrSegments] + } + return FileRangeIterator{n, n.nrSegments - 1} +} + +func (n *FileRangenode) prevSibling() *FileRangenode { + if n.parent == nil || n.parentIndex == 0 { + return nil + } + return n.parent.children[n.parentIndex-1] +} + +func (n *FileRangenode) nextSibling() *FileRangenode { + if n.parent == nil || n.parentIndex == n.parent.nrSegments { + return nil + } + return n.parent.children[n.parentIndex+1] +} + +// rebalanceBeforeInsert splits n and its ancestors if they are full, as +// required for insertion, and returns an updated iterator to the position +// represented by gap. 
func (n *FileRangenode) rebalanceBeforeInsert(gap FileRangeGapIterator) FileRangeGapIterator {
	// Handle ancestors first, so that n's parent is known to have room for
	// one more segment by the time n itself is split.
	if n.parent != nil {
		gap = n.parent.rebalanceBeforeInsert(gap)
	}
	// Nothing to do unless n is full.
	if n.nrSegments < FileRangemaxDegree-1 {
		return gap
	}
	if n.parent == nil {
		// n is the root. Split it in place: move its lower and upper halves
		// into two new children and keep only the median segment in n, so the
		// root node itself stays where it is.
		left := &FileRangenode{
			nrSegments:  FileRangeminDegree - 1,
			parent:      n,
			parentIndex: 0,
			hasChildren: n.hasChildren,
		}
		right := &FileRangenode{
			nrSegments:  FileRangeminDegree - 1,
			parent:      n,
			parentIndex: 1,
			hasChildren: n.hasChildren,
		}
		copy(left.keys[:FileRangeminDegree-1], n.keys[:FileRangeminDegree-1])
		copy(left.values[:FileRangeminDegree-1], n.values[:FileRangeminDegree-1])
		copy(right.keys[:FileRangeminDegree-1], n.keys[FileRangeminDegree:])
		copy(right.values[:FileRangeminDegree-1], n.values[FileRangeminDegree:])
		// The median segment becomes the root's only segment.
		n.keys[0], n.values[0] = n.keys[FileRangeminDegree-1], n.values[FileRangeminDegree-1]
		FileRangezeroValueSlice(n.values[1:])
		if n.hasChildren {
			// Hand the old children over to the new halves and re-parent them.
			copy(left.children[:FileRangeminDegree], n.children[:FileRangeminDegree])
			copy(right.children[:FileRangeminDegree], n.children[FileRangeminDegree:])
			FileRangezeroNodeSlice(n.children[2:])
			for i := 0; i < FileRangeminDegree; i++ {
				left.children[i].parent = left
				left.children[i].parentIndex = i
				right.children[i].parent = right
				right.children[i].parentIndex = i
			}
		}
		n.nrSegments = 1
		n.hasChildren = true
		n.children[0] = left
		n.children[1] = right
		// Translate gap if it pointed into the node that was just split.
		if gap.node != n {
			return gap
		}
		if gap.index < FileRangeminDegree {
			return FileRangeGapIterator{left, gap.index}
		}
		return FileRangeGapIterator{right, gap.index - FileRangeminDegree}
	}

	// n is not the root. Push n's median segment up into the parent (shifting
	// the parent's later segments and children right by one) and move n's
	// upper half into a new sibling placed immediately after n.
	copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments])
	copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments])
	n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[FileRangeminDegree-1], n.values[FileRangeminDegree-1]
	copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1])
	// Children that moved right need their parentIndex refreshed.
	for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ {
		n.parent.children[i].parentIndex = i
	}
	sibling := &FileRangenode{
		nrSegments:  FileRangeminDegree - 1,
		parent:      n.parent,
		parentIndex: n.parentIndex + 1,
		hasChildren: n.hasChildren,
	}
	n.parent.children[n.parentIndex+1] = sibling
	n.parent.nrSegments++
	copy(sibling.keys[:FileRangeminDegree-1], n.keys[FileRangeminDegree:])
	copy(sibling.values[:FileRangeminDegree-1], n.values[FileRangeminDegree:])
	FileRangezeroValueSlice(n.values[FileRangeminDegree-1:])
	if n.hasChildren {
		// Move the upper half of n's children to sibling and re-parent them.
		copy(sibling.children[:FileRangeminDegree], n.children[FileRangeminDegree:])
		FileRangezeroNodeSlice(n.children[FileRangeminDegree:])
		for i := 0; i < FileRangeminDegree; i++ {
			sibling.children[i].parent = sibling
			sibling.children[i].parentIndex = i
		}
	}
	n.nrSegments = FileRangeminDegree - 1

	// Translate gap if it pointed into the half that moved to sibling.
	if gap.node != n {
		return gap
	}
	if gap.index < FileRangeminDegree {
		return gap
	}
	return FileRangeGapIterator{sibling, gap.index - FileRangeminDegree}
}

// rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient
// (contain fewer segments than required by B-tree invariants), as required for
// removal, and returns an updated iterator to the position represented by gap.
//
// Precondition: n is the only node in the tree that may currently violate a
// B-tree invariant.
func (n *FileRangenode) rebalanceAfterRemove(gap FileRangeGapIterator) FileRangeGapIterator {
	// Each pass repairs n; if the repair removes a segment from n's parent,
	// the loop continues with the parent (see "n = p" at the bottom).
	for {
		// n already holds at least minDegree-1 segments: nothing to fix.
		if n.nrSegments >= FileRangeminDegree-1 {
			return gap
		}
		// n has no parent, hence no siblings to borrow from or merge with.
		if n.parent == nil {

			return gap
		}

		// Case 1: borrow from the previous sibling. The parent's separating
		// segment moves down to the front of n, and the sibling's last
		// segment moves up to take the separator's place.
		// (presumably prevSibling returns nil when n is the first child)
		if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= FileRangeminDegree {
			copy(n.keys[1:], n.keys[:n.nrSegments])
			copy(n.values[1:], n.values[:n.nrSegments])
			n.keys[0] = n.parent.keys[n.parentIndex-1]
			n.values[0] = n.parent.values[n.parentIndex-1]
			n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1]
			n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1]
			fileRangeSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
			if n.hasChildren {
				// The sibling's last child follows its last segment over to
				// n; every child of n shifts one slot to the right.
				copy(n.children[1:], n.children[:n.nrSegments+1])
				n.children[0] = sibling.children[sibling.nrSegments]
				sibling.children[sibling.nrSegments] = nil
				n.children[0].parent = n
				n.children[0].parentIndex = 0
				for i := 1; i < n.nrSegments+2; i++ {
					n.children[i].parentIndex = i
				}
			}
			n.nrSegments++
			sibling.nrSegments--
			// Translate gap: the sibling's former last gap is now n's first
			// gap, and every gap already in n moved right by one.
			if gap.node == sibling && gap.index == sibling.nrSegments {
				return FileRangeGapIterator{n, 0}
			}
			if gap.node == n {
				return FileRangeGapIterator{n, gap.index + 1}
			}
			return gap
		}
		// Case 2: borrow from the next sibling (mirror image of case 1). The
		// separator moves down to the end of n, and the sibling's first
		// segment moves up into the parent.
		if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= FileRangeminDegree {
			n.keys[n.nrSegments] = n.parent.keys[n.parentIndex]
			n.values[n.nrSegments] = n.parent.values[n.parentIndex]
			n.parent.keys[n.parentIndex] = sibling.keys[0]
			n.parent.values[n.parentIndex] = sibling.values[0]
			copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:])
			copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:])
			fileRangeSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
			if n.hasChildren {
				n.children[n.nrSegments+1] = sibling.children[0]
				copy(sibling.children[:sibling.nrSegments], sibling.children[1:])
				sibling.children[sibling.nrSegments] = nil
				n.children[n.nrSegments+1].parent = n
				n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1
				for i := 0; i < sibling.nrSegments; i++ {
					sibling.children[i].parentIndex = i
				}
			}
			n.nrSegments++
			sibling.nrSegments--
			// Translate gap: the sibling's former first gap is now n's last
			// gap, and the sibling's remaining gaps moved left by one.
			if gap.node == sibling {
				if gap.index == 0 {
					return FileRangeGapIterator{n, n.nrSegments}
				}
				return FileRangeGapIterator{sibling, gap.index - 1}
			}
			return gap
		}

		// Neither sibling can spare a segment, so n must be merged.
		p := n.parent
		if p.nrSegments == 1 {
			// Case 3: p holds a single segment and so has exactly two
			// children; fold both children and the separator into p itself,
			// reducing the height of this subtree by one.
			// NOTE(review): this appears to rely on only the root being able
			// to hold a single segment — confirm against the minDegree
			// invariants.

			left, right := p.children[0], p.children[1]
			p.nrSegments = left.nrSegments + right.nrSegments + 1
			p.hasChildren = left.hasChildren
			p.keys[left.nrSegments] = p.keys[0]
			p.values[left.nrSegments] = p.values[0]
			copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments])
			copy(p.values[:left.nrSegments], left.values[:left.nrSegments])
			copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
			copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments])
			if left.hasChildren {
				copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1])
				copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
				for i := 0; i < p.nrSegments+1; i++ {
					p.children[i].parent = p
					p.children[i].parentIndex = i
				}
			} else {
				// p became a leaf: drop the references to the merged nodes.
				p.children[0] = nil
				p.children[1] = nil
			}
			if gap.node == left {
				return FileRangeGapIterator{p, gap.index}
			}
			if gap.node == right {
				return FileRangeGapIterator{p, gap.index + left.nrSegments + 1}
			}
			return gap
		}
		// Merge n and either sibling, along with the segment separating the
		// two, into whichever of the two nodes comes first. This is the
		// reverse of the non-root splitting case in
		// node.rebalanceBeforeInsert.
		var left, right *FileRangenode
		if n.parentIndex > 0 {
			left = n.prevSibling()
			right = n
		} else {
			left = n
			right = n.nextSibling()
		}

		// Re-anchor gap before left.nrSegments changes below: positions in
		// right land after left's segments plus the separator.
		if gap.node == right {
			gap = FileRangeGapIterator{left, gap.index + left.nrSegments + 1}
		}
		left.keys[left.nrSegments] = p.keys[left.parentIndex]
		left.values[left.nrSegments] = p.values[left.parentIndex]
		copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
		copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments])
		if left.hasChildren {
			copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
			for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ {
				left.children[i].parent = left
				left.children[i].parentIndex = i
			}
		}
		left.nrSegments += right.nrSegments + 1
		// Close the hole left in p by the separator and by the right node.
		copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments])
		copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments])
		fileRangeSetFunctions{}.ClearValue(&p.values[p.nrSegments-1])
		copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1])
		for i := 0; i < p.nrSegments; i++ {
			p.children[i].parentIndex = i
		}
		p.children[p.nrSegments] = nil
		p.nrSegments--

		// p lost a segment and may itself be deficient now; repair it next.
		n = p
	}
}

// An Iterator is conceptually one of:
//
// - A pointer to a segment in a set; or
//
// - A terminal iterator, which is a sentinel indicating that the end of
// iteration has been reached.
//
// Iterators are copyable values and are meaningfully equality-comparable. The
// zero value of Iterator is a terminal iterator.
//
// Unless otherwise specified, any mutation of a set invalidates all existing
// iterators into the set.
type FileRangeIterator struct {
	// node is the node containing the iterated segment. If the iterator is
	// terminal, node is nil.
	node *FileRangenode

	// index is the index of the segment in node.keys/values.
	index int
}

// Ok returns true if the iterator is not terminal.
All other methods are only +// valid for non-terminal iterators. +func (seg FileRangeIterator) Ok() bool { + return seg.node != nil +} + +// Range returns the iterated segment's range key. +func (seg FileRangeIterator) Range() __generics_imported0.MappableRange { + return seg.node.keys[seg.index] +} + +// Start is equivalent to Range().Start, but should be preferred if only the +// start of the range is needed. +func (seg FileRangeIterator) Start() uint64 { + return seg.node.keys[seg.index].Start +} + +// End is equivalent to Range().End, but should be preferred if only the end of +// the range is needed. +func (seg FileRangeIterator) End() uint64 { + return seg.node.keys[seg.index].End +} + +// SetRangeUnchecked mutates the iterated segment's range key. This operation +// does not invalidate any iterators. +// +// Preconditions: +// +// - r.Length() > 0. +// +// - The new range must not overlap an existing one: If seg.NextSegment().Ok(), +// then r.end <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then +// r.start >= seg.PrevSegment().End(). +func (seg FileRangeIterator) SetRangeUnchecked(r __generics_imported0.MappableRange) { + seg.node.keys[seg.index] = r +} + +// SetRange mutates the iterated segment's range key. If the new range would +// cause the iterated segment to overlap another segment, or if the new range +// is invalid, SetRange panics. This operation does not invalidate any +// iterators. 
+func (seg FileRangeIterator) SetRange(r __generics_imported0.MappableRange) { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { + panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) + } + if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { + panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) + } + seg.SetRangeUnchecked(r) +} + +// SetStartUnchecked mutates the iterated segment's start. This operation does +// not invalidate any iterators. +// +// Preconditions: The new start must be valid: start < seg.End(); if +// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). +func (seg FileRangeIterator) SetStartUnchecked(start uint64) { + seg.node.keys[seg.index].Start = start +} + +// SetStart mutates the iterated segment's start. If the new start value would +// cause the iterated segment to overlap another segment, or would result in an +// invalid range, SetStart panics. This operation does not invalidate any +// iterators. +func (seg FileRangeIterator) SetStart(start uint64) { + if start >= seg.End() { + panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) + } + if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { + panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) + } + seg.SetStartUnchecked(start) +} + +// SetEndUnchecked mutates the iterated segment's end. This operation does not +// invalidate any iterators. +// +// Preconditions: The new end must be valid: end > seg.Start(); if +// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). +func (seg FileRangeIterator) SetEndUnchecked(end uint64) { + seg.node.keys[seg.index].End = end +} + +// SetEnd mutates the iterated segment's end. 
If the new end value would cause +// the iterated segment to overlap another segment, or would result in an +// invalid range, SetEnd panics. This operation does not invalidate any +// iterators. +func (seg FileRangeIterator) SetEnd(end uint64) { + if end <= seg.Start() { + panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) + } + if next := seg.NextSegment(); next.Ok() && end > next.Start() { + panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) + } + seg.SetEndUnchecked(end) +} + +// Value returns a copy of the iterated segment's value. +func (seg FileRangeIterator) Value() uint64 { + return seg.node.values[seg.index] +} + +// ValuePtr returns a pointer to the iterated segment's value. The pointer is +// invalidated if the iterator is invalidated. This operation does not +// invalidate any iterators. +func (seg FileRangeIterator) ValuePtr() *uint64 { + return &seg.node.values[seg.index] +} + +// SetValue mutates the iterated segment's value. This operation does not +// invalidate any iterators. +func (seg FileRangeIterator) SetValue(val uint64) { + seg.node.values[seg.index] = val +} + +// PrevSegment returns the iterated segment's predecessor. If there is no +// preceding segment, PrevSegment returns a terminal iterator. +func (seg FileRangeIterator) PrevSegment() FileRangeIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index].lastSegment() + } + if seg.index > 0 { + return FileRangeIterator{seg.node, seg.index - 1} + } + if seg.node.parent == nil { + return FileRangeIterator{} + } + return FileRangesegmentBeforePosition(seg.node.parent, seg.node.parentIndex) +} + +// NextSegment returns the iterated segment's successor. If there is no +// succeeding segment, NextSegment returns a terminal iterator. 
+func (seg FileRangeIterator) NextSegment() FileRangeIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment() + } + if seg.index < seg.node.nrSegments-1 { + return FileRangeIterator{seg.node, seg.index + 1} + } + if seg.node.parent == nil { + return FileRangeIterator{} + } + return FileRangesegmentAfterPosition(seg.node.parent, seg.node.parentIndex) +} + +// PrevGap returns the gap immediately before the iterated segment. +func (seg FileRangeIterator) PrevGap() FileRangeGapIterator { + if seg.node.hasChildren { + + return seg.node.children[seg.index].lastSegment().NextGap() + } + return FileRangeGapIterator{seg.node, seg.index} +} + +// NextGap returns the gap immediately after the iterated segment. +func (seg FileRangeIterator) NextGap() FileRangeGapIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment().PrevGap() + } + return FileRangeGapIterator{seg.node, seg.index + 1} +} + +// PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, +// or the gap before the iterated segment otherwise. If seg.Start() == +// Functions.MinKey(), PrevNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by PrevNonEmpty will be +// non-terminal. +func (seg FileRangeIterator) PrevNonEmpty() (FileRangeIterator, FileRangeGapIterator) { + gap := seg.PrevGap() + if gap.Range().Length() != 0 { + return FileRangeIterator{}, gap + } + return gap.PrevSegment(), FileRangeGapIterator{} +} + +// NextNonEmpty returns the iterated segment's successor if it is adjacent, or +// the gap after the iterated segment otherwise. If seg.End() == +// Functions.MaxKey(), NextNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by NextNonEmpty will be +// non-terminal. 
+func (seg FileRangeIterator) NextNonEmpty() (FileRangeIterator, FileRangeGapIterator) { + gap := seg.NextGap() + if gap.Range().Length() != 0 { + return FileRangeIterator{}, gap + } + return gap.NextSegment(), FileRangeGapIterator{} +} + +// A GapIterator is conceptually one of: +// +// - A pointer to a position between two segments, before the first segment, or +// after the last segment in a set, called a *gap*; or +// +// - A terminal iterator, which is a sentinel indicating that the end of +// iteration has been reached. +// +// Note that the gap between two adjacent segments exists (iterators to it are +// non-terminal), but has a length of zero. GapIterator.IsEmpty returns true +// for such gaps. An empty set contains a single gap, spanning the entire range +// of the set's keys. +// +// GapIterators are copyable values and are meaningfully equality-comparable. +// The zero value of GapIterator is a terminal iterator. +// +// Unless otherwise specified, any mutation of a set invalidates all existing +// iterators into the set. +type FileRangeGapIterator struct { + // The representation of a GapIterator is identical to that of an Iterator, + // except that index corresponds to positions between segments in the same + // way as for node.children (see comment for node.nrSegments). + node *FileRangenode + index int +} + +// Ok returns true if the iterator is not terminal. All other methods are only +// valid for non-terminal iterators. +func (gap FileRangeGapIterator) Ok() bool { + return gap.node != nil +} + +// Range returns the range spanned by the iterated gap. +func (gap FileRangeGapIterator) Range() __generics_imported0.MappableRange { + return __generics_imported0.MappableRange{gap.Start(), gap.End()} +} + +// Start is equivalent to Range().Start, but should be preferred if only the +// start of the range is needed. 
+func (gap FileRangeGapIterator) Start() uint64 { + if ps := gap.PrevSegment(); ps.Ok() { + return ps.End() + } + return fileRangeSetFunctions{}.MinKey() +} + +// End is equivalent to Range().End, but should be preferred if only the end of +// the range is needed. +func (gap FileRangeGapIterator) End() uint64 { + if ns := gap.NextSegment(); ns.Ok() { + return ns.Start() + } + return fileRangeSetFunctions{}.MaxKey() +} + +// IsEmpty returns true if the iterated gap is empty (that is, the "gap" is +// between two adjacent segments.) +func (gap FileRangeGapIterator) IsEmpty() bool { + return gap.Range().Length() == 0 +} + +// PrevSegment returns the segment immediately before the iterated gap. If no +// such segment exists, PrevSegment returns a terminal iterator. +func (gap FileRangeGapIterator) PrevSegment() FileRangeIterator { + return FileRangesegmentBeforePosition(gap.node, gap.index) +} + +// NextSegment returns the segment immediately after the iterated gap. If no +// such segment exists, NextSegment returns a terminal iterator. +func (gap FileRangeGapIterator) NextSegment() FileRangeIterator { + return FileRangesegmentAfterPosition(gap.node, gap.index) +} + +// PrevGap returns the iterated gap's predecessor. If no such gap exists, +// PrevGap returns a terminal iterator. +func (gap FileRangeGapIterator) PrevGap() FileRangeGapIterator { + seg := gap.PrevSegment() + if !seg.Ok() { + return FileRangeGapIterator{} + } + return seg.PrevGap() +} + +// NextGap returns the iterated gap's successor. If no such gap exists, NextGap +// returns a terminal iterator. +func (gap FileRangeGapIterator) NextGap() FileRangeGapIterator { + seg := gap.NextSegment() + if !seg.Ok() { + return FileRangeGapIterator{} + } + return seg.NextGap() +} + +// segmentBeforePosition returns the predecessor segment of the position given +// by n.children[i], which may or may not contain a child. If no such segment +// exists, segmentBeforePosition returns a terminal iterator. 
+func FileRangesegmentBeforePosition(n *FileRangenode, i int) FileRangeIterator { + for i == 0 { + if n.parent == nil { + return FileRangeIterator{} + } + n, i = n.parent, n.parentIndex + } + return FileRangeIterator{n, i - 1} +} + +// segmentAfterPosition returns the successor segment of the position given by +// n.children[i], which may or may not contain a child. If no such segment +// exists, segmentAfterPosition returns a terminal iterator. +func FileRangesegmentAfterPosition(n *FileRangenode, i int) FileRangeIterator { + for i == n.nrSegments { + if n.parent == nil { + return FileRangeIterator{} + } + n, i = n.parent, n.parentIndex + } + return FileRangeIterator{n, i} +} + +func FileRangezeroValueSlice(slice []uint64) { + + for i := range slice { + fileRangeSetFunctions{}.ClearValue(&slice[i]) + } +} + +func FileRangezeroNodeSlice(slice []*FileRangenode) { + for i := range slice { + slice[i] = nil + } +} + +// String stringifies a Set for debugging. +func (s *FileRangeSet) String() string { + return s.root.String() +} + +// String stringifes a node (and all of its children) for debugging. 
+func (n *FileRangenode) String() string { + var buf bytes.Buffer + n.writeDebugString(&buf, "") + return buf.String() +} + +func (n *FileRangenode) writeDebugString(buf *bytes.Buffer, prefix string) { + if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { + buf.WriteString(prefix) + buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) + } + for i := 0; i < n.nrSegments; i++ { + if child := n.children[i]; child != nil { + cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) + if child.parent != n || child.parentIndex != i { + buf.WriteString(cprefix) + buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) + } + child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) + } + buf.WriteString(prefix) + buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) + } + if child := n.children[n.nrSegments]; child != nil { + child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) + } +} + +// SegmentDataSlices represents segments from a set as slices of start, end, and +// values. SegmentDataSlices is primarily used as an intermediate representation +// for save/restore and the layout here is optimized for that. +// +// +stateify savable +type FileRangeSegmentDataSlices struct { + Start []uint64 + End []uint64 + Values []uint64 +} + +// ExportSortedSlice returns a copy of all segments in the given set, in ascending +// key order. 
+func (s *FileRangeSet) ExportSortedSlices() *FileRangeSegmentDataSlices { + var sds FileRangeSegmentDataSlices + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sds.Start = append(sds.Start, seg.Start()) + sds.End = append(sds.End, seg.End()) + sds.Values = append(sds.Values, seg.Value()) + } + sds.Start = sds.Start[:len(sds.Start):len(sds.Start)] + sds.End = sds.End[:len(sds.End):len(sds.End)] + sds.Values = sds.Values[:len(sds.Values):len(sds.Values)] + return &sds +} + +// ImportSortedSlice initializes the given set from the given slice. +// +// Preconditions: s must be empty. sds must represent a valid set (the segments +// in sds must have valid lengths that do not overlap). The segments in sds +// must be sorted in ascending key order. +func (s *FileRangeSet) ImportSortedSlices(sds *FileRangeSegmentDataSlices) error { + if !s.IsEmpty() { + return fmt.Errorf("cannot import into non-empty set %v", s) + } + gap := s.FirstGap() + for i := range sds.Start { + r := __generics_imported0.MappableRange{sds.Start[i], sds.End[i]} + if !gap.Range().IsSupersetOf(r) { + return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i]) + } + gap = s.InsertWithoutMerging(gap, r, sds.Values[i]).NextGap() + } + return nil +} +func (s *FileRangeSet) saveRoot() *FileRangeSegmentDataSlices { + return s.ExportSortedSlices() +} + +func (s *FileRangeSet) loadRoot(sds *FileRangeSegmentDataSlices) { + if err := s.ImportSortedSlices(sds); err != nil { + panic(err) + } +} diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go new file mode 100644 index 000000000..6565c28c8 --- /dev/null +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -0,0 +1,50 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" +) + +type frameRefSetFunctions struct{} + +// MinKey implements segment.Functions.MinKey. +func (frameRefSetFunctions) MinKey() uint64 { + return 0 +} + +// MaxKey implements segment.Functions.MaxKey. +func (frameRefSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +// ClearValue implements segment.Functions.ClearValue. +func (frameRefSetFunctions) ClearValue(val *uint64) { +} + +// Merge implements segment.Functions.Merge. +func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) { + if val1 != val2 { + return 0, false + } + return val1, true +} + +// Split implements segment.Functions.Split. +func (frameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { + return val, val +} diff --git a/pkg/sentry/fs/fsutil/frame_ref_set_impl.go b/pkg/sentry/fs/fsutil/frame_ref_set_impl.go new file mode 100755 index 000000000..2f858f419 --- /dev/null +++ b/pkg/sentry/fs/fsutil/frame_ref_set_impl.go @@ -0,0 +1,1274 @@ +package fsutil + +import ( + __generics_imported0 "gvisor.googlesource.com/gvisor/pkg/sentry/platform" +) + +import ( + "bytes" + "fmt" +) + +const ( + // minDegree is the minimum degree of an internal node in a Set B-tree. + // + // - Any non-root node has at least minDegree-1 segments. + // + // - Any non-root internal (non-leaf) node has at least minDegree children. 
+ // + // - The root node may have fewer than minDegree-1 segments, but it may + // only have 0 segments if the tree is empty. + // + // Our implementation requires minDegree >= 3. Higher values of minDegree + // usually improve performance, but increase memory usage for small sets. + frameRefminDegree = 3 + + frameRefmaxDegree = 2 * frameRefminDegree +) + +// A Set is a mapping of segments with non-overlapping Range keys. The zero +// value for a Set is an empty set. Set values are not safely movable nor +// copyable. Set is thread-compatible. +// +// +stateify savable +type frameRefSet struct { + root frameRefnode `state:".(*frameRefSegmentDataSlices)"` +} + +// IsEmpty returns true if the set contains no segments. +func (s *frameRefSet) IsEmpty() bool { + return s.root.nrSegments == 0 +} + +// IsEmptyRange returns true iff no segments in the set overlap the given +// range. This is semantically equivalent to s.SpanRange(r) == 0, but may be +// more efficient. +func (s *frameRefSet) IsEmptyRange(r __generics_imported0.FileRange) bool { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return true + } + _, gap := s.Find(r.Start) + if !gap.Ok() { + return false + } + return r.End <= gap.End() +} + +// Span returns the total size of all segments in the set. +func (s *frameRefSet) Span() uint64 { + var sz uint64 + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sz += seg.Range().Length() + } + return sz +} + +// SpanRange returns the total size of the intersection of segments in the set +// with the given range. 
+func (s *frameRefSet) SpanRange(r __generics_imported0.FileRange) uint64 { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return 0 + } + var sz uint64 + for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { + sz += seg.Range().Intersect(r).Length() + } + return sz +} + +// FirstSegment returns the first segment in the set. If the set is empty, +// FirstSegment returns a terminal iterator. +func (s *frameRefSet) FirstSegment() frameRefIterator { + if s.root.nrSegments == 0 { + return frameRefIterator{} + } + return s.root.firstSegment() +} + +// LastSegment returns the last segment in the set. If the set is empty, +// LastSegment returns a terminal iterator. +func (s *frameRefSet) LastSegment() frameRefIterator { + if s.root.nrSegments == 0 { + return frameRefIterator{} + } + return s.root.lastSegment() +} + +// FirstGap returns the first gap in the set. +func (s *frameRefSet) FirstGap() frameRefGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[0] + } + return frameRefGapIterator{n, 0} +} + +// LastGap returns the last gap in the set. +func (s *frameRefSet) LastGap() frameRefGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[n.nrSegments] + } + return frameRefGapIterator{n, n.nrSegments} +} + +// Find returns the segment or gap whose range contains the given key. If a +// segment is found, the returned Iterator is non-terminal and the +// returned GapIterator is terminal. Otherwise, the returned Iterator is +// terminal and the returned GapIterator is non-terminal. 
+func (s *frameRefSet) Find(key uint64) (frameRefIterator, frameRefGapIterator) { + n := &s.root + for { + + lower := 0 + upper := n.nrSegments + for lower < upper { + i := lower + (upper-lower)/2 + if r := n.keys[i]; key < r.End { + if key >= r.Start { + return frameRefIterator{n, i}, frameRefGapIterator{} + } + upper = i + } else { + lower = i + 1 + } + } + i := lower + if !n.hasChildren { + return frameRefIterator{}, frameRefGapIterator{n, i} + } + n = n.children[i] + } +} + +// FindSegment returns the segment whose range contains the given key. If no +// such segment exists, FindSegment returns a terminal iterator. +func (s *frameRefSet) FindSegment(key uint64) frameRefIterator { + seg, _ := s.Find(key) + return seg +} + +// LowerBoundSegment returns the segment with the lowest range that contains a +// key greater than or equal to min. If no such segment exists, +// LowerBoundSegment returns a terminal iterator. +func (s *frameRefSet) LowerBoundSegment(min uint64) frameRefIterator { + seg, gap := s.Find(min) + if seg.Ok() { + return seg + } + return gap.NextSegment() +} + +// UpperBoundSegment returns the segment with the highest range that contains a +// key less than or equal to max. If no such segment exists, UpperBoundSegment +// returns a terminal iterator. +func (s *frameRefSet) UpperBoundSegment(max uint64) frameRefIterator { + seg, gap := s.Find(max) + if seg.Ok() { + return seg + } + return gap.PrevSegment() +} + +// FindGap returns the gap containing the given key. If no such gap exists +// (i.e. the set contains a segment containing that key), FindGap returns a +// terminal iterator. +func (s *frameRefSet) FindGap(key uint64) frameRefGapIterator { + _, gap := s.Find(key) + return gap +} + +// LowerBoundGap returns the gap with the lowest range that is greater than or +// equal to min. 
+func (s *frameRefSet) LowerBoundGap(min uint64) frameRefGapIterator { + seg, gap := s.Find(min) + if gap.Ok() { + return gap + } + return seg.NextGap() +} + +// UpperBoundGap returns the gap with the highest range that is less than or +// equal to max. +func (s *frameRefSet) UpperBoundGap(max uint64) frameRefGapIterator { + seg, gap := s.Find(max) + if gap.Ok() { + return gap + } + return seg.PrevGap() +} + +// Add inserts the given segment into the set and returns true. If the new +// segment can be merged with adjacent segments, Add will do so. If the new +// segment would overlap an existing segment, Add returns false. If Add +// succeeds, all existing iterators are invalidated. +func (s *frameRefSet) Add(r __generics_imported0.FileRange, val uint64) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.Insert(gap, r, val) + return true +} + +// AddWithoutMerging inserts the given segment into the set and returns true. +// If it would overlap an existing segment, AddWithoutMerging does nothing and +// returns false. If AddWithoutMerging succeeds, all existing iterators are +// invalidated. +func (s *frameRefSet) AddWithoutMerging(r __generics_imported0.FileRange, val uint64) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.InsertWithoutMergingUnchecked(gap, r, val) + return true +} + +// Insert inserts the given segment into the given gap. If the new segment can +// be merged with adjacent segments, Insert will do so. Insert returns an +// iterator to the segment containing the inserted value (which may have been +// merged with other values). All existing iterators (including gap, but not +// including the returned iterator) are invalidated. 
+// +// If the gap cannot accommodate the segment, or if r is invalid, Insert panics. +// +// Insert is semantically equivalent to a InsertWithoutMerging followed by a +// Merge, but may be more efficient. Note that there is no unchecked variant of +// Insert since Insert must retrieve and inspect gap's predecessor and +// successor segments regardless. +func (s *frameRefSet) Insert(gap frameRefGapIterator, r __generics_imported0.FileRange, val uint64) frameRefIterator { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + prev, next := gap.PrevSegment(), gap.NextSegment() + if prev.Ok() && prev.End() > r.Start { + panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) + } + if next.Ok() && next.Start() < r.End { + panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) + } + if prev.Ok() && prev.End() == r.Start { + if mval, ok := (frameRefSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { + prev.SetEndUnchecked(r.End) + prev.SetValue(mval) + if next.Ok() && next.Start() == r.End { + val = mval + if mval, ok := (frameRefSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { + prev.SetEndUnchecked(next.End()) + prev.SetValue(mval) + return s.Remove(next).PrevSegment() + } + } + return prev + } + } + if next.Ok() && next.Start() == r.End { + if mval, ok := (frameRefSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { + next.SetStartUnchecked(r.Start) + next.SetValue(mval) + return next + } + } + return s.InsertWithoutMergingUnchecked(gap, r, val) +} + +// InsertWithoutMerging inserts the given segment into the given gap and +// returns an iterator to the inserted segment. All existing iterators +// (including gap, but not including the returned iterator) are invalidated. +// +// If the gap cannot accommodate the segment, or if r is invalid, +// InsertWithoutMerging panics. 
+func (s *frameRefSet) InsertWithoutMerging(gap frameRefGapIterator, r __generics_imported0.FileRange, val uint64) frameRefIterator { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + if gr := gap.Range(); !gr.IsSupersetOf(r) { + panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) + } + return s.InsertWithoutMergingUnchecked(gap, r, val) +} + +// InsertWithoutMergingUnchecked inserts the given segment into the given gap +// and returns an iterator to the inserted segment. All existing iterators +// (including gap, but not including the returned iterator) are invalidated. +// +// Preconditions: r.Start >= gap.Start(); r.End <= gap.End(). +func (s *frameRefSet) InsertWithoutMergingUnchecked(gap frameRefGapIterator, r __generics_imported0.FileRange, val uint64) frameRefIterator { + gap = gap.node.rebalanceBeforeInsert(gap) + copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) + copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) + gap.node.keys[gap.index] = r + gap.node.values[gap.index] = val + gap.node.nrSegments++ + return frameRefIterator{gap.node, gap.index} +} + +// Remove removes the given segment and returns an iterator to the vacated gap. +// All existing iterators (including seg, but not including the returned +// iterator) are invalidated. 
+func (s *frameRefSet) Remove(seg frameRefIterator) frameRefGapIterator { + + if seg.node.hasChildren { + + victim := seg.PrevSegment() + + seg.SetRangeUnchecked(victim.Range()) + seg.SetValue(victim.Value()) + return s.Remove(victim).NextGap() + } + copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) + copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) + frameRefSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) + seg.node.nrSegments-- + return seg.node.rebalanceAfterRemove(frameRefGapIterator{seg.node, seg.index}) +} + +// RemoveAll removes all segments from the set. All existing iterators are +// invalidated. +func (s *frameRefSet) RemoveAll() { + s.root = frameRefnode{} +} + +// RemoveRange removes all segments in the given range. An iterator to the +// newly formed gap is returned, and all existing iterators are invalidated. +func (s *frameRefSet) RemoveRange(r __generics_imported0.FileRange) frameRefGapIterator { + seg, gap := s.Find(r.Start) + if seg.Ok() { + seg = s.Isolate(seg, r) + gap = s.Remove(seg) + } + for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { + seg = s.Isolate(seg, r) + gap = s.Remove(seg) + } + return gap +} + +// Merge attempts to merge two neighboring segments. If successful, Merge +// returns an iterator to the merged segment, and all existing iterators are +// invalidated. Otherwise, Merge returns a terminal iterator. +// +// If first is not the predecessor of second, Merge panics. +func (s *frameRefSet) Merge(first, second frameRefIterator) frameRefIterator { + if first.NextSegment() != second { + panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) + } + return s.MergeUnchecked(first, second) +} + +// MergeUnchecked attempts to merge two neighboring segments. 
If successful, +// MergeUnchecked returns an iterator to the merged segment, and all existing +// iterators are invalidated. Otherwise, MergeUnchecked returns a terminal +// iterator. +// +// Precondition: first is the predecessor of second: first.NextSegment() == +// second, first == second.PrevSegment(). +func (s *frameRefSet) MergeUnchecked(first, second frameRefIterator) frameRefIterator { + if first.End() == second.Start() { + if mval, ok := (frameRefSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { + + first.SetEndUnchecked(second.End()) + first.SetValue(mval) + return s.Remove(second).PrevSegment() + } + } + return frameRefIterator{} +} + +// MergeAll attempts to merge all adjacent segments in the set. All existing +// iterators are invalidated. +func (s *frameRefSet) MergeAll() { + seg := s.FirstSegment() + if !seg.Ok() { + return + } + next := seg.NextSegment() + for next.Ok() { + if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { + seg, next = mseg, mseg.NextSegment() + } else { + seg, next = next, next.NextSegment() + } + } +} + +// MergeRange attempts to merge all adjacent segments that contain a key in the +// specific range. All existing iterators are invalidated. +func (s *frameRefSet) MergeRange(r __generics_imported0.FileRange) { + seg := s.LowerBoundSegment(r.Start) + if !seg.Ok() { + return + } + next := seg.NextSegment() + for next.Ok() && next.Range().Start < r.End { + if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { + seg, next = mseg, mseg.NextSegment() + } else { + seg, next = next, next.NextSegment() + } + } +} + +// MergeAdjacent attempts to merge the segment containing r.Start with its +// predecessor, and the segment containing r.End-1 with its successor. 
+func (s *frameRefSet) MergeAdjacent(r __generics_imported0.FileRange) { + first := s.FindSegment(r.Start) + if first.Ok() { + if prev := first.PrevSegment(); prev.Ok() { + s.Merge(prev, first) + } + } + last := s.FindSegment(r.End - 1) + if last.Ok() { + if next := last.NextSegment(); next.Ok() { + s.Merge(last, next) + } + } +} + +// Split splits the given segment at the given key and returns iterators to the +// two resulting segments. All existing iterators (including seg, but not +// including the returned iterators) are invalidated. +// +// If the segment cannot be split at split (because split is at the start or +// end of the segment's range, so splitting would produce a segment with zero +// length, or because split falls outside the segment's range altogether), +// Split panics. +func (s *frameRefSet) Split(seg frameRefIterator, split uint64) (frameRefIterator, frameRefIterator) { + if !seg.Range().CanSplitAt(split) { + panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) + } + return s.SplitUnchecked(seg, split) +} + +// SplitUnchecked splits the given segment at the given key and returns +// iterators to the two resulting segments. All existing iterators (including +// seg, but not including the returned iterators) are invalidated. +// +// Preconditions: seg.Start() < key < seg.End(). +func (s *frameRefSet) SplitUnchecked(seg frameRefIterator, split uint64) (frameRefIterator, frameRefIterator) { + val1, val2 := (frameRefSetFunctions{}).Split(seg.Range(), seg.Value(), split) + end2 := seg.End() + seg.SetEndUnchecked(split) + seg.SetValue(val1) + seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.FileRange{split, end2}, val2) + + return seg2.PrevSegment(), seg2 +} + +// SplitAt splits the segment straddling split, if one exists. SplitAt returns +// true if a segment was split and false otherwise. If SplitAt splits a +// segment, all existing iterators are invalidated. 
+func (s *frameRefSet) SplitAt(split uint64) bool { + if seg := s.FindSegment(split); seg.Ok() && seg.Range().CanSplitAt(split) { + s.SplitUnchecked(seg, split) + return true + } + return false +} + +// Isolate ensures that the given segment's range does not escape r by +// splitting at r.Start and r.End if necessary, and returns an updated iterator +// to the bounded segment. All existing iterators (including seg, but not +// including the returned iterators) are invalidated. +func (s *frameRefSet) Isolate(seg frameRefIterator, r __generics_imported0.FileRange) frameRefIterator { + if seg.Range().CanSplitAt(r.Start) { + _, seg = s.SplitUnchecked(seg, r.Start) + } + if seg.Range().CanSplitAt(r.End) { + seg, _ = s.SplitUnchecked(seg, r.End) + } + return seg +} + +// ApplyContiguous applies a function to a contiguous range of segments, +// splitting if necessary. The function is applied until the first gap is +// encountered, at which point the gap is returned. If the function is applied +// across the entire range, a terminal gap is returned. All existing iterators +// are invalidated. +// +// N.B. The Iterator must not be invalidated by the function. +func (s *frameRefSet) ApplyContiguous(r __generics_imported0.FileRange, fn func(seg frameRefIterator)) frameRefGapIterator { + seg, gap := s.Find(r.Start) + if !seg.Ok() { + return gap + } + for { + seg = s.Isolate(seg, r) + fn(seg) + if seg.End() >= r.End { + return frameRefGapIterator{} + } + gap = seg.NextGap() + if !gap.IsEmpty() { + return gap + } + seg = gap.NextSegment() + if !seg.Ok() { + + return frameRefGapIterator{} + } + } +} + +// +stateify savable +type frameRefnode struct { + // An internal binary tree node looks like: + // + // K + // / \ + // Cl Cr + // + // where all keys in the subtree rooted by Cl (the left subtree) are less + // than K (the key of the parent node), and all keys in the subtree rooted + // by Cr (the right subtree) are greater than K. 
+ // + // An internal B-tree node's indexes work out to look like: + // + // K0 K1 K2 ... Kn-1 + // / \/ \/ \ ... / \ + // C0 C1 C2 C3 ... Cn-1 Cn + // + // where n is nrSegments. + nrSegments int + + // parent is a pointer to this node's parent. If this node is root, parent + // is nil. + parent *frameRefnode + + // parentIndex is the index of this node in parent.children. + parentIndex int + + // Flag for internal nodes that is technically redundant with "children[0] + // != nil", but is stored in the first cache line. "hasChildren" rather + // than "isLeaf" because false must be the correct value for an empty root. + hasChildren bool + + // Nodes store keys and values in separate arrays to maximize locality in + // the common case (scanning keys for lookup). + keys [frameRefmaxDegree - 1]__generics_imported0.FileRange + values [frameRefmaxDegree - 1]uint64 + children [frameRefmaxDegree]*frameRefnode +} + +// firstSegment returns the first segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *frameRefnode) firstSegment() frameRefIterator { + for n.hasChildren { + n = n.children[0] + } + return frameRefIterator{n, 0} +} + +// lastSegment returns the last segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *frameRefnode) lastSegment() frameRefIterator { + for n.hasChildren { + n = n.children[n.nrSegments] + } + return frameRefIterator{n, n.nrSegments - 1} +} + +func (n *frameRefnode) prevSibling() *frameRefnode { + if n.parent == nil || n.parentIndex == 0 { + return nil + } + return n.parent.children[n.parentIndex-1] +} + +func (n *frameRefnode) nextSibling() *frameRefnode { + if n.parent == nil || n.parentIndex == n.parent.nrSegments { + return nil + } + return n.parent.children[n.parentIndex+1] +} + +// rebalanceBeforeInsert splits n and its ancestors if they are full, as +// required for insertion, and returns an updated iterator to the position +// represented by gap. 
func (n *frameRefnode) rebalanceBeforeInsert(gap frameRefGapIterator) frameRefGapIterator {
	// Ancestors are handled first (root downward), so a full parent has
	// already been split by the time a segment is promoted out of n.
	if n.parent != nil {
		gap = n.parent.rebalanceBeforeInsert(gap)
	}
	// n is not full, so it can take another segment without splitting.
	if n.nrSegments < frameRefmaxDegree-1 {
		return gap
	}
	if n.parent == nil {
		// n is a full root. Split it in place: n stays the root and keeps
		// only the median segment (index frameRefminDegree-1), while the
		// segments before and after the median move into two new children.
		left := &frameRefnode{
			nrSegments:  frameRefminDegree - 1,
			parent:      n,
			parentIndex: 0,
			hasChildren: n.hasChildren,
		}
		right := &frameRefnode{
			nrSegments:  frameRefminDegree - 1,
			parent:      n,
			parentIndex: 1,
			hasChildren: n.hasChildren,
		}
		copy(left.keys[:frameRefminDegree-1], n.keys[:frameRefminDegree-1])
		copy(left.values[:frameRefminDegree-1], n.values[:frameRefminDegree-1])
		copy(right.keys[:frameRefminDegree-1], n.keys[frameRefminDegree:])
		copy(right.values[:frameRefminDegree-1], n.values[frameRefminDegree:])
		// Move the median into slot 0 and clear every other value slot in n.
		n.keys[0], n.values[0] = n.keys[frameRefminDegree-1], n.values[frameRefminDegree-1]
		frameRefzeroValueSlice(n.values[1:])
		if n.hasChildren {
			// Hand n's former children to left and right and re-point their
			// parent links; n's child slots beyond the first two are cleared.
			copy(left.children[:frameRefminDegree], n.children[:frameRefminDegree])
			copy(right.children[:frameRefminDegree], n.children[frameRefminDegree:])
			frameRefzeroNodeSlice(n.children[2:])
			for i := 0; i < frameRefminDegree; i++ {
				left.children[i].parent = left
				left.children[i].parentIndex = i
				right.children[i].parent = right
				right.children[i].parentIndex = i
			}
		}
		n.nrSegments = 1
		n.hasChildren = true
		n.children[0] = left
		n.children[1] = right
		// Only a gap that pointed into n itself needs translating: positions
		// below frameRefminDegree now live in left at the same index, the
		// rest live in right, shifted down by frameRefminDegree.
		if gap.node != n {
			return gap
		}
		if gap.index < frameRefminDegree {
			return frameRefGapIterator{left, gap.index}
		}
		return frameRefGapIterator{right, gap.index - frameRefminDegree}
	}

	// n is a full non-root node. Shift the parent's segments right by one
	// from n.parentIndex and promote n's median segment into the freed slot.
	copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments])
	copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments])
	n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[frameRefminDegree-1], n.values[frameRefminDegree-1]
	// Shift the parent's children after n right by one as well, and fix the
	// parentIndex of every child that moved.
	copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1])
	for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ {
		n.parent.children[i].parentIndex = i
	}
	// The new right sibling receives everything after the median.
	sibling := &frameRefnode{
		nrSegments:  frameRefminDegree - 1,
		parent:      n.parent,
		parentIndex: n.parentIndex + 1,
		hasChildren: n.hasChildren,
	}
	n.parent.children[n.parentIndex+1] = sibling
	n.parent.nrSegments++
	copy(sibling.keys[:frameRefminDegree-1], n.keys[frameRefminDegree:])
	copy(sibling.values[:frameRefminDegree-1], n.values[frameRefminDegree:])
	// Clear the value slots n no longer owns (the median and everything
	// after it).
	frameRefzeroValueSlice(n.values[frameRefminDegree-1:])
	if n.hasChildren {
		copy(sibling.children[:frameRefminDegree], n.children[frameRefminDegree:])
		frameRefzeroNodeSlice(n.children[frameRefminDegree:])
		for i := 0; i < frameRefminDegree; i++ {
			sibling.children[i].parent = sibling
			sibling.children[i].parentIndex = i
		}
	}
	n.nrSegments = frameRefminDegree - 1

	// Translate the gap: positions that stayed in n are unchanged, positions
	// that moved to sibling are shifted down by frameRefminDegree.
	if gap.node != n {
		return gap
	}
	if gap.index < frameRefminDegree {
		return gap
	}
	return frameRefGapIterator{sibling, gap.index - frameRefminDegree}
}

// rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient
// (contain fewer segments than required by B-tree invariants), as required for
// removal, and returns an updated iterator to the position represented by gap.
//
// Precondition: n is the only node in the tree that may currently violate a
// B-tree invariant.
func (n *frameRefnode) rebalanceAfterRemove(gap frameRefGapIterator) frameRefGapIterator {
	// Each iteration repairs one node; the loop climbs to the parent only
	// after a merge that took a segment away from it.
	for {
		// n already holds enough segments; nothing to repair.
		if n.nrSegments >= frameRefminDegree-1 {
			return gap
		}
		// The root is not repaired here, however few segments it holds.
		if n.parent == nil {

			return gap
		}

		// Case 1: the previous sibling has a segment to spare. Rotate right:
		// the separating parent segment moves down to the front of n, and the
		// sibling's last segment moves up to replace it in the parent.
		if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= frameRefminDegree {
			copy(n.keys[1:], n.keys[:n.nrSegments])
			copy(n.values[1:], n.values[:n.nrSegments])
			n.keys[0] = n.parent.keys[n.parentIndex-1]
			n.values[0] = n.parent.values[n.parentIndex-1]
			n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1]
			n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1]
			frameRefSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
			if n.hasChildren {
				// The sibling's last child follows its last segment and
				// becomes n's first child; n's existing children shift right.
				copy(n.children[1:], n.children[:n.nrSegments+1])
				n.children[0] = sibling.children[sibling.nrSegments]
				sibling.children[sibling.nrSegments] = nil
				n.children[0].parent = n
				n.children[0].parentIndex = 0
				for i := 1; i < n.nrSegments+2; i++ {
					n.children[i].parentIndex = i
				}
			}
			n.nrSegments++
			sibling.nrSegments--
			// Translate the gap: the position after the sibling's moved
			// segment is now the front of n, and positions in n shift by one.
			if gap.node == sibling && gap.index == sibling.nrSegments {
				return frameRefGapIterator{n, 0}
			}
			if gap.node == n {
				return frameRefGapIterator{n, gap.index + 1}
			}
			return gap
		}
		// Case 2: the next sibling has a segment to spare. Rotate left: the
		// separating parent segment moves down to the end of n, and the
		// sibling's first segment moves up to replace it in the parent.
		if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= frameRefminDegree {
			n.keys[n.nrSegments] = n.parent.keys[n.parentIndex]
			n.values[n.nrSegments] = n.parent.values[n.parentIndex]
			n.parent.keys[n.parentIndex] = sibling.keys[0]
			n.parent.values[n.parentIndex] = sibling.values[0]
			copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:])
			copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:])
			frameRefSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
			if n.hasChildren {
				// The sibling's first child becomes n's last child; the
				// sibling's remaining children shift left.
				n.children[n.nrSegments+1] = sibling.children[0]
				copy(sibling.children[:sibling.nrSegments], sibling.children[1:])
				sibling.children[sibling.nrSegments] = nil
				n.children[n.nrSegments+1].parent = n
				n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1
				for i := 0; i < sibling.nrSegments; i++ {
					sibling.children[i].parentIndex = i
				}
			}
			n.nrSegments++
			sibling.nrSegments--
			// Translate the gap: the front position of the sibling is now the
			// end of n, and other positions in the sibling shift down by one.
			if gap.node == sibling {
				if gap.index == 0 {
					return frameRefGapIterator{n, n.nrSegments}
				}
				return frameRefGapIterator{sibling, gap.index - 1}
			}
			return gap
		}

		// Neither rotation applied (no sibling on a side, or it has nothing
		// to spare), so nodes must be merged.
		p := n.parent
		if p.nrSegments == 1 {
			// Case 3: the parent holds a single segment and therefore exactly
			// two children. Fold both children and the separating segment
			// back into p. NOTE(review): presumably only the root can hold a
			// single segment here; confirm against frameRefminDegree.
			left, right := p.children[0], p.children[1]
			p.nrSegments = left.nrSegments + right.nrSegments + 1
			p.hasChildren = left.hasChildren
			// Move the separator to its final slot before left's segments
			// overwrite slot 0.
			p.keys[left.nrSegments] = p.keys[0]
			p.values[left.nrSegments] = p.values[0]
			copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments])
			copy(p.values[:left.nrSegments], left.values[:left.nrSegments])
			copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
			copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments])
			if left.hasChildren {
				copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1])
				copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
				for i := 0; i < p.nrSegments+1; i++ {
					p.children[i].parent = p
					p.children[i].parentIndex = i
				}
			} else {
				// p no longer has children; drop the links to left and right.
				p.children[0] = nil
				p.children[1] = nil
			}
			// Positions in left keep their index in p; positions in right
			// are offset by left's segments plus the separator.
			if gap.node == left {
				return frameRefGapIterator{p, gap.index}
			}
			if gap.node == right {
				return frameRefGapIterator{p, gap.index + left.nrSegments + 1}
			}
			return gap
		}
		// Merge n and either sibling, along with the segment separating the
		// two, into whichever of the two nodes comes first. This is the
		// reverse of the non-root splitting case in
		// node.rebalanceBeforeInsert.
		var left, right *frameRefnode
		if n.parentIndex > 0 {
			left = n.prevSibling()
			right = n
		} else {
			left = n
			right = n.nextSibling()
		}

		// Translate the gap before left.nrSegments changes: positions in
		// right move into left, after left's segments and the separator.
		if gap.node == right {
			gap = frameRefGapIterator{left, gap.index + left.nrSegments + 1}
		}
		left.keys[left.nrSegments] = p.keys[left.parentIndex]
		left.values[left.nrSegments] = p.values[left.parentIndex]
		copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
		copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments])
		if left.hasChildren {
			copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
			for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ {
				left.children[i].parent = left
				left.children[i].parentIndex = i
			}
		}
		left.nrSegments += right.nrSegments + 1
		// Remove the separator and the link to right from the parent, then
		// renumber the parent's remaining children.
		copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments])
		copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments])
		frameRefSetFunctions{}.ClearValue(&p.values[p.nrSegments-1])
		copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1])
		for i := 0; i < p.nrSegments; i++ {
			p.children[i].parentIndex = i
		}
		p.children[p.nrSegments] = nil
		p.nrSegments--

		// The parent lost a segment and may now be deficient itself.
		n = p
	}
}

// An Iterator is conceptually one of:
//
// - A pointer to a segment in a set; or
//
// - A terminal iterator, which is a sentinel indicating that the end of
// iteration has been reached.
//
// Iterators are copyable values and are meaningfully equality-comparable. The
// zero value of Iterator is a terminal iterator.
//
// Unless otherwise specified, any mutation of a set invalidates all existing
// iterators into the set.
type frameRefIterator struct {
	// node is the node containing the iterated segment. If the iterator is
	// terminal, node is nil.
	node *frameRefnode

	// index is the index of the segment in node.keys/values.
	index int
}

// Ok returns true if the iterator is not terminal. All other methods are only
// valid for non-terminal iterators.
func (seg frameRefIterator) Ok() bool {
	return seg.node != nil
}

// Range returns the iterated segment's range key.
func (seg frameRefIterator) Range() __generics_imported0.FileRange {
	return seg.node.keys[seg.index]
}

// Start is equivalent to Range().Start, but should be preferred if only the
// start of the range is needed.
func (seg frameRefIterator) Start() uint64 {
	return seg.node.keys[seg.index].Start
}

// End is equivalent to Range().End, but should be preferred if only the end of
// the range is needed.
func (seg frameRefIterator) End() uint64 {
	return seg.node.keys[seg.index].End
}

// SetRangeUnchecked mutates the iterated segment's range key. This operation
// does not invalidate any iterators.
//
// Preconditions:
//
// - r.Length() > 0.
//
// - The new range must not overlap an existing one: If seg.NextSegment().Ok(),
// then r.End <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then
// r.Start >= seg.PrevSegment().End().
func (seg frameRefIterator) SetRangeUnchecked(r __generics_imported0.FileRange) {
	seg.node.keys[seg.index] = r
}

// SetRange mutates the iterated segment's range key. If the new range would
// cause the iterated segment to overlap another segment, or if the new range
// is invalid, SetRange panics. This operation does not invalidate any
// iterators.
func (seg frameRefIterator) SetRange(r __generics_imported0.FileRange) {
	if r.Length() <= 0 {
		panic(fmt.Sprintf("invalid segment range %v", r))
	}
	// Only the immediate neighbors need checking.
	if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() {
		panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range()))
	}
	if next := seg.NextSegment(); next.Ok() && r.End > next.Start() {
		panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range()))
	}
	seg.SetRangeUnchecked(r)
}

// SetStartUnchecked mutates the iterated segment's start. This operation does
// not invalidate any iterators.
//
// Preconditions: The new start must be valid: start < seg.End(); if
// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End().
func (seg frameRefIterator) SetStartUnchecked(start uint64) {
	seg.node.keys[seg.index].Start = start
}

// SetStart mutates the iterated segment's start. If the new start value would
// cause the iterated segment to overlap another segment, or would result in an
// invalid range, SetStart panics. This operation does not invalidate any
// iterators.
func (seg frameRefIterator) SetStart(start uint64) {
	if start >= seg.End() {
		panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range()))
	}
	if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() {
		panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range()))
	}
	seg.SetStartUnchecked(start)
}

// SetEndUnchecked mutates the iterated segment's end. This operation does not
// invalidate any iterators.
//
// Preconditions: The new end must be valid: end > seg.Start(); if
// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start().
func (seg frameRefIterator) SetEndUnchecked(end uint64) {
	seg.node.keys[seg.index].End = end
}

// SetEnd mutates the iterated segment's end. If the new end value would cause
// the iterated segment to overlap another segment, or would result in an
// invalid range, SetEnd panics. This operation does not invalidate any
// iterators.
+func (seg frameRefIterator) SetEnd(end uint64) { + if end <= seg.Start() { + panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) + } + if next := seg.NextSegment(); next.Ok() && end > next.Start() { + panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) + } + seg.SetEndUnchecked(end) +} + +// Value returns a copy of the iterated segment's value. +func (seg frameRefIterator) Value() uint64 { + return seg.node.values[seg.index] +} + +// ValuePtr returns a pointer to the iterated segment's value. The pointer is +// invalidated if the iterator is invalidated. This operation does not +// invalidate any iterators. +func (seg frameRefIterator) ValuePtr() *uint64 { + return &seg.node.values[seg.index] +} + +// SetValue mutates the iterated segment's value. This operation does not +// invalidate any iterators. +func (seg frameRefIterator) SetValue(val uint64) { + seg.node.values[seg.index] = val +} + +// PrevSegment returns the iterated segment's predecessor. If there is no +// preceding segment, PrevSegment returns a terminal iterator. +func (seg frameRefIterator) PrevSegment() frameRefIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index].lastSegment() + } + if seg.index > 0 { + return frameRefIterator{seg.node, seg.index - 1} + } + if seg.node.parent == nil { + return frameRefIterator{} + } + return frameRefsegmentBeforePosition(seg.node.parent, seg.node.parentIndex) +} + +// NextSegment returns the iterated segment's successor. If there is no +// succeeding segment, NextSegment returns a terminal iterator. 
+func (seg frameRefIterator) NextSegment() frameRefIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment() + } + if seg.index < seg.node.nrSegments-1 { + return frameRefIterator{seg.node, seg.index + 1} + } + if seg.node.parent == nil { + return frameRefIterator{} + } + return frameRefsegmentAfterPosition(seg.node.parent, seg.node.parentIndex) +} + +// PrevGap returns the gap immediately before the iterated segment. +func (seg frameRefIterator) PrevGap() frameRefGapIterator { + if seg.node.hasChildren { + + return seg.node.children[seg.index].lastSegment().NextGap() + } + return frameRefGapIterator{seg.node, seg.index} +} + +// NextGap returns the gap immediately after the iterated segment. +func (seg frameRefIterator) NextGap() frameRefGapIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment().PrevGap() + } + return frameRefGapIterator{seg.node, seg.index + 1} +} + +// PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, +// or the gap before the iterated segment otherwise. If seg.Start() == +// Functions.MinKey(), PrevNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by PrevNonEmpty will be +// non-terminal. +func (seg frameRefIterator) PrevNonEmpty() (frameRefIterator, frameRefGapIterator) { + gap := seg.PrevGap() + if gap.Range().Length() != 0 { + return frameRefIterator{}, gap + } + return gap.PrevSegment(), frameRefGapIterator{} +} + +// NextNonEmpty returns the iterated segment's successor if it is adjacent, or +// the gap after the iterated segment otherwise. If seg.End() == +// Functions.MaxKey(), NextNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by NextNonEmpty will be +// non-terminal. 
+func (seg frameRefIterator) NextNonEmpty() (frameRefIterator, frameRefGapIterator) { + gap := seg.NextGap() + if gap.Range().Length() != 0 { + return frameRefIterator{}, gap + } + return gap.NextSegment(), frameRefGapIterator{} +} + +// A GapIterator is conceptually one of: +// +// - A pointer to a position between two segments, before the first segment, or +// after the last segment in a set, called a *gap*; or +// +// - A terminal iterator, which is a sentinel indicating that the end of +// iteration has been reached. +// +// Note that the gap between two adjacent segments exists (iterators to it are +// non-terminal), but has a length of zero. GapIterator.IsEmpty returns true +// for such gaps. An empty set contains a single gap, spanning the entire range +// of the set's keys. +// +// GapIterators are copyable values and are meaningfully equality-comparable. +// The zero value of GapIterator is a terminal iterator. +// +// Unless otherwise specified, any mutation of a set invalidates all existing +// iterators into the set. +type frameRefGapIterator struct { + // The representation of a GapIterator is identical to that of an Iterator, + // except that index corresponds to positions between segments in the same + // way as for node.children (see comment for node.nrSegments). + node *frameRefnode + index int +} + +// Ok returns true if the iterator is not terminal. All other methods are only +// valid for non-terminal iterators. +func (gap frameRefGapIterator) Ok() bool { + return gap.node != nil +} + +// Range returns the range spanned by the iterated gap. +func (gap frameRefGapIterator) Range() __generics_imported0.FileRange { + return __generics_imported0.FileRange{gap.Start(), gap.End()} +} + +// Start is equivalent to Range().Start, but should be preferred if only the +// start of the range is needed. 
+func (gap frameRefGapIterator) Start() uint64 { + if ps := gap.PrevSegment(); ps.Ok() { + return ps.End() + } + return frameRefSetFunctions{}.MinKey() +} + +// End is equivalent to Range().End, but should be preferred if only the end of +// the range is needed. +func (gap frameRefGapIterator) End() uint64 { + if ns := gap.NextSegment(); ns.Ok() { + return ns.Start() + } + return frameRefSetFunctions{}.MaxKey() +} + +// IsEmpty returns true if the iterated gap is empty (that is, the "gap" is +// between two adjacent segments.) +func (gap frameRefGapIterator) IsEmpty() bool { + return gap.Range().Length() == 0 +} + +// PrevSegment returns the segment immediately before the iterated gap. If no +// such segment exists, PrevSegment returns a terminal iterator. +func (gap frameRefGapIterator) PrevSegment() frameRefIterator { + return frameRefsegmentBeforePosition(gap.node, gap.index) +} + +// NextSegment returns the segment immediately after the iterated gap. If no +// such segment exists, NextSegment returns a terminal iterator. +func (gap frameRefGapIterator) NextSegment() frameRefIterator { + return frameRefsegmentAfterPosition(gap.node, gap.index) +} + +// PrevGap returns the iterated gap's predecessor. If no such gap exists, +// PrevGap returns a terminal iterator. +func (gap frameRefGapIterator) PrevGap() frameRefGapIterator { + seg := gap.PrevSegment() + if !seg.Ok() { + return frameRefGapIterator{} + } + return seg.PrevGap() +} + +// NextGap returns the iterated gap's successor. If no such gap exists, NextGap +// returns a terminal iterator. +func (gap frameRefGapIterator) NextGap() frameRefGapIterator { + seg := gap.NextSegment() + if !seg.Ok() { + return frameRefGapIterator{} + } + return seg.NextGap() +} + +// segmentBeforePosition returns the predecessor segment of the position given +// by n.children[i], which may or may not contain a child. If no such segment +// exists, segmentBeforePosition returns a terminal iterator. 
+func frameRefsegmentBeforePosition(n *frameRefnode, i int) frameRefIterator { + for i == 0 { + if n.parent == nil { + return frameRefIterator{} + } + n, i = n.parent, n.parentIndex + } + return frameRefIterator{n, i - 1} +} + +// segmentAfterPosition returns the successor segment of the position given by +// n.children[i], which may or may not contain a child. If no such segment +// exists, segmentAfterPosition returns a terminal iterator. +func frameRefsegmentAfterPosition(n *frameRefnode, i int) frameRefIterator { + for i == n.nrSegments { + if n.parent == nil { + return frameRefIterator{} + } + n, i = n.parent, n.parentIndex + } + return frameRefIterator{n, i} +} + +func frameRefzeroValueSlice(slice []uint64) { + + for i := range slice { + frameRefSetFunctions{}.ClearValue(&slice[i]) + } +} + +func frameRefzeroNodeSlice(slice []*frameRefnode) { + for i := range slice { + slice[i] = nil + } +} + +// String stringifies a Set for debugging. +func (s *frameRefSet) String() string { + return s.root.String() +} + +// String stringifes a node (and all of its children) for debugging. 
+func (n *frameRefnode) String() string { + var buf bytes.Buffer + n.writeDebugString(&buf, "") + return buf.String() +} + +func (n *frameRefnode) writeDebugString(buf *bytes.Buffer, prefix string) { + if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) { + buf.WriteString(prefix) + buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren)) + } + for i := 0; i < n.nrSegments; i++ { + if child := n.children[i]; child != nil { + cprefix := fmt.Sprintf("%s- % 3d ", prefix, i) + if child.parent != n || child.parentIndex != i { + buf.WriteString(cprefix) + buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i)) + } + child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i)) + } + buf.WriteString(prefix) + buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i])) + } + if child := n.children[n.nrSegments]; child != nil { + child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments)) + } +} + +// SegmentDataSlices represents segments from a set as slices of start, end, and +// values. SegmentDataSlices is primarily used as an intermediate representation +// for save/restore and the layout here is optimized for that. +// +// +stateify savable +type frameRefSegmentDataSlices struct { + Start []uint64 + End []uint64 + Values []uint64 +} + +// ExportSortedSlice returns a copy of all segments in the given set, in ascending +// key order. 
+func (s *frameRefSet) ExportSortedSlices() *frameRefSegmentDataSlices { + var sds frameRefSegmentDataSlices + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sds.Start = append(sds.Start, seg.Start()) + sds.End = append(sds.End, seg.End()) + sds.Values = append(sds.Values, seg.Value()) + } + sds.Start = sds.Start[:len(sds.Start):len(sds.Start)] + sds.End = sds.End[:len(sds.End):len(sds.End)] + sds.Values = sds.Values[:len(sds.Values):len(sds.Values)] + return &sds +} + +// ImportSortedSlice initializes the given set from the given slice. +// +// Preconditions: s must be empty. sds must represent a valid set (the segments +// in sds must have valid lengths that do not overlap). The segments in sds +// must be sorted in ascending key order. +func (s *frameRefSet) ImportSortedSlices(sds *frameRefSegmentDataSlices) error { + if !s.IsEmpty() { + return fmt.Errorf("cannot import into non-empty set %v", s) + } + gap := s.FirstGap() + for i := range sds.Start { + r := __generics_imported0.FileRange{sds.Start[i], sds.End[i]} + if !gap.Range().IsSupersetOf(r) { + return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i]) + } + gap = s.InsertWithoutMerging(gap, r, sds.Values[i]).NextGap() + } + return nil +} +func (s *frameRefSet) saveRoot() *frameRefSegmentDataSlices { + return s.ExportSortedSlices() +} + +func (s *frameRefSet) loadRoot(sds *frameRefSegmentDataSlices) { + if err := s.ImportSortedSlices(sds); err != nil { + panic(err) + } +} diff --git a/pkg/sentry/fs/fsutil/fsutil.go b/pkg/sentry/fs/fsutil/fsutil.go new file mode 100644 index 000000000..c9587b1d9 --- /dev/null +++ b/pkg/sentry/fs/fsutil/fsutil.go @@ -0,0 +1,24 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fsutil provides utilities for implementing fs.InodeOperations +// and fs.FileOperations: +// +// - For embeddable utilities, see inode.go and file.go. +// +// - For fs.Inodes that require a page cache to be memory mapped, see +// inode_cached.go. +// +// - For anon fs.Inodes, see anon.go. +package fsutil diff --git a/pkg/sentry/fs/fsutil/fsutil_state_autogen.go b/pkg/sentry/fs/fsutil/fsutil_state_autogen.go new file mode 100755 index 000000000..5783b151d --- /dev/null +++ b/pkg/sentry/fs/fsutil/fsutil_state_autogen.go @@ -0,0 +1,349 @@ +// automatically generated by stateify.
+ +package fsutil + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" +) + +func (x *DirtyInfo) beforeSave() {} +func (x *DirtyInfo) save(m state.Map) { + x.beforeSave() + m.Save("Keep", &x.Keep) +} + +func (x *DirtyInfo) afterLoad() {} +func (x *DirtyInfo) load(m state.Map) { + m.Load("Keep", &x.Keep) +} + +func (x *DirtySet) beforeSave() {} +func (x *DirtySet) save(m state.Map) { + x.beforeSave() + var root *DirtySegmentDataSlices = x.saveRoot() + m.SaveValue("root", root) +} + +func (x *DirtySet) afterLoad() {} +func (x *DirtySet) load(m state.Map) { + m.LoadValue("root", new(*DirtySegmentDataSlices), func(y interface{}) { x.loadRoot(y.(*DirtySegmentDataSlices)) }) +} + +func (x *Dirtynode) beforeSave() {} +func (x *Dirtynode) save(m state.Map) { + x.beforeSave() + m.Save("nrSegments", &x.nrSegments) + m.Save("parent", &x.parent) + m.Save("parentIndex", &x.parentIndex) + m.Save("hasChildren", &x.hasChildren) + m.Save("keys", &x.keys) + m.Save("values", &x.values) + m.Save("children", &x.children) +} + +func (x *Dirtynode) afterLoad() {} +func (x *Dirtynode) load(m state.Map) { + m.Load("nrSegments", &x.nrSegments) + m.Load("parent", &x.parent) + m.Load("parentIndex", &x.parentIndex) + m.Load("hasChildren", &x.hasChildren) + m.Load("keys", &x.keys) + m.Load("values", &x.values) + m.Load("children", &x.children) +} + +func (x *DirtySegmentDataSlices) beforeSave() {} +func (x *DirtySegmentDataSlices) save(m state.Map) { + x.beforeSave() + m.Save("Start", &x.Start) + m.Save("End", &x.End) + m.Save("Values", &x.Values) +} + +func (x *DirtySegmentDataSlices) afterLoad() {} +func (x *DirtySegmentDataSlices) load(m state.Map) { + m.Load("Start", &x.Start) + m.Load("End", &x.End) + m.Load("Values", &x.Values) +} + +func (x *StaticDirFileOperations) beforeSave() {} +func (x *StaticDirFileOperations) save(m state.Map) { + x.beforeSave() + m.Save("dentryMap", &x.dentryMap) + m.Save("dirCursor", &x.dirCursor) +} + +func (x *StaticDirFileOperations) afterLoad() {} +func 
(x *StaticDirFileOperations) load(m state.Map) { + m.Load("dentryMap", &x.dentryMap) + m.Load("dirCursor", &x.dirCursor) +} + +func (x *NoReadWriteFile) beforeSave() {} +func (x *NoReadWriteFile) save(m state.Map) { + x.beforeSave() +} + +func (x *NoReadWriteFile) afterLoad() {} +func (x *NoReadWriteFile) load(m state.Map) { +} + +func (x *FileStaticContentReader) beforeSave() {} +func (x *FileStaticContentReader) save(m state.Map) { + x.beforeSave() + m.Save("content", &x.content) +} + +func (x *FileStaticContentReader) afterLoad() {} +func (x *FileStaticContentReader) load(m state.Map) { + m.Load("content", &x.content) +} + +func (x *FileRangeSet) beforeSave() {} +func (x *FileRangeSet) save(m state.Map) { + x.beforeSave() + var root *FileRangeSegmentDataSlices = x.saveRoot() + m.SaveValue("root", root) +} + +func (x *FileRangeSet) afterLoad() {} +func (x *FileRangeSet) load(m state.Map) { + m.LoadValue("root", new(*FileRangeSegmentDataSlices), func(y interface{}) { x.loadRoot(y.(*FileRangeSegmentDataSlices)) }) +} + +func (x *FileRangenode) beforeSave() {} +func (x *FileRangenode) save(m state.Map) { + x.beforeSave() + m.Save("nrSegments", &x.nrSegments) + m.Save("parent", &x.parent) + m.Save("parentIndex", &x.parentIndex) + m.Save("hasChildren", &x.hasChildren) + m.Save("keys", &x.keys) + m.Save("values", &x.values) + m.Save("children", &x.children) +} + +func (x *FileRangenode) afterLoad() {} +func (x *FileRangenode) load(m state.Map) { + m.Load("nrSegments", &x.nrSegments) + m.Load("parent", &x.parent) + m.Load("parentIndex", &x.parentIndex) + m.Load("hasChildren", &x.hasChildren) + m.Load("keys", &x.keys) + m.Load("values", &x.values) + m.Load("children", &x.children) +} + +func (x *FileRangeSegmentDataSlices) beforeSave() {} +func (x *FileRangeSegmentDataSlices) save(m state.Map) { + x.beforeSave() + m.Save("Start", &x.Start) + m.Save("End", &x.End) + m.Save("Values", &x.Values) +} + +func (x *FileRangeSegmentDataSlices) afterLoad() {} +func (x 
*FileRangeSegmentDataSlices) load(m state.Map) { + m.Load("Start", &x.Start) + m.Load("End", &x.End) + m.Load("Values", &x.Values) +} + +func (x *frameRefSet) beforeSave() {} +func (x *frameRefSet) save(m state.Map) { + x.beforeSave() + var root *frameRefSegmentDataSlices = x.saveRoot() + m.SaveValue("root", root) +} + +func (x *frameRefSet) afterLoad() {} +func (x *frameRefSet) load(m state.Map) { + m.LoadValue("root", new(*frameRefSegmentDataSlices), func(y interface{}) { x.loadRoot(y.(*frameRefSegmentDataSlices)) }) +} + +func (x *frameRefnode) beforeSave() {} +func (x *frameRefnode) save(m state.Map) { + x.beforeSave() + m.Save("nrSegments", &x.nrSegments) + m.Save("parent", &x.parent) + m.Save("parentIndex", &x.parentIndex) + m.Save("hasChildren", &x.hasChildren) + m.Save("keys", &x.keys) + m.Save("values", &x.values) + m.Save("children", &x.children) +} + +func (x *frameRefnode) afterLoad() {} +func (x *frameRefnode) load(m state.Map) { + m.Load("nrSegments", &x.nrSegments) + m.Load("parent", &x.parent) + m.Load("parentIndex", &x.parentIndex) + m.Load("hasChildren", &x.hasChildren) + m.Load("keys", &x.keys) + m.Load("values", &x.values) + m.Load("children", &x.children) +} + +func (x *frameRefSegmentDataSlices) beforeSave() {} +func (x *frameRefSegmentDataSlices) save(m state.Map) { + x.beforeSave() + m.Save("Start", &x.Start) + m.Save("End", &x.End) + m.Save("Values", &x.Values) +} + +func (x *frameRefSegmentDataSlices) afterLoad() {} +func (x *frameRefSegmentDataSlices) load(m state.Map) { + m.Load("Start", &x.Start) + m.Load("End", &x.End) + m.Load("Values", &x.Values) +} + +func (x *HostFileMapper) beforeSave() {} +func (x *HostFileMapper) save(m state.Map) { + x.beforeSave() + m.Save("refs", &x.refs) +} + +func (x *HostFileMapper) load(m state.Map) { + m.Load("refs", &x.refs) + m.AfterLoad(x.afterLoad) +} + +func (x *HostMappable) beforeSave() {} +func (x *HostMappable) save(m state.Map) { + x.beforeSave() + m.Save("hostFileMapper", &x.hostFileMapper) + 
m.Save("backingFile", &x.backingFile) + m.Save("mappings", &x.mappings) +} + +func (x *HostMappable) afterLoad() {} +func (x *HostMappable) load(m state.Map) { + m.Load("hostFileMapper", &x.hostFileMapper) + m.Load("backingFile", &x.backingFile) + m.Load("mappings", &x.mappings) +} + +func (x *SimpleFileInode) beforeSave() {} +func (x *SimpleFileInode) save(m state.Map) { + x.beforeSave() + m.Save("InodeSimpleAttributes", &x.InodeSimpleAttributes) +} + +func (x *SimpleFileInode) afterLoad() {} +func (x *SimpleFileInode) load(m state.Map) { + m.Load("InodeSimpleAttributes", &x.InodeSimpleAttributes) +} + +func (x *NoReadWriteFileInode) beforeSave() {} +func (x *NoReadWriteFileInode) save(m state.Map) { + x.beforeSave() + m.Save("InodeSimpleAttributes", &x.InodeSimpleAttributes) +} + +func (x *NoReadWriteFileInode) afterLoad() {} +func (x *NoReadWriteFileInode) load(m state.Map) { + m.Load("InodeSimpleAttributes", &x.InodeSimpleAttributes) +} + +func (x *InodeSimpleAttributes) beforeSave() {} +func (x *InodeSimpleAttributes) save(m state.Map) { + x.beforeSave() + m.Save("fsType", &x.fsType) + m.Save("unstable", &x.unstable) +} + +func (x *InodeSimpleAttributes) afterLoad() {} +func (x *InodeSimpleAttributes) load(m state.Map) { + m.Load("fsType", &x.fsType) + m.Load("unstable", &x.unstable) +} + +func (x *InodeSimpleExtendedAttributes) beforeSave() {} +func (x *InodeSimpleExtendedAttributes) save(m state.Map) { + x.beforeSave() + m.Save("xattrs", &x.xattrs) +} + +func (x *InodeSimpleExtendedAttributes) afterLoad() {} +func (x *InodeSimpleExtendedAttributes) load(m state.Map) { + m.Load("xattrs", &x.xattrs) +} + +func (x *staticFile) beforeSave() {} +func (x *staticFile) save(m state.Map) { + x.beforeSave() + m.Save("FileStaticContentReader", &x.FileStaticContentReader) +} + +func (x *staticFile) afterLoad() {} +func (x *staticFile) load(m state.Map) { + m.Load("FileStaticContentReader", &x.FileStaticContentReader) +} + +func (x *InodeStaticFileGetter) beforeSave() {} 
+func (x *InodeStaticFileGetter) save(m state.Map) { + x.beforeSave() + m.Save("Contents", &x.Contents) +} + +func (x *InodeStaticFileGetter) afterLoad() {} +func (x *InodeStaticFileGetter) load(m state.Map) { + m.Load("Contents", &x.Contents) +} + +func (x *CachingInodeOperations) beforeSave() {} +func (x *CachingInodeOperations) save(m state.Map) { + x.beforeSave() + m.Save("backingFile", &x.backingFile) + m.Save("mfp", &x.mfp) + m.Save("forcePageCache", &x.forcePageCache) + m.Save("attr", &x.attr) + m.Save("dirtyAttr", &x.dirtyAttr) + m.Save("mappings", &x.mappings) + m.Save("cache", &x.cache) + m.Save("dirty", &x.dirty) + m.Save("hostFileMapper", &x.hostFileMapper) + m.Save("refs", &x.refs) +} + +func (x *CachingInodeOperations) afterLoad() {} +func (x *CachingInodeOperations) load(m state.Map) { + m.Load("backingFile", &x.backingFile) + m.Load("mfp", &x.mfp) + m.Load("forcePageCache", &x.forcePageCache) + m.Load("attr", &x.attr) + m.Load("dirtyAttr", &x.dirtyAttr) + m.Load("mappings", &x.mappings) + m.Load("cache", &x.cache) + m.Load("dirty", &x.dirty) + m.Load("hostFileMapper", &x.hostFileMapper) + m.Load("refs", &x.refs) +} + +func init() { + state.Register("fsutil.DirtyInfo", (*DirtyInfo)(nil), state.Fns{Save: (*DirtyInfo).save, Load: (*DirtyInfo).load}) + state.Register("fsutil.DirtySet", (*DirtySet)(nil), state.Fns{Save: (*DirtySet).save, Load: (*DirtySet).load}) + state.Register("fsutil.Dirtynode", (*Dirtynode)(nil), state.Fns{Save: (*Dirtynode).save, Load: (*Dirtynode).load}) + state.Register("fsutil.DirtySegmentDataSlices", (*DirtySegmentDataSlices)(nil), state.Fns{Save: (*DirtySegmentDataSlices).save, Load: (*DirtySegmentDataSlices).load}) + state.Register("fsutil.StaticDirFileOperations", (*StaticDirFileOperations)(nil), state.Fns{Save: (*StaticDirFileOperations).save, Load: (*StaticDirFileOperations).load}) + state.Register("fsutil.NoReadWriteFile", (*NoReadWriteFile)(nil), state.Fns{Save: (*NoReadWriteFile).save, Load: (*NoReadWriteFile).load}) + 
state.Register("fsutil.FileStaticContentReader", (*FileStaticContentReader)(nil), state.Fns{Save: (*FileStaticContentReader).save, Load: (*FileStaticContentReader).load}) + state.Register("fsutil.FileRangeSet", (*FileRangeSet)(nil), state.Fns{Save: (*FileRangeSet).save, Load: (*FileRangeSet).load}) + state.Register("fsutil.FileRangenode", (*FileRangenode)(nil), state.Fns{Save: (*FileRangenode).save, Load: (*FileRangenode).load}) + state.Register("fsutil.FileRangeSegmentDataSlices", (*FileRangeSegmentDataSlices)(nil), state.Fns{Save: (*FileRangeSegmentDataSlices).save, Load: (*FileRangeSegmentDataSlices).load}) + state.Register("fsutil.frameRefSet", (*frameRefSet)(nil), state.Fns{Save: (*frameRefSet).save, Load: (*frameRefSet).load}) + state.Register("fsutil.frameRefnode", (*frameRefnode)(nil), state.Fns{Save: (*frameRefnode).save, Load: (*frameRefnode).load}) + state.Register("fsutil.frameRefSegmentDataSlices", (*frameRefSegmentDataSlices)(nil), state.Fns{Save: (*frameRefSegmentDataSlices).save, Load: (*frameRefSegmentDataSlices).load}) + state.Register("fsutil.HostFileMapper", (*HostFileMapper)(nil), state.Fns{Save: (*HostFileMapper).save, Load: (*HostFileMapper).load}) + state.Register("fsutil.HostMappable", (*HostMappable)(nil), state.Fns{Save: (*HostMappable).save, Load: (*HostMappable).load}) + state.Register("fsutil.SimpleFileInode", (*SimpleFileInode)(nil), state.Fns{Save: (*SimpleFileInode).save, Load: (*SimpleFileInode).load}) + state.Register("fsutil.NoReadWriteFileInode", (*NoReadWriteFileInode)(nil), state.Fns{Save: (*NoReadWriteFileInode).save, Load: (*NoReadWriteFileInode).load}) + state.Register("fsutil.InodeSimpleAttributes", (*InodeSimpleAttributes)(nil), state.Fns{Save: (*InodeSimpleAttributes).save, Load: (*InodeSimpleAttributes).load}) + state.Register("fsutil.InodeSimpleExtendedAttributes", (*InodeSimpleExtendedAttributes)(nil), state.Fns{Save: (*InodeSimpleExtendedAttributes).save, Load: (*InodeSimpleExtendedAttributes).load}) + 
state.Register("fsutil.staticFile", (*staticFile)(nil), state.Fns{Save: (*staticFile).save, Load: (*staticFile).load}) + state.Register("fsutil.InodeStaticFileGetter", (*InodeStaticFileGetter)(nil), state.Fns{Save: (*InodeStaticFileGetter).save, Load: (*InodeStaticFileGetter).load}) + state.Register("fsutil.CachingInodeOperations", (*CachingInodeOperations)(nil), state.Fns{Save: (*CachingInodeOperations).save, Load: (*CachingInodeOperations).load}) +} diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go new file mode 100644 index 000000000..2bdfc0db6 --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -0,0 +1,211 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "fmt" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// HostFileMapper caches mappings of an arbitrary host file descriptor. It is +// used by implementations of memmap.Mappable that represent a host file +// descriptor. 
+// +// +stateify savable +type HostFileMapper struct { + // HostFileMapper conceptually breaks the file into pieces called chunks, + // of size and alignment chunkSize, and caches mappings of the file on a + // chunk granularity. + + refsMu sync.Mutex `state:"nosave"` + + // refs maps chunk start offsets to the sum of reference counts for all + // pages in that chunk. refs is protected by refsMu. + refs map[uint64]int32 + + mapsMu sync.Mutex `state:"nosave"` + + // mappings maps chunk start offsets to mappings of those chunks, + // obtained via the mmap syscall. mappings is protected by mapsMu. It is + // not saved. + mappings map[uint64]mapping `state:"nosave"` +} + +// chunkShift, chunkSize and chunkMask define the chunk granularity used by +// HostFileMapper: a chunk is 1 << usermem.HugePageShift bytes, and chunkMask +// selects the offset within a chunk. +const ( + chunkShift = usermem.HugePageShift + chunkSize = 1 << chunkShift + chunkMask = chunkSize - 1 +) + +// pagesInChunk returns the number of pages of mr that lie within the chunk +// starting at chunkStart. +func pagesInChunk(mr memmap.MappableRange, chunkStart uint64) int32 { + return int32(mr.Intersect(memmap.MappableRange{chunkStart, chunkStart + chunkSize}).Length() / usermem.PageSize) +} + +// mapping describes the cached mapping of one chunk: addr is the address +// returned by mmap, and writable records whether the chunk was mapped with +// write permission. +type mapping struct { + addr uintptr + writable bool +} + +// NewHostFileMapper returns a HostFileMapper with no references or cached +// mappings. +func NewHostFileMapper() *HostFileMapper { + return &HostFileMapper{ + refs: make(map[uint64]int32), + mappings: make(map[uint64]mapping), + } +} + +// IncRefOn increments the reference count on all offsets in mr. +// +// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned. +func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { + f.refsMu.Lock() + defer f.refsMu.Unlock() + for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize { + refs := f.refs[chunkStart] + pgs := pagesInChunk(mr, chunkStart) + if refs+pgs < refs { + // Would overflow. + panic(fmt.Sprintf("HostFileMapper.IncRefOn(%v): adding %d page references to chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) + } + f.refs[chunkStart] = refs + pgs + } +} + +// DecRefOn decrements the reference count on all offsets in mr.
+// +// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned. +func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { + f.refsMu.Lock() + defer f.refsMu.Unlock() + for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize { + refs := f.refs[chunkStart] + pgs := pagesInChunk(mr, chunkStart) + switch { + case refs > pgs: + f.refs[chunkStart] = refs - pgs + case refs == pgs: + // Last page references on this chunk: drop its refs entry and + // unmap any cached mapping of it. + f.mapsMu.Lock() + delete(f.refs, chunkStart) + if m, ok := f.mappings[chunkStart]; ok { + f.unmapAndRemoveLocked(chunkStart, m) + } + f.mapsMu.Unlock() + case refs < pgs: + panic(fmt.Sprintf("HostFileMapper.DecRefOn(%v): removing %d page references from chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) + } + } +} + +// MapInternal returns a mapping of offsets in fr from fd. The returned +// safemem.BlockSeq is valid as long as at least one reference is held on all +// offsets in fr or until the next call to UnmapAll. +// +// Preconditions: The caller must hold a reference on all offsets in fr. +func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) (safemem.BlockSeq, error) { + // chunks is the number of chunks that fr intersects. + chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) + f.mapsMu.Lock() + defer f.mapsMu.Unlock() + if chunks == 1 { + // Avoid an unnecessary slice allocation. + var seq safemem.BlockSeq + err := f.forEachMappingBlockLocked(fr, fd, write, func(b safemem.Block) { + seq = safemem.BlockSeqOf(b) + }) + return seq, err + } + blocks := make([]safemem.Block, 0, chunks) + err := f.forEachMappingBlockLocked(fr, fd, write, func(b safemem.Block) { + blocks = append(blocks, b) + }) + return safemem.BlockSeqFromSlice(blocks), err +} + +// forEachMappingBlockLocked calls fn with the block of each chunk mapping that +// intersects fr, in ascending offset order. Chunks that are not yet cached are +// mapped, and cached read-only chunks are remapped as writable if write is +// true. It stops at the first mmap failure and returns its errno. +// +// Preconditions: f.mapsMu must be locked.
+func (f *HostFileMapper) forEachMappingBlockLocked(fr platform.FileRange, fd int, write bool, fn func(safemem.Block)) error { + prot := syscall.PROT_READ + if write { + prot |= syscall.PROT_WRITE + } + for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { + m, ok := f.mappings[chunkStart] + if !ok { + addr, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + 0, + chunkSize, + uintptr(prot), + syscall.MAP_SHARED, + uintptr(fd), + uintptr(chunkStart)) + if errno != 0 { + return errno + } + m = mapping{addr, write} + f.mappings[chunkStart] = m + } else if write && !m.writable { + addr, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + m.addr, + chunkSize, + uintptr(prot), + syscall.MAP_SHARED|syscall.MAP_FIXED, + uintptr(fd), + uintptr(chunkStart)) + if errno != 0 { + return errno + } + m = mapping{addr, write} + f.mappings[chunkStart] = m + } + var startOff uint64 + if chunkStart < fr.Start { + startOff = fr.Start - chunkStart + } + endOff := uint64(chunkSize) + if chunkStart+chunkSize > fr.End { + endOff = fr.End - chunkStart + } + fn(f.unsafeBlockFromChunkMapping(m.addr).TakeFirst64(endOff).DropFirst64(startOff)) + } + return nil +} + +// UnmapAll unmaps all cached mappings. Callers are responsible for +// synchronization with mappings returned by previous calls to MapInternal. +func (f *HostFileMapper) UnmapAll() { + f.mapsMu.Lock() + defer f.mapsMu.Unlock() + for chunkStart, m := range f.mappings { + f.unmapAndRemoveLocked(chunkStart, m) + } +} + +// Preconditions: f.mapsMu must be locked. f.mappings[chunkStart] == m. +func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) { + if _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m.addr, chunkSize, 0); errno != 0 { + // This leaks address space and is unexpected, but is otherwise + // harmless, so complain but don't panic. 
+ log.Warningf("HostFileMapper: failed to unmap mapping %#x for chunk %#x: %v", m.addr, chunkStart, errno) + } + delete(f.mappings, chunkStart) +} diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_state.go b/pkg/sentry/fs/fsutil/host_file_mapper_state.go new file mode 100644 index 000000000..576d2a3df --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_file_mapper_state.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +// afterLoad is invoked by stateify. +func (f *HostFileMapper) afterLoad() { + f.mappings = make(map[uint64]mapping) +} diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go new file mode 100644 index 000000000..7167be263 --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go @@ -0,0 +1,27 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fsutil + +import ( + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" +) + +func (*HostFileMapper) unsafeBlockFromChunkMapping(addr uintptr) safemem.Block { + // We don't control the host file's length, so touching its mappings may + // raise SIGBUS. Thus accesses to it must use safecopy. + return safemem.BlockFromUnsafePointer((unsafe.Pointer)(addr), chunkSize) +} diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go new file mode 100644 index 000000000..ad0518b8f --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -0,0 +1,197 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "math" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// HostMappable implements memmap.Mappable and platform.File over a +// CachedFileObject. 
+// +// Lock order (compare the lock order model in mm/mm.go): +// truncateMu ("fs locks") +// mu ("memmap.Mappable locks not taken by Translate") +// ("platform.File locks") +// backingFile ("CachedFileObject locks") +// +// +stateify savable +type HostMappable struct { + // hostFileMapper caches mappings of backingFile's host FD and tracks + // references on its offsets. + hostFileMapper *HostFileMapper + + // backingFile is the file object whose host FD is mapped. + backingFile CachedFileObject + + // mu protects mappings. + mu sync.Mutex `state:"nosave"` + + // mappings tracks mappings of the cached file object into + // memmap.MappingSpaces so they can be invalidated upon save. Protected by + // mu. + mappings memmap.MappingSet + + // truncateMu protects writes and truncations. See Truncate() for details. + truncateMu sync.RWMutex `state:"nosave"` +} + +// NewHostMappable creates a new mappable that maps directly to the host FD of +// backingFile. +func NewHostMappable(backingFile CachedFileObject) *HostMappable { + return &HostMappable{ + hostFileMapper: NewHostFileMapper(), + backingFile: backingFile, + } +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (h *HostMappable) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + // Hot path. Avoid defers. + h.mu.Lock() + mapped := h.mappings.AddMapping(ms, ar, offset, writable) + for _, r := range mapped { + h.hostFileMapper.IncRefOn(r) + } + h.mu.Unlock() + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (h *HostMappable) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + // Hot path. Avoid defers. + h.mu.Lock() + unmapped := h.mappings.RemoveMapping(ms, ar, offset, writable) + for _, r := range unmapped { + h.hostFileMapper.DecRefOn(r) + } + h.mu.Unlock() +} + +// CopyMapping implements memmap.Mappable.CopyMapping.
+func (h *HostMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return h.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. It always returns a single +// translation that covers the whole optional range, is backed by h itself and +// permits any access. +func (h *HostMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + return []memmap.Translation{ + { + Source: optional, + File: h, + Offset: optional.Start, + Perms: usermem.AnyAccess, + }, + }, nil +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (h *HostMappable) InvalidateUnsavable(ctx context.Context) error { + h.mu.Lock() + h.mappings.InvalidateAll(memmap.InvalidateOpts{}) + h.mu.Unlock() + return nil +} + +// MapInternal implements platform.File.MapInternal. +func (h *HostMappable) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + return h.hostFileMapper.MapInternal(fr, h.backingFile.FD(), at.Write) +} + +// FD implements platform.File.FD. +func (h *HostMappable) FD() int { + return h.backingFile.FD() +} + +// IncRef implements platform.File.IncRef. +func (h *HostMappable) IncRef(fr platform.FileRange) { + mr := memmap.MappableRange{Start: fr.Start, End: fr.End} + h.hostFileMapper.IncRefOn(mr) +} + +// DecRef implements platform.File.DecRef. +func (h *HostMappable) DecRef(fr platform.FileRange) { + mr := memmap.MappableRange{Start: fr.Start, End: fr.End} + h.hostFileMapper.DecRefOn(mr) +} + +// Truncate truncates the file, invalidating any mapping that may have been +// removed after the size change. +// +// Truncation and writes are synchronized to prevent races where writes make the +// file grow between truncation and invalidation below: +// T1: Calls SetMaskedAttributes and stalls +// T2: Appends to file causing it to grow +// T2: Writes to mapped pages and COW happens +// T1: Continues and wrongly invalidates the page mapped in step above.
+func (h *HostMappable) Truncate(ctx context.Context, newSize int64) error { + h.truncateMu.Lock() + defer h.truncateMu.Unlock() + + mask := fs.AttrMask{Size: true} + attr := fs.UnstableAttr{Size: newSize} + if err := h.backingFile.SetMaskedAttributes(ctx, mask, attr); err != nil { + return err + } + + // Invalidate COW mappings that may exist beyond the new size in case the file + // is being shrunk. Other mappings don't need to be invalidated because + // Translate will just return identical mappings after invalidation anyway, + // and SIGBUS will be raised and handled when the mappings are touched. + // + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). + h.mu.Lock() + defer h.mu.Unlock() + mr := memmap.MappableRange{ + Start: fs.OffsetPageEnd(newSize), + End: fs.OffsetPageEnd(math.MaxInt64), + } + h.mappings.Invalidate(mr, memmap.InvalidateOpts{InvalidatePrivate: true}) + + return nil +} + +// Allocate reserves space in the backing file. It holds truncateMu for +// reading, so it does not run concurrently with Truncate. +func (h *HostMappable) Allocate(ctx context.Context, offset int64, length int64) error { + h.truncateMu.RLock() + err := h.backingFile.Allocate(ctx, offset, length) + h.truncateMu.RUnlock() + return err +} + +// Write writes to the file backing this mappable. It holds truncateMu for +// reading, so it does not run concurrently with Truncate. +func (h *HostMappable) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + h.truncateMu.RLock() + n, err := src.CopyInTo(ctx, &writer{ctx: ctx, hostMappable: h, off: offset}) + h.truncateMu.RUnlock() + return n, err +} + +// writer writes blocks to the backing file of hostMappable starting at offset +// off; off advances by the number of bytes written. +type writer struct { + ctx context.Context + hostMappable *HostMappable + off int64 +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (w *writer) WriteFromBlocks(src safemem.BlockSeq) (uint64, error) { + n, err := w.hostMappable.backingFile.WriteFromBlocksAt(w.ctx, src, uint64(w.off)) + w.off += int64(n) + return n, err +} diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go new file mode 100644 index 000000000..925887335 --- /dev/null +++ b/pkg/sentry/fs/fsutil/inode.go @@ -0,0 +1,503 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// SimpleFileInode is a simple implementation of InodeOperations. 
+// +// +stateify savable +type SimpleFileInode struct { + InodeGenericChecker `state:"nosave"` + InodeNoExtendedAttributes `state:"nosave"` + InodeNoopRelease `state:"nosave"` + InodeNoopWriteOut `state:"nosave"` + InodeNotAllocatable `state:"nosave"` + InodeNotDirectory `state:"nosave"` + InodeNotMappable `state:"nosave"` + InodeNotOpenable `state:"nosave"` + InodeNotSocket `state:"nosave"` + InodeNotSymlink `state:"nosave"` + InodeNotTruncatable `state:"nosave"` + InodeNotVirtual `state:"nosave"` + + InodeSimpleAttributes +} + +// NewSimpleFileInode returns a new SimpleFileInode. +func NewSimpleFileInode(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, typ uint64) *SimpleFileInode { + return &SimpleFileInode{ + InodeSimpleAttributes: NewInodeSimpleAttributes(ctx, owner, perms, typ), + } +} + +// NoReadWriteFileInode is an implementation of InodeOperations that supports +// opening files that are not readable or writeable. +// +// +stateify savable +type NoReadWriteFileInode struct { + InodeGenericChecker `state:"nosave"` + InodeNoExtendedAttributes `state:"nosave"` + InodeNoopRelease `state:"nosave"` + InodeNoopWriteOut `state:"nosave"` + InodeNotAllocatable `state:"nosave"` + InodeNotDirectory `state:"nosave"` + InodeNotMappable `state:"nosave"` + InodeNotSocket `state:"nosave"` + InodeNotSymlink `state:"nosave"` + InodeNotTruncatable `state:"nosave"` + InodeNotVirtual `state:"nosave"` + + InodeSimpleAttributes +} + +// NewNoReadWriteFileInode returns a new NoReadWriteFileInode. +func NewNoReadWriteFileInode(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, typ uint64) *NoReadWriteFileInode { + return &NoReadWriteFileInode{ + InodeSimpleAttributes: NewInodeSimpleAttributes(ctx, owner, perms, typ), + } +} + +// GetFile implements fs.InodeOperations.GetFile. 
+func (*NoReadWriteFileInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &NoReadWriteFile{}), nil +} + +// InodeSimpleAttributes implements methods for updating in-memory unstable +// attributes. +// +// +stateify savable +type InodeSimpleAttributes struct { + // fsType is the immutable filesystem type that will be returned by + // StatFS. + fsType uint64 + + // mu protects unstable. + mu sync.RWMutex `state:"nosave"` + unstable fs.UnstableAttr +} + +// NewInodeSimpleAttributes returns a new InodeSimpleAttributes with the given +// owner and permissions, and all timestamps set to the current time. +func NewInodeSimpleAttributes(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, typ uint64) InodeSimpleAttributes { + return NewInodeSimpleAttributesWithUnstable(fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: owner, + Perms: perms, + }), typ) +} + +// NewInodeSimpleAttributesWithUnstable returns a new InodeSimpleAttributes +// with the given unstable attributes. +func NewInodeSimpleAttributesWithUnstable(uattr fs.UnstableAttr, typ uint64) InodeSimpleAttributes { + return InodeSimpleAttributes{ + fsType: typ, + unstable: uattr, + } +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (i *InodeSimpleAttributes) UnstableAttr(ctx context.Context, _ *fs.Inode) (fs.UnstableAttr, error) { + i.mu.RLock() + u := i.unstable + i.mu.RUnlock() + return u, nil +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (i *InodeSimpleAttributes) SetPermissions(ctx context.Context, _ *fs.Inode, p fs.FilePermissions) bool { + i.mu.Lock() + i.unstable.SetPermissions(ctx, p) + i.mu.Unlock() + return true +} + +// SetOwner implements fs.InodeOperations.SetOwner. 
+func (i *InodeSimpleAttributes) SetOwner(ctx context.Context, _ *fs.Inode, owner fs.FileOwner) error { + i.mu.Lock() + i.unstable.SetOwner(ctx, owner) + i.mu.Unlock() + return nil +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (i *InodeSimpleAttributes) SetTimestamps(ctx context.Context, _ *fs.Inode, ts fs.TimeSpec) error { + i.mu.Lock() + i.unstable.SetTimestamps(ctx, ts) + i.mu.Unlock() + return nil +} + +// AddLink implements fs.InodeOperations.AddLink. +func (i *InodeSimpleAttributes) AddLink() { + i.mu.Lock() + i.unstable.Links++ + i.mu.Unlock() +} + +// DropLink implements fs.InodeOperations.DropLink. +func (i *InodeSimpleAttributes) DropLink() { + i.mu.Lock() + i.unstable.Links-- + i.mu.Unlock() +} + +// StatFS implements fs.InodeOperations.StatFS. +func (i *InodeSimpleAttributes) StatFS(context.Context) (fs.Info, error) { + if i.fsType == 0 { + return fs.Info{}, syserror.ENOSYS + } + return fs.Info{Type: i.fsType}, nil +} + +// NotifyAccess updates the access time. +func (i *InodeSimpleAttributes) NotifyAccess(ctx context.Context) { + i.mu.Lock() + i.unstable.AccessTime = ktime.NowFromContext(ctx) + i.mu.Unlock() +} + +// NotifyModification updates the modification time. +func (i *InodeSimpleAttributes) NotifyModification(ctx context.Context) { + i.mu.Lock() + i.unstable.ModificationTime = ktime.NowFromContext(ctx) + i.mu.Unlock() +} + +// NotifyStatusChange updates the status change time. +func (i *InodeSimpleAttributes) NotifyStatusChange(ctx context.Context) { + i.mu.Lock() + i.unstable.StatusChangeTime = ktime.NowFromContext(ctx) + i.mu.Unlock() +} + +// NotifyModificationAndStatusChange updates the modification and status change +// times. 
+func (i *InodeSimpleAttributes) NotifyModificationAndStatusChange(ctx context.Context) { + i.mu.Lock() + now := ktime.NowFromContext(ctx) + i.unstable.ModificationTime = now + i.unstable.StatusChangeTime = now + i.mu.Unlock() +} + +// InodeSimpleExtendedAttributes implements +// fs.InodeOperations.{Get,Set,List}xattr. +// +// +stateify savable +type InodeSimpleExtendedAttributes struct { + // mu protects xattrs. + mu sync.RWMutex `state:"nosave"` + xattrs map[string]string +} + +// Getxattr implements fs.InodeOperations.Getxattr. +func (i *InodeSimpleExtendedAttributes) Getxattr(_ *fs.Inode, name string) (string, error) { + i.mu.RLock() + value, ok := i.xattrs[name] + i.mu.RUnlock() + if !ok { + return "", syserror.ENOATTR + } + return value, nil +} + +// Setxattr implements fs.InodeOperations.Setxattr. +func (i *InodeSimpleExtendedAttributes) Setxattr(_ *fs.Inode, name, value string) error { + i.mu.Lock() + if i.xattrs == nil { + i.xattrs = make(map[string]string) + } + i.xattrs[name] = value + i.mu.Unlock() + return nil +} + +// Listxattr implements fs.InodeOperations.Listxattr. +func (i *InodeSimpleExtendedAttributes) Listxattr(_ *fs.Inode) (map[string]struct{}, error) { + i.mu.RLock() + names := make(map[string]struct{}, len(i.xattrs)) + for name := range i.xattrs { + names[name] = struct{}{} + } + i.mu.RUnlock() + return names, nil +} + +// staticFile is a file with static contents. It is returned by +// InodeStaticFileGetter.GetFile. +// +// +stateify savable +type staticFile struct { + FileGenericSeek `state:"nosave"` + FileNoIoctl `state:"nosave"` + FileNoMMap `state:"nosave"` + FileNoSplice `state:"nosave"` + FileNoopFsync `state:"nosave"` + FileNoopFlush `state:"nosave"` + FileNoopRelease `state:"nosave"` + FileNoopWrite `state:"nosave"` + FileNotDirReaddir `state:"nosave"` + FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + + FileStaticContentReader +} + +// InodeNoStatFS implement StatFS by retuning ENOSYS. 
type InodeNoStatFS struct{}

// StatFS implements fs.InodeOperations.StatFS. It always returns an empty
// fs.Info together with ENOSYS.
func (InodeNoStatFS) StatFS(context.Context) (fs.Info, error) {
	return fs.Info{}, syserror.ENOSYS
}

// InodeStaticFileGetter implements GetFile for a file with static contents.
//
// +stateify savable
type InodeStaticFileGetter struct {
	// Contents is the byte slice handed to NewFileStaticContentReader each
	// time GetFile is called.
	Contents []byte
}

// GetFile implements fs.InodeOperations.GetFile. It returns a new fs.File
// whose operations are a staticFile reading from i.Contents; the error is
// always nil.
func (i *InodeStaticFileGetter) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
	return fs.NewFile(ctx, dirent, flags, &staticFile{
		FileStaticContentReader: NewFileStaticContentReader(i.Contents),
	}), nil
}

// InodeNotMappable returns a nil memmap.Mappable.
type InodeNotMappable struct{}

// Mappable implements fs.InodeOperations.Mappable. It always returns nil.
func (InodeNotMappable) Mappable(*fs.Inode) memmap.Mappable {
	return nil
}

// InodeNoopWriteOut is a no-op implementation of fs.InodeOperations.WriteOut.
type InodeNoopWriteOut struct{}

// WriteOut implements fs.InodeOperations.WriteOut as a no-op. It always
// returns nil.
func (InodeNoopWriteOut) WriteOut(context.Context, *fs.Inode) error {
	return nil
}

// InodeNotDirectory can be used by Inodes that are not directories. Apart
// from Rename, every method it provides fails with ENOTDIR.
type InodeNotDirectory struct{}

// Lookup implements fs.InodeOperations.Lookup. It always fails with ENOTDIR.
func (InodeNotDirectory) Lookup(context.Context, *fs.Inode, string) (*fs.Dirent, error) {
	return nil, syserror.ENOTDIR
}

// Create implements fs.InodeOperations.Create. It always fails with ENOTDIR.
func (InodeNotDirectory) Create(context.Context, *fs.Inode, string, fs.FileFlags, fs.FilePermissions) (*fs.File, error) {
	return nil, syserror.ENOTDIR
}

// CreateLink implements fs.InodeOperations.CreateLink. It always fails with
// ENOTDIR.
func (InodeNotDirectory) CreateLink(context.Context, *fs.Inode, string, string) error {
	return syserror.ENOTDIR
}

// CreateHardLink implements fs.InodeOperations.CreateHardLink.
func (InodeNotDirectory) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error {
	return syserror.ENOTDIR
}

// CreateDirectory implements fs.InodeOperations.CreateDirectory. It always
// fails with ENOTDIR.
func (InodeNotDirectory) CreateDirectory(context.Context, *fs.Inode, string, fs.FilePermissions) error {
	return syserror.ENOTDIR
}

// Bind implements fs.InodeOperations.Bind. It always fails with ENOTDIR.
func (InodeNotDirectory) Bind(context.Context, *fs.Inode, string, transport.BoundEndpoint, fs.FilePermissions) (*fs.Dirent, error) {
	return nil, syserror.ENOTDIR
}

// CreateFifo implements fs.InodeOperations.CreateFifo. It always fails with
// ENOTDIR.
func (InodeNotDirectory) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error {
	return syserror.ENOTDIR
}

// Remove implements fs.InodeOperations.Remove. It always fails with ENOTDIR.
func (InodeNotDirectory) Remove(context.Context, *fs.Inode, string) error {
	return syserror.ENOTDIR
}

// RemoveDirectory implements fs.InodeOperations.RemoveDirectory. It always
// fails with ENOTDIR.
func (InodeNotDirectory) RemoveDirectory(context.Context, *fs.Inode, string) error {
	return syserror.ENOTDIR
}

// Rename implements fs.InodeOperations.Rename. Unlike the other
// InodeNotDirectory methods, it fails with EINVAL rather than ENOTDIR.
func (InodeNotDirectory) Rename(context.Context, *fs.Inode, *fs.Inode, string, *fs.Inode, string, bool) error {
	return syserror.EINVAL
}

// InodeNotSocket can be used by Inodes that are not sockets.
type InodeNotSocket struct{}

// BoundEndpoint implements fs.InodeOperations.BoundEndpoint. It always
// returns nil.
func (InodeNotSocket) BoundEndpoint(*fs.Inode, string) transport.BoundEndpoint {
	return nil
}

// InodeNotTruncatable can be used by Inodes that cannot be truncated.
type InodeNotTruncatable struct{}

// Truncate implements fs.InodeOperations.Truncate. It always fails with
// EINVAL.
func (InodeNotTruncatable) Truncate(context.Context, *fs.Inode, int64) error {
	return syserror.EINVAL
}

// InodeIsDirTruncate implements fs.InodeOperations.Truncate for directories.
type InodeIsDirTruncate struct{}

// Truncate implements fs.InodeOperations.Truncate.
func (InodeIsDirTruncate) Truncate(context.Context, *fs.Inode, int64) error {
	return syserror.EISDIR
}

// InodeNoopTruncate implements fs.InodeOperations.Truncate as a noop.
type InodeNoopTruncate struct{}

// Truncate implements fs.InodeOperations.Truncate. It does nothing and always
// returns nil.
func (InodeNoopTruncate) Truncate(context.Context, *fs.Inode, int64) error {
	return nil
}

// InodeNotRenameable can be used by Inodes that cannot be renamed.
type InodeNotRenameable struct{}

// Rename implements fs.InodeOperations.Rename. It always fails with EINVAL.
func (InodeNotRenameable) Rename(context.Context, *fs.Inode, *fs.Inode, string, *fs.Inode, string, bool) error {
	return syserror.EINVAL
}

// InodeNotOpenable can be used by Inodes that cannot be opened.
type InodeNotOpenable struct{}

// GetFile implements fs.InodeOperations.GetFile. It always fails with EIO.
func (InodeNotOpenable) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error) {
	return nil, syserror.EIO
}

// InodeNotVirtual can be used by Inodes that are not virtual.
type InodeNotVirtual struct{}

// IsVirtual implements fs.InodeOperations.IsVirtual. It always returns false.
func (InodeNotVirtual) IsVirtual() bool {
	return false
}

// InodeVirtual can be used by Inodes that are virtual.
type InodeVirtual struct{}

// IsVirtual implements fs.InodeOperations.IsVirtual. It always returns true.
func (InodeVirtual) IsVirtual() bool {
	return true
}

// InodeNotSymlink can be used by Inodes that are not symlinks.
type InodeNotSymlink struct{}

// Readlink implements fs.InodeOperations.Readlink. It always fails with
// ENOLINK.
func (InodeNotSymlink) Readlink(context.Context, *fs.Inode) (string, error) {
	return "", syserror.ENOLINK
}

// Getlink implements fs.InodeOperations.Getlink. It always fails with
// ENOLINK.
func (InodeNotSymlink) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) {
	return nil, syserror.ENOLINK
}

// InodeNoExtendedAttributes can be used by Inodes that do not support
// extended attributes.
type InodeNoExtendedAttributes struct{}

// Getxattr implements fs.InodeOperations.Getxattr.
func (InodeNoExtendedAttributes) Getxattr(*fs.Inode, string) (string, error) {
	return "", syserror.EOPNOTSUPP
}

// Setxattr implements fs.InodeOperations.Setxattr. It always fails with
// EOPNOTSUPP.
func (InodeNoExtendedAttributes) Setxattr(*fs.Inode, string, string) error {
	return syserror.EOPNOTSUPP
}

// Listxattr implements fs.InodeOperations.Listxattr. It always fails with
// EOPNOTSUPP.
func (InodeNoExtendedAttributes) Listxattr(*fs.Inode) (map[string]struct{}, error) {
	return nil, syserror.EOPNOTSUPP
}

// InodeNoopRelease implements fs.InodeOperations.Release as a noop.
type InodeNoopRelease struct{}

// Release implements fs.InodeOperations.Release. It does nothing.
func (InodeNoopRelease) Release(context.Context) {}

// InodeGenericChecker implements fs.InodeOperations.Check with a generic
// implementation.
type InodeGenericChecker struct{}

// Check implements fs.InodeOperations.Check by delegating the decision to
// fs.ContextCanAccessFile.
func (InodeGenericChecker) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
	return fs.ContextCanAccessFile(ctx, inode, p)
}

// InodeDenyWriteChecker implements fs.InodeOperations.Check which denies all
// write operations.
type InodeDenyWriteChecker struct{}

// Check implements fs.InodeOperations.Check. Any request that includes write
// permission is refused outright; all other requests are decided by
// fs.ContextCanAccessFile.
func (InodeDenyWriteChecker) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
	if p.Write {
		return false
	}
	return fs.ContextCanAccessFile(ctx, inode, p)
}

// InodeNotAllocatable can be used by Inodes that do not support Allocate().
type InodeNotAllocatable struct{}

// Allocate implements fs.InodeOperations.Allocate. It always fails with
// EOPNOTSUPP.
func (InodeNotAllocatable) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error {
	return syserror.EOPNOTSUPP
}

// InodeNoopAllocate implements fs.InodeOperations.Allocate as a noop.
type InodeNoopAllocate struct{}

// Allocate implements fs.InodeOperations.Allocate. It does nothing and always
// returns nil.
func (InodeNoopAllocate) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error {
	return nil
}

// InodeIsDirAllocate implements fs.InodeOperations.Allocate for directories.
+type InodeIsDirAllocate struct{} + +// Allocate implements fs.InodeOperations.Allocate. +func (InodeIsDirAllocate) Allocate(_ context.Context, _ *fs.Inode, _, _ int64) error { + return syserror.EISDIR +} diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go new file mode 100644 index 000000000..7bee2eb5f --- /dev/null +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -0,0 +1,1004 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 

package fsutil

import (
	"fmt"
	"io"
	"sync"

	"gvisor.googlesource.com/gvisor/pkg/log"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
	"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
	"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// Lock order (compare the lock order model in mm/mm.go):
//
// CachingInodeOperations.attrMu ("fs locks")
//   CachingInodeOperations.mapsMu ("memmap.Mappable locks not taken by Translate")
//     CachingInodeOperations.dataMu ("memmap.Mappable locks taken by Translate")
//       CachedFileObject locks

// CachingInodeOperations caches the metadata and content of a CachedFileObject.
// It implements a subset of InodeOperations. As a utility it can be used to
// implement the full set of InodeOperations. Generally it should not be
// embedded to avoid unexpected inherited behavior.
//
// CachingInodeOperations implements Mappable for the CachedFileObject:
//
// - If CachedFileObject.FD returns a value >= 0 then the file descriptor
//   will be memory mapped on the host.
//
// - Otherwise, the contents of CachedFileObject are buffered into memory
//   managed by the CachingInodeOperations.
//
// Implementations of FileOperations for a CachedFileObject must read and
// write through CachingInodeOperations using Read and Write respectively.
//
// Implementations of InodeOperations.WriteOut must call Sync to write out
// in-memory modifications of data and metadata to the CachedFileObject.
//
// NOTE(review): the method on CachingInodeOperations that writes back dirty
// data and attributes and then calls backingFile.Sync is named WriteOut; the
// "must call Sync" wording above looks stale -- confirm.
//
// +stateify savable
type CachingInodeOperations struct {
	// backingFile is a handle to a cached file object.
	backingFile CachedFileObject

	// mfp is used to allocate memory that caches backingFile's contents.
	mfp pgalloc.MemoryFileProvider

	// forcePageCache indicates the sentry page cache should be used regardless
	// of whether the platform supports host mapped I/O or not. This must not be
	// modified after inode creation.
	forcePageCache bool

	// attrMu protects attr and dirtyAttr (see the field comments below).
	attrMu sync.Mutex `state:"nosave"`

	// attr is unstable cached metadata.
	//
	// attr is protected by attrMu. attr.Size is protected by both attrMu and
	// dataMu; reading it requires locking either mutex, while mutating it
	// requires locking both.
	attr fs.UnstableAttr

	// dirtyAttr is metadata that was updated in-place but hasn't yet
	// been successfully written out.
	//
	// dirtyAttr is protected by attrMu.
	dirtyAttr fs.AttrMask

	// mapsMu protects mappings.
	mapsMu sync.Mutex `state:"nosave"`

	// mappings tracks mappings of the cached file object into
	// memmap.MappingSpaces.
	//
	// mappings is protected by mapsMu.
	mappings memmap.MappingSet

	// dataMu protects cache, dirty and refs, and (together with attrMu)
	// attr.Size.
	dataMu sync.RWMutex `state:"nosave"`

	// cache maps offsets into the cached file to offsets into
	// mfp.MemoryFile() that store the file's data.
	//
	// cache is protected by dataMu.
	cache FileRangeSet

	// dirty tracks dirty segments in cache.
	//
	// dirty is protected by dataMu.
	dirty DirtySet

	// hostFileMapper caches internal mappings of backingFile.FD().
	hostFileMapper *HostFileMapper

	// refs tracks active references to data in the cache.
	//
	// refs is protected by dataMu.
	refs frameRefSet
}

// CachedFileObject is a file that may require caching.
type CachedFileObject interface {
	// ReadToBlocksAt reads up to dsts.NumBytes() bytes from the file to dsts,
	// starting at offset, and returns the number of bytes read. ReadToBlocksAt
	// may return a partial read without an error.
	ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)

	// WriteFromBlocksAt writes up to srcs.NumBytes() bytes from srcs to the
	// file, starting at offset, and returns the number of bytes written.
	// WriteFromBlocksAt may return a partial write without an error.
	WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)

	// SetMaskedAttributes sets the attributes in attr that are true in mask
	// on the backing file.
	//
	// SetMaskedAttributes may be called at any point, regardless of whether
	// the file was opened.
	SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error

	// Allocate allows the caller to reserve disk space for the inode.
	// It's equivalent to fallocate(2) with 'mode=0'.
	Allocate(ctx context.Context, offset int64, length int64) error

	// Sync instructs the remote filesystem to sync the file to stable storage.
	Sync(ctx context.Context) error

	// FD returns a host file descriptor. If it is possible for
	// CachingInodeOperations.AddMapping to have ever been called with writable
	// = true, the FD must have been opened O_RDWR; otherwise, it may have been
	// opened O_RDONLY or O_RDWR. (mmap unconditionally requires that mapped
	// files are readable.) If no host file descriptor is available, FD returns
	// a negative number.
	//
	// For any given CachedFileObject, if FD() ever succeeds (returns a
	// non-negative number), it must always succeed.
	//
	// FD is called iff the file has been memory mapped. This implies that
	// the file was opened (see fs.InodeOperations.GetFile).
	FD() int
}

// NewCachingInodeOperations returns a new CachingInodeOperations backed by
// a CachedFileObject and its initial unstable attributes.
func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, forcePageCache bool) *CachingInodeOperations {
	// The memory file provider is taken from ctx; construction cannot
	// proceed without one, so its absence is treated as a programming error.
	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
	if mfp == nil {
		panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider))
	}
	return &CachingInodeOperations{
		backingFile:    backingFile,
		mfp:            mfp,
		forcePageCache: forcePageCache,
		attr:           uattr,
		hostFileMapper: NewHostFileMapper(),
	}
}

// Release implements fs.InodeOperations.Release.
//
// It writes all dirty cached data back to the backing file and then drops the
// cache and the dirty set. It panics if any mappings still exist or if the
// writeback fails.
//
// NOTE(review): this method takes no context, whereas
// InodeNoopRelease.Release in inode.go takes a context.Context; confirm how
// callers are expected to invoke it.
func (c *CachingInodeOperations) Release() {
	// mapsMu is acquired before dataMu, following the documented lock order.
	c.mapsMu.Lock()
	defer c.mapsMu.Unlock()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()

	// Something has gone terribly wrong if we're releasing an inode that is
	// still memory-mapped.
	if !c.mappings.IsEmpty() {
		panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s", &c.mappings))
	}

	// Drop any cached pages that are still awaiting MemoryFile eviction. (This
	// means that MemoryFile no longer needs to evict them.)
	mf := c.mfp.MemoryFile()
	mf.MarkAllUnevictable(c)
	// No caller context is available here, so writeback runs with a
	// background context. Reading c.attr.Size is permitted because dataMu is
	// held.
	if err := SyncDirtyAll(context.Background(), &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
		panic(fmt.Sprintf("Failed to writeback cached data: %v", err))
	}
	c.cache.DropAll(mf)
	c.dirty.RemoveAll()
}

// UnstableAttr implements fs.InodeOperations.UnstableAttr. It returns a copy
// of the cached attributes taken under attrMu; the error is always nil.
func (c *CachingInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
	c.attrMu.Lock()
	attr := c.attr
	c.attrMu.Unlock()
	return attr, nil
}

// SetPermissions implements fs.InodeOperations.SetPermissions.
func (c *CachingInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, perms fs.FilePermissions) bool {
	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	// The timestamp is sampled before calling into the backing file.
	now := ktime.NowFromContext(ctx)
	masked := fs.AttrMask{Perms: true}
	// The cached copy is only updated once the backing file has accepted the
	// change; a failure is reported as false.
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Perms: perms}); err != nil {
		return false
	}
	c.attr.Perms = perms
	c.touchStatusChangeTimeLocked(now)
	return true
}

// SetOwner implements fs.InodeOperations.SetOwner.
//
// Only the IDs for which Ok() reports true are sent to the backing file and
// updated in the cache. If neither is Ok, SetOwner returns nil without taking
// any lock.
func (c *CachingInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error {
	if !owner.UID.Ok() && !owner.GID.Ok() {
		return nil
	}

	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	now := ktime.NowFromContext(ctx)
	masked := fs.AttrMask{
		UID: owner.UID.Ok(),
		GID: owner.GID.Ok(),
	}
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Owner: owner}); err != nil {
		return err
	}
	if owner.UID.Ok() {
		c.attr.Owner.UID = owner.UID
	}
	if owner.GID.Ok() {
		c.attr.Owner.GID = owner.GID
	}
	c.touchStatusChangeTimeLocked(now)
	return nil
}

// SetTimestamps implements fs.InodeOperations.SetTimestamps.
//
// If both timestamps are omitted, SetTimestamps returns nil without taking
// any lock. Otherwise the non-omitted timestamps are written to the backing
// file and, on success, mirrored into the cache.
func (c *CachingInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error {
	if ts.ATimeOmit && ts.MTimeOmit {
		return nil
	}

	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	// Replace requests to use the "system time" with the current time to
	// ensure that cached timestamps remain consistent with the remote
	// filesystem.
	now := ktime.NowFromContext(ctx)
	if ts.ATimeSetSystemTime {
		ts.ATime = now
	}
	if ts.MTimeSetSystemTime {
		ts.MTime = now
	}
	masked := fs.AttrMask{
		AccessTime:       !ts.ATimeOmit,
		ModificationTime: !ts.MTimeOmit,
	}
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{AccessTime: ts.ATime, ModificationTime: ts.MTime}); err != nil {
		return err
	}
	if !ts.ATimeOmit {
		c.attr.AccessTime = ts.ATime
	}
	if !ts.MTimeOmit {
		c.attr.ModificationTime = ts.MTime
	}
	c.touchStatusChangeTimeLocked(now)
	return nil
}

// Truncate implements fs.InodeOperations.Truncate.
//
// The new size is first applied to the backing file and then to the cached
// attributes. When the file shrinks, translations of the truncated pages are
// invalidated and the corresponding cached data is dropped without being
// written back.
func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error {
	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	// c.attr.Size is protected by both c.attrMu and c.dataMu.
	c.dataMu.Lock()
	now := ktime.NowFromContext(ctx)
	masked := fs.AttrMask{Size: true}
	attr := fs.UnstableAttr{Size: size}
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, attr); err != nil {
		c.dataMu.Unlock()
		return err
	}
	oldSize := c.attr.Size
	c.attr.Size = size
	c.touchModificationAndStatusChangeTimeLocked(now)

	// We drop c.dataMu here so that we can lock c.mapsMu and invalidate
	// mappings below. This allows concurrent calls to Read/Translate/etc.
	// These functions synchronize with an in-progress Truncate by refusing to
	// use cache contents beyond the new c.attr.Size. (We are still holding
	// c.attrMu, so we can't race with Truncate/Write.)
	c.dataMu.Unlock()

	// Nothing left to do unless shrinking the file.
	if size >= oldSize {
		return nil
	}

	oldpgend := fs.OffsetPageEnd(oldSize)
	newpgend := fs.OffsetPageEnd(size)

	// Invalidate past translations of truncated pages.
	if newpgend != oldpgend {
		c.mapsMu.Lock()
		c.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
			// Compare Linux's mm/truncate.c:truncate_setsize() =>
			// truncate_pagecache() =>
			// mm/memory.c:unmap_mapping_range(evencows=1).
			InvalidatePrivate: true,
		})
		c.mapsMu.Unlock()
	}

	// We are now guaranteed that there are no translations of truncated pages,
	// and can remove them from the cache. Since truncated pages have been
	// removed from the backing file, they should be dropped without being
	// written back.
	c.dataMu.Lock()
	defer c.dataMu.Unlock()
	c.cache.Truncate(uint64(size), c.mfp.MemoryFile())
	c.dirty.KeepClean(memmap.MappableRange{uint64(size), oldpgend})

	return nil
}

// Allocate implements fs.InodeOperations.Allocate.
//
// It is a no-op when offset+length does not exceed the cached size. Otherwise
// the request is forwarded to the backing file and, on success, the cached
// size grows to offset+length.
func (c *CachingInodeOperations) Allocate(ctx context.Context, offset, length int64) error {
	newSize := offset + length

	// c.attr.Size is protected by both c.attrMu and c.dataMu.
	c.attrMu.Lock()
	defer c.attrMu.Unlock()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()

	if newSize <= c.attr.Size {
		return nil
	}

	now := ktime.NowFromContext(ctx)
	if err := c.backingFile.Allocate(ctx, offset, length); err != nil {
		return err
	}

	c.attr.Size = newSize
	c.touchModificationAndStatusChangeTimeLocked(now)
	return nil
}

// WriteOut implements fs.InodeOperations.WriteOut.
//
// It writes dirty cached pages back to the backing file, then the dirty
// cached attributes, and finally asks the backing file to sync. attrMu is
// released before the final Sync call.
func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
	c.attrMu.Lock()

	// Write dirty pages back.
	c.dataMu.Lock()
	err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.mfp.MemoryFile(), c.backingFile.WriteFromBlocksAt)
	c.dataMu.Unlock()
	if err != nil {
		c.attrMu.Unlock()
		return err
	}

	// SyncDirtyAll above would have grown the backing file if needed. On
	// shrinks, the backing file is called directly, so size never needs to be
	// written out here.
	c.dirtyAttr.Size = false

	// Write out cached attributes.
	if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr); err != nil {
		c.attrMu.Unlock()
		return err
	}
	// Everything was written out, so no attribute is dirty any more.
	c.dirtyAttr = fs.AttrMask{}

	c.attrMu.Unlock()

	// Fsync the remote file.
	return c.backingFile.Sync(ctx)
}

// IncLinks increases the link count and updates the cached modification and
// status change times.
func (c *CachingInodeOperations) IncLinks(ctx context.Context) {
	c.attrMu.Lock()
	c.attr.Links++
	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// DecLinks decreases the link count and updates the cached modification and
// status change times.
func (c *CachingInodeOperations) DecLinks(ctx context.Context) {
	c.attrMu.Lock()
	c.attr.Links--
	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// TouchAccessTime updates the cached access time in-place to the
// current time. It does not update status change time in-place. See
// mm/filemap.c:do_generic_file_read -> include/linux/fs.h:file_accessed.
//
// It does nothing when the inode's mount source has the NoAtime flag set.
func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs.Inode) {
	if inode.MountSource.Flags.NoAtime {
		return
	}

	c.attrMu.Lock()
	c.touchAccessTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// touchAccessTimeLocked updates the cached access time in-place to now and
// marks it dirty.
//
// Preconditions: c.attrMu is locked for writing.
func (c *CachingInodeOperations) touchAccessTimeLocked(now time.Time) {
	c.attr.AccessTime = now
	c.dirtyAttr.AccessTime = true
}

// TouchModificationAndStatusChangeTime updates the cached modification and
// status change times in-place to the current time.
+func (c *CachingInodeOperations) TouchModificationAndStatusChangeTime(ctx context.Context) { + c.attrMu.Lock() + c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx)) + c.attrMu.Unlock() +} + +// touchModificationAndStatusChangeTimeLocked updates the cached modification +// and status change times in-place to the current time. +// +// Preconditions: c.attrMu is locked for writing. +func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now time.Time) { + c.attr.ModificationTime = now + c.dirtyAttr.ModificationTime = true + c.attr.StatusChangeTime = now + c.dirtyAttr.StatusChangeTime = true +} + +// TouchStatusChangeTime updates the cached status change time in-place to the +// current time. +func (c *CachingInodeOperations) TouchStatusChangeTime(ctx context.Context) { + c.attrMu.Lock() + c.touchStatusChangeTimeLocked(ktime.NowFromContext(ctx)) + c.attrMu.Unlock() +} + +// touchStatusChangeTimeLocked updates the cached status change time +// in-place to the current time. +// +// Preconditions: c.attrMu is locked for writing. +func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now time.Time) { + c.attr.StatusChangeTime = now + c.dirtyAttr.StatusChangeTime = true +} + +// UpdateUnstable updates the cached unstable attributes. Only non-dirty +// attributes are updated. +func (c *CachingInodeOperations) UpdateUnstable(attr fs.UnstableAttr) { + // All attributes are protected by attrMu. 
+ c.attrMu.Lock() + + if !c.dirtyAttr.Usage { + c.attr.Usage = attr.Usage + } + if !c.dirtyAttr.Perms { + c.attr.Perms = attr.Perms + } + if !c.dirtyAttr.UID { + c.attr.Owner.UID = attr.Owner.UID + } + if !c.dirtyAttr.GID { + c.attr.Owner.GID = attr.Owner.GID + } + if !c.dirtyAttr.AccessTime { + c.attr.AccessTime = attr.AccessTime + } + if !c.dirtyAttr.ModificationTime { + c.attr.ModificationTime = attr.ModificationTime + } + if !c.dirtyAttr.StatusChangeTime { + c.attr.StatusChangeTime = attr.StatusChangeTime + } + if !c.dirtyAttr.Links { + c.attr.Links = attr.Links + } + + // Size requires holding attrMu and dataMu. + c.dataMu.Lock() + if !c.dirtyAttr.Size { + c.attr.Size = attr.Size + } + c.dataMu.Unlock() + + c.attrMu.Unlock() +} + +// Read reads from frames and otherwise directly from the backing file +// into dst starting at offset until dst is full, EOF is reached, or an +// error is encountered. +// +// Read may partially fill dst and return a nil error. +func (c *CachingInodeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if dst.NumBytes() == 0 { + return 0, nil + } + + // Have we reached EOF? We check for this again in + // inodeReadWriter.ReadToBlocks to avoid holding c.attrMu (which would + // serialize reads) or c.dataMu (which would violate lock ordering), but + // check here first (before calling into MM) since reading at EOF is + // common: getting a return value of 0 from a read syscall is the only way + // to detect EOF. + // + // TODO(jamieliu): Separate out c.attr.Size and use atomics instead of + // c.dataMu. + c.dataMu.RLock() + size := c.attr.Size + c.dataMu.RUnlock() + if offset >= size { + return 0, io.EOF + } + + n, err := dst.CopyOutFrom(ctx, &inodeReadWriter{ctx, c, offset}) + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). 
+ c.TouchAccessTime(ctx, file.Dirent.Inode) + return n, err +} + +// Write writes to frames and otherwise directly to the backing file +// from src starting at offset and until src is empty or an error is +// encountered. +// +// If Write partially fills src, a non-nil error is returned. +func (c *CachingInodeOperations) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + // Hot path. Avoid defers. + if src.NumBytes() == 0 { + return 0, nil + } + + c.attrMu.Lock() + // Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time(). + c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx)) + n, err := src.CopyInTo(ctx, &inodeReadWriter{ctx, c, offset}) + c.attrMu.Unlock() + return n, err +} + +type inodeReadWriter struct { + ctx context.Context + c *CachingInodeOperations + offset int64 +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + // Hot path. Avoid defers. + rw.c.dataMu.RLock() + + // Compute the range to read. + if rw.offset >= rw.c.attr.Size { + rw.c.dataMu.RUnlock() + return 0, io.EOF + } + end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.c.attr.Size) + if end == rw.offset { // dsts.NumBytes() == 0? + rw.c.dataMu.RUnlock() + return 0, nil + } + + mem := rw.c.mfp.MemoryFile() + var done uint64 + seg, gap := rw.c.cache.Find(uint64(rw.offset)) + for rw.offset < end { + mr := memmap.MappableRange{uint64(rw.offset), uint64(end)} + switch { + case seg.Ok(): + // Get internal mappings from the cache. + ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) + if err != nil { + rw.c.dataMu.RUnlock() + return done, err + } + + // Copy from internal mappings. + n, err := safemem.CopySeq(dsts, ims) + done += n + rw.offset += int64(n) + dsts = dsts.DropFirst64(n) + if err != nil { + rw.c.dataMu.RUnlock() + return done, err + } + + // Continue. 
+ seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Read directly from the backing file. + gapmr := gap.Range().Intersect(mr) + dst := dsts.TakeFirst64(gapmr.Length()) + n, err := rw.c.backingFile.ReadToBlocksAt(rw.ctx, dst, gapmr.Start) + done += n + rw.offset += int64(n) + dsts = dsts.DropFirst64(n) + // Partial reads are fine. But we must stop reading. + if n != dst.NumBytes() || err != nil { + rw.c.dataMu.RUnlock() + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), FileRangeGapIterator{} + + default: + break + } + } + rw.c.dataMu.RUnlock() + return done, nil +} + +// maybeGrowFile grows the file's size if data has been written past the old +// size. +// +// Preconditions: rw.c.attrMu and rw.c.dataMu must be locked. +func (rw *inodeReadWriter) maybeGrowFile() { + // If the write ends beyond the file's previous size, it causes the + // file to grow. + if rw.offset > rw.c.attr.Size { + rw.c.attr.Size = rw.offset + rw.c.dirtyAttr.Size = true + } + if rw.offset > rw.c.attr.Usage { + // This is incorrect if CachingInodeOperations is caching a sparse + // file. (In Linux, keeping inode::i_blocks up to date is the + // filesystem's responsibility.) + rw.c.attr.Usage = rw.offset + rw.c.dirtyAttr.Usage = true + } +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +// +// Preconditions: rw.c.attrMu must be locked. +func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + // Hot path. Avoid defers. + rw.c.dataMu.Lock() + + // Compute the range to write. + end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes())) + if end == rw.offset { // srcs.NumBytes() == 0? + rw.c.dataMu.Unlock() + return 0, nil + } + + mf := rw.c.mfp.MemoryFile() + var done uint64 + seg, gap := rw.c.cache.Find(uint64(rw.offset)) + for rw.offset < end { + mr := memmap.MappableRange{uint64(rw.offset), uint64(end)} + switch { + case seg.Ok() && seg.Start() < mr.End: + // Get internal mappings from the cache. 
+ segMR := seg.Range().Intersect(mr) + ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write) + if err != nil { + rw.maybeGrowFile() + rw.c.dataMu.Unlock() + return done, err + } + + // Copy to internal mappings. + n, err := safemem.CopySeq(ims, srcs) + done += n + rw.offset += int64(n) + srcs = srcs.DropFirst64(n) + rw.c.dirty.MarkDirty(segMR) + if err != nil { + rw.maybeGrowFile() + rw.c.dataMu.Unlock() + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok() && gap.Start() < mr.End: + // Write directly to the backing file. + gapmr := gap.Range().Intersect(mr) + src := srcs.TakeFirst64(gapmr.Length()) + n, err := rw.c.backingFile.WriteFromBlocksAt(rw.ctx, src, gapmr.Start) + done += n + rw.offset += int64(n) + srcs = srcs.DropFirst64(n) + // Partial writes are fine. But we must stop writing. + if n != src.NumBytes() || err != nil { + rw.maybeGrowFile() + rw.c.dataMu.Unlock() + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), FileRangeGapIterator{} + + default: + break + } + } + rw.maybeGrowFile() + rw.c.dataMu.Unlock() + return done, nil +} + +// useHostPageCache returns true if c uses c.backingFile.FD() for all file I/O +// and memory mappings, and false if c.cache may contain data cached from +// c.backingFile. +func (c *CachingInodeOperations) useHostPageCache() bool { + return !c.forcePageCache && c.backingFile.FD() >= 0 +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + // Hot path. Avoid defers. + c.mapsMu.Lock() + mapped := c.mappings.AddMapping(ms, ar, offset, writable) + // Do this unconditionally since whether we have c.backingFile.FD() >= 0 + // can change across save/restore. 
+ for _, r := range mapped { + c.hostFileMapper.IncRefOn(r) + } + if !c.useHostPageCache() { + // c.Evict() will refuse to evict memory-mapped pages, so tell the + // MemoryFile to not bother trying. + mf := c.mfp.MemoryFile() + for _, r := range mapped { + mf.MarkUnevictable(c, pgalloc.EvictableRange{r.Start, r.End}) + } + } + if c.useHostPageCache() && !usage.IncrementalMappedAccounting { + for _, r := range mapped { + usage.MemoryAccounting.Inc(r.Length(), usage.Mapped) + } + } + c.mapsMu.Unlock() + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + // Hot path. Avoid defers. + c.mapsMu.Lock() + unmapped := c.mappings.RemoveMapping(ms, ar, offset, writable) + for _, r := range unmapped { + c.hostFileMapper.DecRefOn(r) + } + if c.useHostPageCache() { + if !usage.IncrementalMappedAccounting { + for _, r := range unmapped { + usage.MemoryAccounting.Dec(r.Length(), usage.Mapped) + } + } + c.mapsMu.Unlock() + return + } + + // Pages that are no longer referenced by any application memory mappings + // are now considered unused; allow MemoryFile to evict them when + // necessary. + mf := c.mfp.MemoryFile() + c.dataMu.Lock() + for _, r := range unmapped { + // Since these pages are no longer mapped, they are no longer + // concurrently dirtyable by a writable memory mapping. + c.dirty.AllowClean(r) + mf.MarkEvictable(c, pgalloc.EvictableRange{r.Start, r.End}) + } + c.dataMu.Unlock() + c.mapsMu.Unlock() +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return c.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. 
+func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + // Hot path. Avoid defer. + if c.useHostPageCache() { + return []memmap.Translation{ + { + Source: optional, + File: c, + Offset: optional.Start, + Perms: usermem.AnyAccess, + }, + }, nil + } + + c.dataMu.Lock() + + // Constrain translations to c.attr.Size (rounded up) to prevent + // translation to pages that may be concurrently truncated. + pgend := fs.OffsetPageEnd(c.attr.Size) + var beyondEOF bool + if required.End > pgend { + if required.Start >= pgend { + c.dataMu.Unlock() + return nil, &memmap.BusError{io.EOF} + } + beyondEOF = true + required.End = pgend + } + if optional.End > pgend { + optional.End = pgend + } + + mf := c.mfp.MemoryFile() + cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, c.backingFile.ReadToBlocksAt) + + var ts []memmap.Translation + var translatedEnd uint64 + for seg := c.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { + segMR := seg.Range().Intersect(optional) + // TODO(jamieliu): Make Translations writable even if writability is + // not required if already kept-dirty by another writable translation. + perms := usermem.AccessType{ + Read: true, + Execute: true, + } + if at.Write { + // From this point forward, this memory can be dirtied through the + // mapping at any time. + c.dirty.KeepDirty(segMR) + perms.Write = true + } + ts = append(ts, memmap.Translation{ + Source: segMR, + File: mf, + Offset: seg.FileRangeOf(segMR).Start, + Perms: perms, + }) + translatedEnd = segMR.End + } + + c.dataMu.Unlock() + + // Don't return the error returned by c.cache.Fill if it occurred outside + // of required. 
+ if translatedEnd < required.End && cerr != nil { + return ts, &memmap.BusError{cerr} + } + if beyondEOF { + return ts, &memmap.BusError{io.EOF} + } + return ts, nil +} + +func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange { + const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily + if required.Length() >= maxReadahead { + return required + } + if optional.Length() <= maxReadahead { + return optional + } + optional.Start = required.Start + if optional.Length() <= maxReadahead { + return optional + } + optional.End = optional.Start + maxReadahead + return optional +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error { + // Whether we have a host fd (and consequently what platform.File is + // mapped) can change across save/restore, so invalidate all translations + // unconditionally. + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + c.mappings.InvalidateAll(memmap.InvalidateOpts{}) + + // Sync the cache's contents so that if we have a host fd after restore, + // the remote file's contents are coherent. + mf := c.mfp.MemoryFile() + c.dataMu.Lock() + defer c.dataMu.Unlock() + if err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { + return err + } + + // Discard the cache so that it's not stored in saved state. This is safe + // because per InvalidateUnsavable invariants, no new translations can have + // been returned after we invalidated all existing translations above. + c.cache.DropAll(mf) + c.dirty.RemoveAll() + + return nil +} + +// Evict implements pgalloc.EvictableMemoryUser.Evict. 
+func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.EvictableRange) { + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + c.dataMu.Lock() + defer c.dataMu.Unlock() + + mr := memmap.MappableRange{er.Start, er.End} + mf := c.mfp.MemoryFile() + // Only allow pages that are no longer memory-mapped to be evicted. + for mgap := c.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { + mgapMR := mgap.Range().Intersect(mr) + if mgapMR.Length() == 0 { + continue + } + if err := SyncDirty(ctx, mgapMR, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil { + log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) + } + c.cache.Drop(mgapMR, mf) + c.dirty.KeepClean(mgapMR) + } +} + +// IncRef implements platform.File.IncRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the platform.File +// during translation. +func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { + // Hot path. Avoid defers. + c.dataMu.Lock() + seg, gap := c.refs.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = c.refs.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + newRange := gap.Range().Intersect(fr) + if usage.IncrementalMappedAccounting { + usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) + } + seg, gap = c.refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() + default: + c.refs.MergeAdjacent(fr) + c.dataMu.Unlock() + return + } + } +} + +// DecRef implements platform.File.DecRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the platform.File +// during translation. +func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { + // Hot path. Avoid defers. 
+ c.dataMu.Lock() + seg := c.refs.FindSegment(fr.Start) + + for seg.Ok() && seg.Start() < fr.End { + seg = c.refs.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + if usage.IncrementalMappedAccounting { + usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) + } + seg = c.refs.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + c.refs.MergeAdjacent(fr) + c.dataMu.Unlock() + +} + +// MapInternal implements platform.File.MapInternal. This is used when we +// directly map an underlying host fd and CachingInodeOperations is used as the +// platform.File during translation. +func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write) +} + +// FD implements platform.File.FD. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the platform.File +// during translation. +func (c *CachingInodeOperations) FD() int { + return c.backingFile.FD() +} |