Diffstat (limited to 'pkg/sentry/mm')
-rw-r--r-- | pkg/sentry/mm/address_space.go | 216
-rw-r--r-- | pkg/sentry/mm/aio_context.go | 387
-rw-r--r-- | pkg/sentry/mm/aio_context_state.go | 20
-rw-r--r-- | pkg/sentry/mm/debug.go | 98
-rwxr-xr-x | pkg/sentry/mm/file_refcount_set.go | 1274
-rw-r--r-- | pkg/sentry/mm/io.go | 639
-rwxr-xr-x | pkg/sentry/mm/io_list.go | 173
-rw-r--r-- | pkg/sentry/mm/lifecycle.go | 234
-rw-r--r-- | pkg/sentry/mm/metadata.go | 139
-rw-r--r-- | pkg/sentry/mm/mm.go | 456
-rwxr-xr-x | pkg/sentry/mm/mm_state_autogen.go | 380
-rw-r--r-- | pkg/sentry/mm/pma.go | 1036
-rwxr-xr-x | pkg/sentry/mm/pma_set.go | 1274
-rw-r--r-- | pkg/sentry/mm/procfs.go | 289
-rw-r--r-- | pkg/sentry/mm/save_restore.go | 57
-rw-r--r-- | pkg/sentry/mm/shm.go | 66
-rw-r--r-- | pkg/sentry/mm/special_mappable.go | 155
-rw-r--r-- | pkg/sentry/mm/syscalls.go | 1197
-rw-r--r-- | pkg/sentry/mm/vma.go | 564
-rwxr-xr-x | pkg/sentry/mm/vma_set.go | 1274
20 files changed, 9928 insertions, 0 deletions
diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go new file mode 100644 index 000000000..06f587fde --- /dev/null +++ b/pkg/sentry/mm/address_space.go @@ -0,0 +1,216 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/atomicbitops" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// AddressSpace returns the platform.AddressSpace bound to mm. +// +// Preconditions: The caller must have called mm.Activate(). +func (mm *MemoryManager) AddressSpace() platform.AddressSpace { + if atomic.LoadInt32(&mm.active) == 0 { + panic("trying to use inactive address space?") + } + return mm.as +} + +// Activate ensures this MemoryManager has a platform.AddressSpace. +// +// The caller must not hold any locks when calling Activate. +// +// When this MemoryManager is no longer needed by a task, it should call +// Deactivate to release the reference. +func (mm *MemoryManager) Activate() error { + // Fast path: the MemoryManager already has an active + // platform.AddressSpace, and we just need to indicate that we need it too. + if atomicbitops.IncUnlessZeroInt32(&mm.active) { + return nil + } + + for { + // Slow path: may need to synchronize with other goroutines changing + // mm.active to or from zero. + mm.activeMu.Lock() + // Inline Unlock instead of using a defer for performance since this + // method is commonly in the hot-path. + + // Check if we raced with another goroutine performing activation. + if atomic.LoadInt32(&mm.active) > 0 { + // This can't race; Deactivate can't decrease mm.active from 1 to 0 + // without holding activeMu. + atomic.AddInt32(&mm.active, 1) + mm.activeMu.Unlock() + return nil + } + + // Do we have a context? If so, then we never unmapped it. This can + // only be the case if !mm.p.CooperativelySchedulesAddressSpace(). + if mm.as != nil { + atomic.StoreInt32(&mm.active, 1) + mm.activeMu.Unlock() + return nil + } + + // Get a new address space. We must force unmapping by passing nil to + // NewAddressSpace if requested. (As in the nil interface object, not a + // typed nil.) + mappingsID := (interface{})(mm) + if mm.unmapAllOnActivate { + mappingsID = nil + } + as, c, err := mm.p.NewAddressSpace(mappingsID) + if err != nil { + mm.activeMu.Unlock() + return err + } + if as == nil { + // AddressSpace is unavailable, we must wait. + // + // activeMu must not be held while waiting, as the user + // of the address space we are waiting on may attempt + // to take activeMu. + // + // Don't call UninterruptibleSleepStart to register the + // wait to allow the watchdog stuck task to trigger in + // case a process is starved waiting for the address + // space. + mm.activeMu.Unlock() + <-c + continue + } + + // Okay, we could restore all mappings at this point. + // But forget that. Let's just let them fault in. + mm.as = as + + // Unmapping is done, if necessary. 
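The Activate fast path above hinges on atomicbitops.IncUnlessZeroInt32: mm.active is incremented only while it is already non-zero, so activeMu is needed only for the 0 <-> 1 transitions. A minimal standalone sketch of the same pattern, substituting a portable sync/atomic CAS loop for gVisor's optimized assembly helper (incUnlessZero, refCounted, and acquire are hypothetical names used only for illustration):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// incUnlessZero increments *v and returns true, unless *v is zero, in
// which case it returns false. This is a CAS-loop stand-in for
// atomicbitops.IncUnlessZeroInt32.
func incUnlessZero(v *int32) bool {
	for {
		cur := atomic.LoadInt32(v)
		if cur == 0 {
			return false
		}
		if atomic.CompareAndSwapInt32(v, cur, cur+1) {
			return true
		}
	}
}

// refCounted mirrors the Activate/Deactivate shape: a lock-free fast
// path, and a mutex-guarded slow path for the 0 <-> 1 transition.
type refCounted struct {
	mu     sync.Mutex
	active int32
}

func (r *refCounted) acquire() {
	if incUnlessZero(&r.active) {
		return // fast path: already active
	}
	r.mu.Lock()
	defer r.mu.Unlock()
	if atomic.LoadInt32(&r.active) > 0 {
		// Raced with another activator; safe because the count cannot
		// drop from 1 to 0 without holding mu.
		atomic.AddInt32(&r.active, 1)
		return
	}
	// Expensive one-time setup (NewAddressSpace in Activate) goes here.
	atomic.StoreInt32(&r.active, 1)
}

func main() {
	var r refCounted
	r.acquire()
	r.acquire()
	fmt.Println(atomic.LoadInt32(&r.active)) // 2
}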
+ mm.unmapAllOnActivate = false + + // Now that m.as has been assigned, we can set m.active to a non-zero value + // to enable the fast path. + atomic.StoreInt32(&mm.active, 1) + + mm.activeMu.Unlock() + return nil + } +} + +// Deactivate releases a reference to the MemoryManager. +func (mm *MemoryManager) Deactivate() { + // Fast path: this is not the last goroutine to deactivate the + // MemoryManager. + if atomicbitops.DecUnlessOneInt32(&mm.active) { + return + } + + mm.activeMu.Lock() + // Same as Activate. + + // Still active? + if atomic.AddInt32(&mm.active, -1) > 0 { + mm.activeMu.Unlock() + return + } + + // Can we hold on to the address space? + if !mm.p.CooperativelySchedulesAddressSpace() { + mm.activeMu.Unlock() + return + } + + // Release the address space. + mm.as.Release() + + // Lost it. + mm.as = nil + mm.activeMu.Unlock() +} + +// mapASLocked maps addresses in ar into mm.as. If precommit is true, mappings +// for all addresses in ar should be precommitted. +// +// Preconditions: mm.activeMu must be locked. mm.as != nil. ar.Length() != 0. +// ar must be page-aligned. pseg == mm.pmas.LowerBoundSegment(ar.Start). +func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar usermem.AddrRange, precommit bool) error { + // By default, map entire pmas at a time, under the assumption that there + // is no cost to mapping more of a pma than necessary. + mapAR := usermem.AddrRange{0, ^usermem.Addr(usermem.PageSize - 1)} + if precommit { + // When explicitly precommitting, only map ar, since overmapping may + // incur unexpected resource usage. + mapAR = ar + } else if mapUnit := mm.p.MapUnit(); mapUnit != 0 { + // Limit the range we map to ar, aligned to mapUnit. + mapMask := usermem.Addr(mapUnit - 1) + mapAR.Start = ar.Start &^ mapMask + // If rounding ar.End up overflows, just keep the existing mapAR.End. + if end := (ar.End + mapMask) &^ mapMask; end >= ar.End { + mapAR.End = end + } + } + if checkInvariants { + if !mapAR.IsSupersetOf(ar) { + panic(fmt.Sprintf("mapAR %#v is not a superset of ar %#v", mapAR, ar)) + } + } + + // Since this checks ar.End and not mapAR.End, we will never map a pma that + // is not required. + for pseg.Ok() && pseg.Start() < ar.End { + pma := pseg.ValuePtr() + pmaAR := pseg.Range() + pmaMapAR := pmaAR.Intersect(mapAR) + perms := pma.effectivePerms + if pma.needCOW { + perms.Write = false + } + if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { + return err + } + pseg = pseg.NextSegment() + } + return nil +} + +// unmapASLocked removes all AddressSpace mappings for addresses in ar. +// +// Preconditions: mm.activeMu must be locked. +func (mm *MemoryManager) unmapASLocked(ar usermem.AddrRange) { + if mm.as == nil { + // No AddressSpace? Force all mappings to be unmapped on the next + // Activate. + mm.unmapAllOnActivate = true + return + } + + // unmapASLocked doesn't require vmas or pmas to exist for ar, so it can be + // passed ranges that include addresses that can't be mapped by the + // application. + ar = ar.Intersect(mm.applicationAddrRange()) + + // Note that this AddressSpace may or may not be active. If the + // platform does not require cooperative sharing of AddressSpaces, they + // are retained between Deactivate/Activate calls. Despite not being + // active, it is still valid to perform operations on these address + // spaces. 
+ mm.as.Unmap(ar.Start, uint64(ar.Length())) +} diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go new file mode 100644 index 000000000..5c61acf36 --- /dev/null +++ b/pkg/sentry/mm/aio_context.go @@ -0,0 +1,387 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// aioManager creates and manages asynchronous I/O contexts. +// +// +stateify savable +type aioManager struct { + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // aioContexts is the set of asynchronous I/O contexts. + contexts map[uint64]*AIOContext +} + +func (a *aioManager) destroy() { + a.mu.Lock() + defer a.mu.Unlock() + + for _, ctx := range a.contexts { + ctx.destroy() + } +} + +// newAIOContext creates a new context for asynchronous I/O. +// +// Returns false if 'id' is currently in use. +func (a *aioManager) newAIOContext(events uint32, id uint64) bool { + a.mu.Lock() + defer a.mu.Unlock() + + if _, ok := a.contexts[id]; ok { + return false + } + + a.contexts[id] = &AIOContext{ + done: make(chan struct{}, 1), + maxOutstanding: events, + } + return true +} + +// destroyAIOContext destroys an asynchronous I/O context. +// +// False is returned if the context does not exist. +func (a *aioManager) destroyAIOContext(id uint64) bool { + a.mu.Lock() + defer a.mu.Unlock() + ctx, ok := a.contexts[id] + if !ok { + return false + } + delete(a.contexts, id) + ctx.destroy() + return true +} + +// lookupAIOContext looks up the given context. +// +// Returns false if context does not exist. +func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) { + a.mu.Lock() + defer a.mu.Unlock() + ctx, ok := a.contexts[id] + return ctx, ok +} + +// ioResult is a completed I/O operation. +// +// +stateify savable +type ioResult struct { + data interface{} + ioEntry +} + +// AIOContext is a single asynchronous I/O context. +// +// +stateify savable +type AIOContext struct { + // done is the notification channel used for all requests. + done chan struct{} `state:"nosave"` + + // mu protects below. + mu sync.Mutex `state:"nosave"` + + // results is the set of completed requests. + results ioList + + // maxOutstanding is the maximum number of outstanding entries; this value + // is immutable. + maxOutstanding uint32 + + // outstanding is the number of requests outstanding; this will effectively + // be the number of entries in the result list or that are expected to be + // added to the result list. 
+ outstanding uint32 + + // dead is set when the context is destroyed. + dead bool `state:"zerovalue"` +} + +// destroy marks the context dead. +func (ctx *AIOContext) destroy() { + ctx.mu.Lock() + defer ctx.mu.Unlock() + ctx.dead = true + if ctx.outstanding == 0 { + close(ctx.done) + } +} + +// Prepare reserves space for a new request, returning true if available. +// Returns false if the context is busy. +func (ctx *AIOContext) Prepare() bool { + ctx.mu.Lock() + defer ctx.mu.Unlock() + if ctx.outstanding >= ctx.maxOutstanding { + return false + } + ctx.outstanding++ + return true +} + +// PopRequest pops a completed request if available, this function does not do +// any blocking. Returns false if no request is available. +func (ctx *AIOContext) PopRequest() (interface{}, bool) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + + // Is there anything ready? + if e := ctx.results.Front(); e != nil { + ctx.results.Remove(e) + ctx.outstanding-- + if ctx.outstanding == 0 && ctx.dead { + close(ctx.done) + } + return e.data, true + } + return nil, false +} + +// FinishRequest finishes a pending request. It queues up the data +// and notifies listeners. +func (ctx *AIOContext) FinishRequest(data interface{}) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + + // Push to the list and notify opportunistically. The channel notify + // here is guaranteed to be safe because outstanding must be non-zero. + // The done channel is only closed when outstanding reaches zero. + ctx.results.PushBack(&ioResult{data: data}) + + select { + case ctx.done <- struct{}{}: + default: + } +} + +// WaitChannel returns a channel that is notified when an AIO request is +// completed. +// +// The boolean return value indicates whether or not the context is active. +func (ctx *AIOContext) WaitChannel() (chan struct{}, bool) { + ctx.mu.Lock() + defer ctx.mu.Unlock() + if ctx.outstanding == 0 && ctx.dead { + return nil, false + } + return ctx.done, true +} + +// aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO +// ring buffers. +// +// +stateify savable +type aioMappable struct { + refs.AtomicRefCount + + mfp pgalloc.MemoryFileProvider + fr platform.FileRange +} + +var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp()) + +func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { + fr, err := mfp.MemoryFile().Allocate(aioRingBufferSize, usage.Anonymous) + if err != nil { + return nil, err + } + return &aioMappable{mfp: mfp, fr: fr}, nil +} + +// DecRef implements refs.RefCounter.DecRef. +func (m *aioMappable) DecRef() { + m.AtomicRefCount.DecRefWithDestructor(func() { + m.mfp.MemoryFile().DecRef(m.fr) + }) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (m *aioMappable) MappedName(ctx context.Context) string { + return "[aio]" +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (m *aioMappable) DeviceID() uint64 { + return 0 +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (m *aioMappable) InodeID() uint64 { + return 0 +} + +// Msync implements memmap.MappingIdentity.Msync. +func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { + // Linux: aio_ring_fops.fsync == NULL + return syserror.EINVAL +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (m *aioMappable) AddMapping(_ context.Context, _ memmap.MappingSpace, ar usermem.AddrRange, offset uint64, _ bool) error { + // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() + // sets VM_DONTEXPAND). 
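FinishRequest above never blocks: done is a buffered channel of capacity 1, so a pending wakeup absorbs further notifications, and waiters drain completions via PopRequest after each wakeup. A minimal sketch of that opportunistic-notify idiom (notify is a hypothetical helper):

package main

import "fmt"

// notify performs the same opportunistic wakeup as
// AIOContext.FinishRequest: with a capacity-1 channel, a send succeeds
// only if no wakeup is already pending, so completions coalesce and the
// sender never blocks.
func notify(done chan struct{}) {
	select {
	case done <- struct{}{}:
	default:
	}
}

func main() {
	done := make(chan struct{}, 1)
	notify(done) // queues a wakeup
	notify(done) // coalesced: a wakeup is already pending
	<-done
	fmt.Println("one wakeup delivered for two completions")
}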
+ if offset != 0 || uint64(ar.Length()) != aioRingBufferSize { + return syserror.EFAULT + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (m *aioMappable) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) { +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, _ bool) error { + // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() + // sets VM_DONTEXPAND). + if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize { + return syserror.EFAULT + } + // Require that the mapping correspond to a live AIOContext. Compare + // Linux's fs/aio.c:aio_ring_mremap(). + mm, ok := ms.(*MemoryManager) + if !ok { + return syserror.EINVAL + } + am := &mm.aioManager + am.mu.Lock() + defer am.mu.Unlock() + oldID := uint64(srcAR.Start) + aioCtx, ok := am.contexts[oldID] + if !ok { + return syserror.EINVAL + } + aioCtx.mu.Lock() + defer aioCtx.mu.Unlock() + if aioCtx.dead { + return syserror.EINVAL + } + // Use the new ID for the AIOContext. + am.contexts[uint64(dstAR.Start)] = aioCtx + delete(am.contexts, oldID) + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > m.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: m.mfp.MemoryFile(), + Offset: m.fr.Start + source.Start, + Perms: usermem.AnyAccess, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (m *aioMappable) InvalidateUnsavable(ctx context.Context) error { + return nil +} + +// NewAIOContext creates a new context for asynchronous I/O. +// +// NewAIOContext is analogous to Linux's fs/aio.c:ioctx_alloc(). +func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint64, error) { + // libaio get_ioevents() expects context "handle" to be a valid address. + // libaio peeks inside looking for a magic number. This function allocates + // a page per context and keeps it set to zeroes to ensure it will not + // match AIO_RING_MAGIC and make libaio happy. + m, err := newAIOMappable(mm.mfp) + if err != nil { + return 0, err + } + defer m.DecRef() + addr, err := mm.MMap(ctx, memmap.MMapOpts{ + Length: aioRingBufferSize, + MappingIdentity: m, + Mappable: m, + // TODO(fvoznika): Linux does "do_mmap_pgoff(..., PROT_READ | + // PROT_WRITE, ...)" in fs/aio.c:aio_setup_ring(); why do we make this + // mapping read-only? + Perms: usermem.Read, + MaxPerms: usermem.Read, + }) + if err != nil { + return 0, err + } + id := uint64(addr) + if !mm.aioManager.newAIOContext(events, id) { + mm.MUnmap(ctx, addr, aioRingBufferSize) + return 0, syserror.EINVAL + } + return id, nil +} + +// DestroyAIOContext destroys an asynchronous I/O context. It returns false if +// the context does not exist. +func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) bool { + if _, ok := mm.LookupAIOContext(ctx, id); !ok { + return false + } + + // Only unmaps after it assured that the address is a valid aio context to + // prevent random memory from been unmapped. 
+ // + // Note: It's possible to unmap this address and map something else into + // the same address. Then it would be unmapping memory that it doesn't own. + // This is, however, the way Linux implements AIO. Keeps the same [weird] + // semantics in case anyone relies on it. + mm.MUnmap(ctx, usermem.Addr(id), aioRingBufferSize) + + return mm.aioManager.destroyAIOContext(id) +} + +// LookupAIOContext looks up the given context. It returns false if the context +// does not exist. +func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOContext, bool) { + aioCtx, ok := mm.aioManager.lookupAIOContext(id) + if !ok { + return nil, false + } + + // Protect against 'ids' that are inaccessible (Linux also reads 4 bytes + // from id). + var buf [4]byte + _, err := mm.CopyIn(ctx, usermem.Addr(id), buf[:], usermem.IOOpts{}) + if err != nil { + return nil, false + } + + return aioCtx, true +} diff --git a/pkg/sentry/mm/aio_context_state.go b/pkg/sentry/mm/aio_context_state.go new file mode 100644 index 000000000..c37fc9f7b --- /dev/null +++ b/pkg/sentry/mm/aio_context_state.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +// afterLoad is invoked by stateify. +func (a *AIOContext) afterLoad() { + a.done = make(chan struct{}, 1) +} diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go new file mode 100644 index 000000000..fe58cfc4c --- /dev/null +++ b/pkg/sentry/mm/debug.go @@ -0,0 +1,98 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +const ( + // If checkInvariants is true, perform runtime checks for invariants + // expected by the mm package. This is normally disabled since MM is a + // significant hot path in general, and some such checks (notably + // memmap.CheckTranslateResult) are very expensive. + checkInvariants = false + + // If logIOErrors is true, log I/O errors that originate from MM before + // converting them to EFAULT. + logIOErrors = false +) + +// String implements fmt.Stringer.String. +func (mm *MemoryManager) String() string { + return mm.DebugString(context.Background()) +} + +// DebugString returns a string containing information about mm for debugging. 
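The pma debug output below uses a /proc/maps-style permission column, with 'c' standing in for 'w' when a writable pma is still copy-on-write, followed by 'p' (private) or 's' (shared). A self-contained sketch of that flag encoding (permBits and its parameters are hypothetical):

package main

import "fmt"

// permBits renders rwxp flags the way debugStringEntryLocked does for
// pmas: 'c' marks a writable mapping that still needs copy-on-write,
// and the final column is 'p' for private or 's' for shared.
func permBits(read, write, exec, cow, private bool) string {
	b := []byte("----")
	if read {
		b[0] = 'r'
	}
	if write {
		if cow {
			b[1] = 'c'
		} else {
			b[1] = 'w'
		}
	}
	if exec {
		b[2] = 'x'
	}
	if private {
		b[3] = 'p'
	} else {
		b[3] = 's'
	}
	return string(b)
}

func main() {
	// A hypothetical private, readable, still-COW pma entry.
	fmt.Printf("%08x-%08x %s\n", 0x400000, 0x401000, permBits(true, true, false, true, true))
}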
+func (mm *MemoryManager) DebugString(ctx context.Context) string { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return mm.debugStringLocked(ctx) +} + +// Preconditions: mm.mappingMu and mm.activeMu must be locked. +func (mm *MemoryManager) debugStringLocked(ctx context.Context) string { + var b bytes.Buffer + b.WriteString("VMAs:\n") + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + b.Write(mm.vmaMapsEntryLocked(ctx, vseg)) + } + b.WriteString("PMAs:\n") + for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { + b.Write(pseg.debugStringEntryLocked()) + } + return string(b.Bytes()) +} + +// Preconditions: mm.activeMu must be locked. +func (pseg pmaIterator) debugStringEntryLocked() []byte { + var b bytes.Buffer + + fmt.Fprintf(&b, "%08x-%08x ", pseg.Start(), pseg.End()) + + pma := pseg.ValuePtr() + if pma.effectivePerms.Read { + b.WriteByte('r') + } else { + b.WriteByte('-') + } + if pma.effectivePerms.Write { + if pma.needCOW { + b.WriteByte('c') + } else { + b.WriteByte('w') + } + } else { + b.WriteByte('-') + } + if pma.effectivePerms.Execute { + b.WriteByte('x') + } else { + b.WriteByte('-') + } + if pma.private { + b.WriteByte('p') + } else { + b.WriteByte('s') + } + + fmt.Fprintf(&b, " %08x %T\n", pma.off, pma.file) + return b.Bytes() +} diff --git a/pkg/sentry/mm/file_refcount_set.go b/pkg/sentry/mm/file_refcount_set.go new file mode 100755 index 000000000..99c088c83 --- /dev/null +++ b/pkg/sentry/mm/file_refcount_set.go @@ -0,0 +1,1274 @@ +package mm + +import ( + __generics_imported0 "gvisor.googlesource.com/gvisor/pkg/sentry/platform" +) + +import ( + "bytes" + "fmt" +) + +const ( + // minDegree is the minimum degree of an internal node in a Set B-tree. + // + // - Any non-root node has at least minDegree-1 segments. + // + // - Any non-root internal (non-leaf) node has at least minDegree children. + // + // - The root node may have fewer than minDegree-1 segments, but it may + // only have 0 segments if the tree is empty. + // + // Our implementation requires minDegree >= 3. Higher values of minDegree + // usually improve performance, but increase memory usage for small sets. + fileRefcountminDegree = 3 + + fileRefcountmaxDegree = 2 * fileRefcountminDegree +) + +// A Set is a mapping of segments with non-overlapping Range keys. The zero +// value for a Set is an empty set. Set values are not safely movable nor +// copyable. Set is thread-compatible. +// +// +stateify savable +type fileRefcountSet struct { + root fileRefcountnode `state:".(*fileRefcountSegmentDataSlices)"` +} + +// IsEmpty returns true if the set contains no segments. +func (s *fileRefcountSet) IsEmpty() bool { + return s.root.nrSegments == 0 +} + +// IsEmptyRange returns true iff no segments in the set overlap the given +// range. This is semantically equivalent to s.SpanRange(r) == 0, but may be +// more efficient. +func (s *fileRefcountSet) IsEmptyRange(r __generics_imported0.FileRange) bool { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return true + } + _, gap := s.Find(r.Start) + if !gap.Ok() { + return false + } + return r.End <= gap.End() +} + +// Span returns the total size of all segments in the set. 
+func (s *fileRefcountSet) Span() uint64 { + var sz uint64 + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sz += seg.Range().Length() + } + return sz +} + +// SpanRange returns the total size of the intersection of segments in the set +// with the given range. +func (s *fileRefcountSet) SpanRange(r __generics_imported0.FileRange) uint64 { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return 0 + } + var sz uint64 + for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { + sz += seg.Range().Intersect(r).Length() + } + return sz +} + +// FirstSegment returns the first segment in the set. If the set is empty, +// FirstSegment returns a terminal iterator. +func (s *fileRefcountSet) FirstSegment() fileRefcountIterator { + if s.root.nrSegments == 0 { + return fileRefcountIterator{} + } + return s.root.firstSegment() +} + +// LastSegment returns the last segment in the set. If the set is empty, +// LastSegment returns a terminal iterator. +func (s *fileRefcountSet) LastSegment() fileRefcountIterator { + if s.root.nrSegments == 0 { + return fileRefcountIterator{} + } + return s.root.lastSegment() +} + +// FirstGap returns the first gap in the set. +func (s *fileRefcountSet) FirstGap() fileRefcountGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[0] + } + return fileRefcountGapIterator{n, 0} +} + +// LastGap returns the last gap in the set. +func (s *fileRefcountSet) LastGap() fileRefcountGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[n.nrSegments] + } + return fileRefcountGapIterator{n, n.nrSegments} +} + +// Find returns the segment or gap whose range contains the given key. If a +// segment is found, the returned Iterator is non-terminal and the +// returned GapIterator is terminal. Otherwise, the returned Iterator is +// terminal and the returned GapIterator is non-terminal. +func (s *fileRefcountSet) Find(key uint64) (fileRefcountIterator, fileRefcountGapIterator) { + n := &s.root + for { + + lower := 0 + upper := n.nrSegments + for lower < upper { + i := lower + (upper-lower)/2 + if r := n.keys[i]; key < r.End { + if key >= r.Start { + return fileRefcountIterator{n, i}, fileRefcountGapIterator{} + } + upper = i + } else { + lower = i + 1 + } + } + i := lower + if !n.hasChildren { + return fileRefcountIterator{}, fileRefcountGapIterator{n, i} + } + n = n.children[i] + } +} + +// FindSegment returns the segment whose range contains the given key. If no +// such segment exists, FindSegment returns a terminal iterator. +func (s *fileRefcountSet) FindSegment(key uint64) fileRefcountIterator { + seg, _ := s.Find(key) + return seg +} + +// LowerBoundSegment returns the segment with the lowest range that contains a +// key greater than or equal to min. If no such segment exists, +// LowerBoundSegment returns a terminal iterator. +func (s *fileRefcountSet) LowerBoundSegment(min uint64) fileRefcountIterator { + seg, gap := s.Find(min) + if seg.Ok() { + return seg + } + return gap.NextSegment() +} + +// UpperBoundSegment returns the segment with the highest range that contains a +// key less than or equal to max. If no such segment exists, UpperBoundSegment +// returns a terminal iterator. +func (s *fileRefcountSet) UpperBoundSegment(max uint64) fileRefcountIterator { + seg, gap := s.Find(max) + if seg.Ok() { + return seg + } + return gap.PrevSegment() +} + +// FindGap returns the gap containing the given key. If no such gap exists +// (i.e. 
the set contains a segment containing that key), FindGap returns a +// terminal iterator. +func (s *fileRefcountSet) FindGap(key uint64) fileRefcountGapIterator { + _, gap := s.Find(key) + return gap +} + +// LowerBoundGap returns the gap with the lowest range that is greater than or +// equal to min. +func (s *fileRefcountSet) LowerBoundGap(min uint64) fileRefcountGapIterator { + seg, gap := s.Find(min) + if gap.Ok() { + return gap + } + return seg.NextGap() +} + +// UpperBoundGap returns the gap with the highest range that is less than or +// equal to max. +func (s *fileRefcountSet) UpperBoundGap(max uint64) fileRefcountGapIterator { + seg, gap := s.Find(max) + if gap.Ok() { + return gap + } + return seg.PrevGap() +} + +// Add inserts the given segment into the set and returns true. If the new +// segment can be merged with adjacent segments, Add will do so. If the new +// segment would overlap an existing segment, Add returns false. If Add +// succeeds, all existing iterators are invalidated. +func (s *fileRefcountSet) Add(r __generics_imported0.FileRange, val int32) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.Insert(gap, r, val) + return true +} + +// AddWithoutMerging inserts the given segment into the set and returns true. +// If it would overlap an existing segment, AddWithoutMerging does nothing and +// returns false. If AddWithoutMerging succeeds, all existing iterators are +// invalidated. +func (s *fileRefcountSet) AddWithoutMerging(r __generics_imported0.FileRange, val int32) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.InsertWithoutMergingUnchecked(gap, r, val) + return true +} + +// Insert inserts the given segment into the given gap. If the new segment can +// be merged with adjacent segments, Insert will do so. Insert returns an +// iterator to the segment containing the inserted value (which may have been +// merged with other values). All existing iterators (including gap, but not +// including the returned iterator) are invalidated. +// +// If the gap cannot accommodate the segment, or if r is invalid, Insert panics. +// +// Insert is semantically equivalent to a InsertWithoutMerging followed by a +// Merge, but may be more efficient. Note that there is no unchecked variant of +// Insert since Insert must retrieve and inspect gap's predecessor and +// successor segments regardless. 
+func (s *fileRefcountSet) Insert(gap fileRefcountGapIterator, r __generics_imported0.FileRange, val int32) fileRefcountIterator { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + prev, next := gap.PrevSegment(), gap.NextSegment() + if prev.Ok() && prev.End() > r.Start { + panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range())) + } + if next.Ok() && next.Start() < r.End { + panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range())) + } + if prev.Ok() && prev.End() == r.Start { + if mval, ok := (fileRefcountSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok { + prev.SetEndUnchecked(r.End) + prev.SetValue(mval) + if next.Ok() && next.Start() == r.End { + val = mval + if mval, ok := (fileRefcountSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok { + prev.SetEndUnchecked(next.End()) + prev.SetValue(mval) + return s.Remove(next).PrevSegment() + } + } + return prev + } + } + if next.Ok() && next.Start() == r.End { + if mval, ok := (fileRefcountSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok { + next.SetStartUnchecked(r.Start) + next.SetValue(mval) + return next + } + } + return s.InsertWithoutMergingUnchecked(gap, r, val) +} + +// InsertWithoutMerging inserts the given segment into the given gap and +// returns an iterator to the inserted segment. All existing iterators +// (including gap, but not including the returned iterator) are invalidated. +// +// If the gap cannot accommodate the segment, or if r is invalid, +// InsertWithoutMerging panics. +func (s *fileRefcountSet) InsertWithoutMerging(gap fileRefcountGapIterator, r __generics_imported0.FileRange, val int32) fileRefcountIterator { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + if gr := gap.Range(); !gr.IsSupersetOf(r) { + panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr)) + } + return s.InsertWithoutMergingUnchecked(gap, r, val) +} + +// InsertWithoutMergingUnchecked inserts the given segment into the given gap +// and returns an iterator to the inserted segment. All existing iterators +// (including gap, but not including the returned iterator) are invalidated. +// +// Preconditions: r.Start >= gap.Start(); r.End <= gap.End(). +func (s *fileRefcountSet) InsertWithoutMergingUnchecked(gap fileRefcountGapIterator, r __generics_imported0.FileRange, val int32) fileRefcountIterator { + gap = gap.node.rebalanceBeforeInsert(gap) + copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments]) + copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments]) + gap.node.keys[gap.index] = r + gap.node.values[gap.index] = val + gap.node.nrSegments++ + return fileRefcountIterator{gap.node, gap.index} +} + +// Remove removes the given segment and returns an iterator to the vacated gap. +// All existing iterators (including seg, but not including the returned +// iterator) are invalidated. 
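Insert above only coalesces with a neighbor when the set's Merge callback succeeds; for this set, that means equal reference counts over adjacent file ranges. A simplified, slice-based sketch of the documented add-then-merge semantics, not the B-tree implementation itself (segment and addMerging are hypothetical):

package main

import "fmt"

type segment struct {
	start, end uint64
	val        int32
}

// addMerging mimics the documented Add semantics on a sorted,
// non-overlapping slice: reject any overlap, insert, then coalesce
// adjacent segments whose values merge (here: are equal, as for
// reference counts).
func addMerging(segs []segment, s segment) ([]segment, bool) {
	out := make([]segment, 0, len(segs)+1)
	i := 0
	for ; i < len(segs) && segs[i].end <= s.start; i++ {
		out = append(out, segs[i])
	}
	if i < len(segs) && segs[i].start < s.end {
		return segs, false // overlap: Add returns false
	}
	out = append(out, s)
	out = append(out, segs[i:]...)
	merged := []segment{out[0]}
	for _, seg := range out[1:] {
		last := &merged[len(merged)-1]
		if last.end == seg.start && last.val == seg.val {
			last.end = seg.end // merge with predecessor
		} else {
			merged = append(merged, seg)
		}
	}
	return merged, true
}

func main() {
	segs := []segment{{start: 0, end: 4096, val: 1}}
	segs, _ = addMerging(segs, segment{start: 4096, end: 8192, val: 1})
	fmt.Println(segs) // [{0 8192 1}]
}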
+func (s *fileRefcountSet) Remove(seg fileRefcountIterator) fileRefcountGapIterator { + + if seg.node.hasChildren { + + victim := seg.PrevSegment() + + seg.SetRangeUnchecked(victim.Range()) + seg.SetValue(victim.Value()) + return s.Remove(victim).NextGap() + } + copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments]) + copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments]) + fileRefcountSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1]) + seg.node.nrSegments-- + return seg.node.rebalanceAfterRemove(fileRefcountGapIterator{seg.node, seg.index}) +} + +// RemoveAll removes all segments from the set. All existing iterators are +// invalidated. +func (s *fileRefcountSet) RemoveAll() { + s.root = fileRefcountnode{} +} + +// RemoveRange removes all segments in the given range. An iterator to the +// newly formed gap is returned, and all existing iterators are invalidated. +func (s *fileRefcountSet) RemoveRange(r __generics_imported0.FileRange) fileRefcountGapIterator { + seg, gap := s.Find(r.Start) + if seg.Ok() { + seg = s.Isolate(seg, r) + gap = s.Remove(seg) + } + for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() { + seg = s.Isolate(seg, r) + gap = s.Remove(seg) + } + return gap +} + +// Merge attempts to merge two neighboring segments. If successful, Merge +// returns an iterator to the merged segment, and all existing iterators are +// invalidated. Otherwise, Merge returns a terminal iterator. +// +// If first is not the predecessor of second, Merge panics. +func (s *fileRefcountSet) Merge(first, second fileRefcountIterator) fileRefcountIterator { + if first.NextSegment() != second { + panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range())) + } + return s.MergeUnchecked(first, second) +} + +// MergeUnchecked attempts to merge two neighboring segments. If successful, +// MergeUnchecked returns an iterator to the merged segment, and all existing +// iterators are invalidated. Otherwise, MergeUnchecked returns a terminal +// iterator. +// +// Precondition: first is the predecessor of second: first.NextSegment() == +// second, first == second.PrevSegment(). +func (s *fileRefcountSet) MergeUnchecked(first, second fileRefcountIterator) fileRefcountIterator { + if first.End() == second.Start() { + if mval, ok := (fileRefcountSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok { + + first.SetEndUnchecked(second.End()) + first.SetValue(mval) + return s.Remove(second).PrevSegment() + } + } + return fileRefcountIterator{} +} + +// MergeAll attempts to merge all adjacent segments in the set. All existing +// iterators are invalidated. +func (s *fileRefcountSet) MergeAll() { + seg := s.FirstSegment() + if !seg.Ok() { + return + } + next := seg.NextSegment() + for next.Ok() { + if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { + seg, next = mseg, mseg.NextSegment() + } else { + seg, next = next, next.NextSegment() + } + } +} + +// MergeRange attempts to merge all adjacent segments that contain a key in the +// specific range. All existing iterators are invalidated. 
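RemoveRange above works by isolating each overlapping segment to r and then removing it; Isolate (defined below) clamps a segment to r by splitting at r.Start and r.End when those keys fall strictly inside it. A sketch of just that clamping arithmetic (rng and isolate are hypothetical):

package main

import "fmt"

type rng struct{ start, end uint64 }

// isolate clamps seg so that it does not escape r, which is the effect
// Isolate achieves by splitting at r.start and r.end when those keys
// fall strictly inside seg.
func isolate(seg, r rng) rng {
	if r.start > seg.start && r.start < seg.end {
		seg.start = r.start // split at r.start, keep the right piece
	}
	if r.end > seg.start && r.end < seg.end {
		seg.end = r.end // split at r.end, keep the left piece
	}
	return seg
}

func main() {
	fmt.Println(isolate(rng{0, 100}, rng{25, 75})) // {25 75}
}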
+func (s *fileRefcountSet) MergeRange(r __generics_imported0.FileRange) { + seg := s.LowerBoundSegment(r.Start) + if !seg.Ok() { + return + } + next := seg.NextSegment() + for next.Ok() && next.Range().Start < r.End { + if mseg := s.MergeUnchecked(seg, next); mseg.Ok() { + seg, next = mseg, mseg.NextSegment() + } else { + seg, next = next, next.NextSegment() + } + } +} + +// MergeAdjacent attempts to merge the segment containing r.Start with its +// predecessor, and the segment containing r.End-1 with its successor. +func (s *fileRefcountSet) MergeAdjacent(r __generics_imported0.FileRange) { + first := s.FindSegment(r.Start) + if first.Ok() { + if prev := first.PrevSegment(); prev.Ok() { + s.Merge(prev, first) + } + } + last := s.FindSegment(r.End - 1) + if last.Ok() { + if next := last.NextSegment(); next.Ok() { + s.Merge(last, next) + } + } +} + +// Split splits the given segment at the given key and returns iterators to the +// two resulting segments. All existing iterators (including seg, but not +// including the returned iterators) are invalidated. +// +// If the segment cannot be split at split (because split is at the start or +// end of the segment's range, so splitting would produce a segment with zero +// length, or because split falls outside the segment's range altogether), +// Split panics. +func (s *fileRefcountSet) Split(seg fileRefcountIterator, split uint64) (fileRefcountIterator, fileRefcountIterator) { + if !seg.Range().CanSplitAt(split) { + panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split)) + } + return s.SplitUnchecked(seg, split) +} + +// SplitUnchecked splits the given segment at the given key and returns +// iterators to the two resulting segments. All existing iterators (including +// seg, but not including the returned iterators) are invalidated. +// +// Preconditions: seg.Start() < key < seg.End(). +func (s *fileRefcountSet) SplitUnchecked(seg fileRefcountIterator, split uint64) (fileRefcountIterator, fileRefcountIterator) { + val1, val2 := (fileRefcountSetFunctions{}).Split(seg.Range(), seg.Value(), split) + end2 := seg.End() + seg.SetEndUnchecked(split) + seg.SetValue(val1) + seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.FileRange{split, end2}, val2) + + return seg2.PrevSegment(), seg2 +} + +// SplitAt splits the segment straddling split, if one exists. SplitAt returns +// true if a segment was split and false otherwise. If SplitAt splits a +// segment, all existing iterators are invalidated. +func (s *fileRefcountSet) SplitAt(split uint64) bool { + if seg := s.FindSegment(split); seg.Ok() && seg.Range().CanSplitAt(split) { + s.SplitUnchecked(seg, split) + return true + } + return false +} + +// Isolate ensures that the given segment's range does not escape r by +// splitting at r.Start and r.End if necessary, and returns an updated iterator +// to the bounded segment. All existing iterators (including seg, but not +// including the returned iterators) are invalidated. +func (s *fileRefcountSet) Isolate(seg fileRefcountIterator, r __generics_imported0.FileRange) fileRefcountIterator { + if seg.Range().CanSplitAt(r.Start) { + _, seg = s.SplitUnchecked(seg, r.Start) + } + if seg.Range().CanSplitAt(r.End) { + seg, _ = s.SplitUnchecked(seg, r.End) + } + return seg +} + +// ApplyContiguous applies a function to a contiguous range of segments, +// splitting if necessary. The function is applied until the first gap is +// encountered, at which point the gap is returned. 
If the function is applied +// across the entire range, a terminal gap is returned. All existing iterators +// are invalidated. +// +// N.B. The Iterator must not be invalidated by the function. +func (s *fileRefcountSet) ApplyContiguous(r __generics_imported0.FileRange, fn func(seg fileRefcountIterator)) fileRefcountGapIterator { + seg, gap := s.Find(r.Start) + if !seg.Ok() { + return gap + } + for { + seg = s.Isolate(seg, r) + fn(seg) + if seg.End() >= r.End { + return fileRefcountGapIterator{} + } + gap = seg.NextGap() + if !gap.IsEmpty() { + return gap + } + seg = gap.NextSegment() + if !seg.Ok() { + + return fileRefcountGapIterator{} + } + } +} + +// +stateify savable +type fileRefcountnode struct { + // An internal binary tree node looks like: + // + // K + // / \ + // Cl Cr + // + // where all keys in the subtree rooted by Cl (the left subtree) are less + // than K (the key of the parent node), and all keys in the subtree rooted + // by Cr (the right subtree) are greater than K. + // + // An internal B-tree node's indexes work out to look like: + // + // K0 K1 K2 ... Kn-1 + // / \/ \/ \ ... / \ + // C0 C1 C2 C3 ... Cn-1 Cn + // + // where n is nrSegments. + nrSegments int + + // parent is a pointer to this node's parent. If this node is root, parent + // is nil. + parent *fileRefcountnode + + // parentIndex is the index of this node in parent.children. + parentIndex int + + // Flag for internal nodes that is technically redundant with "children[0] + // != nil", but is stored in the first cache line. "hasChildren" rather + // than "isLeaf" because false must be the correct value for an empty root. + hasChildren bool + + // Nodes store keys and values in separate arrays to maximize locality in + // the common case (scanning keys for lookup). + keys [fileRefcountmaxDegree - 1]__generics_imported0.FileRange + values [fileRefcountmaxDegree - 1]int32 + children [fileRefcountmaxDegree]*fileRefcountnode +} + +// firstSegment returns the first segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *fileRefcountnode) firstSegment() fileRefcountIterator { + for n.hasChildren { + n = n.children[0] + } + return fileRefcountIterator{n, 0} +} + +// lastSegment returns the last segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *fileRefcountnode) lastSegment() fileRefcountIterator { + for n.hasChildren { + n = n.children[n.nrSegments] + } + return fileRefcountIterator{n, n.nrSegments - 1} +} + +func (n *fileRefcountnode) prevSibling() *fileRefcountnode { + if n.parent == nil || n.parentIndex == 0 { + return nil + } + return n.parent.children[n.parentIndex-1] +} + +func (n *fileRefcountnode) nextSibling() *fileRefcountnode { + if n.parent == nil || n.parentIndex == n.parent.nrSegments { + return nil + } + return n.parent.children[n.parentIndex+1] +} + +// rebalanceBeforeInsert splits n and its ancestors if they are full, as +// required for insertion, and returns an updated iterator to the position +// represented by gap. 
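With fileRefcountminDegree = 3, a node is split once it holds fileRefcountmaxDegree-1 = 5 segments: the middle key is promoted and the remaining keys become two nodes of minDegree-1 = 2 keys each, which is the arithmetic rebalanceBeforeInsert performs below. A sketch of the root-split case (splitFull is hypothetical):

package main

import "fmt"

// splitFull mirrors the root-split arithmetic in rebalanceBeforeInsert
// for minDegree = 3: a full node's five keys become two two-key
// children with the middle key promoted into the new root.
func splitFull(keys []int) (left []int, mid int, right []int) {
	const minDegree = 3
	left = keys[:minDegree-1]
	mid = keys[minDegree-1]
	right = keys[minDegree:]
	return
}

func main() {
	l, m, r := splitFull([]int{10, 20, 30, 40, 50})
	fmt.Println(l, m, r) // [10 20] 30 [40 50]
}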
+func (n *fileRefcountnode) rebalanceBeforeInsert(gap fileRefcountGapIterator) fileRefcountGapIterator { + if n.parent != nil { + gap = n.parent.rebalanceBeforeInsert(gap) + } + if n.nrSegments < fileRefcountmaxDegree-1 { + return gap + } + if n.parent == nil { + + left := &fileRefcountnode{ + nrSegments: fileRefcountminDegree - 1, + parent: n, + parentIndex: 0, + hasChildren: n.hasChildren, + } + right := &fileRefcountnode{ + nrSegments: fileRefcountminDegree - 1, + parent: n, + parentIndex: 1, + hasChildren: n.hasChildren, + } + copy(left.keys[:fileRefcountminDegree-1], n.keys[:fileRefcountminDegree-1]) + copy(left.values[:fileRefcountminDegree-1], n.values[:fileRefcountminDegree-1]) + copy(right.keys[:fileRefcountminDegree-1], n.keys[fileRefcountminDegree:]) + copy(right.values[:fileRefcountminDegree-1], n.values[fileRefcountminDegree:]) + n.keys[0], n.values[0] = n.keys[fileRefcountminDegree-1], n.values[fileRefcountminDegree-1] + fileRefcountzeroValueSlice(n.values[1:]) + if n.hasChildren { + copy(left.children[:fileRefcountminDegree], n.children[:fileRefcountminDegree]) + copy(right.children[:fileRefcountminDegree], n.children[fileRefcountminDegree:]) + fileRefcountzeroNodeSlice(n.children[2:]) + for i := 0; i < fileRefcountminDegree; i++ { + left.children[i].parent = left + left.children[i].parentIndex = i + right.children[i].parent = right + right.children[i].parentIndex = i + } + } + n.nrSegments = 1 + n.hasChildren = true + n.children[0] = left + n.children[1] = right + if gap.node != n { + return gap + } + if gap.index < fileRefcountminDegree { + return fileRefcountGapIterator{left, gap.index} + } + return fileRefcountGapIterator{right, gap.index - fileRefcountminDegree} + } + + copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) + copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) + n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[fileRefcountminDegree-1], n.values[fileRefcountminDegree-1] + copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) + for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { + n.parent.children[i].parentIndex = i + } + sibling := &fileRefcountnode{ + nrSegments: fileRefcountminDegree - 1, + parent: n.parent, + parentIndex: n.parentIndex + 1, + hasChildren: n.hasChildren, + } + n.parent.children[n.parentIndex+1] = sibling + n.parent.nrSegments++ + copy(sibling.keys[:fileRefcountminDegree-1], n.keys[fileRefcountminDegree:]) + copy(sibling.values[:fileRefcountminDegree-1], n.values[fileRefcountminDegree:]) + fileRefcountzeroValueSlice(n.values[fileRefcountminDegree-1:]) + if n.hasChildren { + copy(sibling.children[:fileRefcountminDegree], n.children[fileRefcountminDegree:]) + fileRefcountzeroNodeSlice(n.children[fileRefcountminDegree:]) + for i := 0; i < fileRefcountminDegree; i++ { + sibling.children[i].parent = sibling + sibling.children[i].parentIndex = i + } + } + n.nrSegments = fileRefcountminDegree - 1 + + if gap.node != n { + return gap + } + if gap.index < fileRefcountminDegree { + return gap + } + return fileRefcountGapIterator{sibling, gap.index - fileRefcountminDegree} +} + +// rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient +// (contain fewer segments than required by B-tree invariants), as required for +// removal, and returns an updated iterator to the position represented by gap. 
+// +// Precondition: n is the only node in the tree that may currently violate a +// B-tree invariant. +func (n *fileRefcountnode) rebalanceAfterRemove(gap fileRefcountGapIterator) fileRefcountGapIterator { + for { + if n.nrSegments >= fileRefcountminDegree-1 { + return gap + } + if n.parent == nil { + + return gap + } + + if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= fileRefcountminDegree { + copy(n.keys[1:], n.keys[:n.nrSegments]) + copy(n.values[1:], n.values[:n.nrSegments]) + n.keys[0] = n.parent.keys[n.parentIndex-1] + n.values[0] = n.parent.values[n.parentIndex-1] + n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] + n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] + fileRefcountSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) + if n.hasChildren { + copy(n.children[1:], n.children[:n.nrSegments+1]) + n.children[0] = sibling.children[sibling.nrSegments] + sibling.children[sibling.nrSegments] = nil + n.children[0].parent = n + n.children[0].parentIndex = 0 + for i := 1; i < n.nrSegments+2; i++ { + n.children[i].parentIndex = i + } + } + n.nrSegments++ + sibling.nrSegments-- + if gap.node == sibling && gap.index == sibling.nrSegments { + return fileRefcountGapIterator{n, 0} + } + if gap.node == n { + return fileRefcountGapIterator{n, gap.index + 1} + } + return gap + } + if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= fileRefcountminDegree { + n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] + n.values[n.nrSegments] = n.parent.values[n.parentIndex] + n.parent.keys[n.parentIndex] = sibling.keys[0] + n.parent.values[n.parentIndex] = sibling.values[0] + copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) + copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) + fileRefcountSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) + if n.hasChildren { + n.children[n.nrSegments+1] = sibling.children[0] + copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) + sibling.children[sibling.nrSegments] = nil + n.children[n.nrSegments+1].parent = n + n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 + for i := 0; i < sibling.nrSegments; i++ { + sibling.children[i].parentIndex = i + } + } + n.nrSegments++ + sibling.nrSegments-- + if gap.node == sibling { + if gap.index == 0 { + return fileRefcountGapIterator{n, n.nrSegments} + } + return fileRefcountGapIterator{sibling, gap.index - 1} + } + return gap + } + + p := n.parent + if p.nrSegments == 1 { + + left, right := p.children[0], p.children[1] + p.nrSegments = left.nrSegments + right.nrSegments + 1 + p.hasChildren = left.hasChildren + p.keys[left.nrSegments] = p.keys[0] + p.values[left.nrSegments] = p.values[0] + copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) + copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) + copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) + copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) + if left.hasChildren { + copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) + copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) + for i := 0; i < p.nrSegments+1; i++ { + p.children[i].parent = p + p.children[i].parentIndex = i + } + } else { + p.children[0] = nil + p.children[1] = nil + } + if gap.node == left { + return fileRefcountGapIterator{p, gap.index} + } + if gap.node == right { + return fileRefcountGapIterator{p, gap.index + left.nrSegments + 1} + } + return gap + } + // Merge n 
and either sibling, along with the segment separating the + // two, into whichever of the two nodes comes first. This is the + // reverse of the non-root splitting case in + // node.rebalanceBeforeInsert. + var left, right *fileRefcountnode + if n.parentIndex > 0 { + left = n.prevSibling() + right = n + } else { + left = n + right = n.nextSibling() + } + + if gap.node == right { + gap = fileRefcountGapIterator{left, gap.index + left.nrSegments + 1} + } + left.keys[left.nrSegments] = p.keys[left.parentIndex] + left.values[left.nrSegments] = p.values[left.parentIndex] + copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) + copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments]) + if left.hasChildren { + copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) + for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ { + left.children[i].parent = left + left.children[i].parentIndex = i + } + } + left.nrSegments += right.nrSegments + 1 + copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments]) + copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments]) + fileRefcountSetFunctions{}.ClearValue(&p.values[p.nrSegments-1]) + copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1]) + for i := 0; i < p.nrSegments; i++ { + p.children[i].parentIndex = i + } + p.children[p.nrSegments] = nil + p.nrSegments-- + + n = p + } +} + +// A Iterator is conceptually one of: +// +// - A pointer to a segment in a set; or +// +// - A terminal iterator, which is a sentinel indicating that the end of +// iteration has been reached. +// +// Iterators are copyable values and are meaningfully equality-comparable. The +// zero value of Iterator is a terminal iterator. +// +// Unless otherwise specified, any mutation of a set invalidates all existing +// iterators into the set. +type fileRefcountIterator struct { + // node is the node containing the iterated segment. If the iterator is + // terminal, node is nil. + node *fileRefcountnode + + // index is the index of the segment in node.keys/values. + index int +} + +// Ok returns true if the iterator is not terminal. All other methods are only +// valid for non-terminal iterators. +func (seg fileRefcountIterator) Ok() bool { + return seg.node != nil +} + +// Range returns the iterated segment's range key. +func (seg fileRefcountIterator) Range() __generics_imported0.FileRange { + return seg.node.keys[seg.index] +} + +// Start is equivalent to Range().Start, but should be preferred if only the +// start of the range is needed. +func (seg fileRefcountIterator) Start() uint64 { + return seg.node.keys[seg.index].Start +} + +// End is equivalent to Range().End, but should be preferred if only the end of +// the range is needed. +func (seg fileRefcountIterator) End() uint64 { + return seg.node.keys[seg.index].End +} + +// SetRangeUnchecked mutates the iterated segment's range key. This operation +// does not invalidate any iterators. +// +// Preconditions: +// +// - r.Length() > 0. +// +// - The new range must not overlap an existing one: If seg.NextSegment().Ok(), +// then r.end <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then +// r.start >= seg.PrevSegment().End(). +func (seg fileRefcountIterator) SetRangeUnchecked(r __generics_imported0.FileRange) { + seg.node.keys[seg.index] = r +} + +// SetRange mutates the iterated segment's range key. 
If the new range would +// cause the iterated segment to overlap another segment, or if the new range +// is invalid, SetRange panics. This operation does not invalidate any +// iterators. +func (seg fileRefcountIterator) SetRange(r __generics_imported0.FileRange) { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { + panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) + } + if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { + panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) + } + seg.SetRangeUnchecked(r) +} + +// SetStartUnchecked mutates the iterated segment's start. This operation does +// not invalidate any iterators. +// +// Preconditions: The new start must be valid: start < seg.End(); if +// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). +func (seg fileRefcountIterator) SetStartUnchecked(start uint64) { + seg.node.keys[seg.index].Start = start +} + +// SetStart mutates the iterated segment's start. If the new start value would +// cause the iterated segment to overlap another segment, or would result in an +// invalid range, SetStart panics. This operation does not invalidate any +// iterators. +func (seg fileRefcountIterator) SetStart(start uint64) { + if start >= seg.End() { + panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) + } + if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { + panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) + } + seg.SetStartUnchecked(start) +} + +// SetEndUnchecked mutates the iterated segment's end. This operation does not +// invalidate any iterators. +// +// Preconditions: The new end must be valid: end > seg.Start(); if +// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). +func (seg fileRefcountIterator) SetEndUnchecked(end uint64) { + seg.node.keys[seg.index].End = end +} + +// SetEnd mutates the iterated segment's end. If the new end value would cause +// the iterated segment to overlap another segment, or would result in an +// invalid range, SetEnd panics. This operation does not invalidate any +// iterators. +func (seg fileRefcountIterator) SetEnd(end uint64) { + if end <= seg.Start() { + panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) + } + if next := seg.NextSegment(); next.Ok() && end > next.Start() { + panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) + } + seg.SetEndUnchecked(end) +} + +// Value returns a copy of the iterated segment's value. +func (seg fileRefcountIterator) Value() int32 { + return seg.node.values[seg.index] +} + +// ValuePtr returns a pointer to the iterated segment's value. The pointer is +// invalidated if the iterator is invalidated. This operation does not +// invalidate any iterators. +func (seg fileRefcountIterator) ValuePtr() *int32 { + return &seg.node.values[seg.index] +} + +// SetValue mutates the iterated segment's value. This operation does not +// invalidate any iterators. +func (seg fileRefcountIterator) SetValue(val int32) { + seg.node.values[seg.index] = val +} + +// PrevSegment returns the iterated segment's predecessor. If there is no +// preceding segment, PrevSegment returns a terminal iterator. 
+func (seg fileRefcountIterator) PrevSegment() fileRefcountIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index].lastSegment() + } + if seg.index > 0 { + return fileRefcountIterator{seg.node, seg.index - 1} + } + if seg.node.parent == nil { + return fileRefcountIterator{} + } + return fileRefcountsegmentBeforePosition(seg.node.parent, seg.node.parentIndex) +} + +// NextSegment returns the iterated segment's successor. If there is no +// succeeding segment, NextSegment returns a terminal iterator. +func (seg fileRefcountIterator) NextSegment() fileRefcountIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment() + } + if seg.index < seg.node.nrSegments-1 { + return fileRefcountIterator{seg.node, seg.index + 1} + } + if seg.node.parent == nil { + return fileRefcountIterator{} + } + return fileRefcountsegmentAfterPosition(seg.node.parent, seg.node.parentIndex) +} + +// PrevGap returns the gap immediately before the iterated segment. +func (seg fileRefcountIterator) PrevGap() fileRefcountGapIterator { + if seg.node.hasChildren { + + return seg.node.children[seg.index].lastSegment().NextGap() + } + return fileRefcountGapIterator{seg.node, seg.index} +} + +// NextGap returns the gap immediately after the iterated segment. +func (seg fileRefcountIterator) NextGap() fileRefcountGapIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment().PrevGap() + } + return fileRefcountGapIterator{seg.node, seg.index + 1} +} + +// PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, +// or the gap before the iterated segment otherwise. If seg.Start() == +// Functions.MinKey(), PrevNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by PrevNonEmpty will be +// non-terminal. +func (seg fileRefcountIterator) PrevNonEmpty() (fileRefcountIterator, fileRefcountGapIterator) { + gap := seg.PrevGap() + if gap.Range().Length() != 0 { + return fileRefcountIterator{}, gap + } + return gap.PrevSegment(), fileRefcountGapIterator{} +} + +// NextNonEmpty returns the iterated segment's successor if it is adjacent, or +// the gap after the iterated segment otherwise. If seg.End() == +// Functions.MaxKey(), NextNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by NextNonEmpty will be +// non-terminal. +func (seg fileRefcountIterator) NextNonEmpty() (fileRefcountIterator, fileRefcountGapIterator) { + gap := seg.NextGap() + if gap.Range().Length() != 0 { + return fileRefcountIterator{}, gap + } + return gap.NextSegment(), fileRefcountGapIterator{} +} + +// A GapIterator is conceptually one of: +// +// - A pointer to a position between two segments, before the first segment, or +// after the last segment in a set, called a *gap*; or +// +// - A terminal iterator, which is a sentinel indicating that the end of +// iteration has been reached. +// +// Note that the gap between two adjacent segments exists (iterators to it are +// non-terminal), but has a length of zero. GapIterator.IsEmpty returns true +// for such gaps. An empty set contains a single gap, spanning the entire range +// of the set's keys. +// +// GapIterators are copyable values and are meaningfully equality-comparable. +// The zero value of GapIterator is a terminal iterator. +// +// Unless otherwise specified, any mutation of a set invalidates all existing +// iterators into the set. 
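+//
+// Editor's note: an illustrative sketch, not part of the original diff.
+// Gap iteration is how callers locate unused key ranges; for example, to
+// find the first gap spanning at least n bytes in a set s (s and n are
+// hypothetical names):
+//
+//	for gap := s.FirstGap(); gap.Ok(); gap = gap.NextGap() {
+//		if gap.Range().Length() >= n {
+//			break // gap covers an unused range of at least n
+//		}
+//	}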
+type fileRefcountGapIterator struct { + // The representation of a GapIterator is identical to that of an Iterator, + // except that index corresponds to positions between segments in the same + // way as for node.children (see comment for node.nrSegments). + node *fileRefcountnode + index int +} + +// Ok returns true if the iterator is not terminal. All other methods are only +// valid for non-terminal iterators. +func (gap fileRefcountGapIterator) Ok() bool { + return gap.node != nil +} + +// Range returns the range spanned by the iterated gap. +func (gap fileRefcountGapIterator) Range() __generics_imported0.FileRange { + return __generics_imported0.FileRange{gap.Start(), gap.End()} +} + +// Start is equivalent to Range().Start, but should be preferred if only the +// start of the range is needed. +func (gap fileRefcountGapIterator) Start() uint64 { + if ps := gap.PrevSegment(); ps.Ok() { + return ps.End() + } + return fileRefcountSetFunctions{}.MinKey() +} + +// End is equivalent to Range().End, but should be preferred if only the end of +// the range is needed. +func (gap fileRefcountGapIterator) End() uint64 { + if ns := gap.NextSegment(); ns.Ok() { + return ns.Start() + } + return fileRefcountSetFunctions{}.MaxKey() +} + +// IsEmpty returns true if the iterated gap is empty (that is, the "gap" is +// between two adjacent segments.) +func (gap fileRefcountGapIterator) IsEmpty() bool { + return gap.Range().Length() == 0 +} + +// PrevSegment returns the segment immediately before the iterated gap. If no +// such segment exists, PrevSegment returns a terminal iterator. +func (gap fileRefcountGapIterator) PrevSegment() fileRefcountIterator { + return fileRefcountsegmentBeforePosition(gap.node, gap.index) +} + +// NextSegment returns the segment immediately after the iterated gap. If no +// such segment exists, NextSegment returns a terminal iterator. +func (gap fileRefcountGapIterator) NextSegment() fileRefcountIterator { + return fileRefcountsegmentAfterPosition(gap.node, gap.index) +} + +// PrevGap returns the iterated gap's predecessor. If no such gap exists, +// PrevGap returns a terminal iterator. +func (gap fileRefcountGapIterator) PrevGap() fileRefcountGapIterator { + seg := gap.PrevSegment() + if !seg.Ok() { + return fileRefcountGapIterator{} + } + return seg.PrevGap() +} + +// NextGap returns the iterated gap's successor. If no such gap exists, NextGap +// returns a terminal iterator. +func (gap fileRefcountGapIterator) NextGap() fileRefcountGapIterator { + seg := gap.NextSegment() + if !seg.Ok() { + return fileRefcountGapIterator{} + } + return seg.NextGap() +} + +// segmentBeforePosition returns the predecessor segment of the position given +// by n.children[i], which may or may not contain a child. If no such segment +// exists, segmentBeforePosition returns a terminal iterator. +func fileRefcountsegmentBeforePosition(n *fileRefcountnode, i int) fileRefcountIterator { + for i == 0 { + if n.parent == nil { + return fileRefcountIterator{} + } + n, i = n.parent, n.parentIndex + } + return fileRefcountIterator{n, i - 1} +} + +// segmentAfterPosition returns the successor segment of the position given by +// n.children[i], which may or may not contain a child. If no such segment +// exists, segmentAfterPosition returns a terminal iterator. 
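+//
+// Editor's note (not in the original diff): segmentBeforePosition and
+// segmentAfterPosition ascend the tree while the position sits at a node's
+// edge (i == 0 and i == n.nrSegments respectively). For instance, the
+// successor of the last position in the rightmost leaf never finds an
+// ancestor with a segment to its right, so it yields the terminal iterator.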
+func fileRefcountsegmentAfterPosition(n *fileRefcountnode, i int) fileRefcountIterator {
+ for i == n.nrSegments {
+ if n.parent == nil {
+ return fileRefcountIterator{}
+ }
+ n, i = n.parent, n.parentIndex
+ }
+ return fileRefcountIterator{n, i}
+}
+
+func fileRefcountzeroValueSlice(slice []int32) {
+ for i := range slice {
+ fileRefcountSetFunctions{}.ClearValue(&slice[i])
+ }
+}
+
+func fileRefcountzeroNodeSlice(slice []*fileRefcountnode) {
+ for i := range slice {
+ slice[i] = nil
+ }
+}
+
+// String stringifies a Set for debugging.
+func (s *fileRefcountSet) String() string {
+ return s.root.String()
+}
+
+// String stringifies a node (and all of its children) for debugging.
+func (n *fileRefcountnode) String() string {
+ var buf bytes.Buffer
+ n.writeDebugString(&buf, "")
+ return buf.String()
+}
+
+func (n *fileRefcountnode) writeDebugString(buf *bytes.Buffer, prefix string) {
+ if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) {
+ buf.WriteString(prefix)
+ buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren))
+ }
+ for i := 0; i < n.nrSegments; i++ {
+ if child := n.children[i]; child != nil {
+ cprefix := fmt.Sprintf("%s- % 3d ", prefix, i)
+ if child.parent != n || child.parentIndex != i {
+ buf.WriteString(cprefix)
+ buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i))
+ }
+ child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i))
+ }
+ buf.WriteString(prefix)
+ buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i]))
+ }
+ if child := n.children[n.nrSegments]; child != nil {
+ child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments))
+ }
+}
+
+// SegmentDataSlices represents segments from a set as slices of start, end, and
+// values. SegmentDataSlices is primarily used as an intermediate representation
+// for save/restore and the layout here is optimized for that.
+//
+// +stateify savable
+type fileRefcountSegmentDataSlices struct {
+ Start []uint64
+ End []uint64
+ Values []int32
+}
+
+// ExportSortedSlices returns a copy of all segments in the given set, in
+// ascending key order.
+func (s *fileRefcountSet) ExportSortedSlices() *fileRefcountSegmentDataSlices {
+ var sds fileRefcountSegmentDataSlices
+ for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ sds.Start = append(sds.Start, seg.Start())
+ sds.End = append(sds.End, seg.End())
+ sds.Values = append(sds.Values, seg.Value())
+ }
+ sds.Start = sds.Start[:len(sds.Start):len(sds.Start)]
+ sds.End = sds.End[:len(sds.End):len(sds.End)]
+ sds.Values = sds.Values[:len(sds.Values):len(sds.Values)]
+ return &sds
+}
+
+// ImportSortedSlices initializes the given set from the given slices.
+//
+// Preconditions: s must be empty. sds must represent a valid set (the segments
+// in sds must have valid lengths that do not overlap). The segments in sds
+// must be sorted in ascending key order.
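+//
+// Editor's note: an illustrative sketch, not part of the original diff.
+// ExportSortedSlices and ImportSortedSlices are intended to round-trip, as
+// in the save/restore path below (src and dst are hypothetical names):
+//
+//	sds := src.ExportSortedSlices()
+//	var dst fileRefcountSet
+//	if err := dst.ImportSortedSlices(sds); err != nil {
+//		panic(err) // sds came from a valid set, so no overlap is possible
+//	}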
+func (s *fileRefcountSet) ImportSortedSlices(sds *fileRefcountSegmentDataSlices) error { + if !s.IsEmpty() { + return fmt.Errorf("cannot import into non-empty set %v", s) + } + gap := s.FirstGap() + for i := range sds.Start { + r := __generics_imported0.FileRange{sds.Start[i], sds.End[i]} + if !gap.Range().IsSupersetOf(r) { + return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i]) + } + gap = s.InsertWithoutMerging(gap, r, sds.Values[i]).NextGap() + } + return nil +} +func (s *fileRefcountSet) saveRoot() *fileRefcountSegmentDataSlices { + return s.ExportSortedSlices() +} + +func (s *fileRefcountSet) loadRoot(sds *fileRefcountSegmentDataSlices) { + if err := s.ImportSortedSlices(sds); err != nil { + panic(err) + } +} diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go new file mode 100644 index 000000000..e4c057d28 --- /dev/null +++ b/pkg/sentry/mm/io.go @@ -0,0 +1,639 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// There are two supported ways to copy data to/from application virtual +// memory: +// +// 1. Internally-mapped copying: Determine the platform.File that backs the +// copied-to/from virtual address, obtain a mapping of its pages, and read or +// write to the mapping. +// +// 2. AddressSpace copying: If platform.Platform.SupportsAddressSpaceIO() is +// true, AddressSpace permissions are applicable, and an AddressSpace is +// available, copy directly through the AddressSpace, handling faults as +// needed. +// +// (Given that internally-mapped copying requires that backing memory is always +// implemented using a host file descriptor, we could also preadv/pwritev to it +// instead. But this would incur a host syscall for each use of the mapped +// page, whereas mmap is a one-time cost.) +// +// The fixed overhead of internally-mapped copying is expected to be higher +// than that of AddressSpace copying since the former always needs to translate +// addresses, whereas the latter only needs to do so when faults occur. +// However, the throughput of internally-mapped copying is expected to be +// somewhat higher than that of AddressSpace copying due to the high cost of +// page faults and because implementations of the latter usually rely on +// safecopy, which doesn't use AVX registers. So we prefer to use AddressSpace +// copying (when available) for smaller copies, and switch to internally-mapped +// copying once a size threshold is exceeded. +const ( + // copyMapMinBytes is the size threshold for switching to internally-mapped + // copying in CopyOut, CopyIn, and ZeroOut. 
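+ //
+ // Editor's note (not in the original diff): as an illustration, a 4 KB
+ // copy stays on the AddressSpace path, while a 1 MB copy crosses this
+ // threshold and pays the address-translation cost of internally-mapped
+ // copying, amortized over the larger transfer.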
+ copyMapMinBytes = 32 << 10 // 32 KB + + // rwMapMinBytes is the size threshold for switching to internally-mapped + // copying in CopyOutFrom and CopyInTo. It's lower than copyMapMinBytes + // since AddressSpace copying in this case requires additional buffering; + // see CopyOutFrom for details. + rwMapMinBytes = 512 +) + +// CheckIORange is similar to usermem.Addr.ToRange, but applies bounds checks +// consistent with Linux's arch/x86/include/asm/uaccess.h:access_ok(). +// +// Preconditions: length >= 0. +func (mm *MemoryManager) CheckIORange(addr usermem.Addr, length int64) (usermem.AddrRange, bool) { + // Note that access_ok() constrains end even if length == 0. + ar, ok := addr.ToRange(uint64(length)) + return ar, (ok && ar.End <= mm.layout.MaxAddr) +} + +// checkIOVec applies bound checks consistent with Linux's +// arch/x86/include/asm/uaccess.h:access_ok() to ars. +func (mm *MemoryManager) checkIOVec(ars usermem.AddrRangeSeq) bool { + for !ars.IsEmpty() { + ar := ars.Head() + if _, ok := mm.CheckIORange(ar.Start, int64(ar.Length())); !ok { + return false + } + ars = ars.Tail() + } + return true +} + +func (mm *MemoryManager) asioEnabled(opts usermem.IOOpts) bool { + return mm.haveASIO && !opts.IgnorePermissions && opts.AddressSpaceActive +} + +// translateIOError converts errors to EFAULT, as is usually reported for all +// I/O errors originating from MM in Linux. +func translateIOError(ctx context.Context, err error) error { + if err == nil { + return nil + } + if logIOErrors { + ctx.Debugf("MM I/O error: %v", err) + } + return syserror.EFAULT +} + +// CopyOut implements usermem.IO.CopyOut. +func (mm *MemoryManager) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { + ar, ok := mm.CheckIORange(addr, int64(len(src))) + if !ok { + return 0, syserror.EFAULT + } + + if len(src) == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && len(src) < copyMapMinBytes { + return mm.asCopyOut(ctx, addr, src) + } + + // Go through internal mappings. + n64, err := mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + n, err := safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src))) + return n, translateIOError(ctx, err) + }) + return int(n64), err +} + +func (mm *MemoryManager) asCopyOut(ctx context.Context, addr usermem.Addr, src []byte) (int, error) { + var done int + for { + n, err := mm.as.CopyOut(addr+usermem.Addr(done), src[done:]) + done += n + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(len(src))) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Write); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// CopyIn implements usermem.IO.CopyIn. +func (mm *MemoryManager) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { + ar, ok := mm.CheckIORange(addr, int64(len(dst))) + if !ok { + return 0, syserror.EFAULT + } + + if len(dst) == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && len(dst) < copyMapMinBytes { + return mm.asCopyIn(ctx, addr, dst) + } + + // Go through internal mappings. 
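+ // Editor's note (not in the original diff): errors from the underlying
+ // copy are converted to EFAULT by translateIOError, matching Linux,
+ // while the returned byte count still reflects any partial progress.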
+ n64, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + n, err := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), ims) + return n, translateIOError(ctx, err) + }) + return int(n64), err +} + +func (mm *MemoryManager) asCopyIn(ctx context.Context, addr usermem.Addr, dst []byte) (int, error) { + var done int + for { + n, err := mm.as.CopyIn(addr+usermem.Addr(done), dst[done:]) + done += n + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(len(dst))) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// ZeroOut implements usermem.IO.ZeroOut. +func (mm *MemoryManager) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { + ar, ok := mm.CheckIORange(addr, toZero) + if !ok { + return 0, syserror.EFAULT + } + + if toZero == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && toZero < copyMapMinBytes { + return mm.asZeroOut(ctx, addr, toZero) + } + + // Go through internal mappings. + return mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions, func(dsts safemem.BlockSeq) (uint64, error) { + n, err := safemem.ZeroSeq(dsts) + return n, translateIOError(ctx, err) + }) +} + +func (mm *MemoryManager) asZeroOut(ctx context.Context, addr usermem.Addr, toZero int64) (int64, error) { + var done int64 + for { + n, err := mm.as.ZeroOut(addr+usermem.Addr(done), uintptr(toZero-done)) + done += int64(n) + if err == nil { + return done, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + ar, _ := addr.ToRange(uint64(toZero)) + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Write); err != nil { + return done, err + } + continue + } + return done, translateIOError(ctx, err) + } +} + +// CopyOutFrom implements usermem.IO.CopyOutFrom. +func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { + if !mm.checkIOVec(ars) { + return 0, syserror.EFAULT + } + + if ars.NumBytes() == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes { + // We have to introduce a buffered copy, instead of just passing a + // safemem.BlockSeq representing addresses in the AddressSpace to src. + // This is because usermem.IO.CopyOutFrom() guarantees that it calls + // src.ReadToBlocks() at most once, which is incompatible with handling + // faults between calls. In the future, this is probably best resolved + // by introducing a CopyOutFrom variant or option that allows it to + // call src.ReadToBlocks() any number of times. + // + // This issue applies to CopyInTo as well. + buf := make([]byte, int(ars.NumBytes())) + bufN, bufErr := src.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))) + var done int64 + for done < int64(bufN) { + ar := ars.Head() + cplen := int64(ar.Length()) + if cplen > int64(bufN)-done { + cplen = int64(bufN) - done + } + n, err := mm.asCopyOut(ctx, ar.Start, buf[int(done):int(done+cplen)]) + done += int64(n) + if err != nil { + return done, err + } + ars = ars.Tail() + } + // Do not convert errors returned by src to EFAULT. + return done, bufErr + } + + // Go through internal mappings. 
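+ // Editor's note (not in the original diff): unlike the buffered
+ // AddressSpace path above, the internal-mappings path can hand the
+ // target ranges to src.ReadToBlocks directly as a single BlockSeq, so
+ // ReadToBlocks is called exactly once and no staging buffer is needed.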
+ return mm.withVecInternalMappings(ctx, ars, usermem.Write, opts.IgnorePermissions, src.ReadToBlocks) +} + +// CopyInTo implements usermem.IO.CopyInTo. +func (mm *MemoryManager) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { + if !mm.checkIOVec(ars) { + return 0, syserror.EFAULT + } + + if ars.NumBytes() == 0 { + return 0, nil + } + + // Do AddressSpace IO if applicable. + if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes { + buf := make([]byte, int(ars.NumBytes())) + var done int + var bufErr error + for !ars.IsEmpty() { + ar := ars.Head() + var n int + n, bufErr = mm.asCopyIn(ctx, ar.Start, buf[done:done+int(ar.Length())]) + done += n + if bufErr != nil { + break + } + ars = ars.Tail() + } + n, err := dst.WriteFromBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:done]))) + if err != nil { + return int64(n), err + } + // Do not convert errors returned by dst to EFAULT. + return int64(n), bufErr + } + + // Go through internal mappings. + return mm.withVecInternalMappings(ctx, ars, usermem.Read, opts.IgnorePermissions, dst.WriteFromBlocks) +} + +// SwapUint32 implements usermem.IO.SwapUint32. +func (mm *MemoryManager) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { + ar, ok := mm.CheckIORange(addr, 4) + if !ok { + return 0, syserror.EFAULT + } + + // Do AddressSpace IO if applicable. + if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { + for { + old, err := mm.as.SwapUint32(addr, new) + if err == nil { + return old, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.ReadWrite); err != nil { + return 0, err + } + continue + } + return 0, translateIOError(ctx, err) + } + } + + // Go through internal mappings. + var old uint32 + _, err := mm.withInternalMappings(ctx, ar, usermem.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { + // Atomicity is unachievable across mappings. + return 0, syserror.EFAULT + } + im := ims.Head() + var err error + old, err = safemem.SwapUint32(im, new) + if err != nil { + return 0, translateIOError(ctx, err) + } + // Return the number of bytes read. + return 4, nil + }) + return old, err +} + +// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. +func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { + ar, ok := mm.CheckIORange(addr, 4) + if !ok { + return 0, syserror.EFAULT + } + + // Do AddressSpace IO if applicable. + if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions { + for { + prev, err := mm.as.CompareAndSwapUint32(addr, old, new) + if err == nil { + return prev, nil + } + if f, ok := err.(platform.SegmentationFault); ok { + if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.ReadWrite); err != nil { + return 0, err + } + continue + } + return 0, translateIOError(ctx, err) + } + } + + // Go through internal mappings. + var prev uint32 + _, err := mm.withInternalMappings(ctx, ar, usermem.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) { + if ims.NumBlocks() != 1 || ims.NumBytes() != 4 { + // Atomicity is unachievable across mappings. 
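+ // Editor's note (not in the original diff): a 4-byte value
+ // straddling two internal mappings cannot be updated by a single
+ // atomic instruction, so the operation fails with EFAULT rather
+ // than performing a non-atomic read-modify-write.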
+ return 0, syserror.EFAULT
+ }
+ im := ims.Head()
+ var err error
+ prev, err = safemem.CompareAndSwapUint32(im, old, new)
+ if err != nil {
+ return 0, translateIOError(ctx, err)
+ }
+ // Return the number of bytes read.
+ return 4, nil
+ })
+ return prev, err
+}
+
+// LoadUint32 implements usermem.IO.LoadUint32.
+func (mm *MemoryManager) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) {
+ ar, ok := mm.CheckIORange(addr, 4)
+ if !ok {
+ return 0, syserror.EFAULT
+ }
+
+ // Do AddressSpace IO if applicable.
+ if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
+ for {
+ val, err := mm.as.LoadUint32(addr)
+ if err == nil {
+ return val, nil
+ }
+ if f, ok := err.(platform.SegmentationFault); ok {
+ if err := mm.handleASIOFault(ctx, f.Addr, ar, usermem.Read); err != nil {
+ return 0, err
+ }
+ continue
+ }
+ return 0, translateIOError(ctx, err)
+ }
+ }
+
+ // Go through internal mappings.
+ var val uint32
+ _, err := mm.withInternalMappings(ctx, ar, usermem.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
+ if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
+ // Atomicity is unachievable across mappings.
+ return 0, syserror.EFAULT
+ }
+ im := ims.Head()
+ var err error
+ val, err = safemem.LoadUint32(im)
+ if err != nil {
+ return 0, translateIOError(ctx, err)
+ }
+ // Return the number of bytes read.
+ return 4, nil
+ })
+ return val, err
+}
+
+// handleASIOFault handles a page fault at address addr for an AddressSpaceIO
+// operation spanning ioar.
+//
+// Preconditions: mm.as != nil. ioar.Length() != 0. ioar.Contains(addr).
+func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr usermem.Addr, ioar usermem.AddrRange, at usermem.AccessType) error {
+ // Try to map all remaining pages in the I/O operation. This RoundUp can't
+ // overflow because otherwise it would have been caught by CheckIORange.
+ end, _ := ioar.End.RoundUp()
+ ar := usermem.AddrRange{addr.RoundDown(), end}
+
+ // Don't bother trying existingPMAsLocked; in most cases, if we did have
+ // existing pmas, we wouldn't have faulted.
+
+ // Ensure that we have usable vmas. Here and below, only return early if we
+ // can't map the first (faulting) page; failures to map later pages are
+ // silently ignored. This maximizes partial success.
+ mm.mappingMu.RLock()
+ vseg, vend, err := mm.getVMAsLocked(ctx, ar, at, false)
+ if vendaddr := vend.Start(); vendaddr < ar.End {
+ if vendaddr <= ar.Start {
+ mm.mappingMu.RUnlock()
+ return translateIOError(ctx, err)
+ }
+ ar.End = vendaddr
+ }
+
+ // Ensure that we have usable pmas.
+ mm.activeMu.Lock()
+ pseg, pend, err := mm.getPMAsLocked(ctx, vseg, ar, at)
+ mm.mappingMu.RUnlock()
+ if pendaddr := pend.Start(); pendaddr < ar.End {
+ if pendaddr <= ar.Start {
+ mm.activeMu.Unlock()
+ return translateIOError(ctx, err)
+ }
+ ar.End = pendaddr
+ }
+
+ // Downgrade to a read-lock on activeMu since we don't need to mutate pmas
+ // anymore.
+ mm.activeMu.DowngradeLock()
+
+ err = mm.mapASLocked(pseg, ar, false)
+ mm.activeMu.RUnlock()
+ return translateIOError(ctx, err)
+}
+
+// withInternalMappings ensures that pmas exist for all addresses in ar,
+// support access of type (at, ignorePermissions), and have internal mappings
+// cached. It then calls f with mm.activeMu locked for reading, passing
+// internal mappings for the subrange of ar for which this property holds.
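+//
+// Editor's note: an illustrative sketch, not part of the original diff. A
+// typical caller wraps a single safemem operation, as CopyOut does above:
+//
+//	mm.withInternalMappings(ctx, ar, usermem.Write, opts.IgnorePermissions,
+//		func(ims safemem.BlockSeq) (uint64, error) {
+//			// Error translation elided; see CopyOut for the full form.
+//			return safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src)))
+//		})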
+// +// withInternalMappings takes a function returning uint64 since many safemem +// functions have this property, but returns an int64 since this is usually +// more useful for usermem.IO methods. +// +// Preconditions: 0 < ar.Length() <= math.MaxInt64. +func (mm *MemoryManager) withInternalMappings(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { + // If pmas are already available, we can do IO without touching mm.vmas or + // mm.mappingMu. + mm.activeMu.RLock() + if pseg := mm.existingPMAsLocked(ar, at, ignorePermissions, true /* needInternalMappings */); pseg.Ok() { + n, err := f(mm.internalMappingsLocked(pseg, ar)) + mm.activeMu.RUnlock() + // Do not convert errors returned by f to EFAULT. + return int64(n), err + } + mm.activeMu.RUnlock() + + // Ensure that we have usable vmas. + mm.mappingMu.RLock() + vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions) + if vendaddr := vend.Start(); vendaddr < ar.End { + if vendaddr <= ar.Start { + mm.mappingMu.RUnlock() + return 0, translateIOError(ctx, verr) + } + ar.End = vendaddr + } + + // Ensure that we have usable pmas. + mm.activeMu.Lock() + pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at) + mm.mappingMu.RUnlock() + if pendaddr := pend.Start(); pendaddr < ar.End { + if pendaddr <= ar.Start { + mm.activeMu.Unlock() + return 0, translateIOError(ctx, perr) + } + ar.End = pendaddr + } + imend, imerr := mm.getPMAInternalMappingsLocked(pseg, ar) + mm.activeMu.DowngradeLock() + if imendaddr := imend.Start(); imendaddr < ar.End { + if imendaddr <= ar.Start { + mm.activeMu.RUnlock() + return 0, translateIOError(ctx, imerr) + } + ar.End = imendaddr + } + + // Do I/O. + un, err := f(mm.internalMappingsLocked(pseg, ar)) + mm.activeMu.RUnlock() + n := int64(un) + + // Return the first error in order of progress through ar. + if err != nil { + // Do not convert errors returned by f to EFAULT. + return n, err + } + if imerr != nil { + return n, translateIOError(ctx, imerr) + } + if perr != nil { + return n, translateIOError(ctx, perr) + } + return n, translateIOError(ctx, verr) +} + +// withVecInternalMappings ensures that pmas exist for all addresses in ars, +// support access of type (at, ignorePermissions), and have internal mappings +// cached. It then calls f with mm.activeMu locked for reading, passing +// internal mappings for the subset of ars for which this property holds. +// +// Preconditions: !ars.IsEmpty(). +func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { + // withInternalMappings is faster than withVecInternalMappings because of + // iterator plumbing (this isn't generally practical in the vector case due + // to iterator invalidation between AddrRanges). Use it if possible. + if ars.NumRanges() == 1 { + return mm.withInternalMappings(ctx, ars.Head(), at, ignorePermissions, f) + } + + // If pmas are already available, we can do IO without touching mm.vmas or + // mm.mappingMu. + mm.activeMu.RLock() + if mm.existingVecPMAsLocked(ars, at, ignorePermissions, true /* needInternalMappings */) { + n, err := f(mm.vecInternalMappingsLocked(ars)) + mm.activeMu.RUnlock() + // Do not convert errors returned by f to EFAULT. + return int64(n), err + } + mm.activeMu.RUnlock() + + // Ensure that we have usable vmas. 
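+ // Editor's note (not in the original diff): the locking below mirrors
+ // withInternalMappings: mappingMu is read-locked just long enough to
+ // find vmas, activeMu is write-locked to materialize pmas and their
+ // internal mappings, then downgraded to a read lock for the I/O itself.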
+ mm.mappingMu.RLock() + vars, verr := mm.getVecVMAsLocked(ctx, ars, at, ignorePermissions) + if vars.NumBytes() == 0 { + mm.mappingMu.RUnlock() + return 0, translateIOError(ctx, verr) + } + + // Ensure that we have usable pmas. + mm.activeMu.Lock() + pars, perr := mm.getVecPMAsLocked(ctx, vars, at) + mm.mappingMu.RUnlock() + if pars.NumBytes() == 0 { + mm.activeMu.Unlock() + return 0, translateIOError(ctx, perr) + } + imars, imerr := mm.getVecPMAInternalMappingsLocked(pars) + mm.activeMu.DowngradeLock() + if imars.NumBytes() == 0 { + mm.activeMu.RUnlock() + return 0, translateIOError(ctx, imerr) + } + + // Do I/O. + un, err := f(mm.vecInternalMappingsLocked(imars)) + mm.activeMu.RUnlock() + n := int64(un) + + // Return the first error in order of progress through ars. + if err != nil { + // Do not convert errors from f to EFAULT. + return n, err + } + if imerr != nil { + return n, translateIOError(ctx, imerr) + } + if perr != nil { + return n, translateIOError(ctx, perr) + } + return n, translateIOError(ctx, verr) +} + +// truncatedAddrRangeSeq returns a copy of ars, but with the end truncated to +// at most address end on AddrRange arsit.Head(). It is used in vector I/O paths to +// truncate usermem.AddrRangeSeq when errors occur. +// +// Preconditions: !arsit.IsEmpty(). end <= arsit.Head().End. +func truncatedAddrRangeSeq(ars, arsit usermem.AddrRangeSeq, end usermem.Addr) usermem.AddrRangeSeq { + ar := arsit.Head() + if end <= ar.Start { + return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes()) + } + return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes() + int64(end-ar.Start)) +} diff --git a/pkg/sentry/mm/io_list.go b/pkg/sentry/mm/io_list.go new file mode 100755 index 000000000..99c83c4b9 --- /dev/null +++ b/pkg/sentry/mm/io_list.go @@ -0,0 +1,173 @@ +package mm + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type ioElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (ioElementMapper) linkerFor(elem *ioResult) *ioResult { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type ioList struct { + head *ioResult + tail *ioResult +} + +// Reset resets list l to the empty state. +func (l *ioList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *ioList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *ioList) Front() *ioResult { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *ioList) Back() *ioResult { + return l.tail +} + +// PushFront inserts the element e at the front of list l. 
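+//
+// Editor's note: an illustrative sketch, not part of the original diff
+// (e is a hypothetical *ioResult):
+//
+//	var l ioList
+//	l.PushFront(e) // e becomes l.Front(); on an empty list, also l.Back()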
+func (l *ioList) PushFront(e *ioResult) { + ioElementMapper{}.linkerFor(e).SetNext(l.head) + ioElementMapper{}.linkerFor(e).SetPrev(nil) + + if l.head != nil { + ioElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *ioList) PushBack(e *ioResult) { + ioElementMapper{}.linkerFor(e).SetNext(nil) + ioElementMapper{}.linkerFor(e).SetPrev(l.tail) + + if l.tail != nil { + ioElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. +func (l *ioList) PushBackList(m *ioList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + ioElementMapper{}.linkerFor(l.tail).SetNext(m.head) + ioElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *ioList) InsertAfter(b, e *ioResult) { + a := ioElementMapper{}.linkerFor(b).Next() + ioElementMapper{}.linkerFor(e).SetNext(a) + ioElementMapper{}.linkerFor(e).SetPrev(b) + ioElementMapper{}.linkerFor(b).SetNext(e) + + if a != nil { + ioElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *ioList) InsertBefore(a, e *ioResult) { + b := ioElementMapper{}.linkerFor(a).Prev() + ioElementMapper{}.linkerFor(e).SetNext(a) + ioElementMapper{}.linkerFor(e).SetPrev(b) + ioElementMapper{}.linkerFor(a).SetPrev(e) + + if b != nil { + ioElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *ioList) Remove(e *ioResult) { + prev := ioElementMapper{}.linkerFor(e).Prev() + next := ioElementMapper{}.linkerFor(e).Next() + + if prev != nil { + ioElementMapper{}.linkerFor(prev).SetNext(next) + } else { + l.head = next + } + + if next != nil { + ioElementMapper{}.linkerFor(next).SetPrev(prev) + } else { + l.tail = prev + } +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type ioEntry struct { + next *ioResult + prev *ioResult +} + +// Next returns the entry that follows e in the list. +func (e *ioEntry) Next() *ioResult { + return e.next +} + +// Prev returns the entry that precedes e in the list. +func (e *ioEntry) Prev() *ioResult { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *ioEntry) SetNext(elem *ioResult) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *ioEntry) SetPrev(elem *ioResult) { + e.prev = elem +} diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go new file mode 100644 index 000000000..7a65a62a2 --- /dev/null +++ b/pkg/sentry/mm/lifecycle.go @@ -0,0 +1,234 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/atomicbitops" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// NewMemoryManager returns a new MemoryManager with no mappings and 1 user. +func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *MemoryManager { + return &MemoryManager{ + p: p, + mfp: mfp, + haveASIO: p.SupportsAddressSpaceIO(), + privateRefs: &privateRefs{}, + users: 1, + auxv: arch.Auxv{}, + aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + } +} + +// SetMmapLayout initializes mm's layout from the given arch.Context. +// +// Preconditions: mm contains no mappings and is not used concurrently. +func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arch.MmapLayout, error) { + layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r) + if err != nil { + return arch.MmapLayout{}, err + } + mm.layout = layout + return layout, nil +} + +// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or +// clone() (without CLONE_VM). +func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + mm2 := &MemoryManager{ + p: mm.p, + mfp: mm.mfp, + haveASIO: mm.haveASIO, + layout: mm.layout, + privateRefs: mm.privateRefs, + users: 1, + brk: mm.brk, + usageAS: mm.usageAS, + dataAS: mm.dataAS, + // "The child does not inherit its parent's memory locks (mlock(2), + // mlockall(2))." - fork(2). So lockedAS is 0 and defMLockMode is + // MLockNone, both of which are zero values. vma.mlockMode is reset + // when copied below. + captureInvalidations: true, + argv: mm.argv, + envv: mm.envv, + auxv: append(arch.Auxv(nil), mm.auxv...), + // IncRef'd below, once we know that there isn't an error. + executable: mm.executable, + aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, + } + + // Copy vmas. + dstvgap := mm2.vmas.FirstGap() + for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { + vma := srcvseg.Value() // makes a copy of the vma + vmaAR := srcvseg.Range() + // Inform the Mappable, if any, of the new mapping. + if vma.mappable != nil { + if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil { + mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange()) + return nil, err + } + } + if vma.id != nil { + vma.id.IncRef() + } + vma.mlockMode = memmap.MLockNone + dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap() + // We don't need to update mm2.usageAS since we copied it from mm + // above. + } + + // Copy pmas. We have to lock mm.activeMu for writing to make existing + // private pmas copy-on-write. We also have to lock mm2.activeMu since + // after copying vmas above, memmap.Mappables may call mm2.Invalidate. 
We + // only copy private pmas, since in the common case where fork(2) is + // immediately followed by execve(2), copying non-private pmas that can be + // regenerated by calling memmap.Mappable.Translate is a waste of time. + // (Linux does the same; compare kernel/fork.c:dup_mmap() => + // mm/memory.c:copy_page_range().) + mm2.activeMu.Lock() + defer mm2.activeMu.Unlock() + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + dstpgap := mm2.pmas.FirstGap() + var unmapAR usermem.AddrRange + for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() { + pma := srcpseg.ValuePtr() + if !pma.private { + continue + } + if !pma.needCOW { + pma.needCOW = true + if pma.effectivePerms.Write { + // We don't want to unmap the whole address space, even though + // doing so would reduce calls to unmapASLocked(), because mm + // will most likely continue to be used after the fork, so + // unmapping pmas unnecessarily will result in extra page + // faults. But we do want to merge consecutive AddrRanges + // across pma boundaries. + if unmapAR.End == srcpseg.Start() { + unmapAR.End = srcpseg.End() + } else { + if unmapAR.Length() != 0 { + mm.unmapASLocked(unmapAR) + } + unmapAR = srcpseg.Range() + } + pma.effectivePerms.Write = false + } + pma.maxPerms.Write = false + } + fr := srcpseg.fileRange() + mm2.incPrivateRef(fr) + srcpseg.ValuePtr().file.IncRef(fr) + addrRange := srcpseg.Range() + mm2.addRSSLocked(addrRange) + dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap() + } + if unmapAR.Length() != 0 { + mm.unmapASLocked(unmapAR) + } + + // Between when we call memmap.Mappable.AddMapping while copying vmas and + // when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are + // ineffective because the pmas they invalidate haven't yet been copied, + // possibly allowing mm2 to get invalidated translations: + // + // Invalidating Mappable mm.Fork + // --------------------- ------- + // + // mm2.Invalidate() + // mm.activeMu.Lock() + // mm.Invalidate() /* blocks */ + // mm2.activeMu.Lock() + // (mm copies invalidated pma to mm2) + // + // This would technically be both safe (since we only copy private pmas, + // which will still hold a reference on their memory) and consistent with + // Linux, but we avoid it anyway by setting mm2.captureInvalidations during + // construction, causing calls to mm2.Invalidate() to be captured in + // mm2.capturedInvalidations, to be replayed after pmas are copied - i.e. + // here. + mm2.captureInvalidations = false + for _, invArgs := range mm2.capturedInvalidations { + mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true) + } + mm2.capturedInvalidations = nil + + if mm2.executable != nil { + mm2.executable.IncRef() + } + return mm2, nil +} + +// IncUsers increments mm's user count and returns true. If the user count is +// already 0, IncUsers does nothing and returns false. +func (mm *MemoryManager) IncUsers() bool { + return atomicbitops.IncUnlessZeroInt32(&mm.users) +} + +// DecUsers decrements mm's user count. If the user count reaches 0, all +// mappings in mm are unmapped. +func (mm *MemoryManager) DecUsers(ctx context.Context) { + if users := atomic.AddInt32(&mm.users, -1); users > 0 { + return + } else if users < 0 { + panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users)) + } + + mm.aioManager.destroy() + + mm.metadataMu.Lock() + exe := mm.executable + mm.executable = nil + mm.metadataMu.Unlock() + if exe != nil { + exe.DecRef() + } + + mm.activeMu.Lock() + // Sanity check. 
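+ // Editor's note (not in the original diff): every context that keeps
+ // the address space active also holds a user reference, so mm.active
+ // must already be zero once users reaches zero; a non-zero value here
+ // indicates a missing Deactivate call.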
+ if atomic.LoadInt32(&mm.active) != 0 { + panic("active address space lost?") + } + // Make sure the AddressSpace is returned. + if mm.as != nil { + mm.as.Release() + mm.as = nil + } + mm.activeMu.Unlock() + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + // If mm is being dropped before mm.SetMmapLayout was called, + // mm.applicationAddrRange() will be empty. + if ar := mm.applicationAddrRange(); ar.Length() != 0 { + mm.unmapLocked(ctx, ar) + } +} diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go new file mode 100644 index 000000000..9768e51f1 --- /dev/null +++ b/pkg/sentry/mm/metadata.go @@ -0,0 +1,139 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// ArgvStart returns the start of the application argument vector. +// +// There is no guarantee that this value is sensible w.r.t. ArgvEnd. +func (mm *MemoryManager) ArgvStart() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.argv.Start +} + +// SetArgvStart sets the start of the application argument vector. +func (mm *MemoryManager) SetArgvStart(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.argv.Start = a +} + +// ArgvEnd returns the end of the application argument vector. +// +// There is no guarantee that this value is sensible w.r.t. ArgvStart. +func (mm *MemoryManager) ArgvEnd() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.argv.End +} + +// SetArgvEnd sets the end of the application argument vector. +func (mm *MemoryManager) SetArgvEnd(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.argv.End = a +} + +// EnvvStart returns the start of the application environment vector. +// +// There is no guarantee that this value is sensible w.r.t. EnvvEnd. +func (mm *MemoryManager) EnvvStart() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.envv.Start +} + +// SetEnvvStart sets the start of the application environment vector. +func (mm *MemoryManager) SetEnvvStart(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.envv.Start = a +} + +// EnvvEnd returns the end of the application environment vector. +// +// There is no guarantee that this value is sensible w.r.t. EnvvStart. +func (mm *MemoryManager) EnvvEnd() usermem.Addr { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return mm.envv.End +} + +// SetEnvvEnd sets the end of the application environment vector. +func (mm *MemoryManager) SetEnvvEnd(a usermem.Addr) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.envv.End = a +} + +// Auxv returns the current map of auxiliary vectors. +func (mm *MemoryManager) Auxv() arch.Auxv { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + return append(arch.Auxv(nil), mm.auxv...) 
+} + +// SetAuxv sets the entire map of auxiliary vectors. +func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + mm.auxv = append(arch.Auxv(nil), auxv...) +} + +// Executable returns the executable, if available. +// +// An additional reference will be taken in the case of a non-nil executable, +// which must be released by the caller. +func (mm *MemoryManager) Executable() *fs.Dirent { + mm.metadataMu.Lock() + defer mm.metadataMu.Unlock() + + if mm.executable == nil { + return nil + } + + mm.executable.IncRef() + return mm.executable +} + +// SetExecutable sets the executable. +// +// This takes a reference on d. +func (mm *MemoryManager) SetExecutable(d *fs.Dirent) { + mm.metadataMu.Lock() + + // Grab a new reference. + d.IncRef() + + // Set the executable. + orig := mm.executable + mm.executable = d + + mm.metadataMu.Unlock() + + // Release the old reference. + // + // Do this without holding the lock, since it may wind up doing some + // I/O to sync the dirent, etc. + if orig != nil { + orig.DecRef() + } +} diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go new file mode 100644 index 000000000..eb6defa2b --- /dev/null +++ b/pkg/sentry/mm/mm.go @@ -0,0 +1,456 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package mm provides a memory management subsystem. See README.md for a +// detailed overview. +// +// Lock order: +// +// fs locks, except for memmap.Mappable locks +// mm.MemoryManager.metadataMu +// mm.MemoryManager.mappingMu +// Locks taken by memmap.Mappable methods other than Translate +// mm.MemoryManager.activeMu +// Locks taken by memmap.Mappable.Translate +// mm.privateRefs.mu +// platform.AddressSpace locks +// platform.File locks +// mm.aioManager.mu +// mm.AIOContext.mu +// +// Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in +// multiple mm.MemoryManagers, as it does so in a well-defined order (forked +// child first). +package mm + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/third_party/gvsync" +) + +// MemoryManager implements a virtual address space. +// +// +stateify savable +type MemoryManager struct { + // p and mfp are immutable. + p platform.Platform + mfp pgalloc.MemoryFileProvider + + // haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from + // eliminating an indirect call in the hot I/O path, this makes + // MemoryManager.asioEnabled() a leaf function, allowing it to be inlined. + // + // haveASIO is immutable. + haveASIO bool `state:"nosave"` + + // layout is the memory layout. 
+ // + // layout is set by the binary loader before the MemoryManager can be used. + layout arch.MmapLayout + + // privateRefs stores reference counts for private memory (memory whose + // ownership is shared by one or more pmas instead of being owned by a + // memmap.Mappable). + // + // privateRefs is immutable. + privateRefs *privateRefs + + // users is the number of dependences on the mappings in the MemoryManager. + // When the number of references in users reaches zero, all mappings are + // unmapped. + // + // users is accessed using atomic memory operations. + users int32 + + // mappingMu is analogous to Linux's struct mm_struct::mmap_sem. + mappingMu gvsync.DowngradableRWMutex `state:"nosave"` + + // vmas stores virtual memory areas. Since vmas are stored by value, + // clients should usually use vmaIterator.ValuePtr() instead of + // vmaIterator.Value() to get a pointer to the vma rather than a copy. + // + // Invariants: vmas are always page-aligned. + // + // vmas is protected by mappingMu. + vmas vmaSet + + // brk is the mm's brk, which is manipulated using the brk(2) system call. + // The brk is initially set up by the loader which maps an executable + // binary into the mm. + // + // brk is protected by mappingMu. + brk usermem.AddrRange + + // usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks. + // + // usageAS is protected by mappingMu. + usageAS uint64 + + // lockedAS is the combined size in bytes of all vmas with vma.mlockMode != + // memmap.MLockNone. + // + // lockedAS is protected by mappingMu. + lockedAS uint64 + + // dataAS is the size of private data segments, like mm_struct->data_vm. + // It means the vma which is private, writable, not stack. + // + // dataAS is protected by mappingMu. + dataAS uint64 + + // New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or + // defMLockMode is greater. + // + // defMLockMode is protected by mappingMu. + defMLockMode memmap.MLockMode + + // activeMu is loosely analogous to Linux's struct + // mm_struct::page_table_lock. + activeMu gvsync.DowngradableRWMutex `state:"nosave"` + + // pmas stores platform mapping areas used to implement vmas. Since pmas + // are stored by value, clients should usually use pmaIterator.ValuePtr() + // instead of pmaIterator.Value() to get a pointer to the pma rather than + // a copy. + // + // Inserting or removing segments from pmas should happen along with a + // call to mm.insertRSS or mm.removeRSS. + // + // Invariants: pmas are always page-aligned. If a pma exists for a given + // address, a vma must also exist for that address. + // + // pmas is protected by activeMu. + pmas pmaSet + + // curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is + // reported as the MemoryManager's RSS. + // + // maxRSS should be modified only via insertRSS and removeRSS, not + // directly. + // + // maxRSS is protected by activeMu. + curRSS uint64 + + // maxRSS is the maximum resident set size in bytes of a MemoryManager. + // It is tracked as the application adds and removes mappings to pmas. + // + // maxRSS should be modified only via insertRSS, not directly. + // + // maxRSS is protected by activeMu. + maxRSS uint64 + + // as is the platform.AddressSpace that pmas are mapped into. active is the + // number of contexts that require as to be non-nil; if active == 0, as may + // be nil. + // + // as is protected by activeMu. active is manipulated with atomic memory + // operations; transitions to and from zero are additionally protected by + // activeMu. 
(This is because such transitions may need to be atomic with + // changes to as.) + as platform.AddressSpace `state:"nosave"` + active int32 `state:"zerovalue"` + + // unmapAllOnActivate indicates that the next Activate call should activate + // an empty AddressSpace. + // + // This is used to ensure that an AddressSpace cached in + // NewAddressSpace is not used after some change in the MemoryManager + // or VMAs has made that AddressSpace stale. + // + // unmapAllOnActivate is protected by activeMu. It must only be set when + // there is no active or cached AddressSpace. If as != nil, then + // invalidations should be propagated immediately. + unmapAllOnActivate bool `state:"nosave"` + + // If captureInvalidations is true, calls to MM.Invalidate() are recorded + // in capturedInvalidations rather than being applied immediately to pmas. + // This is to avoid a race condition in MM.Fork(); see that function for + // details. + // + // Both captureInvalidations and capturedInvalidations are protected by + // activeMu. Neither need to be saved since captureInvalidations is only + // enabled during MM.Fork(), during which saving can't occur. + captureInvalidations bool `state:"zerovalue"` + capturedInvalidations []invalidateArgs `state:"nosave"` + + metadataMu sync.Mutex `state:"nosave"` + + // argv is the application argv. This is set up by the loader and may be + // modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No + // requirements apply to argv; we do not require that argv.WellFormed(). + // + // argv is protected by metadataMu. + argv usermem.AddrRange + + // envv is the application envv. This is set up by the loader and may be + // modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No + // requirements apply to envv; we do not require that envv.WellFormed(). + // + // envv is protected by metadataMu. + envv usermem.AddrRange + + // auxv is the ELF's auxiliary vector. + // + // auxv is protected by metadataMu. + auxv arch.Auxv + + // executable is the executable for this MemoryManager. If executable + // is not nil, it holds a reference on the Dirent. + // + // executable is protected by metadataMu. + executable *fs.Dirent + + // aioManager keeps track of AIOContexts used for async IOs. AIOManager + // must be cloned when CLONE_VM is used. + aioManager aioManager +} + +// vma represents a virtual memory area. +// +// +stateify savable +type vma struct { + // mappable is the virtual memory object mapped by this vma. If mappable is + // nil, the vma represents a private anonymous mapping. + mappable memmap.Mappable + + // off is the offset into mappable at which this vma begins. If mappable is + // nil, off is meaningless. + off uint64 + + // To speedup VMA save/restore, we group and save the following booleans + // as a single integer. + + // realPerms are the memory permissions on this vma, as defined by the + // application. + realPerms usermem.AccessType `state:".(int)"` + + // effectivePerms are the memory permissions on this vma which are + // actually used to control access. + // + // Invariant: effectivePerms == realPerms.Effective(). + effectivePerms usermem.AccessType `state:"manual"` + + // maxPerms limits the set of permissions that may ever apply to this + // memory, as well as accesses for which usermem.IOOpts.IgnorePermissions + // is true (e.g. ptrace(PTRACE_POKEDATA)). + // + // Invariant: maxPerms == maxPerms.Effective(). 
+ maxPerms usermem.AccessType `state:"manual"` + + // private is true if this is a MAP_PRIVATE mapping, such that writes to + // the mapping are propagated to a copy. + private bool `state:"manual"` + + // growsDown is true if the mapping may be automatically extended downward + // under certain conditions. If growsDown is true, mappable must be nil. + // + // There is currently no corresponding growsUp flag; in Linux, the only + // architectures that can have VM_GROWSUP mappings are ia64, parisc, and + // metag, none of which we currently support. + growsDown bool `state:"manual"` + + mlockMode memmap.MLockMode + + // If id is not nil, it controls the lifecycle of mappable and provides vma + // metadata shown in /proc/[pid]/maps, and the vma holds a reference. + id memmap.MappingIdentity + + // If hint is non-empty, it is a description of the vma printed in + // /proc/[pid]/maps. hint takes priority over id.MappedName(). + hint string +} + +const ( + vmaRealPermsRead = 1 << iota + vmaRealPermsWrite + vmaRealPermsExecute + vmaEffectivePermsRead + vmaEffectivePermsWrite + vmaEffectivePermsExecute + vmaMaxPermsRead + vmaMaxPermsWrite + vmaMaxPermsExecute + vmaPrivate + vmaGrowsDown +) + +func (v *vma) saveRealPerms() int { + var b int + if v.realPerms.Read { + b |= vmaRealPermsRead + } + if v.realPerms.Write { + b |= vmaRealPermsWrite + } + if v.realPerms.Execute { + b |= vmaRealPermsExecute + } + if v.effectivePerms.Read { + b |= vmaEffectivePermsRead + } + if v.effectivePerms.Write { + b |= vmaEffectivePermsWrite + } + if v.effectivePerms.Execute { + b |= vmaEffectivePermsExecute + } + if v.maxPerms.Read { + b |= vmaMaxPermsRead + } + if v.maxPerms.Write { + b |= vmaMaxPermsWrite + } + if v.maxPerms.Execute { + b |= vmaMaxPermsExecute + } + if v.private { + b |= vmaPrivate + } + if v.growsDown { + b |= vmaGrowsDown + } + return b +} + +func (v *vma) loadRealPerms(b int) { + if b&vmaRealPermsRead > 0 { + v.realPerms.Read = true + } + if b&vmaRealPermsWrite > 0 { + v.realPerms.Write = true + } + if b&vmaRealPermsExecute > 0 { + v.realPerms.Execute = true + } + if b&vmaEffectivePermsRead > 0 { + v.effectivePerms.Read = true + } + if b&vmaEffectivePermsWrite > 0 { + v.effectivePerms.Write = true + } + if b&vmaEffectivePermsExecute > 0 { + v.effectivePerms.Execute = true + } + if b&vmaMaxPermsRead > 0 { + v.maxPerms.Read = true + } + if b&vmaMaxPermsWrite > 0 { + v.maxPerms.Write = true + } + if b&vmaMaxPermsExecute > 0 { + v.maxPerms.Execute = true + } + if b&vmaPrivate > 0 { + v.private = true + } + if b&vmaGrowsDown > 0 { + v.growsDown = true + } +} + +// pma represents a platform mapping area. +// +// +stateify savable +type pma struct { + // file is the file mapped by this pma. Only pmas for which file == + // MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to + // the corresponding file range while they exist. + file platform.File `state:"nosave"` + + // off is the offset into file at which this pma begins. + // + // Note that pmas do *not* hold references on offsets in file! If private + // is true, MemoryManager.privateRefs holds the reference instead. If + // private is false, the corresponding memmap.Mappable holds the reference + // instead (per memmap.Mappable.Translate requirement). + off uint64 + + // translatePerms is the permissions returned by memmap.Mappable.Translate. + // If private is true, translatePerms is usermem.AnyAccess. + translatePerms usermem.AccessType + + // effectivePerms is the permissions allowed for non-ignorePermissions + // accesses. 
maxPerms is the permissions allowed for ignorePermissions + // accesses. These are vma.effectivePerms and vma.maxPerms respectively, + // masked by pma.translatePerms and with Write disallowed if pma.needCOW is + // true. + // + // These are stored in the pma so that the IO implementation can avoid + // iterating mm.vmas when pmas already exist. + effectivePerms usermem.AccessType + maxPerms usermem.AccessType + + // needCOW is true if writes to the mapping must be propagated to a copy. + needCOW bool + + // private is true if this pma represents private memory. + // + // If private is true, file must be MemoryManager.mfp.MemoryFile(), the pma + // holds a reference on the mapped memory that is tracked in privateRefs, + // and calls to Invalidate for which + // memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma. + // + // If private is false, this pma caches a translation from the + // corresponding vma's memmap.Mappable.Translate. + private bool + + // If internalMappings is not empty, it is the cached return value of + // file.MapInternal for the platform.FileRange mapped by this pma. + internalMappings safemem.BlockSeq `state:"nosave"` +} + +// +stateify savable +type privateRefs struct { + mu sync.Mutex `state:"nosave"` + + // refs maps offsets into MemoryManager.mfp.MemoryFile() to the number of + // pmas (or, equivalently, MemoryManagers) that share ownership of the + // memory at that offset. + refs fileRefcountSet +} + +type invalidateArgs struct { + ar usermem.AddrRange + opts memmap.InvalidateOpts +} + +// fileRefcountSetFunctions implements segment.Functions for fileRefcountSet. +type fileRefcountSetFunctions struct{} + +func (fileRefcountSetFunctions) MinKey() uint64 { + return 0 +} + +func (fileRefcountSetFunctions) MaxKey() uint64 { + return ^uint64(0) +} + +func (fileRefcountSetFunctions) ClearValue(_ *int32) { +} + +func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) { + return rc1, rc1 == rc2 +} + +func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) { + return rc, rc +} diff --git a/pkg/sentry/mm/mm_state_autogen.go b/pkg/sentry/mm/mm_state_autogen.go new file mode 100755 index 000000000..160f347f8 --- /dev/null +++ b/pkg/sentry/mm/mm_state_autogen.go @@ -0,0 +1,380 @@ +// automatically generated by stateify. 
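+//
+// The save/load pairs below all follow the same state.Map pattern: save
+// writes each field under a stable name, and load reads it back. As a
+// minimal sketch (the counter type here is hypothetical, not part of this
+// package), a new savable type would be wired up as:
+//
+//	type counter struct{ n int64 }
+//
+//	func (c *counter) beforeSave() {}
+//	func (c *counter) save(m state.Map) { m.Save("n", &c.n) }
+//	func (c *counter) afterLoad() {}
+//	func (c *counter) load(m state.Map) { m.Load("n", &c.n) }
+//
+// together with a state.Register call like those in init() at the bottom of
+// this file.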
+ +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/state" +) + +func (x *aioManager) beforeSave() {} +func (x *aioManager) save(m state.Map) { + x.beforeSave() + m.Save("contexts", &x.contexts) +} + +func (x *aioManager) afterLoad() {} +func (x *aioManager) load(m state.Map) { + m.Load("contexts", &x.contexts) +} + +func (x *ioResult) beforeSave() {} +func (x *ioResult) save(m state.Map) { + x.beforeSave() + m.Save("data", &x.data) + m.Save("ioEntry", &x.ioEntry) +} + +func (x *ioResult) afterLoad() {} +func (x *ioResult) load(m state.Map) { + m.Load("data", &x.data) + m.Load("ioEntry", &x.ioEntry) +} + +func (x *AIOContext) beforeSave() {} +func (x *AIOContext) save(m state.Map) { + x.beforeSave() + if !state.IsZeroValue(x.dead) { m.Failf("dead is %v, expected zero", x.dead) } + m.Save("results", &x.results) + m.Save("maxOutstanding", &x.maxOutstanding) + m.Save("outstanding", &x.outstanding) +} + +func (x *AIOContext) load(m state.Map) { + m.Load("results", &x.results) + m.Load("maxOutstanding", &x.maxOutstanding) + m.Load("outstanding", &x.outstanding) + m.AfterLoad(x.afterLoad) +} + +func (x *aioMappable) beforeSave() {} +func (x *aioMappable) save(m state.Map) { + x.beforeSave() + m.Save("AtomicRefCount", &x.AtomicRefCount) + m.Save("mfp", &x.mfp) + m.Save("fr", &x.fr) +} + +func (x *aioMappable) afterLoad() {} +func (x *aioMappable) load(m state.Map) { + m.Load("AtomicRefCount", &x.AtomicRefCount) + m.Load("mfp", &x.mfp) + m.Load("fr", &x.fr) +} + +func (x *fileRefcountSet) beforeSave() {} +func (x *fileRefcountSet) save(m state.Map) { + x.beforeSave() + var root *fileRefcountSegmentDataSlices = x.saveRoot() + m.SaveValue("root", root) +} + +func (x *fileRefcountSet) afterLoad() {} +func (x *fileRefcountSet) load(m state.Map) { + m.LoadValue("root", new(*fileRefcountSegmentDataSlices), func(y interface{}) { x.loadRoot(y.(*fileRefcountSegmentDataSlices)) }) +} + +func (x *fileRefcountnode) beforeSave() {} +func (x *fileRefcountnode) save(m state.Map) { + x.beforeSave() + m.Save("nrSegments", &x.nrSegments) + m.Save("parent", &x.parent) + m.Save("parentIndex", &x.parentIndex) + m.Save("hasChildren", &x.hasChildren) + m.Save("keys", &x.keys) + m.Save("values", &x.values) + m.Save("children", &x.children) +} + +func (x *fileRefcountnode) afterLoad() {} +func (x *fileRefcountnode) load(m state.Map) { + m.Load("nrSegments", &x.nrSegments) + m.Load("parent", &x.parent) + m.Load("parentIndex", &x.parentIndex) + m.Load("hasChildren", &x.hasChildren) + m.Load("keys", &x.keys) + m.Load("values", &x.values) + m.Load("children", &x.children) +} + +func (x *fileRefcountSegmentDataSlices) beforeSave() {} +func (x *fileRefcountSegmentDataSlices) save(m state.Map) { + x.beforeSave() + m.Save("Start", &x.Start) + m.Save("End", &x.End) + m.Save("Values", &x.Values) +} + +func (x *fileRefcountSegmentDataSlices) afterLoad() {} +func (x *fileRefcountSegmentDataSlices) load(m state.Map) { + m.Load("Start", &x.Start) + m.Load("End", &x.End) + m.Load("Values", &x.Values) +} + +func (x *ioList) beforeSave() {} +func (x *ioList) save(m state.Map) { + x.beforeSave() + m.Save("head", &x.head) + m.Save("tail", &x.tail) +} + +func (x *ioList) afterLoad() {} +func (x *ioList) load(m state.Map) { + m.Load("head", &x.head) + m.Load("tail", &x.tail) +} + +func (x *ioEntry) beforeSave() {} +func (x *ioEntry) save(m state.Map) { + x.beforeSave() + m.Save("next", &x.next) + m.Save("prev", &x.prev) +} + +func (x *ioEntry) afterLoad() {} +func (x *ioEntry) load(m state.Map) { + m.Load("next", &x.next) + 
m.Load("prev", &x.prev) +} + +func (x *MemoryManager) save(m state.Map) { + x.beforeSave() + if !state.IsZeroValue(x.active) { m.Failf("active is %v, expected zero", x.active) } + if !state.IsZeroValue(x.captureInvalidations) { m.Failf("captureInvalidations is %v, expected zero", x.captureInvalidations) } + m.Save("p", &x.p) + m.Save("mfp", &x.mfp) + m.Save("layout", &x.layout) + m.Save("privateRefs", &x.privateRefs) + m.Save("users", &x.users) + m.Save("vmas", &x.vmas) + m.Save("brk", &x.brk) + m.Save("usageAS", &x.usageAS) + m.Save("lockedAS", &x.lockedAS) + m.Save("dataAS", &x.dataAS) + m.Save("defMLockMode", &x.defMLockMode) + m.Save("pmas", &x.pmas) + m.Save("curRSS", &x.curRSS) + m.Save("maxRSS", &x.maxRSS) + m.Save("argv", &x.argv) + m.Save("envv", &x.envv) + m.Save("auxv", &x.auxv) + m.Save("executable", &x.executable) + m.Save("aioManager", &x.aioManager) +} + +func (x *MemoryManager) load(m state.Map) { + m.Load("p", &x.p) + m.Load("mfp", &x.mfp) + m.Load("layout", &x.layout) + m.Load("privateRefs", &x.privateRefs) + m.Load("users", &x.users) + m.Load("vmas", &x.vmas) + m.Load("brk", &x.brk) + m.Load("usageAS", &x.usageAS) + m.Load("lockedAS", &x.lockedAS) + m.Load("dataAS", &x.dataAS) + m.Load("defMLockMode", &x.defMLockMode) + m.Load("pmas", &x.pmas) + m.Load("curRSS", &x.curRSS) + m.Load("maxRSS", &x.maxRSS) + m.Load("argv", &x.argv) + m.Load("envv", &x.envv) + m.Load("auxv", &x.auxv) + m.Load("executable", &x.executable) + m.Load("aioManager", &x.aioManager) + m.AfterLoad(x.afterLoad) +} + +func (x *vma) beforeSave() {} +func (x *vma) save(m state.Map) { + x.beforeSave() + var realPerms int = x.saveRealPerms() + m.SaveValue("realPerms", realPerms) + m.Save("mappable", &x.mappable) + m.Save("off", &x.off) + m.Save("mlockMode", &x.mlockMode) + m.Save("id", &x.id) + m.Save("hint", &x.hint) +} + +func (x *vma) afterLoad() {} +func (x *vma) load(m state.Map) { + m.Load("mappable", &x.mappable) + m.Load("off", &x.off) + m.Load("mlockMode", &x.mlockMode) + m.Load("id", &x.id) + m.Load("hint", &x.hint) + m.LoadValue("realPerms", new(int), func(y interface{}) { x.loadRealPerms(y.(int)) }) +} + +func (x *pma) beforeSave() {} +func (x *pma) save(m state.Map) { + x.beforeSave() + m.Save("off", &x.off) + m.Save("translatePerms", &x.translatePerms) + m.Save("effectivePerms", &x.effectivePerms) + m.Save("maxPerms", &x.maxPerms) + m.Save("needCOW", &x.needCOW) + m.Save("private", &x.private) +} + +func (x *pma) afterLoad() {} +func (x *pma) load(m state.Map) { + m.Load("off", &x.off) + m.Load("translatePerms", &x.translatePerms) + m.Load("effectivePerms", &x.effectivePerms) + m.Load("maxPerms", &x.maxPerms) + m.Load("needCOW", &x.needCOW) + m.Load("private", &x.private) +} + +func (x *privateRefs) beforeSave() {} +func (x *privateRefs) save(m state.Map) { + x.beforeSave() + m.Save("refs", &x.refs) +} + +func (x *privateRefs) afterLoad() {} +func (x *privateRefs) load(m state.Map) { + m.Load("refs", &x.refs) +} + +func (x *pmaSet) beforeSave() {} +func (x *pmaSet) save(m state.Map) { + x.beforeSave() + var root *pmaSegmentDataSlices = x.saveRoot() + m.SaveValue("root", root) +} + +func (x *pmaSet) afterLoad() {} +func (x *pmaSet) load(m state.Map) { + m.LoadValue("root", new(*pmaSegmentDataSlices), func(y interface{}) { x.loadRoot(y.(*pmaSegmentDataSlices)) }) +} + +func (x *pmanode) beforeSave() {} +func (x *pmanode) save(m state.Map) { + x.beforeSave() + m.Save("nrSegments", &x.nrSegments) + m.Save("parent", &x.parent) + m.Save("parentIndex", &x.parentIndex) + m.Save("hasChildren", 
&x.hasChildren) + m.Save("keys", &x.keys) + m.Save("values", &x.values) + m.Save("children", &x.children) +} + +func (x *pmanode) afterLoad() {} +func (x *pmanode) load(m state.Map) { + m.Load("nrSegments", &x.nrSegments) + m.Load("parent", &x.parent) + m.Load("parentIndex", &x.parentIndex) + m.Load("hasChildren", &x.hasChildren) + m.Load("keys", &x.keys) + m.Load("values", &x.values) + m.Load("children", &x.children) +} + +func (x *pmaSegmentDataSlices) beforeSave() {} +func (x *pmaSegmentDataSlices) save(m state.Map) { + x.beforeSave() + m.Save("Start", &x.Start) + m.Save("End", &x.End) + m.Save("Values", &x.Values) +} + +func (x *pmaSegmentDataSlices) afterLoad() {} +func (x *pmaSegmentDataSlices) load(m state.Map) { + m.Load("Start", &x.Start) + m.Load("End", &x.End) + m.Load("Values", &x.Values) +} + +func (x *SpecialMappable) beforeSave() {} +func (x *SpecialMappable) save(m state.Map) { + x.beforeSave() + m.Save("AtomicRefCount", &x.AtomicRefCount) + m.Save("mfp", &x.mfp) + m.Save("fr", &x.fr) + m.Save("name", &x.name) +} + +func (x *SpecialMappable) afterLoad() {} +func (x *SpecialMappable) load(m state.Map) { + m.Load("AtomicRefCount", &x.AtomicRefCount) + m.Load("mfp", &x.mfp) + m.Load("fr", &x.fr) + m.Load("name", &x.name) +} + +func (x *vmaSet) beforeSave() {} +func (x *vmaSet) save(m state.Map) { + x.beforeSave() + var root *vmaSegmentDataSlices = x.saveRoot() + m.SaveValue("root", root) +} + +func (x *vmaSet) afterLoad() {} +func (x *vmaSet) load(m state.Map) { + m.LoadValue("root", new(*vmaSegmentDataSlices), func(y interface{}) { x.loadRoot(y.(*vmaSegmentDataSlices)) }) +} + +func (x *vmanode) beforeSave() {} +func (x *vmanode) save(m state.Map) { + x.beforeSave() + m.Save("nrSegments", &x.nrSegments) + m.Save("parent", &x.parent) + m.Save("parentIndex", &x.parentIndex) + m.Save("hasChildren", &x.hasChildren) + m.Save("keys", &x.keys) + m.Save("values", &x.values) + m.Save("children", &x.children) +} + +func (x *vmanode) afterLoad() {} +func (x *vmanode) load(m state.Map) { + m.Load("nrSegments", &x.nrSegments) + m.Load("parent", &x.parent) + m.Load("parentIndex", &x.parentIndex) + m.Load("hasChildren", &x.hasChildren) + m.Load("keys", &x.keys) + m.Load("values", &x.values) + m.Load("children", &x.children) +} + +func (x *vmaSegmentDataSlices) beforeSave() {} +func (x *vmaSegmentDataSlices) save(m state.Map) { + x.beforeSave() + m.Save("Start", &x.Start) + m.Save("End", &x.End) + m.Save("Values", &x.Values) +} + +func (x *vmaSegmentDataSlices) afterLoad() {} +func (x *vmaSegmentDataSlices) load(m state.Map) { + m.Load("Start", &x.Start) + m.Load("End", &x.End) + m.Load("Values", &x.Values) +} + +func init() { + state.Register("mm.aioManager", (*aioManager)(nil), state.Fns{Save: (*aioManager).save, Load: (*aioManager).load}) + state.Register("mm.ioResult", (*ioResult)(nil), state.Fns{Save: (*ioResult).save, Load: (*ioResult).load}) + state.Register("mm.AIOContext", (*AIOContext)(nil), state.Fns{Save: (*AIOContext).save, Load: (*AIOContext).load}) + state.Register("mm.aioMappable", (*aioMappable)(nil), state.Fns{Save: (*aioMappable).save, Load: (*aioMappable).load}) + state.Register("mm.fileRefcountSet", (*fileRefcountSet)(nil), state.Fns{Save: (*fileRefcountSet).save, Load: (*fileRefcountSet).load}) + state.Register("mm.fileRefcountnode", (*fileRefcountnode)(nil), state.Fns{Save: (*fileRefcountnode).save, Load: (*fileRefcountnode).load}) + state.Register("mm.fileRefcountSegmentDataSlices", (*fileRefcountSegmentDataSlices)(nil), state.Fns{Save: 
(*fileRefcountSegmentDataSlices).save, Load: (*fileRefcountSegmentDataSlices).load}) + state.Register("mm.ioList", (*ioList)(nil), state.Fns{Save: (*ioList).save, Load: (*ioList).load}) + state.Register("mm.ioEntry", (*ioEntry)(nil), state.Fns{Save: (*ioEntry).save, Load: (*ioEntry).load}) + state.Register("mm.MemoryManager", (*MemoryManager)(nil), state.Fns{Save: (*MemoryManager).save, Load: (*MemoryManager).load}) + state.Register("mm.vma", (*vma)(nil), state.Fns{Save: (*vma).save, Load: (*vma).load}) + state.Register("mm.pma", (*pma)(nil), state.Fns{Save: (*pma).save, Load: (*pma).load}) + state.Register("mm.privateRefs", (*privateRefs)(nil), state.Fns{Save: (*privateRefs).save, Load: (*privateRefs).load}) + state.Register("mm.pmaSet", (*pmaSet)(nil), state.Fns{Save: (*pmaSet).save, Load: (*pmaSet).load}) + state.Register("mm.pmanode", (*pmanode)(nil), state.Fns{Save: (*pmanode).save, Load: (*pmanode).load}) + state.Register("mm.pmaSegmentDataSlices", (*pmaSegmentDataSlices)(nil), state.Fns{Save: (*pmaSegmentDataSlices).save, Load: (*pmaSegmentDataSlices).load}) + state.Register("mm.SpecialMappable", (*SpecialMappable)(nil), state.Fns{Save: (*SpecialMappable).save, Load: (*SpecialMappable).load}) + state.Register("mm.vmaSet", (*vmaSet)(nil), state.Fns{Save: (*vmaSet).save, Load: (*vmaSet).load}) + state.Register("mm.vmanode", (*vmanode)(nil), state.Fns{Save: (*vmanode).save, Load: (*vmanode).load}) + state.Register("mm.vmaSegmentDataSlices", (*vmaSegmentDataSlices)(nil), state.Fns{Save: (*vmaSegmentDataSlices).save, Load: (*vmaSegmentDataSlices).load}) +} diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go new file mode 100644 index 000000000..ece561ff0 --- /dev/null +++ b/pkg/sentry/mm/pma.go @@ -0,0 +1,1036 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform/safecopy" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// existingPMAsLocked checks that pmas exist for all addresses in ar, and +// support access of type (at, ignorePermissions). If so, it returns an +// iterator to the pma containing ar.Start. Otherwise it returns a terminal +// iterator. +// +// Preconditions: mm.activeMu must be locked. ar.Length() != 0. 
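+//
+// As an illustrative sketch (not an actual call site in this file), a caller
+// probing for an already-usable read mapping might do:
+//
+//	mm.activeMu.RLock()
+//	if pseg := mm.existingPMAsLocked(ar, usermem.Read, false, false); pseg.Ok() {
+//		// pmas cover all of ar and permit reads; use pseg.
+//	}
+//	mm.activeMu.RUnlock()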
+func (mm *MemoryManager) existingPMAsLocked(ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + first := mm.pmas.FindSegment(ar.Start) + pseg := first + for pseg.Ok() { + pma := pseg.ValuePtr() + perms := pma.effectivePerms + if ignorePermissions { + perms = pma.maxPerms + } + if !perms.SupersetOf(at) { + return pmaIterator{} + } + if needInternalMappings && pma.internalMappings.IsEmpty() { + return pmaIterator{} + } + + if ar.End <= pseg.End() { + return first + } + pseg, _ = pseg.NextNonEmpty() + } + + // Ran out of pmas before reaching ar.End. + return pmaIterator{} +} + +// existingVecPMAsLocked returns true if pmas exist for all addresses in ars, +// and support access of type (at, ignorePermissions). +// +// Preconditions: mm.activeMu must be locked. +func (mm *MemoryManager) existingVecPMAsLocked(ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool, needInternalMappings bool) bool { + for ; !ars.IsEmpty(); ars = ars.Tail() { + if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, needInternalMappings).Ok() { + return false + } + } + return true +} + +// getPMAsLocked ensures that pmas exist for all addresses in ar, and support +// access of type at. It returns: +// +// - An iterator to the pma containing ar.Start. If no pma contains ar.Start, +// the iterator is unspecified. +// +// - An iterator to the gap after the last pma containing an address in ar. If +// pmas exist for no addresses in ar, the iterator is to a gap that begins +// before ar.Start. +// +// - An error that is non-nil if pmas exist for only a subset of ar. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. ar.Length() != 0. vseg.Range().Contains(ar.Start). vmas must exist +// for all addresses in ar, and support accesses of type at (i.e. permission +// checks must have been performed against vmas). +func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !vseg.Ok() { + panic("terminal vma iterator") + } + if !vseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar)) + } + } + + // Page-align ar so that all AddrRanges are aligned. + end, ok := ar.End.RoundUp() + var alignerr error + if !ok { + end = ar.End.RoundDown() + alignerr = syserror.EFAULT + } + ar = usermem.AddrRange{ar.Start.RoundDown(), end} + + pstart, pend, perr := mm.getPMAsInternalLocked(ctx, vseg, ar, at) + if pend.Start() <= ar.Start { + return pmaIterator{}, pend, perr + } + // getPMAsInternalLocked may not have returned pstart due to iterator + // invalidation. + if !pstart.Ok() { + pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) + } + if perr != nil { + return pstart, pend, perr + } + return pstart, pend, alignerr +} + +// getVecPMAsLocked ensures that pmas exist for all addresses in ars, and +// support access of type at. It returns the subset of ars for which pmas +// exist. If this is not equal to ars, it returns a non-nil error explaining +// why. +// +// Preconditions: mm.mappingMu must be locked. mm.activeMu must be locked for +// writing. 
vmas must exist for all addresses in ars, and support accesses of +// type at (i.e. permission checks must have been performed against vmas). +func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType) (usermem.AddrRangeSeq, error) { + for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { + ar := arsit.Head() + if ar.Length() == 0 { + continue + } + if checkInvariants { + if !ar.WellFormed() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + // Page-align ar so that all AddrRanges are aligned. + end, ok := ar.End.RoundUp() + var alignerr error + if !ok { + end = ar.End.RoundDown() + alignerr = syserror.EFAULT + } + ar = usermem.AddrRange{ar.Start.RoundDown(), end} + + _, pend, perr := mm.getPMAsInternalLocked(ctx, mm.vmas.FindSegment(ar.Start), ar, at) + if perr != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr + } + if alignerr != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr + } + } + + return ars, nil +} + +// getPMAsInternalLocked is equivalent to getPMAsLocked, with the following +// exceptions: +// +// - getPMAsInternalLocked returns a pmaIterator on a best-effort basis (that +// is, the returned iterator may be terminal, even if a pma that contains +// ar.Start exists). Returning this iterator on a best-effort basis allows +// callers that require it to use it when it's cheaply available, while also +// avoiding the overhead of retrieving it when it's not. +// +// - getPMAsInternalLocked additionally requires that ar is page-aligned. +// +// getPMAsInternalLocked is an implementation helper for getPMAsLocked and +// getVecPMAsLocked; other clients should call one of those instead. +func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, at usermem.AccessType) (pmaIterator, pmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !vseg.Ok() { + panic("terminal vma iterator") + } + if !vseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar)) + } + } + + mf := mm.mfp.MemoryFile() + // Limit the range we allocate to ar, aligned to privateAllocUnit. + maskAR := privateAligned(ar) + didUnmapAS := false + // The range in which we iterate vmas and pmas is still limited to ar, to + // ensure that we don't allocate or COW-break a pma we don't need. + pseg, pgap := mm.pmas.Find(ar.Start) + pstart := pseg + for { + // Get pmas for this vma. + vsegAR := vseg.Range().Intersect(ar) + vma := vseg.ValuePtr() + pmaLoop: + for { + switch { + case pgap.Ok() && pgap.Start() < vsegAR.End: + // Need a pma here. + optAR := vseg.Range().Intersect(pgap.Range()) + if checkInvariants { + if optAR.Length() <= 0 { + panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap)) + } + } + if vma.mappable == nil { + // Private anonymous mappings get pmas by allocating. 
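+					// maskAR is ar expanded to privateAllocUnit boundaries, so
+					// one fault can populate neighboring pages with a single
+					// allocation. As a rough example, assuming the 2MiB
+					// HugePageSize used as privateAllocUnit on x86, a 4KiB
+					// fault at 0x201000 yields a maskAR starting at 0x200000,
+					// and allocAR below covers as much of that aligned run as
+					// the vma and the surrounding pma gap allow.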
+ allocAR := optAR.Intersect(maskAR) + fr, err := mf.Allocate(uint64(allocAR.Length()), usage.Anonymous) + if err != nil { + return pstart, pgap, err + } + if checkInvariants { + if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) { + panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr)) + } + } + mm.addRSSLocked(allocAR) + mm.incPrivateRef(fr) + mf.IncRef(fr) + pseg, pgap = mm.pmas.Insert(pgap, allocAR, pma{ + file: mf, + off: fr.Start, + translatePerms: usermem.AnyAccess, + effectivePerms: vma.effectivePerms, + maxPerms: vma.maxPerms, + // Since we just allocated this memory and have the + // only reference, the new pma does not need + // copy-on-write. + private: true, + }).NextNonEmpty() + pstart = pmaIterator{} // iterators invalidated + } else { + // Other mappings get pmas by translating. + optMR := vseg.mappableRangeOf(optAR) + reqAR := optAR.Intersect(ar) + reqMR := vseg.mappableRangeOf(reqAR) + perms := at + if vma.private { + // This pma will be copy-on-write; don't require write + // permission, but do require read permission to + // facilitate the copy. + // + // If at.Write is true, we will need to break + // copy-on-write immediately, which occurs after + // translation below. + perms.Read = true + perms.Write = false + } + ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms) + if checkInvariants { + if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil { + panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err)) + } + } + // Install a pma for each translation. + if len(ts) == 0 { + return pstart, pgap, err + } + pstart = pmaIterator{} // iterators invalidated + for _, t := range ts { + newpmaAR := vseg.addrRangeOf(t.Source) + newpma := pma{ + file: t.File, + off: t.Offset, + translatePerms: t.Perms, + effectivePerms: vma.effectivePerms.Intersect(t.Perms), + maxPerms: vma.maxPerms.Intersect(t.Perms), + } + if vma.private { + newpma.effectivePerms.Write = false + newpma.maxPerms.Write = false + newpma.needCOW = true + } + mm.addRSSLocked(newpmaAR) + t.File.IncRef(t.FileRange()) + // This is valid because memmap.Mappable.Translate is + // required to return Translations in increasing + // Translation.Source order. + pseg = mm.pmas.Insert(pgap, newpmaAR, newpma) + pgap = pseg.NextGap() + } + // The error returned by Translate is only significant if + // it occurred before ar.End. + if err != nil && vseg.addrRangeOf(ts[len(ts)-1].Source).End < ar.End { + return pstart, pgap, err + } + // Rewind pseg to the first pma inserted and continue the + // loop to check if we need to break copy-on-write. + pseg, pgap = mm.findOrSeekPrevUpperBoundPMA(vseg.addrRangeOf(ts[0].Source).Start, pgap), pmaGapIterator{} + continue + } + + case pseg.Ok() && pseg.Start() < vsegAR.End: + oldpma := pseg.ValuePtr() + if at.Write && mm.isPMACopyOnWriteLocked(vseg, pseg) { + // Break copy-on-write by copying. + if checkInvariants { + if !oldpma.maxPerms.Read { + panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma)) + } + } + // The majority of copy-on-write breaks on executable pages + // come from: + // + // - The ELF loader, which must zero out bytes on the last + // page of each segment after the end of the segment. + // + // - gdb's use of ptrace to insert breakpoints. 
+ // + // Neither of these cases has enough spatial locality to + // benefit from copying nearby pages, so if the vma is + // executable, only copy the pages required. + var copyAR usermem.AddrRange + if vseg.ValuePtr().effectivePerms.Execute { + copyAR = pseg.Range().Intersect(ar) + } else { + copyAR = pseg.Range().Intersect(maskAR) + } + // Get internal mappings from the pma to copy from. + if err := pseg.getInternalMappingsLocked(); err != nil { + return pstart, pseg.PrevGap(), err + } + // Copy contents. + fr, err := mf.AllocateAndFill(uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)}) + if _, ok := err.(safecopy.BusError); ok { + // If we got SIGBUS during the copy, deliver SIGBUS to + // userspace (instead of SIGSEGV) if we're breaking + // copy-on-write due to application page fault. + err = &memmap.BusError{err} + } + if fr.Length() == 0 { + return pstart, pseg.PrevGap(), err + } + // Unmap all of maskAR, not just copyAR, to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + if !didUnmapAS { + mm.unmapASLocked(maskAR) + didUnmapAS = true + } + // Replace the pma with a copy in the part of the address + // range where copying was successful. This doesn't change + // RSS. + copyAR.End = copyAR.Start + usermem.Addr(fr.Length()) + if copyAR != pseg.Range() { + pseg = mm.pmas.Isolate(pseg, copyAR) + pstart = pmaIterator{} // iterators invalidated + } + oldpma = pseg.ValuePtr() + if oldpma.private { + mm.decPrivateRef(pseg.fileRange()) + } + oldpma.file.DecRef(pseg.fileRange()) + mm.incPrivateRef(fr) + mf.IncRef(fr) + oldpma.file = mf + oldpma.off = fr.Start + oldpma.translatePerms = usermem.AnyAccess + oldpma.effectivePerms = vma.effectivePerms + oldpma.maxPerms = vma.maxPerms + oldpma.needCOW = false + oldpma.private = true + oldpma.internalMappings = safemem.BlockSeq{} + // Try to merge the pma with its neighbors. + if prev := pseg.PrevSegment(); prev.Ok() { + if merged := mm.pmas.Merge(prev, pseg); merged.Ok() { + pseg = merged + pstart = pmaIterator{} // iterators invalidated + } + } + if next := pseg.NextSegment(); next.Ok() { + if merged := mm.pmas.Merge(pseg, next); merged.Ok() { + pseg = merged + pstart = pmaIterator{} // iterators invalidated + } + } + // The error returned by AllocateAndFill is only + // significant if it occurred before ar.End. + if err != nil && pseg.End() < ar.End { + return pstart, pseg.NextGap(), err + } + // Ensure pseg and pgap are correct for the next iteration + // of the loop. + pseg, pgap = pseg.NextNonEmpty() + } else if !oldpma.translatePerms.SupersetOf(at) { + // Get new pmas (with sufficient permissions) by calling + // memmap.Mappable.Translate again. + if checkInvariants { + if oldpma.private { + panic(fmt.Sprintf("private pma %v has non-maximal pma.translatePerms: %v", pseg.Range(), oldpma)) + } + } + // Allow the entire pma to be replaced. + optAR := pseg.Range() + optMR := vseg.mappableRangeOf(optAR) + reqAR := optAR.Intersect(ar) + reqMR := vseg.mappableRangeOf(reqAR) + perms := oldpma.translatePerms.Union(at) + ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms) + if checkInvariants { + if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil { + panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err)) + } + } + // Remove the part of the existing pma covered by new + // Translations, then insert new pmas. This doesn't change + // RSS. 
Note that we don't need to call unmapASLocked: any + // existing AddressSpace mappings are still valid (though + // less permissive than the new pmas indicate) until + // Invalidate is called, and will be replaced by future + // calls to mapASLocked. + if len(ts) == 0 { + return pstart, pseg.PrevGap(), err + } + transMR := memmap.MappableRange{ts[0].Source.Start, ts[len(ts)-1].Source.End} + transAR := vseg.addrRangeOf(transMR) + pseg = mm.pmas.Isolate(pseg, transAR) + pseg.ValuePtr().file.DecRef(pseg.fileRange()) + pgap = mm.pmas.Remove(pseg) + pstart = pmaIterator{} // iterators invalidated + for _, t := range ts { + newpmaAR := vseg.addrRangeOf(t.Source) + newpma := pma{ + file: t.File, + off: t.Offset, + translatePerms: t.Perms, + effectivePerms: vma.effectivePerms.Intersect(t.Perms), + maxPerms: vma.maxPerms.Intersect(t.Perms), + } + if vma.private { + newpma.effectivePerms.Write = false + newpma.maxPerms.Write = false + newpma.needCOW = true + } + t.File.IncRef(t.FileRange()) + pseg = mm.pmas.Insert(pgap, newpmaAR, newpma) + pgap = pseg.NextGap() + } + // The error returned by Translate is only significant if + // it occurred before ar.End. + if err != nil && pseg.End() < ar.End { + return pstart, pgap, err + } + // Ensure pseg and pgap are correct for the next iteration + // of the loop. + if pgap.Range().Length() == 0 { + pseg, pgap = pgap.NextSegment(), pmaGapIterator{} + } else { + pseg = pmaIterator{} + } + } else { + // We have a usable pma; continue. + pseg, pgap = pseg.NextNonEmpty() + } + + default: + break pmaLoop + } + } + // Go to the next vma. + if ar.End <= vseg.End() { + if pgap.Ok() { + return pstart, pgap, nil + } + return pstart, pseg.PrevGap(), nil + } + vseg = vseg.NextSegment() + } +} + +const ( + // When memory is allocated for a private pma, align the allocated address + // range to a privateAllocUnit boundary when possible. Larger values of + // privateAllocUnit may reduce page faults by allowing fewer, larger pmas + // to be mapped, but may result in larger amounts of wasted memory in the + // presence of fragmentation. privateAllocUnit must be a power-of-2 + // multiple of usermem.PageSize. + privateAllocUnit = usermem.HugePageSize + + privateAllocMask = privateAllocUnit - 1 +) + +func privateAligned(ar usermem.AddrRange) usermem.AddrRange { + aligned := usermem.AddrRange{ar.Start &^ privateAllocMask, ar.End} + if end := (ar.End + privateAllocMask) &^ privateAllocMask; end >= ar.End { + aligned.End = end + } + if checkInvariants { + if !aligned.IsSupersetOf(ar) { + panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar)) + } + } + return aligned +} + +// isPMACopyOnWriteLocked returns true if the contents of the pma represented +// by pseg must be copied to a new private pma to be written to. +// +// If the pma is a copy-on-write private pma, and holds the only reference on +// the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory +// and update the pma to indicate that it does not require copy-on-write. +// +// Preconditions: vseg.Range().IsSupersetOf(pseg.Range()). mm.mappingMu must be +// locked. mm.activeMu must be locked for writing. +func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool { + pma := pseg.ValuePtr() + if !pma.needCOW { + return false + } + if !pma.private { + return true + } + // If we have the only reference on private memory to be copied, just take + // ownership of it instead of copying. 
If we do hold the only reference, + // additional references can only be taken by mm.Fork(), which is excluded + // by mm.activeMu, so this isn't racy. + mm.privateRefs.mu.Lock() + defer mm.privateRefs.mu.Unlock() + fr := pseg.fileRange() + // This check relies on mm.privateRefs.refs being kept fully merged. + rseg := mm.privateRefs.refs.FindSegment(fr.Start) + if rseg.Ok() && rseg.Value() == 1 && fr.End <= rseg.End() { + pma.needCOW = false + // pma.private => pma.translatePerms == usermem.AnyAccess + vma := vseg.ValuePtr() + pma.effectivePerms = vma.effectivePerms + pma.maxPerms = vma.maxPerms + return false + } + return true +} + +// Invalidate implements memmap.MappingSpace.Invalidate. +func (mm *MemoryManager) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + if mm.captureInvalidations { + mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts}) + return + } + mm.invalidateLocked(ar, opts.InvalidatePrivate, true) +} + +// invalidateLocked removes pmas and AddressSpace mappings of those pmas for +// addresses in ar. +// +// Preconditions: mm.activeMu must be locked for writing. ar.Length() != 0. ar +// must be page-aligned. +func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivate, invalidateShared bool) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + var didUnmapAS bool + pseg := mm.pmas.LowerBoundSegment(ar.Start) + for pseg.Ok() && pseg.Start() < ar.End { + pma := pseg.ValuePtr() + if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) { + pseg = mm.pmas.Isolate(pseg, ar) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + mm.removeRSSLocked(pseg.Range()) + pma.file.DecRef(pseg.fileRange()) + pseg = mm.pmas.Remove(pseg).NextSegment() + } else { + pseg = pseg.NextSegment() + } + } +} + +// Pin returns the platform.File ranges currently mapped by addresses in ar in +// mm, acquiring a reference on the returned ranges which the caller must +// release by calling Unpin. If not all addresses are mapped, Pin returns a +// non-nil error. Note that Pin may return both a non-empty slice of +// PinnedRanges and a non-nil error. +// +// Pin does not prevent mapped ranges from changing, making it unsuitable for +// most I/O. It should only be used in contexts that would use get_user_pages() +// in the Linux kernel. +// +// Preconditions: ar.Length() != 0. ar must be page-aligned. +func (mm *MemoryManager) Pin(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) ([]PinnedRange, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + } + + // Ensure that we have usable vmas. 
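+	//
+	// Lock ordering note: mappingMu is acquired before activeMu and is
+	// released as soon as pmas exist for ar; the pma walk below needs only
+	// activeMu, and each gathered range takes its own reference on the
+	// backing file so the pins outlive both locks.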
+ mm.mappingMu.RLock() + vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions) + if vendaddr := vend.Start(); vendaddr < ar.End { + if vendaddr <= ar.Start { + mm.mappingMu.RUnlock() + return nil, verr + } + ar.End = vendaddr + } + + // Ensure that we have usable pmas. + mm.activeMu.Lock() + pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at) + mm.mappingMu.RUnlock() + if pendaddr := pend.Start(); pendaddr < ar.End { + if pendaddr <= ar.Start { + mm.activeMu.Unlock() + return nil, perr + } + ar.End = pendaddr + } + + // Gather pmas. + var prs []PinnedRange + for pseg.Ok() && pseg.Start() < ar.End { + psar := pseg.Range().Intersect(ar) + f := pseg.ValuePtr().file + fr := pseg.fileRangeOf(psar) + f.IncRef(fr) + prs = append(prs, PinnedRange{ + Source: psar, + File: f, + Offset: fr.Start, + }) + pseg = pseg.NextSegment() + } + mm.activeMu.Unlock() + + // Return the first error in order of progress through ar. + if perr != nil { + return prs, perr + } + return prs, verr +} + +// PinnedRanges are returned by MemoryManager.Pin. +type PinnedRange struct { + // Source is the corresponding range of addresses. + Source usermem.AddrRange + + // File is the mapped file. + File platform.File + + // Offset is the offset into File at which this PinnedRange begins. + Offset uint64 +} + +// FileRange returns the platform.File offsets mapped by pr. +func (pr PinnedRange) FileRange() platform.FileRange { + return platform.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())} +} + +// Unpin releases the reference held by prs. +func Unpin(prs []PinnedRange) { + for i := range prs { + prs[i].File.DecRef(prs[i].FileRange()) + } +} + +// movePMAsLocked moves all pmas in oldAR to newAR. +// +// Preconditions: mm.activeMu must be locked for writing. oldAR.Length() != 0. +// oldAR.Length() <= newAR.Length(). !oldAR.Overlaps(newAR). +// mm.pmas.IsEmptyRange(newAR). oldAR and newAR must be page-aligned. +func (mm *MemoryManager) movePMAsLocked(oldAR, newAR usermem.AddrRange) { + if checkInvariants { + if !oldAR.WellFormed() || oldAR.Length() <= 0 || !oldAR.IsPageAligned() { + panic(fmt.Sprintf("invalid oldAR: %v", oldAR)) + } + if !newAR.WellFormed() || newAR.Length() <= 0 || !newAR.IsPageAligned() { + panic(fmt.Sprintf("invalid newAR: %v", newAR)) + } + if oldAR.Length() > newAR.Length() { + panic(fmt.Sprintf("old address range %v may contain pmas that will not fit in new address range %v", oldAR, newAR)) + } + if oldAR.Overlaps(newAR) { + panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR)) + } + // mm.pmas.IsEmptyRange is checked by mm.pmas.Insert. + } + + type movedPMA struct { + oldAR usermem.AddrRange + pma pma + } + var movedPMAs []movedPMA + pseg := mm.pmas.LowerBoundSegment(oldAR.Start) + for pseg.Ok() && pseg.Start() < oldAR.End { + pseg = mm.pmas.Isolate(pseg, oldAR) + movedPMAs = append(movedPMAs, movedPMA{ + oldAR: pseg.Range(), + pma: pseg.Value(), + }) + pseg = mm.pmas.Remove(pseg).NextSegment() + // No RSS change is needed since we're re-inserting the same pmas + // below. + } + + off := newAR.Start - oldAR.Start + pgap := mm.pmas.FindGap(newAR.Start) + for i := range movedPMAs { + mpma := &movedPMAs[i] + pmaNewAR := usermem.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off} + pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap() + } + + mm.unmapASLocked(oldAR) +} + +// getPMAInternalMappingsLocked ensures that pmas for all addresses in ar have +// cached internal mappings. 
It returns: +// +// - An iterator to the gap after the last pma with internal mappings +// containing an address in ar. If internal mappings exist for no addresses in +// ar, the iterator is to a gap that begins before ar.Start. +// +// - An error that is non-nil if internal mappings exist for only a subset of +// ar. +// +// Preconditions: mm.activeMu must be locked for writing. +// pseg.Range().Contains(ar.Start). pmas must exist for all addresses in ar. +// ar.Length() != 0. +// +// Postconditions: getPMAInternalMappingsLocked does not invalidate iterators +// into mm.pmas. +func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) (pmaGapIterator, error) { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !pseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) + } + } + + for { + if err := pseg.getInternalMappingsLocked(); err != nil { + return pseg.PrevGap(), err + } + if ar.End <= pseg.End() { + return pseg.NextGap(), nil + } + pseg, _ = pseg.NextNonEmpty() + } +} + +// getVecPMAInternalMappingsLocked ensures that pmas for all addresses in ars +// have cached internal mappings. It returns the subset of ars for which +// internal mappings exist. If this is not equal to ars, it returns a non-nil +// error explaining why. +// +// Preconditions: mm.activeMu must be locked for writing. pmas must exist for +// all addresses in ar. +// +// Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators +// into mm.pmas. +func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars usermem.AddrRangeSeq) (usermem.AddrRangeSeq, error) { + for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { + ar := arsit.Head() + if ar.Length() == 0 { + continue + } + if pend, err := mm.getPMAInternalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); err != nil { + return truncatedAddrRangeSeq(ars, arsit, pend.Start()), err + } + } + return ars, nil +} + +// internalMappingsLocked returns internal mappings for addresses in ar. +// +// Preconditions: mm.activeMu must be locked. Internal mappings must have been +// previously established for all addresses in ar. ar.Length() != 0. +// pseg.Range().Contains(ar.Start). +func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar usermem.AddrRange) safemem.BlockSeq { + if checkInvariants { + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !pseg.Range().Contains(ar.Start) { + panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) + } + } + + if ar.End <= pseg.End() { + // Since only one pma is involved, we can use pma.internalMappings + // directly, avoiding a slice allocation. + offset := uint64(ar.Start - pseg.Start()) + return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length())) + } + + var ims []safemem.Block + for { + pr := pseg.Range().Intersect(ar) + for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() { + ims = append(ims, pims.Head()) + } + if ar.End <= pseg.End() { + break + } + pseg = pseg.NextSegment() + } + return safemem.BlockSeqFromSlice(ims) +} + +// vecInternalMappingsLocked returns internal mappings for addresses in ars. +// +// Preconditions: mm.activeMu must be locked. 
Internal mappings must have been +// previously established for all addresses in ars. +func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) safemem.BlockSeq { + var ims []safemem.Block + for ; !ars.IsEmpty(); ars = ars.Tail() { + ar := ars.Head() + if ar.Length() == 0 { + continue + } + for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() { + ims = append(ims, pims.Head()) + } + } + return safemem.BlockSeqFromSlice(ims) +} + +// incPrivateRef acquires a reference on private pages in fr. +func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) { + mm.privateRefs.mu.Lock() + defer mm.privateRefs.mu.Unlock() + refSet := &mm.privateRefs.refs + seg, gap := refSet.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = refSet.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + seg, gap = refSet.InsertWithoutMerging(gap, gap.Range().Intersect(fr), 1).NextNonEmpty() + default: + refSet.MergeAdjacent(fr) + return + } + } +} + +// decPrivateRef releases a reference on private pages in fr. +func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) { + var freed []platform.FileRange + + mm.privateRefs.mu.Lock() + refSet := &mm.privateRefs.refs + seg := refSet.LowerBoundSegment(fr.Start) + for seg.Ok() && seg.Start() < fr.End { + seg = refSet.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + freed = append(freed, seg.Range()) + seg = refSet.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + refSet.MergeAdjacent(fr) + mm.privateRefs.mu.Unlock() + + mf := mm.mfp.MemoryFile() + for _, fr := range freed { + mf.DecRef(fr) + } +} + +// addRSSLocked updates the current and maximum resident set size of a +// MemoryManager to reflect the insertion of a pma at ar. +// +// Preconditions: mm.activeMu must be locked for writing. +func (mm *MemoryManager) addRSSLocked(ar usermem.AddrRange) { + mm.curRSS += uint64(ar.Length()) + if mm.curRSS > mm.maxRSS { + mm.maxRSS = mm.curRSS + } +} + +// removeRSSLocked updates the current resident set size of a MemoryManager to +// reflect the removal of a pma at ar. +// +// Preconditions: mm.activeMu must be locked for writing. +func (mm *MemoryManager) removeRSSLocked(ar usermem.AddrRange) { + mm.curRSS -= uint64(ar.Length()) +} + +// pmaSetFunctions implements segment.Functions for pmaSet. +type pmaSetFunctions struct{} + +func (pmaSetFunctions) MinKey() usermem.Addr { + return 0 +} + +func (pmaSetFunctions) MaxKey() usermem.Addr { + return ^usermem.Addr(0) +} + +func (pmaSetFunctions) ClearValue(pma *pma) { + pma.file = nil + pma.internalMappings = safemem.BlockSeq{} +} + +func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRange, pma2 pma) (pma, bool) { + if pma1.file != pma2.file || + pma1.off+uint64(ar1.Length()) != pma2.off || + pma1.translatePerms != pma2.translatePerms || + pma1.effectivePerms != pma2.effectivePerms || + pma1.maxPerms != pma2.maxPerms || + pma1.needCOW != pma2.needCOW || + pma1.private != pma2.private { + return pma{}, false + } + + // Discard internal mappings instead of trying to merge them, since merging + // them requires an allocation and getting them again from the + // platform.File might not. 
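+	//
+	// For example, two pmas backed by the same file at offsets [0, 4096) and
+	// [4096, 8192), with identical permission and COW state, merge into a
+	// single pma at offset 0; any mismatch is rejected by the checks above,
+	// which return (pma{}, false).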
+ pma1.internalMappings = safemem.BlockSeq{} + return pma1, true +} + +func (pmaSetFunctions) Split(ar usermem.AddrRange, p pma, split usermem.Addr) (pma, pma) { + newlen1 := uint64(split - ar.Start) + p2 := p + p2.off += newlen1 + if !p.internalMappings.IsEmpty() { + p.internalMappings = p.internalMappings.TakeFirst64(newlen1) + p2.internalMappings = p2.internalMappings.DropFirst64(newlen1) + } + return p, p2 +} + +// findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do +// so by scanning linearly backward from pgap. +// +// Preconditions: mm.activeMu must be locked. addr <= pgap.Start(). +func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr usermem.Addr, pgap pmaGapIterator) pmaIterator { + if checkInvariants { + if !pgap.Ok() { + panic("terminal pma iterator") + } + if addr > pgap.Start() { + panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start())) + } + } + // Optimistically check if pgap.PrevSegment() is the PMA we're looking for, + // which is the case if findOrSeekPrevUpperBoundPMA is called to find the + // start of a range containing only a single PMA. + if pseg := pgap.PrevSegment(); pseg.Start() <= addr { + return pseg + } + return mm.pmas.UpperBoundSegment(addr) +} + +// getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is +// non-empty. +// +// Preconditions: mm.activeMu must be locked for writing. +func (pseg pmaIterator) getInternalMappingsLocked() error { + pma := pseg.ValuePtr() + if pma.internalMappings.IsEmpty() { + // This must use maxPerms (instead of perms) because some permission + // constraints are only visible to vmas; for example, mappings of + // read-only files have vma.maxPerms.Write unset, but this may not be + // visible to the memmap.Mappable. + perms := pma.maxPerms + // We will never execute application code through an internal mapping. + perms.Execute = false + ims, err := pma.file.MapInternal(pseg.fileRange(), perms) + if err != nil { + return err + } + pma.internalMappings = ims + } + return nil +} + +func (pseg pmaIterator) fileRange() platform.FileRange { + return pseg.fileRangeOf(pseg.Range()) +} + +// Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0. +func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange { + if checkInvariants { + if !pseg.Ok() { + panic("terminal pma iterator") + } + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !pseg.Range().IsSupersetOf(ar) { + panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range())) + } + } + + pma := pseg.ValuePtr() + pstart := pseg.Start() + return platform.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)} +} diff --git a/pkg/sentry/mm/pma_set.go b/pkg/sentry/mm/pma_set.go new file mode 100755 index 000000000..6380d8619 --- /dev/null +++ b/pkg/sentry/mm/pma_set.go @@ -0,0 +1,1274 @@ +package mm + +import ( + __generics_imported0 "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +import ( + "bytes" + "fmt" +) + +const ( + // minDegree is the minimum degree of an internal node in a Set B-tree. + // + // - Any non-root node has at least minDegree-1 segments. + // + // - Any non-root internal (non-leaf) node has at least minDegree children. + // + // - The root node may have fewer than minDegree-1 segments, but it may + // only have 0 segments if the tree is empty. + // + // Our implementation requires minDegree >= 3. 
Higher values of minDegree + // usually improve performance, but increase memory usage for small sets. + pmaminDegree = 8 + + pmamaxDegree = 2 * pmaminDegree +) + +// A Set is a mapping of segments with non-overlapping Range keys. The zero +// value for a Set is an empty set. Set values are not safely movable nor +// copyable. Set is thread-compatible. +// +// +stateify savable +type pmaSet struct { + root pmanode `state:".(*pmaSegmentDataSlices)"` +} + +// IsEmpty returns true if the set contains no segments. +func (s *pmaSet) IsEmpty() bool { + return s.root.nrSegments == 0 +} + +// IsEmptyRange returns true iff no segments in the set overlap the given +// range. This is semantically equivalent to s.SpanRange(r) == 0, but may be +// more efficient. +func (s *pmaSet) IsEmptyRange(r __generics_imported0.AddrRange) bool { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return true + } + _, gap := s.Find(r.Start) + if !gap.Ok() { + return false + } + return r.End <= gap.End() +} + +// Span returns the total size of all segments in the set. +func (s *pmaSet) Span() __generics_imported0.Addr { + var sz __generics_imported0.Addr + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sz += seg.Range().Length() + } + return sz +} + +// SpanRange returns the total size of the intersection of segments in the set +// with the given range. +func (s *pmaSet) SpanRange(r __generics_imported0.AddrRange) __generics_imported0.Addr { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return 0 + } + var sz __generics_imported0.Addr + for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { + sz += seg.Range().Intersect(r).Length() + } + return sz +} + +// FirstSegment returns the first segment in the set. If the set is empty, +// FirstSegment returns a terminal iterator. +func (s *pmaSet) FirstSegment() pmaIterator { + if s.root.nrSegments == 0 { + return pmaIterator{} + } + return s.root.firstSegment() +} + +// LastSegment returns the last segment in the set. If the set is empty, +// LastSegment returns a terminal iterator. +func (s *pmaSet) LastSegment() pmaIterator { + if s.root.nrSegments == 0 { + return pmaIterator{} + } + return s.root.lastSegment() +} + +// FirstGap returns the first gap in the set. +func (s *pmaSet) FirstGap() pmaGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[0] + } + return pmaGapIterator{n, 0} +} + +// LastGap returns the last gap in the set. +func (s *pmaSet) LastGap() pmaGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[n.nrSegments] + } + return pmaGapIterator{n, n.nrSegments} +} + +// Find returns the segment or gap whose range contains the given key. If a +// segment is found, the returned Iterator is non-terminal and the +// returned GapIterator is terminal. Otherwise, the returned Iterator is +// terminal and the returned GapIterator is non-terminal. 
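+//
+// An illustrative sketch: to test whether addr is currently covered by a
+// segment,
+//
+//	if seg, _ := s.Find(addr); seg.Ok() {
+//		// addr lies within seg.Range().
+//	}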
+func (s *pmaSet) Find(key __generics_imported0.Addr) (pmaIterator, pmaGapIterator) { + n := &s.root + for { + + lower := 0 + upper := n.nrSegments + for lower < upper { + i := lower + (upper-lower)/2 + if r := n.keys[i]; key < r.End { + if key >= r.Start { + return pmaIterator{n, i}, pmaGapIterator{} + } + upper = i + } else { + lower = i + 1 + } + } + i := lower + if !n.hasChildren { + return pmaIterator{}, pmaGapIterator{n, i} + } + n = n.children[i] + } +} + +// FindSegment returns the segment whose range contains the given key. If no +// such segment exists, FindSegment returns a terminal iterator. +func (s *pmaSet) FindSegment(key __generics_imported0.Addr) pmaIterator { + seg, _ := s.Find(key) + return seg +} + +// LowerBoundSegment returns the segment with the lowest range that contains a +// key greater than or equal to min. If no such segment exists, +// LowerBoundSegment returns a terminal iterator. +func (s *pmaSet) LowerBoundSegment(min __generics_imported0.Addr) pmaIterator { + seg, gap := s.Find(min) + if seg.Ok() { + return seg + } + return gap.NextSegment() +} + +// UpperBoundSegment returns the segment with the highest range that contains a +// key less than or equal to max. If no such segment exists, UpperBoundSegment +// returns a terminal iterator. +func (s *pmaSet) UpperBoundSegment(max __generics_imported0.Addr) pmaIterator { + seg, gap := s.Find(max) + if seg.Ok() { + return seg + } + return gap.PrevSegment() +} + +// FindGap returns the gap containing the given key. If no such gap exists +// (i.e. the set contains a segment containing that key), FindGap returns a +// terminal iterator. +func (s *pmaSet) FindGap(key __generics_imported0.Addr) pmaGapIterator { + _, gap := s.Find(key) + return gap +} + +// LowerBoundGap returns the gap with the lowest range that is greater than or +// equal to min. +func (s *pmaSet) LowerBoundGap(min __generics_imported0.Addr) pmaGapIterator { + seg, gap := s.Find(min) + if gap.Ok() { + return gap + } + return seg.NextGap() +} + +// UpperBoundGap returns the gap with the highest range that is less than or +// equal to max. +func (s *pmaSet) UpperBoundGap(max __generics_imported0.Addr) pmaGapIterator { + seg, gap := s.Find(max) + if gap.Ok() { + return gap + } + return seg.PrevGap() +} + +// Add inserts the given segment into the set and returns true. If the new +// segment can be merged with adjacent segments, Add will do so. If the new +// segment would overlap an existing segment, Add returns false. If Add +// succeeds, all existing iterators are invalidated. +func (s *pmaSet) Add(r __generics_imported0.AddrRange, val pma) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.Insert(gap, r, val) + return true +} + +// AddWithoutMerging inserts the given segment into the set and returns true. +// If it would overlap an existing segment, AddWithoutMerging does nothing and +// returns false. If AddWithoutMerging succeeds, all existing iterators are +// invalidated. +func (s *pmaSet) AddWithoutMerging(r __generics_imported0.AddrRange, val pma) bool { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + gap := s.FindGap(r.Start) + if !gap.Ok() { + return false + } + if r.End > gap.End() { + return false + } + s.InsertWithoutMergingUnchecked(gap, r, val) + return true +} + +// Insert inserts the given segment into the given gap. 
If the new segment can be merged with adjacent segments, Insert will do so.
+// Insert returns an iterator to the segment containing the inserted value
+// (which may have been merged with other values). All existing iterators
+// (including gap, but not including the returned iterator) are invalidated.
+//
+// If the gap cannot accommodate the segment, or if r is invalid, Insert panics.
+//
+// Insert is semantically equivalent to an InsertWithoutMerging followed by a
+// Merge, but may be more efficient. Note that there is no unchecked variant of
+// Insert since Insert must retrieve and inspect gap's predecessor and
+// successor segments regardless.
+func (s *pmaSet) Insert(gap pmaGapIterator, r __generics_imported0.AddrRange, val pma) pmaIterator {
+	if r.Length() <= 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	prev, next := gap.PrevSegment(), gap.NextSegment()
+	if prev.Ok() && prev.End() > r.Start {
+		panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range()))
+	}
+	if next.Ok() && next.Start() < r.End {
+		panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range()))
+	}
+	if prev.Ok() && prev.End() == r.Start {
+		if mval, ok := (pmaSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok {
+			prev.SetEndUnchecked(r.End)
+			prev.SetValue(mval)
+			if next.Ok() && next.Start() == r.End {
+				val = mval
+				if mval, ok := (pmaSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok {
+					prev.SetEndUnchecked(next.End())
+					prev.SetValue(mval)
+					return s.Remove(next).PrevSegment()
+				}
+			}
+			return prev
+		}
+	}
+	if next.Ok() && next.Start() == r.End {
+		if mval, ok := (pmaSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok {
+			next.SetStartUnchecked(r.Start)
+			next.SetValue(mval)
+			return next
+		}
+	}
+	return s.InsertWithoutMergingUnchecked(gap, r, val)
+}
+
+// InsertWithoutMerging inserts the given segment into the given gap and
+// returns an iterator to the inserted segment. All existing iterators
+// (including gap, but not including the returned iterator) are invalidated.
+//
+// If the gap cannot accommodate the segment, or if r is invalid,
+// InsertWithoutMerging panics.
+func (s *pmaSet) InsertWithoutMerging(gap pmaGapIterator, r __generics_imported0.AddrRange, val pma) pmaIterator {
+	if r.Length() <= 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	if gr := gap.Range(); !gr.IsSupersetOf(r) {
+		panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr))
+	}
+	return s.InsertWithoutMergingUnchecked(gap, r, val)
+}
+
+// InsertWithoutMergingUnchecked inserts the given segment into the given gap
+// and returns an iterator to the inserted segment. All existing iterators
+// (including gap, but not including the returned iterator) are invalidated.
+//
+// Preconditions: r.Start >= gap.Start(); r.End <= gap.End().
+func (s *pmaSet) InsertWithoutMergingUnchecked(gap pmaGapIterator, r __generics_imported0.AddrRange, val pma) pmaIterator {
+	gap = gap.node.rebalanceBeforeInsert(gap)
+	copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments])
+	copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments])
+	gap.node.keys[gap.index] = r
+	gap.node.values[gap.index] = val
+	gap.node.nrSegments++
+	return pmaIterator{gap.node, gap.index}
+}
+
+// Remove removes the given segment and returns an iterator to the vacated gap.
+// All existing iterators (including seg, but not including the returned
+// iterator) are invalidated.
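+//
+// For illustration (a hypothetical caller, not part of this file): a caller
+// can replace a segment's value under a new range by removing it and reusing
+// the vacated gap, where addr, newRange, and newVal are assumed names:
+//
+//	if seg := s.FindSegment(addr); seg.Ok() {
+//		gap := s.Remove(seg)
+//		s.Insert(gap, newRange, newVal) // newRange must lie within gap
+//	}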
+func (s *pmaSet) Remove(seg pmaIterator) pmaGapIterator {
+
+	if seg.node.hasChildren {
+
+		victim := seg.PrevSegment()
+
+		seg.SetRangeUnchecked(victim.Range())
+		seg.SetValue(victim.Value())
+		return s.Remove(victim).NextGap()
+	}
+	copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments])
+	copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments])
+	pmaSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1])
+	seg.node.nrSegments--
+	return seg.node.rebalanceAfterRemove(pmaGapIterator{seg.node, seg.index})
+}
+
+// RemoveAll removes all segments from the set. All existing iterators are
+// invalidated.
+func (s *pmaSet) RemoveAll() {
+	s.root = pmanode{}
+}
+
+// RemoveRange removes all segments in the given range. An iterator to the
+// newly formed gap is returned, and all existing iterators are invalidated.
+func (s *pmaSet) RemoveRange(r __generics_imported0.AddrRange) pmaGapIterator {
+	seg, gap := s.Find(r.Start)
+	if seg.Ok() {
+		seg = s.Isolate(seg, r)
+		gap = s.Remove(seg)
+	}
+	for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() {
+		seg = s.Isolate(seg, r)
+		gap = s.Remove(seg)
+	}
+	return gap
+}
+
+// Merge attempts to merge two neighboring segments. If successful, Merge
+// returns an iterator to the merged segment, and all existing iterators are
+// invalidated. Otherwise, Merge returns a terminal iterator.
+//
+// If first is not the predecessor of second, Merge panics.
+func (s *pmaSet) Merge(first, second pmaIterator) pmaIterator {
+	if first.NextSegment() != second {
+		panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range()))
+	}
+	return s.MergeUnchecked(first, second)
+}
+
+// MergeUnchecked attempts to merge two neighboring segments. If successful,
+// MergeUnchecked returns an iterator to the merged segment, and all existing
+// iterators are invalidated. Otherwise, MergeUnchecked returns a terminal
+// iterator.
+//
+// Precondition: first is the predecessor of second: first.NextSegment() ==
+// second, first == second.PrevSegment().
+func (s *pmaSet) MergeUnchecked(first, second pmaIterator) pmaIterator {
+	if first.End() == second.Start() {
+		if mval, ok := (pmaSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok {
+
+			first.SetEndUnchecked(second.End())
+			first.SetValue(mval)
+			return s.Remove(second).PrevSegment()
+		}
+	}
+	return pmaIterator{}
+}
+
+// MergeAll attempts to merge all adjacent segments in the set. All existing
+// iterators are invalidated.
+func (s *pmaSet) MergeAll() {
+	seg := s.FirstSegment()
+	if !seg.Ok() {
+		return
+	}
+	next := seg.NextSegment()
+	for next.Ok() {
+		if mseg := s.MergeUnchecked(seg, next); mseg.Ok() {
+			seg, next = mseg, mseg.NextSegment()
+		} else {
+			seg, next = next, next.NextSegment()
+		}
+	}
+}
+
+// MergeRange attempts to merge all adjacent segments that contain a key in the
+// given range. All existing iterators are invalidated.
+func (s *pmaSet) MergeRange(r __generics_imported0.AddrRange) {
+	seg := s.LowerBoundSegment(r.Start)
+	if !seg.Ok() {
+		return
+	}
+	next := seg.NextSegment()
+	for next.Ok() && next.Range().Start < r.End {
+		if mseg := s.MergeUnchecked(seg, next); mseg.Ok() {
+			seg, next = mseg, mseg.NextSegment()
+		} else {
+			seg, next = next, next.NextSegment()
+		}
+	}
+}
+
+// MergeAdjacent attempts to merge the segment containing r.Start with its
+// predecessor, and the segment containing r.End-1 with its successor.
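+//
+// For illustration (a hypothetical caller, not part of this file): after
+// isolating and mutating the segments covering a range r, MergeAdjacent
+// restores merging at r's boundaries:
+//
+//	seg := s.Isolate(s.FindSegment(r.Start), r)
+//	seg.SetValue(newVal) // newVal is an assumed value
+//	s.MergeAdjacent(r)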
+func (s *pmaSet) MergeAdjacent(r __generics_imported0.AddrRange) {
+	first := s.FindSegment(r.Start)
+	if first.Ok() {
+		if prev := first.PrevSegment(); prev.Ok() {
+			s.Merge(prev, first)
+		}
+	}
+	last := s.FindSegment(r.End - 1)
+	if last.Ok() {
+		if next := last.NextSegment(); next.Ok() {
+			s.Merge(last, next)
+		}
+	}
+}
+
+// Split splits the given segment at the given key and returns iterators to the
+// two resulting segments. All existing iterators (including seg, but not
+// including the returned iterators) are invalidated.
+//
+// If the segment cannot be split at split (because split is at the start or
+// end of the segment's range, so splitting would produce a segment with zero
+// length, or because split falls outside the segment's range altogether),
+// Split panics.
+func (s *pmaSet) Split(seg pmaIterator, split __generics_imported0.Addr) (pmaIterator, pmaIterator) {
+	if !seg.Range().CanSplitAt(split) {
+		panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split))
+	}
+	return s.SplitUnchecked(seg, split)
+}
+
+// SplitUnchecked splits the given segment at the given key and returns
+// iterators to the two resulting segments. All existing iterators (including
+// seg, but not including the returned iterators) are invalidated.
+//
+// Preconditions: seg.Start() < split < seg.End().
+func (s *pmaSet) SplitUnchecked(seg pmaIterator, split __generics_imported0.Addr) (pmaIterator, pmaIterator) {
+	val1, val2 := (pmaSetFunctions{}).Split(seg.Range(), seg.Value(), split)
+	end2 := seg.End()
+	seg.SetEndUnchecked(split)
+	seg.SetValue(val1)
+	seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.AddrRange{split, end2}, val2)
+
+	return seg2.PrevSegment(), seg2
+}
+
+// SplitAt splits the segment straddling split, if one exists. SplitAt returns
+// true if a segment was split and false otherwise. If SplitAt splits a
+// segment, all existing iterators are invalidated.
+func (s *pmaSet) SplitAt(split __generics_imported0.Addr) bool {
+	if seg := s.FindSegment(split); seg.Ok() && seg.Range().CanSplitAt(split) {
+		s.SplitUnchecked(seg, split)
+		return true
+	}
+	return false
+}
+
+// Isolate ensures that the given segment's range does not escape r by
+// splitting at r.Start and r.End if necessary, and returns an updated iterator
+// to the bounded segment. All existing iterators (including seg, but not
+// including the returned iterator) are invalidated.
+func (s *pmaSet) Isolate(seg pmaIterator, r __generics_imported0.AddrRange) pmaIterator {
+	if seg.Range().CanSplitAt(r.Start) {
+		_, seg = s.SplitUnchecked(seg, r.Start)
+	}
+	if seg.Range().CanSplitAt(r.End) {
+		seg, _ = s.SplitUnchecked(seg, r.End)
+	}
+	return seg
+}
+
+// ApplyContiguous applies a function to a contiguous range of segments,
+// splitting if necessary. The function is applied until the first gap is
+// encountered, at which point the gap is returned. If the function is applied
+// across the entire range, a terminal gap is returned. All existing iterators
+// are invalidated.
+//
+// N.B. The Iterator must not be invalidated by the function.
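+//
+// For illustration (a hypothetical caller, not part of this file):
+//
+//	gap := s.ApplyContiguous(ar, func(seg pmaIterator) {
+//		// Mutate seg.ValuePtr() in place; seg has been isolated to ar.
+//	})
+//	if gap.Ok() {
+//		// ar was not fully covered by segments; gap is the first hole.
+//	}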
+func (s *pmaSet) ApplyContiguous(r __generics_imported0.AddrRange, fn func(seg pmaIterator)) pmaGapIterator { + seg, gap := s.Find(r.Start) + if !seg.Ok() { + return gap + } + for { + seg = s.Isolate(seg, r) + fn(seg) + if seg.End() >= r.End { + return pmaGapIterator{} + } + gap = seg.NextGap() + if !gap.IsEmpty() { + return gap + } + seg = gap.NextSegment() + if !seg.Ok() { + + return pmaGapIterator{} + } + } +} + +// +stateify savable +type pmanode struct { + // An internal binary tree node looks like: + // + // K + // / \ + // Cl Cr + // + // where all keys in the subtree rooted by Cl (the left subtree) are less + // than K (the key of the parent node), and all keys in the subtree rooted + // by Cr (the right subtree) are greater than K. + // + // An internal B-tree node's indexes work out to look like: + // + // K0 K1 K2 ... Kn-1 + // / \/ \/ \ ... / \ + // C0 C1 C2 C3 ... Cn-1 Cn + // + // where n is nrSegments. + nrSegments int + + // parent is a pointer to this node's parent. If this node is root, parent + // is nil. + parent *pmanode + + // parentIndex is the index of this node in parent.children. + parentIndex int + + // Flag for internal nodes that is technically redundant with "children[0] + // != nil", but is stored in the first cache line. "hasChildren" rather + // than "isLeaf" because false must be the correct value for an empty root. + hasChildren bool + + // Nodes store keys and values in separate arrays to maximize locality in + // the common case (scanning keys for lookup). + keys [pmamaxDegree - 1]__generics_imported0.AddrRange + values [pmamaxDegree - 1]pma + children [pmamaxDegree]*pmanode +} + +// firstSegment returns the first segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *pmanode) firstSegment() pmaIterator { + for n.hasChildren { + n = n.children[0] + } + return pmaIterator{n, 0} +} + +// lastSegment returns the last segment in the subtree rooted by n. +// +// Preconditions: n.nrSegments != 0. +func (n *pmanode) lastSegment() pmaIterator { + for n.hasChildren { + n = n.children[n.nrSegments] + } + return pmaIterator{n, n.nrSegments - 1} +} + +func (n *pmanode) prevSibling() *pmanode { + if n.parent == nil || n.parentIndex == 0 { + return nil + } + return n.parent.children[n.parentIndex-1] +} + +func (n *pmanode) nextSibling() *pmanode { + if n.parent == nil || n.parentIndex == n.parent.nrSegments { + return nil + } + return n.parent.children[n.parentIndex+1] +} + +// rebalanceBeforeInsert splits n and its ancestors if they are full, as +// required for insertion, and returns an updated iterator to the position +// represented by gap. 
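+//
+// For illustration (shapes only): with minimum degree d (pmaminDegree), a
+// full node holds 2d-1 keys K0 ... K2d-2. Splitting promotes the median key
+// Kd-1 into the parent and divides the remainder between two new nodes:
+//
+//	K0 ... Kd-2  Kd-1  Kd ... K2d-2   =>        Kd-1
+//	                                           /    \
+//	                                  K0 ... Kd-2    Kd ... K2d-2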
+func (n *pmanode) rebalanceBeforeInsert(gap pmaGapIterator) pmaGapIterator { + if n.parent != nil { + gap = n.parent.rebalanceBeforeInsert(gap) + } + if n.nrSegments < pmamaxDegree-1 { + return gap + } + if n.parent == nil { + + left := &pmanode{ + nrSegments: pmaminDegree - 1, + parent: n, + parentIndex: 0, + hasChildren: n.hasChildren, + } + right := &pmanode{ + nrSegments: pmaminDegree - 1, + parent: n, + parentIndex: 1, + hasChildren: n.hasChildren, + } + copy(left.keys[:pmaminDegree-1], n.keys[:pmaminDegree-1]) + copy(left.values[:pmaminDegree-1], n.values[:pmaminDegree-1]) + copy(right.keys[:pmaminDegree-1], n.keys[pmaminDegree:]) + copy(right.values[:pmaminDegree-1], n.values[pmaminDegree:]) + n.keys[0], n.values[0] = n.keys[pmaminDegree-1], n.values[pmaminDegree-1] + pmazeroValueSlice(n.values[1:]) + if n.hasChildren { + copy(left.children[:pmaminDegree], n.children[:pmaminDegree]) + copy(right.children[:pmaminDegree], n.children[pmaminDegree:]) + pmazeroNodeSlice(n.children[2:]) + for i := 0; i < pmaminDegree; i++ { + left.children[i].parent = left + left.children[i].parentIndex = i + right.children[i].parent = right + right.children[i].parentIndex = i + } + } + n.nrSegments = 1 + n.hasChildren = true + n.children[0] = left + n.children[1] = right + if gap.node != n { + return gap + } + if gap.index < pmaminDegree { + return pmaGapIterator{left, gap.index} + } + return pmaGapIterator{right, gap.index - pmaminDegree} + } + + copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments]) + copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments]) + n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[pmaminDegree-1], n.values[pmaminDegree-1] + copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1]) + for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ { + n.parent.children[i].parentIndex = i + } + sibling := &pmanode{ + nrSegments: pmaminDegree - 1, + parent: n.parent, + parentIndex: n.parentIndex + 1, + hasChildren: n.hasChildren, + } + n.parent.children[n.parentIndex+1] = sibling + n.parent.nrSegments++ + copy(sibling.keys[:pmaminDegree-1], n.keys[pmaminDegree:]) + copy(sibling.values[:pmaminDegree-1], n.values[pmaminDegree:]) + pmazeroValueSlice(n.values[pmaminDegree-1:]) + if n.hasChildren { + copy(sibling.children[:pmaminDegree], n.children[pmaminDegree:]) + pmazeroNodeSlice(n.children[pmaminDegree:]) + for i := 0; i < pmaminDegree; i++ { + sibling.children[i].parent = sibling + sibling.children[i].parentIndex = i + } + } + n.nrSegments = pmaminDegree - 1 + + if gap.node != n { + return gap + } + if gap.index < pmaminDegree { + return gap + } + return pmaGapIterator{sibling, gap.index - pmaminDegree} +} + +// rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient +// (contain fewer segments than required by B-tree invariants), as required for +// removal, and returns an updated iterator to the position represented by gap. +// +// Precondition: n is the only node in the tree that may currently violate a +// B-tree invariant. 
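+//
+// Rebalancing first tries to borrow a segment from an adjacent sibling that
+// has segments to spare (a rotation through the parent); failing that, it
+// merges the deficient node with a sibling and the segment separating them,
+// then repeats one level up.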
+func (n *pmanode) rebalanceAfterRemove(gap pmaGapIterator) pmaGapIterator { + for { + if n.nrSegments >= pmaminDegree-1 { + return gap + } + if n.parent == nil { + + return gap + } + + if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= pmaminDegree { + copy(n.keys[1:], n.keys[:n.nrSegments]) + copy(n.values[1:], n.values[:n.nrSegments]) + n.keys[0] = n.parent.keys[n.parentIndex-1] + n.values[0] = n.parent.values[n.parentIndex-1] + n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1] + n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1] + pmaSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) + if n.hasChildren { + copy(n.children[1:], n.children[:n.nrSegments+1]) + n.children[0] = sibling.children[sibling.nrSegments] + sibling.children[sibling.nrSegments] = nil + n.children[0].parent = n + n.children[0].parentIndex = 0 + for i := 1; i < n.nrSegments+2; i++ { + n.children[i].parentIndex = i + } + } + n.nrSegments++ + sibling.nrSegments-- + if gap.node == sibling && gap.index == sibling.nrSegments { + return pmaGapIterator{n, 0} + } + if gap.node == n { + return pmaGapIterator{n, gap.index + 1} + } + return gap + } + if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= pmaminDegree { + n.keys[n.nrSegments] = n.parent.keys[n.parentIndex] + n.values[n.nrSegments] = n.parent.values[n.parentIndex] + n.parent.keys[n.parentIndex] = sibling.keys[0] + n.parent.values[n.parentIndex] = sibling.values[0] + copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:]) + copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:]) + pmaSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1]) + if n.hasChildren { + n.children[n.nrSegments+1] = sibling.children[0] + copy(sibling.children[:sibling.nrSegments], sibling.children[1:]) + sibling.children[sibling.nrSegments] = nil + n.children[n.nrSegments+1].parent = n + n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1 + for i := 0; i < sibling.nrSegments; i++ { + sibling.children[i].parentIndex = i + } + } + n.nrSegments++ + sibling.nrSegments-- + if gap.node == sibling { + if gap.index == 0 { + return pmaGapIterator{n, n.nrSegments} + } + return pmaGapIterator{sibling, gap.index - 1} + } + return gap + } + + p := n.parent + if p.nrSegments == 1 { + + left, right := p.children[0], p.children[1] + p.nrSegments = left.nrSegments + right.nrSegments + 1 + p.hasChildren = left.hasChildren + p.keys[left.nrSegments] = p.keys[0] + p.values[left.nrSegments] = p.values[0] + copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments]) + copy(p.values[:left.nrSegments], left.values[:left.nrSegments]) + copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments]) + copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments]) + if left.hasChildren { + copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1]) + copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1]) + for i := 0; i < p.nrSegments+1; i++ { + p.children[i].parent = p + p.children[i].parentIndex = i + } + } else { + p.children[0] = nil + p.children[1] = nil + } + if gap.node == left { + return pmaGapIterator{p, gap.index} + } + if gap.node == right { + return pmaGapIterator{p, gap.index + left.nrSegments + 1} + } + return gap + } + // Merge n and either sibling, along with the segment separating the + // two, into whichever of the two nodes comes first. This is the + // reverse of the non-root splitting case in + // node.rebalanceBeforeInsert. 
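+		//
+		// For illustration (shapes only), where S is the separating
+		// segment in the parent:
+		//
+		//	   ... S ...               ...
+		//	  /       \        =>       |
+		//	left     right       left + S + right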
+		var left, right *pmanode
+		if n.parentIndex > 0 {
+			left = n.prevSibling()
+			right = n
+		} else {
+			left = n
+			right = n.nextSibling()
+		}
+
+		if gap.node == right {
+			gap = pmaGapIterator{left, gap.index + left.nrSegments + 1}
+		}
+		left.keys[left.nrSegments] = p.keys[left.parentIndex]
+		left.values[left.nrSegments] = p.values[left.parentIndex]
+		copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
+		copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments])
+		if left.hasChildren {
+			copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
+			for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ {
+				left.children[i].parent = left
+				left.children[i].parentIndex = i
+			}
+		}
+		left.nrSegments += right.nrSegments + 1
+		copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments])
+		copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments])
+		pmaSetFunctions{}.ClearValue(&p.values[p.nrSegments-1])
+		copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1])
+		for i := 0; i < p.nrSegments; i++ {
+			p.children[i].parentIndex = i
+		}
+		p.children[p.nrSegments] = nil
+		p.nrSegments--
+
+		n = p
+	}
+}
+
+// An Iterator is conceptually one of:
+//
+// - A pointer to a segment in a set; or
+//
+// - A terminal iterator, which is a sentinel indicating that the end of
+// iteration has been reached.
+//
+// Iterators are copyable values and are meaningfully equality-comparable. The
+// zero value of Iterator is a terminal iterator.
+//
+// Unless otherwise specified, any mutation of a set invalidates all existing
+// iterators into the set.
+type pmaIterator struct {
+	// node is the node containing the iterated segment. If the iterator is
+	// terminal, node is nil.
+	node *pmanode
+
+	// index is the index of the segment in node.keys/values.
+	index int
+}
+
+// Ok returns true if the iterator is not terminal. All other methods are only
+// valid for non-terminal iterators.
+func (seg pmaIterator) Ok() bool {
+	return seg.node != nil
+}
+
+// Range returns the iterated segment's range key.
+func (seg pmaIterator) Range() __generics_imported0.AddrRange {
+	return seg.node.keys[seg.index]
+}
+
+// Start is equivalent to Range().Start, but should be preferred if only the
+// start of the range is needed.
+func (seg pmaIterator) Start() __generics_imported0.Addr {
+	return seg.node.keys[seg.index].Start
+}
+
+// End is equivalent to Range().End, but should be preferred if only the end of
+// the range is needed.
+func (seg pmaIterator) End() __generics_imported0.Addr {
+	return seg.node.keys[seg.index].End
+}
+
+// SetRangeUnchecked mutates the iterated segment's range key. This operation
+// does not invalidate any iterators.
+//
+// Preconditions:
+//
+// - r.Length() > 0.
+//
+// - The new range must not overlap an existing one: if seg.NextSegment().Ok(),
+// then r.End <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then
+// r.Start >= seg.PrevSegment().End().
+func (seg pmaIterator) SetRangeUnchecked(r __generics_imported0.AddrRange) {
+	seg.node.keys[seg.index] = r
+}
+
+// SetRange mutates the iterated segment's range key. If the new range would
+// cause the iterated segment to overlap another segment, or if the new range
+// is invalid, SetRange panics. This operation does not invalidate any
+// iterators.
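+//
+// For illustration (a hypothetical caller, not part of this file): growing a
+// segment by one page into a gap known to be free:
+//
+//	r := seg.Range()
+//	seg.SetRange(__generics_imported0.AddrRange{r.Start, r.End + 4096})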
+func (seg pmaIterator) SetRange(r __generics_imported0.AddrRange) { + if r.Length() <= 0 { + panic(fmt.Sprintf("invalid segment range %v", r)) + } + if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() { + panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range())) + } + if next := seg.NextSegment(); next.Ok() && r.End > next.Start() { + panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range())) + } + seg.SetRangeUnchecked(r) +} + +// SetStartUnchecked mutates the iterated segment's start. This operation does +// not invalidate any iterators. +// +// Preconditions: The new start must be valid: start < seg.End(); if +// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End(). +func (seg pmaIterator) SetStartUnchecked(start __generics_imported0.Addr) { + seg.node.keys[seg.index].Start = start +} + +// SetStart mutates the iterated segment's start. If the new start value would +// cause the iterated segment to overlap another segment, or would result in an +// invalid range, SetStart panics. This operation does not invalidate any +// iterators. +func (seg pmaIterator) SetStart(start __generics_imported0.Addr) { + if start >= seg.End() { + panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range())) + } + if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() { + panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range())) + } + seg.SetStartUnchecked(start) +} + +// SetEndUnchecked mutates the iterated segment's end. This operation does not +// invalidate any iterators. +// +// Preconditions: The new end must be valid: end > seg.Start(); if +// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start(). +func (seg pmaIterator) SetEndUnchecked(end __generics_imported0.Addr) { + seg.node.keys[seg.index].End = end +} + +// SetEnd mutates the iterated segment's end. If the new end value would cause +// the iterated segment to overlap another segment, or would result in an +// invalid range, SetEnd panics. This operation does not invalidate any +// iterators. +func (seg pmaIterator) SetEnd(end __generics_imported0.Addr) { + if end <= seg.Start() { + panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range())) + } + if next := seg.NextSegment(); next.Ok() && end > next.Start() { + panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range())) + } + seg.SetEndUnchecked(end) +} + +// Value returns a copy of the iterated segment's value. +func (seg pmaIterator) Value() pma { + return seg.node.values[seg.index] +} + +// ValuePtr returns a pointer to the iterated segment's value. The pointer is +// invalidated if the iterator is invalidated. This operation does not +// invalidate any iterators. +func (seg pmaIterator) ValuePtr() *pma { + return &seg.node.values[seg.index] +} + +// SetValue mutates the iterated segment's value. This operation does not +// invalidate any iterators. +func (seg pmaIterator) SetValue(val pma) { + seg.node.values[seg.index] = val +} + +// PrevSegment returns the iterated segment's predecessor. If there is no +// preceding segment, PrevSegment returns a terminal iterator. 
+func (seg pmaIterator) PrevSegment() pmaIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index].lastSegment() + } + if seg.index > 0 { + return pmaIterator{seg.node, seg.index - 1} + } + if seg.node.parent == nil { + return pmaIterator{} + } + return pmasegmentBeforePosition(seg.node.parent, seg.node.parentIndex) +} + +// NextSegment returns the iterated segment's successor. If there is no +// succeeding segment, NextSegment returns a terminal iterator. +func (seg pmaIterator) NextSegment() pmaIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment() + } + if seg.index < seg.node.nrSegments-1 { + return pmaIterator{seg.node, seg.index + 1} + } + if seg.node.parent == nil { + return pmaIterator{} + } + return pmasegmentAfterPosition(seg.node.parent, seg.node.parentIndex) +} + +// PrevGap returns the gap immediately before the iterated segment. +func (seg pmaIterator) PrevGap() pmaGapIterator { + if seg.node.hasChildren { + + return seg.node.children[seg.index].lastSegment().NextGap() + } + return pmaGapIterator{seg.node, seg.index} +} + +// NextGap returns the gap immediately after the iterated segment. +func (seg pmaIterator) NextGap() pmaGapIterator { + if seg.node.hasChildren { + return seg.node.children[seg.index+1].firstSegment().PrevGap() + } + return pmaGapIterator{seg.node, seg.index + 1} +} + +// PrevNonEmpty returns the iterated segment's predecessor if it is adjacent, +// or the gap before the iterated segment otherwise. If seg.Start() == +// Functions.MinKey(), PrevNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by PrevNonEmpty will be +// non-terminal. +func (seg pmaIterator) PrevNonEmpty() (pmaIterator, pmaGapIterator) { + gap := seg.PrevGap() + if gap.Range().Length() != 0 { + return pmaIterator{}, gap + } + return gap.PrevSegment(), pmaGapIterator{} +} + +// NextNonEmpty returns the iterated segment's successor if it is adjacent, or +// the gap after the iterated segment otherwise. If seg.End() == +// Functions.MaxKey(), NextNonEmpty will return two terminal iterators. +// Otherwise, exactly one of the iterators returned by NextNonEmpty will be +// non-terminal. +func (seg pmaIterator) NextNonEmpty() (pmaIterator, pmaGapIterator) { + gap := seg.NextGap() + if gap.Range().Length() != 0 { + return pmaIterator{}, gap + } + return gap.NextSegment(), pmaGapIterator{} +} + +// A GapIterator is conceptually one of: +// +// - A pointer to a position between two segments, before the first segment, or +// after the last segment in a set, called a *gap*; or +// +// - A terminal iterator, which is a sentinel indicating that the end of +// iteration has been reached. +// +// Note that the gap between two adjacent segments exists (iterators to it are +// non-terminal), but has a length of zero. GapIterator.IsEmpty returns true +// for such gaps. An empty set contains a single gap, spanning the entire range +// of the set's keys. +// +// GapIterators are copyable values and are meaningfully equality-comparable. +// The zero value of GapIterator is a terminal iterator. +// +// Unless otherwise specified, any mutation of a set invalidates all existing +// iterators into the set. +type pmaGapIterator struct { + // The representation of a GapIterator is identical to that of an Iterator, + // except that index corresponds to positions between segments in the same + // way as for node.children (see comment for node.nrSegments). 
+	node  *pmanode
+	index int
+}
+
+// Ok returns true if the iterator is not terminal. All other methods are only
+// valid for non-terminal iterators.
+func (gap pmaGapIterator) Ok() bool {
+	return gap.node != nil
+}
+
+// Range returns the range spanned by the iterated gap.
+func (gap pmaGapIterator) Range() __generics_imported0.AddrRange {
+	return __generics_imported0.AddrRange{gap.Start(), gap.End()}
+}
+
+// Start is equivalent to Range().Start, but should be preferred if only the
+// start of the range is needed.
+func (gap pmaGapIterator) Start() __generics_imported0.Addr {
+	if ps := gap.PrevSegment(); ps.Ok() {
+		return ps.End()
+	}
+	return pmaSetFunctions{}.MinKey()
+}
+
+// End is equivalent to Range().End, but should be preferred if only the end of
+// the range is needed.
+func (gap pmaGapIterator) End() __generics_imported0.Addr {
+	if ns := gap.NextSegment(); ns.Ok() {
+		return ns.Start()
+	}
+	return pmaSetFunctions{}.MaxKey()
+}
+
+// IsEmpty returns true if the iterated gap is empty (that is, the "gap" is
+// between two adjacent segments).
+func (gap pmaGapIterator) IsEmpty() bool {
+	return gap.Range().Length() == 0
+}
+
+// PrevSegment returns the segment immediately before the iterated gap. If no
+// such segment exists, PrevSegment returns a terminal iterator.
+func (gap pmaGapIterator) PrevSegment() pmaIterator {
+	return pmasegmentBeforePosition(gap.node, gap.index)
+}
+
+// NextSegment returns the segment immediately after the iterated gap. If no
+// such segment exists, NextSegment returns a terminal iterator.
+func (gap pmaGapIterator) NextSegment() pmaIterator {
+	return pmasegmentAfterPosition(gap.node, gap.index)
+}
+
+// PrevGap returns the iterated gap's predecessor. If no such gap exists,
+// PrevGap returns a terminal iterator.
+func (gap pmaGapIterator) PrevGap() pmaGapIterator {
+	seg := gap.PrevSegment()
+	if !seg.Ok() {
+		return pmaGapIterator{}
+	}
+	return seg.PrevGap()
+}
+
+// NextGap returns the iterated gap's successor. If no such gap exists, NextGap
+// returns a terminal iterator.
+func (gap pmaGapIterator) NextGap() pmaGapIterator {
+	seg := gap.NextSegment()
+	if !seg.Ok() {
+		return pmaGapIterator{}
+	}
+	return seg.NextGap()
+}
+
+// segmentBeforePosition returns the predecessor segment of the position given
+// by n.children[i], which may or may not contain a child. If no such segment
+// exists, segmentBeforePosition returns a terminal iterator.
+func pmasegmentBeforePosition(n *pmanode, i int) pmaIterator {
+	for i == 0 {
+		if n.parent == nil {
+			return pmaIterator{}
+		}
+		n, i = n.parent, n.parentIndex
+	}
+	return pmaIterator{n, i - 1}
+}
+
+// segmentAfterPosition returns the successor segment of the position given by
+// n.children[i], which may or may not contain a child. If no such segment
+// exists, segmentAfterPosition returns a terminal iterator.
+func pmasegmentAfterPosition(n *pmanode, i int) pmaIterator {
+	for i == n.nrSegments {
+		if n.parent == nil {
+			return pmaIterator{}
+		}
+		n, i = n.parent, n.parentIndex
+	}
+	return pmaIterator{n, i}
+}
+
+func pmazeroValueSlice(slice []pma) {
+
+	for i := range slice {
+		pmaSetFunctions{}.ClearValue(&slice[i])
+	}
+}
+
+func pmazeroNodeSlice(slice []*pmanode) {
+	for i := range slice {
+		slice[i] = nil
+	}
+}
+
+// String stringifies a Set for debugging.
+func (s *pmaSet) String() string {
+	return s.root.String()
+}
+
+// String stringifies a node (and all of its children) for debugging.
+func (n *pmanode) String() string {
+	var buf bytes.Buffer
+	n.writeDebugString(&buf, "")
+	return buf.String()
+}
+
+func (n *pmanode) writeDebugString(buf *bytes.Buffer, prefix string) {
+	if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) {
+		buf.WriteString(prefix)
+		buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren))
+	}
+	for i := 0; i < n.nrSegments; i++ {
+		if child := n.children[i]; child != nil {
+			cprefix := fmt.Sprintf("%s- % 3d ", prefix, i)
+			if child.parent != n || child.parentIndex != i {
+				buf.WriteString(cprefix)
+				buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i))
+			}
+			child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i))
+		}
+		buf.WriteString(prefix)
+		buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i]))
+	}
+	if child := n.children[n.nrSegments]; child != nil {
+		child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments))
+	}
+}
+
+// SegmentDataSlices represents segments from a set as slices of start, end, and
+// values. SegmentDataSlices is primarily used as an intermediate representation
+// for save/restore and the layout here is optimized for that.
+//
+// +stateify savable
+type pmaSegmentDataSlices struct {
+	Start  []__generics_imported0.Addr
+	End    []__generics_imported0.Addr
+	Values []pma
+}
+
+// ExportSortedSlices returns a copy of all segments in the given set, in
+// ascending key order.
+func (s *pmaSet) ExportSortedSlices() *pmaSegmentDataSlices {
+	var sds pmaSegmentDataSlices
+	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+		sds.Start = append(sds.Start, seg.Start())
+		sds.End = append(sds.End, seg.End())
+		sds.Values = append(sds.Values, seg.Value())
+	}
+	sds.Start = sds.Start[:len(sds.Start):len(sds.Start)]
+	sds.End = sds.End[:len(sds.End):len(sds.End)]
+	sds.Values = sds.Values[:len(sds.Values):len(sds.Values)]
+	return &sds
+}
+
+// ImportSortedSlices initializes the given set from the given slices.
+//
+// Preconditions: s must be empty. sds must represent a valid set (the segments
+// in sds must have valid lengths that do not overlap). The segments in sds
+// must be sorted in ascending key order.
+func (s *pmaSet) ImportSortedSlices(sds *pmaSegmentDataSlices) error {
+	if !s.IsEmpty() {
+		return fmt.Errorf("cannot import into non-empty set %v", s)
+	}
+	gap := s.FirstGap()
+	for i := range sds.Start {
+		r := __generics_imported0.AddrRange{sds.Start[i], sds.End[i]}
+		if !gap.Range().IsSupersetOf(r) {
+			return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i])
+		}
+		gap = s.InsertWithoutMerging(gap, r, sds.Values[i]).NextGap()
+	}
+	return nil
+}
+func (s *pmaSet) saveRoot() *pmaSegmentDataSlices {
+	return s.ExportSortedSlices()
+}
+
+func (s *pmaSet) loadRoot(sds *pmaSegmentDataSlices) {
+	if err := s.ImportSortedSlices(sds); err != nil {
+		panic(err)
+	}
+}
diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go
new file mode 100644
index 000000000..c8302a553
--- /dev/null
+++ b/pkg/sentry/mm/procfs.go
@@ -0,0 +1,289 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mm
+
+import (
+	"bytes"
+	"fmt"
+	"strings"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+const (
+	// devMinorBits is the number of minor bits in a device number. Linux:
+	// include/linux/kdev_t.h:MINORBITS
+	devMinorBits = 20
+
+	vsyscallEnd        = usermem.Addr(0xffffffffff601000)
+	vsyscallMapsEntry  = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"
+	vsyscallSmapsEntry = vsyscallMapsEntry +
+		"Size: 4 kB\n" +
+		"Rss: 0 kB\n" +
+		"Pss: 0 kB\n" +
+		"Shared_Clean: 0 kB\n" +
+		"Shared_Dirty: 0 kB\n" +
+		"Private_Clean: 0 kB\n" +
+		"Private_Dirty: 0 kB\n" +
+		"Referenced: 0 kB\n" +
+		"Anonymous: 0 kB\n" +
+		"AnonHugePages: 0 kB\n" +
+		"Shared_Hugetlb: 0 kB\n" +
+		"Private_Hugetlb: 0 kB\n" +
+		"Swap: 0 kB\n" +
+		"SwapPss: 0 kB\n" +
+		"KernelPageSize: 4 kB\n" +
+		"MMUPageSize: 4 kB\n" +
+		"Locked: 0 kB\n" +
+		"VmFlags: rd ex \n"
+)
+
+// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate.
+func (mm *MemoryManager) NeedsUpdate(generation int64) bool {
+	return true
+}
+
+// ReadMapsSeqFileData is called by fs/proc.mapsData.ReadSeqFileData to
+// implement /proc/[pid]/maps.
+func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
+	mm.mappingMu.RLock()
+	defer mm.mappingMu.RUnlock()
+	var data []seqfile.SeqData
+	var start usermem.Addr
+	if handle != nil {
+		start = *handle.(*usermem.Addr)
+	}
+	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+		// FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
+		// "panic: autosave error: type usermem.Addr is not registered".
+		vmaAddr := vseg.End()
+		data = append(data, seqfile.SeqData{
+			Buf:    mm.vmaMapsEntryLocked(ctx, vseg),
+			Handle: &vmaAddr,
+		})
+	}
+
+	// We always emulate vsyscall, so advertise it here. Everything about a
+	// vsyscall region is static, so just hard code the maps entry since we
+	// don't have a real vma backing it. The vsyscall region is at the end of
+	// the virtual address space, so nothing should be mapped after it (if
+	// something is really mapped in the tiny ~10 MiB segment afterwards, we'll
+	// get the sorting on the maps file wrong at worst; but that's not possible
+	// on any current platform).
+	//
+	// Artificially adjust the seqfile handle so that we only output the
+	// vsyscall entry once.
+	if start != vsyscallEnd {
+		// FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
+		vmaAddr := vsyscallEnd
+		data = append(data, seqfile.SeqData{
+			Buf:    []byte(vsyscallMapsEntry),
+			Handle: &vmaAddr,
+		})
+	}
+	return data, 1
+}
+
+// vmaMapsEntryLocked returns a /proc/[pid]/maps entry for the vma iterated by
+// vseg, including the trailing newline.
+//
+// Preconditions: mm.mappingMu must be locked.
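+//
+// Each entry follows the format of Linux's fs/proc/task_mmu.c:show_map_vma();
+// for example (illustrative values only):
+//
+//	00400000-00452000 r-xp 00000000 08:02 173521 /usr/bin/foo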
+func (mm *MemoryManager) vmaMapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte {
+	var b bytes.Buffer
+	mm.appendVMAMapsEntryLocked(ctx, vseg, &b)
+	return b.Bytes()
+}
+
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) {
+	vma := vseg.ValuePtr()
+	private := "p"
+	if !vma.private {
+		private = "s"
+	}
+
+	var dev, ino uint64
+	if vma.id != nil {
+		dev = vma.id.DeviceID()
+		ino = vma.id.InodeID()
+	}
+	devMajor := uint32(dev >> devMinorBits)
+	devMinor := uint32(dev & ((1 << devMinorBits) - 1))
+
+	// Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() =>
+	// stack_guard_page_start().
+	fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ",
+		vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino)
+
+	// Figure out our filename or hint.
+	var s string
+	if vma.hint != "" {
+		s = vma.hint
+	} else if vma.id != nil {
+		// FIXME(jamieliu): We are holding mm.mappingMu here, which is
+		// consistent with Linux's holding mmap_sem in
+		// fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path().
+		// However, it's not clear that fs.File.MappedName() is actually
+		// consistent with this lock order.
+		s = vma.id.MappedName(ctx)
+	}
+	if s != "" {
+		// Per Linux, we pad until the 74th character.
+		if pad := 73 - b.Len(); pad > 0 {
+			b.WriteString(strings.Repeat(" ", pad))
+		}
+		b.WriteString(s)
+	}
+	b.WriteString("\n")
+}
+
+// ReadSmapsSeqFileData is called by fs/proc.smapsData.ReadSeqFileData to
+// implement /proc/[pid]/smaps.
+func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
+	mm.mappingMu.RLock()
+	defer mm.mappingMu.RUnlock()
+	var data []seqfile.SeqData
+	var start usermem.Addr
+	if handle != nil {
+		start = *handle.(*usermem.Addr)
+	}
+	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+		// FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
+		// "panic: autosave error: type usermem.Addr is not registered".
+		vmaAddr := vseg.End()
+		data = append(data, seqfile.SeqData{
+			Buf:    mm.vmaSmapsEntryLocked(ctx, vseg),
+			Handle: &vmaAddr,
+		})
+	}
+
+	// We always emulate vsyscall, so advertise it here. See
+	// ReadMapsSeqFileData for additional commentary.
+	if start != vsyscallEnd {
+		// FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
+		vmaAddr := vsyscallEnd
+		data = append(data, seqfile.SeqData{
+			Buf:    []byte(vsyscallSmapsEntry),
+			Handle: &vmaAddr,
+		})
+	}
+	return data, 1
+}
+
+// vmaSmapsEntryLocked returns a /proc/[pid]/smaps entry for the vma iterated
+// by vseg, including the trailing newline.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte {
+	var b bytes.Buffer
+	mm.appendVMAMapsEntryLocked(ctx, vseg, &b)
+	vma := vseg.ValuePtr()
+
+	// We take mm.activeMu here in each call to vmaSmapsEntryLocked, instead of
+	// requiring it to be locked as a precondition, to reduce the latency
+	// impact of reading /proc/[pid]/smaps on concurrent performance-sensitive
+	// operations requiring activeMu for writing, like faults.
+ mm.activeMu.RLock() + var rss uint64 + var anon uint64 + vsegAR := vseg.Range() + for pseg := mm.pmas.LowerBoundSegment(vsegAR.Start); pseg.Ok() && pseg.Start() < vsegAR.End; pseg = pseg.NextSegment() { + psegAR := pseg.Range().Intersect(vsegAR) + size := uint64(psegAR.Length()) + rss += size + if pseg.ValuePtr().private { + anon += size + } + } + mm.activeMu.RUnlock() + + fmt.Fprintf(&b, "Size: %8d kB\n", vseg.Range().Length()/1024) + fmt.Fprintf(&b, "Rss: %8d kB\n", rss/1024) + // Currently we report PSS = RSS, i.e. we pretend each page mapped by a pma + // is only mapped by that pma. This avoids having to query memmap.Mappables + // for reference count information on each page. As a corollary, all pages + // are accounted as "private" whether or not the vma is private; compare + // Linux's fs/proc/task_mmu.c:smaps_account(). + fmt.Fprintf(&b, "Pss: %8d kB\n", rss/1024) + fmt.Fprintf(&b, "Shared_Clean: %8d kB\n", 0) + fmt.Fprintf(&b, "Shared_Dirty: %8d kB\n", 0) + // Pretend that all pages are dirty if the vma is writable, and clean otherwise. + clean := rss + if vma.effectivePerms.Write { + clean = 0 + } + fmt.Fprintf(&b, "Private_Clean: %8d kB\n", clean/1024) + fmt.Fprintf(&b, "Private_Dirty: %8d kB\n", (rss-clean)/1024) + // Pretend that all pages are "referenced" (recently touched). + fmt.Fprintf(&b, "Referenced: %8d kB\n", rss/1024) + fmt.Fprintf(&b, "Anonymous: %8d kB\n", anon/1024) + // Hugepages (hugetlb and THP) are not implemented. + fmt.Fprintf(&b, "AnonHugePages: %8d kB\n", 0) + fmt.Fprintf(&b, "Shared_Hugetlb: %8d kB\n", 0) + fmt.Fprintf(&b, "Private_Hugetlb: %7d kB\n", 0) + // Swap is not implemented. + fmt.Fprintf(&b, "Swap: %8d kB\n", 0) + fmt.Fprintf(&b, "SwapPss: %8d kB\n", 0) + fmt.Fprintf(&b, "KernelPageSize: %8d kB\n", usermem.PageSize/1024) + fmt.Fprintf(&b, "MMUPageSize: %8d kB\n", usermem.PageSize/1024) + locked := rss + if vma.mlockMode == memmap.MLockNone { + locked = 0 + } + fmt.Fprintf(&b, "Locked: %8d kB\n", locked/1024) + + b.WriteString("VmFlags: ") + if vma.realPerms.Read { + b.WriteString("rd ") + } + if vma.realPerms.Write { + b.WriteString("wr ") + } + if vma.realPerms.Execute { + b.WriteString("ex ") + } + if vma.canWriteMappableLocked() { // VM_SHARED + b.WriteString("sh ") + } + if vma.maxPerms.Read { + b.WriteString("mr ") + } + if vma.maxPerms.Write { + b.WriteString("mw ") + } + if vma.maxPerms.Execute { + b.WriteString("me ") + } + if !vma.private { // VM_MAYSHARE + b.WriteString("ms ") + } + if vma.growsDown { + b.WriteString("gd ") + } + if vma.mlockMode != memmap.MLockNone { // VM_LOCKED + b.WriteString("lo ") + } + if vma.mlockMode == memmap.MLockLazy { // VM_LOCKONFAULT + b.WriteString("?? ") // no explicit encoding in fs/proc/task_mmu.c:show_smap_vma_flags() + } + if vma.private && vma.effectivePerms.Write { // VM_ACCOUNT + b.WriteString("ac ") + } + b.WriteString("\n") + + return b.Bytes() +} diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go new file mode 100644 index 000000000..0385957bd --- /dev/null +++ b/pkg/sentry/mm/save_restore.go @@ -0,0 +1,57 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// InvalidateUnsavable invokes memmap.Mappable.InvalidateUnsavable on all +// Mappables mapped by mm. +func (mm *MemoryManager) InvalidateUnsavable(ctx context.Context) error { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + if vma := vseg.ValuePtr(); vma.mappable != nil { + if err := vma.mappable.InvalidateUnsavable(ctx); err != nil { + return err + } + } + } + return nil +} + +// beforeSave is invoked by stateify. +func (mm *MemoryManager) beforeSave() { + mf := mm.mfp.MemoryFile() + for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { + if pma := pseg.ValuePtr(); pma.file != mf { + // InvalidateUnsavable should have caused all such pmas to be + // invalidated. + panic(fmt.Sprintf("Can't save pma %#v with non-MemoryFile of type %T:\n%s", pseg.Range(), pma.file, mm)) + } + } +} + +// afterLoad is invoked by stateify. +func (mm *MemoryManager) afterLoad() { + mm.haveASIO = mm.p.SupportsAddressSpaceIO() + mf := mm.mfp.MemoryFile() + for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() { + pseg.ValuePtr().file = mf + } +} diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go new file mode 100644 index 000000000..12913007b --- /dev/null +++ b/pkg/sentry/mm/shm.go @@ -0,0 +1,66 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/shm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// DetachShm unmaps a sysv shared memory segment. +func (mm *MemoryManager) DetachShm(ctx context.Context, addr usermem.Addr) error { + if addr != addr.RoundDown() { + // "... shmaddr is not aligned on a page boundary." - man shmdt(2) + return syserror.EINVAL + } + + var detached *shm.Shm + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + + // Find and remove the first vma containing an address >= addr that maps a + // segment originally attached at addr. 
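+	// (mprotect or a partial munmap may have split the original attach into
+	// multiple vmas, so a vma from the attach is identified by its mappable
+	// and its offset relative to addr rather than by its exact bounds.)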
+ vseg := mm.vmas.LowerBoundSegment(addr) + for vseg.Ok() { + vma := vseg.ValuePtr() + if shm, ok := vma.mappable.(*shm.Shm); ok && vseg.Start() >= addr && uint64(vseg.Start()-addr) == vma.off { + detached = shm + vseg = mm.unmapLocked(ctx, vseg.Range()).NextSegment() + break + } else { + vseg = vseg.NextSegment() + } + } + + if detached == nil { + // There is no shared memory segment attached at addr. + return syserror.EINVAL + } + + // Remove all vmas that could have been created by the same attach. + end := addr + usermem.Addr(detached.EffectiveSize()) + for vseg.Ok() && vseg.End() <= end { + vma := vseg.ValuePtr() + if vma.mappable == detached && uint64(vseg.Start()-addr) == vma.off { + vseg = mm.unmapLocked(ctx, vseg.Range()).NextSegment() + } else { + vseg = vseg.NextSegment() + } + } + + return nil +} diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go new file mode 100644 index 000000000..687959005 --- /dev/null +++ b/pkg/sentry/mm/special_mappable.go @@ -0,0 +1,155 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// SpecialMappable implements memmap.MappingIdentity and memmap.Mappable with +// semantics similar to Linux's mm/mmap.c:_install_special_mapping(), except +// that SpecialMappable takes ownership of the memory that it represents +// (_install_special_mapping() does not.) +// +// +stateify savable +type SpecialMappable struct { + refs.AtomicRefCount + + mfp pgalloc.MemoryFileProvider + fr platform.FileRange + name string +} + +// NewSpecialMappable returns a SpecialMappable that owns fr, which represents +// offsets in mfp.MemoryFile() that contain the SpecialMappable's data. The +// SpecialMappable will use the given name in /proc/[pid]/maps. +// +// Preconditions: fr.Length() != 0. +func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable { + return &SpecialMappable{mfp: mfp, fr: fr, name: name} +} + +// DecRef implements refs.RefCounter.DecRef. +func (m *SpecialMappable) DecRef() { + m.AtomicRefCount.DecRefWithDestructor(func() { + m.mfp.MemoryFile().DecRef(m.fr) + }) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (m *SpecialMappable) MappedName(ctx context.Context) string { + return m.name +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (m *SpecialMappable) DeviceID() uint64 { + return 0 +} + +// InodeID implements memmap.MappingIdentity.InodeID. +func (m *SpecialMappable) InodeID() uint64 { + return 0 +} + +// Msync implements memmap.MappingIdentity.Msync. 
+func (m *SpecialMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { + // Linux: vm_file is NULL, causing msync to skip it entirely. + return nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (*SpecialMappable) AddMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) error { + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (*SpecialMappable) RemoveMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, uint64, bool) { +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (*SpecialMappable) CopyMapping(context.Context, memmap.MappingSpace, usermem.AddrRange, usermem.AddrRange, uint64, bool) error { + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (m *SpecialMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + var err error + if required.End > m.fr.Length() { + err = &memmap.BusError{syserror.EFAULT} + } + if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { + return []memmap.Translation{ + { + Source: source, + File: m.mfp.MemoryFile(), + Offset: m.fr.Start + source.Start, + Perms: usermem.AnyAccess, + }, + }, err + } + return nil, err +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (m *SpecialMappable) InvalidateUnsavable(ctx context.Context) error { + // Since data is stored in pgalloc.MemoryFile, the contents of which are + // preserved across save/restore, we don't need to do anything. + return nil +} + +// MemoryFileProvider returns the MemoryFileProvider whose MemoryFile stores +// the SpecialMappable's contents. +func (m *SpecialMappable) MemoryFileProvider() pgalloc.MemoryFileProvider { + return m.mfp +} + +// FileRange returns the offsets into MemoryFileProvider().MemoryFile() that +// store the SpecialMappable's contents. +func (m *SpecialMappable) FileRange() platform.FileRange { + return m.fr +} + +// Length returns the length of the SpecialMappable. +func (m *SpecialMappable) Length() uint64 { + return m.fr.Length() +} + +// NewSharedAnonMappable returns a SpecialMappable that implements the +// semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero. +// +// TODO(jamieliu): The use of SpecialMappable is a lazy code reuse hack. Linux +// uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should +// do the same to get non-zero device and inode IDs. +func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) { + if length == 0 { + return nil, syserror.EINVAL + } + alignedLen, ok := usermem.Addr(length).RoundUp() + if !ok { + return nil, syserror.EINVAL + } + fr, err := mfp.MemoryFile().Allocate(uint64(alignedLen), usage.Anonymous) + if err != nil { + return nil, err + } + return NewSpecialMappable("/dev/zero (deleted)", mfp, fr), nil +} diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go new file mode 100644 index 000000000..0368c6794 --- /dev/null +++ b/pkg/sentry/mm/syscalls.go @@ -0,0 +1,1197 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + mrand "math/rand" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// HandleUserFault handles an application page fault. sp is the faulting +// application thread's stack pointer. +// +// Preconditions: mm.as != nil. +func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr usermem.Addr, at usermem.AccessType, sp usermem.Addr) error { + ar, ok := addr.RoundDown().ToRange(usermem.PageSize) + if !ok { + return syserror.EFAULT + } + + // Don't bother trying existingPMAsLocked; in most cases, if we did have + // existing pmas, we wouldn't have faulted. + + // Ensure that we have a usable vma. Here and below, since we are only + // asking for a single page, there is no possibility of partial success, + // and any error is immediately fatal. + mm.mappingMu.RLock() + vseg, _, err := mm.getVMAsLocked(ctx, ar, at, false) + if err != nil { + mm.mappingMu.RUnlock() + return err + } + + // Ensure that we have a usable pma. + mm.activeMu.Lock() + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, at) + mm.mappingMu.RUnlock() + if err != nil { + mm.activeMu.Unlock() + return err + } + + // Downgrade to a read-lock on activeMu since we don't need to mutate pmas + // anymore. + mm.activeMu.DowngradeLock() + + // Map the faulted page into the active AddressSpace. + err = mm.mapASLocked(pseg, ar, false) + mm.activeMu.RUnlock() + return err +} + +// MMap establishes a memory mapping. +func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (usermem.Addr, error) { + if opts.Length == 0 { + return 0, syserror.EINVAL + } + length, ok := usermem.Addr(opts.Length).RoundUp() + if !ok { + return 0, syserror.ENOMEM + } + opts.Length = uint64(length) + + if opts.Mappable != nil { + // Offset must be aligned. + if usermem.Addr(opts.Offset).RoundDown() != usermem.Addr(opts.Offset) { + return 0, syserror.EINVAL + } + // Offset + length must not overflow. + if end := opts.Offset + opts.Length; end < opts.Offset { + return 0, syserror.ENOMEM + } + } else { + opts.Offset = 0 + if !opts.Private { + if opts.MappingIdentity != nil { + return 0, syserror.EINVAL + } + m, err := NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx)) + if err != nil { + return 0, err + } + defer m.DecRef() + opts.MappingIdentity = m + opts.Mappable = m + } + } + + if opts.Addr.RoundDown() != opts.Addr { + // MAP_FIXED requires addr to be page-aligned; non-fixed mappings + // don't. 
+ if opts.Fixed { + return 0, syserror.EINVAL + } + opts.Addr = opts.Addr.RoundDown() + } + + if !opts.MaxPerms.SupersetOf(opts.Perms) { + return 0, syserror.EACCES + } + if opts.Unmap && !opts.Fixed { + return 0, syserror.EINVAL + } + if opts.GrowsDown && opts.Mappable != nil { + return 0, syserror.EINVAL + } + + // Get the new vma. + mm.mappingMu.Lock() + if opts.MLockMode < mm.defMLockMode { + opts.MLockMode = mm.defMLockMode + } + vseg, ar, err := mm.createVMALocked(ctx, opts) + if err != nil { + mm.mappingMu.Unlock() + return 0, err + } + + // TODO(jamieliu): In Linux, VM_LOCKONFAULT (which may be set on the new + // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears + // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in + // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() => + // populate_vma_page_range(). Confirm this behavior. + switch { + case opts.Precommit || opts.MLockMode == memmap.MLockEager: + // Get pmas and map with precommit as requested. + mm.populateVMAAndUnlock(ctx, vseg, ar, true) + + case opts.Mappable == nil && length <= privateAllocUnit: + // NOTE(b/63077076, b/63360184): Get pmas and map eagerly in the hope + // that doing so will save on future page faults. We only do this for + // anonymous mappings, since otherwise the cost of + // memmap.Mappable.Translate is unknown; and only for small mappings, + // to avoid needing to allocate large amounts of memory that we may + // subsequently need to checkpoint. + mm.populateVMAAndUnlock(ctx, vseg, ar, false) + + default: + mm.mappingMu.Unlock() + } + + return ar.Start, nil +} + +// populateVMA obtains pmas for addresses in ar in the given vma, and maps them +// into mm.as if it is active. +// +// Preconditions: mm.mappingMu must be locked. vseg.Range().IsSupersetOf(ar). +func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { + if !vseg.ValuePtr().effectivePerms.Any() { + // Linux doesn't populate inaccessible pages. See + // mm/gup.c:populate_vma_page_range. + return + } + + mm.activeMu.Lock() + // Can't defer mm.activeMu.Unlock(); see below. + + // Even if we get new pmas, we can't actually map them if we don't have an + // AddressSpace. + if mm.as == nil { + mm.activeMu.Unlock() + return + } + + // Ensure that we have usable pmas. + pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, usermem.NoAccess) + if err != nil { + // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from + // mm/gup.c:mm_populate(). If it matters, we'll get it again when + // userspace actually tries to use the failing page. + mm.activeMu.Unlock() + return + } + + // Downgrade to a read-lock on activeMu since we don't need to mutate pmas + // anymore. + mm.activeMu.DowngradeLock() + + // As above, errors are silently ignored. + mm.mapASLocked(pseg, ar, precommit) + mm.activeMu.RUnlock() +} + +// populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally +// unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is +// preferable to populateVMA since it unlocks mm.mappingMu before performing +// expensive operations that don't require it to be locked. +// +// Preconditions: mm.mappingMu must be locked for writing. +// vseg.Range().IsSupersetOf(ar). +// +// Postconditions: mm.mappingMu will be unlocked. +func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar usermem.AddrRange, precommit bool) { + // See populateVMA above for commentary. 
+	if !vseg.ValuePtr().effectivePerms.Any() {
+		mm.mappingMu.Unlock()
+		return
+	}
+
+	mm.activeMu.Lock()
+
+	if mm.as == nil {
+		mm.activeMu.Unlock()
+		mm.mappingMu.Unlock()
+		return
+	}
+
+	// mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it
+	// isn't needed at all for mapASLocked.
+	mm.mappingMu.DowngradeLock()
+	pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, usermem.NoAccess)
+	mm.mappingMu.RUnlock()
+	if err != nil {
+		mm.activeMu.Unlock()
+		return
+	}
+
+	mm.activeMu.DowngradeLock()
+	mm.mapASLocked(pseg, ar, precommit)
+	mm.activeMu.RUnlock()
+}
+
+// MapStack allocates the initial process stack.
+func (mm *MemoryManager) MapStack(ctx context.Context) (usermem.AddrRange, error) {
+	// maxStackSize is the maximum supported process stack size in bytes.
+	//
+	// This limit exists because stack growing isn't implemented, so the entire
+	// process stack must be mapped up-front.
+	const maxStackSize = 128 << 20
+
+	stackSize := limits.FromContext(ctx).Get(limits.Stack)
+	r, ok := usermem.Addr(stackSize.Cur).RoundUp()
+	sz := uint64(r)
+	if !ok {
+		// RLIM_INFINITY rounds up to 0.
+		sz = linux.DefaultStackSoftLimit
+	} else if sz > maxStackSize {
+		ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize)
+		sz = maxStackSize
+	} else if sz == 0 {
+		return usermem.AddrRange{}, syserror.ENOMEM
+	}
+	szaddr := usermem.Addr(sz)
+	ctx.Debugf("Allocating stack with size of %v bytes", sz)
+
+	// Determine the stack's desired location. Unlike Linux, address
+	// randomization can't be disabled.
+	stackEnd := mm.layout.MaxAddr - usermem.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown()
+	if stackEnd < szaddr {
+		return usermem.AddrRange{}, syserror.ENOMEM
+	}
+	stackStart := stackEnd - szaddr
+	mm.mappingMu.Lock()
+	defer mm.mappingMu.Unlock()
+	_, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{
+		Length:    sz,
+		Addr:      stackStart,
+		Perms:     usermem.ReadWrite,
+		MaxPerms:  usermem.AnyAccess,
+		Private:   true,
+		GrowsDown: true,
+		MLockMode: mm.defMLockMode,
+		Hint:      "[stack]",
+	})
+	return ar, err
+}
+
+// MUnmap implements the semantics of Linux's munmap(2).
+func (mm *MemoryManager) MUnmap(ctx context.Context, addr usermem.Addr, length uint64) error {
+	if addr != addr.RoundDown() {
+		return syserror.EINVAL
+	}
+	if length == 0 {
+		return syserror.EINVAL
+	}
+	la, ok := usermem.Addr(length).RoundUp()
+	if !ok {
+		return syserror.EINVAL
+	}
+	ar, ok := addr.ToRange(uint64(la))
+	if !ok {
+		return syserror.EINVAL
+	}
+
+	mm.mappingMu.Lock()
+	defer mm.mappingMu.Unlock()
+	mm.unmapLocked(ctx, ar)
+	return nil
+}
+
+// MRemapOpts specifies options to MRemap.
+type MRemapOpts struct {
+	// Move controls whether MRemap moves the remapped mapping to a new address.
+	Move MRemapMoveMode
+
+	// NewAddr is the new address for the remapping. NewAddr is ignored unless
+	// Move is MRemapMustMove.
+	NewAddr usermem.Addr
+}
+
+// MRemapMoveMode controls MRemap's moving behavior.
+type MRemapMoveMode int
+
+const (
+	// MRemapNoMove prevents MRemap from moving the remapped mapping.
+	MRemapNoMove MRemapMoveMode = iota
+
+	// MRemapMayMove allows MRemap to move the remapped mapping.
+	MRemapMayMove
+
+	// MRemapMustMove requires MRemap to move the remapped mapping to
+	// MRemapOpts.NewAddr, replacing any existing mappings in the remapped
+	// range.
+	MRemapMustMove
+)
+
+// MRemap implements the semantics of Linux's mremap(2).
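+//
+// A minimal usage sketch (hypothetical caller; oldAddr, oldSize and newSize
+// would come from an application's mremap(2) arguments):
+//
+//	newAddr, err := mm.MRemap(ctx, oldAddr, oldSize, newSize, MRemapOpts{
+//		Move: MRemapMayMove, // corresponds to mremap(2)'s MREMAP_MAYMOVE
+//	})
+//	if err != nil {
+//		return 0, err // e.g. EFAULT if no vma exists at oldAddr
+//	}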
+func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (usermem.Addr, error) { + // "Note that old_address has to be page aligned." - mremap(2) + if oldAddr.RoundDown() != oldAddr { + return 0, syserror.EINVAL + } + + // Linux treats an old_size that rounds up to 0 as 0, which is otherwise a + // valid size. However, new_size can't be 0 after rounding. + oldSizeAddr, _ := usermem.Addr(oldSize).RoundUp() + oldSize = uint64(oldSizeAddr) + newSizeAddr, ok := usermem.Addr(newSize).RoundUp() + if !ok || newSizeAddr == 0 { + return 0, syserror.EINVAL + } + newSize = uint64(newSizeAddr) + + oldEnd, ok := oldAddr.AddLength(oldSize) + if !ok { + return 0, syserror.EINVAL + } + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + + // All cases require that a vma exists at oldAddr. + vseg := mm.vmas.FindSegment(oldAddr) + if !vseg.Ok() { + return 0, syserror.EFAULT + } + + // Behavior matrix: + // + // Move | oldSize = 0 | oldSize < newSize | oldSize = newSize | oldSize > newSize + // ---------+-------------+-------------------+-------------------+------------------ + // NoMove | ENOMEM [1] | Grow in-place | No-op | Shrink in-place + // MayMove | Copy [1] | Grow in-place or | No-op | Shrink in-place + // | | move | | + // MustMove | Copy | Move and grow | Move | Shrink and move + // + // [1] In-place growth is impossible because the vma at oldAddr already + // occupies at least part of the destination. Thus the NoMove case always + // fails and the MayMove case always falls back to copying. + + if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall, + // mremap in Linux does not check mm/mlock.c:can_do_mlock() and + // therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and + // !CAP_IPC_LOCK. + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit { + return 0, syserror.EAGAIN + } + } + } + + if opts.Move != MRemapMustMove { + // Handle no-ops and in-place shrinking. These cases don't care if + // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all + // (aside from oldAddr). + if newSize <= oldSize { + if newSize < oldSize { + // If oldAddr+oldSize didn't overflow, oldAddr+newSize can't + // either. + newEnd := oldAddr + usermem.Addr(newSize) + mm.unmapLocked(ctx, usermem.AddrRange{newEnd, oldEnd}) + } + return oldAddr, nil + } + + // Handle in-place growing. + + // Check that oldEnd maps to the same vma as oldAddr. + if vseg.End() < oldEnd { + return 0, syserror.EFAULT + } + // "Grow" the existing vma by creating a new mergeable one. + vma := vseg.ValuePtr() + var newOffset uint64 + if vma.mappable != nil { + newOffset = vseg.mappableRange().End + } + vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + Length: newSize - oldSize, + MappingIdentity: vma.id, + Mappable: vma.mappable, + Offset: newOffset, + Addr: oldEnd, + Fixed: true, + Perms: vma.realPerms, + MaxPerms: vma.maxPerms, + Private: vma.private, + GrowsDown: vma.growsDown, + MLockMode: vma.mlockMode, + Hint: vma.hint, + }) + if err == nil { + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, ar, true) + } + return oldAddr, nil + } + // In-place growth failed. 
In the MRemapMayMove case, fall through to + // copying/moving below. + if opts.Move == MRemapNoMove { + return 0, err + } + } + + // Find a location for the new mapping. + var newAR usermem.AddrRange + switch opts.Move { + case MRemapMayMove: + newAddr, err := mm.findAvailableLocked(newSize, findAvailableOpts{}) + if err != nil { + return 0, err + } + newAR, _ = newAddr.ToRange(newSize) + + case MRemapMustMove: + newAddr := opts.NewAddr + if newAddr.RoundDown() != newAddr { + return 0, syserror.EINVAL + } + var ok bool + newAR, ok = newAddr.ToRange(newSize) + if !ok { + return 0, syserror.EINVAL + } + if (usermem.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) { + return 0, syserror.EINVAL + } + + // Unmap any mappings at the destination. + mm.unmapLocked(ctx, newAR) + + // If the sizes specify shrinking, unmap everything between the new and + // old sizes at the source. Unmapping before the following checks is + // correct: compare Linux's mm/mremap.c:mremap_to() => do_munmap(), + // vma_to_resize(). + if newSize < oldSize { + oldNewEnd := oldAddr + usermem.Addr(newSize) + mm.unmapLocked(ctx, usermem.AddrRange{oldNewEnd, oldEnd}) + oldEnd = oldNewEnd + } + + // unmapLocked may have invalidated vseg; look it up again. + vseg = mm.vmas.FindSegment(oldAddr) + } + + oldAR := usermem.AddrRange{oldAddr, oldEnd} + + // Check that oldEnd maps to the same vma as oldAddr. + if vseg.End() < oldEnd { + return 0, syserror.EFAULT + } + + // Check against RLIMIT_AS. + newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { + return 0, syserror.ENOMEM + } + + if vma := vseg.ValuePtr(); vma.mappable != nil { + // Check that offset+length does not overflow. + if vma.off+uint64(newAR.Length()) < vma.off { + return 0, syserror.EINVAL + } + // Inform the Mappable, if any, of the new mapping. + if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.canWriteMappableLocked()); err != nil { + return 0, err + } + } + + if oldSize == 0 { + // Handle copying. + // + // We can't use createVMALocked because it calls Mappable.AddMapping, + // whereas we've already called Mappable.CopyMapping (which is + // consistent with Linux). Call vseg.Value() (rather than + // vseg.ValuePtr()) to make a copy of the vma. + vma := vseg.Value() + if vma.mappable != nil { + vma.off = vseg.mappableOffsetAt(oldAR.Start) + } + if vma.id != nil { + vma.id.IncRef() + } + vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) + mm.usageAS += uint64(newAR.Length()) + if vma.isPrivateDataLocked() { + mm.dataAS += uint64(newAR.Length()) + } + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS += uint64(newAR.Length()) + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, newAR, true) + } + } + return newAR.Start, nil + } + + // Handle moving. + // + // Remove the existing vma before inserting the new one to minimize + // iterator invalidation. We do this directly (instead of calling + // removeVMAsLocked) because: + // + // 1. We can't drop the reference on vma.id, which will be transferred to + // the new vma. + // + // 2. We can't call vma.mappable.RemoveMapping, because pmas are still at + // oldAR, so calling RemoveMapping could cause us to miss an invalidation + // overlapping oldAR. + // + // Call vseg.Value() (rather than vseg.ValuePtr()) to make a copy of the + // vma. 
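+	// (The copy matters because vmas.Remove clears the removed segment's
+	// value via vmaSetFunctions.ClearValue, which nils out vma.id and
+	// vma.mappable before we reinsert the vma at newAR.)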
+ vseg = mm.vmas.Isolate(vseg, oldAR) + vma := vseg.Value() + mm.vmas.Remove(vseg) + vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) + mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + if vma.isPrivateDataLocked() { + mm.dataAS = mm.dataAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + } + if vma.mlockMode != memmap.MLockNone { + mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length()) + } + + // Move pmas. This is technically optional for non-private pmas, which + // could just go through memmap.Mappable.Translate again, but it's required + // for private pmas. + mm.activeMu.Lock() + mm.movePMAsLocked(oldAR, newAR) + mm.activeMu.Unlock() + + // Now that pmas have been moved to newAR, we can notify vma.mappable that + // oldAR is no longer mapped. + if vma.mappable != nil { + vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.canWriteMappableLocked()) + } + + if vma.mlockMode == memmap.MLockEager { + mm.populateVMA(ctx, vseg, newAR, true) + } + + return newAR.Start, nil +} + +// MProtect implements the semantics of Linux's mprotect(2). +func (mm *MemoryManager) MProtect(addr usermem.Addr, length uint64, realPerms usermem.AccessType, growsDown bool) error { + if addr.RoundDown() != addr { + return syserror.EINVAL + } + if length == 0 { + return nil + } + rlength, ok := usermem.Addr(length).RoundUp() + if !ok { + return syserror.ENOMEM + } + ar, ok := addr.ToRange(uint64(rlength)) + if !ok { + return syserror.ENOMEM + } + effectivePerms := realPerms.Effective() + + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + // Non-growsDown mprotect requires that all of ar is mapped, and stops at + // the first non-empty gap. growsDown mprotect requires that the first vma + // be growsDown, but does not require it to extend all the way to ar.Start; + // vmas after the first must be contiguous but need not be growsDown, like + // the non-growsDown case. + vseg := mm.vmas.LowerBoundSegment(ar.Start) + if !vseg.Ok() { + return syserror.ENOMEM + } + if growsDown { + if !vseg.ValuePtr().growsDown { + return syserror.EINVAL + } + if ar.End <= vseg.Start() { + return syserror.ENOMEM + } + ar.Start = vseg.Start() + } else { + if ar.Start < vseg.Start() { + return syserror.ENOMEM + } + } + + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + defer func() { + mm.vmas.MergeRange(ar) + mm.vmas.MergeAdjacent(ar) + mm.pmas.MergeRange(ar) + mm.pmas.MergeAdjacent(ar) + }() + pseg := mm.pmas.LowerBoundSegment(ar.Start) + var didUnmapAS bool + for { + // Check for permission validity before splitting vmas, for consistency + // with Linux. + if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) { + return syserror.EACCES + } + vseg = mm.vmas.Isolate(vseg, ar) + + // Update vma permissions. + vma := vseg.ValuePtr() + vmaLength := vseg.Range().Length() + if vma.isPrivateDataLocked() { + mm.dataAS -= uint64(vmaLength) + } + + vma.realPerms = realPerms + vma.effectivePerms = effectivePerms + if vma.isPrivateDataLocked() { + mm.dataAS += uint64(vmaLength) + } + + // Propagate vma permission changes to pmas. + for pseg.Ok() && pseg.Start() < vseg.End() { + if pseg.Range().Overlaps(vseg.Range()) { + pseg = mm.pmas.Isolate(pseg, vseg.Range()) + pma := pseg.ValuePtr() + if !effectivePerms.SupersetOf(pma.effectivePerms) && !didUnmapAS { + // Unmap all of ar, not just vseg.Range(), to minimize host + // syscalls. 
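+					// (didUnmapAS limits this to one host unmap per MProtect
+					// call; later pmas in the loop rely on the same unmap.)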
+ mm.unmapASLocked(ar) + didUnmapAS = true + } + pma.effectivePerms = effectivePerms.Intersect(pma.translatePerms) + if pma.needCOW { + pma.effectivePerms.Write = false + } + } + pseg = pseg.NextSegment() + } + + // Continue to the next vma. + if ar.End <= vseg.End() { + return nil + } + vseg, _ = vseg.NextNonEmpty() + if !vseg.Ok() { + return syserror.ENOMEM + } + } +} + +// BrkSetup sets mm's brk address to addr and its brk size to 0. +func (mm *MemoryManager) BrkSetup(ctx context.Context, addr usermem.Addr) { + mm.mappingMu.Lock() + defer mm.mappingMu.Unlock() + // Unmap the existing brk. + if mm.brk.Length() != 0 { + mm.unmapLocked(ctx, mm.brk) + } + mm.brk = usermem.AddrRange{addr, addr} +} + +// Brk implements the semantics of Linux's brk(2), except that it returns an +// error on failure. +func (mm *MemoryManager) Brk(ctx context.Context, addr usermem.Addr) (usermem.Addr, error) { + mm.mappingMu.Lock() + // Can't defer mm.mappingMu.Unlock(); see below. + + if addr < mm.brk.Start { + addr = mm.brk.End + mm.mappingMu.Unlock() + return addr, syserror.EINVAL + } + + // TODO(gvisor.dev/issue/156): This enforces RLIMIT_DATA, but is + // slightly more permissive than the usual data limit. In particular, + // this only limits the size of the heap; a true RLIMIT_DATA limits the + // size of heap + data + bss. The segment sizes need to be plumbed from + // the loader package to fully enforce RLIMIT_DATA. + if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { + addr = mm.brk.End + mm.mappingMu.Unlock() + return addr, syserror.ENOMEM + } + + oldbrkpg, _ := mm.brk.End.RoundUp() + newbrkpg, ok := addr.RoundUp() + if !ok { + addr = mm.brk.End + mm.mappingMu.Unlock() + return addr, syserror.EFAULT + } + + switch { + case oldbrkpg < newbrkpg: + vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ + Length: uint64(newbrkpg - oldbrkpg), + Addr: oldbrkpg, + Fixed: true, + // Compare Linux's + // arch/x86/include/asm/page_types.h:VM_DATA_DEFAULT_FLAGS. + Perms: usermem.ReadWrite, + MaxPerms: usermem.AnyAccess, + Private: true, + // Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes + // mm->def_flags. + MLockMode: mm.defMLockMode, + Hint: "[heap]", + }) + if err != nil { + addr = mm.brk.End + mm.mappingMu.Unlock() + return addr, err + } + mm.brk.End = addr + if mm.defMLockMode == memmap.MLockEager { + mm.populateVMAAndUnlock(ctx, vseg, ar, true) + } else { + mm.mappingMu.Unlock() + } + + case newbrkpg < oldbrkpg: + mm.unmapLocked(ctx, usermem.AddrRange{newbrkpg, oldbrkpg}) + fallthrough + + default: + mm.brk.End = addr + mm.mappingMu.Unlock() + } + + return addr, nil +} + +// MLock implements the semantics of Linux's mlock()/mlock2()/munlock(), +// depending on mode. +func (mm *MemoryManager) MLock(ctx context.Context, addr usermem.Addr, length uint64, mode memmap.MLockMode) error { + // Linux allows this to overflow. + la, _ := usermem.Addr(length + addr.PageOffset()).RoundUp() + ar, ok := addr.RoundDown().ToRange(uint64(la)) + if !ok { + return syserror.EINVAL + } + + mm.mappingMu.Lock() + // Can't defer mm.mappingMu.Unlock(); see below. + + if mode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. 
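+		// (Worked example: with RLIMIT_MEMLOCK = 64 KiB, lockedAS = 48 KiB,
+		// and a 32 KiB request of which 16 KiB is already locked, newLockedAS
+		// = 48 + 32 - 16 = 64 KiB, which is just allowed; one more page would
+		// fail with ENOMEM.)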
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + mm.mappingMu.Unlock() + return syserror.EPERM + } + if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + } + } + + // Check this after RLIMIT_MEMLOCK for consistency with Linux. + if ar.Length() == 0 { + mm.mappingMu.Unlock() + return nil + } + + // Apply the new mlock mode to vmas. + var unmapped bool + vseg := mm.vmas.FindSegment(ar.Start) + for { + if !vseg.Ok() { + unmapped = true + break + } + vseg = mm.vmas.Isolate(vseg, ar) + vma := vseg.ValuePtr() + prevMode := vma.mlockMode + vma.mlockMode = mode + if mode != memmap.MLockNone && prevMode == memmap.MLockNone { + mm.lockedAS += uint64(vseg.Range().Length()) + } else if mode == memmap.MLockNone && prevMode != memmap.MLockNone { + mm.lockedAS -= uint64(vseg.Range().Length()) + } + if ar.End <= vseg.End() { + break + } + vseg, _ = vseg.NextNonEmpty() + } + mm.vmas.MergeRange(ar) + mm.vmas.MergeAdjacent(ar) + if unmapped { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + + if mode == memmap.MLockEager { + // Ensure that we have usable pmas. Since we didn't return ENOMEM + // above, ar must be fully covered by vmas, so we can just use + // NextSegment below. + mm.activeMu.Lock() + mm.mappingMu.DowngradeLock() + for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + if !vseg.ValuePtr().effectivePerms.Any() { + // Linux: mm/gup.c:__get_user_pages() returns EFAULT in this + // case, which is converted to ENOMEM by mlock. + mm.activeMu.Unlock() + mm.mappingMu.RUnlock() + return syserror.ENOMEM + } + _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), usermem.NoAccess) + if err != nil { + mm.activeMu.Unlock() + mm.mappingMu.RUnlock() + // Linux: mm/mlock.c:__mlock_posix_error_return() + if err == syserror.EFAULT { + return syserror.ENOMEM + } + if err == syserror.ENOMEM { + return syserror.EAGAIN + } + return err + } + } + + // Map pmas into the active AddressSpace, if we have one. + mm.mappingMu.RUnlock() + if mm.as != nil { + mm.activeMu.DowngradeLock() + err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */) + mm.activeMu.RUnlock() + if err != nil { + return err + } + } else { + mm.activeMu.Unlock() + } + } else { + mm.mappingMu.Unlock() + } + + return nil +} + +// MLockAllOpts holds options to MLockAll. +type MLockAllOpts struct { + // If Current is true, change the memory-locking behavior of all mappings + // to Mode. If Future is true, upgrade the memory-locking behavior of all + // future mappings to Mode. At least one of Current or Future must be true. + Current bool + Future bool + Mode memmap.MLockMode +} + +// MLockAll implements the semantics of Linux's mlockall()/munlockall(), +// depending on opts. +func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error { + if !opts.Current && !opts.Future { + return syserror.EINVAL + } + + mm.mappingMu.Lock() + // Can't defer mm.mappingMu.Unlock(); see below. + + if opts.Current { + if opts.Mode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. 
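+			// (Unlike MLock above, mlockall(MCL_CURRENT) applies to every
+			// existing mapping, so the check is simply against the total
+			// span of all vmas.)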
+ if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + mm.mappingMu.Unlock() + return syserror.EPERM + } + if uint64(mm.vmas.Span()) > mlockLimit { + mm.mappingMu.Unlock() + return syserror.ENOMEM + } + } + } + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + vma := vseg.ValuePtr() + prevMode := vma.mlockMode + vma.mlockMode = opts.Mode + if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone { + mm.lockedAS += uint64(vseg.Range().Length()) + } else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone { + mm.lockedAS -= uint64(vseg.Range().Length()) + } + } + } + + if opts.Future { + mm.defMLockMode = opts.Mode + } + + if opts.Current && opts.Mode == memmap.MLockEager { + // Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate() + // ignores the return value of __mm_populate(), so all errors below are + // ignored. + // + // Try to get usable pmas. + mm.activeMu.Lock() + mm.mappingMu.DowngradeLock() + for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { + if vseg.ValuePtr().effectivePerms.Any() { + mm.getPMAsLocked(ctx, vseg, vseg.Range(), usermem.NoAccess) + } + } + + // Map all pmas into the active AddressSpace, if we have one. + mm.mappingMu.RUnlock() + if mm.as != nil { + mm.activeMu.DowngradeLock() + mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */) + mm.activeMu.RUnlock() + } else { + mm.activeMu.Unlock() + } + } else { + mm.mappingMu.Unlock() + } + return nil +} + +// Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). +func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error { + ar, ok := addr.ToRange(length) + if !ok { + return syserror.EINVAL + } + + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + mm.activeMu.Lock() + defer mm.activeMu.Unlock() + + // Linux's mm/madvise.c:madvise_dontneed() => mm/memory.c:zap_page_range() + // is analogous to our mm.invalidateLocked(ar, true, true). We inline this + // here, with the special case that we synchronously decommit + // uniquely-owned (non-copy-on-write) pages for private anonymous vma, + // which is the common case for MADV_DONTNEED. Invalidating these pmas, and + // allowing them to be reallocated when touched again, increases pma + // fragmentation, which may significantly reduce performance for + // non-vectored I/O implementations. Also, decommitting synchronously + // ensures that Decommit immediately reduces host memory usage. + var didUnmapAS bool + pseg := mm.pmas.LowerBoundSegment(ar.Start) + mf := mm.mfp.MemoryFile() + for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + vma := vseg.ValuePtr() + if vma.mlockMode != memmap.MLockNone { + return syserror.EINVAL + } + vsegAR := vseg.Range().Intersect(ar) + // pseg should already correspond to either this vma or a later one, + // since there can't be a pma without a corresponding vma. 
+ if checkInvariants { + if pseg.Ok() && pseg.End() <= vsegAR.Start { + panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR)) + } + } + for pseg.Ok() && pseg.Start() < vsegAR.End { + pma := pseg.ValuePtr() + if pma.private && !mm.isPMACopyOnWriteLocked(vseg, pseg) { + psegAR := pseg.Range().Intersect(ar) + if vsegAR.IsSupersetOf(psegAR) && vma.mappable == nil { + if err := mf.Decommit(pseg.fileRangeOf(psegAR)); err == nil { + pseg = pseg.NextSegment() + continue + } + // If an error occurs, fall through to the general + // invalidation case below. + } + } + pseg = mm.pmas.Isolate(pseg, vsegAR) + pma = pseg.ValuePtr() + if !didUnmapAS { + // Unmap all of ar, not just pseg.Range(), to minimize host + // syscalls. AddressSpace mappings must be removed before + // mm.decPrivateRef(). + mm.unmapASLocked(ar) + didUnmapAS = true + } + if pma.private { + mm.decPrivateRef(pseg.fileRange()) + } + pma.file.DecRef(pseg.fileRange()) + mm.removeRSSLocked(pseg.Range()) + pseg = mm.pmas.Remove(pseg).NextSegment() + } + } + + // "If there are some parts of the specified address space that are not + // mapped, the Linux version of madvise() ignores them and applies the call + // to the rest (but returns ENOMEM from the system call, as it should)." - + // madvise(2) + if mm.vmas.SpanRange(ar) != ar.Length() { + return syserror.ENOMEM + } + return nil +} + +// MSyncOpts holds options to MSync. +type MSyncOpts struct { + // Sync has the semantics of MS_SYNC. + Sync bool + + // Invalidate has the semantics of MS_INVALIDATE. + Invalidate bool +} + +// MSync implements the semantics of Linux's msync(). +func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length uint64, opts MSyncOpts) error { + if addr != addr.RoundDown() { + return syserror.EINVAL + } + if length == 0 { + return nil + } + la, ok := usermem.Addr(length).RoundUp() + if !ok { + return syserror.ENOMEM + } + ar, ok := addr.ToRange(uint64(la)) + if !ok { + return syserror.ENOMEM + } + + mm.mappingMu.RLock() + // Can't defer mm.mappingMu.RUnlock(); see below. + vseg := mm.vmas.LowerBoundSegment(ar.Start) + if !vseg.Ok() { + mm.mappingMu.RUnlock() + return syserror.ENOMEM + } + var unmapped bool + lastEnd := ar.Start + for { + if !vseg.Ok() { + mm.mappingMu.RUnlock() + unmapped = true + break + } + if lastEnd < vseg.Start() { + unmapped = true + } + lastEnd = vseg.End() + vma := vseg.ValuePtr() + if opts.Invalidate && vma.mlockMode != memmap.MLockNone { + mm.mappingMu.RUnlock() + return syserror.EBUSY + } + // It's only possible to have dirtied the Mappable through a shared + // mapping. Don't check if the mapping is writable, because mprotect + // may have changed this, and also because Linux doesn't. + if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private { + // We can't call memmap.MappingIdentity.Msync while holding + // mm.mappingMu since it may take fs locks that precede it in the + // lock order. + id.IncRef() + mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar)) + mm.mappingMu.RUnlock() + err := id.Msync(ctx, mr) + id.DecRef() + if err != nil { + return err + } + if lastEnd >= ar.End { + break + } + mm.mappingMu.RLock() + vseg = mm.vmas.LowerBoundSegment(lastEnd) + } else { + if lastEnd >= ar.End { + mm.mappingMu.RUnlock() + break + } + vseg = vseg.NextSegment() + } + } + + if unmapped { + return syserror.ENOMEM + } + return nil +} + +// GetSharedFutexKey is used by kernel.Task.GetSharedKey. 
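+//
+// For a private mapping, the returned key is the (mm, address) pair, so only
+// tasks sharing this MemoryManager can match it; for a shared mapping, it is
+// the (mappable, offset) pair, so tasks in different address spaces mapping
+// the same file contend on the same futex.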
+func (mm *MemoryManager) GetSharedFutexKey(ctx context.Context, addr usermem.Addr) (futex.Key, error) { + ar, ok := addr.ToRange(4) // sizeof(int32). + if !ok { + return futex.Key{}, syserror.EFAULT + } + + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + vseg, _, err := mm.getVMAsLocked(ctx, ar, usermem.Read, false) + if err != nil { + return futex.Key{}, err + } + vma := vseg.ValuePtr() + + if vma.private { + return futex.Key{ + Kind: futex.KindSharedPrivate, + Offset: uint64(addr), + }, nil + } + + if vma.id != nil { + vma.id.IncRef() + } + return futex.Key{ + Kind: futex.KindSharedMappable, + Mappable: vma.mappable, + MappingIdentity: vma.id, + Offset: vseg.mappableOffsetAt(addr), + }, nil +} + +// VirtualMemorySize returns the combined length in bytes of all mappings in +// mm. +func (mm *MemoryManager) VirtualMemorySize() uint64 { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + return mm.usageAS +} + +// VirtualMemorySizeRange returns the combined length in bytes of all mappings +// in ar in mm. +func (mm *MemoryManager) VirtualMemorySizeRange(ar usermem.AddrRange) uint64 { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + return uint64(mm.vmas.SpanRange(ar)) +} + +// ResidentSetSize returns the value advertised as mm's RSS in bytes. +func (mm *MemoryManager) ResidentSetSize() uint64 { + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return mm.curRSS +} + +// MaxResidentSetSize returns the value advertised as mm's max RSS in bytes. +func (mm *MemoryManager) MaxResidentSetSize() uint64 { + mm.activeMu.RLock() + defer mm.activeMu.RUnlock() + return mm.maxRSS +} + +// VirtualDataSize returns the size of private data segments in mm. +func (mm *MemoryManager) VirtualDataSize() uint64 { + mm.mappingMu.RLock() + defer mm.mappingMu.RUnlock() + return mm.dataAS +} diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go new file mode 100644 index 000000000..02203f79f --- /dev/null +++ b/pkg/sentry/mm/vma.go @@ -0,0 +1,564 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mm + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Preconditions: mm.mappingMu must be locked for writing. opts must be valid +// as defined by the checks in MMap. +func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, usermem.AddrRange, error) { + if opts.MaxPerms != opts.MaxPerms.Effective() { + panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms)) + } + + // Find a useable range. 
+ addr, err := mm.findAvailableLocked(opts.Length, findAvailableOpts{ + Addr: opts.Addr, + Fixed: opts.Fixed, + Unmap: opts.Unmap, + Map32Bit: opts.Map32Bit, + }) + if err != nil { + return vmaIterator{}, usermem.AddrRange{}, err + } + ar, _ := addr.ToRange(opts.Length) + + // Check against RLIMIT_AS. + newUsageAS := mm.usageAS + opts.Length + if opts.Unmap { + newUsageAS -= uint64(mm.vmas.SpanRange(ar)) + } + if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { + return vmaIterator{}, usermem.AddrRange{}, syserror.ENOMEM + } + + if opts.MLockMode != memmap.MLockNone { + // Check against RLIMIT_MEMLOCK. + if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { + mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur + if mlockLimit == 0 { + return vmaIterator{}, usermem.AddrRange{}, syserror.EPERM + } + newLockedAS := mm.lockedAS + opts.Length + if opts.Unmap { + newLockedAS -= mm.mlockedBytesRangeLocked(ar) + } + if newLockedAS > mlockLimit { + return vmaIterator{}, usermem.AddrRange{}, syserror.EAGAIN + } + } + } + + // Remove overwritten mappings. This ordering is consistent with Linux: + // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), + // file->f_op->mmap(). + var vgap vmaGapIterator + if opts.Unmap { + vgap = mm.unmapLocked(ctx, ar) + } else { + vgap = mm.vmas.FindGap(ar.Start) + } + + // Inform the Mappable, if any, of the new mapping. + if opts.Mappable != nil { + // The expression for writable is vma.canWriteMappableLocked(), but we + // don't yet have a vma. + if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset, !opts.Private && opts.MaxPerms.Write); err != nil { + return vmaIterator{}, usermem.AddrRange{}, err + } + } + + // Take a reference on opts.MappingIdentity before inserting the vma since + // vma merging can drop the reference. + if opts.MappingIdentity != nil { + opts.MappingIdentity.IncRef() + } + + // Finally insert the vma. + v := vma{ + mappable: opts.Mappable, + off: opts.Offset, + realPerms: opts.Perms, + effectivePerms: opts.Perms.Effective(), + maxPerms: opts.MaxPerms, + private: opts.Private, + growsDown: opts.GrowsDown, + mlockMode: opts.MLockMode, + id: opts.MappingIdentity, + hint: opts.Hint, + } + + vseg := mm.vmas.Insert(vgap, ar, v) + mm.usageAS += opts.Length + if v.isPrivateDataLocked() { + mm.dataAS += opts.Length + } + if opts.MLockMode != memmap.MLockNone { + mm.lockedAS += opts.Length + } + + return vseg, ar, nil +} + +type findAvailableOpts struct { + // These fields are equivalent to those in memmap.MMapOpts, except that: + // + // - Addr must be page-aligned. + // + // - Unmap allows existing guard pages in the returned range. + + Addr usermem.Addr + Fixed bool + Unmap bool + Map32Bit bool +} + +// map32Start/End are the bounds to which MAP_32BIT mappings are constrained, +// and are equivalent to Linux's MAP32_BASE and MAP32_MAX respectively. +const ( + map32Start = 0x40000000 + map32End = 0x80000000 +) + +// findAvailableLocked finds an allocatable range. +// +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOpts) (usermem.Addr, error) { + if opts.Fixed { + opts.Map32Bit = false + } + allowedAR := mm.applicationAddrRange() + if opts.Map32Bit { + allowedAR = allowedAR.Intersect(usermem.AddrRange{map32Start, map32End}) + } + + // Does the provided suggestion work? 
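+	// (opts.Addr is only a hint unless opts.Fixed is set: a hint that falls
+	// outside allowedAR or collides with an existing vma or guard page is
+	// simply ignored, and we fall through to the general search below.)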
+ if ar, ok := opts.Addr.ToRange(length); ok { + if allowedAR.IsSupersetOf(ar) { + if opts.Unmap { + return ar.Start, nil + } + // Check for the presence of an existing vma or guard page. + if vgap := mm.vmas.FindGap(ar.Start); vgap.Ok() && vgap.availableRange().IsSupersetOf(ar) { + return ar.Start, nil + } + } + } + + // Fixed mappings accept only the requested address. + if opts.Fixed { + return 0, syserror.ENOMEM + } + + // Prefer hugepage alignment if a hugepage or more is requested. + alignment := uint64(usermem.PageSize) + if length >= usermem.HugePageSize { + alignment = usermem.HugePageSize + } + + if opts.Map32Bit { + return mm.findLowestAvailableLocked(length, alignment, allowedAR) + } + if mm.layout.DefaultDirection == arch.MmapBottomUp { + return mm.findLowestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.BottomUpBase, mm.layout.MaxAddr}) + } + return mm.findHighestAvailableLocked(length, alignment, usermem.AddrRange{mm.layout.MinAddr, mm.layout.TopDownBase}) +} + +func (mm *MemoryManager) applicationAddrRange() usermem.AddrRange { + return usermem.AddrRange{mm.layout.MinAddr, mm.layout.MaxAddr} +} + +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) { + for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextGap() { + if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { + // Can we shift up to match the alignment? + if offset := uint64(gr.Start) % alignment; offset != 0 { + if uint64(gr.Length()) >= length+alignment-offset { + // Yes, we're aligned. + return gr.Start + usermem.Addr(alignment-offset), nil + } + } + + // Either aligned perfectly, or can't align it. + return gr.Start, nil + } + } + return 0, syserror.ENOMEM +} + +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds usermem.AddrRange) (usermem.Addr, error) { + for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevGap() { + if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { + // Can we shift down to match the alignment? + start := gr.End - usermem.Addr(length) + if offset := uint64(start) % alignment; offset != 0 { + if gr.Start <= start-usermem.Addr(offset) { + // Yes, we're aligned. + return start - usermem.Addr(offset), nil + } + } + + // Either aligned perfectly, or can't align it. + return start, nil + } + } + return 0, syserror.ENOMEM +} + +// Preconditions: mm.mappingMu must be locked. +func (mm *MemoryManager) mlockedBytesRangeLocked(ar usermem.AddrRange) uint64 { + var total uint64 + for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { + if vseg.ValuePtr().mlockMode != memmap.MLockNone { + total += uint64(vseg.Range().Intersect(ar).Length()) + } + } + return total +} + +// getVMAsLocked ensures that vmas exist for all addresses in ar, and support +// access of type (at, ignorePermissions). It returns: +// +// - An iterator to the vma containing ar.Start. If no vma contains ar.Start, +// the iterator is unspecified. +// +// - An iterator to the gap after the last vma containing an address in ar. If +// vmas exist for no addresses in ar, the iterator is to a gap that begins +// before ar.Start. +// +// - An error that is non-nil if vmas exist for only a subset of ar. 
+//
+// Preconditions: mm.mappingMu must be locked for reading; it may be
+// temporarily unlocked. ar.Length() != 0.
+func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar usermem.AddrRange, at usermem.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) {
+	if checkInvariants {
+		if !ar.WellFormed() || ar.Length() <= 0 {
+			panic(fmt.Sprintf("invalid ar: %v", ar))
+		}
+	}
+
+	// Inline mm.vmas.LowerBoundSegment so that we have the preceding gap if
+	// !vbegin.Ok().
+	vbegin, vgap := mm.vmas.Find(ar.Start)
+	if !vbegin.Ok() {
+		vbegin = vgap.NextSegment()
+		// vseg.Ok() is checked before entering the following loop.
+	} else {
+		vgap = vbegin.PrevGap()
+	}
+
+	addr := ar.Start
+	vseg := vbegin
+	for vseg.Ok() {
+		// Loop invariants: vgap = vseg.PrevGap(); addr < vseg.End().
+		vma := vseg.ValuePtr()
+		if addr < vseg.Start() {
+			// TODO(jamieliu): Implement vma.growsDown here.
+			return vbegin, vgap, syserror.EFAULT
+		}
+
+		perms := vma.effectivePerms
+		if ignorePermissions {
+			perms = vma.maxPerms
+		}
+		if !perms.SupersetOf(at) {
+			return vbegin, vgap, syserror.EPERM
+		}
+
+		addr = vseg.End()
+		vgap = vseg.NextGap()
+		if addr >= ar.End {
+			return vbegin, vgap, nil
+		}
+		vseg = vgap.NextSegment()
+	}
+
+	// Ran out of vmas before ar.End.
+	return vbegin, vgap, syserror.EFAULT
+}
+
+// getVecVMAsLocked ensures that vmas exist for all addresses in ars, and
+// support access of type (at, ignorePermissions). It returns the subset of
+// ars for which vmas exist. If this is not equal to ars, it returns a non-nil
+// error explaining why.
+//
+// Preconditions: mm.mappingMu must be locked for reading; it may be
+// temporarily unlocked.
+//
+// Postconditions: ars is not mutated.
+func (mm *MemoryManager) getVecVMAsLocked(ctx context.Context, ars usermem.AddrRangeSeq, at usermem.AccessType, ignorePermissions bool) (usermem.AddrRangeSeq, error) {
+	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
+		ar := arsit.Head()
+		if ar.Length() == 0 {
+			continue
+		}
+		if _, vend, err := mm.getVMAsLocked(ctx, ar, at, ignorePermissions); err != nil {
+			return truncatedAddrRangeSeq(ars, arsit, vend.Start()), err
+		}
+	}
+	return ars, nil
+}
+
+// vma extension will not shrink the number of unmapped bytes between the start
+// of a growsDown vma and the end of its predecessor non-growsDown vma below
+// guardBytes.
+//
+// guardBytes is equivalent to Linux's stack_guard_gap after upstream
+// 1be7107fbe18 "mm: larger stack guard gap, between vmas".
+const guardBytes = 256 * usermem.PageSize
+
+// unmapLocked unmaps all addresses in ar and returns the resulting gap in
+// mm.vmas.
+//
+// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0.
+// ar must be page-aligned.
+func (mm *MemoryManager) unmapLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator {
+	if checkInvariants {
+		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+			panic(fmt.Sprintf("invalid ar: %v", ar))
+		}
+	}
+
+	// AddressSpace mappings and pmas must be invalidated before
+	// mm.removeVMAsLocked() => memmap.Mappable.RemoveMapping().
+	mm.Invalidate(ar, memmap.InvalidateOpts{InvalidatePrivate: true})
+	return mm.removeVMAsLocked(ctx, ar)
+}
+
+// removeVMAsLocked removes vmas for addresses in ar and returns the resulting
+// gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients
+// must do so before calling removeVMAsLocked.
+//
+// Preconditions: mm.mappingMu must be locked for writing. ar.Length() != 0. ar
+// must be page-aligned.
+func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRange) vmaGapIterator {
+	if checkInvariants {
+		if !ar.WellFormed() || ar.Length() <= 0 || !ar.IsPageAligned() {
+			panic(fmt.Sprintf("invalid ar: %v", ar))
+		}
+	}
+
+	vseg, vgap := mm.vmas.Find(ar.Start)
+	if vgap.Ok() {
+		vseg = vgap.NextSegment()
+	}
+	for vseg.Ok() && vseg.Start() < ar.End {
+		vseg = mm.vmas.Isolate(vseg, ar)
+		vmaAR := vseg.Range()
+		vma := vseg.ValuePtr()
+		if vma.mappable != nil {
+			vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked())
+		}
+		if vma.id != nil {
+			vma.id.DecRef()
+		}
+		mm.usageAS -= uint64(vmaAR.Length())
+		if vma.isPrivateDataLocked() {
+			mm.dataAS -= uint64(vmaAR.Length())
+		}
+		if vma.mlockMode != memmap.MLockNone {
+			mm.lockedAS -= uint64(vmaAR.Length())
+		}
+		vgap = mm.vmas.Remove(vseg)
+		vseg = vgap.NextSegment()
+	}
+	return vgap
+}
+
+// canWriteMappableLocked returns true if it is possible for vma.mappable to be
+// written to via this vma, i.e. if it is possible that
+// vma.mappable.Translate(at.Write=true) may be called as a result of this vma.
+// This includes via I/O with usermem.IOOpts.IgnorePermissions = true, such as
+// PTRACE_POKEDATA.
+//
+// canWriteMappableLocked is equivalent to Linux's VM_SHARED.
+//
+// Preconditions: mm.mappingMu must be locked.
+func (vma *vma) canWriteMappableLocked() bool {
+	return !vma.private && vma.maxPerms.Write
+}
+
+// isPrivateDataLocked identifies data segments: private, writable, and not
+// stack (growsDown).
+//
+// Preconditions: mm.mappingMu must be locked.
+func (vma *vma) isPrivateDataLocked() bool {
+	return vma.realPerms.Write && vma.private && !vma.growsDown
+}
+
+// vmaSetFunctions implements segment.Functions for vmaSet.
+type vmaSetFunctions struct{}
+
+func (vmaSetFunctions) MinKey() usermem.Addr {
+	return 0
+}
+
+func (vmaSetFunctions) MaxKey() usermem.Addr {
+	return ^usermem.Addr(0)
+}
+
+func (vmaSetFunctions) ClearValue(vma *vma) {
+	vma.mappable = nil
+	vma.id = nil
+	vma.hint = ""
+}
+
+func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRange, vma2 vma) (vma, bool) {
+	if vma1.mappable != vma2.mappable ||
+		(vma1.mappable != nil && vma1.off+uint64(ar1.Length()) != vma2.off) ||
+		vma1.realPerms != vma2.realPerms ||
+		vma1.maxPerms != vma2.maxPerms ||
+		vma1.private != vma2.private ||
+		vma1.growsDown != vma2.growsDown ||
+		vma1.mlockMode != vma2.mlockMode ||
+		vma1.id != vma2.id ||
+		vma1.hint != vma2.hint {
+		return vma{}, false
+	}
+
+	if vma2.id != nil {
+		vma2.id.DecRef()
+	}
+	return vma1, true
+}
+
+func (vmaSetFunctions) Split(ar usermem.AddrRange, v vma, split usermem.Addr) (vma, vma) {
+	v2 := v
+	if v2.mappable != nil {
+		v2.off += uint64(split - ar.Start)
+	}
+	if v2.id != nil {
+		v2.id.IncRef()
+	}
+	return v, v2
+}
+
+// Preconditions: vseg.ValuePtr().mappable != nil. vseg.Range().Contains(addr).
+func (vseg vmaIterator) mappableOffsetAt(addr usermem.Addr) uint64 {
+	if checkInvariants {
+		if !vseg.Ok() {
+			panic("terminal vma iterator")
+		}
+		if vseg.ValuePtr().mappable == nil {
+			panic("Mappable offset is meaningless for anonymous vma")
+		}
+		if !vseg.Range().Contains(addr) {
+			panic(fmt.Sprintf("addr %v out of bounds %v", addr, vseg.Range()))
+		}
+	}
+
+	vma := vseg.ValuePtr()
+	vstart := vseg.Start()
+	return vma.off + uint64(addr-vstart)
+}
+
+// Preconditions: vseg.ValuePtr().mappable != nil.
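+//
+// For example (illustrative values): a vma spanning addresses
+// [0x400000, 0x402000) with off = 0x1000 has mappableRange [0x1000, 0x3000).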
+func (vseg vmaIterator) mappableRange() memmap.MappableRange { + return vseg.mappableRangeOf(vseg.Range()) +} + +// Preconditions: vseg.ValuePtr().mappable != nil. +// vseg.Range().IsSupersetOf(ar). ar.Length() != 0. +func (vseg vmaIterator) mappableRangeOf(ar usermem.AddrRange) memmap.MappableRange { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if vseg.ValuePtr().mappable == nil { + panic("MappableRange is meaningless for anonymous vma") + } + if !ar.WellFormed() || ar.Length() <= 0 { + panic(fmt.Sprintf("invalid ar: %v", ar)) + } + if !vseg.Range().IsSupersetOf(ar) { + panic(fmt.Sprintf("ar %v out of bounds %v", ar, vseg.Range())) + } + } + + vma := vseg.ValuePtr() + vstart := vseg.Start() + return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)} +} + +// Preconditions: vseg.ValuePtr().mappable != nil. +// vseg.mappableRange().IsSupersetOf(mr). mr.Length() != 0. +func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) usermem.AddrRange { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if vseg.ValuePtr().mappable == nil { + panic("MappableRange is meaningless for anonymous vma") + } + if !mr.WellFormed() || mr.Length() <= 0 { + panic(fmt.Sprintf("invalid mr: %v", mr)) + } + if !vseg.mappableRange().IsSupersetOf(mr) { + panic(fmt.Sprintf("mr %v out of bounds %v", mr, vseg.mappableRange())) + } + } + + vma := vseg.ValuePtr() + vstart := vseg.Start() + return usermem.AddrRange{vstart + usermem.Addr(mr.Start-vma.off), vstart + usermem.Addr(mr.End-vma.off)} +} + +// seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by +// scanning linearly forward from vseg. +// +// Preconditions: mm.mappingMu must be locked. addr >= vseg.Start(). +func (vseg vmaIterator) seekNextLowerBound(addr usermem.Addr) vmaIterator { + if checkInvariants { + if !vseg.Ok() { + panic("terminal vma iterator") + } + if addr < vseg.Start() { + panic(fmt.Sprintf("can't seek forward to %#x from %#x", addr, vseg.Start())) + } + } + for vseg.Ok() && addr >= vseg.End() { + vseg = vseg.NextSegment() + } + return vseg +} + +// availableRange returns the subset of vgap.Range() in which new vmas may be +// created without MMapOpts.Unmap == true. +func (vgap vmaGapIterator) availableRange() usermem.AddrRange { + ar := vgap.Range() + next := vgap.NextSegment() + if !next.Ok() || !next.ValuePtr().growsDown { + return ar + } + // Exclude guard pages. + if ar.Length() < guardBytes { + return usermem.AddrRange{ar.Start, ar.Start} + } + ar.End -= guardBytes + return ar +} diff --git a/pkg/sentry/mm/vma_set.go b/pkg/sentry/mm/vma_set.go new file mode 100755 index 000000000..c042fe606 --- /dev/null +++ b/pkg/sentry/mm/vma_set.go @@ -0,0 +1,1274 @@ +package mm + +import ( + __generics_imported0 "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +import ( + "bytes" + "fmt" +) + +const ( + // minDegree is the minimum degree of an internal node in a Set B-tree. + // + // - Any non-root node has at least minDegree-1 segments. + // + // - Any non-root internal (non-leaf) node has at least minDegree children. + // + // - The root node may have fewer than minDegree-1 segments, but it may + // only have 0 segments if the tree is empty. + // + // Our implementation requires minDegree >= 3. Higher values of minDegree + // usually improve performance, but increase memory usage for small sets. 
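+	//
+	// With minDegree = 8 (and thus maxDegree = 16), each node holds between
+	// minDegree-1 = 7 and maxDegree-1 = 15 segments, except the root, which
+	// may hold fewer.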
+ vmaminDegree = 8 + + vmamaxDegree = 2 * vmaminDegree +) + +// A Set is a mapping of segments with non-overlapping Range keys. The zero +// value for a Set is an empty set. Set values are not safely movable nor +// copyable. Set is thread-compatible. +// +// +stateify savable +type vmaSet struct { + root vmanode `state:".(*vmaSegmentDataSlices)"` +} + +// IsEmpty returns true if the set contains no segments. +func (s *vmaSet) IsEmpty() bool { + return s.root.nrSegments == 0 +} + +// IsEmptyRange returns true iff no segments in the set overlap the given +// range. This is semantically equivalent to s.SpanRange(r) == 0, but may be +// more efficient. +func (s *vmaSet) IsEmptyRange(r __generics_imported0.AddrRange) bool { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return true + } + _, gap := s.Find(r.Start) + if !gap.Ok() { + return false + } + return r.End <= gap.End() +} + +// Span returns the total size of all segments in the set. +func (s *vmaSet) Span() __generics_imported0.Addr { + var sz __generics_imported0.Addr + for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + sz += seg.Range().Length() + } + return sz +} + +// SpanRange returns the total size of the intersection of segments in the set +// with the given range. +func (s *vmaSet) SpanRange(r __generics_imported0.AddrRange) __generics_imported0.Addr { + switch { + case r.Length() < 0: + panic(fmt.Sprintf("invalid range %v", r)) + case r.Length() == 0: + return 0 + } + var sz __generics_imported0.Addr + for seg := s.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; seg = seg.NextSegment() { + sz += seg.Range().Intersect(r).Length() + } + return sz +} + +// FirstSegment returns the first segment in the set. If the set is empty, +// FirstSegment returns a terminal iterator. +func (s *vmaSet) FirstSegment() vmaIterator { + if s.root.nrSegments == 0 { + return vmaIterator{} + } + return s.root.firstSegment() +} + +// LastSegment returns the last segment in the set. If the set is empty, +// LastSegment returns a terminal iterator. +func (s *vmaSet) LastSegment() vmaIterator { + if s.root.nrSegments == 0 { + return vmaIterator{} + } + return s.root.lastSegment() +} + +// FirstGap returns the first gap in the set. +func (s *vmaSet) FirstGap() vmaGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[0] + } + return vmaGapIterator{n, 0} +} + +// LastGap returns the last gap in the set. +func (s *vmaSet) LastGap() vmaGapIterator { + n := &s.root + for n.hasChildren { + n = n.children[n.nrSegments] + } + return vmaGapIterator{n, n.nrSegments} +} + +// Find returns the segment or gap whose range contains the given key. If a +// segment is found, the returned Iterator is non-terminal and the +// returned GapIterator is terminal. Otherwise, the returned Iterator is +// terminal and the returned GapIterator is non-terminal. +func (s *vmaSet) Find(key __generics_imported0.Addr) (vmaIterator, vmaGapIterator) { + n := &s.root + for { + + lower := 0 + upper := n.nrSegments + for lower < upper { + i := lower + (upper-lower)/2 + if r := n.keys[i]; key < r.End { + if key >= r.Start { + return vmaIterator{n, i}, vmaGapIterator{} + } + upper = i + } else { + lower = i + 1 + } + } + i := lower + if !n.hasChildren { + return vmaIterator{}, vmaGapIterator{n, i} + } + n = n.children[i] + } +} + +// FindSegment returns the segment whose range contains the given key. If no +// such segment exists, FindSegment returns a terminal iterator. 
+func (s *vmaSet) FindSegment(key __generics_imported0.Addr) vmaIterator {
+	seg, _ := s.Find(key)
+	return seg
+}
+
+// LowerBoundSegment returns the segment with the lowest range that contains a
+// key greater than or equal to min. If no such segment exists,
+// LowerBoundSegment returns a terminal iterator.
+func (s *vmaSet) LowerBoundSegment(min __generics_imported0.Addr) vmaIterator {
+	seg, gap := s.Find(min)
+	if seg.Ok() {
+		return seg
+	}
+	return gap.NextSegment()
+}
+
+// UpperBoundSegment returns the segment with the highest range that contains a
+// key less than or equal to max. If no such segment exists, UpperBoundSegment
+// returns a terminal iterator.
+func (s *vmaSet) UpperBoundSegment(max __generics_imported0.Addr) vmaIterator {
+	seg, gap := s.Find(max)
+	if seg.Ok() {
+		return seg
+	}
+	return gap.PrevSegment()
+}
+
+// FindGap returns the gap containing the given key. If no such gap exists
+// (i.e. the set contains a segment containing that key), FindGap returns a
+// terminal iterator.
+func (s *vmaSet) FindGap(key __generics_imported0.Addr) vmaGapIterator {
+	_, gap := s.Find(key)
+	return gap
+}
+
+// LowerBoundGap returns the gap with the lowest range that is greater than or
+// equal to min.
+func (s *vmaSet) LowerBoundGap(min __generics_imported0.Addr) vmaGapIterator {
+	seg, gap := s.Find(min)
+	if gap.Ok() {
+		return gap
+	}
+	return seg.NextGap()
+}
+
+// UpperBoundGap returns the gap with the highest range that is less than or
+// equal to max.
+func (s *vmaSet) UpperBoundGap(max __generics_imported0.Addr) vmaGapIterator {
+	seg, gap := s.Find(max)
+	if gap.Ok() {
+		return gap
+	}
+	return seg.PrevGap()
+}
+
+// Add inserts the given segment into the set and returns true. If the new
+// segment can be merged with adjacent segments, Add will do so. If the new
+// segment would overlap an existing segment, Add returns false. If Add
+// succeeds, all existing iterators are invalidated.
+func (s *vmaSet) Add(r __generics_imported0.AddrRange, val vma) bool {
+	if r.Length() <= 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	gap := s.FindGap(r.Start)
+	if !gap.Ok() {
+		return false
+	}
+	if r.End > gap.End() {
+		return false
+	}
+	s.Insert(gap, r, val)
+	return true
+}
+
+// AddWithoutMerging inserts the given segment into the set and returns true.
+// If it would overlap an existing segment, AddWithoutMerging does nothing and
+// returns false. If AddWithoutMerging succeeds, all existing iterators are
+// invalidated.
+func (s *vmaSet) AddWithoutMerging(r __generics_imported0.AddrRange, val vma) bool {
+	if r.Length() <= 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	gap := s.FindGap(r.Start)
+	if !gap.Ok() {
+		return false
+	}
+	if r.End > gap.End() {
+		return false
+	}
+	s.InsertWithoutMergingUnchecked(gap, r, val)
+	return true
+}
+
+// Insert inserts the given segment into the given gap. If the new segment can
+// be merged with adjacent segments, Insert will do so. Insert returns an
+// iterator to the segment containing the inserted value (which may have been
+// merged with other values). All existing iterators (including gap, but not
+// including the returned iterator) are invalidated.
+//
+// If the gap cannot accommodate the segment, or if r is invalid, Insert panics.
+//
+// Insert is semantically equivalent to an InsertWithoutMerging followed by a
+// Merge, but may be more efficient.
+// Note that there is no unchecked variant of Insert since Insert must
+// retrieve and inspect gap's predecessor and successor segments regardless.
+func (s *vmaSet) Insert(gap vmaGapIterator, r __generics_imported0.AddrRange, val vma) vmaIterator {
+	if r.Length() <= 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	prev, next := gap.PrevSegment(), gap.NextSegment()
+	if prev.Ok() && prev.End() > r.Start {
+		panic(fmt.Sprintf("new segment %v overlaps predecessor %v", r, prev.Range()))
+	}
+	if next.Ok() && next.Start() < r.End {
+		panic(fmt.Sprintf("new segment %v overlaps successor %v", r, next.Range()))
+	}
+	if prev.Ok() && prev.End() == r.Start {
+		if mval, ok := (vmaSetFunctions{}).Merge(prev.Range(), prev.Value(), r, val); ok {
+			prev.SetEndUnchecked(r.End)
+			prev.SetValue(mval)
+			if next.Ok() && next.Start() == r.End {
+				val = mval
+				if mval, ok := (vmaSetFunctions{}).Merge(prev.Range(), val, next.Range(), next.Value()); ok {
+					prev.SetEndUnchecked(next.End())
+					prev.SetValue(mval)
+					return s.Remove(next).PrevSegment()
+				}
+			}
+			return prev
+		}
+	}
+	if next.Ok() && next.Start() == r.End {
+		if mval, ok := (vmaSetFunctions{}).Merge(r, val, next.Range(), next.Value()); ok {
+			next.SetStartUnchecked(r.Start)
+			next.SetValue(mval)
+			return next
+		}
+	}
+	return s.InsertWithoutMergingUnchecked(gap, r, val)
+}
+
+// InsertWithoutMerging inserts the given segment into the given gap and
+// returns an iterator to the inserted segment. All existing iterators
+// (including gap, but not including the returned iterator) are invalidated.
+//
+// If the gap cannot accommodate the segment, or if r is invalid,
+// InsertWithoutMerging panics.
+func (s *vmaSet) InsertWithoutMerging(gap vmaGapIterator, r __generics_imported0.AddrRange, val vma) vmaIterator {
+	if r.Length() <= 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	if gr := gap.Range(); !gr.IsSupersetOf(r) {
+		panic(fmt.Sprintf("cannot insert segment range %v into gap range %v", r, gr))
+	}
+	return s.InsertWithoutMergingUnchecked(gap, r, val)
+}
+
+// InsertWithoutMergingUnchecked inserts the given segment into the given gap
+// and returns an iterator to the inserted segment. All existing iterators
+// (including gap, but not including the returned iterator) are invalidated.
+//
+// Preconditions: r.Start >= gap.Start(); r.End <= gap.End().
+func (s *vmaSet) InsertWithoutMergingUnchecked(gap vmaGapIterator, r __generics_imported0.AddrRange, val vma) vmaIterator {
+	gap = gap.node.rebalanceBeforeInsert(gap)
+	copy(gap.node.keys[gap.index+1:], gap.node.keys[gap.index:gap.node.nrSegments])
+	copy(gap.node.values[gap.index+1:], gap.node.values[gap.index:gap.node.nrSegments])
+	gap.node.keys[gap.index] = r
+	gap.node.values[gap.index] = val
+	gap.node.nrSegments++
+	return vmaIterator{gap.node, gap.index}
+}
+
+// Remove removes the given segment and returns an iterator to the vacated gap.
+// All existing iterators (including seg, but not including the returned
+// iterator) are invalidated.
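Insert's merge logic considers at most three combinations: predecessor + new segment, then (if that succeeded) the result + successor, and otherwise new segment + successor. A self-contained toy of the same decision tree, using value equality as a stand-in for vmaSetFunctions.Merge; every name here is hypothetical:

```go
package main

import "fmt"

type seg struct {
	start, end uint64
	val        string
}

// canMerge plays the role of Functions.Merge: adjacent segments merge only
// if their values agree.
func canMerge(a, b seg) bool { return a.end == b.start && a.val == b.val }

// insert places n between prev and next (either may be nil), merging where
// possible, and returns the resulting run of segments.
func insert(prev, next *seg, n seg) []seg {
	if prev != nil && canMerge(*prev, n) {
		n = seg{prev.start, n.end, n.val} // absorb the predecessor
		prev = nil
	}
	if next != nil && canMerge(n, *next) {
		n = seg{n.start, next.end, n.val} // absorb the successor
		next = nil
	}
	var out []seg
	if prev != nil {
		out = append(out, *prev)
	}
	out = append(out, n)
	if next != nil {
		out = append(out, *next)
	}
	return out
}

func main() {
	a, b := seg{0, 10, "rw"}, seg{20, 30, "rw"}
	fmt.Println(insert(&a, &b, seg{10, 20, "rw"})) // one merged segment [0, 30)
	fmt.Println(insert(&a, &b, seg{10, 20, "ro"})) // three segments
}
```

In the vma instantiation, Merge succeeds only when the two vmas' properties are compatible, which is what keeps adjacent, identical mappings represented as a single vma.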
+func (s *vmaSet) Remove(seg vmaIterator) vmaGapIterator {
+
+	if seg.node.hasChildren {
+
+		victim := seg.PrevSegment()
+
+		seg.SetRangeUnchecked(victim.Range())
+		seg.SetValue(victim.Value())
+		return s.Remove(victim).NextGap()
+	}
+	copy(seg.node.keys[seg.index:], seg.node.keys[seg.index+1:seg.node.nrSegments])
+	copy(seg.node.values[seg.index:], seg.node.values[seg.index+1:seg.node.nrSegments])
+	vmaSetFunctions{}.ClearValue(&seg.node.values[seg.node.nrSegments-1])
+	seg.node.nrSegments--
+	return seg.node.rebalanceAfterRemove(vmaGapIterator{seg.node, seg.index})
+}
+
+// RemoveAll removes all segments from the set. All existing iterators are
+// invalidated.
+func (s *vmaSet) RemoveAll() {
+	s.root = vmanode{}
+}
+
+// RemoveRange removes all segments in the given range. An iterator to the
+// newly formed gap is returned, and all existing iterators are invalidated.
+func (s *vmaSet) RemoveRange(r __generics_imported0.AddrRange) vmaGapIterator {
+	seg, gap := s.Find(r.Start)
+	if seg.Ok() {
+		seg = s.Isolate(seg, r)
+		gap = s.Remove(seg)
+	}
+	for seg = gap.NextSegment(); seg.Ok() && seg.Start() < r.End; seg = gap.NextSegment() {
+		seg = s.Isolate(seg, r)
+		gap = s.Remove(seg)
+	}
+	return gap
+}
+
+// Merge attempts to merge two neighboring segments. If successful, Merge
+// returns an iterator to the merged segment, and all existing iterators are
+// invalidated. Otherwise, Merge returns a terminal iterator.
+//
+// If first is not the predecessor of second, Merge panics.
+func (s *vmaSet) Merge(first, second vmaIterator) vmaIterator {
+	if first.NextSegment() != second {
+		panic(fmt.Sprintf("attempt to merge non-neighboring segments %v, %v", first.Range(), second.Range()))
+	}
+	return s.MergeUnchecked(first, second)
+}
+
+// MergeUnchecked attempts to merge two neighboring segments. If successful,
+// MergeUnchecked returns an iterator to the merged segment, and all existing
+// iterators are invalidated. Otherwise, MergeUnchecked returns a terminal
+// iterator.
+//
+// Precondition: first is the predecessor of second: first.NextSegment() ==
+// second, first == second.PrevSegment().
+func (s *vmaSet) MergeUnchecked(first, second vmaIterator) vmaIterator {
+	if first.End() == second.Start() {
+		if mval, ok := (vmaSetFunctions{}).Merge(first.Range(), first.Value(), second.Range(), second.Value()); ok {
+
+			first.SetEndUnchecked(second.End())
+			first.SetValue(mval)
+			return s.Remove(second).PrevSegment()
+		}
+	}
+	return vmaIterator{}
+}
+
+// MergeAll attempts to merge all adjacent segments in the set. All existing
+// iterators are invalidated.
+func (s *vmaSet) MergeAll() {
+	seg := s.FirstSegment()
+	if !seg.Ok() {
+		return
+	}
+	next := seg.NextSegment()
+	for next.Ok() {
+		if mseg := s.MergeUnchecked(seg, next); mseg.Ok() {
+			seg, next = mseg, mseg.NextSegment()
+		} else {
+			seg, next = next, next.NextSegment()
+		}
+	}
+}
+
+// MergeRange attempts to merge all adjacent segments that contain a key in
+// the specified range. All existing iterators are invalidated.
+func (s *vmaSet) MergeRange(r __generics_imported0.AddrRange) {
+	seg := s.LowerBoundSegment(r.Start)
+	if !seg.Ok() {
+		return
+	}
+	next := seg.NextSegment()
+	for next.Ok() && next.Range().Start < r.End {
+		if mseg := s.MergeUnchecked(seg, next); mseg.Ok() {
+			seg, next = mseg, mseg.NextSegment()
+		} else {
+			seg, next = next, next.NextSegment()
+		}
+	}
+}
+
+// MergeAdjacent attempts to merge the segment containing r.Start with its
+// predecessor, and the segment containing r.End-1 with its successor.
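MergeAll's traversal reduces to a single forward pass: try to merge the current pair, and continue from whichever segment survived. The same pass, linearized over a slice instead of in-place iterators (toy types, value-equality merge rule; nothing here is generated code):

```go
package main

import "fmt"

type seg struct {
	start, end uint64
	val        string
}

// mergeAll folds each segment into its predecessor when they touch and
// their values agree, mirroring MergeAll's "merge or advance" walk.
func mergeAll(segs []seg) []seg {
	var out []seg
	for _, s := range segs {
		if n := len(out); n > 0 && out[n-1].end == s.start && out[n-1].val == s.val {
			out[n-1].end = s.end // merged into the predecessor
			continue
		}
		out = append(out, s)
	}
	return out
}

func main() {
	fmt.Println(mergeAll([]seg{{0, 1, "a"}, {1, 2, "a"}, {2, 3, "b"}}))
	// Output: [{0 2 a} {2 3 b}]
}
```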
+func (s *vmaSet) MergeAdjacent(r __generics_imported0.AddrRange) {
+	first := s.FindSegment(r.Start)
+	if first.Ok() {
+		if prev := first.PrevSegment(); prev.Ok() {
+			s.Merge(prev, first)
+		}
+	}
+	last := s.FindSegment(r.End - 1)
+	if last.Ok() {
+		if next := last.NextSegment(); next.Ok() {
+			s.Merge(last, next)
+		}
+	}
+}
+
+// Split splits the given segment at the given key and returns iterators to the
+// two resulting segments. All existing iterators (including seg, but not
+// including the returned iterators) are invalidated.
+//
+// If the segment cannot be split at split (because split is at the start or
+// end of the segment's range, so splitting would produce a segment with zero
+// length, or because split falls outside the segment's range altogether),
+// Split panics.
+func (s *vmaSet) Split(seg vmaIterator, split __generics_imported0.Addr) (vmaIterator, vmaIterator) {
+	if !seg.Range().CanSplitAt(split) {
+		panic(fmt.Sprintf("can't split %v at %v", seg.Range(), split))
+	}
+	return s.SplitUnchecked(seg, split)
+}
+
+// SplitUnchecked splits the given segment at the given key and returns
+// iterators to the two resulting segments. All existing iterators (including
+// seg, but not including the returned iterators) are invalidated.
+//
+// Preconditions: seg.Start() < key < seg.End().
+func (s *vmaSet) SplitUnchecked(seg vmaIterator, split __generics_imported0.Addr) (vmaIterator, vmaIterator) {
+	val1, val2 := (vmaSetFunctions{}).Split(seg.Range(), seg.Value(), split)
+	end2 := seg.End()
+	seg.SetEndUnchecked(split)
+	seg.SetValue(val1)
+	seg2 := s.InsertWithoutMergingUnchecked(seg.NextGap(), __generics_imported0.AddrRange{split, end2}, val2)
+
+	return seg2.PrevSegment(), seg2
+}
+
+// SplitAt splits the segment straddling split, if one exists. SplitAt returns
+// true if a segment was split and false otherwise. If SplitAt splits a
+// segment, all existing iterators are invalidated.
+func (s *vmaSet) SplitAt(split __generics_imported0.Addr) bool {
+	if seg := s.FindSegment(split); seg.Ok() && seg.Range().CanSplitAt(split) {
+		s.SplitUnchecked(seg, split)
+		return true
+	}
+	return false
+}
+
+// Isolate ensures that the given segment's range does not escape r by
+// splitting at r.Start and r.End if necessary, and returns an updated iterator
+// to the bounded segment. All existing iterators (including seg, but not
+// including the returned iterators) are invalidated.
+func (s *vmaSet) Isolate(seg vmaIterator, r __generics_imported0.AddrRange) vmaIterator {
+	if seg.Range().CanSplitAt(r.Start) {
+		_, seg = s.SplitUnchecked(seg, r.Start)
+	}
+	if seg.Range().CanSplitAt(r.End) {
+		seg, _ = s.SplitUnchecked(seg, r.End)
+	}
+	return seg
+}
+
+// ApplyContiguous applies a function to a contiguous range of segments,
+// splitting if necessary. The function is applied until the first gap is
+// encountered, at which point the gap is returned. If the function is applied
+// across the entire range, a terminal gap is returned. All existing iterators
+// are invalidated.
+//
+// N.B. The Iterator must not be invalidated by the function.
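Isolate is the workhorse behind RemoveRange and ApplyContiguous: it clips one segment to a range with at most two splits. A self-contained sketch of the same clipping over a single range (all names hypothetical):

```go
package main

import "fmt"

type rng struct{ start, end uint64 }

// isolate returns the pieces of s after splitting at r.start and r.end where
// those keys fall strictly inside s, plus the index of the piece that lies
// within r — the piece Isolate would return an iterator to.
func isolate(s, r rng) (pieces []rng, inner int) {
	if s.start < r.start && r.start < s.end {
		pieces = append(pieces, rng{s.start, r.start})
		s.start = r.start
	}
	if s.start < r.end && r.end < s.end {
		pieces = append(pieces, rng{s.start, r.end})
		inner = len(pieces) - 1
		return append(pieces, rng{r.end, s.end}), inner
	}
	return append(pieces, s), len(pieces) - 1
}

func main() {
	fmt.Println(isolate(rng{0, 100}, rng{25, 75}))
	// Output: [{0 25} {25 75} {75 100}] 1
}
```

This is, for example, why an munmap of the middle of a large mapping leaves two smaller vmas behind.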
+func (s *vmaSet) ApplyContiguous(r __generics_imported0.AddrRange, fn func(seg vmaIterator)) vmaGapIterator {
+	seg, gap := s.Find(r.Start)
+	if !seg.Ok() {
+		return gap
+	}
+	for {
+		seg = s.Isolate(seg, r)
+		fn(seg)
+		if seg.End() >= r.End {
+			return vmaGapIterator{}
+		}
+		gap = seg.NextGap()
+		if !gap.IsEmpty() {
+			return gap
+		}
+		seg = gap.NextSegment()
+		if !seg.Ok() {
+
+			return vmaGapIterator{}
+		}
+	}
+}
+
+// +stateify savable
+type vmanode struct {
+	// An internal binary tree node looks like:
+	//
+	//   K
+	//  / \
+	// Cl Cr
+	//
+	// where all keys in the subtree rooted by Cl (the left subtree) are less
+	// than K (the key of the parent node), and all keys in the subtree rooted
+	// by Cr (the right subtree) are greater than K.
+	//
+	// An internal B-tree node's indexes work out to look like:
+	//
+	//   K0 K1 K2  ...   Kn-1
+	//  / \/ \/ \  ...  /  \
+	// C0 C1 C2 C3 ... Cn-1 Cn
+	//
+	// where n is nrSegments.
+	nrSegments int
+
+	// parent is a pointer to this node's parent. If this node is root, parent
+	// is nil.
+	parent *vmanode
+
+	// parentIndex is the index of this node in parent.children.
+	parentIndex int
+
+	// Flag for internal nodes that is technically redundant with "children[0]
+	// != nil", but is stored in the first cache line. "hasChildren" rather
+	// than "isLeaf" because false must be the correct value for an empty root.
+	hasChildren bool
+
+	// Nodes store keys and values in separate arrays to maximize locality in
+	// the common case (scanning keys for lookup).
+	keys     [vmamaxDegree - 1]__generics_imported0.AddrRange
+	values   [vmamaxDegree - 1]vma
+	children [vmamaxDegree]*vmanode
+}
+
+// firstSegment returns the first segment in the subtree rooted by n.
+//
+// Preconditions: n.nrSegments != 0.
+func (n *vmanode) firstSegment() vmaIterator {
+	for n.hasChildren {
+		n = n.children[0]
+	}
+	return vmaIterator{n, 0}
+}
+
+// lastSegment returns the last segment in the subtree rooted by n.
+//
+// Preconditions: n.nrSegments != 0.
+func (n *vmanode) lastSegment() vmaIterator {
+	for n.hasChildren {
+		n = n.children[n.nrSegments]
+	}
+	return vmaIterator{n, n.nrSegments - 1}
+}
+
+func (n *vmanode) prevSibling() *vmanode {
+	if n.parent == nil || n.parentIndex == 0 {
+		return nil
+	}
+	return n.parent.children[n.parentIndex-1]
+}
+
+func (n *vmanode) nextSibling() *vmanode {
+	if n.parent == nil || n.parentIndex == n.parent.nrSegments {
+		return nil
+	}
+	return n.parent.children[n.parentIndex+1]
+}
+
+// rebalanceBeforeInsert splits n and its ancestors if they are full, as
+// required for insertion, and returns an updated iterator to the position
+// represented by gap.
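The node-size invariants follow directly from the two constants at the top of the file; this snippet just evaluates the arithmetic (none of it is part of the generated code):

```go
package main

import "fmt"

func main() {
	const minDegree = 8             // vmaminDegree above
	const maxDegree = 2 * minDegree // vmamaxDegree = 16
	// Every node except the root holds between minDegree-1 and maxDegree-1
	// segments, and an internal node always has nrSegments+1 children.
	fmt.Printf("segments per node: %d..%d\n", minDegree-1, maxDegree-1) // 7..15
	fmt.Printf("children per internal node: %d..%d\n", minDegree, maxDegree)
}
```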
+func (n *vmanode) rebalanceBeforeInsert(gap vmaGapIterator) vmaGapIterator {
+	if n.parent != nil {
+		gap = n.parent.rebalanceBeforeInsert(gap)
+	}
+	if n.nrSegments < vmamaxDegree-1 {
+		return gap
+	}
+	if n.parent == nil {
+
+		left := &vmanode{
+			nrSegments:  vmaminDegree - 1,
+			parent:      n,
+			parentIndex: 0,
+			hasChildren: n.hasChildren,
+		}
+		right := &vmanode{
+			nrSegments:  vmaminDegree - 1,
+			parent:      n,
+			parentIndex: 1,
+			hasChildren: n.hasChildren,
+		}
+		copy(left.keys[:vmaminDegree-1], n.keys[:vmaminDegree-1])
+		copy(left.values[:vmaminDegree-1], n.values[:vmaminDegree-1])
+		copy(right.keys[:vmaminDegree-1], n.keys[vmaminDegree:])
+		copy(right.values[:vmaminDegree-1], n.values[vmaminDegree:])
+		n.keys[0], n.values[0] = n.keys[vmaminDegree-1], n.values[vmaminDegree-1]
+		vmazeroValueSlice(n.values[1:])
+		if n.hasChildren {
+			copy(left.children[:vmaminDegree], n.children[:vmaminDegree])
+			copy(right.children[:vmaminDegree], n.children[vmaminDegree:])
+			vmazeroNodeSlice(n.children[2:])
+			for i := 0; i < vmaminDegree; i++ {
+				left.children[i].parent = left
+				left.children[i].parentIndex = i
+				right.children[i].parent = right
+				right.children[i].parentIndex = i
+			}
+		}
+		n.nrSegments = 1
+		n.hasChildren = true
+		n.children[0] = left
+		n.children[1] = right
+		if gap.node != n {
+			return gap
+		}
+		if gap.index < vmaminDegree {
+			return vmaGapIterator{left, gap.index}
+		}
+		return vmaGapIterator{right, gap.index - vmaminDegree}
+	}
+
+	copy(n.parent.keys[n.parentIndex+1:], n.parent.keys[n.parentIndex:n.parent.nrSegments])
+	copy(n.parent.values[n.parentIndex+1:], n.parent.values[n.parentIndex:n.parent.nrSegments])
+	n.parent.keys[n.parentIndex], n.parent.values[n.parentIndex] = n.keys[vmaminDegree-1], n.values[vmaminDegree-1]
+	copy(n.parent.children[n.parentIndex+2:], n.parent.children[n.parentIndex+1:n.parent.nrSegments+1])
+	for i := n.parentIndex + 2; i < n.parent.nrSegments+2; i++ {
+		n.parent.children[i].parentIndex = i
+	}
+	sibling := &vmanode{
+		nrSegments:  vmaminDegree - 1,
+		parent:      n.parent,
+		parentIndex: n.parentIndex + 1,
+		hasChildren: n.hasChildren,
+	}
+	n.parent.children[n.parentIndex+1] = sibling
+	n.parent.nrSegments++
+	copy(sibling.keys[:vmaminDegree-1], n.keys[vmaminDegree:])
+	copy(sibling.values[:vmaminDegree-1], n.values[vmaminDegree:])
+	vmazeroValueSlice(n.values[vmaminDegree-1:])
+	if n.hasChildren {
+		copy(sibling.children[:vmaminDegree], n.children[vmaminDegree:])
+		vmazeroNodeSlice(n.children[vmaminDegree:])
+		for i := 0; i < vmaminDegree; i++ {
+			sibling.children[i].parent = sibling
+			sibling.children[i].parentIndex = i
+		}
+	}
+	n.nrSegments = vmaminDegree - 1
+
+	if gap.node != n {
+		return gap
+	}
+	if gap.index < vmaminDegree {
+		return gap
+	}
+	return vmaGapIterator{sibling, gap.index - vmaminDegree}
+}
+
+// rebalanceAfterRemove "unsplits" n and its ancestors if they are deficient
+// (contain fewer segments than required by B-tree invariants), as required for
+// removal, and returns an updated iterator to the position represented by gap.
+//
+// Precondition: n is the only node in the tree that may currently violate a
+// B-tree invariant.
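Both branches above split a full node's 15 segments as 7 to one side, 7 to the other, with the median promoted (to a fresh root or into the existing parent). Stripped of children and parent bookkeeping, the key movement is just the following; splitFull is a hypothetical helper, not generated code:

```go
package main

import "fmt"

// splitFull distributes a full node's keys: keys[:minDegree-1] go left,
// keys[minDegree:] go right, and keys[minDegree-1] is promoted.
func splitFull(keys []uint64, minDegree int) (left, right []uint64, median uint64) {
	left = append([]uint64(nil), keys[:minDegree-1]...)
	right = append([]uint64(nil), keys[minDegree:]...)
	return left, right, keys[minDegree-1]
}

func main() {
	keys := make([]uint64, 15) // a full node when maxDegree = 16
	for i := range keys {
		keys[i] = uint64(i)
	}
	l, r, m := splitFull(keys, 8)
	fmt.Println(len(l), m, len(r)) // Output: 7 7 7
}
```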
+func (n *vmanode) rebalanceAfterRemove(gap vmaGapIterator) vmaGapIterator {
+	for {
+		if n.nrSegments >= vmaminDegree-1 {
+			return gap
+		}
+		if n.parent == nil {
+
+			return gap
+		}
+
+		if sibling := n.prevSibling(); sibling != nil && sibling.nrSegments >= vmaminDegree {
+			copy(n.keys[1:], n.keys[:n.nrSegments])
+			copy(n.values[1:], n.values[:n.nrSegments])
+			n.keys[0] = n.parent.keys[n.parentIndex-1]
+			n.values[0] = n.parent.values[n.parentIndex-1]
+			n.parent.keys[n.parentIndex-1] = sibling.keys[sibling.nrSegments-1]
+			n.parent.values[n.parentIndex-1] = sibling.values[sibling.nrSegments-1]
+			vmaSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
+			if n.hasChildren {
+				copy(n.children[1:], n.children[:n.nrSegments+1])
+				n.children[0] = sibling.children[sibling.nrSegments]
+				sibling.children[sibling.nrSegments] = nil
+				n.children[0].parent = n
+				n.children[0].parentIndex = 0
+				for i := 1; i < n.nrSegments+2; i++ {
+					n.children[i].parentIndex = i
+				}
+			}
+			n.nrSegments++
+			sibling.nrSegments--
+			if gap.node == sibling && gap.index == sibling.nrSegments {
+				return vmaGapIterator{n, 0}
+			}
+			if gap.node == n {
+				return vmaGapIterator{n, gap.index + 1}
+			}
+			return gap
+		}
+		if sibling := n.nextSibling(); sibling != nil && sibling.nrSegments >= vmaminDegree {
+			n.keys[n.nrSegments] = n.parent.keys[n.parentIndex]
+			n.values[n.nrSegments] = n.parent.values[n.parentIndex]
+			n.parent.keys[n.parentIndex] = sibling.keys[0]
+			n.parent.values[n.parentIndex] = sibling.values[0]
+			copy(sibling.keys[:sibling.nrSegments-1], sibling.keys[1:])
+			copy(sibling.values[:sibling.nrSegments-1], sibling.values[1:])
+			vmaSetFunctions{}.ClearValue(&sibling.values[sibling.nrSegments-1])
+			if n.hasChildren {
+				n.children[n.nrSegments+1] = sibling.children[0]
+				copy(sibling.children[:sibling.nrSegments], sibling.children[1:])
+				sibling.children[sibling.nrSegments] = nil
+				n.children[n.nrSegments+1].parent = n
+				n.children[n.nrSegments+1].parentIndex = n.nrSegments + 1
+				for i := 0; i < sibling.nrSegments; i++ {
+					sibling.children[i].parentIndex = i
+				}
+			}
+			n.nrSegments++
+			sibling.nrSegments--
+			if gap.node == sibling {
+				if gap.index == 0 {
+					return vmaGapIterator{n, n.nrSegments}
+				}
+				return vmaGapIterator{sibling, gap.index - 1}
+			}
+			return gap
+		}
+
+		p := n.parent
+		if p.nrSegments == 1 {
+
+			left, right := p.children[0], p.children[1]
+			p.nrSegments = left.nrSegments + right.nrSegments + 1
+			p.hasChildren = left.hasChildren
+			p.keys[left.nrSegments] = p.keys[0]
+			p.values[left.nrSegments] = p.values[0]
+			copy(p.keys[:left.nrSegments], left.keys[:left.nrSegments])
+			copy(p.values[:left.nrSegments], left.values[:left.nrSegments])
+			copy(p.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
+			copy(p.values[left.nrSegments+1:], right.values[:right.nrSegments])
+			if left.hasChildren {
+				copy(p.children[:left.nrSegments+1], left.children[:left.nrSegments+1])
+				copy(p.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
+				for i := 0; i < p.nrSegments+1; i++ {
+					p.children[i].parent = p
+					p.children[i].parentIndex = i
+				}
+			} else {
+				p.children[0] = nil
+				p.children[1] = nil
+			}
+			if gap.node == left {
+				return vmaGapIterator{p, gap.index}
+			}
+			if gap.node == right {
+				return vmaGapIterator{p, gap.index + left.nrSegments + 1}
+			}
+			return gap
+		}
+		// Merge n and either sibling, along with the segment separating the
+		// two, into whichever of the two nodes comes first. This is the
+		// reverse of the non-root splitting case in
+		// node.rebalanceBeforeInsert.
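The borrow-from-predecessor case above is a rotation through the parent: the separator key drops into the deficient node and the sibling's last key replaces it as the separator. The same rotation on bare key slices — a sketch only; the real code also moves values and children:

```go
package main

import "fmt"

// borrowFromLeft rotates one key from a rich left sibling into a deficient
// node n, through the parent's separator key.
func borrowFromLeft(parentKey uint64, sibling, n []uint64) (newParentKey uint64, newSibling, newN []uint64) {
	newN = append([]uint64{parentKey}, n...) // separator moves down into n
	newParentKey = sibling[len(sibling)-1]   // sibling's last key moves up
	newSibling = sibling[:len(sibling)-1]
	return
}

func main() {
	pk, sib, n := borrowFromLeft(50, []uint64{10, 20, 30}, []uint64{60})
	fmt.Println(pk, sib, n) // Output: 30 [10 20] [50 60]
}
```

Borrowing is preferred over merging because it restores the invariant locally, without shrinking the parent and risking a cascade up the tree.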
+		var left, right *vmanode
+		if n.parentIndex > 0 {
+			left = n.prevSibling()
+			right = n
+		} else {
+			left = n
+			right = n.nextSibling()
+		}
+
+		if gap.node == right {
+			gap = vmaGapIterator{left, gap.index + left.nrSegments + 1}
+		}
+		left.keys[left.nrSegments] = p.keys[left.parentIndex]
+		left.values[left.nrSegments] = p.values[left.parentIndex]
+		copy(left.keys[left.nrSegments+1:], right.keys[:right.nrSegments])
+		copy(left.values[left.nrSegments+1:], right.values[:right.nrSegments])
+		if left.hasChildren {
+			copy(left.children[left.nrSegments+1:], right.children[:right.nrSegments+1])
+			for i := left.nrSegments + 1; i < left.nrSegments+right.nrSegments+2; i++ {
+				left.children[i].parent = left
+				left.children[i].parentIndex = i
+			}
+		}
+		left.nrSegments += right.nrSegments + 1
+		copy(p.keys[left.parentIndex:], p.keys[left.parentIndex+1:p.nrSegments])
+		copy(p.values[left.parentIndex:], p.values[left.parentIndex+1:p.nrSegments])
+		vmaSetFunctions{}.ClearValue(&p.values[p.nrSegments-1])
+		copy(p.children[left.parentIndex+1:], p.children[left.parentIndex+2:p.nrSegments+1])
+		for i := 0; i < p.nrSegments; i++ {
+			p.children[i].parentIndex = i
+		}
+		p.children[p.nrSegments] = nil
+		p.nrSegments--
+
+		n = p
+	}
+}
+
+// An Iterator is conceptually one of:
+//
+// - A pointer to a segment in a set; or
+//
+// - A terminal iterator, which is a sentinel indicating that the end of
+// iteration has been reached.
+//
+// Iterators are copyable values and are meaningfully equality-comparable. The
+// zero value of Iterator is a terminal iterator.
+//
+// Unless otherwise specified, any mutation of a set invalidates all existing
+// iterators into the set.
+type vmaIterator struct {
+	// node is the node containing the iterated segment. If the iterator is
+	// terminal, node is nil.
+	node *vmanode
+
+	// index is the index of the segment in node.keys/values.
+	index int
+}
+
+// Ok returns true if the iterator is not terminal. All other methods are only
+// valid for non-terminal iterators.
+func (seg vmaIterator) Ok() bool {
+	return seg.node != nil
+}
+
+// Range returns the iterated segment's range key.
+func (seg vmaIterator) Range() __generics_imported0.AddrRange {
+	return seg.node.keys[seg.index]
+}
+
+// Start is equivalent to Range().Start, but should be preferred if only the
+// start of the range is needed.
+func (seg vmaIterator) Start() __generics_imported0.Addr {
+	return seg.node.keys[seg.index].Start
+}
+
+// End is equivalent to Range().End, but should be preferred if only the end of
+// the range is needed.
+func (seg vmaIterator) End() __generics_imported0.Addr {
+	return seg.node.keys[seg.index].End
+}
+
+// SetRangeUnchecked mutates the iterated segment's range key. This operation
+// does not invalidate any iterators.
+//
+// Preconditions:
+//
+// - r.Length() > 0.
+//
+// - The new range must not overlap an existing one: If seg.NextSegment().Ok(),
+// then r.End <= seg.NextSegment().Start(); if seg.PrevSegment().Ok(), then
+// r.Start >= seg.PrevSegment().End().
+func (seg vmaIterator) SetRangeUnchecked(r __generics_imported0.AddrRange) {
+	seg.node.keys[seg.index] = r
+}
+
+// SetRange mutates the iterated segment's range key. If the new range would
+// cause the iterated segment to overlap another segment, or if the new range
+// is invalid, SetRange panics. This operation does not invalidate any
+// iterators.
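Since iterators are plain values, the idiomatic traversal is the three-clause for loop already used by Span above. A hypothetical helper in the same package (not part of the generated file), counting segments:

```go
// countSegments shows the standard iteration pattern over the documented
// API: advance a value-type iterator until it becomes terminal.
func countSegments(s *vmaSet) int {
	n := 0
	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
		n++
	}
	return n
}
```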
+func (seg vmaIterator) SetRange(r __generics_imported0.AddrRange) {
+	if r.Length() <= 0 {
+		panic(fmt.Sprintf("invalid segment range %v", r))
+	}
+	if prev := seg.PrevSegment(); prev.Ok() && r.Start < prev.End() {
+		panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, prev.Range()))
+	}
+	if next := seg.NextSegment(); next.Ok() && r.End > next.Start() {
+		panic(fmt.Sprintf("new segment range %v overlaps segment range %v", r, next.Range()))
+	}
+	seg.SetRangeUnchecked(r)
+}
+
+// SetStartUnchecked mutates the iterated segment's start. This operation does
+// not invalidate any iterators.
+//
+// Preconditions: The new start must be valid: start < seg.End(); if
+// seg.PrevSegment().Ok(), then start >= seg.PrevSegment().End().
+func (seg vmaIterator) SetStartUnchecked(start __generics_imported0.Addr) {
+	seg.node.keys[seg.index].Start = start
+}
+
+// SetStart mutates the iterated segment's start. If the new start value would
+// cause the iterated segment to overlap another segment, or would result in an
+// invalid range, SetStart panics. This operation does not invalidate any
+// iterators.
+func (seg vmaIterator) SetStart(start __generics_imported0.Addr) {
+	if start >= seg.End() {
+		panic(fmt.Sprintf("new start %v would invalidate segment range %v", start, seg.Range()))
+	}
+	if prev := seg.PrevSegment(); prev.Ok() && start < prev.End() {
+		panic(fmt.Sprintf("new start %v would cause segment range %v to overlap segment range %v", start, seg.Range(), prev.Range()))
+	}
+	seg.SetStartUnchecked(start)
+}
+
+// SetEndUnchecked mutates the iterated segment's end. This operation does not
+// invalidate any iterators.
+//
+// Preconditions: The new end must be valid: end > seg.Start(); if
+// seg.NextSegment().Ok(), then end <= seg.NextSegment().Start().
+func (seg vmaIterator) SetEndUnchecked(end __generics_imported0.Addr) {
+	seg.node.keys[seg.index].End = end
+}
+
+// SetEnd mutates the iterated segment's end. If the new end value would cause
+// the iterated segment to overlap another segment, or would result in an
+// invalid range, SetEnd panics. This operation does not invalidate any
+// iterators.
+func (seg vmaIterator) SetEnd(end __generics_imported0.Addr) {
+	if end <= seg.Start() {
+		panic(fmt.Sprintf("new end %v would invalidate segment range %v", end, seg.Range()))
+	}
+	if next := seg.NextSegment(); next.Ok() && end > next.Start() {
+		panic(fmt.Sprintf("new end %v would cause segment range %v to overlap segment range %v", end, seg.Range(), next.Range()))
+	}
+	seg.SetEndUnchecked(end)
+}
+
+// Value returns a copy of the iterated segment's value.
+func (seg vmaIterator) Value() vma {
+	return seg.node.values[seg.index]
+}
+
+// ValuePtr returns a pointer to the iterated segment's value. The pointer is
+// invalidated if the iterator is invalidated. This operation does not
+// invalidate any iterators.
+func (seg vmaIterator) ValuePtr() *vma {
+	return &seg.node.values[seg.index]
+}
+
+// SetValue mutates the iterated segment's value. This operation does not
+// invalidate any iterators.
+func (seg vmaIterator) SetValue(val vma) {
+	seg.node.values[seg.index] = val
+}
+
+// PrevSegment returns the iterated segment's predecessor. If there is no
+// preceding segment, PrevSegment returns a terminal iterator.
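The checked mutators validate exactly the neighbor the move could collide with: SetStart checks only the predecessor, SetEnd only the successor. A hypothetical in-package sketch that grows a segment leftward over free space, shown purely to illustrate the mutator contract (real vma code would also have to update the MemoryManager's usage accounting):

```go
// fillGapBefore extends seg to cover the gap before it, if that gap is
// non-empty. SetStartUnchecked's preconditions hold because gap.Start() is
// either the predecessor's End() or the minimum key.
func fillGapBefore(seg vmaIterator) {
	if gap := seg.PrevGap(); !gap.IsEmpty() {
		seg.SetStartUnchecked(gap.Start())
	}
}
```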
+func (seg vmaIterator) PrevSegment() vmaIterator {
+	if seg.node.hasChildren {
+		return seg.node.children[seg.index].lastSegment()
+	}
+	if seg.index > 0 {
+		return vmaIterator{seg.node, seg.index - 1}
+	}
+	if seg.node.parent == nil {
+		return vmaIterator{}
+	}
+	return vmasegmentBeforePosition(seg.node.parent, seg.node.parentIndex)
+}
+
+// NextSegment returns the iterated segment's successor. If there is no
+// succeeding segment, NextSegment returns a terminal iterator.
+func (seg vmaIterator) NextSegment() vmaIterator {
+	if seg.node.hasChildren {
+		return seg.node.children[seg.index+1].firstSegment()
+	}
+	if seg.index < seg.node.nrSegments-1 {
+		return vmaIterator{seg.node, seg.index + 1}
+	}
+	if seg.node.parent == nil {
+		return vmaIterator{}
+	}
+	return vmasegmentAfterPosition(seg.node.parent, seg.node.parentIndex)
+}
+
+// PrevGap returns the gap immediately before the iterated segment.
+func (seg vmaIterator) PrevGap() vmaGapIterator {
+	if seg.node.hasChildren {
+
+		return seg.node.children[seg.index].lastSegment().NextGap()
+	}
+	return vmaGapIterator{seg.node, seg.index}
+}
+
+// NextGap returns the gap immediately after the iterated segment.
+func (seg vmaIterator) NextGap() vmaGapIterator {
+	if seg.node.hasChildren {
+		return seg.node.children[seg.index+1].firstSegment().PrevGap()
+	}
+	return vmaGapIterator{seg.node, seg.index + 1}
+}
+
+// PrevNonEmpty returns the iterated segment's predecessor if it is adjacent,
+// or the gap before the iterated segment otherwise. If seg.Start() ==
+// Functions.MinKey(), PrevNonEmpty will return two terminal iterators.
+// Otherwise, exactly one of the iterators returned by PrevNonEmpty will be
+// non-terminal.
+func (seg vmaIterator) PrevNonEmpty() (vmaIterator, vmaGapIterator) {
+	gap := seg.PrevGap()
+	if gap.Range().Length() != 0 {
+		return vmaIterator{}, gap
+	}
+	return gap.PrevSegment(), vmaGapIterator{}
+}
+
+// NextNonEmpty returns the iterated segment's successor if it is adjacent, or
+// the gap after the iterated segment otherwise. If seg.End() ==
+// Functions.MaxKey(), NextNonEmpty will return two terminal iterators.
+// Otherwise, exactly one of the iterators returned by NextNonEmpty will be
+// non-terminal.
+func (seg vmaIterator) NextNonEmpty() (vmaIterator, vmaGapIterator) {
+	gap := seg.NextGap()
+	if gap.Range().Length() != 0 {
+		return vmaIterator{}, gap
+	}
+	return gap.NextSegment(), vmaGapIterator{}
+}
+
+// A GapIterator is conceptually one of:
+//
+// - A pointer to a position between two segments, before the first segment, or
+// after the last segment in a set, called a *gap*; or
+//
+// - A terminal iterator, which is a sentinel indicating that the end of
+// iteration has been reached.
+//
+// Note that the gap between two adjacent segments exists (iterators to it are
+// non-terminal), but has a length of zero. GapIterator.IsEmpty returns true
+// for such gaps. An empty set contains a single gap, spanning the entire range
+// of the set's keys.
+//
+// GapIterators are copyable values and are meaningfully equality-comparable.
+// The zero value of GapIterator is a terminal iterator.
+//
+// Unless otherwise specified, any mutation of a set invalidates all existing
+// iterators into the set.
+type vmaGapIterator struct {
+	// The representation of a GapIterator is identical to that of an Iterator,
+	// except that index corresponds to positions between segments in the same
+	// way as for node.children (see comment for node.nrSegments).
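NextNonEmpty encodes "exactly one of segment or gap comes next", so a caller can walk a maximal run of touching segments without comparing addresses itself. A hypothetical in-package helper (not generated code):

```go
// runEnd returns the end of the contiguous run of segments starting at seg.
func runEnd(seg vmaIterator) __generics_imported0.Addr {
	for {
		next, _ := seg.NextNonEmpty()
		if !next.Ok() {
			return seg.End() // the run is bounded by a gap (or by MaxKey)
		}
		seg = next
	}
}
```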
+	node  *vmanode
+	index int
+}
+
+// Ok returns true if the iterator is not terminal. All other methods are only
+// valid for non-terminal iterators.
+func (gap vmaGapIterator) Ok() bool {
+	return gap.node != nil
+}
+
+// Range returns the range spanned by the iterated gap.
+func (gap vmaGapIterator) Range() __generics_imported0.AddrRange {
+	return __generics_imported0.AddrRange{gap.Start(), gap.End()}
+}
+
+// Start is equivalent to Range().Start, but should be preferred if only the
+// start of the range is needed.
+func (gap vmaGapIterator) Start() __generics_imported0.Addr {
+	if ps := gap.PrevSegment(); ps.Ok() {
+		return ps.End()
+	}
+	return vmaSetFunctions{}.MinKey()
+}
+
+// End is equivalent to Range().End, but should be preferred if only the end of
+// the range is needed.
+func (gap vmaGapIterator) End() __generics_imported0.Addr {
+	if ns := gap.NextSegment(); ns.Ok() {
+		return ns.Start()
+	}
+	return vmaSetFunctions{}.MaxKey()
+}
+
+// IsEmpty returns true if the iterated gap is empty (that is, the "gap" is
+// between two adjacent segments).
+func (gap vmaGapIterator) IsEmpty() bool {
+	return gap.Range().Length() == 0
+}
+
+// PrevSegment returns the segment immediately before the iterated gap. If no
+// such segment exists, PrevSegment returns a terminal iterator.
+func (gap vmaGapIterator) PrevSegment() vmaIterator {
+	return vmasegmentBeforePosition(gap.node, gap.index)
+}
+
+// NextSegment returns the segment immediately after the iterated gap. If no
+// such segment exists, NextSegment returns a terminal iterator.
+func (gap vmaGapIterator) NextSegment() vmaIterator {
+	return vmasegmentAfterPosition(gap.node, gap.index)
+}
+
+// PrevGap returns the iterated gap's predecessor. If no such gap exists,
+// PrevGap returns a terminal iterator.
+func (gap vmaGapIterator) PrevGap() vmaGapIterator {
+	seg := gap.PrevSegment()
+	if !seg.Ok() {
+		return vmaGapIterator{}
+	}
+	return seg.PrevGap()
+}
+
+// NextGap returns the iterated gap's successor. If no such gap exists, NextGap
+// returns a terminal iterator.
+func (gap vmaGapIterator) NextGap() vmaGapIterator {
+	seg := gap.NextSegment()
+	if !seg.Ok() {
+		return vmaGapIterator{}
+	}
+	return seg.NextGap()
+}
+
+// segmentBeforePosition returns the predecessor segment of the position given
+// by n.children[i], which may or may not contain a child. If no such segment
+// exists, segmentBeforePosition returns a terminal iterator.
+func vmasegmentBeforePosition(n *vmanode, i int) vmaIterator {
+	for i == 0 {
+		if n.parent == nil {
+			return vmaIterator{}
+		}
+		n, i = n.parent, n.parentIndex
+	}
+	return vmaIterator{n, i - 1}
+}
+
+// segmentAfterPosition returns the successor segment of the position given by
+// n.children[i], which may or may not contain a child. If no such segment
+// exists, segmentAfterPosition returns a terminal iterator.
+func vmasegmentAfterPosition(n *vmanode, i int) vmaIterator {
+	for i == n.nrSegments {
+		if n.parent == nil {
+			return vmaIterator{}
+		}
+		n, i = n.parent, n.parentIndex
+	}
+	return vmaIterator{n, i}
+}
+
+func vmazeroValueSlice(slice []vma) {
+
+	for i := range slice {
+		vmaSetFunctions{}.ClearValue(&slice[i])
+	}
+}
+
+func vmazeroNodeSlice(slice []*vmanode) {
+	for i := range slice {
+		slice[i] = nil
+	}
+}
+
+// String stringifies a Set for debugging.
+func (s *vmaSet) String() string {
+	return s.root.String()
+}
+
+// String stringifies a node (and all of its children) for debugging.
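A gap's Range is computed on demand from its neighbors, so enumerating free address space is just a gap walk that skips the zero-length gaps between touching segments. A hypothetical in-package helper (not generated code):

```go
// forEachFreeRange visits every maximal unallocated range in the set,
// skipping the empty gaps between adjacent segments.
func forEachFreeRange(s *vmaSet, fn func(__generics_imported0.AddrRange)) {
	for gap := s.FirstGap(); gap.Ok(); gap = gap.NextGap() {
		if !gap.IsEmpty() {
			fn(gap.Range())
		}
	}
}
```

The loop terminates because NextGap on the last gap reaches a terminal segment iterator and therefore returns a terminal gap iterator.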
+func (n *vmanode) String() string {
+	var buf bytes.Buffer
+	n.writeDebugString(&buf, "")
+	return buf.String()
+}
+
+func (n *vmanode) writeDebugString(buf *bytes.Buffer, prefix string) {
+	if n.hasChildren != (n.nrSegments > 0 && n.children[0] != nil) {
+		buf.WriteString(prefix)
+		buf.WriteString(fmt.Sprintf("WARNING: inconsistent value of hasChildren: got %v, want %v\n", n.hasChildren, !n.hasChildren))
+	}
+	for i := 0; i < n.nrSegments; i++ {
+		if child := n.children[i]; child != nil {
+			cprefix := fmt.Sprintf("%s- % 3d ", prefix, i)
+			if child.parent != n || child.parentIndex != i {
+				buf.WriteString(cprefix)
+				buf.WriteString(fmt.Sprintf("WARNING: inconsistent linkage to parent: got (%p, %d), want (%p, %d)\n", child.parent, child.parentIndex, n, i))
+			}
+			child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, i))
+		}
+		buf.WriteString(prefix)
+		buf.WriteString(fmt.Sprintf("- % 3d: %v => %v\n", i, n.keys[i], n.values[i]))
+	}
+	if child := n.children[n.nrSegments]; child != nil {
+		child.writeDebugString(buf, fmt.Sprintf("%s- % 3d ", prefix, n.nrSegments))
+	}
+}
+
+// SegmentDataSlices represents segments from a set as slices of start, end, and
+// values. SegmentDataSlices is primarily used as an intermediate representation
+// for save/restore and the layout here is optimized for that.
+//
+// +stateify savable
+type vmaSegmentDataSlices struct {
+	Start  []__generics_imported0.Addr
+	End    []__generics_imported0.Addr
+	Values []vma
+}
+
+// ExportSortedSlices returns a copy of all segments in the given set, in
+// ascending key order.
+func (s *vmaSet) ExportSortedSlices() *vmaSegmentDataSlices {
+	var sds vmaSegmentDataSlices
+	for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+		sds.Start = append(sds.Start, seg.Start())
+		sds.End = append(sds.End, seg.End())
+		sds.Values = append(sds.Values, seg.Value())
+	}
+	sds.Start = sds.Start[:len(sds.Start):len(sds.Start)]
+	sds.End = sds.End[:len(sds.End):len(sds.End)]
+	sds.Values = sds.Values[:len(sds.Values):len(sds.Values)]
+	return &sds
+}
+
+// ImportSortedSlices initializes the given set from the given slices.
+//
+// Preconditions: s must be empty. sds must represent a valid set (the segments
+// in sds must have valid lengths that do not overlap). The segments in sds
+// must be sorted in ascending key order.
+func (s *vmaSet) ImportSortedSlices(sds *vmaSegmentDataSlices) error {
+	if !s.IsEmpty() {
+		return fmt.Errorf("cannot import into non-empty set %v", s)
+	}
+	gap := s.FirstGap()
+	for i := range sds.Start {
+		r := __generics_imported0.AddrRange{sds.Start[i], sds.End[i]}
+		if !gap.Range().IsSupersetOf(r) {
+			return fmt.Errorf("segment overlaps a preceding segment or is incorrectly sorted: [%d, %d) => %v", sds.Start[i], sds.End[i], sds.Values[i])
+		}
+		gap = s.InsertWithoutMerging(gap, r, sds.Values[i]).NextGap()
+	}
+	return nil
+}
+
+func (s *vmaSet) saveRoot() *vmaSegmentDataSlices {
+	return s.ExportSortedSlices()
+}
+
+func (s *vmaSet) loadRoot(sds *vmaSegmentDataSlices) {
+	if err := s.ImportSortedSlices(sds); err != nil {
+		panic(err)
+	}
+}
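The saveRoot/loadRoot hooks make save/restore an export/import round trip over the flat slice representation. The same round trip can be read as a structural clone; a hypothetical in-package sketch (note that for the vma instantiation the copied values would still share any underlying mappables, so this is illustrative only):

```go
// cloneSet rebuilds a set with the same segments and values as src.
func cloneSet(src *vmaSet) *vmaSet {
	var dst vmaSet
	if err := dst.ImportSortedSlices(src.ExportSortedSlices()); err != nil {
		panic(err) // unreachable: the export is sorted and non-overlapping
	}
	return &dst
}
```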