author     gVisor bot <gvisor-bot@google.com>   2019-06-02 06:44:55 +0000
committer  gVisor bot <gvisor-bot@google.com>   2019-06-02 06:44:55 +0000
commit     ceb0d792f328d1fc0692197d8856a43c3936a571 (patch)
tree       83155f302eff44a78bcc30a3a08f4efe59a79379 /pkg/sentry/platform/ring0/pagetables
parent     deb7ecf1e46862d54f4b102f2d163cfbcfc37f3b (diff)
parent     216da0b733dbed9aad9b2ab92ac75bcb906fd7ee (diff)
Merge 216da0b7 (automated)
Diffstat (limited to 'pkg/sentry/platform/ring0/pagetables')
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/allocator.go                  122
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go            53
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pagetables.go                 221
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go            45
-rwxr-xr-x  pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go     4
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pagetables_x86.go             180
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pcids_x86.go                  109
-rwxr-xr-x  pkg/sentry/platform/ring0/pagetables/walker_empty.go               255
-rwxr-xr-x  pkg/sentry/platform/ring0/pagetables/walker_lookup.go              255
-rwxr-xr-x  pkg/sentry/platform/ring0/pagetables/walker_map.go                 255
-rwxr-xr-x  pkg/sentry/platform/ring0/pagetables/walker_unmap.go               255
11 files changed, 1754 insertions, 0 deletions
diff --git a/pkg/sentry/platform/ring0/pagetables/allocator.go b/pkg/sentry/platform/ring0/pagetables/allocator.go new file mode 100644 index 000000000..23fd5c352 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/allocator.go @@ -0,0 +1,122 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pagetables + +// Allocator is used to allocate and map PTEs. +// +// Note that allocators may be called concurrently. +type Allocator interface { + // NewPTEs returns a new set of PTEs and their physical address. + NewPTEs() *PTEs + + // PhysicalFor gives the physical address for a set of PTEs. + PhysicalFor(ptes *PTEs) uintptr + + // LookupPTEs looks up PTEs by physical address. + LookupPTEs(physical uintptr) *PTEs + + // FreePTEs marks a set of PTEs a freed, although they may not be available + // for use again until Recycle is called, below. + FreePTEs(ptes *PTEs) + + // Recycle makes freed PTEs available for use again. + Recycle() +} + +// RuntimeAllocator is a trivial allocator. +type RuntimeAllocator struct { + // used is the set of PTEs that have been allocated. This includes any + // PTEs that may be in the pool below. PTEs are only freed from this + // map by the Drain call. + // + // This exists to prevent accidental garbage collection. + used map[*PTEs]struct{} + + // pool is the set of free-to-use PTEs. + pool []*PTEs + + // freed is the set of recently-freed PTEs. + freed []*PTEs +} + +// NewRuntimeAllocator returns an allocator that uses runtime allocation. +func NewRuntimeAllocator() *RuntimeAllocator { + return &RuntimeAllocator{ + used: make(map[*PTEs]struct{}), + } +} + +// Recycle returns freed pages to the pool. +func (r *RuntimeAllocator) Recycle() { + r.pool = append(r.pool, r.freed...) + r.freed = r.freed[:0] +} + +// Drain empties the pool. +func (r *RuntimeAllocator) Drain() { + r.Recycle() + for i, ptes := range r.pool { + // Zap the entry in the underlying array to ensure that it can + // be properly garbage collected. + r.pool[i] = nil + // Similarly, free the reference held by the used map (these + // also apply for the pool entries). + delete(r.used, ptes) + } + r.pool = r.pool[:0] +} + +// NewPTEs implements Allocator.NewPTEs. +// +// Note that the "physical" address here is actually the virtual address of the +// PTEs structure. The entries are tracked only to avoid garbage collection. +// +// This is guaranteed not to split as long as the pool is sufficiently full. +// +//go:nosplit +func (r *RuntimeAllocator) NewPTEs() *PTEs { + // Pull from the pool if we can. + if len(r.pool) > 0 { + ptes := r.pool[len(r.pool)-1] + r.pool = r.pool[:len(r.pool)-1] + return ptes + } + + // Allocate a new entry. + ptes := newAlignedPTEs() + r.used[ptes] = struct{}{} + return ptes +} + +// PhysicalFor returns the physical address for the given PTEs. +// +//go:nosplit +func (r *RuntimeAllocator) PhysicalFor(ptes *PTEs) uintptr { + return physicalFor(ptes) +} + +// LookupPTEs implements Allocator.LookupPTEs. 
+// +//go:nosplit +func (r *RuntimeAllocator) LookupPTEs(physical uintptr) *PTEs { + return fromPhysical(physical) +} + +// FreePTEs implements Allocator.FreePTEs. +// +//go:nosplit +func (r *RuntimeAllocator) FreePTEs(ptes *PTEs) { + r.freed = append(r.freed, ptes) +} diff --git a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go new file mode 100644 index 000000000..1b996b4e2 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go @@ -0,0 +1,53 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pagetables + +import ( + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// newAlignedPTEs returns a set of aligned PTEs. +func newAlignedPTEs() *PTEs { + ptes := new(PTEs) + offset := physicalFor(ptes) & (usermem.PageSize - 1) + if offset == 0 { + // Already aligned. + return ptes + } + + // Need to force an aligned allocation. + unaligned := make([]byte, (2*usermem.PageSize)-1) + offset = uintptr(unsafe.Pointer(&unaligned[0])) & (usermem.PageSize - 1) + if offset != 0 { + offset = usermem.PageSize - offset + } + return (*PTEs)(unsafe.Pointer(&unaligned[offset])) +} + +// physicalFor returns the "physical" address for PTEs. +// +//go:nosplit +func physicalFor(ptes *PTEs) uintptr { + return uintptr(unsafe.Pointer(ptes)) +} + +// fromPhysical returns the PTEs from the "physical" address. +// +//go:nosplit +func fromPhysical(physical uintptr) *PTEs { + return (*PTEs)(unsafe.Pointer(physical)) +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go new file mode 100644 index 000000000..e5dcaada7 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -0,0 +1,221 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pagetables provides a generic implementation of pagetables. +// +// The core functions must be safe to call from a nosplit context. Furthermore, +// this pagetables implementation goes to lengths to ensure that all functions +// are free from runtime allocation. Calls to NewPTEs/FreePTEs may be made +// during walks, but these can be cached elsewhere if required. +package pagetables + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// PageTables is a set of page tables. +type PageTables struct { + // Allocator is used to allocate nodes. 
+ Allocator Allocator + + // root is the pagetable root. + root *PTEs + + // rootPhysical is the cached physical address of the root. + // + // This is saved only to prevent constant translation. + rootPhysical uintptr + + // archPageTables includes architecture-specific features. + archPageTables +} + +// New returns new PageTables. +func New(a Allocator) *PageTables { + p := new(PageTables) + p.Init(a) + return p +} + +// Init initializes a set of PageTables. +// +//go:nosplit +func (p *PageTables) Init(allocator Allocator) { + p.Allocator = allocator + p.root = p.Allocator.NewPTEs() + p.rootPhysical = p.Allocator.PhysicalFor(p.root) +} + +// mapVisitor is used for map. +type mapVisitor struct { + target uintptr // Input. + physical uintptr // Input. + opts MapOpts // Input. + prev bool // Output. +} + +// visit is used for map. +// +//go:nosplit +func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) { + p := v.physical + (start - uintptr(v.target)) + if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) { + v.prev = true + } + if p&align != 0 { + // We will install entries at a smaller granulaity if we don't + // install a valid entry here, however we must zap any existing + // entry to ensure this happens. + pte.Clear() + return + } + pte.Set(p, v.opts) +} + +//go:nosplit +func (*mapVisitor) requiresAlloc() bool { return true } + +//go:nosplit +func (*mapVisitor) requiresSplit() bool { return true } + +// Map installs a mapping with the given physical address. +// +// True is returned iff there was a previous mapping in the range. +// +// Precondition: addr & length must be page-aligned, their sum must not overflow. +// +//go:nosplit +func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool { + if !opts.AccessType.Any() { + return p.Unmap(addr, length) + } + w := mapWalker{ + pageTables: p, + visitor: mapVisitor{ + target: uintptr(addr), + physical: physical, + opts: opts, + }, + } + w.iterateRange(uintptr(addr), uintptr(addr)+length) + return w.visitor.prev +} + +// unmapVisitor is used for unmap. +type unmapVisitor struct { + count int +} + +//go:nosplit +func (*unmapVisitor) requiresAlloc() bool { return false } + +//go:nosplit +func (*unmapVisitor) requiresSplit() bool { return true } + +// visit unmaps the given entry. +// +//go:nosplit +func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) { + pte.Clear() + v.count++ +} + +// Unmap unmaps the given range. +// +// True is returned iff there was a previous mapping in the range. +// +// Precondition: addr & length must be page-aligned. +// +//go:nosplit +func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool { + w := unmapWalker{ + pageTables: p, + visitor: unmapVisitor{ + count: 0, + }, + } + w.iterateRange(uintptr(addr), uintptr(addr)+length) + return w.visitor.count > 0 +} + +// emptyVisitor is used for emptiness checks. +type emptyVisitor struct { + count int +} + +//go:nosplit +func (*emptyVisitor) requiresAlloc() bool { return false } + +//go:nosplit +func (*emptyVisitor) requiresSplit() bool { return false } + +// visit unmaps the given entry. +// +//go:nosplit +func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) { + v.count++ +} + +// IsEmpty checks if the given range is empty. +// +// Precondition: addr & length must be page-aligned. 
+// +//go:nosplit +func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool { + w := emptyWalker{ + pageTables: p, + } + w.iterateRange(uintptr(addr), uintptr(addr)+length) + return w.visitor.count == 0 +} + +// lookupVisitor is used for lookup. +type lookupVisitor struct { + target uintptr // Input. + physical uintptr // Output. + opts MapOpts // Output. +} + +// visit matches the given address. +// +//go:nosplit +func (v *lookupVisitor) visit(start uintptr, pte *PTE, align uintptr) { + if !pte.Valid() { + return + } + v.physical = pte.Address() + (start - uintptr(v.target)) + v.opts = pte.Opts() +} + +//go:nosplit +func (*lookupVisitor) requiresAlloc() bool { return false } + +//go:nosplit +func (*lookupVisitor) requiresSplit() bool { return false } + +// Lookup returns the physical address for the given virtual address. +// +//go:nosplit +func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) { + mask := uintptr(usermem.PageSize - 1) + offset := uintptr(addr) & mask + w := lookupWalker{ + pageTables: p, + visitor: lookupVisitor{ + target: uintptr(addr &^ usermem.Addr(mask)), + }, + } + w.iterateRange(uintptr(addr), uintptr(addr)+1) + return w.visitor.physical + offset, w.visitor.opts +} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go new file mode 100644 index 000000000..7aa6c524e --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -0,0 +1,45 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pagetables + +// Address constraints. +// +// The lowerTop and upperBottom currently apply to four-level pagetables; +// additional refactoring would be necessary to support five-level pagetables. +const ( + lowerTop = 0x00007fffffffffff + upperBottom = 0xffff800000000000 + + pteShift = 12 + pmdShift = 21 + pudShift = 30 + pgdShift = 39 + + pteMask = 0x1ff << pteShift + pmdMask = 0x1ff << pmdShift + pudMask = 0x1ff << pudShift + pgdMask = 0x1ff << pgdShift + + pteSize = 1 << pteShift + pmdSize = 1 << pmdShift + pudSize = 1 << pudShift + pgdSize = 1 << pgdShift + + executeDisable = 1 << 63 + entriesPerPage = 512 +) + +// PTEs is a collection of entries. +type PTEs [entriesPerPage]PTE diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go b/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go new file mode 100755 index 000000000..ac1ccf3d3 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_state_autogen.go @@ -0,0 +1,4 @@ +// automatically generated by stateify. + +package pagetables + diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go new file mode 100644 index 000000000..ff427fbe9 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go @@ -0,0 +1,180 @@ +// Copyright 2018 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build i386 amd64 + +package pagetables + +import ( + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// archPageTables is architecture-specific data. +type archPageTables struct { + // pcid is the value assigned by PCIDs.Assign. + // + // Note that zero is a valid PCID. + pcid uint16 +} + +// CR3 returns the CR3 value for these tables. +// +// This may be called in interrupt contexts. A PCID of zero always implies a +// flush and should be passed when PCIDs are not enabled. See pcids_x86.go for +// more information. +// +//go:nosplit +func (p *PageTables) CR3(noFlush bool, pcid uint16) uint64 { + // Bit 63 is set to avoid flushing the PCID (per SDM 4.10.4.1). + const noFlushBit uint64 = 0x8000000000000000 + if noFlush && pcid != 0 { + return noFlushBit | uint64(p.rootPhysical) | uint64(pcid) + } + return uint64(p.rootPhysical) | uint64(pcid) +} + +// Bits in page table entries. +const ( + present = 0x001 + writable = 0x002 + user = 0x004 + writeThrough = 0x008 + cacheDisable = 0x010 + accessed = 0x020 + dirty = 0x040 + super = 0x080 + global = 0x100 + optionMask = executeDisable | 0xfff +) + +// MapOpts are x86 options. +type MapOpts struct { + // AccessType defines permissions. + AccessType usermem.AccessType + + // Global indicates the page is globally accessible. + Global bool + + // User indicates the page is a user page. + User bool +} + +// PTE is a page table entry. +type PTE uintptr + +// Clear clears this PTE, including super page information. +// +//go:nosplit +func (p *PTE) Clear() { + atomic.StoreUintptr((*uintptr)(p), 0) +} + +// Valid returns true iff this entry is valid. +// +//go:nosplit +func (p *PTE) Valid() bool { + return atomic.LoadUintptr((*uintptr)(p))&present != 0 +} + +// Opts returns the PTE options. +// +// These are all options except Valid and Super. +// +//go:nosplit +func (p *PTE) Opts() MapOpts { + v := atomic.LoadUintptr((*uintptr)(p)) + return MapOpts{ + AccessType: usermem.AccessType{ + Read: v&present != 0, + Write: v&writable != 0, + Execute: v&executeDisable == 0, + }, + Global: v&global != 0, + User: v&user != 0, + } +} + +// SetSuper sets this page as a super page. +// +// The page must not be valid or a panic will result. +// +//go:nosplit +func (p *PTE) SetSuper() { + if p.Valid() { + // This is not allowed. + panic("SetSuper called on valid page!") + } + atomic.StoreUintptr((*uintptr)(p), super) +} + +// IsSuper returns true iff this page is a super page. +// +//go:nosplit +func (p *PTE) IsSuper() bool { + return atomic.LoadUintptr((*uintptr)(p))&super != 0 +} + +// Set sets this PTE value. +// +// This does not change the super page property. 
+// +//go:nosplit +func (p *PTE) Set(addr uintptr, opts MapOpts) { + if !opts.AccessType.Any() { + p.Clear() + return + } + v := (addr &^ optionMask) | present | accessed + if opts.User { + v |= user + } + if opts.Global { + v |= global + } + if !opts.AccessType.Execute { + v |= executeDisable + } + if opts.AccessType.Write { + v |= writable | dirty + } + if p.IsSuper() { + // Note that this is inherited from the previous instance. Set + // does not change the value of Super. See above. + v |= super + } + atomic.StoreUintptr((*uintptr)(p), v) +} + +// setPageTable sets this PTE value and forces the write bit and super bit to +// be cleared. This is used explicitly for breaking super pages. +// +//go:nosplit +func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) { + addr := pt.Allocator.PhysicalFor(ptes) + if addr&^optionMask != addr { + // This should never happen. + panic("unaligned physical address!") + } + v := addr | present | user | writable | accessed | dirty + atomic.StoreUintptr((*uintptr)(p), v) +} + +// Address extracts the address. This should only be used if Valid returns true. +// +//go:nosplit +func (p *PTE) Address() uintptr { + return atomic.LoadUintptr((*uintptr)(p)) &^ optionMask +} diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go new file mode 100644 index 000000000..0f029f25d --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go @@ -0,0 +1,109 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build i386 amd64 + +package pagetables + +import ( + "sync" +) + +// limitPCID is the number of valid PCIDs. +const limitPCID = 4096 + +// PCIDs is a simple PCID database. +// +// This is not protected by locks and is thus suitable for use only with a +// single CPU at a time. +type PCIDs struct { + // mu protects below. + mu sync.Mutex + + // cache are the assigned page tables. + cache map[*PageTables]uint16 + + // avail are available PCIDs. + avail []uint16 +} + +// NewPCIDs returns a new PCID database. +// +// start is the first index to assign. Typically this will be one, as the zero +// pcid will always be flushed on transition (see pagetables_x86.go). This may +// be more than one if specific PCIDs are reserved. +// +// Nil is returned iff the start and size are out of range. +func NewPCIDs(start, size uint16) *PCIDs { + if start+uint16(size) >= limitPCID { + return nil // See comment. + } + p := &PCIDs{ + cache: make(map[*PageTables]uint16), + } + for pcid := start; pcid < start+size; pcid++ { + p.avail = append(p.avail, pcid) + } + return p +} + +// Assign assigns a PCID to the given PageTables. +// +// This may overwrite any previous assignment provided. If this in the case, +// true is returned to indicate that the PCID should be flushed. +func (p *PCIDs) Assign(pt *PageTables) (uint16, bool) { + p.mu.Lock() + if pcid, ok := p.cache[pt]; ok { + p.mu.Unlock() + return pcid, false // No flush. + } + + // Is there something available? 
+ if len(p.avail) > 0 { + pcid := p.avail[len(p.avail)-1] + p.avail = p.avail[:len(p.avail)-1] + p.cache[pt] = pcid + + // We need to flush because while this is in the available + // pool, it may have been used previously. + p.mu.Unlock() + return pcid, true + } + + // Evict an existing table. + for old, pcid := range p.cache { + delete(p.cache, old) + p.cache[pt] = pcid + + // A flush is definitely required in this case, these page + // tables may still be active. (They will just be assigned some + // other PCID if and when they hit the given CPU again.) + p.mu.Unlock() + return pcid, true + } + + // No PCID. + p.mu.Unlock() + return 0, false +} + +// Drop drops references to a set of page tables. +func (p *PCIDs) Drop(pt *PageTables) { + p.mu.Lock() + if pcid, ok := p.cache[pt]; ok { + delete(p.cache, pt) + p.avail = append(p.avail, pcid) + } + p.mu.Unlock() +} diff --git a/pkg/sentry/platform/ring0/pagetables/walker_empty.go b/pkg/sentry/platform/ring0/pagetables/walker_empty.go new file mode 100755 index 000000000..417784e17 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_empty.go @@ -0,0 +1,255 @@ +package pagetables + +// Walker walks page tables. +type emptyWalker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor emptyVisitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is super pages. If a valid super page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *emptyWalker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. 
+// +//go:nosplit +func emptynext(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. +// +//go:nosplit +func (w *emptyWalker) iterateRangeCanonical(start, end uintptr) { + for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &w.pageTables.root[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + start = emptynext(start, pgdSize) + continue + } + + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPUDEntries++ + start = emptynext(start, pudSize) + continue + } + + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = emptynext(start, pudSize) + continue + } + } + + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < emptynext(start, pudSize)) { + + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSuper() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + if !pudEntry.Valid() { + clearPUDEntries++ + } + + start = emptynext(start, pudSize) + continue + } + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPMDEntries++ + start = emptynext(start, pmdSize) + continue + } + + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = emptynext(start, pmdSize) + continue + } + } + + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < emptynext(start, pmdSize)) { + + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + start = emptynext(start, pmdSize) + continue + } + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = 
&pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + w.visitor.visit(uintptr(start), pteEntry, pteSize-1) + if !pteEntry.Valid() { + if w.visitor.requiresAlloc() { + panic("PTE not set after iteration with requiresAlloc!") + } + clearPTEEntries++ + } + + start += pteSize + continue + } + + if clearPTEEntries == entriesPerPage { + pmdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pteEntries) + clearPMDEntries++ + } + } + + if clearPMDEntries == entriesPerPage { + pudEntry.Clear() + w.pageTables.Allocator.FreePTEs(pmdEntries) + clearPUDEntries++ + } + } + + if clearPUDEntries == entriesPerPage { + pgdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pudEntries) + } + } +} diff --git a/pkg/sentry/platform/ring0/pagetables/walker_lookup.go b/pkg/sentry/platform/ring0/pagetables/walker_lookup.go new file mode 100755 index 000000000..906c9c50f --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_lookup.go @@ -0,0 +1,255 @@ +package pagetables + +// Walker walks page tables. +type lookupWalker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor lookupVisitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is super pages. If a valid super page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *lookupWalker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func lookupnext(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. 
+// +//go:nosplit +func (w *lookupWalker) iterateRangeCanonical(start, end uintptr) { + for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &w.pageTables.root[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + start = lookupnext(start, pgdSize) + continue + } + + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPUDEntries++ + start = lookupnext(start, pudSize) + continue + } + + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = lookupnext(start, pudSize) + continue + } + } + + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < lookupnext(start, pudSize)) { + + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSuper() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + if !pudEntry.Valid() { + clearPUDEntries++ + } + + start = lookupnext(start, pudSize) + continue + } + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPMDEntries++ + start = lookupnext(start, pmdSize) + continue + } + + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = lookupnext(start, pmdSize) + continue + } + } + + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < lookupnext(start, pmdSize)) { + + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + start = lookupnext(start, pmdSize) + continue + } + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = &pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + w.visitor.visit(uintptr(start), 
pteEntry, pteSize-1) + if !pteEntry.Valid() { + if w.visitor.requiresAlloc() { + panic("PTE not set after iteration with requiresAlloc!") + } + clearPTEEntries++ + } + + start += pteSize + continue + } + + if clearPTEEntries == entriesPerPage { + pmdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pteEntries) + clearPMDEntries++ + } + } + + if clearPMDEntries == entriesPerPage { + pudEntry.Clear() + w.pageTables.Allocator.FreePTEs(pmdEntries) + clearPUDEntries++ + } + } + + if clearPUDEntries == entriesPerPage { + pgdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pudEntries) + } + } +} diff --git a/pkg/sentry/platform/ring0/pagetables/walker_map.go b/pkg/sentry/platform/ring0/pagetables/walker_map.go new file mode 100755 index 000000000..61ee3c825 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_map.go @@ -0,0 +1,255 @@ +package pagetables + +// Walker walks page tables. +type mapWalker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor mapVisitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is super pages. If a valid super page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *mapWalker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func mapnext(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. 
+// +//go:nosplit +func (w *mapWalker) iterateRangeCanonical(start, end uintptr) { + for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &w.pageTables.root[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + start = mapnext(start, pgdSize) + continue + } + + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPUDEntries++ + start = mapnext(start, pudSize) + continue + } + + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = mapnext(start, pudSize) + continue + } + } + + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < mapnext(start, pudSize)) { + + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSuper() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + if !pudEntry.Valid() { + clearPUDEntries++ + } + + start = mapnext(start, pudSize) + continue + } + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPMDEntries++ + start = mapnext(start, pmdSize) + continue + } + + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = mapnext(start, pmdSize) + continue + } + } + + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < mapnext(start, pmdSize)) { + + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + start = mapnext(start, pmdSize) + continue + } + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = &pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + w.visitor.visit(uintptr(start), pteEntry, pteSize-1) + if 
!pteEntry.Valid() { + if w.visitor.requiresAlloc() { + panic("PTE not set after iteration with requiresAlloc!") + } + clearPTEEntries++ + } + + start += pteSize + continue + } + + if clearPTEEntries == entriesPerPage { + pmdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pteEntries) + clearPMDEntries++ + } + } + + if clearPMDEntries == entriesPerPage { + pudEntry.Clear() + w.pageTables.Allocator.FreePTEs(pmdEntries) + clearPUDEntries++ + } + } + + if clearPUDEntries == entriesPerPage { + pgdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pudEntries) + } + } +} diff --git a/pkg/sentry/platform/ring0/pagetables/walker_unmap.go b/pkg/sentry/platform/ring0/pagetables/walker_unmap.go new file mode 100755 index 000000000..be2aa0ce4 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_unmap.go @@ -0,0 +1,255 @@ +package pagetables + +// Walker walks page tables. +type unmapWalker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor unmapVisitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is super pages. If a valid super page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *unmapWalker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func unmapnext(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. 
+// +//go:nosplit +func (w *unmapWalker) iterateRangeCanonical(start, end uintptr) { + for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &w.pageTables.root[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + start = unmapnext(start, pgdSize) + continue + } + + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPUDEntries++ + start = unmapnext(start, pudSize) + continue + } + + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = unmapnext(start, pudSize) + continue + } + } + + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < unmapnext(start, pudSize)) { + + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSuper() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + if !pudEntry.Valid() { + clearPUDEntries++ + } + + start = unmapnext(start, pudSize) + continue + } + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + + clearPMDEntries++ + start = unmapnext(start, pmdSize) + continue + } + + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = unmapnext(start, pmdSize) + continue + } + } + + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSuper() { + + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < unmapnext(start, pmdSize)) { + + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + start = unmapnext(start, pmdSize) + continue + } + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = &pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + w.visitor.visit(uintptr(start), pteEntry, 
pteSize-1) + if !pteEntry.Valid() { + if w.visitor.requiresAlloc() { + panic("PTE not set after iteration with requiresAlloc!") + } + clearPTEEntries++ + } + + start += pteSize + continue + } + + if clearPTEEntries == entriesPerPage { + pmdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pteEntries) + clearPMDEntries++ + } + } + + if clearPMDEntries == entriesPerPage { + pudEntry.Clear() + w.pageTables.Allocator.FreePTEs(pmdEntries) + clearPUDEntries++ + } + } + + if clearPUDEntries == entriesPerPage { + pgdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pudEntries) + } + } +} |
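
The new files above define the full pagetables API (the Allocator interface, PageTables.Map/Unmap/Lookup/IsEmpty, the x86 MapOpts/PTE encoding, and the PCID database) but the commit itself carries no usage example. The sketch below is not part of the commit; it is a hedged illustration of how these pieces could fit together, using only the constructors and method signatures that appear in the diff (NewRuntimeAllocator, New, Map, Lookup, Unmap, IsEmpty, NewPCIDs, Assign, CR3, Drop) and the import paths shown in allocator_unsafe.go and pagetables.go. The main wrapper and the concrete address values are purely illustrative; with the trivial RuntimeAllocator, "physical" addresses are simply the sentry's own virtual addresses (see allocator_unsafe.go), so a placeholder frame value suffices here.

package main

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

func main() {
	// Build a set of page tables backed by the trivial runtime allocator.
	pt := pagetables.New(pagetables.NewRuntimeAllocator())

	const (
		vaddr = usermem.Addr(0x10000000) // page-aligned virtual address (illustrative)
		frame = uintptr(0x20000000)      // placeholder "physical" frame (illustrative)
	)

	opts := pagetables.MapOpts{
		AccessType: usermem.AccessType{Read: true, Write: true},
		User:       true,
	}

	// Install a single-page mapping; Map reports whether it replaced an
	// existing mapping in the range.
	replaced := pt.Map(vaddr, usermem.PageSize, opts, frame)
	fmt.Println("replaced existing mapping:", replaced)

	// Translate the address back and inspect the stored options.
	phys, got := pt.Lookup(vaddr)
	fmt.Printf("phys=%#x write=%v\n", phys, got.AccessType.Write)

	// Tear the mapping down again; IsEmpty confirms nothing remains mapped.
	pt.Unmap(vaddr, usermem.PageSize)
	fmt.Println("empty:", pt.IsEmpty(vaddr, usermem.PageSize))

	// Pair the tables with a PCID from the database in pcids_x86.go. The
	// database starts at 1 because PCID 0 always implies a flush.
	pcids := pagetables.NewPCIDs(1, 32)
	pcid, flush := pcids.Assign(pt)

	// CR3 takes (noFlush, pcid); only skip the flush when Assign did not
	// require one.
	fmt.Printf("cr3=%#x\n", pt.CR3(!flush, pcid))

	// Return the PCID to the pool when the tables are retired.
	pcids.Drop(pt)
}

In the sentry proper, a platform would supply its own Allocator so that PhysicalFor/LookupPTEs translate to real frame addresses; the RuntimeAllocator used above is the in-process variant defined in allocator.go and is what the package's own documentation describes as the trivial allocator.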