author | Adin Scannell <ascannell@google.com> | 2018-06-11 18:14:22 -0700
committer | Shentubot <shentubot@google.com> | 2018-06-11 18:15:14 -0700
commit | 1397a413b49d6036f2586e85c8074aa3d4d6c6fa (patch)
tree | 7bb617429fe26dfdc47bc02bab8d74eb621775e6 /pkg/sentry
parent | 09b0a9c320bd777bc52384bd0ec91ecfc61e481d (diff)
Make page tables split-safe.
In order to minimize the likelihood of exit during page table
modifications, make the full set of page table functions split-safe.
This is not strictly necessary (and you may still incur splits due to
allocations from the allocator pool) but should make retries a very rare
occurrence.
PiperOrigin-RevId: 200146688
Change-Id: I8fa36aa16b807beda2f0b057be60038258e8d597
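The split-safety comes from the visitor/walker pattern this change introduces: each page table operation supplies a small visitor whose methods are marked //go:nosplit, and a code-generated walker (one go_template_instance per visitor, see the BUILD changes below) drives the traversal without allocating. As a minimal sketch of how a new operation would plug in, here is a hypothetical visitor that counts valid entries; countVisitor, countWalker, and CountMappings are illustrative names that are not part of this commit, and countWalker would have to be generated from the generic_walker template just like walker_map or walker_unmap:

```go
package pagetables

import "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"

// countVisitor counts valid entries. It neither allocates nor splits, so
// the walk it drives never touches the allocator.
type countVisitor struct {
	count int // Output.
}

// visit is called once per entry (regular, huge, or super page).
//
//go:nosplit
func (v *countVisitor) visit(start uintptr, pte *PTE, align uintptr) {
	if pte.Valid() {
		v.count++
	}
}

//go:nosplit
func (*countVisitor) requiresAlloc() bool { return false }

//go:nosplit
func (*countVisitor) requiresSplit() bool { return false }

// CountMappings returns the number of valid entries in [addr, addr+length).
//
// Precondition: addr & length must be page-aligned.
//
//go:nosplit
func (p *PageTables) CountMappings(addr usermem.Addr, length uintptr) int {
	w := countWalker{ // Would be generated from the generic_walker template.
		pageTables: p,
		visitor:    countVisitor{},
	}
	w.iterateRange(uintptr(addr), uintptr(addr)+length)
	return w.visitor.count
}
```

This mirrors how IsEmpty is built in the diff below: the visitor holds the inputs and outputs, and the generated walker carries the per-level traversal logic.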
Diffstat (limited to 'pkg/sentry')
-rw-r--r-- | pkg/sentry/platform/ring0/pagetables/BUILD | 72
-rw-r--r-- | pkg/sentry/platform/ring0/pagetables/pagetables.go | 192
-rw-r--r-- | pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go | 276
-rw-r--r-- | pkg/sentry/platform/ring0/pagetables/pagetables_test.go | 83
-rw-r--r-- | pkg/sentry/platform/ring0/pagetables/walker_amd64.go | 307 |
5 files changed, 583 insertions, 347 deletions
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index 08b73e87d..023e298a0 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -1,6 +1,73 @@ package(licenses = ["notice"]) # Apache 2.0 load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") + +go_template( + name = "generic_walker", + srcs = [ + "walker_amd64.go", + ], + opt_types = [ + "Visitor", + ], + visibility = [":__pkg__"], +) + +go_template_instance( + name = "walker_map", + out = "walker_map.go", + package = "pagetables", + prefix = "map", + template = ":generic_walker", + types = { + "Visitor": "mapVisitor", + }, +) + +go_template_instance( + name = "walker_unmap", + out = "walker_unmap.go", + package = "pagetables", + prefix = "unmap", + template = ":generic_walker", + types = { + "Visitor": "unmapVisitor", + }, +) + +go_template_instance( + name = "walker_lookup", + out = "walker_lookup.go", + package = "pagetables", + prefix = "lookup", + template = ":generic_walker", + types = { + "Visitor": "lookupVisitor", + }, +) + +go_template_instance( + name = "walker_empty", + out = "walker_empty.go", + package = "pagetables", + prefix = "empty", + template = ":generic_walker", + types = { + "Visitor": "emptyVisitor", + }, +) + +go_template_instance( + name = "walker_check", + out = "walker_check.go", + package = "pagetables", + prefix = "check", + template = ":generic_walker", + types = { + "Visitor": "checkVisitor", + }, +) go_library( name = "pagetables", @@ -11,6 +78,10 @@ go_library( "pagetables_amd64.go", "pagetables_x86.go", "pcids_x86.go", + "walker_empty.go", + "walker_lookup.go", + "walker_map.go", + "walker_unmap.go", ], importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables", visibility = [ @@ -26,6 +97,7 @@ go_test( srcs = [ "pagetables_amd64_test.go", "pagetables_test.go", + "walker_check.go", ], embed = [":pagetables"], deps = ["//pkg/sentry/usermem"], diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index 6963ba62d..ff5787f89 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -13,6 +13,11 @@ // limitations under the License. // Package pagetables provides a generic implementation of pagetables. +// +// The core functions must be safe to call from a nosplit context. Furthermore, +// this pagetables implementation goes to lengths to ensure that all functions +// are free from runtime allocation. Calls to NewPTEs/FreePTEs may be made +// during walks, but these can be cached elsewhere if required. package pagetables import ( @@ -38,64 +43,179 @@ type PageTables struct { // New returns new PageTables. func New(a Allocator) *PageTables { - p := &PageTables{Allocator: a} + p := new(PageTables) + p.Init(a) + return p +} + +// Init initializes a set of PageTables. +// +//go:nosplit +func (p *PageTables) Init(allocator Allocator) { + p.Allocator = allocator p.root = p.Allocator.NewPTEs() p.rootPhysical = p.Allocator.PhysicalFor(p.root) - return p } +// mapVisitor is used for map. +type mapVisitor struct { + target uintptr // Input. + physical uintptr // Input. + opts MapOpts // Input. + prev bool // Output. +} + +// visit is used for map. 
+// +//go:nosplit +func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) { + p := v.physical + (start - uintptr(v.target)) + if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) { + v.prev = true + } + if p&align != 0 { + // We will install entries at a smaller granulaity if we don't + // install a valid entry here, however we must zap any existing + // entry to ensure this happens. + pte.Clear() + return + } + pte.Set(p, v.opts) +} + +//go:nosplit +func (*mapVisitor) requiresAlloc() bool { return true } + +//go:nosplit +func (*mapVisitor) requiresSplit() bool { return true } + // Map installs a mapping with the given physical address. // // True is returned iff there was a previous mapping in the range. // -// Precondition: addr & length must be aligned, their sum must not overflow. +// Precondition: addr & length must be page-aligned, their sum must not overflow. +// +//go:nosplit func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool { if !opts.AccessType.Any() { return p.Unmap(addr, length) } - prev := false - end, ok := addr.AddLength(uint64(length)) - if !ok { - panic("pagetables.Map: overflow") + w := mapWalker{ + pageTables: p, + visitor: mapVisitor{ + target: uintptr(addr), + physical: physical, + opts: opts, + }, } - p.iterateRange(uintptr(addr), uintptr(end), true, func(s, e uintptr, pte *PTE, align uintptr) { - p := physical + (s - uintptr(addr)) - prev = prev || (pte.Valid() && (p != pte.Address() || opts != pte.Opts())) - if p&align != 0 { - // We will install entries at a smaller granulaity if - // we don't install a valid entry here, however we must - // zap any existing entry to ensure this happens. - pte.Clear() - return - } - pte.Set(p, opts) - }) - return prev + w.iterateRange(uintptr(addr), uintptr(addr)+length) + return w.visitor.prev +} + +// unmapVisitor is used for unmap. +type unmapVisitor struct { + count int +} + +//go:nosplit +func (*unmapVisitor) requiresAlloc() bool { return false } + +//go:nosplit +func (*unmapVisitor) requiresSplit() bool { return true } + +// visit unmaps the given entry. +// +//go:nosplit +func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) { + pte.Clear() + v.count++ } // Unmap unmaps the given range. // // True is returned iff there was a previous mapping in the range. +// +// Precondition: addr & length must be page-aligned. +// +//go:nosplit func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool { - count := 0 - p.iterateRange(uintptr(addr), uintptr(addr)+length, false, func(s, e uintptr, pte *PTE, align uintptr) { - pte.Clear() - count++ - }) - return count > 0 + w := unmapWalker{ + pageTables: p, + visitor: unmapVisitor{ + count: 0, + }, + } + w.iterateRange(uintptr(addr), uintptr(addr)+length) + return w.visitor.count > 0 } +// emptyVisitor is used for emptiness checks. +type emptyVisitor struct { + count int +} + +//go:nosplit +func (*emptyVisitor) requiresAlloc() bool { return false } + +//go:nosplit +func (*emptyVisitor) requiresSplit() bool { return false } + +// visit unmaps the given entry. +// +//go:nosplit +func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) { + v.count++ +} + +// IsEmpty checks if the given range is empty. +// +// Precondition: addr & length must be page-aligned. 
+// +//go:nosplit +func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool { + w := emptyWalker{ + pageTables: p, + } + w.iterateRange(uintptr(addr), uintptr(addr)+length) + return w.visitor.count == 0 +} + +// lookupVisitor is used for lookup. +type lookupVisitor struct { + target uintptr // Input. + physical uintptr // Output. + opts MapOpts // Output. +} + +// visit matches the given address. +// +//go:nosplit +func (v *lookupVisitor) visit(start uintptr, pte *PTE, align uintptr) { + if !pte.Valid() { + return + } + v.physical = pte.Address() + (start - uintptr(v.target)) + v.opts = pte.Opts() +} + +//go:nosplit +func (*lookupVisitor) requiresAlloc() bool { return false } + +//go:nosplit +func (*lookupVisitor) requiresSplit() bool { return false } + // Lookup returns the physical address for the given virtual address. +// +//go:nosplit func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) { mask := uintptr(usermem.PageSize - 1) - off := uintptr(addr) & mask - addr = addr &^ usermem.Addr(mask) - p.iterateRange(uintptr(addr), uintptr(addr+usermem.PageSize), false, func(s, e uintptr, pte *PTE, align uintptr) { - if !pte.Valid() { - return - } - physical = pte.Address() + (s - uintptr(addr)) + off - opts = pte.Opts() - }) - return + offset := uintptr(addr) & mask + w := lookupWalker{ + pageTables: p, + visitor: lookupVisitor{ + target: uintptr(addr &^ usermem.Addr(mask)), + }, + } + w.iterateRange(uintptr(addr), uintptr(addr)+1) + return w.visitor.physical + offset, w.visitor.opts } diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go index 6a724e4fd..878463018 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// +build amd64 - package pagetables -import ( - "fmt" -) - // Address constraints. // // The lowerTop and upperBottom currently apply to four-level pagetables; @@ -49,273 +43,3 @@ const ( // PTEs is a collection of entries. type PTEs [entriesPerPage]PTE - -// next returns the next address quantized by the given size. -func next(start uint64, size uint64) uint64 { - start &= ^(size - 1) - start += size - return start -} - -// iterateRange iterates over all appropriate levels of page tables for the given range. -// -// If alloc is set, then Set _must_ be called on all given PTEs. The exception -// is super pages. If a valid super page cannot be installed, then the walk -// will continue to individual entries. -// -// This algorithm will attempt to maximize the use of super pages whenever -// possible. Whether a super page is provided will be clear through the range -// provided in the callback. -// -// Note that if alloc set, then no gaps will be present. However, if alloc is -// not set, then the iteration will likely be full of gaps. -// -// Note that this function should generally be avoided in favor of Map, Unmap, -// etc. when not necessary. -// -// Precondition: startAddr and endAddr must be page-aligned. -// -// Precondition: startStart must be less than endAddr. -// -// Precondition: If alloc is set, then startAddr and endAddr should not span -// non-canonical ranges. If they do, a panic will result. 
-func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn func(s, e uintptr, pte *PTE, align uintptr)) { - start := uint64(startAddr) - end := uint64(endAddr) - if start%pteSize != 0 { - panic(fmt.Sprintf("unaligned start: %v", start)) - } - if start > end { - panic(fmt.Sprintf("start > end (%v > %v))", start, end)) - } - - // Deal with cases where we traverse the "gap". - // - // These are all explicitly disallowed if alloc is set, and we must - // traverse an entry for each address explicitly. - switch { - case start < lowerTop && end > lowerTop && end < upperBottom: - if alloc { - panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) - } - p.iterateRange(startAddr, lowerTop, false, fn) - return - case start < lowerTop && end > lowerTop: - if alloc { - panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) - } - p.iterateRange(startAddr, lowerTop, false, fn) - p.iterateRange(upperBottom, endAddr, false, fn) - return - case start > lowerTop && end < upperBottom: - if alloc { - panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) - } - return - case start > lowerTop && start < upperBottom && end > upperBottom: - if alloc { - panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) - } - p.iterateRange(upperBottom, endAddr, false, fn) - return - } - - for pgdIndex := int((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { - var ( - pgdEntry = &p.root[pgdIndex] - pudEntries *PTEs - ) - if !pgdEntry.Valid() { - if !alloc { - // Skip over this entry. - start = next(start, pgdSize) - continue - } - - // Allocate a new pgd. - pudEntries = p.Allocator.NewPTEs() - pgdEntry.setPageTable(p, pudEntries) - } else { - pudEntries = p.Allocator.LookupPTEs(pgdEntry.Address()) - } - - // Map the next level. - clearPUDEntries := 0 - - for pudIndex := int((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { - var ( - pudEntry = &pudEntries[pudIndex] - pmdEntries *PTEs - ) - if !pudEntry.Valid() { - if !alloc { - // Skip over this entry. - clearPUDEntries++ - start = next(start, pudSize) - continue - } - - // This level has 1-GB super pages. Is this - // entire region contained in a single PUD - // entry? If so, we can skip allocating a new - // page for the pmd. - if start&(pudSize-1) == 0 && end-start >= pudSize { - pudEntry.SetSuper() - fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1) - if pudEntry.Valid() { - start = next(start, pudSize) - continue - } - } - - // Allocate a new pud. - pmdEntries = p.Allocator.NewPTEs() - pudEntry.setPageTable(p, pmdEntries) - - } else if pudEntry.IsSuper() { - // Does this page need to be split? - if start&(pudSize-1) != 0 || end < next(start, pudSize) { - currentAddr := uint64(pudEntry.Address()) - - // Install the relevant entries. - pmdEntries = p.Allocator.NewPTEs() - for index := 0; index < entriesPerPage; index++ { - pmdEntry := &pmdEntries[index] - pmdEntry.SetSuper() - pmdEntry.Set(uintptr(currentAddr), pudEntry.Opts()) - currentAddr += pmdSize - } - - // Reset to point to the new page. - pudEntry.setPageTable(p, pmdEntries) - } else { - // A super page to be checked directly. - fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1) - - // Might have been cleared. - if !pudEntry.Valid() { - clearPUDEntries++ - } - - // Note that the super page was changed. 
- start = next(start, pudSize) - continue - } - } else { - pmdEntries = p.Allocator.LookupPTEs(pudEntry.Address()) - } - - // Map the next level, since this is valid. - clearPMDEntries := 0 - - for pmdIndex := int((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { - var ( - pmdEntry = &pmdEntries[pmdIndex] - pteEntries *PTEs - ) - if !pmdEntry.Valid() { - if !alloc { - // Skip over this entry. - clearPMDEntries++ - start = next(start, pmdSize) - continue - } - - // This level has 2-MB huge pages. If this - // region is contained in a single PMD entry? - // As above, we can skip allocating a new page. - if start&(pmdSize-1) == 0 && end-start >= pmdSize { - pmdEntry.SetSuper() - fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1) - if pmdEntry.Valid() { - start = next(start, pmdSize) - continue - } - } - - // Allocate a new pmd. - pteEntries = p.Allocator.NewPTEs() - pmdEntry.setPageTable(p, pteEntries) - - } else if pmdEntry.IsSuper() { - // Does this page need to be split? - if start&(pmdSize-1) != 0 || end < next(start, pmdSize) { - currentAddr := uint64(pmdEntry.Address()) - - // Install the relevant entries. - pteEntries = p.Allocator.NewPTEs() - for index := 0; index < entriesPerPage; index++ { - pteEntry := &pteEntries[index] - pteEntry.Set(uintptr(currentAddr), pmdEntry.Opts()) - currentAddr += pteSize - } - - // Reset to point to the new page. - pmdEntry.setPageTable(p, pteEntries) - } else { - // A huge page to be checked directly. - fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1) - - // Might have been cleared. - if !pmdEntry.Valid() { - clearPMDEntries++ - } - - // Note that the huge page was changed. - start = next(start, pmdSize) - continue - } - } else { - pteEntries = p.Allocator.LookupPTEs(pmdEntry.Address()) - } - - // Map the next level, since this is valid. - clearPTEEntries := 0 - - for pteIndex := int((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { - var ( - pteEntry = &pteEntries[pteIndex] - ) - if !pteEntry.Valid() && !alloc { - clearPTEEntries++ - start += pteSize - continue - } - - // At this point, we are guaranteed that start%pteSize == 0. - fn(uintptr(start), uintptr(start+pteSize), pteEntry, pteSize-1) - if !pteEntry.Valid() { - if alloc { - panic("PTE not set after iteration with alloc=true!") - } - clearPTEEntries++ - } - - // Note that the pte was changed. - start += pteSize - continue - } - - // Check if we no longer need this page. - if clearPTEEntries == entriesPerPage { - pmdEntry.Clear() - p.Allocator.FreePTEs(pteEntries) - clearPMDEntries++ - } - } - - // Check if we no longer need this page. - if clearPMDEntries == entriesPerPage { - pudEntry.Clear() - p.Allocator.FreePTEs(pmdEntries) - clearPUDEntries++ - } - } - - // Check if we no longer need this page. - if clearPUDEntries == entriesPerPage { - pgdEntry.Clear() - p.Allocator.FreePTEs(pudEntries) - } - } -} diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go index 28178f656..dca3f69ef 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go @@ -27,48 +27,61 @@ type mapping struct { opts MapOpts } -func checkMappings(t *testing.T, pt *PageTables, m []mapping) { - var ( - current int - found []mapping - failed string - ) +type checkVisitor struct { + expected []mapping // Input. + current int // Temporary. + found []mapping // Output. 
+ failed string // Output. +} - // Iterate over all the mappings. - pt.iterateRange(0, ^uintptr(0), false, func(s, e uintptr, pte *PTE, align uintptr) { - found = append(found, mapping{ - start: s, - length: e - s, - addr: pte.Address(), - opts: pte.Opts(), - }) - if failed != "" { - // Don't keep looking for errors. - return - } - - if current >= len(m) { - failed = "more mappings than expected" - } else if m[current].start != s { - failed = "start didn't match expected" - } else if m[current].length != (e - s) { - failed = "end didn't match expected" - } else if m[current].addr != pte.Address() { - failed = "address didn't match expected" - } else if m[current].opts != pte.Opts() { - failed = "opts didn't match" - } - current++ +func (v *checkVisitor) visit(start uintptr, pte *PTE, align uintptr) { + v.found = append(v.found, mapping{ + start: start, + length: align + 1, + addr: pte.Address(), + opts: pte.Opts(), }) + if v.failed != "" { + // Don't keep looking for errors. + return + } + + if v.current >= len(v.expected) { + v.failed = "more mappings than expected" + } else if v.expected[v.current].start != start { + v.failed = "start didn't match expected" + } else if v.expected[v.current].length != (align + 1) { + v.failed = "end didn't match expected" + } else if v.expected[v.current].addr != pte.Address() { + v.failed = "address didn't match expected" + } else if v.expected[v.current].opts != pte.Opts() { + v.failed = "opts didn't match" + } + v.current++ +} + +func (*checkVisitor) requiresAlloc() bool { return false } + +func (*checkVisitor) requiresSplit() bool { return false } + +func checkMappings(t *testing.T, pt *PageTables, m []mapping) { + // Iterate over all the mappings. + w := checkWalker{ + pageTables: pt, + visitor: checkVisitor{ + expected: m, + }, + } + w.iterateRange(0, ^uintptr(0)) // Were we expected additional mappings? - if failed == "" && current != len(m) { - failed = "insufficient mappings found" + if w.visitor.failed == "" && w.visitor.current != len(w.visitor.expected) { + w.visitor.failed = "insufficient mappings found" } // Emit a meaningful error message on failure. - if failed != "" { - t.Errorf("%s; got %#v, wanted %#v", failed, found, m) + if w.visitor.failed != "" { + t.Errorf("%s; got %#v, wanted %#v", w.visitor.failed, w.visitor.found, w.visitor.expected) } } diff --git a/pkg/sentry/platform/ring0/pagetables/walker_amd64.go b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go new file mode 100644 index 000000000..afa4d473a --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go @@ -0,0 +1,307 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package pagetables + +// Visitor is a generic type. +type Visitor interface { + // visit is called on each PTE. + visit(start uintptr, pte *PTE, align uintptr) + + // requiresAlloc indicates that new entries should be allocated within + // the walked range. 
+ requiresAlloc() bool + + // requiresSplit indicates that entries in the given range should be + // split if they are huge or jumbo pages. + requiresSplit() bool +} + +// Walker walks page tables. +type Walker struct { + // pageTables are the tables to walk. + pageTables *PageTables + + // Visitor is the set of arguments. + visitor Visitor +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The +// exception is super pages. If a valid super page (huge or jumbo) cannot be +// installed, then the walk will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if requiresAlloc is true, then no gaps will be present. However, +// if alloc is not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: start must be page-aligned. +// +// Precondition: start must be less than end. +// +// Precondition: If requiresAlloc is true, then start and end should not span +// non-canonical ranges. If they do, a panic will result. +// +//go:nosplit +func (w *Walker) iterateRange(start, end uintptr) { + if start%pteSize != 0 { + panic("unaligned start") + } + if end < start { + panic("start > end") + } + if start < lowerTop { + if end <= lowerTop { + w.iterateRangeCanonical(start, end) + } else if end > lowerTop && end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(start, lowerTop) + w.iterateRangeCanonical(upperBottom, end) + } + } else if start < upperBottom { + if end <= upperBottom { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + } else { + if w.visitor.requiresAlloc() { + panic("alloc spans non-canonical range") + } + w.iterateRangeCanonical(upperBottom, end) + } + } else { + w.iterateRangeCanonical(start, end) + } +} + +// next returns the next address quantized by the given size. +// +//go:nosplit +func next(start uintptr, size uintptr) uintptr { + start &= ^(size - 1) + start += size + return start +} + +// iterateRangeCanonical walks a canonical range. +// +//go:nosplit +func (w *Walker) iterateRangeCanonical(start, end uintptr) { + for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + var ( + pgdEntry = &w.pageTables.root[pgdIndex] + pudEntries *PTEs + ) + if !pgdEntry.Valid() { + if !w.visitor.requiresAlloc() { + // Skip over this entry. + start = next(start, pgdSize) + continue + } + + // Allocate a new pgd. + pudEntries = w.pageTables.Allocator.NewPTEs() + pgdEntry.setPageTable(w.pageTables, pudEntries) + } else { + pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address()) + } + + // Map the next level. + clearPUDEntries := uint16(0) + + for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + var ( + pudEntry = &pudEntries[pudIndex] + pmdEntries *PTEs + ) + if !pudEntry.Valid() { + if !w.visitor.requiresAlloc() { + // Skip over this entry. 
+ clearPUDEntries++ + start = next(start, pudSize) + continue + } + + // This level has 1-GB super pages. Is this + // entire region at least as large as a single + // PUD entry? If so, we can skip allocating a + // new page for the pmd. + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = next(start, pudSize) + continue + } + } + + // Allocate a new pud. + pmdEntries = w.pageTables.Allocator.NewPTEs() + pudEntry.setPageTable(w.pageTables, pmdEntries) + + } else if pudEntry.IsSuper() { + // Does this page need to be split? + if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < next(start, pudSize)) { + // Install the relevant entries. + pmdEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pmdEntries[index].SetSuper() + pmdEntries[index].Set( + pudEntry.Address()+(pmdSize*uintptr(index)), + pudEntry.Opts()) + } + pudEntry.setPageTable(w.pageTables, pmdEntries) + } else { + // A super page to be checked directly. + w.visitor.visit(uintptr(start), pudEntry, pudSize-1) + + // Might have been cleared. + if !pudEntry.Valid() { + clearPUDEntries++ + } + + // Note that the super page was changed. + start = next(start, pudSize) + continue + } + } else { + pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address()) + } + + // Map the next level, since this is valid. + clearPMDEntries := uint16(0) + + for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + var ( + pmdEntry = &pmdEntries[pmdIndex] + pteEntries *PTEs + ) + if !pmdEntry.Valid() { + if !w.visitor.requiresAlloc() { + // Skip over this entry. + clearPMDEntries++ + start = next(start, pmdSize) + continue + } + + // This level has 2-MB huge pages. If this + // region is contined in a single PMD entry? + // As above, we can skip allocating a new page. + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = next(start, pmdSize) + continue + } + } + + // Allocate a new pmd. + pteEntries = w.pageTables.Allocator.NewPTEs() + pmdEntry.setPageTable(w.pageTables, pteEntries) + + } else if pmdEntry.IsSuper() { + // Does this page need to be split? + if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < next(start, pmdSize)) { + // Install the relevant entries. + pteEntries = w.pageTables.Allocator.NewPTEs() + for index := uint16(0); index < entriesPerPage; index++ { + pteEntries[index].Set( + pmdEntry.Address()+(pteSize*uintptr(index)), + pmdEntry.Opts()) + } + pmdEntry.setPageTable(w.pageTables, pteEntries) + } else { + // A huge page to be checked directly. + w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1) + + // Might have been cleared. + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + // Note that the huge page was changed. + start = next(start, pmdSize) + continue + } + } else { + pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address()) + } + + // Map the next level, since this is valid. + clearPTEEntries := uint16(0) + + for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + var ( + pteEntry = &pteEntries[pteIndex] + ) + if !pteEntry.Valid() && !w.visitor.requiresAlloc() { + clearPTEEntries++ + start += pteSize + continue + } + + // At this point, we are guaranteed that start%pteSize == 0. 
+ w.visitor.visit(uintptr(start), pteEntry, pteSize-1) + if !pteEntry.Valid() { + if w.visitor.requiresAlloc() { + panic("PTE not set after iteration with requiresAlloc!") + } + clearPTEEntries++ + } + + // Note that the pte was changed. + start += pteSize + continue + } + + // Check if we no longer need this page. + if clearPTEEntries == entriesPerPage { + pmdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pteEntries) + clearPMDEntries++ + } + } + + // Check if we no longer need this page. + if clearPMDEntries == entriesPerPage { + pudEntry.Clear() + w.pageTables.Allocator.FreePTEs(pmdEntries) + clearPUDEntries++ + } + } + + // Check if we no longer need this page. + if clearPUDEntries == entriesPerPage { + pgdEntry.Clear() + w.pageTables.Allocator.FreePTEs(pudEntries) + } + } +} |
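For orientation, here is a minimal usage sketch of the PageTables API as it stands after this change. The helper name exampleUsage, the chosen addresses, and the physical frame are illustrative only; Allocator, MapOpts, and the usermem constants are the package's own types:

```go
package pagetables

import "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"

// exampleUsage is a hypothetical helper (not part of this commit) that
// exercises the public API: map a page, look it up, then unmap it.
func exampleUsage(a Allocator) {
	pt := New(a)

	vaddr := usermem.Addr(0x400000)           // Page-aligned virtual address.
	opts := MapOpts{AccessType: usermem.Read} // Read-only mapping.

	// Install a single page at vaddr backed by physical frame 0x1000.
	pt.Map(vaddr, usermem.PageSize, opts, 0x1000)

	// Lookup returns the physical address and options for vaddr.
	physical, _ := pt.Lookup(vaddr)
	_ = physical // 0x1000 for the mapping above.

	// Remove the mapping; the range should now report as empty.
	pt.Unmap(vaddr, usermem.PageSize)
	if !pt.IsEmpty(vaddr, usermem.PageSize) {
		panic("range still mapped")
	}
}
```

Because Map, Unmap, Lookup, and IsEmpty are all //go:nosplit after this change, a hypervisor-backed platform can call them while it must avoid stack splits, with splits remaining possible only inside the allocator itself.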