author    Adin Scannell <ascannell@google.com>   2018-06-11 18:14:22 -0700
committer Shentubot <shentubot@google.com>       2018-06-11 18:15:14 -0700
commit    1397a413b49d6036f2586e85c8074aa3d4d6c6fa (patch)
tree      7bb617429fe26dfdc47bc02bab8d74eb621775e6
parent    09b0a9c320bd777bc52384bd0ec91ecfc61e481d (diff)
Make page tables split-safe.
In order to minimize the likelihood of exit during page table modifications, make the full set of page table functions split-safe. This is not strictly necessary (and you may still incur splits due to allocations from the allocator pool) but should make retries a very rare occurrence.

PiperOrigin-RevId: 200146688
Change-Id: I8fa36aa16b807beda2f0b057be60038258e8d597
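What "split-safe" means in practice here: the closure-based iterateRange callback is replaced by small per-operation visitor structs driven by generated walkers, so the walk captures no variables, performs no hidden allocation, and each step can carry //go:nosplit. A minimal, self-contained sketch of that shape (placeholder types, illustrative only, not the real page-table code):

package sketch

// entry stands in for a page-table entry in this illustration.
type entry struct{ valid bool }

// Closure style (the old iterateRange shape): the callback captures count,
// which may force the closure and its captures onto the heap and makes it
// awkward to mark the walk nosplit.
func countWithClosure(entries []entry) int {
	count := 0
	walk := func(e *entry) {
		if e.valid {
			count++ // captured variable
		}
	}
	for i := range entries {
		walk(&entries[i])
	}
	return count
}

// Visitor style (the new shape): state lives in a plain struct passed by
// pointer, and every method can be annotated //go:nosplit.
type countVisitor struct{ count int }

//go:nosplit
func (v *countVisitor) visit(e *entry) {
	if e.valid {
		v.count++
	}
}

func countWithVisitor(entries []entry) int {
	var v countVisitor
	for i := range entries {
		v.visit(&entries[i])
	}
	return v.count
}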
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/BUILD                |  72
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pagetables.go        | 192
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go  | 276
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pagetables_test.go   |  83
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/walker_amd64.go      | 307
5 files changed, 583 insertions, 347 deletions
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index 08b73e87d..023e298a0 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -1,6 +1,73 @@
package(licenses = ["notice"]) # Apache 2.0
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance")
+
+go_template(
+ name = "generic_walker",
+ srcs = [
+ "walker_amd64.go",
+ ],
+ opt_types = [
+ "Visitor",
+ ],
+ visibility = [":__pkg__"],
+)
+
+go_template_instance(
+ name = "walker_map",
+ out = "walker_map.go",
+ package = "pagetables",
+ prefix = "map",
+ template = ":generic_walker",
+ types = {
+ "Visitor": "mapVisitor",
+ },
+)
+
+go_template_instance(
+ name = "walker_unmap",
+ out = "walker_unmap.go",
+ package = "pagetables",
+ prefix = "unmap",
+ template = ":generic_walker",
+ types = {
+ "Visitor": "unmapVisitor",
+ },
+)
+
+go_template_instance(
+ name = "walker_lookup",
+ out = "walker_lookup.go",
+ package = "pagetables",
+ prefix = "lookup",
+ template = ":generic_walker",
+ types = {
+ "Visitor": "lookupVisitor",
+ },
+)
+
+go_template_instance(
+ name = "walker_empty",
+ out = "walker_empty.go",
+ package = "pagetables",
+ prefix = "empty",
+ template = ":generic_walker",
+ types = {
+ "Visitor": "emptyVisitor",
+ },
+)
+
+go_template_instance(
+ name = "walker_check",
+ out = "walker_check.go",
+ package = "pagetables",
+ prefix = "check",
+ template = ":generic_walker",
+ types = {
+ "Visitor": "checkVisitor",
+ },
+)
go_library(
name = "pagetables",
@@ -11,6 +78,10 @@ go_library(
"pagetables_amd64.go",
"pagetables_x86.go",
"pcids_x86.go",
+ "walker_empty.go",
+ "walker_lookup.go",
+ "walker_map.go",
+ "walker_unmap.go",
],
importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables",
visibility = [
@@ -26,6 +97,7 @@ go_test(
srcs = [
"pagetables_amd64_test.go",
"pagetables_test.go",
+ "walker_check.go",
],
embed = [":pagetables"],
deps = ["//pkg/sentry/usermem"],
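Each go_template_instance above stamps out a copy of walker_amd64.go with identifiers renamed by the given prefix and the Visitor type bound to the named concrete visitor, producing mapWalker, unmapWalker, lookupWalker, emptyWalker and checkWalker. Roughly, and only as a hand-written approximation (the real files are generated by //tools/go_generics and may differ in detail), the map instantiation behaves as if package pagetables contained:

// mapWalker is what walker_map.go provides: the template's Walker with its
// visitor field bound to the concrete mapVisitor type, so visit,
// requiresAlloc and requiresSplit are direct (devirtualized) calls.
type mapWalker struct {
	pageTables *PageTables
	visitor    mapVisitor
}

func (w *mapWalker) iterateRange(start, end uintptr) {
	// Same body as the generic Walker.iterateRange in walker_amd64.go
	// below, with w.visitor typed as mapVisitor rather than an interface.
}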
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
index 6963ba62d..ff5787f89 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -13,6 +13,11 @@
// limitations under the License.
// Package pagetables provides a generic implementation of pagetables.
+//
+// The core functions must be safe to call from a nosplit context. Furthermore,
+// this pagetables implementation goes to lengths to ensure that all functions
+// are free from runtime allocation. Calls to NewPTEs/FreePTEs may be made
+// during walks, but these can be cached elsewhere if required.
package pagetables
import (
@@ -38,64 +43,179 @@ type PageTables struct {
// New returns new PageTables.
func New(a Allocator) *PageTables {
- p := &PageTables{Allocator: a}
+ p := new(PageTables)
+ p.Init(a)
+ return p
+}
+
+// Init initializes a set of PageTables.
+//
+//go:nosplit
+func (p *PageTables) Init(allocator Allocator) {
+ p.Allocator = allocator
p.root = p.Allocator.NewPTEs()
p.rootPhysical = p.Allocator.PhysicalFor(p.root)
- return p
}
+// mapVisitor is used for map.
+type mapVisitor struct {
+ target uintptr // Input.
+ physical uintptr // Input.
+ opts MapOpts // Input.
+ prev bool // Output.
+}
+
+// visit is used for map.
+//
+//go:nosplit
+func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+ p := v.physical + (start - uintptr(v.target))
+ if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) {
+ v.prev = true
+ }
+ if p&align != 0 {
+ // We will install entries at a smaller granularity if we don't
+ // install a valid entry here; however, we must zap any existing
+ // entry to ensure this happens.
+ pte.Clear()
+ return
+ }
+ pte.Set(p, v.opts)
+}
+
+//go:nosplit
+func (*mapVisitor) requiresAlloc() bool { return true }
+
+//go:nosplit
+func (*mapVisitor) requiresSplit() bool { return true }
+
// Map installs a mapping with the given physical address.
//
// True is returned iff there was a previous mapping in the range.
//
-// Precondition: addr & length must be aligned, their sum must not overflow.
+// Precondition: addr & length must be page-aligned, their sum must not overflow.
+//
+//go:nosplit
func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool {
if !opts.AccessType.Any() {
return p.Unmap(addr, length)
}
- prev := false
- end, ok := addr.AddLength(uint64(length))
- if !ok {
- panic("pagetables.Map: overflow")
+ w := mapWalker{
+ pageTables: p,
+ visitor: mapVisitor{
+ target: uintptr(addr),
+ physical: physical,
+ opts: opts,
+ },
}
- p.iterateRange(uintptr(addr), uintptr(end), true, func(s, e uintptr, pte *PTE, align uintptr) {
- p := physical + (s - uintptr(addr))
- prev = prev || (pte.Valid() && (p != pte.Address() || opts != pte.Opts()))
- if p&align != 0 {
- // We will install entries at a smaller granulaity if
- // we don't install a valid entry here, however we must
- // zap any existing entry to ensure this happens.
- pte.Clear()
- return
- }
- pte.Set(p, opts)
- })
- return prev
+ w.iterateRange(uintptr(addr), uintptr(addr)+length)
+ return w.visitor.prev
+}
+
+// unmapVisitor is used for unmap.
+type unmapVisitor struct {
+ count int
+}
+
+//go:nosplit
+func (*unmapVisitor) requiresAlloc() bool { return false }
+
+//go:nosplit
+func (*unmapVisitor) requiresSplit() bool { return true }
+
+// visit unmaps the given entry.
+//
+//go:nosplit
+func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+ pte.Clear()
+ v.count++
}
// Unmap unmaps the given range.
//
// True is returned iff there was a previous mapping in the range.
+//
+// Precondition: addr & length must be page-aligned.
+//
+//go:nosplit
func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
- count := 0
- p.iterateRange(uintptr(addr), uintptr(addr)+length, false, func(s, e uintptr, pte *PTE, align uintptr) {
- pte.Clear()
- count++
- })
- return count > 0
+ w := unmapWalker{
+ pageTables: p,
+ visitor: unmapVisitor{
+ count: 0,
+ },
+ }
+ w.iterateRange(uintptr(addr), uintptr(addr)+length)
+ return w.visitor.count > 0
}
+// emptyVisitor is used for emptiness checks.
+type emptyVisitor struct {
+ count int
+}
+
+//go:nosplit
+func (*emptyVisitor) requiresAlloc() bool { return false }
+
+//go:nosplit
+func (*emptyVisitor) requiresSplit() bool { return false }
+
+// visit counts the given entry.
+//
+//go:nosplit
+func (v *emptyVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+ v.count++
+}
+
+// IsEmpty checks if the given range is empty.
+//
+// Precondition: addr & length must be page-aligned.
+//
+//go:nosplit
+func (p *PageTables) IsEmpty(addr usermem.Addr, length uintptr) bool {
+ w := emptyWalker{
+ pageTables: p,
+ }
+ w.iterateRange(uintptr(addr), uintptr(addr)+length)
+ return w.visitor.count == 0
+}
+
+// lookupVisitor is used for lookup.
+type lookupVisitor struct {
+ target uintptr // Input.
+ physical uintptr // Output.
+ opts MapOpts // Output.
+}
+
+// visit records the physical address and options for the given entry.
+//
+//go:nosplit
+func (v *lookupVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+ if !pte.Valid() {
+ return
+ }
+ v.physical = pte.Address() + (start - uintptr(v.target))
+ v.opts = pte.Opts()
+}
+
+//go:nosplit
+func (*lookupVisitor) requiresAlloc() bool { return false }
+
+//go:nosplit
+func (*lookupVisitor) requiresSplit() bool { return false }
+
// Lookup returns the physical address for the given virtual address.
+//
+//go:nosplit
func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts) {
mask := uintptr(usermem.PageSize - 1)
- off := uintptr(addr) & mask
- addr = addr &^ usermem.Addr(mask)
- p.iterateRange(uintptr(addr), uintptr(addr+usermem.PageSize), false, func(s, e uintptr, pte *PTE, align uintptr) {
- if !pte.Valid() {
- return
- }
- physical = pte.Address() + (s - uintptr(addr)) + off
- opts = pte.Opts()
- })
- return
+ offset := uintptr(addr) & mask
+ w := lookupWalker{
+ pageTables: p,
+ visitor: lookupVisitor{
+ target: uintptr(addr &^ usermem.Addr(mask)),
+ },
+ }
+ w.iterateRange(uintptr(addr), uintptr(addr)+1)
+ return w.visitor.physical + offset, w.visitor.opts
}
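With the visitors in place, the exported shape of the package is unchanged; only Init and IsEmpty are new. A usage sketch (the demo function and its addresses are invented for illustration, an Allocator implementation is assumed to be supplied by the platform, and import paths follow the BUILD files in this tree):

package example

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

func demo(alloc pagetables.Allocator) {
	pt := pagetables.New(alloc)

	// Map one page read/write at 0x400000, backed by physical 0x1000.
	// Map reports whether a previous mapping existed in the range.
	_ = pt.Map(usermem.Addr(0x400000), usermem.PageSize, pagetables.MapOpts{
		AccessType: usermem.ReadWrite,
	}, 0x1000)

	// Lookup returns the physical address including the page offset.
	phys, opts := pt.Lookup(usermem.Addr(0x400010))
	_, _ = phys, opts

	// Unmap the page and confirm the range is empty again.
	pt.Unmap(usermem.Addr(0x400000), usermem.PageSize)
	if !pt.IsEmpty(usermem.Addr(0x400000), usermem.PageSize) {
		panic("expected range to be empty")
	}
}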
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
index 6a724e4fd..878463018 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// +build amd64
-
package pagetables
-import (
- "fmt"
-)
-
// Address constraints.
//
// The lowerTop and upperBottom currently apply to four-level pagetables;
@@ -49,273 +43,3 @@ const (
// PTEs is a collection of entries.
type PTEs [entriesPerPage]PTE
-
-// next returns the next address quantized by the given size.
-func next(start uint64, size uint64) uint64 {
- start &= ^(size - 1)
- start += size
- return start
-}
-
-// iterateRange iterates over all appropriate levels of page tables for the given range.
-//
-// If alloc is set, then Set _must_ be called on all given PTEs. The exception
-// is super pages. If a valid super page cannot be installed, then the walk
-// will continue to individual entries.
-//
-// This algorithm will attempt to maximize the use of super pages whenever
-// possible. Whether a super page is provided will be clear through the range
-// provided in the callback.
-//
-// Note that if alloc set, then no gaps will be present. However, if alloc is
-// not set, then the iteration will likely be full of gaps.
-//
-// Note that this function should generally be avoided in favor of Map, Unmap,
-// etc. when not necessary.
-//
-// Precondition: startAddr and endAddr must be page-aligned.
-//
-// Precondition: startStart must be less than endAddr.
-//
-// Precondition: If alloc is set, then startAddr and endAddr should not span
-// non-canonical ranges. If they do, a panic will result.
-func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn func(s, e uintptr, pte *PTE, align uintptr)) {
- start := uint64(startAddr)
- end := uint64(endAddr)
- if start%pteSize != 0 {
- panic(fmt.Sprintf("unaligned start: %v", start))
- }
- if start > end {
- panic(fmt.Sprintf("start > end (%v > %v))", start, end))
- }
-
- // Deal with cases where we traverse the "gap".
- //
- // These are all explicitly disallowed if alloc is set, and we must
- // traverse an entry for each address explicitly.
- switch {
- case start < lowerTop && end > lowerTop && end < upperBottom:
- if alloc {
- panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
- }
- p.iterateRange(startAddr, lowerTop, false, fn)
- return
- case start < lowerTop && end > lowerTop:
- if alloc {
- panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
- }
- p.iterateRange(startAddr, lowerTop, false, fn)
- p.iterateRange(upperBottom, endAddr, false, fn)
- return
- case start > lowerTop && end < upperBottom:
- if alloc {
- panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
- }
- return
- case start > lowerTop && start < upperBottom && end > upperBottom:
- if alloc {
- panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
- }
- p.iterateRange(upperBottom, endAddr, false, fn)
- return
- }
-
- for pgdIndex := int((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
- var (
- pgdEntry = &p.root[pgdIndex]
- pudEntries *PTEs
- )
- if !pgdEntry.Valid() {
- if !alloc {
- // Skip over this entry.
- start = next(start, pgdSize)
- continue
- }
-
- // Allocate a new pgd.
- pudEntries = p.Allocator.NewPTEs()
- pgdEntry.setPageTable(p, pudEntries)
- } else {
- pudEntries = p.Allocator.LookupPTEs(pgdEntry.Address())
- }
-
- // Map the next level.
- clearPUDEntries := 0
-
- for pudIndex := int((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
- var (
- pudEntry = &pudEntries[pudIndex]
- pmdEntries *PTEs
- )
- if !pudEntry.Valid() {
- if !alloc {
- // Skip over this entry.
- clearPUDEntries++
- start = next(start, pudSize)
- continue
- }
-
- // This level has 1-GB super pages. Is this
- // entire region contained in a single PUD
- // entry? If so, we can skip allocating a new
- // page for the pmd.
- if start&(pudSize-1) == 0 && end-start >= pudSize {
- pudEntry.SetSuper()
- fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1)
- if pudEntry.Valid() {
- start = next(start, pudSize)
- continue
- }
- }
-
- // Allocate a new pud.
- pmdEntries = p.Allocator.NewPTEs()
- pudEntry.setPageTable(p, pmdEntries)
-
- } else if pudEntry.IsSuper() {
- // Does this page need to be split?
- if start&(pudSize-1) != 0 || end < next(start, pudSize) {
- currentAddr := uint64(pudEntry.Address())
-
- // Install the relevant entries.
- pmdEntries = p.Allocator.NewPTEs()
- for index := 0; index < entriesPerPage; index++ {
- pmdEntry := &pmdEntries[index]
- pmdEntry.SetSuper()
- pmdEntry.Set(uintptr(currentAddr), pudEntry.Opts())
- currentAddr += pmdSize
- }
-
- // Reset to point to the new page.
- pudEntry.setPageTable(p, pmdEntries)
- } else {
- // A super page to be checked directly.
- fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1)
-
- // Might have been cleared.
- if !pudEntry.Valid() {
- clearPUDEntries++
- }
-
- // Note that the super page was changed.
- start = next(start, pudSize)
- continue
- }
- } else {
- pmdEntries = p.Allocator.LookupPTEs(pudEntry.Address())
- }
-
- // Map the next level, since this is valid.
- clearPMDEntries := 0
-
- for pmdIndex := int((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
- var (
- pmdEntry = &pmdEntries[pmdIndex]
- pteEntries *PTEs
- )
- if !pmdEntry.Valid() {
- if !alloc {
- // Skip over this entry.
- clearPMDEntries++
- start = next(start, pmdSize)
- continue
- }
-
- // This level has 2-MB huge pages. If this
- // region is contained in a single PMD entry?
- // As above, we can skip allocating a new page.
- if start&(pmdSize-1) == 0 && end-start >= pmdSize {
- pmdEntry.SetSuper()
- fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1)
- if pmdEntry.Valid() {
- start = next(start, pmdSize)
- continue
- }
- }
-
- // Allocate a new pmd.
- pteEntries = p.Allocator.NewPTEs()
- pmdEntry.setPageTable(p, pteEntries)
-
- } else if pmdEntry.IsSuper() {
- // Does this page need to be split?
- if start&(pmdSize-1) != 0 || end < next(start, pmdSize) {
- currentAddr := uint64(pmdEntry.Address())
-
- // Install the relevant entries.
- pteEntries = p.Allocator.NewPTEs()
- for index := 0; index < entriesPerPage; index++ {
- pteEntry := &pteEntries[index]
- pteEntry.Set(uintptr(currentAddr), pmdEntry.Opts())
- currentAddr += pteSize
- }
-
- // Reset to point to the new page.
- pmdEntry.setPageTable(p, pteEntries)
- } else {
- // A huge page to be checked directly.
- fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1)
-
- // Might have been cleared.
- if !pmdEntry.Valid() {
- clearPMDEntries++
- }
-
- // Note that the huge page was changed.
- start = next(start, pmdSize)
- continue
- }
- } else {
- pteEntries = p.Allocator.LookupPTEs(pmdEntry.Address())
- }
-
- // Map the next level, since this is valid.
- clearPTEEntries := 0
-
- for pteIndex := int((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
- var (
- pteEntry = &pteEntries[pteIndex]
- )
- if !pteEntry.Valid() && !alloc {
- clearPTEEntries++
- start += pteSize
- continue
- }
-
- // At this point, we are guaranteed that start%pteSize == 0.
- fn(uintptr(start), uintptr(start+pteSize), pteEntry, pteSize-1)
- if !pteEntry.Valid() {
- if alloc {
- panic("PTE not set after iteration with alloc=true!")
- }
- clearPTEEntries++
- }
-
- // Note that the pte was changed.
- start += pteSize
- continue
- }
-
- // Check if we no longer need this page.
- if clearPTEEntries == entriesPerPage {
- pmdEntry.Clear()
- p.Allocator.FreePTEs(pteEntries)
- clearPMDEntries++
- }
- }
-
- // Check if we no longer need this page.
- if clearPMDEntries == entriesPerPage {
- pudEntry.Clear()
- p.Allocator.FreePTEs(pmdEntries)
- clearPUDEntries++
- }
- }
-
- // Check if we no longer need this page.
- if clearPUDEntries == entriesPerPage {
- pgdEntry.Clear()
- p.Allocator.FreePTEs(pudEntries)
- }
- }
-}
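The closure-based iterateRange and its next helper are deleted here; next reappears in walker_amd64.go below, taking uintptr instead of uint64. A small worked example of its quantization (standalone sketch, with the 2-MB size written as a literal):

// nextSketch mirrors next(): round start down to a size-aligned boundary,
// then advance by one full unit of that size.
func nextSketch(start, size uintptr) uintptr {
	start &= ^(size - 1)
	start += size
	return start
}

// nextSketch(0x201000, 0x200000) == 0x400000: from inside one 2-MB region,
// the walk resumes at the start of the following 2-MB region.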
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
index 28178f656..dca3f69ef 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
@@ -27,48 +27,61 @@ type mapping struct {
opts MapOpts
}
-func checkMappings(t *testing.T, pt *PageTables, m []mapping) {
- var (
- current int
- found []mapping
- failed string
- )
+type checkVisitor struct {
+ expected []mapping // Input.
+ current int // Temporary.
+ found []mapping // Output.
+ failed string // Output.
+}
- // Iterate over all the mappings.
- pt.iterateRange(0, ^uintptr(0), false, func(s, e uintptr, pte *PTE, align uintptr) {
- found = append(found, mapping{
- start: s,
- length: e - s,
- addr: pte.Address(),
- opts: pte.Opts(),
- })
- if failed != "" {
- // Don't keep looking for errors.
- return
- }
-
- if current >= len(m) {
- failed = "more mappings than expected"
- } else if m[current].start != s {
- failed = "start didn't match expected"
- } else if m[current].length != (e - s) {
- failed = "end didn't match expected"
- } else if m[current].addr != pte.Address() {
- failed = "address didn't match expected"
- } else if m[current].opts != pte.Opts() {
- failed = "opts didn't match"
- }
- current++
+func (v *checkVisitor) visit(start uintptr, pte *PTE, align uintptr) {
+ v.found = append(v.found, mapping{
+ start: start,
+ length: align + 1,
+ addr: pte.Address(),
+ opts: pte.Opts(),
})
+ if v.failed != "" {
+ // Don't keep looking for errors.
+ return
+ }
+
+ if v.current >= len(v.expected) {
+ v.failed = "more mappings than expected"
+ } else if v.expected[v.current].start != start {
+ v.failed = "start didn't match expected"
+ } else if v.expected[v.current].length != (align + 1) {
+ v.failed = "end didn't match expected"
+ } else if v.expected[v.current].addr != pte.Address() {
+ v.failed = "address didn't match expected"
+ } else if v.expected[v.current].opts != pte.Opts() {
+ v.failed = "opts didn't match"
+ }
+ v.current++
+}
+
+func (*checkVisitor) requiresAlloc() bool { return false }
+
+func (*checkVisitor) requiresSplit() bool { return false }
+
+func checkMappings(t *testing.T, pt *PageTables, m []mapping) {
+ // Iterate over all the mappings.
+ w := checkWalker{
+ pageTables: pt,
+ visitor: checkVisitor{
+ expected: m,
+ },
+ }
+ w.iterateRange(0, ^uintptr(0))
// Were we expecting additional mappings?
- if failed == "" && current != len(m) {
- failed = "insufficient mappings found"
+ if w.visitor.failed == "" && w.visitor.current != len(w.visitor.expected) {
+ w.visitor.failed = "insufficient mappings found"
}
// Emit a meaningful error message on failure.
- if failed != "" {
- t.Errorf("%s; got %#v, wanted %#v", failed, found, m)
+ if w.visitor.failed != "" {
+ t.Errorf("%s; got %#v, wanted %#v", w.visitor.failed, w.visitor.found, w.visitor.expected)
}
}
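Note that checkVisitor derives each found mapping's length from the alignment passed to visit (align+1), so expected mappings in these tests are expressed in terms of the entry size rather than an explicit end address. A sketch of an expected slice (addresses and options invented for illustration, written as if in this test package):

func exampleExpected(opts MapOpts) []mapping {
	return []mapping{
		// A single 4-KB PTE: visit sees align = pteSize-1, so the
		// recorded length is pteSize.
		{start: 0x400000, length: pteSize, addr: 0x1000, opts: opts},
		// A 2-MB super page: align = pmdSize-1, so the recorded
		// length is pmdSize.
		{start: 0x600000, length: pmdSize, addr: 0x200000, opts: opts},
	}
}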
diff --git a/pkg/sentry/platform/ring0/pagetables/walker_amd64.go b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go
new file mode 100644
index 000000000..afa4d473a
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/walker_amd64.go
@@ -0,0 +1,307 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package pagetables
+
+// Visitor is a generic type.
+type Visitor interface {
+ // visit is called on each PTE.
+ visit(start uintptr, pte *PTE, align uintptr)
+
+ // requiresAlloc indicates that new entries should be allocated within
+ // the walked range.
+ requiresAlloc() bool
+
+ // requiresSplit indicates that entries in the given range should be
+ // split if they are huge or jumbo pages.
+ requiresSplit() bool
+}
+
+// Walker walks page tables.
+type Walker struct {
+ // pageTables are the tables to walk.
+ pageTables *PageTables
+
+ // Visitor is the set of arguments.
+ visitor Visitor
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If requiresAlloc is true, then Set _must_ be called on all given PTEs. The
+// exception is super pages. If a valid super page (huge or jumbo) cannot be
+// installed, then the walk will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if requiresAlloc is true, then no gaps will be present. However,
+// if requiresAlloc is false, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: start must be page-aligned.
+//
+// Precondition: start must be less than end.
+//
+// Precondition: If requiresAlloc is true, then start and end should not span
+// non-canonical ranges. If they do, a panic will result.
+//
+//go:nosplit
+func (w *Walker) iterateRange(start, end uintptr) {
+ if start%pteSize != 0 {
+ panic("unaligned start")
+ }
+ if end < start {
+ panic("start > end")
+ }
+ if start < lowerTop {
+ if end <= lowerTop {
+ w.iterateRangeCanonical(start, end)
+ } else if end > lowerTop && end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(start, lowerTop)
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else if start < upperBottom {
+ if end <= upperBottom {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ } else {
+ if w.visitor.requiresAlloc() {
+ panic("alloc spans non-canonical range")
+ }
+ w.iterateRangeCanonical(upperBottom, end)
+ }
+ } else {
+ w.iterateRangeCanonical(start, end)
+ }
+}
+
+// next returns the next address quantized by the given size.
+//
+//go:nosplit
+func next(start uintptr, size uintptr) uintptr {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
+
+// iterateRangeCanonical walks a canonical range.
+//
+//go:nosplit
+func (w *Walker) iterateRangeCanonical(start, end uintptr) {
+ for pgdIndex := uint16((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ var (
+ pgdEntry = &w.pageTables.root[pgdIndex]
+ pudEntries *PTEs
+ )
+ if !pgdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+ // Skip over this entry.
+ start = next(start, pgdSize)
+ continue
+ }
+
+ // Allocate a new pgd.
+ pudEntries = w.pageTables.Allocator.NewPTEs()
+ pgdEntry.setPageTable(w.pageTables, pudEntries)
+ } else {
+ pudEntries = w.pageTables.Allocator.LookupPTEs(pgdEntry.Address())
+ }
+
+ // Map the next level.
+ clearPUDEntries := uint16(0)
+
+ for pudIndex := uint16((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ var (
+ pudEntry = &pudEntries[pudIndex]
+ pmdEntries *PTEs
+ )
+ if !pudEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+ // Skip over this entry.
+ clearPUDEntries++
+ start = next(start, pudSize)
+ continue
+ }
+
+ // This level has 1-GB super pages. Is this
+ // entire region at least as large as a single
+ // PUD entry? If so, we can skip allocating a
+ // new page for the pmd.
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSuper()
+ w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+ if pudEntry.Valid() {
+ start = next(start, pudSize)
+ continue
+ }
+ }
+
+ // Allocate a new pud.
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+
+ } else if pudEntry.IsSuper() {
+ // Does this page need to be split?
+ if w.visitor.requiresSplit() && (start&(pudSize-1) != 0 || end < next(start, pudSize)) {
+ // Install the relevant entries.
+ pmdEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pmdEntries[index].SetSuper()
+ pmdEntries[index].Set(
+ pudEntry.Address()+(pmdSize*uintptr(index)),
+ pudEntry.Opts())
+ }
+ pudEntry.setPageTable(w.pageTables, pmdEntries)
+ } else {
+ // A super page to be checked directly.
+ w.visitor.visit(uintptr(start), pudEntry, pudSize-1)
+
+ // Might have been cleared.
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ // Note that the super page was changed.
+ start = next(start, pudSize)
+ continue
+ }
+ } else {
+ pmdEntries = w.pageTables.Allocator.LookupPTEs(pudEntry.Address())
+ }
+
+ // Map the next level, since this is valid.
+ clearPMDEntries := uint16(0)
+
+ for pmdIndex := uint16((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ var (
+ pmdEntry = &pmdEntries[pmdIndex]
+ pteEntries *PTEs
+ )
+ if !pmdEntry.Valid() {
+ if !w.visitor.requiresAlloc() {
+ // Skip over this entry.
+ clearPMDEntries++
+ start = next(start, pmdSize)
+ continue
+ }
+
+ // This level has 2-MB huge pages. Is this
+ // region contained in a single PMD entry?
+ // If so, as above, we can skip allocating a new page.
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSuper()
+ w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+ if pmdEntry.Valid() {
+ start = next(start, pmdSize)
+ continue
+ }
+ }
+
+ // Allocate a new pmd.
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+
+ } else if pmdEntry.IsSuper() {
+ // Does this page need to be split?
+ if w.visitor.requiresSplit() && (start&(pmdSize-1) != 0 || end < next(start, pmdSize)) {
+ // Install the relevant entries.
+ pteEntries = w.pageTables.Allocator.NewPTEs()
+ for index := uint16(0); index < entriesPerPage; index++ {
+ pteEntries[index].Set(
+ pmdEntry.Address()+(pteSize*uintptr(index)),
+ pmdEntry.Opts())
+ }
+ pmdEntry.setPageTable(w.pageTables, pteEntries)
+ } else {
+ // A huge page to be checked directly.
+ w.visitor.visit(uintptr(start), pmdEntry, pmdSize-1)
+
+ // Might have been cleared.
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ // Note that the huge page was changed.
+ start = next(start, pmdSize)
+ continue
+ }
+ } else {
+ pteEntries = w.pageTables.Allocator.LookupPTEs(pmdEntry.Address())
+ }
+
+ // Map the next level, since this is valid.
+ clearPTEEntries := uint16(0)
+
+ for pteIndex := uint16((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ var (
+ pteEntry = &pteEntries[pteIndex]
+ )
+ if !pteEntry.Valid() && !w.visitor.requiresAlloc() {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ // At this point, we are guaranteed that start%pteSize == 0.
+ w.visitor.visit(uintptr(start), pteEntry, pteSize-1)
+ if !pteEntry.Valid() {
+ if w.visitor.requiresAlloc() {
+ panic("PTE not set after iteration with requiresAlloc!")
+ }
+ clearPTEEntries++
+ }
+
+ // Note that the pte was changed.
+ start += pteSize
+ continue
+ }
+
+ // Check if we no longer need this page.
+ if clearPTEEntries == entriesPerPage {
+ pmdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pteEntries)
+ clearPMDEntries++
+ }
+ }
+
+ // Check if we no longer need this page.
+ if clearPMDEntries == entriesPerPage {
+ pudEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pmdEntries)
+ clearPUDEntries++
+ }
+ }
+
+ // Check if we no longer need this page.
+ if clearPUDEntries == entriesPerPage {
+ pgdEntry.Clear()
+ w.pageTables.Allocator.FreePTEs(pudEntries)
+ }
+ }
+}
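The Visitor interface above is the package's extension point: a new page-table operation is a small visitor plus a matching go_template_instance rule in the BUILD file. A hypothetical sketch, not part of this change (presentVisitor and the corresponding walker instantiation are invented names), written as if inside package pagetables:

// presentVisitor counts mapped entries in a range. It neither allocates nor
// splits, so gaps are skipped and existing super pages are visited as single
// entries.
type presentVisitor struct {
	pages int // Output.
}

//go:nosplit
func (v *presentVisitor) visit(start uintptr, pte *PTE, align uintptr) {
	if pte.Valid() {
		v.pages++
	}
}

//go:nosplit
func (*presentVisitor) requiresAlloc() bool { return false }

//go:nosplit
func (*presentVisitor) requiresSplit() bool { return false }

With a go_template_instance that sets prefix = "present" and binds Visitor to presentVisitor, the generated presentWalker would be driven exactly like the walkers in pagetables.go: construct it with the PageTables and a zero presentVisitor, call iterateRange, and read the count out of the visitor afterwards.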