Diffstat (limited to 'pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go')
-rw-r--r--  pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go | 397
1 file changed, 397 insertions(+), 0 deletions(-)
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
new file mode 100644
index 000000000..b89665c96
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
@@ -0,0 +1,397 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package pagetables
+
+import (
+ "fmt"
+ "sync/atomic"
+)
+
+// Address constraints.
+//
+// The lowerTop and upperBottom currently apply to four-level pagetables;
+// additional refactoring would be necessary to support five-level pagetables.
+const (
+ lowerTop = 0x00007fffffffffff
+ upperBottom = 0xffff800000000000
+
+ pteShift = 12
+ pmdShift = 21
+ pudShift = 30
+ pgdShift = 39
+
+ pteMask = 0x1ff << pteShift
+ pmdMask = 0x1ff << pmdShift
+ pudMask = 0x1ff << pudShift
+ pgdMask = 0x1ff << pgdShift
+
+ pteSize = 1 << pteShift
+ pmdSize = 1 << pmdShift
+ pudSize = 1 << pudShift
+ pgdSize = 1 << pgdShift
+)
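
A minimal standalone sketch of how the shifts above decompose a virtual
address into four 9-bit table indices (the hex address is an arbitrary
canonical lower-half example, not taken from the commit):

    package main

    import "fmt"

    func main() {
    	// An arbitrary canonical lower-half address (below lowerTop).
    	addr := uint64(0x00007f0040201000)
    	// Each paging level indexes 512 entries using 9 bits of the address.
    	fmt.Printf("pgd index: %d\n", (addr>>39)&0x1ff) // bits 47..39
    	fmt.Printf("pud index: %d\n", (addr>>30)&0x1ff) // bits 38..30
    	fmt.Printf("pmd index: %d\n", (addr>>21)&0x1ff) // bits 29..21
    	fmt.Printf("pte index: %d\n", (addr>>12)&0x1ff) // bits 20..12
    }
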
+
+// Bits in page table entries.
+const (
+ present = 0x001
+ writable = 0x002
+ user = 0x004
+ writeThrough = 0x008
+ cacheDisable = 0x010
+ accessed = 0x020
+ dirty = 0x040
+ super = 0x080
+ executeDisable = 1 << 63
+)
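
For a concrete picture of the encoding, the value below is what Set (defined
later in this file) produces for a writable, non-executable, user-accessible
4-KB page at physical address 0x1000 (a sketch using the constants above):

    // 0x1000 is the physical frame; the low bits are the flag constants.
    const examplePTE uint64 = executeDisable | 0x1000 | dirty | accessed |
    	user | writable | present // == 0x8000000000001067
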
+
+// PTE is a page table entry.
+type PTE uint64
+
+// Clear clears this PTE, including super page information.
+func (p *PTE) Clear() {
+ atomic.StoreUint64((*uint64)(p), 0)
+}
+
+// Valid returns true iff this entry is valid.
+func (p *PTE) Valid() bool {
+ return atomic.LoadUint64((*uint64)(p))&present != 0
+}
+
+// Writeable returns true iff the page is writable.
+func (p *PTE) Writeable() bool {
+ return atomic.LoadUint64((*uint64)(p))&writable != 0
+}
+
+// User returns true iff the page is user-accessible.
+func (p *PTE) User() bool {
+ return atomic.LoadUint64((*uint64)(p))&user != 0
+}
+
+// Executable returns true iff the page is executable.
+func (p *PTE) Executable() bool {
+ return atomic.LoadUint64((*uint64)(p))&executeDisable == 0
+}
+
+// SetSuper sets this page as a super page.
+//
+// The page must not be valid or a panic will result.
+func (p *PTE) SetSuper() {
+ if p.Valid() {
+ // This is not allowed.
+ panic("SetSuper called on valid page!")
+ }
+ atomic.StoreUint64((*uint64)(p), super)
+}
+
+// IsSuper returns true iff this page is a super page.
+func (p *PTE) IsSuper() bool {
+ return atomic.LoadUint64((*uint64)(p))&super != 0
+}
+
+// Set sets this PTE value.
+func (p *PTE) Set(addr uintptr, write, execute bool, userAccessible bool) {
+ v := uint64(addr)&^uint64(0xfff) | present | accessed
+ if userAccessible {
+ v |= user
+ }
+ if !execute {
+ v |= executeDisable
+ }
+ if write {
+ v |= writable | dirty
+ }
+ if p.IsSuper() {
+ v |= super
+ }
+ atomic.StoreUint64((*uint64)(p), v)
+}
+
+// setPageTable sets this PTE value to point to the page table at addr. The
+// super bit is forced clear, and the entry is marked present, user, writable,
+// accessed, and dirty. This is used explicitly for breaking super pages.
+func (p *PTE) setPageTable(addr uintptr) {
+ v := uint64(addr)&^uint64(0xfff) | present | user | writable | accessed | dirty
+ atomic.StoreUint64((*uint64)(p), v)
+}
+
+// Address extracts the address. This should only be used if Valid returns true.
+func (p *PTE) Address() uintptr {
+ return uintptr(atomic.LoadUint64((*uint64)(p)) & ^uint64(executeDisable|0xfff))
+}
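
A minimal sketch of how these accessors compose, written as a hypothetical
in-package test (TestPTEExample is not part of this change; it assumes the
standard "testing" import):

    func TestPTEExample(t *testing.T) {
    	var pte PTE
    	pte.Set(0x400000, true /* write */, false /* execute */, true /* userAccessible */)
    	if !pte.Valid() || !pte.Writeable() || !pte.User() {
    		t.Fatalf("expected a valid, writable, user-accessible entry")
    	}
    	if pte.Executable() {
    		t.Fatalf("executeDisable should be set when execute is false")
    	}
    	if got := pte.Address(); got != 0x400000 {
    		t.Fatalf("got address %#x, want 0x400000", got)
    	}
    	pte.Clear()
    	if pte.Valid() {
    		t.Fatalf("cleared entry should not be valid")
    	}
    }
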
+
+// entriesPerPage is the number of PTEs per page.
+const entriesPerPage = 512
+
+// PTEs is a collection of entries.
+type PTEs [entriesPerPage]PTE
+
+// next returns the next address quantized by the given size.
+func next(start uint64, size uint64) uint64 {
+ start &= ^(size - 1)
+ start += size
+ return start
+}
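
For example (a hypothetical in-package snippet; the addresses are arbitrary):

    func exampleNext() {
    	start := uint64(0x201000)
    	start = next(start, pmdSize) // 0x400000: rounds down to 0x200000, then steps one pmdSize
    	start = next(start, pmdSize) // 0x600000: aligned inputs also advance one full region
    	_ = start
    }
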
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If alloc is set, then Set _must_ be called on all given PTEs. The exception
+// is super pages. If a valid super page cannot be installed, then the walk
+// will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super pages whenever
+// possible. Whether a super page was installed will be clear from the range
+// passed to the callback.
+//
+// Note that if alloc is set, then no gaps will be present. However, if alloc
+// is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. unless direct control over the walk is required.
+//
+// Precondition: startAddr and endAddr must be page-aligned.
+//
+// Precondition: startAddr must be less than or equal to endAddr.
+//
+// Precondition: If alloc is set, then startAddr and endAddr should not span
+// non-canonical ranges. If they do, a panic will result.
+func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn func(s, e uintptr, pte *PTE, align uintptr)) {
+ start := uint64(startAddr)
+ end := uint64(endAddr)
+ if start%pteSize != 0 {
+ panic(fmt.Sprintf("unaligned start: %x", start))
+ }
+ if start > end {
+ panic(fmt.Sprintf("start > end (%x > %x)", start, end))
+ }
+
+ // Deal with cases where we traverse the "gap".
+ //
+ // These are all explicitly disallowed if alloc is set, and we must
+ // traverse an entry for each address explicitly.
+ switch {
+ case start < lowerTop && end > lowerTop && end < upperBottom:
+ if alloc {
+ panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+ }
+ p.iterateRange(startAddr, lowerTop, false, fn)
+ return
+ case start < lowerTop && end > lowerTop:
+ if alloc {
+ panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+ }
+ p.iterateRange(startAddr, lowerTop, false, fn)
+ p.iterateRange(upperBottom, endAddr, false, fn)
+ return
+ case start > lowerTop && end < upperBottom:
+ if alloc {
+ panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+ }
+ return
+ case start > lowerTop && start < upperBottom && end > upperBottom:
+ if alloc {
+ panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+ }
+ p.iterateRange(upperBottom, endAddr, false, fn)
+ return
+ }
+
+ for pgdIndex := int((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ pgdEntry := &p.root.PTEs()[pgdIndex]
+ if !pgdEntry.Valid() {
+ if !alloc {
+ // Skip over this entry.
+ start = next(start, pgdSize)
+ continue
+ }
+
+ // Allocate a new pgd.
+ p.setPageTable(p.root, pgdIndex, p.allocNode())
+ }
+
+ // Map the next level.
+ pudNode := p.getPageTable(p.root, pgdIndex)
+ clearPUDEntries := 0
+
+ for pudIndex := int((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ pudEntry := &(pudNode.PTEs()[pudIndex])
+ if !pudEntry.Valid() {
+ if !alloc {
+ // Skip over this entry.
+ clearPUDEntries++
+ start = next(start, pudSize)
+ continue
+ }
+
+ // This level has 1-GB super pages. Is this
+ // entire region contained in a single PUD
+ // entry? If so, we can skip allocating a new
+ // page for the pmd.
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSuper()
+ fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1)
+ if pudEntry.Valid() {
+ start = next(start, pudSize)
+ continue
+ }
+ }
+
+ // Allocate a new pud.
+ p.setPageTable(pudNode, pudIndex, p.allocNode())
+
+ } else if pudEntry.IsSuper() {
+ // Does this page need to be split?
+ if start&(pudSize-1) != 0 || end < next(start, pudSize) {
+ currentAddr := uint64(pudEntry.Address())
+ writeable := pudEntry.Writeable()
+ executable := pudEntry.Executable()
+ user := pudEntry.User()
+
+ // Install the relevant entries.
+ pmdNode := p.allocNode()
+ pmdEntries := pmdNode.PTEs()
+ for index := 0; index < entriesPerPage; index++ {
+ pmdEntry := &pmdEntries[index]
+ pmdEntry.SetSuper()
+ pmdEntry.Set(uintptr(currentAddr), writeable, executable, user)
+ currentAddr += pmdSize
+ }
+
+ // Reset to point to the new page.
+ p.setPageTable(pudNode, pudIndex, pmdNode)
+ } else {
+ // A super page to be checked directly.
+ fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1)
+
+ // Might have been cleared.
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ // Note that the super page was changed.
+ start = next(start, pudSize)
+ continue
+ }
+ }
+
+ // Map the next level, since this is valid.
+ pmdNode := p.getPageTable(pudNode, pudIndex)
+ clearPMDEntries := 0
+
+ for pmdIndex := int((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ pmdEntry := &pmdNode.PTEs()[pmdIndex]
+ if !pmdEntry.Valid() {
+ if !alloc {
+ // Skip over this entry.
+ clearPMDEntries++
+ start = next(start, pmdSize)
+ continue
+ }
+
+ // This level has 2-MB huge pages. Is this
+ // entire region contained in a single PMD
+ // entry? If so, as above, we can skip
+ // allocating a new page.
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSuper()
+ fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1)
+ if pmdEntry.Valid() {
+ start = next(start, pmdSize)
+ continue
+ }
+ }
+
+ // Allocate a new pmd.
+ p.setPageTable(pmdNode, pmdIndex, p.allocNode())
+
+ } else if pmdEntry.IsSuper() {
+ // Does this page need to be split?
+ if start&(pmdSize-1) != 0 || end < next(start, pmdSize) {
+ currentAddr := uint64(pmdEntry.Address())
+ writeable := pmdEntry.Writeable()
+ executable := pmdEntry.Executable()
+ user := pmdEntry.User()
+
+ // Install the relevant entries.
+ pteNode := p.allocNode()
+ pteEntries := pteNode.PTEs()
+ for index := 0; index < entriesPerPage; index++ {
+ pteEntry := &pteEntries[index]
+ pteEntry.Set(uintptr(currentAddr), writeable, executable, user)
+ currentAddr += pteSize
+ }
+
+ // Reset to point to the new page.
+ p.setPageTable(pmdNode, pmdIndex, pteNode)
+ } else {
+ // A huge page to be checked directly.
+ fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1)
+
+ // Might have been cleared.
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ // Note that the huge page was changed.
+ start = next(start, pmdSize)
+ continue
+ }
+ }
+
+ // Map the next level, since this is valid.
+ pteNode := p.getPageTable(pmdNode, pmdIndex)
+ clearPTEEntries := 0
+
+ for pteIndex := int((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ pteEntry := &pteNode.PTEs()[pteIndex]
+ if !pteEntry.Valid() && !alloc {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ // At this point, we are guaranteed that start%pteSize == 0.
+ fn(uintptr(start), uintptr(start+pteSize), pteEntry, pteSize-1)
+ if !pteEntry.Valid() {
+ if alloc {
+ panic("PTE not set after iteration with alloc=true!")
+ }
+ clearPTEEntries++
+ }
+
+ // Note that the pte was changed.
+ start += pteSize
+ continue
+ }
+
+ // Check if we no longer need this page.
+ if clearPTEEntries == entriesPerPage {
+ p.clearPageTable(pmdNode, pmdIndex)
+ clearPMDEntries++
+ }
+ }
+
+ // Check if we no longer need this page.
+ if clearPMDEntries == entriesPerPage {
+ p.clearPageTable(pudNode, pudIndex)
+ clearPUDEntries++
+ }
+ }
+
+ // Check if we no longer need this page.
+ if clearPUDEntries == entriesPerPage {
+ p.clearPageTable(p.root, pgdIndex)
+ }
+ }
+}
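
To show how a caller inside this package might drive the walk, here is a
sketch of a hypothetical helper (identityMap is illustrative only; as noted
above, the exported Map, Unmap, etc. should normally be preferred):

    // identityMap identity-maps [start, end) as kernel-only, writable,
    // non-executable memory. With alloc=true, Set must be called on every
    // PTE handed to the callback; the walk picks 4-KB, 2-MB, or 1-GB
    // entries on its own, reflected in the align argument.
    func (p *PageTables) identityMap(start, end uintptr) {
    	p.iterateRange(start, end, true, func(s, e uintptr, pte *PTE, align uintptr) {
    		pte.Set(s, true /* write */, false /* execute */, false /* userAccessible */)
    	})
    }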