diff options
Diffstat (limited to 'pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go')
-rw-r--r-- | pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go | 397 |
1 files changed, 397 insertions, 0 deletions
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go new file mode 100644 index 000000000..b89665c96 --- /dev/null +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go @@ -0,0 +1,397 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 + +package pagetables + +import ( + "fmt" + "sync/atomic" +) + +// Address constraints. +// +// The lowerTop and upperBottom currently apply to four-level pagetables; +// additional refactoring would be necessary to support five-level pagetables. +const ( + lowerTop = 0x00007fffffffffff + upperBottom = 0xffff800000000000 + + pteShift = 12 + pmdShift = 21 + pudShift = 30 + pgdShift = 39 + + pteMask = 0x1ff << pteShift + pmdMask = 0x1ff << pmdShift + pudMask = 0x1ff << pudShift + pgdMask = 0x1ff << pgdShift + + pteSize = 1 << pteShift + pmdSize = 1 << pmdShift + pudSize = 1 << pudShift + pgdSize = 1 << pgdShift +) + +// Bits in page table entries. +const ( + present = 0x001 + writable = 0x002 + user = 0x004 + writeThrough = 0x008 + cacheDisable = 0x010 + accessed = 0x020 + dirty = 0x040 + super = 0x080 + executeDisable = 1 << 63 +) + +// PTE is a page table entry. +type PTE uint64 + +// Clear clears this PTE, including super page information. +func (p *PTE) Clear() { + atomic.StoreUint64((*uint64)(p), 0) +} + +// Valid returns true iff this entry is valid. +func (p *PTE) Valid() bool { + return atomic.LoadUint64((*uint64)(p))&present != 0 +} + +// Writeable returns true iff the page is writable. +func (p *PTE) Writeable() bool { + return atomic.LoadUint64((*uint64)(p))&writable != 0 +} + +// User returns true iff the page is user-accessible. +func (p *PTE) User() bool { + return atomic.LoadUint64((*uint64)(p))&user != 0 +} + +// Executable returns true iff the page is executable. +func (p *PTE) Executable() bool { + return atomic.LoadUint64((*uint64)(p))&executeDisable == 0 +} + +// SetSuper sets this page as a super page. +// +// The page must not be valid or a panic will result. +func (p *PTE) SetSuper() { + if p.Valid() { + // This is not allowed. + panic("SetSuper called on valid page!") + } + atomic.StoreUint64((*uint64)(p), super) +} + +// IsSuper returns true iff this page is a super page. +func (p *PTE) IsSuper() bool { + return atomic.LoadUint64((*uint64)(p))&super != 0 +} + +// Set sets this PTE value. +func (p *PTE) Set(addr uintptr, write, execute bool, userAccessible bool) { + v := uint64(addr)&^uint64(0xfff) | present | accessed + if userAccessible { + v |= user + } + if !execute { + v |= executeDisable + } + if write { + v |= writable | dirty + } + if p.IsSuper() { + v |= super + } + atomic.StoreUint64((*uint64)(p), v) +} + +// setPageTable sets this PTE value and forces the write bit and super bit to +// be cleared. This is used explicitly for breaking super pages. +func (p *PTE) setPageTable(addr uintptr) { + v := uint64(addr)&^uint64(0xfff) | present | user | writable | accessed | dirty + atomic.StoreUint64((*uint64)(p), v) +} + +// Address extracts the address. This should only be used if Valid returns true. +func (p *PTE) Address() uintptr { + return uintptr(atomic.LoadUint64((*uint64)(p)) & ^uint64(executeDisable|0xfff)) +} + +// entriesPerPage is the number of PTEs per page. +const entriesPerPage = 512 + +// PTEs is a collection of entries. +type PTEs [entriesPerPage]PTE + +// next returns the next address quantized by the given size. +func next(start uint64, size uint64) uint64 { + start &= ^(size - 1) + start += size + return start +} + +// iterateRange iterates over all appropriate levels of page tables for the given range. +// +// If alloc is set, then Set _must_ be called on all given PTEs. The exception +// is super pages. If a valid super page cannot be installed, then the walk +// will continue to individual entries. +// +// This algorithm will attempt to maximize the use of super pages whenever +// possible. Whether a super page is provided will be clear through the range +// provided in the callback. +// +// Note that if alloc set, then no gaps will be present. However, if alloc is +// not set, then the iteration will likely be full of gaps. +// +// Note that this function should generally be avoided in favor of Map, Unmap, +// etc. when not necessary. +// +// Precondition: startAddr and endAddr must be page-aligned. +// +// Precondition: startStart must be less than endAddr. +// +// Precondition: If alloc is set, then startAddr and endAddr should not span +// non-canonical ranges. If they do, a panic will result. +func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn func(s, e uintptr, pte *PTE, align uintptr)) { + start := uint64(startAddr) + end := uint64(endAddr) + if start%pteSize != 0 { + panic(fmt.Sprintf("unaligned start: %v", start)) + } + if start > end { + panic(fmt.Sprintf("start > end (%v > %v))", start, end)) + } + + // Deal with cases where we traverse the "gap". + // + // These are all explicitly disallowed if alloc is set, and we must + // traverse an entry for each address explicitly. + switch { + case start < lowerTop && end > lowerTop && end < upperBottom: + if alloc { + panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) + } + p.iterateRange(startAddr, lowerTop, false, fn) + return + case start < lowerTop && end > lowerTop: + if alloc { + panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) + } + p.iterateRange(startAddr, lowerTop, false, fn) + p.iterateRange(upperBottom, endAddr, false, fn) + return + case start > lowerTop && end < upperBottom: + if alloc { + panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) + } + return + case start > lowerTop && start < upperBottom && end > upperBottom: + if alloc { + panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end)) + } + p.iterateRange(upperBottom, endAddr, false, fn) + return + } + + for pgdIndex := int((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ { + pgdEntry := &p.root.PTEs()[pgdIndex] + if !pgdEntry.Valid() { + if !alloc { + // Skip over this entry. + start = next(start, pgdSize) + continue + } + + // Allocate a new pgd. + p.setPageTable(p.root, pgdIndex, p.allocNode()) + } + + // Map the next level. + pudNode := p.getPageTable(p.root, pgdIndex) + clearPUDEntries := 0 + + for pudIndex := int((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ { + pudEntry := &(pudNode.PTEs()[pudIndex]) + if !pudEntry.Valid() { + if !alloc { + // Skip over this entry. + clearPUDEntries++ + start = next(start, pudSize) + continue + } + + // This level has 1-GB super pages. Is this + // entire region contained in a single PUD + // entry? If so, we can skip allocating a new + // page for the pmd. + if start&(pudSize-1) == 0 && end-start >= pudSize { + pudEntry.SetSuper() + fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1) + if pudEntry.Valid() { + start = next(start, pudSize) + continue + } + } + + // Allocate a new pud. + p.setPageTable(pudNode, pudIndex, p.allocNode()) + + } else if pudEntry.IsSuper() { + // Does this page need to be split? + if start&(pudSize-1) != 0 || end < next(start, pudSize) { + currentAddr := uint64(pudEntry.Address()) + writeable := pudEntry.Writeable() + executable := pudEntry.Executable() + user := pudEntry.User() + + // Install the relevant entries. + pmdNode := p.allocNode() + pmdEntries := pmdNode.PTEs() + for index := 0; index < entriesPerPage; index++ { + pmdEntry := &pmdEntries[index] + pmdEntry.SetSuper() + pmdEntry.Set(uintptr(currentAddr), writeable, executable, user) + currentAddr += pmdSize + } + + // Reset to point to the new page. + p.setPageTable(pudNode, pudIndex, pmdNode) + } else { + // A super page to be checked directly. + fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1) + + // Might have been cleared. + if !pudEntry.Valid() { + clearPUDEntries++ + } + + // Note that the super page was changed. + start = next(start, pudSize) + continue + } + } + + // Map the next level, since this is valid. + pmdNode := p.getPageTable(pudNode, pudIndex) + clearPMDEntries := 0 + + for pmdIndex := int((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ { + pmdEntry := &pmdNode.PTEs()[pmdIndex] + if !pmdEntry.Valid() { + if !alloc { + // Skip over this entry. + clearPMDEntries++ + start = next(start, pmdSize) + continue + } + + // This level has 2-MB huge pages. If this + // region is contined in a single PMD entry? + // As above, we can skip allocating a new page. + if start&(pmdSize-1) == 0 && end-start >= pmdSize { + pmdEntry.SetSuper() + fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1) + if pmdEntry.Valid() { + start = next(start, pmdSize) + continue + } + } + + // Allocate a new pmd. + p.setPageTable(pmdNode, pmdIndex, p.allocNode()) + + } else if pmdEntry.IsSuper() { + // Does this page need to be split? + if start&(pmdSize-1) != 0 || end < next(start, pmdSize) { + currentAddr := uint64(pmdEntry.Address()) + writeable := pmdEntry.Writeable() + executable := pmdEntry.Executable() + user := pmdEntry.User() + + // Install the relevant entries. + pteNode := p.allocNode() + pteEntries := pteNode.PTEs() + for index := 0; index < entriesPerPage; index++ { + pteEntry := &pteEntries[index] + pteEntry.Set(uintptr(currentAddr), writeable, executable, user) + currentAddr += pteSize + } + + // Reset to point to the new page. + p.setPageTable(pmdNode, pmdIndex, pteNode) + } else { + // A huge page to be checked directly. + fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1) + + // Might have been cleared. + if !pmdEntry.Valid() { + clearPMDEntries++ + } + + // Note that the huge page was changed. + start = next(start, pmdSize) + continue + } + } + + // Map the next level, since this is valid. + pteNode := p.getPageTable(pmdNode, pmdIndex) + clearPTEEntries := 0 + + for pteIndex := int((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ { + pteEntry := &pteNode.PTEs()[pteIndex] + if !pteEntry.Valid() && !alloc { + clearPTEEntries++ + start += pteSize + continue + } + + // At this point, we are guaranteed that start%pteSize == 0. + fn(uintptr(start), uintptr(start+pteSize), pteEntry, pteSize-1) + if !pteEntry.Valid() { + if alloc { + panic("PTE not set after iteration with alloc=true!") + } + clearPTEEntries++ + } + + // Note that the pte was changed. + start += pteSize + continue + } + + // Check if we no longer need this page. + if clearPTEEntries == entriesPerPage { + p.clearPageTable(pmdNode, pmdIndex) + clearPMDEntries++ + } + } + + // Check if we no longer need this page. + if clearPMDEntries == entriesPerPage { + p.clearPageTable(pudNode, pudIndex) + clearPUDEntries++ + } + } + + // Check if we no longer need this page. + if clearPUDEntries == entriesPerPage { + p.clearPageTable(p.root, pgdIndex) + } + } +} |