summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/platform/ring0/pagetables
diff options
context:
space:
mode:
authorGoogler <noreply@google.com>2018-04-27 10:37:02 -0700
committerAdin Scannell <ascannell@google.com>2018-04-28 01:44:26 -0400
commitd02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/platform/ring0/pagetables
parentf70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
Check in gVisor.
PiperOrigin-RevId: 194583126 Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
Diffstat (limited to 'pkg/sentry/platform/ring0/pagetables')
-rw-r--r--pkg/sentry/platform/ring0/pagetables/BUILD32
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables.go193
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go397
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_test.go161
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go31
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_x86.go79
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go79
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pcids_x86.go74
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go65
9 files changed, 1111 insertions, 0 deletions
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
new file mode 100644
index 000000000..c0c481ab3
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -0,0 +1,32 @@
package(licenses = ["notice"])  # Apache 2.0

load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")

# Architecture-independent pagetable management, plus the x86/amd64-specific
# entry encodings and the PCID database.
go_library(
    name = "pagetables",
    srcs = [
        "pagetables.go",
        "pagetables_amd64.go",
        "pagetables_unsafe.go",
        "pagetables_x86.go",
        "pcids_x86.go",
    ],
    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables",
    # Restricted to the platform implementations that install page tables.
    visibility = [
        "//pkg/sentry/platform/kvm:__subpackages__",
        "//pkg/sentry/platform/ring0:__subpackages__",
    ],
    deps = ["//pkg/sentry/usermem"],
)

# Tests embed the library so they can exercise unexported internals
# (iterateRange, allocate, etc.).
go_test(
    name = "pagetables_test",
    size = "small",
    srcs = [
        "pagetables_test.go",
        "pagetables_x86_test.go",
        "pcids_x86_test.go",
    ],
    embed = [":pagetables"],
    deps = ["//pkg/sentry/usermem"],
)
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
new file mode 100644
index 000000000..3cbf0bfa5
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -0,0 +1,193 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pagetables provides a generic implementation of pagetables.
+package pagetables
+
+import (
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
// Node is a single node within a set of page tables.
type Node struct {
	// unalignedData has unaligned data. Unfortunately, we can't really
	// rely on the allocator to give us what we want here. So we just throw
	// it at the wall and use the portion that matches. Gross. This may be
	// changed in the future to use a different allocation mechanism.
	//
	// The buffer is one byte short of two pages: wherever it starts, it
	// is guaranteed to contain at least one fully page-aligned page.
	//
	// Access must happen via functions found in pagetables_unsafe.go.
	unalignedData [(2 * usermem.PageSize) - 1]byte

	// physical is the translated address of these entries.
	//
	// This is filled in at creation time (see allocNode) and never
	// changes afterwards.
	physical uintptr
}
+
// PageTables is a set of page tables.
type PageTables struct {
	// mu serializes the mutating walks (Map and Unmap lock it).
	// NOTE(review): Lookup walks without taking mu — confirm that callers
	// serialize Lookup against concurrent mutation externally.
	mu sync.Mutex

	// root is the pagetable root.
	root *Node

	// translater is the translater passed at creation.
	translater Translater

	// archPageTables includes architecture-specific features.
	archPageTables

	// allNodes is a set of nodes indexed by translater address. It is the
	// reverse mapping for the addresses stored in intermediate entries.
	allNodes map[uintptr]*Node
}
+
// Translater translates to guest physical addresses.
//
// NOTE(review): the name is a misspelling of "Translator"; renaming would
// break the exported API, so it is kept and documented here instead.
type Translater interface {
	// TranslateToPhysical translates the given pointer object into a
	// "physical" address. We do not require that it translates back, the
	// reverse mapping is maintained internally (see PageTables.allNodes).
	TranslateToPhysical(*PTEs) uintptr
}
+
+// New returns new PageTables.
+func New(t Translater, opts Opts) *PageTables {
+ p := &PageTables{
+ translater: t,
+ allNodes: make(map[uintptr]*Node),
+ }
+ p.root = p.allocNode()
+ p.init(opts)
+ return p
+}
+
+// New returns a new set of PageTables derived from the given one.
+//
+// This function should always be preferred to New if there are existing
+// pagetables, as this function preserves architectural constraints relevant to
+// managing multiple sets of pagetables.
+func (p *PageTables) New() *PageTables {
+ np := &PageTables{
+ translater: p.translater,
+ allNodes: make(map[uintptr]*Node),
+ }
+ np.root = np.allocNode()
+ np.initFrom(&p.archPageTables)
+ return np
+}
+
+// setPageTable sets the given index as a page table.
+func (p *PageTables) setPageTable(n *Node, index int, child *Node) {
+ phys := p.translater.TranslateToPhysical(child.PTEs())
+ p.allNodes[phys] = child
+ pte := &n.PTEs()[index]
+ pte.setPageTable(phys)
+}
+
+// clearPageTable clears the given entry.
+func (p *PageTables) clearPageTable(n *Node, index int) {
+ pte := &n.PTEs()[index]
+ physical := pte.Address()
+ pte.Clear()
+ delete(p.allNodes, physical)
+}
+
+// getPageTable returns the page table entry.
+func (p *PageTables) getPageTable(n *Node, index int) *Node {
+ pte := &n.PTEs()[index]
+ physical := pte.Address()
+ child := p.allNodes[physical]
+ return child
+}
+
+// Map installs a mapping with the given physical address.
+//
+// True is returned iff there was a previous mapping in the range.
+//
+// Precondition: addr & length must be aligned, their sum must not overflow.
+func (p *PageTables) Map(addr usermem.Addr, length uintptr, user bool, at usermem.AccessType, physical uintptr) bool {
+ if at == usermem.NoAccess {
+ return p.Unmap(addr, length)
+ }
+ prev := false
+ p.mu.Lock()
+ end, ok := addr.AddLength(uint64(length))
+ if !ok {
+ panic("pagetables.Map: overflow")
+ }
+ p.iterateRange(uintptr(addr), uintptr(end), true, func(s, e uintptr, pte *PTE, align uintptr) {
+ p := physical + (s - uintptr(addr))
+ prev = prev || (pte.Valid() && (p != pte.Address() || at.Write != pte.Writeable() || at.Execute != pte.Executable()))
+ if p&align != 0 {
+ // We will install entries at a smaller granulaity if
+ // we don't install a valid entry here, however we must
+ // zap any existing entry to ensure this happens.
+ pte.Clear()
+ return
+ }
+ pte.Set(p, at.Write, at.Execute, user)
+ })
+ p.mu.Unlock()
+ return prev
+}
+
+// Unmap unmaps the given range.
+//
+// True is returned iff there was a previous mapping in the range.
+func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
+ p.mu.Lock()
+ count := 0
+ p.iterateRange(uintptr(addr), uintptr(addr)+length, false, func(s, e uintptr, pte *PTE, align uintptr) {
+ pte.Clear()
+ count++
+ })
+ p.mu.Unlock()
+ return count > 0
+}
+
// Release releases this address space.
//
// This must be called to release the PCID.
func (p *PageTables) Release() {
	// Clear all pages. The full address range is passed; iterateRange
	// handles the non-canonical hole in the middle.
	p.Unmap(0, ^uintptr(0))
	// Release architecture-specific state (returns the PCID to its pool).
	p.release()
}
+
+// Lookup returns the physical address for the given virtual address.
+func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, accessType usermem.AccessType) {
+ mask := uintptr(usermem.PageSize - 1)
+ off := uintptr(addr) & mask
+ addr = addr &^ usermem.Addr(mask)
+ p.iterateRange(uintptr(addr), uintptr(addr+usermem.PageSize), false, func(s, e uintptr, pte *PTE, align uintptr) {
+ if !pte.Valid() {
+ return
+ }
+ physical = pte.Address() + (s - uintptr(addr)) + off
+ accessType = usermem.AccessType{
+ Read: true,
+ Write: pte.Writeable(),
+ Execute: pte.Executable(),
+ }
+ })
+ return physical, accessType
+}
+
// allocNode allocates a new page.
func (p *PageTables) allocNode() *Node {
	n := new(Node)
	// Translate once at creation so Node.physical is always valid (it is
	// read directly by CR3, for example).
	n.physical = p.translater.TranslateToPhysical(n.PTEs())
	return n
}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
new file mode 100644
index 000000000..b89665c96
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
@@ -0,0 +1,397 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package pagetables
+
+import (
+ "fmt"
+ "sync/atomic"
+)
+
// Address constraints.
//
// The lowerTop and upperBottom currently apply to four-level pagetables;
// additional refactoring would be necessary to support five-level pagetables.
const (
	// lowerTop is the top of the lower canonical half; upperBottom is the
	// bottom of the upper half. Addresses in between are non-canonical.
	lowerTop    = 0x00007fffffffffff
	upperBottom = 0xffff800000000000

	// Shift of the virtual-address bits indexed at each paging level
	// (leaf to root).
	pteShift = 12
	pmdShift = 21
	pudShift = 30
	pgdShift = 39

	// Masks selecting the 9-bit index for each level.
	pteMask = 0x1ff << pteShift
	pmdMask = 0x1ff << pmdShift
	pudMask = 0x1ff << pudShift
	pgdMask = 0x1ff << pgdShift

	// Region size covered by one entry at each level
	// (4 KiB, 2 MiB, 1 GiB, 512 GiB respectively).
	pteSize = 1 << pteShift
	pmdSize = 1 << pmdShift
	pudSize = 1 << pudShift
	pgdSize = 1 << pgdShift
)
+
// Bits in page table entries.
const (
	present        = 1 << 0
	writable       = 1 << 1
	user           = 1 << 2
	writeThrough   = 1 << 3
	cacheDisable   = 1 << 4
	accessed       = 1 << 5
	dirty          = 1 << 6
	super          = 1 << 7
	executeDisable = 1 << 63
)

// PTE is a page table entry.
type PTE uint64

// load atomically reads the raw entry value.
func (p *PTE) load() uint64 {
	return atomic.LoadUint64((*uint64)(p))
}

// store atomically writes the raw entry value.
func (p *PTE) store(v uint64) {
	atomic.StoreUint64((*uint64)(p), v)
}

// Clear clears this PTE, including super page information.
func (p *PTE) Clear() {
	p.store(0)
}

// Valid returns true iff this entry is valid.
func (p *PTE) Valid() bool {
	return p.load()&present != 0
}

// Writeable returns true iff the page is writable.
func (p *PTE) Writeable() bool {
	return p.load()&writable != 0
}

// User returns true iff the page is user-accessible.
func (p *PTE) User() bool {
	return p.load()&user != 0
}

// Executable returns true iff the page is executable.
func (p *PTE) Executable() bool {
	return p.load()&executeDisable == 0
}

// SetSuper sets this page as a super page.
//
// The page must not be valid or a panic will result.
func (p *PTE) SetSuper() {
	if p.Valid() {
		// This is not allowed.
		panic("SetSuper called on valid page!")
	}
	p.store(super)
}

// IsSuper returns true iff this page is a super page.
func (p *PTE) IsSuper() bool {
	return p.load()&super != 0
}

// Set sets this PTE value.
func (p *PTE) Set(addr uintptr, write, execute bool, userAccessible bool) {
	// Always mark installed entries present and accessed; the super bit,
	// if previously set via SetSuper, is preserved.
	v := uint64(addr)&^uint64(0xfff) | present | accessed
	if userAccessible {
		v |= user
	}
	if !execute {
		v |= executeDisable
	}
	if write {
		// Mark dirty eagerly alongside writable.
		v |= writable | dirty
	}
	if p.IsSuper() {
		v |= super
	}
	p.store(v)
}

// setPageTable sets this PTE value and forces the write bit and super bit to
// be cleared. This is used explicitly for breaking super pages.
func (p *PTE) setPageTable(addr uintptr) {
	p.store(uint64(addr)&^uint64(0xfff) | present | user | writable | accessed | dirty)
}

// Address extracts the address. This should only be used if Valid returns true.
func (p *PTE) Address() uintptr {
	// Mask off the flag bits and the execute-disable bit.
	return uintptr(p.load() &^ uint64(executeDisable|0xfff))
}

// entriesPerPage is the number of PTEs per page.
const entriesPerPage = 512

// PTEs is a collection of entries.
type PTEs [entriesPerPage]PTE
+
// next returns the next address quantized by the given size.
//
// size must be a power of two.
func next(start uint64, size uint64) uint64 {
	// Round down to a size boundary, then advance one full unit.
	return (start &^ (size - 1)) + size
}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If alloc is set, then Set _must_ be called on all given PTEs. The exception
+// is super pages. If a valid super page cannot be installed, then the walk
+// will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if alloc set, then no gaps will be present. However, if alloc is
+// not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: startAddr and endAddr must be page-aligned.
+//
+// Precondition: startStart must be less than endAddr.
+//
+// Precondition: If alloc is set, then startAddr and endAddr should not span
+// non-canonical ranges. If they do, a panic will result.
+func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn func(s, e uintptr, pte *PTE, align uintptr)) {
+ start := uint64(startAddr)
+ end := uint64(endAddr)
+ if start%pteSize != 0 {
+ panic(fmt.Sprintf("unaligned start: %v", start))
+ }
+ if start > end {
+ panic(fmt.Sprintf("start > end (%v > %v))", start, end))
+ }
+
+ // Deal with cases where we traverse the "gap".
+ //
+ // These are all explicitly disallowed if alloc is set, and we must
+ // traverse an entry for each address explicitly.
+ switch {
+ case start < lowerTop && end > lowerTop && end < upperBottom:
+ if alloc {
+ panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+ }
+ p.iterateRange(startAddr, lowerTop, false, fn)
+ return
+ case start < lowerTop && end > lowerTop:
+ if alloc {
+ panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+ }
+ p.iterateRange(startAddr, lowerTop, false, fn)
+ p.iterateRange(upperBottom, endAddr, false, fn)
+ return
+ case start > lowerTop && end < upperBottom:
+ if alloc {
+ panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+ }
+ return
+ case start > lowerTop && start < upperBottom && end > upperBottom:
+ if alloc {
+ panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+ }
+ p.iterateRange(upperBottom, endAddr, false, fn)
+ return
+ }
+
+ for pgdIndex := int((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+ pgdEntry := &p.root.PTEs()[pgdIndex]
+ if !pgdEntry.Valid() {
+ if !alloc {
+ // Skip over this entry.
+ start = next(start, pgdSize)
+ continue
+ }
+
+ // Allocate a new pgd.
+ p.setPageTable(p.root, pgdIndex, p.allocNode())
+ }
+
+ // Map the next level.
+ pudNode := p.getPageTable(p.root, pgdIndex)
+ clearPUDEntries := 0
+
+ for pudIndex := int((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+ pudEntry := &(pudNode.PTEs()[pudIndex])
+ if !pudEntry.Valid() {
+ if !alloc {
+ // Skip over this entry.
+ clearPUDEntries++
+ start = next(start, pudSize)
+ continue
+ }
+
+ // This level has 1-GB super pages. Is this
+ // entire region contained in a single PUD
+ // entry? If so, we can skip allocating a new
+ // page for the pmd.
+ if start&(pudSize-1) == 0 && end-start >= pudSize {
+ pudEntry.SetSuper()
+ fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1)
+ if pudEntry.Valid() {
+ start = next(start, pudSize)
+ continue
+ }
+ }
+
+ // Allocate a new pud.
+ p.setPageTable(pudNode, pudIndex, p.allocNode())
+
+ } else if pudEntry.IsSuper() {
+ // Does this page need to be split?
+ if start&(pudSize-1) != 0 || end < next(start, pudSize) {
+ currentAddr := uint64(pudEntry.Address())
+ writeable := pudEntry.Writeable()
+ executable := pudEntry.Executable()
+ user := pudEntry.User()
+
+ // Install the relevant entries.
+ pmdNode := p.allocNode()
+ pmdEntries := pmdNode.PTEs()
+ for index := 0; index < entriesPerPage; index++ {
+ pmdEntry := &pmdEntries[index]
+ pmdEntry.SetSuper()
+ pmdEntry.Set(uintptr(currentAddr), writeable, executable, user)
+ currentAddr += pmdSize
+ }
+
+ // Reset to point to the new page.
+ p.setPageTable(pudNode, pudIndex, pmdNode)
+ } else {
+ // A super page to be checked directly.
+ fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1)
+
+ // Might have been cleared.
+ if !pudEntry.Valid() {
+ clearPUDEntries++
+ }
+
+ // Note that the super page was changed.
+ start = next(start, pudSize)
+ continue
+ }
+ }
+
+ // Map the next level, since this is valid.
+ pmdNode := p.getPageTable(pudNode, pudIndex)
+ clearPMDEntries := 0
+
+ for pmdIndex := int((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+ pmdEntry := &pmdNode.PTEs()[pmdIndex]
+ if !pmdEntry.Valid() {
+ if !alloc {
+ // Skip over this entry.
+ clearPMDEntries++
+ start = next(start, pmdSize)
+ continue
+ }
+
+ // This level has 2-MB huge pages. If this
+ // region is contined in a single PMD entry?
+ // As above, we can skip allocating a new page.
+ if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+ pmdEntry.SetSuper()
+ fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1)
+ if pmdEntry.Valid() {
+ start = next(start, pmdSize)
+ continue
+ }
+ }
+
+ // Allocate a new pmd.
+ p.setPageTable(pmdNode, pmdIndex, p.allocNode())
+
+ } else if pmdEntry.IsSuper() {
+ // Does this page need to be split?
+ if start&(pmdSize-1) != 0 || end < next(start, pmdSize) {
+ currentAddr := uint64(pmdEntry.Address())
+ writeable := pmdEntry.Writeable()
+ executable := pmdEntry.Executable()
+ user := pmdEntry.User()
+
+ // Install the relevant entries.
+ pteNode := p.allocNode()
+ pteEntries := pteNode.PTEs()
+ for index := 0; index < entriesPerPage; index++ {
+ pteEntry := &pteEntries[index]
+ pteEntry.Set(uintptr(currentAddr), writeable, executable, user)
+ currentAddr += pteSize
+ }
+
+ // Reset to point to the new page.
+ p.setPageTable(pmdNode, pmdIndex, pteNode)
+ } else {
+ // A huge page to be checked directly.
+ fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1)
+
+ // Might have been cleared.
+ if !pmdEntry.Valid() {
+ clearPMDEntries++
+ }
+
+ // Note that the huge page was changed.
+ start = next(start, pmdSize)
+ continue
+ }
+ }
+
+ // Map the next level, since this is valid.
+ pteNode := p.getPageTable(pmdNode, pmdIndex)
+ clearPTEEntries := 0
+
+ for pteIndex := int((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+ pteEntry := &pteNode.PTEs()[pteIndex]
+ if !pteEntry.Valid() && !alloc {
+ clearPTEEntries++
+ start += pteSize
+ continue
+ }
+
+ // At this point, we are guaranteed that start%pteSize == 0.
+ fn(uintptr(start), uintptr(start+pteSize), pteEntry, pteSize-1)
+ if !pteEntry.Valid() {
+ if alloc {
+ panic("PTE not set after iteration with alloc=true!")
+ }
+ clearPTEEntries++
+ }
+
+ // Note that the pte was changed.
+ start += pteSize
+ continue
+ }
+
+ // Check if we no longer need this page.
+ if clearPTEEntries == entriesPerPage {
+ p.clearPageTable(pmdNode, pmdIndex)
+ clearPMDEntries++
+ }
+ }
+
+ // Check if we no longer need this page.
+ if clearPMDEntries == entriesPerPage {
+ p.clearPageTable(pudNode, pudIndex)
+ clearPUDEntries++
+ }
+ }
+
+ // Check if we no longer need this page.
+ if clearPUDEntries == entriesPerPage {
+ p.clearPageTable(p.root, pgdIndex)
+ }
+ }
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
new file mode 100644
index 000000000..9cbc0e3b0
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
@@ -0,0 +1,161 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+ "reflect"
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
// reflectTranslater implements Translater for tests by treating the PTEs
// pointer value itself as the "physical" address. This is sufficient for
// in-process tests because the reverse mapping is keyed on the same value.
type reflectTranslater struct{}

// TranslateToPhysical returns the address of the PTEs array.
func (r reflectTranslater) TranslateToPhysical(ptes *PTEs) uintptr {
	return reflect.ValueOf(ptes).Pointer()
}
+
// mapping describes a single expected mapping: the virtual range
// [start, start+length), its physical address, and writability.
type mapping struct {
	start     uintptr // Virtual start address.
	length    uintptr // Length of the mapping in bytes.
	addr      uintptr // Expected physical address.
	writeable bool    // Expected writable bit.
}
+
// checkMappings walks all installed mappings in pt and verifies that they
// exactly match the expected list m, in order. Any mismatch is reported via
// t.Errorf together with the full set of mappings actually found.
func checkMappings(t *testing.T, pt *PageTables, m []mapping) {
	var (
		current int
		found   []mapping
		failed  string
	)

	// Iterate over all the mappings.
	pt.iterateRange(0, ^uintptr(0), false, func(s, e uintptr, pte *PTE, align uintptr) {
		// Record everything found, even after a failure, so that the
		// error message below can show the complete picture.
		found = append(found, mapping{
			start:     s,
			length:    e - s,
			addr:      pte.Address(),
			writeable: pte.Writeable(),
		})
		if failed != "" {
			// Don't keep looking for errors.
			return
		}

		if current >= len(m) {
			failed = "more mappings than expected"
		} else if m[current].start != s {
			failed = "start didn't match expected"
		} else if m[current].length != (e - s) {
			failed = "end didn't match expected"
		} else if m[current].addr != pte.Address() {
			failed = "address didn't match expected"
		} else if m[current].writeable != pte.Writeable() {
			failed = "writeable didn't match"
		}
		current++
	})

	// Were we expecting additional mappings?
	if failed == "" && current != len(m) {
		failed = "insufficient mappings found"
	}

	// Emit a meaningful error message on failure.
	if failed != "" {
		t.Errorf("%s; got %#v, wanted %#v", failed, found, m)
	}
}
+
// TestAllocFree checks that a fresh set of tables can be released with no
// mappings installed.
func TestAllocFree(t *testing.T) {
	pt := New(reflectTranslater{}, Opts{})
	pt.Release()
}

// TestUnmap checks that unmapping a single entry removes it entirely.
func TestUnmap(t *testing.T) {
	pt := New(reflectTranslater{}, Opts{})

	// Map and unmap one entry.
	pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
	pt.Unmap(0x400000, pteSize)

	// No mappings should remain.
	checkMappings(t, pt, nil)
	pt.Release()
}

// TestReadOnly checks that a read-only mapping is installed without the
// writable bit.
func TestReadOnly(t *testing.T) {
	pt := New(reflectTranslater{}, Opts{})

	// Map one entry.
	pt.Map(0x400000, pteSize, true, usermem.Read, pteSize*42)

	checkMappings(t, pt, []mapping{
		{0x400000, pteSize, pteSize * 42, false},
	})
	pt.Release()
}

// TestReadWrite checks that a read-write mapping carries the writable bit.
func TestReadWrite(t *testing.T) {
	pt := New(reflectTranslater{}, Opts{})

	// Map one entry.
	pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)

	checkMappings(t, pt, []mapping{
		{0x400000, pteSize, pteSize * 42, true},
	})
	pt.Release()
}
+
// TestSerialEntries checks two adjacent 4K mappings in the same table.
func TestSerialEntries(t *testing.T) {
	pt := New(reflectTranslater{}, Opts{})

	// Map two sequential entries.
	pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
	pt.Map(0x401000, pteSize, true, usermem.ReadWrite, pteSize*47)

	checkMappings(t, pt, []mapping{
		{0x400000, pteSize, pteSize * 42, true},
		{0x401000, pteSize, pteSize * 47, true},
	})
	pt.Release()
}

// TestSpanningEntries checks a single Map call crossing a pgd boundary.
func TestSpanningEntries(t *testing.T) {
	pt := New(reflectTranslater{}, Opts{})

	// Span a pgd with two pages. The physical range is contiguous, so
	// the second virtual page is expected at pteSize*43.
	pt.Map(0x00007efffffff000, 2*pteSize, true, usermem.Read, pteSize*42)

	checkMappings(t, pt, []mapping{
		{0x00007efffffff000, pteSize, pteSize * 42, false},
		{0x00007f0000000000, pteSize, pteSize * 43, false},
	})
	pt.Release()
}

// TestSparseEntries checks two mappings that live in different pgd entries.
func TestSparseEntries(t *testing.T) {
	pt := New(reflectTranslater{}, Opts{})

	// Map two entries in different pgds.
	pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
	pt.Map(0x00007f0000000000, pteSize, true, usermem.Read, pteSize*47)

	checkMappings(t, pt, []mapping{
		{0x400000, pteSize, pteSize * 42, true},
		{0x00007f0000000000, pteSize, pteSize * 47, false},
	})
	pt.Release()
}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go b/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go
new file mode 100644
index 000000000..a2b44fb79
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go
@@ -0,0 +1,31 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// PTEs returns aligned PTE entries.
+func (n *Node) PTEs() *PTEs {
+ addr := uintptr(unsafe.Pointer(&n.unalignedData[0]))
+ offset := addr & (usermem.PageSize - 1)
+ if offset != 0 {
+ offset = usermem.PageSize - offset
+ }
+ return (*PTEs)(unsafe.Pointer(addr + offset))
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
new file mode 100644
index 000000000..dac66373f
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
@@ -0,0 +1,79 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
// Opts are pagetable options.
type Opts struct {
	// EnablePCID indicates that a PCID (process-context identifier)
	// database should be created and an identifier allocated.
	EnablePCID bool
}
+
// archPageTables has x86-specific features.
type archPageTables struct {
	// pcids is the PCID database, shared between derived table sets
	// (see initFrom); nil if PCIDs are disabled.
	pcids *PCIDs

	// pcid is the globally unique identifier, or zero if none were
	// available or pcids is nil.
	pcid uint16
}
+
+// init initializes arch-specific features.
+func (a *archPageTables) init(opts Opts) {
+ if opts.EnablePCID {
+ a.pcids = NewPCIDs()
+ a.pcid = a.pcids.allocate()
+ }
+}
+
+// initFrom initializes arch-specific features from an existing entry.'
+func (a *archPageTables) initFrom(other *archPageTables) {
+ a.pcids = other.pcids // Refer to the same PCID database.
+ if a.pcids != nil {
+ a.pcid = a.pcids.allocate()
+ }
+}
+
+// release is called from Release.
+func (a *archPageTables) release() {
+ // Return the PCID.
+ if a.pcids != nil {
+ a.pcids.free(a.pcid)
+ }
+}
+
// CR3 returns the CR3 value for these tables.
//
// This may be called in interrupt contexts.
//
//go:nosplit
func (p *PageTables) CR3() uint64 {
	// Bit 63 is set to avoid flushing the PCID (per SDM 4.10.4.1).
	const noFlushBit uint64 = 0x8000000000000000
	if p.pcid != 0 {
		return noFlushBit | uint64(p.root.physical) | uint64(p.pcid)
	}
	// No PCID allocated: return the root physical address directly.
	return uint64(p.root.physical)
}
+
// FlushCR3 returns the CR3 value that flushes the TLB.
//
// This may be called in interrupt contexts.
//
//go:nosplit
func (p *PageTables) FlushCR3() uint64 {
	// Bit 63 is deliberately left clear so that loading this value
	// flushes TLB entries for the PCID (which may be zero).
	return uint64(p.root.physical) | uint64(p.pcid)
}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go
new file mode 100644
index 000000000..1fc403c48
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go
@@ -0,0 +1,79 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+import (
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+func Test2MAnd4K(t *testing.T) {
+ pt := New(reflectTranslater{}, Opts{})
+
+ // Map a small page and a huge page.
+ pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
+ pt.Map(0x00007f0000000000, 1<<21, true, usermem.Read, pmdSize*47)
+
+ checkMappings(t, pt, []mapping{
+ {0x400000, pteSize, pteSize * 42, true},
+ {0x00007f0000000000, pmdSize, pmdSize * 47, false},
+ })
+ pt.Release()
+}
+
// Test1GAnd4K checks that a 4K page and a 1G super page can coexist.
func Test1GAnd4K(t *testing.T) {
	pt := New(reflectTranslater{}, Opts{})

	// Map a small page and a super page.
	pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
	pt.Map(0x00007f0000000000, pudSize, true, usermem.Read, pudSize*47)

	checkMappings(t, pt, []mapping{
		{0x400000, pteSize, pteSize * 42, true},
		{0x00007f0000000000, pudSize, pudSize * 47, false},
	})
	pt.Release()
}

// TestSplit1GPage checks that unmapping the interior of a 1G super page
// splits it, leaving only the first and last 4K pages mapped.
func TestSplit1GPage(t *testing.T) {
	pt := New(reflectTranslater{}, Opts{})

	// Map a super page and knock out the middle.
	pt.Map(0x00007f0000000000, pudSize, true, usermem.Read, pudSize*42)
	pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pudSize-(2*pteSize))

	checkMappings(t, pt, []mapping{
		{0x00007f0000000000, pteSize, pudSize * 42, false},
		{0x00007f0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, false},
	})
	pt.Release()
}

// TestSplit2MPage checks that unmapping the interior of a 2M huge page
// splits it, leaving only the first and last 4K pages mapped.
func TestSplit2MPage(t *testing.T) {
	pt := New(reflectTranslater{}, Opts{})

	// Map a huge page and knock out the middle.
	pt.Map(0x00007f0000000000, pmdSize, true, usermem.Read, pmdSize*42)
	pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pmdSize-(2*pteSize))

	checkMappings(t, pt, []mapping{
		{0x00007f0000000000, pteSize, pmdSize * 42, false},
		{0x00007f0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, false},
	})
	pt.Release()
}
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
new file mode 100644
index 000000000..509e8c0d9
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
@@ -0,0 +1,74 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+import (
+ "sync"
+)
+
// maxPCID is the maximum allowed PCID.
const maxPCID = 4095

// PCIDs is a simple PCID database.
//
// The zero value is not usable; construct with NewPCIDs.
type PCIDs struct {
	// mu guards the fields below.
	mu sync.Mutex

	// last is the most recent fresh PCID handed out; identifiers taken
	// from the free pool do not advance it. Once last reaches maxPCID,
	// only recycled identifiers from available can be returned.
	last uint16

	// available holds PCIDs that have been freed for reuse.
	available map[uint16]struct{}
}

// NewPCIDs returns a new PCID set.
func NewPCIDs() *PCIDs {
	return &PCIDs{available: map[uint16]struct{}{}}
}

// allocate returns an unused PCID, or zero if all are taken.
func (p *PCIDs) allocate() uint16 {
	p.mu.Lock()
	defer p.mu.Unlock()
	// Prefer a recycled identifier; map iteration picks an arbitrary one.
	for id := range p.available {
		delete(p.available, id)
		return id
	}
	// Otherwise mint a fresh identifier, if any remain.
	if p.last < maxPCID {
		p.last++
		return p.last
	}
	// Nothing available.
	return 0
}

// free returns a PCID to the pool.
//
// It is safe to call free with a zero pcid. That is, you may always call free
// with anything returned by allocate.
func (p *PCIDs) free(id uint16) {
	if id == 0 {
		// Zero is the "none" sentinel; never recycle it.
		return
	}
	p.mu.Lock()
	defer p.mu.Unlock()
	p.available[id] = struct{}{}
}
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go
new file mode 100644
index 000000000..0b555cd76
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go
@@ -0,0 +1,65 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+import (
+ "testing"
+)
+
+func TestMaxPCID(t *testing.T) {
+ p := NewPCIDs()
+ for i := 0; i < maxPCID; i++ {
+ if id := p.allocate(); id != uint16(i+1) {
+ t.Errorf("got %d, expected %d", id, i+1)
+ }
+ }
+ if id := p.allocate(); id != 0 {
+ if id != 0 {
+ t.Errorf("got %d, expected 0", id)
+ }
+ }
+}
+
// TestFirstPCID verifies that allocation starts at 1 (zero is reserved as
// the "none" value).
func TestFirstPCID(t *testing.T) {
	p := NewPCIDs()
	if id := p.allocate(); id != 1 {
		t.Errorf("got %d, expected 1", id)
	}
}

// TestFreePCID verifies that freeing zero is a no-op and never causes zero
// to be recycled.
func TestFreePCID(t *testing.T) {
	p := NewPCIDs()
	p.free(0)
	if id := p.allocate(); id != 1 {
		t.Errorf("got %d, expected 1 (not zero)", id)
	}
}

// TestReusePCID verifies that a freed identifier is recycled before a fresh
// one is minted.
func TestReusePCID(t *testing.T) {
	p := NewPCIDs()
	id := p.allocate()
	if id != 1 {
		t.Errorf("got %d, expected 1", id)
	}
	p.free(id)
	if id := p.allocate(); id != 1 {
		t.Errorf("got %d, expected 1", id)
	}
	if id := p.allocate(); id != 2 {
		t.Errorf("got %d, expected 2", id)
	}
}