Diffstat (limited to 'pkg/sentry/platform/ring0/pagetables')
 pkg/sentry/platform/ring0/pagetables/BUILD                  |  32
 pkg/sentry/platform/ring0/pagetables/pagetables.go          | 193
 pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go    | 397
 pkg/sentry/platform/ring0/pagetables/pagetables_test.go     | 161
 pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go   |  31
 pkg/sentry/platform/ring0/pagetables/pagetables_x86.go      |  79
 pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go |  79
 pkg/sentry/platform/ring0/pagetables/pcids_x86.go           |  74
 pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go      |  65
 9 files changed, 1111 insertions(+), 0 deletions(-)
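
Before the per-file diffs, a brief orientation: the new package maps virtual addresses to "physical" addresses supplied by a caller-provided Translater. The sketch below is illustrative only and is not part of the change; the toy translator mirrors the reflect-based reflectTranslater used in pagetables_test.go, the import path is the one declared in the BUILD file, and the physical address passed to Map is an arbitrary example value.

    package main

    import (
    	"fmt"
    	"reflect"

    	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
    	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
    )

    // toyTranslater treats the host address of the PTEs as the "physical"
    // address, exactly as the package's own tests do.
    type toyTranslater struct{}

    func (toyTranslater) TranslateToPhysical(ptes *pagetables.PTEs) uintptr {
    	return reflect.ValueOf(ptes).Pointer()
    }

    func main() {
    	pt := pagetables.New(toyTranslater{}, pagetables.Opts{EnablePCID: true})
    	defer pt.Release() // Returns the PCID to the pool.

    	// Install one user-accessible, read-write 4K mapping.
    	pt.Map(0x400000, usermem.PageSize, true, usermem.ReadWrite, 0x123000)

    	phys, at := pt.Lookup(0x400000)
    	fmt.Printf("phys=%#x access=%+v\n", phys, at)

    	pt.Unmap(0x400000, usermem.PageSize)
    }
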
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
new file mode 100644
index 000000000..c0c481ab3
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -0,0 +1,32 @@
+package(licenses = ["notice"])  # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+    name = "pagetables",
+    srcs = [
+        "pagetables.go",
+        "pagetables_amd64.go",
+        "pagetables_unsafe.go",
+        "pagetables_x86.go",
+        "pcids_x86.go",
+    ],
+    importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables",
+    visibility = [
+        "//pkg/sentry/platform/kvm:__subpackages__",
+        "//pkg/sentry/platform/ring0:__subpackages__",
+    ],
+    deps = ["//pkg/sentry/usermem"],
+)
+
+go_test(
+    name = "pagetables_test",
+    size = "small",
+    srcs = [
+        "pagetables_test.go",
+        "pagetables_x86_test.go",
+        "pcids_x86_test.go",
+    ],
+    embed = [":pagetables"],
+    deps = ["//pkg/sentry/usermem"],
+)
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
new file mode 100644
index 000000000..3cbf0bfa5
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -0,0 +1,193 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pagetables provides a generic implementation of pagetables.
+package pagetables
+
+import (
+	"sync"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// Node is a single node within a set of page tables.
+type Node struct {
+	// unalignedData has unaligned data. Unfortunately, we can't really
+	// rely on the allocator to give us what we want here. So we just throw
+	// it at the wall and use the portion that matches. Gross. This may be
+	// changed in the future to use a different allocation mechanism.
+	//
+	// Access must happen via functions found in pagetables_unsafe.go.
+	unalignedData [(2 * usermem.PageSize) - 1]byte
+
+	// physical is the translated address of these entries.
+	//
+	// This is filled in at creation time.
+	physical uintptr
+}
+
+// PageTables is a set of page tables.
+type PageTables struct {
+	mu sync.Mutex
+
+	// root is the pagetable root.
+	root *Node
+
+	// translater is the translater passed at creation.
+	translater Translater
+
+	// archPageTables includes architecture-specific features.
+	archPageTables
+
+	// allNodes is a set of nodes indexed by translated (physical) address.
+	allNodes map[uintptr]*Node
+}
+
+// Translater translates to guest physical addresses.
+type Translater interface {
+	// TranslateToPhysical translates the given pointer object into a
+	// "physical" address. We do not require that it translates back; the
+	// reverse mapping is maintained internally.
+	TranslateToPhysical(*PTEs) uintptr
+}
+
+// New returns a new set of PageTables.
+func New(t Translater, opts Opts) *PageTables {
+	p := &PageTables{
+		translater: t,
+		allNodes:   make(map[uintptr]*Node),
+	}
+	p.root = p.allocNode()
+	p.init(opts)
+	return p
+}
+
+// New returns a new set of PageTables derived from the given one.
+//
+// This function should always be preferred to New if there are existing
+// pagetables, as this function preserves architectural constraints relevant
+// to managing multiple sets of pagetables.
+func (p *PageTables) New() *PageTables {
+	np := &PageTables{
+		translater: p.translater,
+		allNodes:   make(map[uintptr]*Node),
+	}
+	np.root = np.allocNode()
+	np.initFrom(&p.archPageTables)
+	return np
+}
+
+// setPageTable sets the given index as a page table.
+func (p *PageTables) setPageTable(n *Node, index int, child *Node) {
+	phys := p.translater.TranslateToPhysical(child.PTEs())
+	p.allNodes[phys] = child
+	pte := &n.PTEs()[index]
+	pte.setPageTable(phys)
+}
+
+// clearPageTable clears the given entry.
+func (p *PageTables) clearPageTable(n *Node, index int) {
+	pte := &n.PTEs()[index]
+	physical := pte.Address()
+	pte.Clear()
+	delete(p.allNodes, physical)
+}
+
+// getPageTable returns the child node for the given entry.
+func (p *PageTables) getPageTable(n *Node, index int) *Node {
+	pte := &n.PTEs()[index]
+	physical := pte.Address()
+	child := p.allNodes[physical]
+	return child
+}
+
+// Map installs a mapping with the given physical address.
+//
+// True is returned iff there was a previous mapping in the range.
+//
+// Precondition: addr and length must be page-aligned, and their sum must not overflow.
+func (p *PageTables) Map(addr usermem.Addr, length uintptr, user bool, at usermem.AccessType, physical uintptr) bool {
+	if at == usermem.NoAccess {
+		return p.Unmap(addr, length)
+	}
+	prev := false
+	p.mu.Lock()
+	end, ok := addr.AddLength(uint64(length))
+	if !ok {
+		panic("pagetables.Map: overflow")
+	}
+	p.iterateRange(uintptr(addr), uintptr(end), true, func(s, e uintptr, pte *PTE, align uintptr) {
+		p := physical + (s - uintptr(addr))
+		prev = prev || (pte.Valid() && (p != pte.Address() || at.Write != pte.Writeable() || at.Execute != pte.Executable()))
+		if p&align != 0 {
+			// We will install entries at a smaller granularity if
+			// we don't install a valid entry here; however, we must
+			// zap any existing entry to ensure this happens.
+			pte.Clear()
+			return
+		}
+		pte.Set(p, at.Write, at.Execute, user)
+	})
+	p.mu.Unlock()
+	return prev
+}
+
+// Unmap unmaps the given range.
+//
+// True is returned iff there was a previous mapping in the range.
+func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
+	p.mu.Lock()
+	count := 0
+	p.iterateRange(uintptr(addr), uintptr(addr)+length, false, func(s, e uintptr, pte *PTE, align uintptr) {
+		pte.Clear()
+		count++
+	})
+	p.mu.Unlock()
+	return count > 0
+}
+
+// Release releases this address space.
+//
+// This must be called to release the PCID.
+func (p *PageTables) Release() {
+	// Clear all pages.
+	p.Unmap(0, ^uintptr(0))
+	p.release()
+}
+
+// Lookup returns the physical address for the given virtual address.
+func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, accessType usermem.AccessType) {
+	mask := uintptr(usermem.PageSize - 1)
+	off := uintptr(addr) & mask
+	addr = addr &^ usermem.Addr(mask)
+	p.iterateRange(uintptr(addr), uintptr(addr+usermem.PageSize), false, func(s, e uintptr, pte *PTE, align uintptr) {
+		if !pte.Valid() {
+			return
+		}
+		physical = pte.Address() + (s - uintptr(addr)) + off
+		accessType = usermem.AccessType{
+			Read:    true,
+			Write:   pte.Writeable(),
+			Execute: pte.Executable(),
+		}
+	})
+	return physical, accessType
+}
+
+// allocNode allocates a new page.
+func (p *PageTables) allocNode() *Node {
+	n := new(Node)
+	n.physical = p.translater.TranslateToPhysical(n.PTEs())
+	return n
+}
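
One behavioral detail of Map above that is easy to miss: passing usermem.NoAccess degenerates to Unmap, and the boolean result reports whether a valid, differing mapping already covered part of the range. A hedged sketch, written as if inside the package (it uses the unexported pteSize and the test-only reflectTranslater defined later in this change):

    pt := New(reflectTranslater{}, Opts{})
    defer pt.Release()

    // Fresh range: nothing was mapped before, so Map reports false.
    changed := pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
    // changed == false

    // Same range, narrower permissions: an existing entry differs, so true.
    changed = pt.Map(0x400000, pteSize, true, usermem.Read, pteSize*42)
    // changed == true

    // NoAccess is treated as an unmap of the range.
    pt.Map(0x400000, pteSize, true, usermem.NoAccess, 0)
    _ = changed
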
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
new file mode 100644
index 000000000..b89665c96
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
@@ -0,0 +1,397 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package pagetables
+
+import (
+	"fmt"
+	"sync/atomic"
+)
+
+// Address constraints.
+//
+// The lowerTop and upperBottom currently apply to four-level pagetables;
+// additional refactoring would be necessary to support five-level pagetables.
+const (
+	lowerTop    = 0x00007fffffffffff
+	upperBottom = 0xffff800000000000
+
+	pteShift = 12
+	pmdShift = 21
+	pudShift = 30
+	pgdShift = 39
+
+	pteMask = 0x1ff << pteShift
+	pmdMask = 0x1ff << pmdShift
+	pudMask = 0x1ff << pudShift
+	pgdMask = 0x1ff << pgdShift
+
+	pteSize = 1 << pteShift
+	pmdSize = 1 << pmdShift
+	pudSize = 1 << pudShift
+	pgdSize = 1 << pgdShift
+)
+
+// Bits in page table entries.
+const (
+	present        = 0x001
+	writable       = 0x002
+	user           = 0x004
+	writeThrough   = 0x008
+	cacheDisable   = 0x010
+	accessed       = 0x020
+	dirty          = 0x040
+	super          = 0x080
+	executeDisable = 1 << 63
+)
+
+// PTE is a page table entry.
+type PTE uint64
+
+// Clear clears this PTE, including super page information.
+func (p *PTE) Clear() {
+	atomic.StoreUint64((*uint64)(p), 0)
+}
+
+// Valid returns true iff this entry is valid.
+func (p *PTE) Valid() bool {
+	return atomic.LoadUint64((*uint64)(p))&present != 0
+}
+
+// Writeable returns true iff the page is writable.
+func (p *PTE) Writeable() bool {
+	return atomic.LoadUint64((*uint64)(p))&writable != 0
+}
+
+// User returns true iff the page is user-accessible.
+func (p *PTE) User() bool {
+	return atomic.LoadUint64((*uint64)(p))&user != 0
+}
+
+// Executable returns true iff the page is executable.
+func (p *PTE) Executable() bool {
+	return atomic.LoadUint64((*uint64)(p))&executeDisable == 0
+}
+
+// SetSuper sets this page as a super page.
+//
+// The page must not be valid or a panic will result.
+func (p *PTE) SetSuper() {
+	if p.Valid() {
+		// This is not allowed.
+		panic("SetSuper called on valid page!")
+	}
+	atomic.StoreUint64((*uint64)(p), super)
+}
+
+// IsSuper returns true iff this page is a super page.
+func (p *PTE) IsSuper() bool {
+	return atomic.LoadUint64((*uint64)(p))&super != 0
+}
+
+// Set sets this PTE value.
+func (p *PTE) Set(addr uintptr, write, execute bool, userAccessible bool) {
+	v := uint64(addr)&^uint64(0xfff) | present | accessed
+	if userAccessible {
+		v |= user
+	}
+	if !execute {
+		v |= executeDisable
+	}
+	if write {
+		v |= writable | dirty
+	}
+	if p.IsSuper() {
+		v |= super
+	}
+	atomic.StoreUint64((*uint64)(p), v)
+}
+
+// setPageTable sets this PTE value to point to a lower-level page table; the
+// super bit is always clear. This is used explicitly for breaking super pages.
+func (p *PTE) setPageTable(addr uintptr) {
+	v := uint64(addr)&^uint64(0xfff) | present | user | writable | accessed | dirty
+	atomic.StoreUint64((*uint64)(p), v)
+}
+
+// Address extracts the address. This should only be used if Valid returns true.
+func (p *PTE) Address() uintptr {
+	return uintptr(atomic.LoadUint64((*uint64)(p)) & ^uint64(executeDisable|0xfff))
+}
+
+// entriesPerPage is the number of PTEs per page.
+const entriesPerPage = 512
+
+// PTEs is a collection of entries.
+type PTEs [entriesPerPage]PTE
+
+// next returns the next address quantized by the given size.
+func next(start uint64, size uint64) uint64 {
+	start &= ^(size - 1)
+	start += size
+	return start
+}
+
+// iterateRange iterates over all appropriate levels of page tables for the given range.
+//
+// If alloc is set, then Set _must_ be called on all given PTEs. The exception
+// is super pages. If a valid super page cannot be installed, then the walk
+// will continue to individual entries.
+//
+// This algorithm will attempt to maximize the use of super pages whenever
+// possible. Whether a super page is provided will be clear through the range
+// provided in the callback.
+//
+// Note that if alloc is set, then no gaps will be present. However, if alloc
+// is not set, then the iteration will likely be full of gaps.
+//
+// Note that this function should generally be avoided in favor of Map, Unmap,
+// etc. when not necessary.
+//
+// Precondition: startAddr and endAddr must be page-aligned.
+//
+// Precondition: startAddr must be less than endAddr.
+//
+// Precondition: If alloc is set, then startAddr and endAddr should not span
+// non-canonical ranges. If they do, a panic will result.
+func (p *PageTables) iterateRange(startAddr, endAddr uintptr, alloc bool, fn func(s, e uintptr, pte *PTE, align uintptr)) {
+	start := uint64(startAddr)
+	end := uint64(endAddr)
+	if start%pteSize != 0 {
+		panic(fmt.Sprintf("unaligned start: %v", start))
+	}
+	if start > end {
+		panic(fmt.Sprintf("start > end (%v > %v)", start, end))
+	}
+
+	// Deal with cases where we traverse the "gap".
+	//
+	// These are all explicitly disallowed if alloc is set, and we must
+	// traverse an entry for each address explicitly.
+	switch {
+	case start < lowerTop && end > lowerTop && end < upperBottom:
+		if alloc {
+			panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+		}
+		p.iterateRange(startAddr, lowerTop, false, fn)
+		return
+	case start < lowerTop && end > lowerTop:
+		if alloc {
+			panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+		}
+		p.iterateRange(startAddr, lowerTop, false, fn)
+		p.iterateRange(upperBottom, endAddr, false, fn)
+		return
+	case start > lowerTop && end < upperBottom:
+		if alloc {
+			panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+		}
+		return
+	case start > lowerTop && start < upperBottom && end > upperBottom:
+		if alloc {
+			panic(fmt.Sprintf("alloc [%x, %x) spans non-canonical range", start, end))
+		}
+		p.iterateRange(upperBottom, endAddr, false, fn)
+		return
+	}
+
+	for pgdIndex := int((start & pgdMask) >> pgdShift); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
+		pgdEntry := &p.root.PTEs()[pgdIndex]
+		if !pgdEntry.Valid() {
+			if !alloc {
+				// Skip over this entry.
+				start = next(start, pgdSize)
+				continue
+			}
+
+			// Allocate a new pgd.
+			p.setPageTable(p.root, pgdIndex, p.allocNode())
+		}
+
+		// Map the next level.
+		pudNode := p.getPageTable(p.root, pgdIndex)
+		clearPUDEntries := 0
+
+		for pudIndex := int((start & pudMask) >> pudShift); start < end && pudIndex < entriesPerPage; pudIndex++ {
+			pudEntry := &(pudNode.PTEs()[pudIndex])
+			if !pudEntry.Valid() {
+				if !alloc {
+					// Skip over this entry.
+					clearPUDEntries++
+					start = next(start, pudSize)
+					continue
+				}
+
+				// This level has 1-GB super pages. Is this
+				// entire region contained in a single PUD
+				// entry? If so, we can skip allocating a new
+				// page for the pmd.
+				if start&(pudSize-1) == 0 && end-start >= pudSize {
+					pudEntry.SetSuper()
+					fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1)
+					if pudEntry.Valid() {
+						start = next(start, pudSize)
+						continue
+					}
+				}
+
+				// Allocate a new pud.
+				p.setPageTable(pudNode, pudIndex, p.allocNode())
+
+			} else if pudEntry.IsSuper() {
+				// Does this page need to be split?
+				if start&(pudSize-1) != 0 || end < next(start, pudSize) {
+					currentAddr := uint64(pudEntry.Address())
+					writeable := pudEntry.Writeable()
+					executable := pudEntry.Executable()
+					user := pudEntry.User()
+
+					// Install the relevant entries.
+					pmdNode := p.allocNode()
+					pmdEntries := pmdNode.PTEs()
+					for index := 0; index < entriesPerPage; index++ {
+						pmdEntry := &pmdEntries[index]
+						pmdEntry.SetSuper()
+						pmdEntry.Set(uintptr(currentAddr), writeable, executable, user)
+						currentAddr += pmdSize
+					}
+
+					// Reset to point to the new page.
+					p.setPageTable(pudNode, pudIndex, pmdNode)
+				} else {
+					// A super page to be checked directly.
+					fn(uintptr(start), uintptr(start+pudSize), pudEntry, pudSize-1)
+
+					// Might have been cleared.
+					if !pudEntry.Valid() {
+						clearPUDEntries++
+					}
+
+					// Note that the super page was changed.
+					start = next(start, pudSize)
+					continue
+				}
+			}
+
+			// Map the next level, since this is valid.
+			pmdNode := p.getPageTable(pudNode, pudIndex)
+			clearPMDEntries := 0
+
+			for pmdIndex := int((start & pmdMask) >> pmdShift); start < end && pmdIndex < entriesPerPage; pmdIndex++ {
+				pmdEntry := &pmdNode.PTEs()[pmdIndex]
+				if !pmdEntry.Valid() {
+					if !alloc {
+						// Skip over this entry.
+						clearPMDEntries++
+						start = next(start, pmdSize)
+						continue
+					}
+
+					// This level has 2-MB huge pages. Is this
+					// region contained in a single PMD entry?
+					// If so, as above, we can skip allocating a new page.
+					if start&(pmdSize-1) == 0 && end-start >= pmdSize {
+						pmdEntry.SetSuper()
+						fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1)
+						if pmdEntry.Valid() {
+							start = next(start, pmdSize)
+							continue
+						}
+					}
+
+					// Allocate a new pmd.
+					p.setPageTable(pmdNode, pmdIndex, p.allocNode())
+
+				} else if pmdEntry.IsSuper() {
+					// Does this page need to be split?
+					if start&(pmdSize-1) != 0 || end < next(start, pmdSize) {
+						currentAddr := uint64(pmdEntry.Address())
+						writeable := pmdEntry.Writeable()
+						executable := pmdEntry.Executable()
+						user := pmdEntry.User()
+
+						// Install the relevant entries.
+						pteNode := p.allocNode()
+						pteEntries := pteNode.PTEs()
+						for index := 0; index < entriesPerPage; index++ {
+							pteEntry := &pteEntries[index]
+							pteEntry.Set(uintptr(currentAddr), writeable, executable, user)
+							currentAddr += pteSize
+						}
+
+						// Reset to point to the new page.
+						p.setPageTable(pmdNode, pmdIndex, pteNode)
+					} else {
+						// A huge page to be checked directly.
+						fn(uintptr(start), uintptr(start+pmdSize), pmdEntry, pmdSize-1)
+
+						// Might have been cleared.
+						if !pmdEntry.Valid() {
+							clearPMDEntries++
+						}
+
+						// Note that the huge page was changed.
+						start = next(start, pmdSize)
+						continue
+					}
+				}
+
+				// Map the next level, since this is valid.
+				pteNode := p.getPageTable(pmdNode, pmdIndex)
+				clearPTEEntries := 0
+
+				for pteIndex := int((start & pteMask) >> pteShift); start < end && pteIndex < entriesPerPage; pteIndex++ {
+					pteEntry := &pteNode.PTEs()[pteIndex]
+					if !pteEntry.Valid() && !alloc {
+						clearPTEEntries++
+						start += pteSize
+						continue
+					}
+
+					// At this point, we are guaranteed that start%pteSize == 0.
+					fn(uintptr(start), uintptr(start+pteSize), pteEntry, pteSize-1)
+					if !pteEntry.Valid() {
+						if alloc {
+							panic("PTE not set after iteration with alloc=true!")
+						}
+						clearPTEEntries++
+					}
+
+					// Note that the pte was changed.
+					start += pteSize
+					continue
+				}
+
+				// Check if we no longer need this page.
+				if clearPTEEntries == entriesPerPage {
+					p.clearPageTable(pmdNode, pmdIndex)
+					clearPMDEntries++
+				}
+			}
+
+			// Check if we no longer need this page.
+			if clearPMDEntries == entriesPerPage {
+				p.clearPageTable(pudNode, pudIndex)
+				clearPUDEntries++
+			}
+		}
+
+		// Check if we no longer need this page.
+		if clearPUDEntries == entriesPerPage {
+			p.clearPageTable(p.root, pgdIndex)
+		}
+	}
+}
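
To make the walk above concrete: with the shifts defined in this file, a canonical x86-64 virtual address decomposes into four 9-bit table indices. The helper below is not part of the change; it simply mirrors the (addr & mask) >> shift computations used at each level of iterateRange, and it only compiles inside the package since the masks and shifts are unexported.

    // indices mirrors the per-level index computations in iterateRange.
    func indices(addr uint64) (pgd, pud, pmd, pte int) {
    	return int((addr & pgdMask) >> pgdShift),
    		int((addr & pudMask) >> pudShift),
    		int((addr & pmdMask) >> pmdShift),
    		int((addr & pteMask) >> pteShift)
    }

For example, 0x00007f0000000000 (used throughout the tests) yields pgd=254 with the remaining indices zero, while 0x400000 yields pmd=2 with the others zero.
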
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
new file mode 100644
index 000000000..9cbc0e3b0
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go
@@ -0,0 +1,161 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+	"reflect"
+	"testing"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+type reflectTranslater struct{}
+
+func (r reflectTranslater) TranslateToPhysical(ptes *PTEs) uintptr {
+	return reflect.ValueOf(ptes).Pointer()
+}
+
+type mapping struct {
+	start     uintptr
+	length    uintptr
+	addr      uintptr
+	writeable bool
+}
+
+func checkMappings(t *testing.T, pt *PageTables, m []mapping) {
+	var (
+		current int
+		found   []mapping
+		failed  string
+	)
+
+	// Iterate over all the mappings.
+	pt.iterateRange(0, ^uintptr(0), false, func(s, e uintptr, pte *PTE, align uintptr) {
+		found = append(found, mapping{
+			start:     s,
+			length:    e - s,
+			addr:      pte.Address(),
+			writeable: pte.Writeable(),
+		})
+		if failed != "" {
+			// Don't keep looking for errors.
+			return
+		}
+
+		if current >= len(m) {
+			failed = "more mappings than expected"
+		} else if m[current].start != s {
+			failed = "start didn't match expected"
+		} else if m[current].length != (e - s) {
+			failed = "end didn't match expected"
+		} else if m[current].addr != pte.Address() {
+			failed = "address didn't match expected"
+		} else if m[current].writeable != pte.Writeable() {
+			failed = "writeable didn't match"
+		}
+		current++
+	})
+
+	// Were additional mappings expected but not found?
+	if failed == "" && current != len(m) {
+		failed = "insufficient mappings found"
+	}
+
+	// Emit a meaningful error message on failure.
+	if failed != "" {
+		t.Errorf("%s; got %#v, wanted %#v", failed, found, m)
+	}
+}
+
+func TestAllocFree(t *testing.T) {
+	pt := New(reflectTranslater{}, Opts{})
+	pt.Release()
+}
+
+func TestUnmap(t *testing.T) {
+	pt := New(reflectTranslater{}, Opts{})
+
+	// Map and unmap one entry.
+	pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
+	pt.Unmap(0x400000, pteSize)
+
+	checkMappings(t, pt, nil)
+	pt.Release()
+}
+
+func TestReadOnly(t *testing.T) {
+	pt := New(reflectTranslater{}, Opts{})
+
+	// Map one entry.
+	pt.Map(0x400000, pteSize, true, usermem.Read, pteSize*42)
+
+	checkMappings(t, pt, []mapping{
+		{0x400000, pteSize, pteSize * 42, false},
+	})
+	pt.Release()
+}
+
+func TestReadWrite(t *testing.T) {
+	pt := New(reflectTranslater{}, Opts{})
+
+	// Map one entry.
+	pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
+
+	checkMappings(t, pt, []mapping{
+		{0x400000, pteSize, pteSize * 42, true},
+	})
+	pt.Release()
+}
+
+func TestSerialEntries(t *testing.T) {
+	pt := New(reflectTranslater{}, Opts{})
+
+	// Map two sequential entries.
+	pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
+	pt.Map(0x401000, pteSize, true, usermem.ReadWrite, pteSize*47)
+
+	checkMappings(t, pt, []mapping{
+		{0x400000, pteSize, pteSize * 42, true},
+		{0x401000, pteSize, pteSize * 47, true},
+	})
+	pt.Release()
+}
+
+func TestSpanningEntries(t *testing.T) {
+	pt := New(reflectTranslater{}, Opts{})
+
+	// Span a pgd with two pages.
+	pt.Map(0x00007efffffff000, 2*pteSize, true, usermem.Read, pteSize*42)
+
+	checkMappings(t, pt, []mapping{
+		{0x00007efffffff000, pteSize, pteSize * 42, false},
+		{0x00007f0000000000, pteSize, pteSize * 43, false},
+	})
+	pt.Release()
+}
+
+func TestSparseEntries(t *testing.T) {
+	pt := New(reflectTranslater{}, Opts{})
+
+	// Map two entries in different pgds.
+	pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
+	pt.Map(0x00007f0000000000, pteSize, true, usermem.Read, pteSize*47)
+
+	checkMappings(t, pt, []mapping{
+		{0x400000, pteSize, pteSize * 42, true},
+		{0x00007f0000000000, pteSize, pteSize * 47, false},
+	})
+	pt.Release()
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go b/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go
new file mode 100644
index 000000000..a2b44fb79
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_unsafe.go
@@ -0,0 +1,31 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pagetables
+
+import (
+	"unsafe"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// PTEs returns aligned PTE entries.
+func (n *Node) PTEs() *PTEs {
+	addr := uintptr(unsafe.Pointer(&n.unalignedData[0]))
+	offset := addr & (usermem.PageSize - 1)
+	if offset != 0 {
+		offset = usermem.PageSize - offset
+	}
+	return (*PTEs)(unsafe.Pointer(addr + offset))
+}
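
The accessor above is why Node.unalignedData is (2*PageSize - 1) bytes: wherever the allocator happens to place the array, a full page-aligned window of PageSize bytes fits inside it. A worked example of the offset computation, using a made-up address for illustration:

    addr := uintptr(0x7f33c1a01234)         // hypothetical &n.unalignedData[0]
    offset := addr & (usermem.PageSize - 1) // 0x234: distance past the page boundary
    if offset != 0 {
    	offset = usermem.PageSize - offset // 0xdcc: distance to the next boundary
    }
    aligned := addr + offset // 0x7f33c1a02000, page-aligned; the PTEs live here
    _ = aligned
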
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
new file mode 100644
index 000000000..dac66373f
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go
@@ -0,0 +1,79 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+// Opts are pagetable options.
+type Opts struct {
+	EnablePCID bool
+}
+
+// archPageTables has x86-specific features.
+type archPageTables struct {
+	// pcids is the PCID database.
+	pcids *PCIDs
+
+	// pcid is the globally unique identifier, or zero if none was
+	// available or pcids is nil.
+	pcid uint16
+}
+
+// init initializes arch-specific features.
+func (a *archPageTables) init(opts Opts) {
+	if opts.EnablePCID {
+		a.pcids = NewPCIDs()
+		a.pcid = a.pcids.allocate()
+	}
+}
+
+// initFrom initializes arch-specific features from an existing set of tables.
+func (a *archPageTables) initFrom(other *archPageTables) {
+	a.pcids = other.pcids // Refer to the same PCID database.
+	if a.pcids != nil {
+		a.pcid = a.pcids.allocate()
+	}
+}
+
+// release is called from Release.
+func (a *archPageTables) release() {
+	// Return the PCID.
+	if a.pcids != nil {
+		a.pcids.free(a.pcid)
+	}
+}
+
+// CR3 returns the CR3 value for these tables.
+//
+// This may be called in interrupt contexts.
+//
+//go:nosplit
+func (p *PageTables) CR3() uint64 {
+	// Bit 63 is set to avoid flushing the PCID (per SDM 4.10.4.1).
+	const noFlushBit uint64 = 0x8000000000000000
+	if p.pcid != 0 {
+		return noFlushBit | uint64(p.root.physical) | uint64(p.pcid)
+	}
+	return uint64(p.root.physical)
+}
+
+// FlushCR3 returns the CR3 value that flushes the TLB.
+//
+// This may be called in interrupt contexts.
+//
+//go:nosplit
+func (p *PageTables) FlushCR3() uint64 {
+	return uint64(p.root.physical) | uint64(p.pcid)
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go
new file mode 100644
index 000000000..1fc403c48
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86_test.go
@@ -0,0 +1,79 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+import (
+	"testing"
+
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+func Test2MAnd4K(t *testing.T) {
+	pt := New(reflectTranslater{}, Opts{})
+
+	// Map a small page and a huge page.
+	pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
+	pt.Map(0x00007f0000000000, 1<<21, true, usermem.Read, pmdSize*47)
+
+	checkMappings(t, pt, []mapping{
+		{0x400000, pteSize, pteSize * 42, true},
+		{0x00007f0000000000, pmdSize, pmdSize * 47, false},
+	})
+	pt.Release()
+}
+
+func Test1GAnd4K(t *testing.T) {
+	pt := New(reflectTranslater{}, Opts{})
+
+	// Map a small page and a super page.
+	pt.Map(0x400000, pteSize, true, usermem.ReadWrite, pteSize*42)
+	pt.Map(0x00007f0000000000, pudSize, true, usermem.Read, pudSize*47)
+
+	checkMappings(t, pt, []mapping{
+		{0x400000, pteSize, pteSize * 42, true},
+		{0x00007f0000000000, pudSize, pudSize * 47, false},
+	})
+	pt.Release()
+}
+
+func TestSplit1GPage(t *testing.T) {
+	pt := New(reflectTranslater{}, Opts{})
+
+	// Map a super page and knock out the middle.
+	pt.Map(0x00007f0000000000, pudSize, true, usermem.Read, pudSize*42)
+	pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pudSize-(2*pteSize))
+
+	checkMappings(t, pt, []mapping{
+		{0x00007f0000000000, pteSize, pudSize * 42, false},
+		{0x00007f0000000000 + pudSize - pteSize, pteSize, pudSize*42 + pudSize - pteSize, false},
+	})
+	pt.Release()
+}
+
+func TestSplit2MPage(t *testing.T) {
+	pt := New(reflectTranslater{}, Opts{})
+
+	// Map a huge page and knock out the middle.
+	pt.Map(0x00007f0000000000, pmdSize, true, usermem.Read, pmdSize*42)
+	pt.Unmap(usermem.Addr(0x00007f0000000000+pteSize), pmdSize-(2*pteSize))
+
+	checkMappings(t, pt, []mapping{
+		{0x00007f0000000000, pteSize, pmdSize * 42, false},
+		{0x00007f0000000000 + pmdSize - pteSize, pteSize, pmdSize*42 + pmdSize - pteSize, false},
+	})
+	pt.Release()
+}
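
For reference, the CR3 value built in pagetables_x86.go above packs the root table's physical address with the PCID in the low 12 bits, and sets bit 63 so the processor does not flush TLB entries tagged with that PCID. A sketch of the same composition, using made-up values:

    const noFlushBit = uint64(1) << 63

    rootPhysical := uint64(0x13370000) // hypothetical page-aligned root.physical
    pcid := uint64(42)

    cr3 := noFlushBit | rootPhysical | pcid // what CR3() returns when pcid != 0
    flushCR3 := rootPhysical | pcid         // what FlushCR3() returns
    _, _ = cr3, flushCR3
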
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
new file mode 100644
index 000000000..509e8c0d9
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
@@ -0,0 +1,74 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+import (
+	"sync"
+)
+
+// maxPCID is the maximum allowed PCID.
+const maxPCID = 4095
+
+// PCIDs is a simple PCID database.
+type PCIDs struct {
+	mu sync.Mutex
+
+	// last is the last fresh PCID given out (not including the available
+	// pool). If last >= maxPCID, then the only available PCIDs are those
+	// in the available pool below.
+	last uint16
+
+	// available are PCIDs that have been freed.
+	available map[uint16]struct{}
+}
+
+// NewPCIDs returns a new PCID set.
+func NewPCIDs() *PCIDs {
+	return &PCIDs{
+		available: make(map[uint16]struct{}),
+	}
+}
+
+// allocate returns an unused PCID, or zero if all are taken.
+func (p *PCIDs) allocate() uint16 {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	if len(p.available) > 0 {
+		for id := range p.available {
+			delete(p.available, id)
+			return id
+		}
+	}
+	if id := p.last + 1; id <= maxPCID {
+		p.last = id
+		return id
+	}
+	// Nothing available.
+	return 0
+}
+
+// free returns a PCID to the pool.
+//
+// It is safe to call free with a zero pcid. That is, you may always call free
+// with anything returned by allocate.
+func (p *PCIDs) free(id uint16) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	if id != 0 {
+		p.available[id] = struct{}{}
+	}
+}
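
A small usage note on the allocator above, as exercised by the tests that follow: PCID 0 is never handed out and doubles as the "no PCID available" value, and free(0) is a deliberate no-op, so callers can always free whatever allocate returned. A sketch, written as if inside the package (allocate and free are unexported):

    p := NewPCIDs()
    id := p.allocate() // 1 on a fresh database; 0 only if all 4095 PCIDs are in use
    defer p.free(id)   // Safe even when id == 0.
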
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go
new file mode 100644
index 000000000..0b555cd76
--- /dev/null
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86_test.go
@@ -0,0 +1,65 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build i386 amd64
+
+package pagetables
+
+import (
+	"testing"
+)
+
+func TestMaxPCID(t *testing.T) {
+	p := NewPCIDs()
+	for i := 0; i < maxPCID; i++ {
+		if id := p.allocate(); id != uint16(i+1) {
+			t.Errorf("got %d, expected %d", id, i+1)
+		}
+	}
+	if id := p.allocate(); id != 0 {
+		t.Errorf("got %d, expected 0", id)
+	}
+}
+
+func TestFirstPCID(t *testing.T) {
+	p := NewPCIDs()
+	if id := p.allocate(); id != 1 {
+		t.Errorf("got %d, expected 1", id)
+	}
+}
+
+func TestFreePCID(t *testing.T) {
+	p := NewPCIDs()
+	p.free(0)
+	if id := p.allocate(); id != 1 {
+		t.Errorf("got %d, expected 1 (not zero)", id)
+	}
+}
+
+func TestReusePCID(t *testing.T) {
+	p := NewPCIDs()
+	id := p.allocate()
+	if id != 1 {
+		t.Errorf("got %d, expected 1", id)
+	}
+	p.free(id)
+	if id := p.allocate(); id != 1 {
+		t.Errorf("got %d, expected 1", id)
+	}
+	if id := p.allocate(); id != 2 {
+		t.Errorf("got %d, expected 2", id)
+	}
+}