summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry
diff options
context:
space:
mode:
authorLai Jiangshan <jiangshan.ljs@antfin.com>2020-08-13 13:01:20 +0800
committerLai Jiangshan <laijs@linux.alibaba.com>2020-11-03 00:10:32 +0800
commit3425485b7c47861422c8bf285635d11911035383 (patch)
treefa412b1b8beaebae2bc05cd78b8e16f4c8727c7e /pkg/sentry
parentdd056112b72abde9f570a69ad7cfc2a0a6beed14 (diff)
kvm: share upper halves among all pagtables
Fixes: #509 Signed-off-by: Lai Jiangshan <jiangshan.ljs@antfin.com> Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Diffstat (limited to 'pkg/sentry')
-rw-r--r--pkg/sentry/platform/kvm/kvm.go3
-rw-r--r--pkg/sentry/platform/kvm/machine.go15
-rw-r--r--pkg/sentry/platform/kvm/machine_amd64.go31
-rw-r--r--pkg/sentry/platform/ring0/defs.go3
-rw-r--r--pkg/sentry/platform/ring0/defs_amd64.go10
-rw-r--r--pkg/sentry/platform/ring0/defs_arm64.go7
-rw-r--r--pkg/sentry/platform/ring0/kernel.go6
-rw-r--r--pkg/sentry/platform/ring0/kernel_amd64.go5
-rw-r--r--pkg/sentry/platform/ring0/kernel_arm64.go4
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables.go84
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go10
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go21
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go13
-rw-r--r--pkg/sentry/platform/ring0/pagetables/walker_arm64.go2
14 files changed, 149 insertions, 65 deletions
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index dd45ad10b..5979aef97 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -158,8 +158,7 @@ func (*KVM) MaxUserAddress() usermem.Addr {
// NewAddressSpace returns a new pagetable root.
func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) {
// Allocate page tables and install system mappings.
- pageTables := pagetables.New(newAllocator())
- k.machine.mapUpperHalf(pageTables)
+ pageTables := pagetables.NewWithUpper(newAllocator(), k.machine.upperSharedPageTables, ring0.KernelStartAddress)
// Return the new address space.
return &addressSpace{
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index f70d761fd..e2fffc99b 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -41,6 +41,9 @@ type machine struct {
// slots are currently being updated, and the caller should retry.
nextSlot uint32
+ // upperSharedPageTables tracks the read-only shared upper of all the pagetables.
+ upperSharedPageTables *pagetables.PageTables
+
// kernel is the set of global structures.
kernel ring0.Kernel
@@ -199,9 +202,7 @@ func newMachine(vm int) (*machine, error) {
log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
m.vCPUsByTID = make(map[uint64]*vCPU)
m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
- m.kernel.Init(ring0.KernelOpts{
- PageTables: pagetables.New(newAllocator()),
- }, m.maxVCPUs)
+ m.kernel.Init(m.maxVCPUs)
// Pull the maximum slots.
maxSlots, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS)
@@ -213,6 +214,13 @@ func newMachine(vm int) (*machine, error) {
log.Debugf("The maximum number of slots is %d.", m.maxSlots)
m.usedSlots = make([]uintptr, m.maxSlots)
+ // Create the upper shared pagetables and kernel(sentry) pagetables.
+ m.upperSharedPageTables = pagetables.New(newAllocator())
+ m.mapUpperHalf(m.upperSharedPageTables)
+ m.upperSharedPageTables.Allocator.(*allocator).base.Drain()
+ m.upperSharedPageTables.MarkReadOnlyShared()
+ m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress)
+
// Apply the physical mappings. Note that these mappings may point to
// guest physical addresses that are not actually available. These
// physical pages are mapped on demand, see kernel_unsafe.go.
@@ -226,7 +234,6 @@ func newMachine(vm int) (*machine, error) {
return true // Keep iterating.
})
- m.mapUpperHalf(m.kernel.PageTables)
var physicalRegionsReadOnly []physicalRegion
var physicalRegionsAvailable []physicalRegion
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index a8b729e62..8e03c310d 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -432,30 +432,27 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) {
return physicalRegions
}
-var execRegions = func() (regions []region) {
+func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
+ // Map all the executible regions so that all the entry functions
+ // are mapped in the upper half.
applyVirtualRegions(func(vr virtualRegion) {
if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" {
return
}
+
if vr.accessType.Execute {
- regions = append(regions, vr.region)
+ r := vr.region
+ physical, length, ok := translateToPhysical(r.virtual)
+ if !ok || length < r.length {
+ panic("impossible translation")
+ }
+ pageTable.Map(
+ usermem.Addr(ring0.KernelStartAddress|r.virtual),
+ r.length,
+ pagetables.MapOpts{AccessType: usermem.Execute},
+ physical)
}
})
- return
-}()
-
-func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
- for _, r := range execRegions {
- physical, length, ok := translateToPhysical(r.virtual)
- if !ok || length < r.length {
- panic("impossilbe translation")
- }
- pageTable.Map(
- usermem.Addr(ring0.KernelStartAddress|r.virtual),
- r.length,
- pagetables.MapOpts{AccessType: usermem.Execute},
- physical)
- }
for start, end := range m.kernel.EntryRegions() {
regionLen := end - start
physical, length, ok := translateToPhysical(start)
diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go
index e6daf24df..f9765771e 100644
--- a/pkg/sentry/platform/ring0/defs.go
+++ b/pkg/sentry/platform/ring0/defs.go
@@ -23,6 +23,9 @@ import (
//
// This contains global state, shared by multiple CPUs.
type Kernel struct {
+ // PageTables are the kernel pagetables; this must be provided.
+ PageTables *pagetables.PageTables
+
KernelArchState
}
diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go
index 00899273e..7a2275558 100644
--- a/pkg/sentry/platform/ring0/defs_amd64.go
+++ b/pkg/sentry/platform/ring0/defs_amd64.go
@@ -66,17 +66,9 @@ var (
KernelDataSegment SegmentDescriptor
)
-// KernelOpts has initialization options for the kernel.
-type KernelOpts struct {
- // PageTables are the kernel pagetables; this must be provided.
- PageTables *pagetables.PageTables
-}
-
// KernelArchState contains architecture-specific state.
type KernelArchState struct {
- KernelOpts
-
- // cpuEntries is array of kernelEntry for all cpus
+ // cpuEntries is array of kernelEntry for all cpus.
cpuEntries []kernelEntry
// globalIDT is our set of interrupt gates.
diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go
index 508236e46..a014dcbc0 100644
--- a/pkg/sentry/platform/ring0/defs_arm64.go
+++ b/pkg/sentry/platform/ring0/defs_arm64.go
@@ -32,15 +32,8 @@ var (
KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
)
-// KernelOpts has initialization options for the kernel.
-type KernelOpts struct {
- // PageTables are the kernel pagetables; this must be provided.
- PageTables *pagetables.PageTables
-}
-
// KernelArchState contains architecture-specific state.
type KernelArchState struct {
- KernelOpts
}
// CPUArchState contains CPU-specific arch state.
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go
index 264be23d3..292f9d0cc 100644
--- a/pkg/sentry/platform/ring0/kernel.go
+++ b/pkg/sentry/platform/ring0/kernel.go
@@ -16,11 +16,9 @@ package ring0
// Init initializes a new kernel.
//
-// N.B. that constraints on KernelOpts must be satisfied.
-//
//go:nosplit
-func (k *Kernel) Init(opts KernelOpts, maxCPUs int) {
- k.init(opts, maxCPUs)
+func (k *Kernel) Init(maxCPUs int) {
+ k.init(maxCPUs)
}
// Halt halts execution.
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
index 3a9dff4cc..b55dc29b3 100644
--- a/pkg/sentry/platform/ring0/kernel_amd64.go
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -24,10 +24,7 @@ import (
)
// init initializes architecture-specific state.
-func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
- // Save the root page tables.
- k.PageTables = opts.PageTables
-
+func (k *Kernel) init(maxCPUs int) {
entrySize := reflect.TypeOf(kernelEntry{}).Size()
var (
entries []kernelEntry
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index b294ccc7c..6cbbf001f 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -25,9 +25,7 @@ func HaltAndResume()
func HaltEl1SvcAndResume()
// init initializes architecture-specific state.
-func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
- // Save the root page tables.
- k.PageTables = opts.PageTables
+func (k *Kernel) init(maxCPUs int) {
}
// init initializes architecture-specific state.
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
index 7f18ac296..bc16a1622 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -30,6 +30,10 @@ type PageTables struct {
Allocator Allocator
// root is the pagetable root.
+ //
+ // For same archs such as amd64, the upper of the PTEs is cloned
+ // from and owned by upperSharedPageTables which are shared among
+ // many PageTables if upperSharedPageTables is not nil.
root *PTEs
// rootPhysical is the cached physical address of the root.
@@ -39,15 +43,52 @@ type PageTables struct {
// archPageTables includes architecture-specific features.
archPageTables
+
+ // upperSharedPageTables represents a read-only shared upper
+ // of the Pagetable. When it is not nil, the upper is not
+ // allowed to be modified.
+ upperSharedPageTables *PageTables
+
+ // upperStart is the start address of the upper portion that
+ // are shared from upperSharedPageTables
+ upperStart uintptr
+
+ // readOnlyShared indicates the Pagetables are read-only and
+ // own the ranges that are shared with other Pagetables.
+ readOnlyShared bool
}
-// New returns new PageTables.
-func New(a Allocator) *PageTables {
+// NewWithUpper returns new PageTables.
+//
+// upperSharedPageTables are used for mapping the upper of addresses,
+// starting at upperStart. These pageTables should not be touched (as
+// invalidations may be incorrect) after they are passed as an
+// upperSharedPageTables. Only when all dependent PageTables are gone
+// may they be used. The intenteded use case is for kernel page tables,
+// which are static and fixed.
+//
+// Precondition: upperStart must be between canonical ranges.
+// Precondition: upperStart must be pgdSize aligned.
+// precondition: upperSharedPageTables must be marked read-only shared.
+func NewWithUpper(a Allocator, upperSharedPageTables *PageTables, upperStart uintptr) *PageTables {
p := new(PageTables)
p.Init(a)
+ if upperSharedPageTables != nil {
+ if !upperSharedPageTables.readOnlyShared {
+ panic("Only read-only shared pagetables can be used as upper")
+ }
+ p.upperSharedPageTables = upperSharedPageTables
+ p.upperStart = upperStart
+ p.cloneUpperShared()
+ }
return p
}
+// New returns new PageTables.
+func New(a Allocator) *PageTables {
+ return NewWithUpper(a, nil, 0)
+}
+
// mapVisitor is used for map.
type mapVisitor struct {
target uintptr // Input.
@@ -90,6 +131,21 @@ func (*mapVisitor) requiresSplit() bool { return true }
//
//go:nosplit
func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool {
+ if p.readOnlyShared {
+ panic("Should not modify read-only shared pagetables.")
+ }
+ if uintptr(addr)+length < uintptr(addr) {
+ panic("addr & length overflow")
+ }
+ if p.upperSharedPageTables != nil {
+ // ignore change to the read-only upper shared portion.
+ if uintptr(addr) >= p.upperStart {
+ return false
+ }
+ if uintptr(addr)+length > p.upperStart {
+ length = p.upperStart - uintptr(addr)
+ }
+ }
if !opts.AccessType.Any() {
return p.Unmap(addr, length)
}
@@ -128,12 +184,27 @@ func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
//
// True is returned iff there was a previous mapping in the range.
//
-// Precondition: addr & length must be page-aligned.
+// Precondition: addr & length must be page-aligned, their sum must not overflow.
//
// +checkescape:hard,stack
//
//go:nosplit
func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
+ if p.readOnlyShared {
+ panic("Should not modify read-only shared pagetables.")
+ }
+ if uintptr(addr)+length < uintptr(addr) {
+ panic("addr & length overflow")
+ }
+ if p.upperSharedPageTables != nil {
+ // ignore change to the read-only upper shared portion.
+ if uintptr(addr) >= p.upperStart {
+ return false
+ }
+ if uintptr(addr)+length > p.upperStart {
+ length = p.upperStart - uintptr(addr)
+ }
+ }
w := unmapWalker{
pageTables: p,
visitor: unmapVisitor{
@@ -218,3 +289,10 @@ func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts)
w.iterateRange(uintptr(addr), uintptr(addr)+1)
return w.visitor.physical + offset, w.visitor.opts
}
+
+// MarkReadOnlyShared marks the pagetables read-only and can be shared.
+//
+// It is usually used on the pagetables that are used as the upper
+func (p *PageTables) MarkReadOnlyShared() {
+ p.readOnlyShared = true
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
index 520161755..a4e416af7 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
@@ -24,14 +24,6 @@ import (
// archPageTables is architecture-specific data.
type archPageTables struct {
- // root is the pagetable root for kernel space.
- root *PTEs
-
- // rootPhysical is the cached physical address of the root.
- //
- // This is saved only to prevent constant translation.
- rootPhysical uintptr
-
asid uint16
}
@@ -46,7 +38,7 @@ func (p *PageTables) TTBR0_EL1(noFlush bool, asid uint16) uint64 {
//
//go:nosplit
func (p *PageTables) TTBR1_EL1(noFlush bool, asid uint16) uint64 {
- return uint64(p.archPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset
+ return uint64(p.upperSharedPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset
}
// Bits in page table entries.
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
index 0c153cf8c..e7ab887e5 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
@@ -50,5 +50,26 @@ func (p *PageTables) Init(allocator Allocator) {
p.rootPhysical = p.Allocator.PhysicalFor(p.root)
}
+func pgdIndex(upperStart uintptr) uintptr {
+ if upperStart&(pgdSize-1) != 0 {
+ panic("upperStart should be pgd size aligned")
+ }
+ if upperStart >= upperBottom {
+ return entriesPerPage/2 + (upperStart-upperBottom)/pgdSize
+ }
+ if upperStart < lowerTop {
+ return upperStart / pgdSize
+ }
+ panic("upperStart should be in canonical range")
+}
+
+// cloneUpperShared clone the upper from the upper shared page tables.
+//
+//go:nosplit
+func (p *PageTables) cloneUpperShared() {
+ start := pgdIndex(p.upperStart)
+ copy(p.root[start:entriesPerPage], p.upperSharedPageTables.root[start:entriesPerPage])
+}
+
// PTEs is a collection of entries.
type PTEs [entriesPerPage]PTE
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
index 5ddd10256..5392bf27a 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
@@ -49,8 +49,17 @@ func (p *PageTables) Init(allocator Allocator) {
p.Allocator = allocator
p.root = p.Allocator.NewPTEs()
p.rootPhysical = p.Allocator.PhysicalFor(p.root)
- p.archPageTables.root = p.Allocator.NewPTEs()
- p.archPageTables.rootPhysical = p.Allocator.PhysicalFor(p.archPageTables.root)
+}
+
+// cloneUpperShared clone the upper from the upper shared page tables.
+//
+//go:nosplit
+func (p *PageTables) cloneUpperShared() {
+ if p.upperStart != upperBottom {
+ panic("upperStart should be the same as upperBottom")
+ }
+
+ // nothing to do for arm.
}
// PTEs is a collection of entries.
diff --git a/pkg/sentry/platform/ring0/pagetables/walker_arm64.go b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go
index c261d393a..157c9a7cc 100644
--- a/pkg/sentry/platform/ring0/pagetables/walker_arm64.go
+++ b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go
@@ -116,7 +116,7 @@ func next(start uintptr, size uintptr) uintptr {
func (w *Walker) iterateRangeCanonical(start, end uintptr) {
pgdEntryIndex := w.pageTables.root
if start >= upperBottom {
- pgdEntryIndex = w.pageTables.archPageTables.root
+ pgdEntryIndex = w.pageTables.upperSharedPageTables.root
}
for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ {