summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/platform
diff options
context:
space:
mode:
authorgVisor bot <gvisor-bot@google.com>2021-09-23 21:12:10 +0000
committergVisor bot <gvisor-bot@google.com>2021-09-23 21:12:10 +0000
commita7232056d6e7b6a823927b6d0338ac765b42ca7b (patch)
tree2cdb5f06175a53546776b715a160404fd7ab1b0b /pkg/sentry/platform
parent275222631a31e1cc29a02776bae47631229df191 (diff)
parent93ac1557751a0c17a85f49d715b96833acf39dc6 (diff)
Merge release-20210921.0-25-g93ac15577 (automated)
Diffstat (limited to 'pkg/sentry/platform')
-rw-r--r--pkg/sentry/platform/kvm/atomicptr_machine_unsafe.go39
-rw-r--r--pkg/sentry/platform/kvm/bluepill.go3
-rw-r--r--pkg/sentry/platform/kvm/bluepill_arm64.s34
-rw-r--r--pkg/sentry/platform/kvm/bluepill_impl_amd64.s27
-rw-r--r--pkg/sentry/platform/kvm/bluepill_unsafe.go32
-rw-r--r--pkg/sentry/platform/kvm/kvm_unsafe_state_autogen.go35
-rw-r--r--pkg/sentry/platform/kvm/machine.go136
-rw-r--r--pkg/sentry/platform/kvm/machine_amd64.go25
-rw-r--r--pkg/sentry/platform/kvm/machine_amd64_unsafe.go12
-rw-r--r--pkg/sentry/platform/kvm/machine_arm64.go120
-rw-r--r--pkg/sentry/platform/kvm/machine_arm64_unsafe.go12
-rw-r--r--pkg/sentry/platform/kvm/machine_unsafe.go43
-rw-r--r--pkg/sentry/platform/kvm/physical_map.go3
13 files changed, 442 insertions, 79 deletions
diff --git a/pkg/sentry/platform/kvm/atomicptr_machine_unsafe.go b/pkg/sentry/platform/kvm/atomicptr_machine_unsafe.go
new file mode 100644
index 000000000..a6f37e528
--- /dev/null
+++ b/pkg/sentry/platform/kvm/atomicptr_machine_unsafe.go
@@ -0,0 +1,39 @@
+package kvm
+
+import (
+ "sync/atomic"
+ "unsafe"
+)
+
+// A machineAtomicPtr is a pointer to a machine that can be atomically loaded
+// and stored. The zero value of a machineAtomicPtr represents nil.
+//
+// Note that copying a machineAtomicPtr by value performs a non-atomic read of
+// the stored pointer, which is unsafe if Store() can be called concurrently;
+// in this case, do `dst.Store(src.Load())` instead.
+//
+// +stateify savable
+type machineAtomicPtr struct {
+ // ptr holds the *machine as an unsafe.Pointer so that it can be accessed
+ // with atomic.LoadPointer and atomic.StorePointer. The state tag makes the
+ // field be saved and restored as a *machine through savePtr and loadPtr.
+ ptr unsafe.Pointer `state:".(*machine)"`
+}
+
+// savePtr returns the current pointer. It is called by the generated
+// StateSave to obtain the value to save for the ptr field.
+func (p *machineAtomicPtr) savePtr() *machine {
+ return p.Load()
+}
+
+// loadPtr stores v as the current pointer. It is called by the generated
+// StateLoad with the restored value of the ptr field.
+func (p *machineAtomicPtr) loadPtr(v *machine) {
+ p.Store(v)
+}
+
+// Load returns the value set by the most recent Store. It returns nil if there
+// has been no previous call to Store.
+//
+//go:nosplit
+func (p *machineAtomicPtr) Load() *machine {
+ return (*machine)(atomic.LoadPointer(&p.ptr))
+}
+
+// Store sets the value returned by Load to x.
+func (p *machineAtomicPtr) Store(x *machine) {
+ atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x))
+}
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
index bb9967b9f..826997e77 100644
--- a/pkg/sentry/platform/kvm/bluepill.go
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -61,6 +61,9 @@ var (
// This is called by bluepillHandler.
savedHandler uintptr
+ // savedSigsysHandler is a pointer to the previous handler for SIGSYS signals.
+ savedSigsysHandler uintptr
+
// dieTrampolineAddr is the address of dieTrampoline.
dieTrampolineAddr uintptr
)
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s
index 308f2a951..9690e3772 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.s
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.s
@@ -29,9 +29,12 @@
// Only limited use of the context is done in the assembly stub below, most is
// done in the Go handlers.
#define SIGINFO_SIGNO 0x0
+#define SIGINFO_CODE 0x8
#define CONTEXT_PC 0x1B8
#define CONTEXT_R0 0xB8
+#define SYS_MMAP 222
+
// getTLS returns the value of TPIDR_EL0 register.
TEXT ·getTLS(SB),NOSPLIT,$0-8
MRS TPIDR_EL0, R1
@@ -98,6 +101,37 @@ TEXT ·addrOfSighandler(SB), $0-8
MOVD R0, ret+0(FP)
RET
+// sigsysHandler is the SIGSYS signal handler. It passes mmap system calls
+// that were trapped by seccomp to seccompMmapHandler and forwards every other
+// SIGSYS to the previously installed SIGSYS handler.
+//
+// The arguments are the following:
+//
+// R0 - The signal number.
+// R1 - Pointer to siginfo_t structure.
+// R2 - Pointer to ucontext structure.
+//
+TEXT ·sigsysHandler(SB),NOSPLIT,$0
+	// si_code should be SYS_SECCOMP.
+	MOVD SIGINFO_CODE(R1), R7
+	CMPW $1, R7
+	BNE fallback
+
+	// Only mmap is handled here.
+	// NOTE(review): the syscall number is taken from the live R8 rather than
+	// from the saved context in R2; this presumably relies on R8 still
+	// holding the interrupted syscall number on handler entry -- confirm.
+	CMPW $SYS_MMAP, R8
+	BNE fallback
+
+	MOVD R2, 8(RSP) // First argument (context).
+	BL ·seccompMmapHandler(SB) // Call the handler.
+
+	RET
+
+fallback:
+	// Jump to the previous SIGSYS handler. seccompMmapRules saves it in
+	// savedSigsysHandler when it installs this handler, so that is the
+	// variable to jump through (not savedHandler).
+	MOVD ·savedSigsysHandler(SB), R7
+	B (R7)
+
+// func addrOfSigsysHandler() uintptr
+//
+// addrOfSigsysHandler returns the address of sigsysHandler.
+TEXT ·addrOfSigsysHandler(SB), $0-8
+ MOVD $·sigsysHandler(SB), R0
+ MOVD R0, ret+0(FP)
+ RET
+
// dieTrampoline: see bluepill.go, bluepill_arm64_unsafe.go for documentation.
TEXT ·dieTrampoline(SB),NOSPLIT,$0
// R0: Fake the old PC as caller
diff --git a/pkg/sentry/platform/kvm/bluepill_impl_amd64.s b/pkg/sentry/platform/kvm/bluepill_impl_amd64.s
index 99f254342..7d90e2b1f 100644
--- a/pkg/sentry/platform/kvm/bluepill_impl_amd64.s
+++ b/pkg/sentry/platform/kvm/bluepill_impl_amd64.s
@@ -102,6 +102,8 @@
// This is checked as the source of the fault.
#define CLI $0xfa
+#define SYS_MMAP 9
+
// See bluepill.go.
TEXT ·bluepill(SB),NOSPLIT,$0
begin:
@@ -165,6 +167,31 @@ TEXT ·addrOfSighandler(SB), $0-8
MOVQ AX, ret+0(FP)
RET
+// sigsysHandler is the SIGSYS signal handler. It passes mmap system calls
+// that were trapped by seccomp to seccompMmapHandler and forwards every other
+// SIGSYS to the handler saved in savedSigsysHandler.
+//
+// SI is expected to point to the siginfo_t structure and DX to the ucontext
+// structure of the signal.
+TEXT ·sigsysHandler(SB),NOSPLIT,$0
+ // Check that si_code (at offset 0x8 of siginfo_t) is 1, which is the
+ // value the arm64 stub documents as SYS_SECCOMP.
+ MOVQ $1, CX
+ CMPL CX, 0x8(SI)
+ JNE fallback
+
+ // Only mmap is handled here; the syscall number is read from the RAX
+ // value saved in the signal context.
+ MOVL CONTEXT_RAX(DX), CX
+ CMPL CX, $SYS_MMAP
+ JNE fallback
+ PUSHQ DX // First argument (context).
+ CALL ·seccompMmapHandler(SB) // Call the handler.
+ POPQ DX // Discard the argument.
+ RET
+fallback:
+ // Jump to the previous signal handler.
+ XORQ CX, CX
+ MOVQ ·savedSigsysHandler(SB), AX
+ JMP AX
+
+// func addrOfSigsysHandler() uintptr
+//
+// addrOfSigsysHandler returns the address of sigsysHandler.
+TEXT ·addrOfSigsysHandler(SB), $0-8
+ MOVQ $·sigsysHandler(SB), AX
+ MOVQ AX, ret+0(FP)
+ RET
+
// dieTrampoline: see bluepill.go, bluepill_amd64_unsafe.go for documentation.
TEXT ·dieTrampoline(SB),NOSPLIT,$0
PUSHQ BX // First argument (vCPU).
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index 0f0c1e73b..e38ca05c0 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -193,36 +193,8 @@ func bluepillHandler(context unsafe.Pointer) {
return
}
- // Increment the fault count.
- atomic.AddUint32(&c.faults, 1)
-
- // For MMIO, the physical address is the first data item.
- physical = uintptr(c.runData.data[0])
- virtual, ok := handleBluepillFault(c.machine, physical, physicalRegions, _KVM_MEM_FLAGS_NONE)
- if !ok {
- c.die(bluepillArchContext(context), "invalid physical address")
- return
- }
-
- // We now need to fill in the data appropriately. KVM
- // expects us to provide the result of the given MMIO
- // operation in the runData struct. This is safe
- // because, if a fault occurs here, the same fault
- // would have occurred in guest mode. The kernel should
- // not create invalid page table mappings.
- data := (*[8]byte)(unsafe.Pointer(&c.runData.data[1]))
- length := (uintptr)((uint32)(c.runData.data[2]))
- write := (uint8)(((c.runData.data[2] >> 32) & 0xff)) != 0
- for i := uintptr(0); i < length; i++ {
- b := bytePtr(uintptr(virtual) + i)
- if write {
- // Write to the given address.
- *b = data[i]
- } else {
- // Read from the given address.
- data[i] = *b
- }
- }
+ c.die(bluepillArchContext(context), "exit_mmio")
+ return
case _KVM_EXIT_IRQ_WINDOW_OPEN:
bluepillStopGuest(c)
case _KVM_EXIT_SHUTDOWN:
diff --git a/pkg/sentry/platform/kvm/kvm_unsafe_state_autogen.go b/pkg/sentry/platform/kvm/kvm_unsafe_state_autogen.go
index ca1360c0c..4940ae3fc 100644
--- a/pkg/sentry/platform/kvm/kvm_unsafe_state_autogen.go
+++ b/pkg/sentry/platform/kvm/kvm_unsafe_state_autogen.go
@@ -4,3 +4,38 @@
// +build go1.12,go1.12
package kvm
+
+import (
+ "gvisor.dev/gvisor/pkg/state"
+)
+
+// StateTypeName returns the package-qualified name of machineAtomicPtr.
+func (p *machineAtomicPtr) StateTypeName() string {
+ return "pkg/sentry/platform/kvm.machineAtomicPtr"
+}
+
+// StateFields returns the names of the saved fields. The position of a name
+// in the slice is the index used for it by StateSave and StateLoad.
+func (p *machineAtomicPtr) StateFields() []string {
+ return []string{
+ "ptr",
+ }
+}
+
+// beforeSave is a no-op hook invoked at the start of StateSave.
+func (p *machineAtomicPtr) beforeSave() {}
+
+// StateSave saves the pointer returned by savePtr as field 0 ("ptr").
+//
+// +checklocksignore
+func (p *machineAtomicPtr) StateSave(stateSinkObject state.Sink) {
+ p.beforeSave()
+ var ptrValue *machine
+ ptrValue = p.savePtr()
+ stateSinkObject.SaveValue(0, ptrValue)
+}
+
+// afterLoad is a no-op.
+func (p *machineAtomicPtr) afterLoad() {}
+
+// StateLoad loads field 0 ("ptr") as a *machine and hands the loaded value to
+// loadPtr through the callback passed to LoadValue.
+//
+// +checklocksignore
+func (p *machineAtomicPtr) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.LoadValue(0, new(*machine), func(y interface{}) { p.loadPtr(y.(*machine)) })
+}
+
+// init registers machineAtomicPtr with the state package.
+func init() {
+ state.Register((*machineAtomicPtr)(nil))
+}
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index d67563958..dcf34015d 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -17,15 +17,19 @@ package kvm
import (
"fmt"
"runtime"
+ gosync "sync"
"sync/atomic"
"golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/atomicbitops"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/procid"
"gvisor.dev/gvisor/pkg/ring0"
"gvisor.dev/gvisor/pkg/ring0/pagetables"
+ "gvisor.dev/gvisor/pkg/safecopy"
+ "gvisor.dev/gvisor/pkg/seccomp"
ktime "gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -35,6 +39,9 @@ type machine struct {
// fd is the vm fd.
fd int
+ // machinePoolIndex is the index in the machinePool array.
+ machinePoolIndex uint32
+
// nextSlot is the next slot for setMemoryRegion.
//
// This must be accessed atomically. If nextSlot is ^uint32(0), then
@@ -192,6 +199,10 @@ func (m *machine) newVCPU() *vCPU {
return c // Done.
}
+// readOnlyGuestRegions contains regions that have to be mapped read-only into
+// the guest physical address space. Right now, it is used on arm64 only.
+var readOnlyGuestRegions []region
+
// newMachine returns a new VM context.
func newMachine(vm int) (*machine, error) {
// Create the machine.
@@ -227,6 +238,10 @@ func newMachine(vm int) (*machine, error) {
m.upperSharedPageTables.MarkReadOnlyShared()
m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress)
+ // Install seccomp rules to trap runtime mmap system calls. They will
+ // be handled by seccompMmapHandler.
+ seccompMmapRules(m)
+
// Apply the physical mappings. Note that these mappings may point to
// guest physical addresses that are not actually available. These
// physical pages are mapped on demand, see kernel_unsafe.go.
@@ -241,32 +256,11 @@ func newMachine(vm int) (*machine, error) {
return true // Keep iterating.
})
- var physicalRegionsReadOnly []physicalRegion
- var physicalRegionsAvailable []physicalRegion
-
- physicalRegionsReadOnly = rdonlyRegionsForSetMem()
- physicalRegionsAvailable = availableRegionsForSetMem()
-
- // Map all read-only regions.
- for _, r := range physicalRegionsReadOnly {
- m.mapPhysical(r.physical, r.length, physicalRegionsReadOnly, _KVM_MEM_READONLY)
- }
-
// Ensure that the currently mapped virtual regions are actually
// available in the VM. Note that this doesn't guarantee no future
// faults, however it should guarantee that everything is available to
// ensure successful vCPU entry.
- applyVirtualRegions(func(vr virtualRegion) {
- if excludeVirtualRegion(vr) {
- return // skip region.
- }
-
- for _, r := range physicalRegionsReadOnly {
- if vr.virtual == r.virtual {
- return
- }
- }
-
+ mapRegion := func(vr region, flags uint32) {
for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
physical, length, ok := translateToPhysical(virtual)
if !ok {
@@ -280,9 +274,32 @@ func newMachine(vm int) (*machine, error) {
}
// Ensure the physical range is mapped.
- m.mapPhysical(physical, length, physicalRegionsAvailable, _KVM_MEM_FLAGS_NONE)
+ m.mapPhysical(physical, length, physicalRegions, flags)
virtual += length
}
+ }
+
+ for _, vr := range readOnlyGuestRegions {
+ mapRegion(vr, _KVM_MEM_READONLY)
+ }
+
+ applyVirtualRegions(func(vr virtualRegion) {
+ if excludeVirtualRegion(vr) {
+ return // skip region.
+ }
+ for _, r := range readOnlyGuestRegions {
+ if vr.virtual == r.virtual {
+ return
+ }
+ }
+ // Take into account that the stack can grow down.
+ if vr.filename == "[stack]" {
+ vr.virtual -= 1 << 20
+ vr.length += 1 << 20
+ }
+
+ mapRegion(vr.region, 0)
+
})
// Initialize architecture state.
@@ -352,6 +369,10 @@ func (m *machine) mapPhysical(physical, length uintptr, phyRegions []physicalReg
func (m *machine) Destroy() {
runtime.SetFinalizer(m, nil)
+ machinePoolMu.Lock()
+ machinePool[m.machinePoolIndex].Store(nil)
+ machinePoolMu.Unlock()
+
// Destroy vCPUs.
for _, c := range m.vCPUsByID {
if c == nil {
@@ -683,3 +704,72 @@ func (c *vCPU) setSystemTimeLegacy() error {
}
}
}
+
+// machinePoolSize is the number of slots in machinePool. seccompMmapRules
+// panics if a machine has to be registered while every slot is in use.
+const machinePoolSize = 16
+
+// The machine pool is enumerated from the seccompMmapHandler signal handler,
+// which reads its entries and its length with atomic loads.
+var (
+ // machinePool holds the registered machines. A nil entry is a free slot:
+ // seccompMmapRules fills slots and machine.Destroy resets them to nil.
+ machinePool [machinePoolSize]machineAtomicPtr
+ // machinePoolLen is the number of leading machinePool slots that have
+ // been handed out so far. It is only ever incremented, and is accessed
+ // atomically.
+ machinePoolLen uint32
+ // machinePoolMu serializes the writers of machinePool and machinePoolLen
+ // (seccompMmapRules and machine.Destroy).
+ machinePoolMu sync.Mutex
+ // seccompMmapRulesOnce ensures that the SIGSYS handler and the seccomp
+ // filter are installed only once.
+ seccompMmapRulesOnce gosync.Once
+)
+
+// sigsysHandler is the SIGSYS signal handler; it is implemented in assembly.
+func sigsysHandler()
+
+// addrOfSigsysHandler returns the address of sigsysHandler; it is implemented
+// in assembly.
+func addrOfSigsysHandler() uintptr
+
+// seccompMmapRules adds seccomp rules to trap mmap system calls that will be
+// handled in seccompMmapHandler, and registers m in machinePool so that the
+// handler can map newly created regions into it.
+//
+// The SIGSYS handler and the seccomp filter are installed on the first call
+// only; every call registers m. It panics if the handler or the filter cannot
+// be installed, or if machinePool has no free slot.
+func seccompMmapRules(m *machine) {
+	seccompMmapRulesOnce.Do(func() {
+		// Install the handler.
+		if err := safecopy.ReplaceSignalHandler(unix.SIGSYS, addrOfSigsysHandler(), &savedSigsysHandler); err != nil {
+			panic(fmt.Sprintf("Unable to set handler for signal %d: %v", unix.SIGSYS, err))
+		}
+		rules := []seccomp.RuleSet{
+			// Trap mmap system calls and handle them in seccompMmapHandler.
+			{
+				Rules: seccomp.SyscallRules{
+					unix.SYS_MMAP: {
+						{
+							seccomp.MatchAny{},
+							seccomp.MatchAny{},
+							seccomp.MatchAny{},
+							// MAP_DENYWRITE is ignored by the kernel and
+							// used only for filtering: the rule matches
+							// calls whose flags do not contain it, so the
+							// call re-issued by seccompMmapSyscall (which
+							// sets it) is not trapped again.
+							seccomp.MaskedEqual(unix.MAP_DENYWRITE, 0),
+						},
+					},
+				},
+				Action: linux.SECCOMP_RET_TRAP,
+			},
+		}
+		instrs, err := seccomp.BuildProgram(rules, linux.SECCOMP_RET_ALLOW, linux.SECCOMP_RET_ALLOW)
+		if err != nil {
+			panic(fmt.Sprintf("failed to build rules: %v", err))
+		}
+		// Perform the actual installation.
+		if err := seccomp.SetFilter(instrs); err != nil {
+			panic(fmt.Sprintf("failed to set filter: %v", err))
+		}
+	})
+
+	machinePoolMu.Lock()
+	defer machinePoolMu.Unlock()
+
+	// Reuse the first slot that machine.Destroy has reset to nil, if any.
+	n := atomic.LoadUint32(&machinePoolLen)
+	i := uint32(0)
+	for ; i < n; i++ {
+		if machinePool[i].Load() == nil {
+			break
+		}
+	}
+	if i == n {
+		// No free slot among the ones handed out so far: take a new one.
+		if i == machinePoolSize {
+			panic("machinePool is full")
+		}
+		// The length is published before the slot is filled; the signal
+		// handler skips nil entries, so it never sees a stale machine.
+		atomic.AddUint32(&machinePoolLen, 1)
+	}
+	machinePool[i].Store(m)
+	m.machinePoolIndex = i
+}
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index a96634381..ab1e036b7 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -309,22 +309,6 @@ func loadByte(ptr *byte) byte {
return *ptr
}
-// prefaultFloatingPointState touches each page of the floating point state to
-// be sure that its physical pages are mapped.
-//
-// Otherwise the kernel can trigger KVM_EXIT_MMIO and an instruction that
-// triggered a fault will be emulated by the kvm kernel code, but it can't
-// emulate instructions like xsave and xrstor.
-//
-//go:nosplit
-func prefaultFloatingPointState(data *fpu.State) {
- size := len(*data)
- for i := 0; i < size; i += hostarch.PageSize {
- loadByte(&(*data)[i])
- }
- loadByte(&(*data)[size-1])
-}
-
// SwitchToUser unpacks architectural-details.
func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo) (hostarch.AccessType, error) {
// Check for canonical addresses.
@@ -355,11 +339,6 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo)
// allocations occur.
entersyscall()
bluepill(c)
- // The root table physical page has to be mapped to not fault in iret
- // or sysret after switching into a user address space. sysret and
- // iret are in the upper half that is global and already mapped.
- switchOpts.PageTables.PrefaultRootTable()
- prefaultFloatingPointState(switchOpts.FloatingPointState)
vector = c.CPU.SwitchToUser(switchOpts)
exitsyscall()
@@ -522,3 +501,7 @@ func (m *machine) getNewVCPU() *vCPU {
}
return nil
}
+
+// archPhysicalRegions is the arch-specific hook that computePhysicalRegions
+// applies to the regions it has computed. This implementation returns
+// physicalRegions unchanged.
+func archPhysicalRegions(physicalRegions []physicalRegion) []physicalRegion {
+ return physicalRegions
+}
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index de798bb2c..fbacea9ad 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -161,3 +161,15 @@ func (c *vCPU) getSystemRegisters(sregs *systemRegs) unix.Errno {
}
return 0
}
+
+// seccompMmapSyscall re-issues the mmap system call that was trapped by
+// seccomp, taking the syscall number and the arguments from the registers
+// saved in the signal context, and writes the result to the saved RAX.
+//
+// It returns the address returned by mmap, the requested length (the saved
+// RSI) and the errno of the call, which is zero on success.
+//
+//go:nosplit
+func seccompMmapSyscall(context unsafe.Pointer) (uintptr, uintptr, unix.Errno) {
+	ctx := bluepillArchContext(context)
+
+	// MAP_DENYWRITE is deprecated and ignored by kernel. We use it only for
+	// seccomp filters.
+	addr, _, e := unix.RawSyscall6(uintptr(ctx.Rax), uintptr(ctx.Rdi), uintptr(ctx.Rsi),
+		uintptr(ctx.Rdx), uintptr(ctx.R10)|unix.MAP_DENYWRITE, uintptr(ctx.R8), uintptr(ctx.R9))
+	if e != 0 {
+		// RawSyscall6 reports a failure as addr == -1 plus an errno, whereas
+		// the interrupted code expects the raw kernel convention of -errno
+		// in the return register. Writing addr back would turn every
+		// failure into -1, i.e. -EPERM.
+		ctx.Rax = uint64(-uintptr(e))
+	} else {
+		ctx.Rax = uint64(addr)
+	}
+
+	return addr, uintptr(ctx.Rsi), e
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index 7937a8481..08d98c479 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -110,18 +110,128 @@ func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
return phyRegions
}
+// archPhysicalRegions fills readOnlyGuestRegions and allocates separate
+// physical regions for them.
+//
+// readOnlyGuestRegions receives every non-excluded virtual region that is not
+// writable. The returned list covers the same ranges as physicalRegions, but
+// each input region is cut at the start and at the end of every read-only
+// region that overlaps it.
+//
+// NOTE(review): the read-only index i only moves forward, so this presumably
+// relies on both the virtual regions and physicalRegions being sorted by
+// ascending virtual address -- confirm.
+func archPhysicalRegions(physicalRegions []physicalRegion) []physicalRegion {
+ applyVirtualRegions(func(vr virtualRegion) {
+ if excludeVirtualRegion(vr) {
+ return // skip region.
+ }
+ if !vr.accessType.Write {
+ readOnlyGuestRegions = append(readOnlyGuestRegions, vr.region)
+ }
+ })
+
+ // rdRegions shares its backing array with readOnlyGuestRegions; the append
+ // below does not change the length of readOnlyGuestRegions itself.
+ rdRegions := readOnlyGuestRegions[:]
+
+ // Add an unreachable region. It acts as a sentinel so that rdRegions[i]
+ // stays in range once all real read-only regions have been passed.
+ rdRegions = append(rdRegions, region{
+ virtual: 0xffffffffffffffff,
+ length: 0,
+ })
+
+ var regions []physicalRegion
+ // addValidRegion appends the [virtual, virtual+length) part of r to
+ // regions, keeping the virtual-to-physical offset of r. Empty parts are
+ // dropped.
+ addValidRegion := func(r *physicalRegion, virtual, length uintptr) {
+ if length == 0 {
+ return
+ }
+ regions = append(regions, physicalRegion{
+ region: region{
+ virtual: virtual,
+ length: length,
+ },
+ physical: r.physical + (virtual - r.virtual),
+ })
+ }
+ i := 0
+ for _, pr := range physicalRegions {
+ start := pr.virtual
+ end := pr.virtual + pr.length
+ for start < end {
+ rdRegion := rdRegions[i]
+ rdStart := rdRegion.virtual
+ rdEnd := rdRegion.virtual + rdRegion.length
+ if rdEnd <= start {
+ // The read-only region lies entirely below start; move on to
+ // the next one.
+ i++
+ continue
+ }
+ if rdStart > start {
+ // Emit the part in front of the read-only region, capped at
+ // the end of the physical region.
+ newEnd := rdStart
+ if end < rdStart {
+ newEnd = end
+ }
+ addValidRegion(&pr, start, newEnd-start)
+ start = rdStart
+ continue
+ }
+ if rdEnd < end {
+ // start is inside the read-only region, which ends before the
+ // physical region does: emit the overlap and continue after it.
+ addValidRegion(&pr, start, rdEnd-start)
+ start = rdEnd
+ continue
+ }
+ // The read-only region covers the rest of the physical region.
+ addValidRegion(&pr, start, end-start)
+ start = end
+ }
+ }
+
+ return regions
+}
+
// Get all available physicalRegions.
-func availableRegionsForSetMem() (phyRegions []physicalRegion) {
- var excludeRegions []region
+func availableRegionsForSetMem() []physicalRegion {
+ var excludedRegions []region
applyVirtualRegions(func(vr virtualRegion) {
if !vr.accessType.Write {
- excludeRegions = append(excludeRegions, vr.region)
+ excludedRegions = append(excludedRegions, vr.region)
}
})
- phyRegions = computePhysicalRegions(excludeRegions)
+ // Add an unreachable region.
+ excludedRegions = append(excludedRegions, region{
+ virtual: 0xffffffffffffffff,
+ length: 0,
+ })
- return phyRegions
+ var regions []physicalRegion
+ addValidRegion := func(r *physicalRegion, virtual, length uintptr) {
+ if length == 0 {
+ return
+ }
+ regions = append(regions, physicalRegion{
+ region: region{
+ virtual: virtual,
+ length: length,
+ },
+ physical: r.physical + (virtual - r.virtual),
+ })
+ }
+ i := 0
+ for _, pr := range physicalRegions {
+ start := pr.virtual
+ end := pr.virtual + pr.length
+ for start < end {
+ er := excludedRegions[i]
+ excludeEnd := er.virtual + er.length
+ excludeStart := er.virtual
+ if excludeEnd < start {
+ i++
+ continue
+ }
+ if excludeStart < start {
+ start = excludeEnd
+ i++
+ continue
+ }
+ rend := excludeStart
+ if rend > end {
+ rend = end
+ }
+ addValidRegion(&pr, start, rend-start)
+ start = excludeEnd
+ }
+ }
+
+ return regions
}
// nonCanonical generates a canonical address return.
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 1a4a9ce7d..7e8e19dcb 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -333,3 +333,15 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *linux.SignalInfo)
}
}
+
+// seccompMmapSyscall re-issues the mmap system call that was trapped by
+// seccomp, taking the syscall number (saved R8) and the arguments (saved
+// R0-R5) from the signal context, and writes the result to the saved R0.
+//
+// It returns the address returned by mmap, the requested length (the saved
+// R1) and the errno of the call, which is zero on success.
+//
+//go:nosplit
+func seccompMmapSyscall(context unsafe.Pointer) (uintptr, uintptr, unix.Errno) {
+	ctx := bluepillArchContext(context)
+
+	// MAP_DENYWRITE is deprecated and ignored by kernel. We use it only for
+	// seccomp filters.
+	addr, _, e := unix.RawSyscall6(uintptr(ctx.Regs[8]), uintptr(ctx.Regs[0]), uintptr(ctx.Regs[1]),
+		uintptr(ctx.Regs[2]), uintptr(ctx.Regs[3])|unix.MAP_DENYWRITE, uintptr(ctx.Regs[4]), uintptr(ctx.Regs[5]))
+	if e != 0 {
+		// RawSyscall6 reports a failure as addr == -1 plus an errno, whereas
+		// the interrupted code expects the raw kernel convention of -errno
+		// in the return register. Writing addr back would turn every
+		// failure into -1, i.e. -EPERM.
+		ctx.Regs[0] = uint64(-uintptr(e))
+	} else {
+		ctx.Regs[0] = uint64(addr)
+	}
+
+	return addr, uintptr(ctx.Regs[1]), e
+}
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index cc3a1253b..cf3a4e7c9 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -171,3 +171,46 @@ func (c *vCPU) setSignalMask() error {
return nil
}
+
+// seccompMmapHandler is a signal handler for runtime mmap system calls
+// that are trapped by seccomp.
+//
+// It executes the mmap syscall with specified arguments and maps a new region
+// to the guest. The region is mapped into every machine that is currently
+// registered in machinePool.
+//
+// context is the signal context pointer passed by the sigsysHandler assembly
+// stub.
+//
+//go:nosplit
+func seccompMmapHandler(context unsafe.Pointer) {
+ addr, length, errno := seccompMmapSyscall(context)
+ if errno != 0 {
+ // mmap failed, so there is nothing to map. seccompMmapSyscall has
+ // already stored the syscall result in the context.
+ return
+ }
+
+ // machinePoolLen is re-read on every iteration; free (nil) slots are
+ // skipped.
+ for i := uint32(0); i < atomic.LoadUint32(&machinePoolLen); i++ {
+ m := machinePool[i].Load()
+ if m == nil {
+ continue
+ }
+
+ // Map the new region to the guest.
+ vr := region{
+ virtual: addr,
+ length: length,
+ }
+ for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
+ // Note that this length shadows the mmap length above: it is the
+ // length returned by translateToPhysical for virtual.
+ physical, length, ok := translateToPhysical(virtual)
+ if !ok {
+ // This must be an invalid region that was
+ // knocked out by creation of the physical map.
+ // Returning here leaves the handler, so the remaining
+ // machines are not visited either.
+ return
+ }
+ if virtual+length > vr.virtual+vr.length {
+ // Cap the length to the end of the area.
+ length = vr.virtual + vr.length - virtual
+ }
+
+ // Ensure the physical range is mapped.
+ m.mapPhysical(physical, length, physicalRegions, _KVM_MEM_FLAGS_NONE)
+ virtual += length
+ }
+ }
+}
diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go
index d812e6c26..9864d1258 100644
--- a/pkg/sentry/platform/kvm/physical_map.go
+++ b/pkg/sentry/platform/kvm/physical_map.go
@@ -168,6 +168,9 @@ func computePhysicalRegions(excludedRegions []region) (physicalRegions []physica
}
addValidRegion(lastExcludedEnd, ring0.MaximumUserAddress-lastExcludedEnd)
+ // Do arch-specific actions on physical regions.
+ physicalRegions = archPhysicalRegions(physicalRegions)
+
// Dump our all physical regions.
for _, r := range physicalRegions {
log.Infof("physicalRegion: virtual [%x,%x) => physical [%x,%x)",