path: root/pkg/sentry/platform/kvm/bluepill_fault.go
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kvm

import (
	"sync/atomic"
	"syscall"

	"gvisor.dev/gvisor/pkg/sentry/usermem"
)

const (
	// faultBlockSize is the size used for servicing memory faults.
	//
	// This should be large enough to avoid frequent faults and avoid using
	// all available KVM slots (~512), but small enough that KVM does not
	// complain about slot sizes (~4GB). See handleBluepillFault for how
	// this block is used.
	faultBlockSize = 2 << 30

	// faultBlockMask is the mask for the fault blocks.
	//
	// This must be typed to avoid overflow complaints (ugh).
	faultBlockMask = ^uintptr(faultBlockSize - 1)
)
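
// The following sketch is illustrative and not part of the original file: it
// shows how faultBlockMask rounds an arbitrary physical address down to the
// start of its containing 2GB fault block. The function name is hypothetical.
// For example, a physical address of (2<<30)+0x1000 (just past 2GB) yields a
// block start of exactly 2<<30.
func exampleFaultBlockStart(physical uintptr) uintptr {
	return physical & faultBlockMask
}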

// yield yields the CPU.
//
//go:nosplit
func yield() {
	syscall.RawSyscall(syscall.SYS_SCHED_YIELD, 0, 0, 0)
}

// calculateBluepillFault calculates the fault address range.
//
//go:nosplit
func calculateBluepillFault(physical uintptr) (virtualStart, physicalStart, length uintptr, ok bool) {
	alignedPhysical := physical &^ uintptr(usermem.PageSize-1)
	for _, pr := range physicalRegions {
		end := pr.physical + pr.length
		if physical < pr.physical || physical >= end {
			continue
		}

		// Adjust the block to match our size.
		physicalStart = alignedPhysical & faultBlockMask
		if physicalStart < pr.physical {
			// Bound the starting point to the start of the region.
			physicalStart = pr.physical
		}
		virtualStart = pr.virtual + (physicalStart - pr.physical)
		physicalEnd := physicalStart + faultBlockSize
		if physicalEnd > end {
			physicalEnd = end
		}
		length = physicalEnd - physicalStart
		return virtualStart, physicalStart, length, true
	}

	return 0, 0, 0, false
}
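
// Illustrative sketch, not part of the original file: a hypothetical caller
// that uses calculateBluepillFault only to check whether a physical address
// falls inside some known physical region. It assumes physicalRegions has
// already been populated during platform initialization.
func examplePhysicalIsMapped(physical uintptr) bool {
	_, _, _, ok := calculateBluepillFault(physical)
	return ok
}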

// handleBluepillFault handles a physical fault.
//
// The corresponding virtual address is returned. This may throw on error.
//
//go:nosplit
func handleBluepillFault(m *machine, physical uintptr) (uintptr, bool) {
	// Paging fault: we need to map the underlying physical pages for this
	// fault. This all has to be done in this function because we're in a
	// signal handler context. (We can't call any functions that might
	// split the stack.)
	virtualStart, physicalStart, length, ok := calculateBluepillFault(physical)
	if !ok {
		return 0, false
	}

	// Set the KVM slot.
	//
	// First, we need to acquire the exclusive right to set a slot.  See
	// machine.nextSlot for information about the protocol.
	slot := atomic.SwapUint32(&m.nextSlot, ^uint32(0))
	for slot == ^uint32(0) {
		yield() // Race with another call.
		slot = atomic.SwapUint32(&m.nextSlot, ^uint32(0))
	}
	errno := m.setMemoryRegion(int(slot), physicalStart, length, virtualStart)
	if errno == 0 {
		// Successfully added region; we can increment nextSlot and
		// allow another set to proceed here.
		atomic.StoreUint32(&m.nextSlot, slot+1)
		return virtualStart + (physical - physicalStart), true
	}

	// Release our slot (still available).
	atomic.StoreUint32(&m.nextSlot, slot)

	switch errno {
	case syscall.EEXIST:
		// The region already exists. It's possible that we raced with
		// another vCPU here. We just revert nextSlot and return true,
		// because this must have been satisfied by some other vCPU.
		return virtualStart + (physical - physicalStart), true
	case syscall.EINVAL:
		throw("set memory region failed; out of slots")
	case syscall.ENOMEM:
		throw("set memory region failed: out of memory")
	case syscall.EFAULT:
		throw("set memory region failed: invalid physical range")
	default:
		throw("set memory region failed: unknown reason")
	}

	panic("unreachable")
}
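
// Illustrative sketch, not part of the original file: the nextSlot protocol
// used in handleBluepillFault, shown in isolation. The sentinel ^uint32(0)
// marks the counter as held; a swap that returns any other value grants
// exclusive ownership until the caller stores a value back. The function
// names and the bare *uint32 parameter are hypothetical simplifications of
// machine.nextSlot.
func exampleAcquireSlot(nextSlot *uint32) uint32 {
	slot := atomic.SwapUint32(nextSlot, ^uint32(0))
	for slot == ^uint32(0) {
		yield() // Another acquirer holds the counter; back off and retry.
		slot = atomic.SwapUint32(nextSlot, ^uint32(0))
	}
	return slot
}

// exampleReleaseSlot publishes the next slot value, allowing other acquirers
// to proceed. Passing slot+1 consumes the slot; passing slot returns it.
func exampleReleaseSlot(nextSlot *uint32, slot uint32) {
	atomic.StoreUint32(nextSlot, slot)
}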