// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kvm

import (
"fmt"
"runtime"
"sync"
"sync/atomic"
"syscall"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
"gvisor.googlesource.com/gvisor/pkg/tmutex"
)

// machine contains state associated with the VM as a whole.
type machine struct {
	// fd is the vm fd.
	fd int

	// nextSlot is the next slot for setMemoryRegion.
	//
	// This must be accessed atomically. If nextSlot is ^uint32(0), then
	// slots are currently being updated, and the caller should retry.
	nextSlot uint32

	// kernel is the set of global structures.
	kernel *ring0.Kernel

	// mappingCache is used for mapPhysical.
	mappingCache sync.Map

	// mu protects vCPUs.
	mu sync.Mutex

	// vCPUs are the machine vCPUs.
	//
	// This is eventually keyed by system TID, but is initially indexed by
	// the negative vCPU id. This is merely an optimization: collisions are
	// not possible here, and even if they were, it would not matter.
	vCPUs map[uint64]*vCPU
}

const (
	// vCPUReady is the lock value for an available vCPU.
	//
	// Legal transition: vCPUGuest (bluepill).
	vCPUReady uintptr = iota

	// vCPUGuest indicates the vCPU is in guest mode.
	//
	// Legal transitions: vCPUReady (bluepill), vCPUWaiter (wait).
	vCPUGuest

	// vCPUWaiter indicates that the vCPU should be released.
	//
	// Legal transition: vCPUReady (bluepill).
	vCPUWaiter
)
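
// For orientation, the state machine implied by the transitions above (an
// informational sketch only; the transitions themselves are performed by the
// bluepill/redpill and wait paths):
//
//	vCPUReady ---bluepill--> vCPUGuest ---wait--> vCPUWaiter
//	    ^                        |                    |
//	    +-------bluepill---------+------bluepill------+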

// vCPU is a single KVM vCPU.
type vCPU struct {
	// CPU is the kernel CPU data.
	//
	// This must be the first element of this structure; it is referenced
	// by the bluepill code (see bluepill_amd64.s).
	ring0.CPU

	// fd is the vCPU fd.
	fd int

	// tid is the last set tid.
	tid uint64

	// switches is a count of world switches (informational only).
	switches uint32

	// faults is a count of world faults (informational only).
	faults uint32

	// state is the vCPU state; all states are described above.
	state uintptr

	// runData for this vCPU.
	runData *runData

	// machine associated with this vCPU.
	machine *machine

	// mu applies across get/put; it does not protect the fields above.
	mu tmutex.Mutex
}

// newMachine returns a new VM context.
func newMachine(vm int, vCPUs int) (*machine, error) {
// Create the machine.
m := &machine{
fd: vm,
vCPUs: make(map[uint64]*vCPU),
}
if vCPUs > _KVM_NR_VCPUS {
// Hard cap at KVM's limit.
vCPUs = _KVM_NR_VCPUS
}
	if n := 2 * runtime.NumCPU(); vCPUs > n {
		// Cap at twice the number of logical CPUs. Otherwise we're
		// just wasting memory and thrashing. (There may be scheduling
		// issues when you've got > n active threads.)
		vCPUs = n
	}
m.kernel = ring0.New(ring0.KernelOpts{
PageTables: pagetables.New(m, pagetablesOpts),
})
// Initialize architecture state.
if err := m.initArchState(vCPUs); err != nil {
m.Destroy()
return nil, err
}
// Create all the vCPUs.
for id := 0; id < vCPUs; id++ {
// Create the vCPU.
fd, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(vm), _KVM_CREATE_VCPU, uintptr(id))
if errno != 0 {
m.Destroy()
			return nil, fmt.Errorf("error creating vCPU: %v", errno)
}
c := &vCPU{
fd: int(fd),
machine: m,
}
c.mu.Init()
c.CPU.Init(m.kernel)
c.CPU.KernelSyscall = bluepillSyscall
c.CPU.KernelException = bluepillException
m.vCPUs[uint64(-id)] = c // See above.
// Ensure the signal mask is correct.
if err := c.setSignalMask(); err != nil {
m.Destroy()
return nil, err
}
// Initialize architecture state.
if err := c.initArchState(); err != nil {
m.Destroy()
return nil, err
}
// Map the run data.
runData, err := mapRunData(int(fd))
if err != nil {
m.Destroy()
return nil, err
}
c.runData = runData
}
// Apply the physical mappings. Note that these mappings may point to
// guest physical addresses that are not actually available. These
// physical pages are mapped on demand, see kernel_unsafe.go.
applyPhysicalRegions(func(pr physicalRegion) bool {
// Map everything in the lower half.
m.kernel.PageTables.Map(usermem.Addr(pr.virtual), pr.length, false /* kernel */, usermem.AnyAccess, pr.physical)
		// And map everything in the upper half.
kernelAddr := usermem.Addr(ring0.KernelStartAddress | pr.virtual)
m.kernel.PageTables.Map(kernelAddr, pr.length, false /* kernel */, usermem.AnyAccess, pr.physical)
return true // Keep iterating.
})
// Ensure that the currently mapped virtual regions are actually
// available in the VM. Note that this doesn't guarantee no future
// faults, however it should guarantee that everything is available to
// ensure successful vCPU entry.
applyVirtualRegions(func(vr virtualRegion) {
if excludeVirtualRegion(vr) {
return // skip region.
}
for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
physical, length, ok := TranslateToPhysical(virtual)
if !ok {
// This must be an invalid region that was
// knocked out by creation of the physical map.
return
}
if virtual+length > vr.virtual+vr.length {
// Cap the length to the end of the area.
length = vr.virtual + vr.length - virtual
}
// Ensure the physical range is mapped.
m.mapPhysical(physical, length)
virtual += length
}
})
// Ensure the machine is cleaned up properly.
runtime.SetFinalizer(m, (*machine).Destroy)
return m, nil
}

// mapPhysical checks for the mapping of a physical range, and installs one if
// not available. This attempts to be efficient for calls in the hot path.
//
// This panics on error.
func (m *machine) mapPhysical(physical, length uintptr) {
for end := physical + length; physical < end; {
_, physicalStart, length, ok := calculateBluepillFault(m, physical)
if !ok {
// Should never happen.
panic("mapPhysical on unknown physical address")
}
if _, ok := m.mappingCache.LoadOrStore(physicalStart, true); !ok {
// Not present in the cache; requires setting the slot.
if _, ok := handleBluepillFault(m, physical); !ok {
panic("handleBluepillFault failed")
}
}
// Move to the next chunk.
physical = physicalStart + length
}
}

// Destroy frees associated resources.
//
// Destroy should only be called once all active users of the machine are gone.
// The machine object should not be used after calling Destroy.
//
// Precondition: all vCPUs must be returned to the machine.
func (m *machine) Destroy() {
runtime.SetFinalizer(m, nil)
// Destroy vCPUs.
for _, c := range m.vCPUs {
// Ensure the vCPU is not still running in guest mode. This is
// possible iff teardown has been done by other threads, and
// somehow a single thread has not executed any system calls.
c.wait()
// Teardown the vCPU itself.
switch state := c.State(); state {
case vCPUReady:
// Note that the runData may not be mapped if an error
// occurs during the middle of initialization.
if c.runData != nil {
if err := unmapRunData(c.runData); err != nil {
panic(fmt.Sprintf("error unmapping rundata: %v", err))
}
}
if err := syscall.Close(int(c.fd)); err != nil {
panic(fmt.Sprintf("error closing vCPU fd: %v", err))
}
case vCPUGuest, vCPUWaiter:
// Should never happen; waited above.
panic("vCPU disposed in guest state")
default:
// Should never happen; not a valid state.
panic(fmt.Sprintf("vCPU in invalid state: %v", state))
}
}
// Release host mappings.
if m.kernel.PageTables != nil {
m.kernel.PageTables.Release()
}
// vCPUs are gone: teardown machine state.
if err := syscall.Close(m.fd); err != nil {
panic(fmt.Sprintf("error closing VM fd: %v", err))
}
}

// Get gets an available vCPU.
func (m *machine) Get() (*vCPU, error) {
runtime.LockOSThread()
tid := procid.Current()
m.mu.Lock()
for {
// Check for an exact match.
if c := m.vCPUs[tid]; c != nil && c.mu.TryLock() {
m.mu.Unlock()
return c, nil
}
// Scan for an available vCPU.
for origTID, c := range m.vCPUs {
if c.LockInState(vCPUReady) {
delete(m.vCPUs, origTID)
m.vCPUs[tid] = c
m.mu.Unlock()
// We need to reload thread-local segments as
// we have origTID != tid and the vCPU state
// may be stale.
c.loadSegments()
atomic.StoreUint64(&c.tid, tid)
return c, nil
}
}
// Everything is busy executing user code (locked).
//
// We hold the pool lock here, so we should be able to kick something
// out of kernel mode and have it bounce into host mode when it tries
// to grab the vCPU again.
for _, c := range m.vCPUs {
if c.State() != vCPUWaiter {
c.Bounce()
}
}
// Give other threads an opportunity to run.
yield()
}
}

// Put puts the current vCPU.
func (m *machine) Put(c *vCPU) {
c.Unlock()
runtime.UnlockOSThread()
}
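
// A typical acquire/release sequence, as an illustrative sketch only (the
// real call sites live elsewhere in the platform code):
//
//	c, err := m.Get()
//	if err != nil {
//		return err
//	}
//	defer m.Put(c)
//	// ... switch into the guest and run code on c ...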

// State returns the current state.
func (c *vCPU) State() uintptr {
return atomic.LoadUintptr(&c.state)
}

// Lock locks the vCPU.
func (c *vCPU) Lock() {
c.mu.Lock()
}

// Invalidate invalidates caches.
func (c *vCPU) Invalidate() {
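	// Intentionally a no-op: there is nothing to invalidate here.
	// (Assumption: any necessary invalidation is handled on the next
	// world switch; see Bounce.)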
}

// LockInState locks the vCPU if it is in the given state and TryLock succeeds.
func (c *vCPU) LockInState(state uintptr) bool {
if c.State() == state && c.mu.TryLock() {
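		// Recheck under the lock: the state may have changed between
		// the unsynchronized check above and acquiring the lock.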
if c.State() != state {
c.mu.Unlock()
return false
}
return true
}
return false
}

// Unlock unlocks the given vCPU.
func (c *vCPU) Unlock() {
// Ensure we're out of guest mode, if necessary.
if c.State() == vCPUWaiter {
redpill() // Force guest mode exit.
}
c.mu.Unlock()
}

// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
func (c *vCPU) NotifyInterrupt() {
c.Bounce()
}

// pid is used below in Bounce.
var pid = syscall.Getpid()

// Bounce ensures that the vCPU bounces back to the kernel.
//
// In practice, this means returning EAGAIN from running user code. The vCPU
// will be unlocked and relocked, and the kernel is guaranteed to check for
// interrupt notifications (e.g. injected via NotifyInterrupt) and
// invalidations.
func (c *vCPU) Bounce() {
for {
if c.mu.TryLock() {
// We know that the vCPU must be in the kernel already,
// because the lock was not acquired. We specifically
// don't want to call bounce in this case, because it's
// not necessary to knock the vCPU out of guest mode.
c.mu.Unlock()
return
}
if state := c.State(); state == vCPUGuest || state == vCPUWaiter {
// We know that the vCPU was in guest mode, so a single signal
// interruption will guarantee that a transition takes place.
syscall.Tgkill(pid, int(atomic.LoadUint64(&c.tid)), bounceSignal)
return
}
// Someone holds the lock, but the vCPU is not yet transitioned
// into guest mode. It's in the critical section; give it time.
yield()
}
}