pkg/sentry/vfs/mount_unsafe.go


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build go1.12
// +build !go1.15

// Check go:linkname function signatures when updating Go version.

package vfs

import (
	"fmt"
	"math/bits"
	"reflect"
	"sync/atomic"
	"unsafe"

	"gvisor.dev/gvisor/pkg/gohacks"
	"gvisor.dev/gvisor/pkg/sync"
)

// mountKey represents the location at which a Mount is mounted. It is
// structurally identical to VirtualDentry, but stores its fields as
// unsafe.Pointer since mutators synchronize with VFS path traversal using
// seqcounts.
type mountKey struct {
	parent unsafe.Pointer // *Mount
	point  unsafe.Pointer // *Dentry
}

func (mnt *Mount) parent() *Mount {
	return (*Mount)(atomic.LoadPointer(&mnt.key.parent))
}

func (mnt *Mount) point() *Dentry {
	return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
}

func (mnt *Mount) loadKey() VirtualDentry {
	return VirtualDentry{
		mount:  mnt.parent(),
		dentry: mnt.point(),
	}
}

// Invariant: mnt.key.parent == nil. vd.Ok().
func (mnt *Mount) storeKey(vd VirtualDentry) {
	atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount))
	atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry))
}

// mountTable maps (mount parent, mount point) pairs to mounts. It supports
// efficient concurrent lookup, even in the presence of concurrent mutators
// (provided mutation is sufficiently uncommon).
//
// mountTable.Init() must be called on new mountTables before use.
//
// +stateify savable
type mountTable struct {
	// mountTable is implemented as a seqcount-protected hash table that
	// resolves collisions with linear probing, featuring Robin Hood insertion
	// and backward shift deletion. These minimize probe length variance,
	// significantly improving the performance of linear probing at high load
	// factors. (mountTable doesn't use bucketing, which is the other major
	// technique commonly used in high-performance hash tables; the efficiency
	// of bucketing is largely due to SIMD lookup, and Go lacks both SIMD
	// intrinsics and inline assembly, limiting the performance of this
	// approach.)

	seq  sync.SeqCount `state:"nosave"`
	seed uint32        // for hashing keys

	// size holds both length (number of elements) and capacity (number of
	// slots): capacity is stored as its base-2 log (referred to as order) in
	// the least significant bits of size, and length is stored in the
	// remaining bits. Go defines bit shifts >= width of shifted unsigned
	// operand as shifting to 0, which differs from x86's SHL, so the Go
	// compiler inserts a bounds check for each bit shift unless we mask order
	// anyway (cf. runtime.bucketShift()), and length isn't used by lookup;
	// thus this bit packing gets us more bits for the length (vs. storing
	// length and cap in separate uint32s) for ~free.
	size uint64

	slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init
}

type mountSlot struct {
	// We don't store keys in slots; instead, we just check Mount.parent and
	// Mount.point directly. Any practical use of lookup will need to touch
	// Mounts anyway, and comparing hashes means that false positives are
	// extremely rare, so this isn't an extra cache line touch overall.
	value unsafe.Pointer // *Mount
	hash  uintptr
}

const (
	mtSizeOrderBits = 6 // log2 of pointer size in bits
	mtSizeOrderMask = (1 << mtSizeOrderBits) - 1
	mtSizeOrderOne  = 1
	mtSizeLenLSB    = mtSizeOrderBits
	mtSizeLenOne    = 1 << mtSizeLenLSB
	mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB

	mountSlotBytes = unsafe.Sizeof(mountSlot{})
	mountKeyBytes  = unsafe.Sizeof(mountKey{})

	// Tuning parameters.
	//
	// Essentially every mountTable will contain at least /proc, /sys, and
	// /dev/shm, so there is ~no reason for mtInitCap to be < 4.
	mtInitOrder  = 2
	mtInitCap    = 1 << mtInitOrder
	mtMaxLoadNum = 13
	mtMaxLoadDen = 16
)

func init() {
	// We can't just define mtSizeOrderBits as follows because Go doesn't have
	// constexpr.
	if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) {
		panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits))
	}
	if bits.OnesCount(uint(mountSlotBytes)) != 1 {
		panic(fmt.Sprintf("sizeof(mountSlotBytes) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes))
	}
	if mtInitCap <= 1 {
		panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap))
	}
	if mtMaxLoadNum >= mtMaxLoadDen {
		panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen))
	}
}

// Init must be called exactly once on each mountTable before use.
func (mt *mountTable) Init() {
	mt.seed = rand32()
	mt.size = mtInitOrder
	mt.slots = newMountTableSlots(mtInitCap)
}

func newMountTableSlots(cap uintptr) unsafe.Pointer {
	slice := make([]mountSlot, cap, cap)
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
	return unsafe.Pointer(hdr.Data)
}

// Lookup returns the Mount with the given parent, mounted at the given point.
// If no such Mount exists, Lookup returns nil.
//
// Lookup may be called even if there are concurrent mutators of mt.
func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
	key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
	hash := memhash(gohacks.Noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)

loop:
	for {
		epoch := mt.seq.BeginRead()
		size := atomic.LoadUint64(&mt.size)
		slots := atomic.LoadPointer(&mt.slots)
		if !mt.seq.ReadOk(epoch) {
			continue
		}
		tcap := uintptr(1) << (size & mtSizeOrderMask)
		mask := tcap - 1
		off := (hash & mask) * mountSlotBytes
		offmask := mask * mountSlotBytes
		for {
			// This avoids bounds checking.
			slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
			slotValue := atomic.LoadPointer(&slot.value)
			slotHash := atomic.LoadUintptr(&slot.hash)
			if !mt.seq.ReadOk(epoch) {
				// The element we're looking for might have been moved into a
				// slot we've previously checked, so restart entirely.
				continue loop
			}
			if slotValue == nil {
				return nil
			}
			if slotHash == hash {
				mount := (*Mount)(slotValue)
				var mountKey mountKey
				mountKey.parent = atomic.LoadPointer(&mount.key.parent)
				mountKey.point = atomic.LoadPointer(&mount.key.point)
				if !mt.seq.ReadOk(epoch) {
					continue loop
				}
				if key == mountKey {
					return mount
				}
			}
			off = (off + mountSlotBytes) & offmask
		}
	}
}

// Insert inserts the given mount into mt.
//
// Preconditions: mt must not already contain a Mount with the same mount point
// and parent.
func (mt *mountTable) Insert(mount *Mount) {
	mt.seq.BeginWrite()
	mt.insertSeqed(mount)
	mt.seq.EndWrite()
}

// insertSeqed inserts the given mount into mt.
//
// Preconditions: mt.seq must be in a writer critical section. mt must not
// already contain a Mount with the same mount point and parent.
func (mt *mountTable) insertSeqed(mount *Mount) {
	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)

	// We're under the maximum load factor if:
	//
	//          (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen
	// (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap
	tlen := mt.size >> mtSizeLenLSB
	order := mt.size & mtSizeOrderMask
	tcap := uintptr(1) << order
	if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) {
		// Atomically insert the new element into the table.
		atomic.AddUint64(&mt.size, mtSizeLenOne)
		mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash)
		return
	}

	// Otherwise, we have to expand. Double the number of slots in the new
	// table.
	newOrder := order + 1
	if newOrder > mtSizeOrderMask {
		panic("mount table size overflow")
	}
	newCap := uintptr(1) << newOrder
	newSlots := newMountTableSlots(newCap)
	// Copy existing elements to the new table.
	oldCur := mt.slots
	// Go does not permit pointers to the end of allocated objects, so we
	// must use a pointer to the last element of the old table. The
	// following expression is equivalent to
	// `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2
	// arithmetic instructions instead of 3.
	oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes))
	for {
		oldSlot := (*mountSlot)(oldCur)
		if oldSlot.value != nil {
			mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash)
		}
		if oldCur == oldLast {
			break
		}
		oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes)
	}
	// Insert the new element into the new table.
	mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash)
	// Switch to the new table.
	atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne)
	atomic.StorePointer(&mt.slots, newSlots)
}

// Preconditions: There are no concurrent mutators of the table (slots, cap).
// If the table is visible to readers, then mt.seq must be in a writer critical
// section. cap must be a power of 2.
func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) {
	mask := cap - 1
	off := (hash & mask) * mountSlotBytes
	offmask := mask * mountSlotBytes
	disp := uintptr(0)
	for {
		slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
		slotValue := slot.value
		if slotValue == nil {
			atomic.StorePointer(&slot.value, value)
			atomic.StoreUintptr(&slot.hash, hash)
			return
		}
		// If we've been displaced farther from our first-probed slot than the
		// element stored in this one, swap elements and switch to inserting
		// the replaced one. (This is Robin Hood insertion.)
		slotHash := slot.hash
		slotDisp := ((off / mountSlotBytes) - slotHash) & mask
		if disp > slotDisp {
			atomic.StorePointer(&slot.value, value)
			atomic.StoreUintptr(&slot.hash, hash)
			value = slotValue
			hash = slotHash
			disp = slotDisp
		}
		off = (off + mountSlotBytes) & offmask
		disp++
	}
}

// Remove removes the given mount from mt.
//
// Preconditions: mt must contain mount.
func (mt *mountTable) Remove(mount *Mount) {
	mt.seq.BeginWrite()
	mt.removeSeqed(mount)
	mt.seq.EndWrite()
}

// removeSeqed removes the given mount from mt.
//
// Preconditions: mt.seq must be in a writer critical section. mt must contain
// mount.
func (mt *mountTable) removeSeqed(mount *Mount) {
	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
	tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
	mask := tcap - 1
	slots := mt.slots
	off := (hash & mask) * mountSlotBytes
	offmask := mask * mountSlotBytes
	for {
		slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
		slotValue := slot.value
		if slotValue == unsafe.Pointer(mount) {
			// Found the element to remove. Move all subsequent elements
			// backward until we either find an empty slot, or an element that
			// is already in its first-probed slot. (This is backward shift
			// deletion.)
			for {
				nextOff := (off + mountSlotBytes) & offmask
				nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff))
				nextSlotValue := nextSlot.value
				if nextSlotValue == nil {
					break
				}
				nextSlotHash := nextSlot.hash
				if (nextOff / mountSlotBytes) == (nextSlotHash & mask) {
					break
				}
				atomic.StorePointer(&slot.value, nextSlotValue)
				atomic.StoreUintptr(&slot.hash, nextSlotHash)
				off = nextOff
				slot = nextSlot
			}
			atomic.StorePointer(&slot.value, nil)
			atomic.AddUint64(&mt.size, mtSizeLenNegOne)
			return
		}
		if checkInvariants && slotValue == nil {
			panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount))
		}
		off = (off + mountSlotBytes) & offmask
	}
}

//go:linkname memhash runtime.memhash
func memhash(p unsafe.Pointer, seed, s uintptr) uintptr

//go:linkname rand32 runtime.fastrand
func rand32() uint32