1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
|
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build go1.12
// +build !go1.15
// Check go:linkname function signatures when updating Go version.
package vfs
import (
"fmt"
"math/bits"
"reflect"
"sync/atomic"
"unsafe"
"gvisor.dev/gvisor/pkg/syncutil"
)
// mountKey represents the location at which a Mount is mounted. It is
// structurally identical to VirtualDentry, but stores its fields as
// unsafe.Pointer since mutators synchronize with VFS path traversal using
// seqcounts.
type mountKey struct {
parent unsafe.Pointer // *Mount
point unsafe.Pointer // *Dentry
}
// Invariant: mnt.key's fields are nil. parent and point are non-nil.
func (mnt *Mount) storeKey(parent *Mount, point *Dentry) {
atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent))
atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point))
}
func (mnt *Mount) loadKey() (*Mount, *Dentry) {
return (*Mount)(atomic.LoadPointer(&mnt.key.parent)), (*Dentry)(atomic.LoadPointer(&mnt.key.point))
}
func (mnt *Mount) parent() *Mount {
return (*Mount)(atomic.LoadPointer(&mnt.key.parent))
}
func (mnt *Mount) point() *Dentry {
return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
}
// mountTable maps (mount parent, mount point) pairs to mounts. It supports
// efficient concurrent lookup, even in the presence of concurrent mutators
// (provided mutation is sufficiently uncommon).
//
// mountTable.Init() must be called on new mountTables before use.
type mountTable struct {
// mountTable is implemented as a seqcount-protected hash table that
// resolves collisions with linear probing, featuring Robin Hood insertion
// and backward shift deletion. These minimize probe length variance,
// significantly improving the performance of linear probing at high load
// factors. (mountTable doesn't use bucketing, which is the other major
// technique commonly used in high-performance hash tables; the efficiency
// of bucketing is largely due to SIMD lookup, and Go lacks both SIMD
// intrinsics and inline assembly, limiting the performance of this
// approach.)
seq syncutil.SeqCount
seed uint32 // for hashing keys
// size holds both length (number of elements) and capacity (number of
// slots): capacity is stored as its base-2 log (referred to as order) in
// the least significant bits of size, and length is stored in the
// remaining bits. Go defines bit shifts >= width of shifted unsigned
// operand as shifting to 0, which differs from x86's SHL, so the Go
// compiler inserts a bounds check for each bit shift unless we mask order
// anyway (cf. runtime.bucketShift()), and length isn't used by lookup;
// thus this bit packing gets us more bits for the length (vs. storing
// length and cap in separate uint32s) for ~free.
size uint64
slots unsafe.Pointer // []mountSlot; never nil after Init
}
type mountSlot struct {
// We don't store keys in slots; instead, we just check Mount.parent and
// Mount.point directly. Any practical use of lookup will need to touch
// Mounts anyway, and comparing hashes means that false positives are
// extremely rare, so this isn't an extra cache line touch overall.
value unsafe.Pointer // *Mount
hash uintptr
}
const (
mtSizeOrderBits = 6 // log2 of pointer size in bits
mtSizeOrderMask = (1 << mtSizeOrderBits) - 1
mtSizeOrderOne = 1
mtSizeLenLSB = mtSizeOrderBits
mtSizeLenOne = 1 << mtSizeLenLSB
mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB
mountSlotBytes = unsafe.Sizeof(mountSlot{})
mountKeyBytes = unsafe.Sizeof(mountKey{})
// Tuning parameters.
//
// Essentially every mountTable will contain at least /proc, /sys, and
// /dev/shm, so there is ~no reason for mtInitCap to be < 4.
mtInitOrder = 2
mtInitCap = 1 << mtInitOrder
mtMaxLoadNum = 13
mtMaxLoadDen = 16
)
func init() {
// We can't just define mtSizeOrderBits as follows because Go doesn't have
// constexpr.
if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) {
panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits))
}
if bits.OnesCount(uint(mountSlotBytes)) != 1 {
panic(fmt.Sprintf("sizeof(mountSlotBytes) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes))
}
if mtInitCap <= 1 {
panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap))
}
if mtMaxLoadNum >= mtMaxLoadDen {
panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen))
}
}
// Init must be called exactly once on each mountTable before use.
func (mt *mountTable) Init() {
mt.seed = rand32()
mt.size = mtInitOrder
mt.slots = newMountTableSlots(mtInitCap)
}
func newMountTableSlots(cap uintptr) unsafe.Pointer {
slice := make([]mountSlot, cap, cap)
hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
return unsafe.Pointer(hdr.Data)
}
// Lookup returns the Mount with the given parent, mounted at the given point.
// If no such Mount exists, Lookup returns nil.
//
// Lookup may be called even if there are concurrent mutators of mt.
func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
loop:
for {
epoch := mt.seq.BeginRead()
size := atomic.LoadUint64(&mt.size)
slots := atomic.LoadPointer(&mt.slots)
if !mt.seq.ReadOk(epoch) {
continue
}
tcap := uintptr(1) << (size & mtSizeOrderMask)
mask := tcap - 1
off := (hash & mask) * mountSlotBytes
offmask := mask * mountSlotBytes
for {
// This avoids bounds checking.
slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
slotValue := atomic.LoadPointer(&slot.value)
slotHash := atomic.LoadUintptr(&slot.hash)
if !mt.seq.ReadOk(epoch) {
// The element we're looking for might have been moved into a
// slot we've previously checked, so restart entirely.
continue loop
}
if slotValue == nil {
return nil
}
if slotHash == hash {
mount := (*Mount)(slotValue)
var mountKey mountKey
mountKey.parent = atomic.LoadPointer(&mount.key.parent)
mountKey.point = atomic.LoadPointer(&mount.key.point)
if !mt.seq.ReadOk(epoch) {
continue loop
}
if key == mountKey {
return mount
}
}
off = (off + mountSlotBytes) & offmask
}
}
}
// Insert inserts the given mount into mt.
//
// Preconditions: There are no concurrent mutators of mt. mt must not already
// contain a Mount with the same mount point and parent.
func (mt *mountTable) Insert(mount *Mount) {
hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
// We're under the maximum load factor if:
//
// (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen
// (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap
tlen := mt.size >> mtSizeLenLSB
order := mt.size & mtSizeOrderMask
tcap := uintptr(1) << order
if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) {
// Atomically insert the new element into the table.
mt.seq.BeginWrite()
atomic.AddUint64(&mt.size, mtSizeLenOne)
mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash)
mt.seq.EndWrite()
return
}
// Otherwise, we have to expand. Double the number of slots in the new
// table.
newOrder := order + 1
if newOrder > mtSizeOrderMask {
panic("mount table size overflow")
}
newCap := uintptr(1) << newOrder
newSlots := newMountTableSlots(newCap)
// Copy existing elements to the new table.
oldCur := mt.slots
// Go does not permit pointers to the end of allocated objects, so we
// must use a pointer to the last element of the old table. The
// following expression is equivalent to
// `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2
// arithmetic instructions instead of 3.
oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes))
for {
oldSlot := (*mountSlot)(oldCur)
if oldSlot.value != nil {
// Don't need to lock mt.seq yet since newSlots isn't visible
// to readers.
mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash)
}
if oldCur == oldLast {
break
}
oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes)
}
// Insert the new element into the new table.
mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash)
// Atomically switch to the new table.
mt.seq.BeginWrite()
atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne)
atomic.StorePointer(&mt.slots, newSlots)
mt.seq.EndWrite()
}
// Preconditions: There are no concurrent mutators of the table (slots, cap).
// If the table is visible to readers, then mt.seq must be in a writer critical
// section. cap must be a power of 2.
func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) {
mask := cap - 1
off := (hash & mask) * mountSlotBytes
offmask := mask * mountSlotBytes
disp := uintptr(0)
for {
slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
slotValue := slot.value
if slotValue == nil {
atomic.StorePointer(&slot.value, value)
atomic.StoreUintptr(&slot.hash, hash)
return
}
// If we've been displaced farther from our first-probed slot than the
// element stored in this one, swap elements and switch to inserting
// the replaced one. (This is Robin Hood insertion.)
slotHash := slot.hash
slotDisp := ((off / mountSlotBytes) - slotHash) & mask
if disp > slotDisp {
atomic.StorePointer(&slot.value, value)
atomic.StoreUintptr(&slot.hash, hash)
value = slotValue
hash = slotHash
disp = slotDisp
}
off = (off + mountSlotBytes) & offmask
disp++
}
}
// Remove removes the given mount from mt.
//
// Preconditions: There are no concurrent mutators of mt. mt must contain
// mount.
func (mt *mountTable) Remove(mount *Mount) {
hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
mask := tcap - 1
slots := mt.slots
off := (hash & mask) * mountSlotBytes
offmask := mask * mountSlotBytes
for {
slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
slotValue := slot.value
if slotValue == unsafe.Pointer(mount) {
// Found the element to remove. Move all subsequent elements
// backward until we either find an empty slot, or an element that
// is already in its first-probed slot. (This is backward shift
// deletion.)
mt.seq.BeginWrite()
for {
nextOff := (off + mountSlotBytes) & offmask
nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff))
nextSlotValue := nextSlot.value
if nextSlotValue == nil {
break
}
nextSlotHash := nextSlot.hash
if (nextOff / mountSlotBytes) == (nextSlotHash & mask) {
break
}
atomic.StorePointer(&slot.value, nextSlotValue)
atomic.StoreUintptr(&slot.hash, nextSlotHash)
off = nextOff
slot = nextSlot
}
atomic.StorePointer(&slot.value, nil)
atomic.AddUint64(&mt.size, mtSizeLenNegOne)
mt.seq.EndWrite()
return
}
if checkInvariants && slotValue == nil {
panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount))
}
off = (off + mountSlotBytes) & offmask
}
}
//go:linkname memhash runtime.memhash
func memhash(p unsafe.Pointer, seed, s uintptr) uintptr
//go:linkname rand32 runtime.fastrand
func rand32() uint32
// This is copy/pasted from runtime.noescape(), and is needed because arguments
// apparently escape from all functions defined by linkname.
//
//go:nosplit
func noescape(p unsafe.Pointer) unsafe.Pointer {
x := uintptr(p)
return unsafe.Pointer(x ^ 0)
}
|