Diffstat (limited to 'pkg/sentry/vfs/mount_unsafe.go')
-rwxr-xr-x  pkg/sentry/vfs/mount_unsafe.go  356
1 file changed, 356 insertions, 0 deletions
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
new file mode 100755
index 000000000..b0511aa40
--- /dev/null
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -0,0 +1,356 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build go1.12
+// +build !go1.14
+
+// Check go:linkname function signatures when updating Go version.
+
+package vfs
+
+import (
+    "fmt"
+    "math/bits"
+    "reflect"
+    "sync/atomic"
+    "unsafe"
+
+    "gvisor.dev/gvisor/third_party/gvsync"
+)
+
+// mountKey represents the location at which a Mount is mounted. It is
+// structurally identical to VirtualDentry, but stores its fields as
+// unsafe.Pointer since mutators synchronize with VFS path traversal using
+// seqcounts.
+type mountKey struct {
+    parent unsafe.Pointer // *Mount
+    point  unsafe.Pointer // *Dentry
+}
+
+// Invariant: mnt.key's fields are nil. parent and point are non-nil.
+func (mnt *Mount) storeKey(parent *Mount, point *Dentry) {
+    atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent))
+    atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point))
+}
+
+func (mnt *Mount) loadKey() (*Mount, *Dentry) {
+    return (*Mount)(atomic.LoadPointer(&mnt.key.parent)), (*Dentry)(atomic.LoadPointer(&mnt.key.point))
+}
+
+func (mnt *Mount) parent() *Mount {
+    return (*Mount)(atomic.LoadPointer(&mnt.key.parent))
+}
+
+func (mnt *Mount) point() *Dentry {
+    return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
+}
+
+// mountTable maps (mount parent, mount point) pairs to mounts. It supports
+// efficient concurrent lookup, even in the presence of concurrent mutators
+// (provided mutation is sufficiently uncommon).
+//
+// mountTable.Init() must be called on new mountTables before use.
+type mountTable struct {
+    // mountTable is implemented as a seqcount-protected hash table that
+    // resolves collisions with linear probing, featuring Robin Hood insertion
+    // and backward shift deletion. These minimize probe length variance,
+    // significantly improving the performance of linear probing at high load
+    // factors. (mountTable doesn't use bucketing, which is the other major
+    // technique commonly used in high-performance hash tables; the efficiency
+    // of bucketing is largely due to SIMD lookup, and Go lacks both SIMD
+    // intrinsics and inline assembly, limiting the performance of this
+    // approach.)
+
+    seq  gvsync.SeqCount
+    seed uint32 // for hashing keys
+
+    // size holds both length (number of elements) and capacity (number of
+    // slots): capacity is stored as its base-2 log (referred to as order) in
+    // the least significant bits of size, and length is stored in the
+    // remaining bits. Go defines bit shifts >= width of shifted unsigned
+    // operand as shifting to 0, which differs from x86's SHL, so the Go
+    // compiler inserts a bounds check for each bit shift unless we mask order
+    // anyway (cf. runtime.bucketShift()), and length isn't used by lookup;
+    // thus this bit packing gets us more bits for the length (vs. storing
+    // length and cap in separate uint32s) for ~free.
+    size uint64
+
+    slots unsafe.Pointer // []mountSlot; never nil after Init
+}
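The size packing described in the comment above is compact but easy to misread, so a standalone illustration may help. The following is a minimal sketch, not gVisor code: the pack/unpack helpers are hypothetical, and it assumes a 64-bit target where the order field occupies the low 6 bits.

    package main

    import "fmt"

    const (
        sizeOrderBits = 6 // low bits hold log2(capacity) on a 64-bit target
        sizeOrderMask = (1 << sizeOrderBits) - 1
    )

    // unpack splits a packed size word into (length, capacity). Hypothetical
    // helper; mountTable does the same masking and shifting inline.
    func unpack(size uint64) (length uint64, capacity uintptr) {
        return size >> sizeOrderBits, uintptr(1) << (size & sizeOrderMask)
    }

    func main() {
        // Order 2 (capacity 4) holding 3 elements: length lives in the high
        // bits, order in the low bits.
        size := uint64(3)<<sizeOrderBits | 2
        length, capacity := unpack(size)
        fmt.Println(length, capacity) // 3 4
    }

Because lookup only ever needs the order, masking with sizeOrderMask keeps the hot path to a single masked shift, which also satisfies the Go compiler's shift-bounds analysis as the comment notes.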
+
+type mountSlot struct {
+    // We don't store keys in slots; instead, we just check Mount.parent and
+    // Mount.point directly. Any practical use of lookup will need to touch
+    // Mounts anyway, and comparing hashes means that false positives are
+    // extremely rare, so this isn't an extra cache line touch overall.
+    value unsafe.Pointer // *Mount
+    hash  uintptr
+}
+
+const (
+    mtSizeOrderBits = 6 // log2 of pointer size in bits
+    mtSizeOrderMask = (1 << mtSizeOrderBits) - 1
+    mtSizeOrderOne  = 1
+    mtSizeLenLSB    = mtSizeOrderBits
+    mtSizeLenOne    = 1 << mtSizeLenLSB
+    mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB
+
+    mountSlotBytes = unsafe.Sizeof(mountSlot{})
+    mountKeyBytes  = unsafe.Sizeof(mountKey{})
+
+    // Tuning parameters.
+    //
+    // Essentially every mountTable will contain at least /proc, /sys, and
+    // /dev/shm, so there is ~no reason for mtInitCap to be < 4.
+    mtInitOrder  = 2
+    mtInitCap    = 1 << mtInitOrder
+    mtMaxLoadNum = 13
+    mtMaxLoadDen = 16
+)
+
+func init() {
+    // We can't just define mtSizeOrderBits as follows because Go doesn't have
+    // constexpr.
+    if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) {
+        panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits))
+    }
+    if bits.OnesCount(uint(mountSlotBytes)) != 1 {
+        panic(fmt.Sprintf("sizeof(mountSlot) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes))
+    }
+    if mtInitCap <= 1 {
+        panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap))
+    }
+    if mtMaxLoadNum >= mtMaxLoadDen {
+        panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen))
+    }
+}
+
+// Init must be called exactly once on each mountTable before use.
+func (mt *mountTable) Init() {
+    mt.seed = rand32()
+    mt.size = mtInitOrder
+    mt.slots = newMountTableSlots(mtInitCap)
+}
+
+func newMountTableSlots(cap uintptr) unsafe.Pointer {
+    slice := make([]mountSlot, cap, cap)
+    hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
+    return unsafe.Pointer(hdr.Data)
+}
+
+// Lookup returns the Mount with the given parent, mounted at the given point.
+// If no such Mount exists, Lookup returns nil.
+//
+// Lookup may be called even if there are concurrent mutators of mt.
+func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
+    key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
+    hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
+
+loop:
+    for {
+        epoch := mt.seq.BeginRead()
+        size := atomic.LoadUint64(&mt.size)
+        slots := atomic.LoadPointer(&mt.slots)
+        if !mt.seq.ReadOk(epoch) {
+            continue
+        }
+        tcap := uintptr(1) << (size & mtSizeOrderMask)
+        mask := tcap - 1
+        off := (hash & mask) * mountSlotBytes
+        offmask := mask * mountSlotBytes
+        for {
+            // This avoids bounds checking.
+            slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
+            slotValue := atomic.LoadPointer(&slot.value)
+            slotHash := atomic.LoadUintptr(&slot.hash)
+            if !mt.seq.ReadOk(epoch) {
+                // The element we're looking for might have been moved into a
+                // slot we've previously checked, so restart entirely.
+                continue loop
+            }
+            if slotValue == nil {
+                return nil
+            }
+            if slotHash == hash {
+                mount := (*Mount)(slotValue)
+                var mountKey mountKey
+                mountKey.parent = atomic.LoadPointer(&mount.key.parent)
+                mountKey.point = atomic.LoadPointer(&mount.key.point)
+                if !mt.seq.ReadOk(epoch) {
+                    continue loop
+                }
+                if key == mountKey {
+                    return mount
+                }
+            }
+            off = (off + mountSlotBytes) & offmask
+        }
+    }
+}
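Lookup's retry structure is the standard seqcount (seqlock) reader protocol: snapshot, validate, and restart if a writer intervened. The sketch below uses a simplified stand-in for gvsync.SeqCount, assuming conventional seqlock semantics (counter is odd while a writer is active); the real gvsync implementation may differ in detail.

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    // seqCount is a toy seqlock: writers make the counter odd for the
    // duration of a critical section; readers retry if the counter was odd
    // when they began, or changed before they finished.
    type seqCount struct{ val uint32 }

    func (s *seqCount) BeginRead() uint32 {
        for {
            if epoch := atomic.LoadUint32(&s.val); epoch&1 == 0 {
                return epoch
            }
        }
    }

    func (s *seqCount) ReadOk(epoch uint32) bool {
        return atomic.LoadUint32(&s.val) == epoch
    }

    func (s *seqCount) BeginWrite() { atomic.AddUint32(&s.val, 1) }
    func (s *seqCount) EndWrite()   { atomic.AddUint32(&s.val, 1) }

    func main() {
        var seq seqCount
        var data int

        // Writer critical section, as in Insert/Remove.
        seq.BeginWrite()
        data = 42
        seq.EndWrite()

        // Reader loop, shaped like mountTable.Lookup: snapshot, validate,
        // retry from the top if the epoch moved.
        for {
            epoch := seq.BeginRead()
            v := data // in Lookup this is the slot/key snapshot
            if seq.ReadOk(epoch) {
                fmt.Println("read", v) // read 42
                break
            }
        }
    }

Note that Lookup validates twice per probe: once after loading the slot, and again after loading the candidate Mount's key, since either may be torn by a concurrent writer.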
+
+// Insert inserts the given mount into mt.
+//
+// Preconditions: There are no concurrent mutators of mt. mt must not already
+// contain a Mount with the same mount point and parent.
+func (mt *mountTable) Insert(mount *Mount) {
+    hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
+
+    // We're under the maximum load factor if:
+    //
+    //          (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen
+    // (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap
+    tlen := mt.size >> mtSizeLenLSB
+    order := mt.size & mtSizeOrderMask
+    tcap := uintptr(1) << order
+    if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) {
+        // Atomically insert the new element into the table.
+        mt.seq.BeginWrite()
+        atomic.AddUint64(&mt.size, mtSizeLenOne)
+        mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash)
+        mt.seq.EndWrite()
+        return
+    }
+
+    // Otherwise, we have to expand. Double the number of slots in the new
+    // table.
+    newOrder := order + 1
+    if newOrder > mtSizeOrderMask {
+        panic("mount table size overflow")
+    }
+    newCap := uintptr(1) << newOrder
+    newSlots := newMountTableSlots(newCap)
+    // Copy existing elements to the new table.
+    oldCur := mt.slots
+    // Go does not permit pointers to the end of allocated objects, so we
+    // must use a pointer to the last element of the old table. The
+    // following expression is equivalent to
+    // `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2
+    // arithmetic instructions instead of 3.
+    oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes))
+    for {
+        oldSlot := (*mountSlot)(oldCur)
+        if oldSlot.value != nil {
+            // Don't need to lock mt.seq yet since newSlots isn't visible
+            // to readers.
+            mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash)
+        }
+        if oldCur == oldLast {
+            break
+        }
+        oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes)
+    }
+    // Insert the new element into the new table.
+    mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash)
+    // Atomically switch to the new table.
+    mt.seq.BeginWrite()
+    atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne)
+    atomic.StorePointer(&mt.slots, newSlots)
+    mt.seq.EndWrite()
+}
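Insert's load-factor test rearranges the fraction into pure integer arithmetic, exploiting cap == 1 << order to turn the multiplication by cap into a shift. A small sketch with hypothetical names, checking the same inequality:

    package main

    import "fmt"

    const (
        maxLoadNum = 13
        maxLoadDen = 16
    )

    // underMaxLoad reports whether inserting one more element keeps the table
    // at or below the 13/16 maximum load factor, using the same rearrangement
    // as mountTable.Insert: since cap == 1<<order,
    //     (len+1)/cap <= num/den  <=>  (len+1)*den <= num<<order.
    func underMaxLoad(length, order uint64) bool {
        return (length+1)*maxLoadDen <= maxLoadNum<<order
    }

    func main() {
        fmt.Println(underMaxLoad(12, 4)) // true: 13/16 is exactly the limit
        fmt.Println(underMaxLoad(13, 4)) // false: 14/16 > 13/16, so expand
    }

Avoiding division keeps the common (no-expansion) insert path cheap, and the integer form is exact where a floating-point comparison would not be.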
+
+// Preconditions: There are no concurrent mutators of the table (slots, cap).
+// If the table is visible to readers, then mt.seq must be in a writer critical
+// section. cap must be a power of 2.
+func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) {
+    mask := cap - 1
+    off := (hash & mask) * mountSlotBytes
+    offmask := mask * mountSlotBytes
+    disp := uintptr(0)
+    for {
+        slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
+        slotValue := slot.value
+        if slotValue == nil {
+            atomic.StorePointer(&slot.value, value)
+            atomic.StoreUintptr(&slot.hash, hash)
+            return
+        }
+        // If we've been displaced farther from our first-probed slot than the
+        // element stored in this one, swap elements and switch to inserting
+        // the replaced one. (This is Robin Hood insertion.)
+        slotHash := slot.hash
+        slotDisp := ((off / mountSlotBytes) - slotHash) & mask
+        if disp > slotDisp {
+            atomic.StorePointer(&slot.value, value)
+            atomic.StoreUintptr(&slot.hash, hash)
+            value = slotValue
+            hash = slotHash
+            disp = slotDisp
+        }
+        off = (off + mountSlotBytes) & offmask
+        disp++
+    }
+}
+
+// Remove removes the given mount from mt.
+//
+// Preconditions: There are no concurrent mutators of mt. mt must contain
+// mount.
+func (mt *mountTable) Remove(mount *Mount) {
+    hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
+    tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
+    mask := tcap - 1
+    slots := mt.slots
+    off := (hash & mask) * mountSlotBytes
+    offmask := mask * mountSlotBytes
+    for {
+        slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
+        slotValue := slot.value
+        if slotValue == unsafe.Pointer(mount) {
+            // Found the element to remove. Move all subsequent elements
+            // backward until we either find an empty slot, or an element that
+            // is already in its first-probed slot. (This is backward shift
+            // deletion.)
+            mt.seq.BeginWrite()
+            for {
+                nextOff := (off + mountSlotBytes) & offmask
+                nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff))
+                nextSlotValue := nextSlot.value
+                if nextSlotValue == nil {
+                    break
+                }
+                nextSlotHash := nextSlot.hash
+                if (nextOff / mountSlotBytes) == (nextSlotHash & mask) {
+                    break
+                }
+                atomic.StorePointer(&slot.value, nextSlotValue)
+                atomic.StoreUintptr(&slot.hash, nextSlotHash)
+                off = nextOff
+                slot = nextSlot
+            }
+            atomic.StorePointer(&slot.value, nil)
+            atomic.AddUint64(&mt.size, mtSizeLenNegOne)
+            mt.seq.EndWrite()
+            return
+        }
+        if checkInvariants && slotValue == nil {
+            panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount))
+        }
+        off = (off + mountSlotBytes) & offmask
+    }
+}
+
+//go:linkname memhash runtime.memhash
+func memhash(p unsafe.Pointer, seed, s uintptr) uintptr
+
+//go:linkname rand32 runtime.fastrand
+func rand32() uint32
+
+// This is copy/pasted from runtime.noescape(), and is needed because arguments
+// apparently escape from all functions defined by linkname.
+//
+//go:nosplit
+func noescape(p unsafe.Pointer) unsafe.Pointer {
+    x := uintptr(p)
+    return unsafe.Pointer(x ^ 0)
+}
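The diff interleaves the two probing tricks with seqcount publication and unsafe pointer arithmetic, so a single-threaded toy may make them easier to see in isolation. Everything below is hypothetical and simplified: it keeps only the probe-distance logic of mtInsertLocked (Robin Hood insertion) and mountTable.Remove (backward shift deletion), indexing a plain array by slot rather than by byte offset, with no seqcount protection.

    package main

    import "fmt"

    const tableSize = 8 // power of 2, so wraparound is a mask

    type entry struct {
        used bool
        hash uintptr
        key  string
    }

    type table [tableSize]entry

    // dist is the probe distance of an element sitting in slot, whose
    // first-probed ("home") slot is home, accounting for wraparound.
    func dist(slot, home uintptr) uintptr { return (slot - home) & (tableSize - 1) }

    func (t *table) insert(hash uintptr, key string) {
        off := hash & (tableSize - 1)
        d := uintptr(0)
        for {
            e := &t[off]
            if !e.used {
                *e = entry{true, hash, key}
                return
            }
            if rd := dist(off, e.hash&(tableSize-1)); d > rd {
                // Robin Hood: we are poorer (farther from home) than the
                // resident, so steal its slot and continue inserting it.
                hash, e.hash = e.hash, hash
                key, e.key = e.key, key
                d = rd
            }
            off = (off + 1) & (tableSize - 1)
            d++
        }
    }

    // remove assumes hash is present, mirroring mountTable.Remove's
    // precondition that mt contains mount.
    func (t *table) remove(hash uintptr) {
        off := hash & (tableSize - 1)
        for t[off].hash != hash {
            off = (off + 1) & (tableSize - 1)
        }
        for {
            next := (off + 1) & (tableSize - 1)
            n := &t[next]
            if !n.used || dist(next, n.hash&(tableSize-1)) == 0 {
                break // empty, or already in its home slot
            }
            t[off] = *n // backward shift
            off = next
        }
        t[off] = entry{}
    }

    func main() {
        var t table
        t.insert(1, "a")
        t.insert(9, "b") // collides with "a": 9&7 == 1
        t.insert(2, "c") // displaced to slot 3 by "b"
        t.remove(1)      // "b" and "c" both shift back one slot
        fmt.Println(t)
    }

The payoff of pairing the two techniques, as the mountTable comment says, is low probe-length variance: Robin Hood keeps displacements balanced on insertion, and backward shift deletion removes the holes (or tombstones) that would otherwise lengthen probes after removals.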