Sentry virtual filesystem, v2

Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
author: Jamie Liu <jamieliu@google.com> 2019-07-18 15:09:14 -0700
committer: gVisor bot <gvisor-bot@google.com> 2019-07-18 15:10:29 -0700
commit: 163ab5e9bab4f14923433967656d20f169d0f904 (patch)
tree: 5e51b1573e48fe87fe0e277a32f13c78b0c2f058 /pkg/sentry/vfs/mount_unsafe.go
parent: 6f7e2bb388cb29a355dece8921671c0085f53ea9 (diff)
1 files changed, 356 insertions, 0 deletions
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
new file mode 100644
index 000000000..b0511aa40
--- /dev/null
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -0,0 +1,356 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build go1.12
+// +build !go1.14
+
+// Check go:linkname function signatures when updating Go version.
+
+package vfs
+
+import (
+	"fmt"
+	"math/bits"
+	"reflect"
+	"sync/atomic"
+	"unsafe"
+
+	"gvisor.dev/gvisor/third_party/gvsync"
+)
+
+// mountKey represents the location at which a Mount is mounted. It is
+// structurally identical to VirtualDentry, but stores its fields as
+// unsafe.Pointer since mutators synchronize with VFS path traversal using
+// seqcounts.
+type mountKey struct {
+	parent unsafe.Pointer // *Mount
+	point  unsafe.Pointer // *Dentry
+}
+
+// Invariant: mnt.key's fields are nil. parent and point are non-nil.
+func (mnt *Mount) storeKey(parent *Mount, point *Dentry) {
+	atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent))
+	atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point))
+}
+
+func (mnt *Mount) loadKey() (*Mount, *Dentry) {
+	return (*Mount)(atomic.LoadPointer(&mnt.key.parent)), (*Dentry)(atomic.LoadPointer(&mnt.key.point))
+}
+
+func (mnt *Mount) parent() *Mount {
+	return (*Mount)(atomic.LoadPointer(&mnt.key.parent))
+}
+
+func (mnt *Mount) point() *Dentry {
+	return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
+}
+
+// mountTable maps (mount parent, mount point) pairs to mounts. It supports
+// efficient concurrent lookup, even in the presence of concurrent mutators
+// (provided mutation is sufficiently uncommon).
+//
+// mountTable.Init() must be called on new mountTables before use.
+type mountTable struct {
+	// mountTable is implemented as a seqcount-protected hash table that
+	// resolves collisions with linear probing, featuring Robin Hood insertion
+	// and backward shift deletion. These minimize probe length variance,
+	// significantly improving the performance of linear probing at high load
+	// factors. (mountTable doesn't use bucketing, which is the other major
+	// technique commonly used in high-performance hash tables; the efficiency
+	// of bucketing is largely due to SIMD lookup, and Go lacks both SIMD
+	// intrinsics and inline assembly, limiting the performance of this
+	// approach.)
+
+	seq  gvsync.SeqCount
+	seed uint32 // for hashing keys
+
+	// size holds both length (number of elements) and capacity (number of
+	// slots): capacity is stored as its base-2 log (referred to as order) in
+	// the least significant bits of size, and length is stored in the
+	// remaining bits. Go defines bit shifts >= width of shifted unsigned
+	// operand as shifting to 0, which differs from x86's SHL, so the Go
+	// compiler inserts a bounds check for each bit shift unless we mask order
+	// anyway (cf. runtime.bucketShift()), and length isn't used by lookup;
+	// thus this bit packing gets us more bits for the length (vs. storing
+	// length and cap in separate uint32s) for ~free.
+	size uint64
+
+	slots unsafe.Pointer // []mountSlot; never nil after Init
+}
+
+type mountSlot struct {
+	// We don't store keys in slots; instead, we just check Mount.parent and
+	// Mount.point directly. Any practical use of lookup will need to touch
+	// Mounts anyway, and comparing hashes means that false positives are
+	// extremely rare, so this isn't an extra cache line touch overall.
+	value unsafe.Pointer // *Mount
+	hash  uintptr
+}
+
+const (
+	mtSizeOrderBits = 6 // log2 of pointer size in bits
+	mtSizeOrderMask = (1 << mtSizeOrderBits) - 1
+	mtSizeOrderOne  = 1
+	mtSizeLenLSB    = mtSizeOrderBits
+	mtSizeLenOne    = 1 << mtSizeLenLSB
+	mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB
+
+	mountSlotBytes = unsafe.Sizeof(mountSlot{})
+	mountKeyBytes  = unsafe.Sizeof(mountKey{})
+
+	// Tuning parameters.
+	//
+	// Essentially every mountTable will contain at least /proc, /sys, and
+	// /dev/shm, so there is ~no reason for mtInitCap to be < 4.
+	mtInitOrder  = 2
+	mtInitCap    = 1 << mtInitOrder
+	mtMaxLoadNum = 13
+	mtMaxLoadDen = 16
+)
+
+func init() {
+	// We can't just define mtSizeOrderBits as follows because Go doesn't have
+	// constexpr.
+	if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) {
+		panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits))
+	}
+	if bits.OnesCount(uint(mountSlotBytes)) != 1 {
+		panic(fmt.Sprintf("sizeof(mountSlotBytes) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes))
+	}
+	if mtInitCap <= 1 {
+		panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap))
+	}
+	if mtMaxLoadNum >= mtMaxLoadDen {
+		panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen))
+	}
+}
+
+// Init must be called exactly once on each mountTable before use.
+func (mt *mountTable) Init() {
+	mt.seed = rand32()
+	mt.size = mtInitOrder
+	mt.slots = newMountTableSlots(mtInitCap)
+}
+
+func newMountTableSlots(cap uintptr) unsafe.Pointer {
+	slice := make([]mountSlot, cap, cap)
+	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
+	return unsafe.Pointer(hdr.Data)
+}
+
+// Lookup returns the Mount with the given parent, mounted at the given point.
+// If no such Mount exists, Lookup returns nil.
+//
+// Lookup may be called even if there are concurrent mutators of mt.
+func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
+	key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
+	hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
+
+loop:
+	for {
+		epoch := mt.seq.BeginRead()
+		size := atomic.LoadUint64(&mt.size)
+		slots := atomic.LoadPointer(&mt.slots)
+		if !mt.seq.ReadOk(epoch) {
+			continue
+		}
+		tcap := uintptr(1) << (size & mtSizeOrderMask)
+		mask := tcap - 1
+		off := (hash & mask) * mountSlotBytes
+		offmask := mask * mountSlotBytes
+		for {
+			// This avoids bounds checking.
+			slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
+			slotValue := atomic.LoadPointer(&slot.value)
+			slotHash := atomic.LoadUintptr(&slot.hash)
+			if !mt.seq.ReadOk(epoch) {
+				// The element we're looking for might have been moved into a
+				// slot we've previously checked, so restart entirely.
+				continue loop
+			}
+			if slotValue == nil {
+				return nil
+			}
+			if slotHash == hash {
+				mount := (*Mount)(slotValue)
+				var mountKey mountKey
+				mountKey.parent = atomic.LoadPointer(&mount.key.parent)
+				mountKey.point = atomic.LoadPointer(&mount.key.point)
+				if !mt.seq.ReadOk(epoch) {
+					continue loop
+				}
+				if key == mountKey {
+					return mount
+				}
+			}
+			off = (off + mountSlotBytes) & offmask
+		}
+	}
+}
+
+// Insert inserts the given mount into mt.
+//
+// Preconditions: There are no concurrent mutators of mt. mt must not already
+// contain a Mount with the same mount point and parent.
+func (mt *mountTable) Insert(mount *Mount) {
+	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
+
+	// We're under the maximum load factor if:
+	//
+	//          (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen
+	// (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap
+	tlen := mt.size >> mtSizeLenLSB
+	order := mt.size & mtSizeOrderMask
+	tcap := uintptr(1) << order
+	if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) {
+		// Atomically insert the new element into the table.
+		mt.seq.BeginWrite()
+		atomic.AddUint64(&mt.size, mtSizeLenOne)
+		mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash)
+		mt.seq.EndWrite()
+		return
+	}
+
+	// Otherwise, we have to expand. Double the number of slots in the new
+	// table.
+	newOrder := order + 1
+	if newOrder > mtSizeOrderMask {
+		panic("mount table size overflow")
+	}
+	newCap := uintptr(1) << newOrder
+	newSlots := newMountTableSlots(newCap)
+	// Copy existing elements to the new table.
+	oldCur := mt.slots
+	// Go does not permit pointers to the end of allocated objects, so we
+	// must use a pointer to the last element of the old table. The
+	// following expression is equivalent to
+	// `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2
+	// arithmetic instructions instead of 3.
+	oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes))
+	for {
+		oldSlot := (*mountSlot)(oldCur)
+		if oldSlot.value != nil {
+			// Don't need to lock mt.seq yet since newSlots isn't visible
+			// to readers.
+			mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash)
+		}
+		if oldCur == oldLast {
+			break
+		}
+		oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes)
+	}
+	// Insert the new element into the new table.
+	mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash)
+	// Atomically switch to the new table.
+	mt.seq.BeginWrite()
+	atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne)
+	atomic.StorePointer(&mt.slots, newSlots)
+	mt.seq.EndWrite()
+}
+
+// Preconditions: There are no concurrent mutators of the table (slots, cap).
+// If the table is visible to readers, then mt.seq must be in a writer critical
+// section. cap must be a power of 2.
+func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) {
+	mask := cap - 1
+	off := (hash & mask) * mountSlotBytes
+	offmask := mask * mountSlotBytes
+	disp := uintptr(0)
+	for {
+		slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
+		slotValue := slot.value
+		if slotValue == nil {
+			atomic.StorePointer(&slot.value, value)
+			atomic.StoreUintptr(&slot.hash, hash)
+			return
+		}
+		// If we've been displaced farther from our first-probed slot than the
+		// element stored in this one, swap elements and switch to inserting
+		// the replaced one. (This is Robin Hood insertion.)
+		slotHash := slot.hash
+		slotDisp := ((off / mountSlotBytes) - slotHash) & mask
+		if disp > slotDisp {
+			atomic.StorePointer(&slot.value, value)
+			atomic.StoreUintptr(&slot.hash, hash)
+			value = slotValue
+			hash = slotHash
+			disp = slotDisp
+		}
+		off = (off + mountSlotBytes) & offmask
+		disp++
+	}
+}
+
+// Remove removes the given mount from mt.
+//
+// Preconditions: There are no concurrent mutators of mt. mt must contain
+// mount.
+func (mt *mountTable) Remove(mount *Mount) {
+	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
+	tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
+	mask := tcap - 1
+	slots := mt.slots
+	off := (hash & mask) * mountSlotBytes
+	offmask := mask * mountSlotBytes
+	for {
+		slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
+		slotValue := slot.value
+		if slotValue == unsafe.Pointer(mount) {
+			// Found the element to remove. Move all subsequent elements
+			// backward until we either find an empty slot, or an element that
+			// is already in its first-probed slot. (This is backward shift
+			// deletion.)
+			mt.seq.BeginWrite()
+			for {
+				nextOff := (off + mountSlotBytes) & offmask
+				nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff))
+				nextSlotValue := nextSlot.value
+				if nextSlotValue == nil {
+					break
+				}
+				nextSlotHash := nextSlot.hash
+				if (nextOff / mountSlotBytes) == (nextSlotHash & mask) {
+					break
+				}
+				atomic.StorePointer(&slot.value, nextSlotValue)
+				atomic.StoreUintptr(&slot.hash, nextSlotHash)
+				off = nextOff
+				slot = nextSlot
+			}
+			atomic.StorePointer(&slot.value, nil)
+			atomic.AddUint64(&mt.size, mtSizeLenNegOne)
+			mt.seq.EndWrite()
+			return
+		}
+		if checkInvariants && slotValue == nil {
+			panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount))
+		}
+		off = (off + mountSlotBytes) & offmask
+	}
+}
+
+//go:linkname memhash runtime.memhash
+func memhash(p unsafe.Pointer, seed, s uintptr) uintptr
+
+//go:linkname rand32 runtime.fastrand
+func rand32() uint32
+
+// This is copy/pasted from runtime.noescape(), and is needed because arguments
+// apparently escape from all functions defined by linkname.
+//
+//go:nosplit
+func noescape(p unsafe.Pointer) unsafe.Pointer {
+	x := uintptr(p)
+	return unsafe.Pointer(x ^ 0)
+}
author	Jamie Liu <jamieliu@google.com>	2019-07-18 15:09:14 -0700
committer	gVisor bot <gvisor-bot@google.com>	2019-07-18 15:10:29 -0700
commit	163ab5e9bab4f14923433967656d20f169d0f904 (patch)
tree	5e51b1573e48fe87fe0e277a32f13c78b0c2f058 /pkg/sentry/vfs/mount_unsafe.go
parent	6f7e2bb388cb29a355dece8921671c0085f53ea9 (diff)