Sentry virtual filesystem, v2

Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 
ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
author: Jamie Liu <jamieliu@google.com> 2019-07-18 15:09:14 -0700
committer: gVisor bot <gvisor-bot@google.com> 2019-07-18 15:10:29 -0700
commit: 163ab5e9bab4f14923433967656d20f169d0f904 (patch)
tree: 5e51b1573e48fe87fe0e277a32f13c78b0c2f058 /pkg/sentry/fsimpl/memfs/memfs.go
parent: 6f7e2bb388cb29a355dece8921671c0085f53ea9 (diff)
1 files changed, 299 insertions, 0 deletions
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
new file mode 100644
index 000000000..f381e1a88
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -0,0 +1,299 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memfs provides a filesystem implementation that behaves like tmpfs:
+// the Dentry tree is the sole source of truth for the state of the filesystem.
+//
+// memfs is intended primarily to demonstrate filesystem implementation
+// patterns. Real use cases for an in-memory filesystem should use tmpfs
+// instead.
+//
+// Lock order:
+//
+// Filesystem.mu
+//   regularFileFD.offMu
+//     regularFile.mu
+//   Inode.mu
+package memfs
+
+import (
+	"fmt"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FilesystemType implements vfs.FilesystemType. It is an empty struct and carries no state.
+type FilesystemType struct{}
+
+// Filesystem implements vfs.FilesystemImpl.
+type Filesystem struct {
+	vfsfs vfs.Filesystem // VFS-layer filesystem object; NewFilesystem calls vfsfs.Init with this Filesystem
+
+	// mu serializes changes to the Dentry tree. It also protects Inode.nlink.
+	mu sync.RWMutex
+
+	nextInoMinusOne uint64 // accessed using atomic memory operations; Inode.init increments it and uses the new value as the inode number
+}
+
+// NewFilesystem implements vfs.FilesystemType.NewFilesystem. It returns a new Filesystem and its freshly created root directory; ctx, source and opts are unused and the error is always nil.
+func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	var fs Filesystem
+	fs.vfsfs.Init(&fs)
+	root := fs.newDentry(fs.newDirectory(creds, 01777)) // mode 01777: sticky bit plus rwx for user, group and other
+	return &fs.vfsfs, &root.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release. It is a no-op.
+func (fs *Filesystem) Release() {
+}
+
+// Sync implements vfs.FilesystemImpl.Sync. It always returns nil; ctx is unused.
+func (fs *Filesystem) Sync(ctx context.Context) error {
+	// All filesystem state is in-memory, so there is nothing to write back.
+	return nil
+}
+
+// Dentry implements vfs.DentryImpl. Its IncRef, TryIncRef and DecRef methods forward to inode.
+type Dentry struct {
+	vfsd vfs.Dentry // VFS-layer dentry; newDentry calls vfsd.Init with this Dentry
+
+	// inode is the inode represented by this Dentry. Multiple Dentries may
+	// share a single non-directory Inode (with hard links). inode is
+	// immutable.
+	inode *Inode
+
+	// memfs doesn't count references on Dentries; because the Dentry tree is
+	// the sole source of truth, it is by definition always consistent with the
+	// state of the filesystem. However, it does count references on Inodes,
+	// because Inode resources are released when all references are dropped.
+	// (memfs doesn't really have resources to release, but we implement
+	// reference counting because tmpfs regular files will.)
+
+	// dentryEntry (ugh) links Dentries into their parent directory.childList.
+	dentryEntry
+}
+
+func (fs *Filesystem) newDentry(inode *Inode) *Dentry { // newDentry returns a new Dentry for inode; it does not take a reference on inode, and fs is unused.
+	d := &Dentry{
+		inode: inode,
+	}
+	d.vfsd.Init(d) // presumably what lets vfs.Dentry.Impl() return d (as fileDescription.inode relies on) - confirm against package vfs
+	return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef. The reference is counted on d.inode, since memfs does not count references on Dentries; vfsfs is unused.
+func (d *Dentry) IncRef(vfsfs *vfs.Filesystem) {
+	d.inode.incRef()
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef. It reports whether a reference was taken on d.inode; vfsfs is unused.
+func (d *Dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
+	return d.inode.tryIncRef()
+}
+
+// DecRef implements vfs.DentryImpl.DecRef. The reference is dropped on d.inode; vfsfs is unused.
+func (d *Dentry) DecRef(vfsfs *vfs.Filesystem) {
+	d.inode.decRef()
+}
+
+// Inode represents a filesystem object. The kind of object is determined by the concrete type of impl.
+type Inode struct {
+	// refs is a reference count. refs is accessed using atomic memory
+	// operations.
+	//
+	// A reference is held on all Inodes that are reachable in the filesystem
+	// tree. For non-directories (which may have multiple hard links), this
+	// means that a reference is dropped when nlink reaches 0. For directories,
+	// nlink never reaches 0 due to the "." entry; instead,
+	// Filesystem.RmdirAt() drops the reference.
+	refs int64
+
+	// Inode metadata; protected by mu and accessed using atomic memory
+	// operations unless otherwise specified.
+	mu    sync.RWMutex
+	mode  uint32 // excluding file type bits, which are based on impl
+	nlink uint32 // protected by Filesystem.mu instead of Inode.mu
+	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid   uint32 // auth.KGID, but stored as raw uint32 for sync/atomic
+	ino   uint64 // immutable
+
+	impl interface{} // immutable; *regularFile, *directory or *symlink (statTo and direntType panic on anything else)
+}
+
+func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials, mode uint16) { // init sets up a new Inode holding one reference, owned by creds' effective IDs; the caller must set nlink.
+	i.refs = 1
+	i.mode = uint32(mode)
+	i.uid = uint32(creds.EffectiveKUID)
+	i.gid = uint32(creds.EffectiveKGID)
+	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) // AddUint64 returns the incremented value, so the first inode of a zero-valued Filesystem gets number 1
+	// i.nlink initialized by caller
+	i.impl = impl
+}
+
+// incLinksLocked adds one link to i and panics if i had no links. Preconditions: Filesystem.mu must be locked for writing.
+func (i *Inode) incLinksLocked() {
+	if atomic.AddUint32(&i.nlink, 1) <= 1 { // a new count of 1 means nlink was 0; a new count of 0 means it wrapped around
+		panic("memfs.Inode.incLinksLocked() called with no existing links")
+	}
+}
+
+// decLinksLocked removes one link from i, calling decRef when the last link goes away. Preconditions: Filesystem.mu must be locked for writing.
+func (i *Inode) decLinksLocked() {
+	if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 { // adding ^uint32(0) (all bits set) subtracts 1
+		i.decRef()
+	} else if nlink == ^uint32(0) { // negative overflow
+		panic("memfs.Inode.decLinksLocked() called with no existing links")
+	}
+}
+
+func (i *Inode) incRef() { // incRef adds a reference to i; the caller must already hold one.
+	if atomic.AddInt64(&i.refs, 1) <= 1 { // a new count <= 1 means refs was <= 0 before the increment
+		panic("memfs.Inode.incRef() called without holding a reference")
+	}
+}
+
+func (i *Inode) tryIncRef() bool { // tryIncRef adds a reference to i unless refs is 0, and reports whether it did.
+	for {
+		refs := atomic.LoadInt64(&i.refs)
+		if refs == 0 { // all references already dropped; refuse to take a new one
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) { // fails and retries if refs changed since the load
+			return true
+		}
+	}
+}
+
+func (i *Inode) decRef() { // decRef drops a reference to i; dropping the last one clears a regular file's data.
+	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
+		// This is unnecessary; it's mostly to simulate what tmpfs would do.
+		if regfile, ok := i.impl.(*regularFile); ok {
+			regfile.mu.Lock()
+			regfile.data = nil
+			atomic.StoreInt64(&regfile.dataLen, 0)
+			regfile.mu.Unlock()
+		}
+	} else if refs < 0 { // more decRef calls than references held
+		panic("memfs.Inode.decRef() called without holding a reference")
+	}
+}
+
+func (i *Inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { // checkPermissions returns vfs.GenericCheckPermissions' verdict for atomically loaded mode, uid and gid; i.mu is not taken.
+	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
+}
+
+// statTo fills *stat with i's metadata. Go won't inline this function, and
+// returning linux.Statx (which is quite big) means spending a lot of time in
+// runtime.duffcopy(), so instead it's an output parameter.
+func (i *Inode) statTo(stat *linux.Statx) {
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO // fields filled for every inode type
+	stat.Blksize = 1 // usermem.PageSize in tmpfs
+	stat.Nlink = atomic.LoadUint32(&i.nlink)
+	stat.UID = atomic.LoadUint32(&i.uid)
+	stat.GID = atomic.LoadUint32(&i.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&i.mode)) // i.mode excludes file type bits; the type bit is OR'd in below
+	stat.Ino = i.ino
+	// TODO: device number
+	switch impl := i.impl.(type) {
+	case *regularFile:
+		stat.Mode |= linux.S_IFREG
+		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
+		stat.Size = uint64(atomic.LoadInt64(&impl.dataLen))
+		// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
+		// a uint64 accessed using atomic memory operations to avoid taking
+		// locks).
+		stat.Blocks = allocatedBlocksForSize(stat.Size)
+	case *directory:
+		stat.Mode |= linux.S_IFDIR // size and blocks are not reported for directories
+	case *symlink:
+		stat.Mode |= linux.S_IFLNK
+		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
+		stat.Size = uint64(len(impl.target)) // a symlink's size is the length of its target
+		stat.Blocks = allocatedBlocksForSize(stat.Size)
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+}
+
+// allocatedBlocksForSize returns the number of 512B blocks needed to
+// accommodate the given size in bytes, as appropriate for struct
+// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
+// size is independent of the "preferred block size for I/O", struct
+// stat::st_blksize and struct statx::stx_blksize.)
+func allocatedBlocksForSize(size uint64) uint64 {
+	return (size + 511) / 512 // rounds up to whole blocks; NOTE(review): size + 511 wraps around for sizes within 511 of the uint64 maximum
+}
+
+func (i *Inode) direntType() uint8 { // direntType returns the linux.DT_* constant matching the concrete type of i.impl.
+	switch i.impl.(type) {
+	case *regularFile:
+		return linux.DT_REG
+	case *directory:
+		return linux.DT_DIR
+	case *symlink:
+		return linux.DT_LNK
+	default: // i.impl is none of the three known inode types
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+}
+
+// fileDescription is embedded by memfs implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription // VFS-layer file description; its VirtualDentry leads to the Filesystem and the Inode
+
+	flags uint32 // status flags; immutable (SetStatusFlags is a no-op)
+}
+
+func (fd *fileDescription) filesystem() *Filesystem { // filesystem returns the memfs Filesystem reached via fd's VirtualDentry and Mount; the type assertion panics if the impl is not a *Filesystem.
+	return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*Filesystem)
+}
+
+func (fd *fileDescription) inode() *Inode { // inode returns the Inode of the Dentry behind fd's VirtualDentry; the type assertion panics if the impl is not a *Dentry.
+	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+}
+
+// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. It returns fd.flags and a nil error; ctx is unused.
+func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
+	return fd.flags, nil
+}
+
+// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. It ignores flags and always returns nil.
+func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
+	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
+	// no-op.
+	return nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat. It fills a linux.Statx via Inode.statTo; ctx and opts are unused and the error is always nil.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+	fd.inode().statTo(&stat)
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat. A request with an empty mask succeeds as a no-op; any other request fails with syserror.EPERM.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask == 0 { // nothing was asked to change
+		return nil
+	}
+	// TODO: implement Inode.setStat
+	return syserror.EPERM
+}
author	Jamie Liu <jamieliu@google.com>	2019-07-18 15:09:14 -0700
committer	gVisor bot <gvisor-bot@google.com>	2019-07-18 15:10:29 -0700
commit	163ab5e9bab4f14923433967656d20f169d0f904 (patch)
tree	5e51b1573e48fe87fe0e277a32f13c78b0c2f058 /pkg/sentry/fsimpl/memfs/memfs.go
parent	6f7e2bb388cb29a355dece8921671c0085f53ea9 (diff)