Convert memfs into proto-tmpfs.

- Renamed memfs to tmpfs. - Copied fileRangeSet bits from fs/fsutil/ to fsimpl/tmpfs/ - Changed tmpfs to be backed by filemem instead of byte slice. - regularFileReadWriter uses a sync.Pool, similar to gofer client. PiperOrigin-RevId: 288356380
author: Nicolas Lacasse <nlacasse@google.com> 2020-01-06 12:51:35 -0800
committer: gVisor bot <gvisor-bot@google.com> 2020-01-06 12:52:55 -0800
commit: 51f3ab85e024fcd74c49d273ce5202a207577d31 (patch)
tree: 8d4e6ba070203203060a1d185b4321b89f2e8b05 /pkg/sentry/fsimpl/tmpfs/tmpfs.go
parent: 354a15a234c1270bcb9b902503f61835b2ccd2d0 (diff)
1 files changed, 299 insertions, 0 deletions
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
new file mode 100644
index 000000000..7be6faa5b
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -0,0 +1,299 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package tmpfs provides a filesystem implementation that behaves like tmpfs:
+// the Dentry tree is the sole source of truth for the state of the filesystem.
+//
+// Lock order:
+//
+// filesystem.mu
+//   regularFileFD.offMu
+//     regularFile.mu
+//   inode.mu
+package tmpfs
+
+import (
+	"fmt"
+	"math"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	vfsfs vfs.Filesystem
+
+	// memFile is used to allocate pages to for regular files.
+	memFile *pgalloc.MemoryFile
+
+	// mu serializes changes to the Dentry tree.
+	mu sync.RWMutex
+
+	nextInoMinusOne uint64 // accessed using atomic memory operations
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx)
+	if memFileProvider == nil {
+		panic("MemoryFileProviderFromContext returned nil")
+	}
+	fs := filesystem{
+		memFile: memFileProvider.MemoryFile(),
+	}
+	fs.vfsfs.Init(vfsObj, &fs)
+	root := fs.newDentry(fs.newDirectory(creds, 01777))
+	return &fs.vfsfs, &root.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {
+}
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+	vfsd vfs.Dentry
+
+	// inode is the inode represented by this dentry. Multiple Dentries may
+	// share a single non-directory inode (with hard links). inode is
+	// immutable.
+	inode *inode
+
+	// tmpfs doesn't count references on dentries; because the dentry tree is
+	// the sole source of truth, it is by definition always consistent with the
+	// state of the filesystem. However, it does count references on inodes,
+	// because inode resources are released when all references are dropped.
+	// (tmpfs doesn't really have resources to release, but we implement
+	// reference counting because tmpfs regular files will.)
+
+	// dentryEntry (ugh) links dentries into their parent directory.childList.
+	dentryEntry
+}
+
+func (fs *filesystem) newDentry(inode *inode) *dentry {
+	d := &dentry{
+		inode: inode,
+	}
+	d.vfsd.Init(d)
+	return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+	d.inode.incRef()
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+	return d.inode.tryIncRef()
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef() {
+	d.inode.decRef()
+}
+
+// inode represents a filesystem object.
+type inode struct {
+	// refs is a reference count. refs is accessed using atomic memory
+	// operations.
+	//
+	// A reference is held on all inodes that are reachable in the filesystem
+	// tree. For non-directories (which may have multiple hard links), this
+	// means that a reference is dropped when nlink reaches 0. For directories,
+	// nlink never reaches 0 due to the "." entry; instead,
+	// filesystem.RmdirAt() drops the reference.
+	refs int64
+
+	// Inode metadata; protected by mu and accessed using atomic memory
+	// operations unless otherwise specified.
+	mu    sync.RWMutex
+	mode  uint32 // excluding file type bits, which are based on impl
+	nlink uint32 // protected by filesystem.mu instead of inode.mu
+	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid   uint32 // auth.KGID, but ...
+	ino   uint64 // immutable
+
+	impl interface{} // immutable
+}
+
+const maxLinks = math.MaxUint32
+
+func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
+	i.refs = 1
+	i.mode = uint32(mode)
+	i.uid = uint32(creds.EffectiveKUID)
+	i.gid = uint32(creds.EffectiveKGID)
+	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
+	// i.nlink initialized by caller
+	i.impl = impl
+}
+
+// incLinksLocked increments i's link count.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+// i.nlink < maxLinks.
+func (i *inode) incLinksLocked() {
+	if i.nlink == 0 {
+		panic("tmpfs.inode.incLinksLocked() called with no existing links")
+	}
+	if i.nlink == maxLinks {
+		panic("memfs.inode.incLinksLocked() called with maximum link count")
+	}
+	atomic.AddUint32(&i.nlink, 1)
+}
+
+// decLinksLocked decrements i's link count.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+func (i *inode) decLinksLocked() {
+	if i.nlink == 0 {
+		panic("tmpfs.inode.decLinksLocked() called with no existing links")
+	}
+	atomic.AddUint32(&i.nlink, ^uint32(0))
+}
+
+func (i *inode) incRef() {
+	if atomic.AddInt64(&i.refs, 1) <= 1 {
+		panic("tmpfs.inode.incRef() called without holding a reference")
+	}
+}
+
+func (i *inode) tryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&i.refs)
+		if refs == 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+func (i *inode) decRef() {
+	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
+		// This is unnecessary; it's mostly to simulate what tmpfs would do.
+		if regFile, ok := i.impl.(*regularFile); ok {
+			regFile.mu.Lock()
+			regFile.data.DropAll(regFile.memFile)
+			atomic.StoreUint64(&regFile.size, 0)
+			regFile.mu.Unlock()
+		}
+	} else if refs < 0 {
+		panic("tmpfs.inode.decRef() called without holding a reference")
+	}
+}
+
+func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
+	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
+}
+
+// Go won't inline this function, and returning linux.Statx (which is quite
+// big) means spending a lot of time in runtime.duffcopy(), so instead it's an
+// output parameter.
+func (i *inode) statTo(stat *linux.Statx) {
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+	stat.Blksize = 1 // usermem.PageSize in tmpfs
+	stat.Nlink = atomic.LoadUint32(&i.nlink)
+	stat.UID = atomic.LoadUint32(&i.uid)
+	stat.GID = atomic.LoadUint32(&i.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
+	stat.Ino = i.ino
+	// TODO: device number
+	switch impl := i.impl.(type) {
+	case *regularFile:
+		stat.Mode |= linux.S_IFREG
+		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
+		stat.Size = uint64(atomic.LoadUint64(&impl.size))
+		// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
+		// a uint64 accessed using atomic memory operations to avoid taking
+		// locks).
+		stat.Blocks = allocatedBlocksForSize(stat.Size)
+	case *directory:
+		stat.Mode |= linux.S_IFDIR
+	case *symlink:
+		stat.Mode |= linux.S_IFLNK
+		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
+		stat.Size = uint64(len(impl.target))
+		stat.Blocks = allocatedBlocksForSize(stat.Size)
+	case *namedPipe:
+		stat.Mode |= linux.S_IFIFO
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+}
+
+// allocatedBlocksForSize returns the number of 512B blocks needed to
+// accommodate the given size in bytes, as appropriate for struct
+// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
+// size is independent of the "preferred block size for I/O", struct
+// stat::st_blksize and struct statx::stx_blksize.)
+func allocatedBlocksForSize(size uint64) uint64 {
+	return (size + 511) / 512
+}
+
+func (i *inode) direntType() uint8 {
+	switch i.impl.(type) {
+	case *regularFile:
+		return linux.DT_REG
+	case *directory:
+		return linux.DT_DIR
+	case *symlink:
+		return linux.DT_LNK
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+}
+
+// fileDescription is embedded by tmpfs implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) inode() *inode {
+	return fd.vfsfd.Dentry().Impl().(*dentry).inode
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+	fd.inode().statTo(&stat)
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	// TODO: implement inode.setStat
+	return syserror.EPERM
+}
author	Nicolas Lacasse <nlacasse@google.com>	2020-01-06 12:51:35 -0800
committer	gVisor bot <gvisor-bot@google.com>	2020-01-06 12:52:55 -0800
commit	51f3ab85e024fcd74c49d273ce5202a207577d31 (patch)
tree	8d4e6ba070203203060a1d185b4321b89f2e8b05 /pkg/sentry/fsimpl/tmpfs/tmpfs.go
parent	354a15a234c1270bcb9b902503f61835b2ccd2d0 (diff)