Merge release-20211011.0-59-g14f411392 (automated)

author: gVisor bot <gvisor-bot@google.com> 2021-10-21 23:10:52 +0000
committer: gVisor bot <gvisor-bot@google.com> 2021-10-21 23:10:52 +0000
commit: eff88efe81e2a2dcc0ee541e90755d4d21374315 (patch)
tree: 967638a38d03d2c6e778cab057a6558d71e4ec2c /pkg/sentry/fsimpl/mqfs
parent: ac2e48668b599b3d3b0b4f5c4453b61773cae6fd (diff)
parent: 14f4113924c8b7b8c161be7335b147106d0c4a26 (diff)
6 files changed, 951 insertions, 0 deletions
diff --git a/pkg/sentry/fsimpl/mqfs/mqfs.go b/pkg/sentry/fsimpl/mqfs/mqfs.go
new file mode 100644
index 000000000..c2b53c9d0
--- /dev/null
+++ b/pkg/sentry/fsimpl/mqfs/mqfs.go
@@ -0,0 +1,138 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package mqfs provides a filesystem implementation to back POSIX message
+// queues.
+package mqfs
+
+import (
+	"fmt"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/errors/linuxerr"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/ipc"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/mq"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+const (
+	// Name is the user-visible filesystem name.
+	Name                     = "mqueue"
+	defaultMaxCachedDentries = uint64(1000) // used when no dentry_cache_limit mount option is given
+)
+
+// FilesystemType implements vfs.FilesystemType for mqfs. It has no fields.
+//
+// +stateify savable
+type FilesystemType struct{}
+
+// Name implements vfs.FilesystemType.Name. It returns the package constant Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// Release implements vfs.FilesystemType.Release. It is a no-op.
+func (FilesystemType) Release(ctx context.Context) {}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+//
+// mqfs is initialized once per ipc namespace, whose POSIX message registry has
+// a root dentry, filesystem, and disconnected mount. Those existing objects are
+// retrieved via ipcNamespace.PosixQueues so all processes share one fs.
+func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	i := ipcNamespaceFromContext(ctx)
+	if i == nil {
+		return nil, nil, fmt.Errorf("mqfs.FilesystemType.GetFilesystem: ipc namespace doesn't exist")
+	}
+	defer i.DecRef(ctx)
+
+	registry := i.PosixQueues()
+	if registry == nil {
+		return nil, nil, fmt.Errorf("mqfs.FilesystemType.GetFilesystem: ipc namespace doesn't have a POSIX registry")
+	}
+	impl, ok := registry.Impl().(*RegistryImpl) // comma-ok: a foreign implementation must not panic
+	if !ok {
+		return nil, nil, fmt.Errorf("mqfs.FilesystemType.GetFilesystem: POSIX registry is not backed by mqfs")
+	}
+
+	limit, err := maxCachedDentries(ctx, vfs.GenericParseMountOptions(opts.Data))
+	if err != nil {
+		return nil, nil, err
+	}
+	impl.fs.MaxCachedDentries = limit // overwrites the limit of the shared filesystem on every call
+	impl.fs.VFSFilesystem().IncRef()
+	return impl.fs.VFSFilesystem(), impl.root.VFSDentry(), nil
+}
+
+// maxCachedDentries looks up dentry_cache_limit in mopts. Without the option
+// the default limit is returned. Otherwise the option is removed from mopts and
+// parsed as a base-10 uint64; a parse failure is logged and yields EINVAL.
+func maxCachedDentries(ctx context.Context, mopts map[string]string) (_ uint64, err error) {
+	raw, found := mopts["dentry_cache_limit"]
+	if !found {
+		return defaultMaxCachedDentries, nil
+	}
+	delete(mopts, "dentry_cache_limit")
+	if limit, perr := strconv.ParseUint(raw, 10, 64); perr == nil {
+		return limit, nil
+	}
+	ctx.Warningf("mqfs.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", raw)
+	return 0, linuxerr.EINVAL
+}
+
+// filesystem is the mqfs filesystem. It embeds kernfs.Filesystem.
+//
+// +stateify savable
+type filesystem struct {
+	kernfs.Filesystem
+	devMinor uint32 // anonymous block device minor; handed back to the VFS in Release
+
+	// root is the filesystem's root dentry. NOTE(review): GetFilesystem and
+	// Release in this file never IncRef/DecRef root; confirm who owns the ref.
+	root *kernfs.Dentry
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) // hand back the minor stored in devMinor
+	fs.Filesystem.Release(ctx) // then release the embedded kernfs.Filesystem
+}
+
+// MountOptions implements vfs.FilesystemImpl.MountOptions. Only dentry_cache_limit is reported.
+func (fs *filesystem) MountOptions() string {
+	return "dentry_cache_limit=" + strconv.FormatUint(fs.MaxCachedDentries, 10)
+}
+
+// ipcNamespace declares the functions this package needs from
+// kernel.IPCNamespace. It is redefined here, along with
+// ipcNamespaceFromContext, to avoid a circular dependency with sentry/kernel.
+type ipcNamespace interface {
+	// PosixQueues returns a POSIX message queue registry. GetFilesystem treats nil as an error.
+	PosixQueues() *mq.Registry
+
+	// DecRef decrements ipcNamespace's number of references.
+	DecRef(ctx context.Context)
+}
+
+// ipcNamespaceFromContext returns the value stored in ctx under ipc.CtxIPCNamespace
+// as an ipcNamespace, or nil if ctx holds none. Copied from package sentry/kernel.
+func ipcNamespaceFromContext(ctx context.Context) ipcNamespace {
+	if v := ctx.Value(ipc.CtxIPCNamespace); v != nil {
+		return v.(ipcNamespace) // unchecked assertion: panics if the value has another type
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/mqfs/mqfs_state_autogen.go b/pkg/sentry/fsimpl/mqfs/mqfs_state_autogen.go
new file mode 100644
index 000000000..d6154efc3
--- /dev/null
+++ b/pkg/sentry/fsimpl/mqfs/mqfs_state_autogen.go
@@ -0,0 +1,263 @@
+// automatically generated by stateify.
+
+package mqfs
+
+import (
+	"gvisor.dev/gvisor/pkg/state"
+)
+
+func (ft *FilesystemType) StateTypeName() string { // returns the package-qualified type name
+	return "pkg/sentry/fsimpl/mqfs.FilesystemType"
+}
+
+func (ft *FilesystemType) StateFields() []string { // empty: the type has no fields to save
+	return []string{}
+}
+
+func (ft *FilesystemType) beforeSave() {} // empty hook, called first by StateSave
+
+// +checklocksignore
+func (ft *FilesystemType) StateSave(stateSinkObject state.Sink) {
+	ft.beforeSave() // nothing else is written
+}
+
+func (ft *FilesystemType) afterLoad() {} // empty hook; StateLoad below does not reference it
+
+// +checklocksignore
+func (ft *FilesystemType) StateLoad(stateSourceObject state.Source) {
+}
+
+func (fs *filesystem) StateTypeName() string { // returns the package-qualified type name
+	return "pkg/sentry/fsimpl/mqfs.filesystem"
+}
+
+func (fs *filesystem) StateFields() []string { // names listed in the index order used by StateSave/StateLoad
+	return []string{
+		"Filesystem",
+		"devMinor",
+		"root",
+	}
+}
+
+func (fs *filesystem) beforeSave() {} // empty hook, called first by StateSave
+
+// +checklocksignore
+func (fs *filesystem) StateSave(stateSinkObject state.Sink) {
+	fs.beforeSave()
+	stateSinkObject.Save(0, &fs.Filesystem) // indexes 0..2 follow StateFields
+	stateSinkObject.Save(1, &fs.devMinor)
+	stateSinkObject.Save(2, &fs.root)
+}
+
+func (fs *filesystem) afterLoad() {} // empty hook; StateLoad below does not reference it
+
+// +checklocksignore
+func (fs *filesystem) StateLoad(stateSourceObject state.Source) {
+	stateSourceObject.Load(0, &fs.Filesystem) // same indexes as in StateSave
+	stateSourceObject.Load(1, &fs.devMinor)
+	stateSourceObject.Load(2, &fs.root)
+}
+
+func (q *queueInode) StateTypeName() string { // returns the package-qualified type name
+	return "pkg/sentry/fsimpl/mqfs.queueInode"
+}
+
+func (q *queueInode) StateFields() []string { // names listed in the index order used by StateSave/StateLoad
+	return []string{
+		"DynamicBytesFile",
+		"queue",
+	}
+}
+
+func (q *queueInode) beforeSave() {} // empty hook, called first by StateSave
+
+// +checklocksignore
+func (q *queueInode) StateSave(stateSinkObject state.Sink) {
+	q.beforeSave()
+	stateSinkObject.Save(0, &q.DynamicBytesFile) // indexes 0..1 follow StateFields
+	stateSinkObject.Save(1, &q.queue)
+}
+
+func (q *queueInode) afterLoad() {} // empty hook; StateLoad below does not reference it
+
+// +checklocksignore
+func (q *queueInode) StateLoad(stateSourceObject state.Source) {
+	stateSourceObject.Load(0, &q.DynamicBytesFile) // same indexes as in StateSave
+	stateSourceObject.Load(1, &q.queue)
+}
+
+func (fd *queueFD) StateTypeName() string { // returns the package-qualified type name
+	return "pkg/sentry/fsimpl/mqfs.queueFD"
+}
+
+func (fd *queueFD) StateFields() []string { // names listed in the index order used by StateSave/StateLoad
+	return []string{
+		"FileDescriptionDefaultImpl",
+		"DynamicBytesFileDescriptionImpl",
+		"LockFD",
+		"vfsfd",
+		"inode",
+		"queue",
+	}
+}
+
+func (fd *queueFD) beforeSave() {} // empty hook, called first by StateSave
+
+// +checklocksignore
+func (fd *queueFD) StateSave(stateSinkObject state.Sink) {
+	fd.beforeSave()
+	stateSinkObject.Save(0, &fd.FileDescriptionDefaultImpl) // indexes 0..5 follow StateFields
+	stateSinkObject.Save(1, &fd.DynamicBytesFileDescriptionImpl)
+	stateSinkObject.Save(2, &fd.LockFD)
+	stateSinkObject.Save(3, &fd.vfsfd)
+	stateSinkObject.Save(4, &fd.inode)
+	stateSinkObject.Save(5, &fd.queue)
+}
+
+func (fd *queueFD) afterLoad() {} // empty hook; StateLoad below does not reference it
+
+// +checklocksignore
+func (fd *queueFD) StateLoad(stateSourceObject state.Source) {
+	stateSourceObject.Load(0, &fd.FileDescriptionDefaultImpl) // same indexes as in StateSave
+	stateSourceObject.Load(1, &fd.DynamicBytesFileDescriptionImpl)
+	stateSourceObject.Load(2, &fd.LockFD)
+	stateSourceObject.Load(3, &fd.vfsfd)
+	stateSourceObject.Load(4, &fd.inode)
+	stateSourceObject.Load(5, &fd.queue)
+}
+
+func (r *RegistryImpl) StateTypeName() string { // returns the package-qualified type name
+	return "pkg/sentry/fsimpl/mqfs.RegistryImpl"
+}
+
+func (r *RegistryImpl) StateFields() []string { // names listed in the index order used by StateSave/StateLoad
+	return []string{
+		"root",
+		"fs",
+		"mount",
+	}
+}
+
+func (r *RegistryImpl) beforeSave() {} // empty hook, called first by StateSave
+
+// +checklocksignore
+func (r *RegistryImpl) StateSave(stateSinkObject state.Sink) {
+	r.beforeSave()
+	stateSinkObject.Save(0, &r.root) // indexes 0..2 follow StateFields
+	stateSinkObject.Save(1, &r.fs)
+	stateSinkObject.Save(2, &r.mount)
+}
+
+func (r *RegistryImpl) afterLoad() {} // empty hook; StateLoad below does not reference it
+
+// +checklocksignore
+func (r *RegistryImpl) StateLoad(stateSourceObject state.Source) {
+	stateSourceObject.Load(0, &r.root) // same indexes as in StateSave
+	stateSourceObject.Load(1, &r.fs)
+	stateSourceObject.Load(2, &r.mount)
+}
+
+func (i *rootInode) StateTypeName() string { // returns the package-qualified type name
+	return "pkg/sentry/fsimpl/mqfs.rootInode"
+}
+
+func (i *rootInode) StateFields() []string { // names listed in the index order used by StateSave/StateLoad
+	return []string{
+		"rootInodeRefs",
+		"InodeAlwaysValid",
+		"InodeAttrs",
+		"InodeDirectoryNoNewChildren",
+		"InodeNotSymlink",
+		"InodeTemporary",
+		"OrderedChildren",
+		"implStatFS",
+		"locks",
+	}
+}
+
+func (i *rootInode) beforeSave() {} // empty hook, called first by StateSave
+
+// +checklocksignore
+func (i *rootInode) StateSave(stateSinkObject state.Sink) {
+	i.beforeSave()
+	stateSinkObject.Save(0, &i.rootInodeRefs) // indexes 0..8 follow StateFields
+	stateSinkObject.Save(1, &i.InodeAlwaysValid)
+	stateSinkObject.Save(2, &i.InodeAttrs)
+	stateSinkObject.Save(3, &i.InodeDirectoryNoNewChildren)
+	stateSinkObject.Save(4, &i.InodeNotSymlink)
+	stateSinkObject.Save(5, &i.InodeTemporary)
+	stateSinkObject.Save(6, &i.OrderedChildren)
+	stateSinkObject.Save(7, &i.implStatFS)
+	stateSinkObject.Save(8, &i.locks)
+}
+
+func (i *rootInode) afterLoad() {} // empty hook; StateLoad below does not reference it
+
+// +checklocksignore
+func (i *rootInode) StateLoad(stateSourceObject state.Source) {
+	stateSourceObject.Load(0, &i.rootInodeRefs) // same indexes as in StateSave
+	stateSourceObject.Load(1, &i.InodeAlwaysValid)
+	stateSourceObject.Load(2, &i.InodeAttrs)
+	stateSourceObject.Load(3, &i.InodeDirectoryNoNewChildren)
+	stateSourceObject.Load(4, &i.InodeNotSymlink)
+	stateSourceObject.Load(5, &i.InodeTemporary)
+	stateSourceObject.Load(6, &i.OrderedChildren)
+	stateSourceObject.Load(7, &i.implStatFS)
+	stateSourceObject.Load(8, &i.locks)
+}
+
+func (i *implStatFS) StateTypeName() string { // returns the package-qualified type name
+	return "pkg/sentry/fsimpl/mqfs.implStatFS"
+}
+
+func (i *implStatFS) StateFields() []string { // empty: the type has no fields to save
+	return []string{}
+}
+
+func (i *implStatFS) beforeSave() {} // empty hook, called first by StateSave
+
+// +checklocksignore
+func (i *implStatFS) StateSave(stateSinkObject state.Sink) {
+	i.beforeSave() // nothing else is written
+}
+
+func (i *implStatFS) afterLoad() {} // empty hook; StateLoad below does not reference it
+
+// +checklocksignore
+func (i *implStatFS) StateLoad(stateSourceObject state.Source) {
+}
+
+func (r *rootInodeRefs) StateTypeName() string { // returns the package-qualified type name
+	return "pkg/sentry/fsimpl/mqfs.rootInodeRefs"
+}
+
+func (r *rootInodeRefs) StateFields() []string { // the single saved field
+	return []string{
+		"refCount",
+	}
+}
+
+func (r *rootInodeRefs) beforeSave() {} // empty hook, called first by StateSave
+
+// +checklocksignore
+func (r *rootInodeRefs) StateSave(stateSinkObject state.Sink) {
+	r.beforeSave()
+	stateSinkObject.Save(0, &r.refCount)
+}
+
+// +checklocksignore
+func (r *rootInodeRefs) StateLoad(stateSourceObject state.Source) {
+	stateSourceObject.Load(0, &r.refCount)
+	stateSourceObject.AfterLoad(r.afterLoad) // hands the afterLoad method to the state source
+}
+
+func init() { // registers a nil pointer of each type in this file with the state package
+	state.Register((*FilesystemType)(nil))
+	state.Register((*filesystem)(nil))
+	state.Register((*queueInode)(nil))
+	state.Register((*queueFD)(nil))
+	state.Register((*RegistryImpl)(nil))
+	state.Register((*rootInode)(nil))
+	state.Register((*implStatFS)(nil))
+	state.Register((*rootInodeRefs)(nil))
+}
diff --git a/pkg/sentry/fsimpl/mqfs/queue.go b/pkg/sentry/fsimpl/mqfs/queue.go
new file mode 100644
index 000000000..933dbc6ed
--- /dev/null
+++ b/pkg/sentry/fsimpl/mqfs/queue.go
@@ -0,0 +1,145 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mqfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/errors/linuxerr"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/mq"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// queueInode represents an inode for a message queue (/dev/mqueue/[name]).
+//
+// +stateify savable
+type queueInode struct {
+	kernfs.DynamicBytesFile // initialized in newQueueInode, which passes queue to its Init
+
+	// queue is the message queue backing this inode.
+	queue *mq.Queue
+}
+
+var _ kernfs.Inode = (*queueInode)(nil) // compile-time check that *queueInode satisfies kernfs.Inode
+
+// newQueueInode returns a new queueInode backed by q, initialized with creds, perm and fs.NextIno().
+func (fs *filesystem) newQueueInode(ctx context.Context, creds *auth.Credentials, q *mq.Queue, perm linux.FileMode) kernfs.Inode {
+	inode := &queueInode{queue: q}
+	inode.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), q, perm) // q is also handed to the embedded DynamicBytesFile
+	return inode
+}
+
+// Keep implements kernfs.Inode.Keep. It always reports true.
+func (q *queueInode) Keep() bool {
+	// Return true so that the fs keeps newly created dentries. This is done
+	// because inodes returned by root.Lookup are not temporary; they exist
+	// in the fs and refer to message queues.
+	return true
+}
+
+// queueFD implements vfs.FileDescriptionImpl for an FD backed by a POSIX message
+// queue. It's mostly similar to DynamicBytesFD, but implements more operations.
+//
+// +stateify savable
+type queueFD struct {
+	vfs.FileDescriptionDefaultImpl
+	vfs.DynamicBytesFileDescriptionImpl
+	vfs.LockFD
+
+	vfsfd vfs.FileDescription // initialized in Init with the queueFD itself as implementation
+	inode kernfs.Inode        // set in Init from the dentry's inode; used by Stat
+
+	// queue is a view into the queue backing this fd.
+	queue mq.View
+}
+
+// Init prepares a queueFD for use. It follows DynamicBytesFD.Init, except that
+// the queueFD itself is installed as the FileDescriptionImpl of vfsfd.
+func (fd *queueFD) Init(m *vfs.Mount, d *kernfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
+	fd.LockFD.Init(locks)
+	err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{})
+	if err == nil {
+		fd.inode = d.Inode()
+		fd.SetDataSource(data)
+	}
+	return err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek by delegating to the embedded DynamicBytesFileDescriptionImpl.
+func (fd *queueFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read by delegating to the embedded DynamicBytesFileDescriptionImpl.
+func (fd *queueFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead by delegating to the embedded DynamicBytesFileDescriptionImpl.
+func (fd *queueFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write by delegating to the embedded DynamicBytesFileDescriptionImpl.
+func (fd *queueFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.Write(ctx, src, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite by delegating to the embedded DynamicBytesFileDescriptionImpl.
+func (fd *queueFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.PWrite(ctx, src, offset, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release. It is a no-op.
+func (fd *queueFD) Release(context.Context) {}
+
+// Stat implements vfs.FileDescriptionImpl.Stat by asking the inode stored by Init.
+func (fd *queueFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	// The filesystem passed along is that of the mount in this FD's virtual dentry.
+	return fd.inode.Stat(ctx, fd.vfsfd.VirtualDentry().Mount().Filesystem(), opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat. It always fails with EPERM.
+func (fd *queueFD) SetStat(context.Context, vfs.SetStatOptions) error {
+	// DynamicBytesFiles are immutable.
+	return linuxerr.EPERM
+}
+
+// OnClose implements FileDescriptionImpl.OnClose similar to
+// ipc/mqueue.c::mqueue_flush_file. It flushes the queue view and returns nil.
+func (fd *queueFD) OnClose(ctx context.Context) error {
+	fd.queue.Flush(ctx)
+	return nil
+}
+
+// Readiness implements waiter.Waitable.Readiness similar to
+// ipc/mqueue.c::mqueue_poll_file. The answer comes from the queue view.
+func (fd *queueFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return fd.queue.Readiness(mask)
+}
+
+// EventRegister implements Waitable.EventRegister by forwarding e and mask to the queue view.
+func (fd *queueFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fd.queue.EventRegister(e, mask)
+}
+
+// EventUnregister implements Waitable.EventUnregister by forwarding e to the queue view.
+func (fd *queueFD) EventUnregister(e *waiter.Entry) {
+	fd.queue.EventUnregister(e)
+}
diff --git a/pkg/sentry/fsimpl/mqfs/registry.go b/pkg/sentry/fsimpl/mqfs/registry.go
new file mode 100644
index 000000000..c8fbe4d33
--- /dev/null
+++ b/pkg/sentry/fsimpl/mqfs/registry.go
@@ -0,0 +1,176 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mqfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/errors/linuxerr"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/mq"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// RegistryImpl implements mq.RegistryImpl. It implements the interface using
+// the message queue filesystem, and is provided to mq.Registry at
+// initialization.
+//
+// RegistryImpl is not thread-safe, so it is the responsibility of the user
+// (the containing mq.Registry) to protect using a lock.
+//
+// +stateify savable
+type RegistryImpl struct {
+	// root is the root dentry of the mq filesystem. Its main usage is to
+	// retrieve the root inode, which we use to add, remove, and lookup message
+	// queues.
+	//
+	// We hold a reference on root and release it in Destroy.
+	root *kernfs.Dentry
+
+	// fs is the filesystem backing this registry, used mainly to initialize
+	// new inodes.
+	fs *filesystem
+
+	// mount is the disconnected mount created in NewRegistryImpl; newFD opens FDs on it.
+	mount *vfs.Mount
+}
+
+// NewRegistryImpl creates the mqfs filesystem, its root dentry and a disconnected
+// mount, and returns them as a RegistryImpl (whose Destroy releases root and mount).
+func NewRegistryImpl(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*RegistryImpl, error) {
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, err
+	}
+
+	var dentry kernfs.Dentry // root dentry; shared by fs.root and the returned RegistryImpl.root
+	fs := &filesystem{ // NOTE: this local shadows the imported package fs inside the function
+		devMinor: devMinor,
+		root:     &dentry,
+	}
+	fs.VFSFilesystem().Init(vfsObj, &FilesystemType{}, fs)
+	vfsfs := fs.VFSFilesystem()
+
+	dentry.InitRoot(&fs.Filesystem, fs.newRootInode(ctx, creds))
+	defer vfsfs.DecRef(ctx) // NewDisconnectedMount will obtain a ref on success.
+
+	mount, err := vfsObj.NewDisconnectedMount(vfsfs, dentry.VFSDentry(), &vfs.MountOptions{})
+	if err != nil {
+		return nil, err // the deferred DecRef above still runs on this path
+	}
+
+	return &RegistryImpl{
+		root:  &dentry,
+		fs:    fs,
+		mount: mount,
+	}, nil
+}
+
+// Get implements mq.RegistryImpl.Get. A failed lookup yields (nil, false, nil), not an error.
+func (r *RegistryImpl) Get(ctx context.Context, name string, access mq.AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error) {
+	found, err := r.lookup(ctx, name)
+	if err != nil {
+		return nil, false, nil
+	}
+	qi := found.(*queueInode)
+	creds := auth.CredentialsFromContext(ctx)
+	if !qi.queue.HasPermissions(creds, perm(access)) {
+		// "The queue exists, but the caller does not have permission to
+		//  open it in the specified mode."
+		return nil, false, linuxerr.EACCES
+	}
+
+	fd, err := r.newFD(qi.queue, qi, access, block, flags)
+	if err != nil {
+		return nil, false, err
+	}
+	return fd, true, nil
+}
+
+// New implements mq.RegistryImpl.New: it inserts a new inode for q under name and opens an FD on it.
+func (r *RegistryImpl) New(ctx context.Context, name string, q *mq.Queue, access mq.AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) {
+	root := r.root.Inode().(*rootInode)
+	creds := auth.CredentialsFromContext(ctx)
+	qi := r.fs.newQueueInode(ctx, creds, q, perm).(*queueInode)
+	if err := root.Insert(name, qi); err != nil {
+		return nil, err
+	}
+	return r.newFD(q, qi, access, block, flags)
+}
+
+// Unlink implements mq.RegistryImpl.Unlink. The caller needs write and exec permission on root.
+func (r *RegistryImpl) Unlink(ctx context.Context, name string) error {
+	creds := auth.CredentialsFromContext(ctx)
+	err := r.root.Inode().CheckPermissions(ctx, creds, vfs.MayWrite|vfs.MayExec)
+	if err != nil {
+		return err
+	}
+	root := r.root.Inode().(*rootInode)
+	target, err := r.lookup(ctx, name)
+	if err != nil {
+		return err
+	}
+	return root.Unlink(ctx, name, target)
+}
+
+// Destroy implements mq.RegistryImpl.Destroy. It drops one reference on root, then on mount.
+func (r *RegistryImpl) Destroy(ctx context.Context) {
+	r.root.DecRef(ctx)
+	r.mount.DecRef(ctx)
+}
+
+// lookup retrieves the kernfs.Inode registered under name in the root inode.
+func (r *RegistryImpl) lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+	root := r.root.Inode().(*rootInode)
+	child, err := root.Lookup(ctx, name)
+	if err == nil {
+		return child, nil
+	}
+	return nil, err
+}
+
+// newFD returns a new file description created using the given queue and inode.
+func (r *RegistryImpl) newFD(q *mq.Queue, inode *queueInode, access mq.AccessType, block bool, flags uint32) (*vfs.FileDescription, error) {
+	view, err := mq.NewView(q, access, block)
+	if err != nil {
+		return nil, err
+	}
+
+	var dentry kernfs.Dentry
+	dentry.Init(&r.fs.Filesystem, inode)
+
+	fd := &queueFD{queue: view}
+	err = fd.Init(r.mount, &dentry, inode.queue, inode.Locks(), flags)
+	if err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// perm returns a permission mask created using given flags.
+func perm(access mq.AccessType) fs.PermMask {
+	switch access {
+	case mq.ReadWrite:
+		return fs.PermMask{Read: true, Write: true}
+	case mq.WriteOnly:
+		return fs.PermMask{Write: true}
+	case mq.ReadOnly:
+		return fs.PermMask{Read: true}
+	default:
+		return fs.PermMask{} // Can't happen, see NewView.
+	}
+}
diff --git a/pkg/sentry/fsimpl/mqfs/root.go b/pkg/sentry/fsimpl/mqfs/root.go
new file mode 100644
index 000000000..37b5749fb
--- /dev/null
+++ b/pkg/sentry/fsimpl/mqfs/root.go
@@ -0,0 +1,89 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mqfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/errors/linuxerr"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// rootInode represents the inode of the filesystem's root directory (/dev/mqueue).
+//
+// +stateify savable
+type rootInode struct {
+	rootInodeRefs // reference count; see DecRef
+	kernfs.InodeAlwaysValid
+	kernfs.InodeAttrs // initialized in newRootInode as a directory with mode 0555
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary
+	kernfs.OrderedChildren // initialized as writable in newRootInode
+	implStatFS             // provides StatFS
+
+	locks vfs.FileLocks // passed to the directory FD created in Open
+}
+
+var _ kernfs.Inode = (*rootInode)(nil) // compile-time check that *rootInode satisfies kernfs.Inode
+
+// newRootInode returns a new rootInode with attributes, children and references initialized.
+func (fs *filesystem) newRootInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
+	inode := &rootInode{}
+	inode.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555)) // directory, mode 0555
+	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
+	inode.InitRefs() // starts the count at one reference (see rootInodeRefs.InitRefs)
+	return inode
+}
+
+// Open implements kernfs.Inode.Open. It returns a generic directory FD over the
+// inode's ordered children, created on the mount taken from rp.
+func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fdOpts := kernfs.GenericDirectoryFDOptions{SeekEnd: kernfs.SeekEndZero}
+	dirFD, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, fdOpts)
+	if err != nil {
+		return nil, err
+	}
+	return dirFD.VFSFileDescription(), nil
+}
+
+// DecRef implements kernfs.Inode.DecRef. Destroy is the callback run when the count reaches zero.
+func (i *rootInode) DecRef(ctx context.Context) {
+	i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
+// Rename implements Inode.Rename and overrides OrderedChildren.Rename. mqueue
+// filesystem allows files to be unlinked, but not renamed: EPERM is always returned.
+func (i *rootInode) Rename(ctx context.Context, oldname, newname string, child, dstDir kernfs.Inode) error {
+	return linuxerr.EPERM
+}
+
+// SetStat implements kernfs.Inode.SetStat. Inode attributes can't be changed: it always returns EPERM.
+func (*rootInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return linuxerr.EPERM
+}
+
+// implStatFS provides an implementation of kernfs.Inode.StatFS for message
+// queues, to be embedded in inodes (rootInode embeds it). It has no fields.
+//
+// +stateify savable
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS. It returns vfs.GenericStatFS for linux.MQUEUE_MAGIC.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.MQUEUE_MAGIC), nil
+}
diff --git a/pkg/sentry/fsimpl/mqfs/root_inode_refs.go b/pkg/sentry/fsimpl/mqfs/root_inode_refs.go
new file mode 100644
index 000000000..7462467a5
--- /dev/null
+++ b/pkg/sentry/fsimpl/mqfs/root_inode_refs.go
@@ -0,0 +1,140 @@
+package mqfs
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/refsvfs2"
+)
+
+// rootInodeenableLogging indicates whether reference-related events should be
+// logged (with stack traces). This is false by default and should only be set to
+// true for debugging purposes, as it can generate an extremely large amount of
+// output and drastically degrade performance.
+const rootInodeenableLogging = false
+
+// rootInodeobj is used to customize logging: RefType formats its type with %T.
+// It is a pointer so that the entire object is not copied as a format parameter.
+var rootInodeobj *rootInode
+
+// rootInodeRefs implements refs.RefCounter. It keeps a reference count using
+// atomic operations and calls the destructor when the count reaches zero.
+//
+// NOTE: Do not introduce additional fields to the Refs struct. It is used by
+// many filesystem objects, and we want to keep it as small as possible (i.e.,
+// the same size as using an int64 directly) to avoid taking up extra cache
+// space. In general, this template should not be extended at the cost of
+// performance. If it does not offer enough flexibility for a particular object
+// (example: b/187877947), we should implement the RefCounter/CheckedObject
+// interfaces manually.
+//
+// +stateify savable
+type rootInodeRefs struct {
+	// refCount is composed of two fields:
+	//
+	//	[32-bit speculative references]:[32-bit real references]
+	//
+	// Speculative references are used for TryIncRef, to avoid a CompareAndSwap
+	// loop. See IncRef, DecRef and TryIncRef for details of how these fields are
+	// used.
+	refCount int64 // only accessed through sync/atomic in this file
+}
+
+// InitRefs initializes r with one reference and registers r with refsvfs2,
+// which activates leak checking if that is enabled.
+func (r *rootInodeRefs) InitRefs() {
+	atomic.StoreInt64(&r.refCount, 1)
+	refsvfs2.Register(r)
+}
+
+// RefType implements refsvfs2.CheckedObject.RefType. It returns the type name of rootInode.
+func (r *rootInodeRefs) RefType() string {
+	return fmt.Sprintf("%T", rootInodeobj)[1:] // %T of the pointer starts with '*'; [1:] drops it
+}
+
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage. It reports RefType, r's address and the current count.
+func (r *rootInodeRefs) LeakMessage() string {
+	return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs())
+}
+
+// LogRefs implements refsvfs2.CheckedObject.LogRefs. It returns the constant rootInodeenableLogging.
+func (r *rootInodeRefs) LogRefs() bool {
+	return rootInodeenableLogging
+}
+
+// ReadRefs returns an atomic load of refCount. The returned count is
+// inherently racy and is unsafe to use without external synchronization.
+func (r *rootInodeRefs) ReadRefs() int64 {
+	return atomic.LoadInt64(&r.refCount)
+}
+
+// IncRef implements refs.RefCounter.IncRef. It panics if the count was not positive beforehand.
+//
+//go:nosplit
+func (r *rootInodeRefs) IncRef() {
+	v := atomic.AddInt64(&r.refCount, 1) // v is the count after the increment
+	if rootInodeenableLogging {
+		refsvfs2.LogIncRef(r, v)
+	}
+	if v <= 1 { // the count before the increment was zero or negative
+		panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType()))
+	}
+}
+
+// TryIncRef implements refs.TryRefCounter.TryIncRef.
+//
+// To do this safely without a loop, a speculative reference is first acquired
+// on the object. This allows multiple concurrent TryIncRef calls to distinguish
+// other TryIncRef calls from genuine references held.
+//
+//go:nosplit
+func (r *rootInodeRefs) TryIncRef() bool {
+	const speculativeRef = 1 << 32 // one unit in the upper (speculative) half of refCount
+	if v := atomic.AddInt64(&r.refCount, speculativeRef); int32(v) == 0 { // lower half (real references) is zero
+
+		atomic.AddInt64(&r.refCount, -speculativeRef) // undo the speculative reference and fail
+		return false
+	}
+
+	v := atomic.AddInt64(&r.refCount, -speculativeRef+1) // swap the speculative reference for a real one
+	if rootInodeenableLogging {
+		refsvfs2.LogTryIncRef(r, v)
+	}
+	return true
+}
+
+// DecRef implements refs.RefCounter.DecRef. destroy, if non-nil, runs when the count reaches zero.
+//
+// Note that speculative references are counted here. Since they were added
+// prior to real references reaching zero, they will successfully convert to
+// real references. In other words, we see speculative references only in the
+// following case:
+//
+//	A: TryIncRef [speculative increase => sees non-negative references]
+//	B: DecRef [real decrease]
+//	A: TryIncRef [transform speculative to real]
+//
+//go:nosplit
+func (r *rootInodeRefs) DecRef(destroy func()) {
+	v := atomic.AddInt64(&r.refCount, -1) // v is the full 64-bit count after the decrement
+	if rootInodeenableLogging {
+		refsvfs2.LogDecRef(r, v)
+	}
+	switch {
+	case v < 0:
+		panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType()))
+
+	case v == 0: // no real and no speculative references remain
+		refsvfs2.Unregister(r)
+
+		if destroy != nil {
+			destroy()
+		}
+	}
+}
+
+func (r *rootInodeRefs) afterLoad() { // passed to AfterLoad by the generated StateLoad
+	if r.ReadRefs() > 0 { // only objects that still hold references are registered again
+		refsvfs2.Register(r)
+	}
+}
author	gVisor bot <gvisor-bot@google.com>	2021-10-21 23:10:52 +0000
committer	gVisor bot <gvisor-bot@google.com>	2021-10-21 23:10:52 +0000
commit	eff88efe81e2a2dcc0ee541e90755d4d21374315 (patch)
tree	967638a38d03d2c6e778cab057a6558d71e4ec2c /pkg/sentry/fsimpl/mqfs
parent	ac2e48668b599b3d3b0b4f5c4453b61773cae6fd (diff)
parent	14f4113924c8b7b8c161be7335b147106d0c4a26 (diff)