summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorgVisor bot <gvisor-bot@google.com>2021-10-21 23:10:52 +0000
committergVisor bot <gvisor-bot@google.com>2021-10-21 23:10:52 +0000
commiteff88efe81e2a2dcc0ee541e90755d4d21374315 (patch)
tree967638a38d03d2c6e778cab057a6558d71e4ec2c
parentac2e48668b599b3d3b0b4f5c4453b61773cae6fd (diff)
parent14f4113924c8b7b8c161be7335b147106d0c4a26 (diff)
Merge release-20211011.0-59-g14f411392 (automated)
-rw-r--r--pkg/abi/linux/fs.go1
-rw-r--r--pkg/abi/linux/linux_abi_autogen_unsafe.go107
-rw-r--r--pkg/abi/linux/mqueue.go55
-rw-r--r--pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go5
-rw-r--r--pkg/sentry/fsimpl/mqfs/mqfs.go138
-rw-r--r--pkg/sentry/fsimpl/mqfs/mqfs_state_autogen.go263
-rw-r--r--pkg/sentry/fsimpl/mqfs/queue.go145
-rw-r--r--pkg/sentry/fsimpl/mqfs/registry.go176
-rw-r--r--pkg/sentry/fsimpl/mqfs/root.go89
-rw-r--r--pkg/sentry/fsimpl/mqfs/root_inode_refs.go140
-rw-r--r--pkg/sentry/kernel/context.go6
-rw-r--r--pkg/sentry/kernel/ipc/ns.go22
-rw-r--r--pkg/sentry/kernel/ipc_namespace.go38
-rw-r--r--pkg/sentry/kernel/kernel.go10
-rw-r--r--pkg/sentry/kernel/kernel_state_autogen.go3
-rw-r--r--pkg/sentry/kernel/mq/message_list.go221
-rw-r--r--pkg/sentry/kernel/mq/mq.go457
-rw-r--r--pkg/sentry/kernel/mq/mq_state_autogen.go211
-rw-r--r--pkg/sentry/kernel/task_clone.go6
-rw-r--r--pkg/sentry/kernel/task_context.go3
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/mq.go98
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/vfs2.go5
-rw-r--r--runsc/boot/vfs.go5
23 files changed, 2196 insertions, 8 deletions
diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go
index cad24fcc7..edc90e54c 100644
--- a/pkg/abi/linux/fs.go
+++ b/pkg/abi/linux/fs.go
@@ -23,6 +23,7 @@ const (
DEVPTS_SUPER_MAGIC = 0x00001cd1
EXT_SUPER_MAGIC = 0xef53
FUSE_SUPER_MAGIC = 0x65735546
+ MQUEUE_MAGIC = 0x19800202
OVERLAYFS_SUPER_MAGIC = 0x794c7630
PIPEFS_MAGIC = 0x50495045
PROC_SUPER_MAGIC = 0x9fa0
diff --git a/pkg/abi/linux/linux_abi_autogen_unsafe.go b/pkg/abi/linux/linux_abi_autogen_unsafe.go
index cd5786d12..b71bd1432 100644
--- a/pkg/abi/linux/linux_abi_autogen_unsafe.go
+++ b/pkg/abi/linux/linux_abi_autogen_unsafe.go
@@ -79,6 +79,7 @@ var _ marshal.Marshallable = (*KernelIP6TGetEntries)(nil)
var _ marshal.Marshallable = (*KernelIPTEntry)(nil)
var _ marshal.Marshallable = (*KernelIPTGetEntries)(nil)
var _ marshal.Marshallable = (*Linger)(nil)
+var _ marshal.Marshallable = (*MqAttr)(nil)
var _ marshal.Marshallable = (*MsgBuf)(nil)
var _ marshal.Marshallable = (*MsgInfo)(nil)
var _ marshal.Marshallable = (*MsqidDS)(nil)
@@ -5266,6 +5267,112 @@ func (n *NumaPolicy) WriteTo(w io.Writer) (int64, error) {
return int64(length), err
}
+// SizeBytes implements marshal.Marshallable.SizeBytes.
+func (m *MqAttr) SizeBytes() int {
+ return 32 +
+ 8*4
+}
+
+// MarshalBytes implements marshal.Marshallable.MarshalBytes.
+func (m *MqAttr) MarshalBytes(dst []byte) {
+ hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MqFlags))
+ dst = dst[8:]
+ hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MqMaxmsg))
+ dst = dst[8:]
+ hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MqMsgsize))
+ dst = dst[8:]
+ hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MqCurmsgs))
+ dst = dst[8:]
+ // Padding: dst[:sizeof(int64)*4] ~= [4]int64{0}
+ dst = dst[8*(4):]
+}
+
+// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
+func (m *MqAttr) UnmarshalBytes(src []byte) {
+ m.MqFlags = int64(hostarch.ByteOrder.Uint64(src[:8]))
+ src = src[8:]
+ m.MqMaxmsg = int64(hostarch.ByteOrder.Uint64(src[:8]))
+ src = src[8:]
+ m.MqMsgsize = int64(hostarch.ByteOrder.Uint64(src[:8]))
+ src = src[8:]
+ m.MqCurmsgs = int64(hostarch.ByteOrder.Uint64(src[:8]))
+ src = src[8:]
+ // Padding: ~ copy([4]int64(m._), src[:sizeof(int64)*4])
+ src = src[8*(4):]
+}
+
+// Packed implements marshal.Marshallable.Packed.
+//go:nosplit
+func (m *MqAttr) Packed() bool {
+ return true
+}
+
+// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
+func (m *MqAttr) MarshalUnsafe(dst []byte) {
+ gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(m), uintptr(m.SizeBytes()))
+}
+
+// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
+func (m *MqAttr) UnmarshalUnsafe(src []byte) {
+ gohacks.Memmove(unsafe.Pointer(m), unsafe.Pointer(&src[0]), uintptr(m.SizeBytes()))
+}
+
+// CopyOutN implements marshal.Marshallable.CopyOutN.
+//go:nosplit
+func (m *MqAttr) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) {
+ // Construct a slice backed by dst's underlying memory.
+ var buf []byte
+ hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))
+ hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m)))
+ hdr.Len = m.SizeBytes()
+ hdr.Cap = m.SizeBytes()
+
+ length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay.
+ // Since we bypassed the compiler's escape analysis, indicate that m
+ // must live until the use above.
+ runtime.KeepAlive(m) // escapes: replaced by intrinsic.
+ return length, err
+}
+
+// CopyOut implements marshal.Marshallable.CopyOut.
+//go:nosplit
+func (m *MqAttr) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) {
+ return m.CopyOutN(cc, addr, m.SizeBytes())
+}
+
+// CopyIn implements marshal.Marshallable.CopyIn.
+//go:nosplit
+func (m *MqAttr) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) {
+ // Construct a slice backed by dst's underlying memory.
+ var buf []byte
+ hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))
+ hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m)))
+ hdr.Len = m.SizeBytes()
+ hdr.Cap = m.SizeBytes()
+
+ length, err := cc.CopyInBytes(addr, buf) // escapes: okay.
+ // Since we bypassed the compiler's escape analysis, indicate that m
+ // must live until the use above.
+ runtime.KeepAlive(m) // escapes: replaced by intrinsic.
+ return length, err
+}
+
+// WriteTo implements io.WriterTo.WriteTo.
+func (m *MqAttr) WriteTo(writer io.Writer) (int64, error) {
+ // Construct a slice backed by dst's underlying memory.
+ var buf []byte
+ hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf))
+ hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m)))
+ hdr.Len = m.SizeBytes()
+ hdr.Cap = m.SizeBytes()
+
+ length, err := writer.Write(buf)
+ // Since we bypassed the compiler's escape analysis, indicate that m
+ // must live until the use above.
+ runtime.KeepAlive(m) // escapes: replaced by intrinsic.
+ return int64(length), err
+}
+
// Packed implements marshal.Marshallable.Packed.
//go:nosplit
func (b *MsgBuf) Packed() bool {
diff --git a/pkg/abi/linux/mqueue.go b/pkg/abi/linux/mqueue.go
new file mode 100644
index 000000000..4988a2aa3
--- /dev/null
+++ b/pkg/abi/linux/mqueue.go
@@ -0,0 +1,55 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// Default values for POSIX message queues. Source:
+// include/linux/ipc_namespace.h
+const (
+ DFLT_QUEUESMAX = 256
+ MIN_MSGMAX = 1
+ DFLT_MSG uint = 10
+ DFLT_MSGMAX = 10
+ HARD_MSGMAX = 65536
+ MIN_MSGSIZEMAX = 128
+ DFLT_MSGSIZE uint = 8192
+ DFLT_MSGSIZEMAX = 8192
+ HARD_MSGSIZEMAX = (16 * 1024 * 1024)
+)
+
+// Maximum values for a message queue. Source: include/uapi/linux/mqueue.h
+const (
+ MQ_PRIO_MAX = 32768
+ MQ_BYTES_MAX = 819200
+)
+
+// Codes used by mq_notify. Source: include/uapi/linux/mqueue.h
+const (
+ NOTIFY_NONE = 0
+ NOTIFY_WOKENUP = 1
+ NOTIFY_REMOVED = 2
+
+ NOTIFY_COOKIE_LEN = 32
+)
+
+// MqAttr is equivelant to struct mq_attr. Source: include/uapi/linux/mqueue.h
+//
+// +marshal
+type MqAttr struct {
+ MqFlags int64 // Message queue flags.
+ MqMaxmsg int64 // Maximum number of messages.
+ MqMsgsize int64 // Maximum message size.
+ MqCurmsgs int64 // Number of messages currently queued.
+ _ [4]int64 // Ignored for input, zeroed for output.
+}
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 9d7526e47..652ade564 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -74,6 +74,11 @@ func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credent
return linuxerr.EPERM
}
+// Locks returns the file locks for this file.
+func (f *DynamicBytesFile) Locks() *vfs.FileLocks {
+ return &f.locks
+}
+
// DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a
// DynamicBytesFile.
//
diff --git a/pkg/sentry/fsimpl/mqfs/mqfs.go b/pkg/sentry/fsimpl/mqfs/mqfs.go
new file mode 100644
index 000000000..c2b53c9d0
--- /dev/null
+++ b/pkg/sentry/fsimpl/mqfs/mqfs.go
@@ -0,0 +1,138 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package mqfs provides a filesystem implementation to back POSIX message
+// queues.
+package mqfs
+
+import (
+ "fmt"
+ "strconv"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/ipc"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/mq"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+const (
+ // Name is the user-visible filesystem name.
+ Name = "mqueue"
+ defaultMaxCachedDentries = uint64(1000)
+)
+
+// FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
+type FilesystemType struct{}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+ return Name
+}
+
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ // mqfs is initialized only once per ipc namespace. Each ipc namespace has
+ // a POSIX message registry with a root dentry, filesystem, and a
+ // disconnected mount. We want the fs to be consistent for all processes in
+ // the same ipc namespace, so instead of creating a new fs and root dentry,
+ // we retreive them using IPCNamespace.PosixQueues and use them.
+
+ i := ipcNamespaceFromContext(ctx)
+ if i == nil {
+ return nil, nil, fmt.Errorf("mqfs.FilesystemType.GetFilesystem: ipc namespace doesn't exist")
+ }
+ defer i.DecRef(ctx)
+
+ registry := i.PosixQueues()
+ if registry == nil {
+ return nil, nil, fmt.Errorf("mqfs.FilesystemType.GetFilesystem: ipc namespace doesn't have a POSIX registry")
+ }
+ impl := registry.Impl().(*RegistryImpl)
+
+ maxCachedDentries, err := maxCachedDentries(ctx, vfs.GenericParseMountOptions(opts.Data))
+ if err != nil {
+ return nil, nil, err
+ }
+ impl.fs.MaxCachedDentries = maxCachedDentries
+
+ impl.fs.VFSFilesystem().IncRef()
+ return impl.fs.VFSFilesystem(), impl.root.VFSDentry(), nil
+}
+
+// maxCachedDentries checks mopts for dentry_cache_limit. If a value is
+// specified, parse it into uint64 and return it. Otherwise, return the default
+// value. An error is returned if a value is found but can't be parsed.
+func maxCachedDentries(ctx context.Context, mopts map[string]string) (_ uint64, err error) {
+ max := defaultMaxCachedDentries
+ if str, ok := mopts["dentry_cache_limit"]; ok {
+ delete(mopts, "dentry_cache_limit")
+ max, err = strconv.ParseUint(str, 10, 64)
+ if err != nil {
+ ctx.Warningf("mqfs.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
+ return 0, linuxerr.EINVAL
+ }
+ }
+ return max, nil
+}
+
+// filesystem implements kernfs.Filesystem.
+//
+// +stateify savable
+type filesystem struct {
+ kernfs.Filesystem
+ devMinor uint32
+
+ // root is the filesystem's root dentry. Since we take a reference on it in
+ // GetFilesystem, we should release it when the fs is released.
+ root *kernfs.Dentry
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+ fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+ fs.Filesystem.Release(ctx)
+}
+
+// MountOptions implements vfs.FilesystemImpl.MountOptions.
+func (fs *filesystem) MountOptions() string {
+ return fmt.Sprintf("dentry_cache_limit=%d", fs.MaxCachedDentries)
+}
+
+// ipcNamespace defines functions we need from kernel.IPCNamespace. We redefine
+// ipcNamespace along with ipcNamespaceFromContext to avoid circular dependency
+// with package sentry/kernel.
+type ipcNamespace interface {
+ // PosixQueues returns a POSIX message queue registry.
+ PosixQueues() *mq.Registry
+
+ // DecRef decrements ipcNamespace's number of references.
+ DecRef(ctx context.Context)
+}
+
+// ipcNamespaceFromContext returns the IPC namespace in which ctx is executing.
+// Copied from package sentry/kernel.
+func ipcNamespaceFromContext(ctx context.Context) ipcNamespace {
+ if v := ctx.Value(ipc.CtxIPCNamespace); v != nil {
+ return v.(ipcNamespace)
+ }
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/mqfs/mqfs_state_autogen.go b/pkg/sentry/fsimpl/mqfs/mqfs_state_autogen.go
new file mode 100644
index 000000000..d6154efc3
--- /dev/null
+++ b/pkg/sentry/fsimpl/mqfs/mqfs_state_autogen.go
@@ -0,0 +1,263 @@
+// automatically generated by stateify.
+
+package mqfs
+
+import (
+ "gvisor.dev/gvisor/pkg/state"
+)
+
+func (ft *FilesystemType) StateTypeName() string {
+ return "pkg/sentry/fsimpl/mqfs.FilesystemType"
+}
+
+func (ft *FilesystemType) StateFields() []string {
+ return []string{}
+}
+
+func (ft *FilesystemType) beforeSave() {}
+
+// +checklocksignore
+func (ft *FilesystemType) StateSave(stateSinkObject state.Sink) {
+ ft.beforeSave()
+}
+
+func (ft *FilesystemType) afterLoad() {}
+
+// +checklocksignore
+func (ft *FilesystemType) StateLoad(stateSourceObject state.Source) {
+}
+
+func (fs *filesystem) StateTypeName() string {
+ return "pkg/sentry/fsimpl/mqfs.filesystem"
+}
+
+func (fs *filesystem) StateFields() []string {
+ return []string{
+ "Filesystem",
+ "devMinor",
+ "root",
+ }
+}
+
+func (fs *filesystem) beforeSave() {}
+
+// +checklocksignore
+func (fs *filesystem) StateSave(stateSinkObject state.Sink) {
+ fs.beforeSave()
+ stateSinkObject.Save(0, &fs.Filesystem)
+ stateSinkObject.Save(1, &fs.devMinor)
+ stateSinkObject.Save(2, &fs.root)
+}
+
+func (fs *filesystem) afterLoad() {}
+
+// +checklocksignore
+func (fs *filesystem) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &fs.Filesystem)
+ stateSourceObject.Load(1, &fs.devMinor)
+ stateSourceObject.Load(2, &fs.root)
+}
+
+func (q *queueInode) StateTypeName() string {
+ return "pkg/sentry/fsimpl/mqfs.queueInode"
+}
+
+func (q *queueInode) StateFields() []string {
+ return []string{
+ "DynamicBytesFile",
+ "queue",
+ }
+}
+
+func (q *queueInode) beforeSave() {}
+
+// +checklocksignore
+func (q *queueInode) StateSave(stateSinkObject state.Sink) {
+ q.beforeSave()
+ stateSinkObject.Save(0, &q.DynamicBytesFile)
+ stateSinkObject.Save(1, &q.queue)
+}
+
+func (q *queueInode) afterLoad() {}
+
+// +checklocksignore
+func (q *queueInode) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &q.DynamicBytesFile)
+ stateSourceObject.Load(1, &q.queue)
+}
+
+func (fd *queueFD) StateTypeName() string {
+ return "pkg/sentry/fsimpl/mqfs.queueFD"
+}
+
+func (fd *queueFD) StateFields() []string {
+ return []string{
+ "FileDescriptionDefaultImpl",
+ "DynamicBytesFileDescriptionImpl",
+ "LockFD",
+ "vfsfd",
+ "inode",
+ "queue",
+ }
+}
+
+func (fd *queueFD) beforeSave() {}
+
+// +checklocksignore
+func (fd *queueFD) StateSave(stateSinkObject state.Sink) {
+ fd.beforeSave()
+ stateSinkObject.Save(0, &fd.FileDescriptionDefaultImpl)
+ stateSinkObject.Save(1, &fd.DynamicBytesFileDescriptionImpl)
+ stateSinkObject.Save(2, &fd.LockFD)
+ stateSinkObject.Save(3, &fd.vfsfd)
+ stateSinkObject.Save(4, &fd.inode)
+ stateSinkObject.Save(5, &fd.queue)
+}
+
+func (fd *queueFD) afterLoad() {}
+
+// +checklocksignore
+func (fd *queueFD) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &fd.FileDescriptionDefaultImpl)
+ stateSourceObject.Load(1, &fd.DynamicBytesFileDescriptionImpl)
+ stateSourceObject.Load(2, &fd.LockFD)
+ stateSourceObject.Load(3, &fd.vfsfd)
+ stateSourceObject.Load(4, &fd.inode)
+ stateSourceObject.Load(5, &fd.queue)
+}
+
+func (r *RegistryImpl) StateTypeName() string {
+ return "pkg/sentry/fsimpl/mqfs.RegistryImpl"
+}
+
+func (r *RegistryImpl) StateFields() []string {
+ return []string{
+ "root",
+ "fs",
+ "mount",
+ }
+}
+
+func (r *RegistryImpl) beforeSave() {}
+
+// +checklocksignore
+func (r *RegistryImpl) StateSave(stateSinkObject state.Sink) {
+ r.beforeSave()
+ stateSinkObject.Save(0, &r.root)
+ stateSinkObject.Save(1, &r.fs)
+ stateSinkObject.Save(2, &r.mount)
+}
+
+func (r *RegistryImpl) afterLoad() {}
+
+// +checklocksignore
+func (r *RegistryImpl) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &r.root)
+ stateSourceObject.Load(1, &r.fs)
+ stateSourceObject.Load(2, &r.mount)
+}
+
+func (i *rootInode) StateTypeName() string {
+ return "pkg/sentry/fsimpl/mqfs.rootInode"
+}
+
+func (i *rootInode) StateFields() []string {
+ return []string{
+ "rootInodeRefs",
+ "InodeAlwaysValid",
+ "InodeAttrs",
+ "InodeDirectoryNoNewChildren",
+ "InodeNotSymlink",
+ "InodeTemporary",
+ "OrderedChildren",
+ "implStatFS",
+ "locks",
+ }
+}
+
+func (i *rootInode) beforeSave() {}
+
+// +checklocksignore
+func (i *rootInode) StateSave(stateSinkObject state.Sink) {
+ i.beforeSave()
+ stateSinkObject.Save(0, &i.rootInodeRefs)
+ stateSinkObject.Save(1, &i.InodeAlwaysValid)
+ stateSinkObject.Save(2, &i.InodeAttrs)
+ stateSinkObject.Save(3, &i.InodeDirectoryNoNewChildren)
+ stateSinkObject.Save(4, &i.InodeNotSymlink)
+ stateSinkObject.Save(5, &i.InodeTemporary)
+ stateSinkObject.Save(6, &i.OrderedChildren)
+ stateSinkObject.Save(7, &i.implStatFS)
+ stateSinkObject.Save(8, &i.locks)
+}
+
+func (i *rootInode) afterLoad() {}
+
+// +checklocksignore
+func (i *rootInode) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &i.rootInodeRefs)
+ stateSourceObject.Load(1, &i.InodeAlwaysValid)
+ stateSourceObject.Load(2, &i.InodeAttrs)
+ stateSourceObject.Load(3, &i.InodeDirectoryNoNewChildren)
+ stateSourceObject.Load(4, &i.InodeNotSymlink)
+ stateSourceObject.Load(5, &i.InodeTemporary)
+ stateSourceObject.Load(6, &i.OrderedChildren)
+ stateSourceObject.Load(7, &i.implStatFS)
+ stateSourceObject.Load(8, &i.locks)
+}
+
+func (i *implStatFS) StateTypeName() string {
+ return "pkg/sentry/fsimpl/mqfs.implStatFS"
+}
+
+func (i *implStatFS) StateFields() []string {
+ return []string{}
+}
+
+func (i *implStatFS) beforeSave() {}
+
+// +checklocksignore
+func (i *implStatFS) StateSave(stateSinkObject state.Sink) {
+ i.beforeSave()
+}
+
+func (i *implStatFS) afterLoad() {}
+
+// +checklocksignore
+func (i *implStatFS) StateLoad(stateSourceObject state.Source) {
+}
+
+func (r *rootInodeRefs) StateTypeName() string {
+ return "pkg/sentry/fsimpl/mqfs.rootInodeRefs"
+}
+
+func (r *rootInodeRefs) StateFields() []string {
+ return []string{
+ "refCount",
+ }
+}
+
+func (r *rootInodeRefs) beforeSave() {}
+
+// +checklocksignore
+func (r *rootInodeRefs) StateSave(stateSinkObject state.Sink) {
+ r.beforeSave()
+ stateSinkObject.Save(0, &r.refCount)
+}
+
+// +checklocksignore
+func (r *rootInodeRefs) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &r.refCount)
+ stateSourceObject.AfterLoad(r.afterLoad)
+}
+
+func init() {
+ state.Register((*FilesystemType)(nil))
+ state.Register((*filesystem)(nil))
+ state.Register((*queueInode)(nil))
+ state.Register((*queueFD)(nil))
+ state.Register((*RegistryImpl)(nil))
+ state.Register((*rootInode)(nil))
+ state.Register((*implStatFS)(nil))
+ state.Register((*rootInodeRefs)(nil))
+}
diff --git a/pkg/sentry/fsimpl/mqfs/queue.go b/pkg/sentry/fsimpl/mqfs/queue.go
new file mode 100644
index 000000000..933dbc6ed
--- /dev/null
+++ b/pkg/sentry/fsimpl/mqfs/queue.go
@@ -0,0 +1,145 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mqfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/mq"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// queueInode represents an inode for a message queue (/dev/mqueue/[name]).
+//
+// +stateify savable
+type queueInode struct {
+ kernfs.DynamicBytesFile
+
+ // queue is the message queue backing this inode.
+ queue *mq.Queue
+}
+
+var _ kernfs.Inode = (*queueInode)(nil)
+
+// newQueueInode returns a new, initialized queueInode.
+func (fs *filesystem) newQueueInode(ctx context.Context, creds *auth.Credentials, q *mq.Queue, perm linux.FileMode) kernfs.Inode {
+ inode := &queueInode{queue: q}
+ inode.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), q, perm)
+ return inode
+}
+
+// Keep implements kernfs.Inode.Keep.
+func (q *queueInode) Keep() bool {
+ // Return true so that the fs keeps newly created dentries. This is done
+ // because inodes returned by root.Lookup are not temporary, they exist
+ // in the fs, and refer to message queues.
+ return true
+}
+
+// queueFD implements vfs.FileDescriptionImpl for FD backed by a POSIX message
+// queue. It's mostly similar to DynamicBytesFD, but implements more operations.
+//
+// +stateify savable
+type queueFD struct {
+ vfs.FileDescriptionDefaultImpl
+ vfs.DynamicBytesFileDescriptionImpl
+ vfs.LockFD
+
+ vfsfd vfs.FileDescription
+ inode kernfs.Inode
+
+ // queue is a view into the queue backing this fd.
+ queue mq.View
+}
+
+// Init initializes a queueFD. Mostly copied from DynamicBytesFD.Init, but uses
+// the queueFD as FileDescriptionImpl.
+func (fd *queueFD) Init(m *vfs.Mount, d *kernfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
+ fd.LockFD.Init(locks)
+ if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
+ return err
+ }
+ fd.inode = d.Inode()
+ fd.SetDataSource(data)
+ return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *queueFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *queueFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *queueFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *queueFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return fd.DynamicBytesFileDescriptionImpl.Write(ctx, src, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *queueFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return fd.DynamicBytesFileDescriptionImpl.PWrite(ctx, src, offset, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *queueFD) Release(context.Context) {}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *queueFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+ return fd.inode.Stat(ctx, fs, opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *queueFD) SetStat(context.Context, vfs.SetStatOptions) error {
+ // DynamicBytesFiles are immutable.
+ return linuxerr.EPERM
+}
+
+// OnClose implements FileDescriptionImpl.OnClose similar to
+// ipc/mqueue.c::mqueue_flush_file.
+func (fd *queueFD) OnClose(ctx context.Context) error {
+ fd.queue.Flush(ctx)
+ return nil
+}
+
+// Readiness implements waiter.Waitable.Readiness similar to
+// ipc/mqueue.c::mqueue_poll_file.
+func (fd *queueFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return fd.queue.Readiness(mask)
+}
+
+// EventRegister implements Waitable.EventRegister.
+func (fd *queueFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ fd.queue.EventRegister(e, mask)
+}
+
+// EventUnregister implements Waitable.EventUnregister.
+func (fd *queueFD) EventUnregister(e *waiter.Entry) {
+ fd.queue.EventUnregister(e)
+}
diff --git a/pkg/sentry/fsimpl/mqfs/registry.go b/pkg/sentry/fsimpl/mqfs/registry.go
new file mode 100644
index 000000000..c8fbe4d33
--- /dev/null
+++ b/pkg/sentry/fsimpl/mqfs/registry.go
@@ -0,0 +1,176 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mqfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/mq"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// RegistryImpl implements mq.RegistryImpl. It implements the interface using
+// the message queue filesystem, and is provided to mq.Registry at
+// initialization.
+//
+// RegistryImpl is not thread-safe, so it is the responsibility of the user
+// (the containing mq.Registry) to protect using a lock.
+//
+// +stateify savable
+type RegistryImpl struct {
+ // root is the root dentry of the mq filesystem. Its main usage is to
+ // retreive the root inode, which we use to add, remove, and lookup message
+ // queues.
+ //
+ // We hold a reference on root and release when the registry is destroyed.
+ root *kernfs.Dentry
+
+ // fs is the filesystem backing this registry, used mainly to initialize
+ // new inodes.
+ fs *filesystem
+
+ // mount is the mount point used for this filesystem.
+ mount *vfs.Mount
+}
+
+// NewRegistryImpl returns a new, initialized RegistryImpl, and takes a
+// reference on root.
+func NewRegistryImpl(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*RegistryImpl, error) {
+ devMinor, err := vfsObj.GetAnonBlockDevMinor()
+ if err != nil {
+ return nil, err
+ }
+
+ var dentry kernfs.Dentry
+ fs := &filesystem{
+ devMinor: devMinor,
+ root: &dentry,
+ }
+ fs.VFSFilesystem().Init(vfsObj, &FilesystemType{}, fs)
+ vfsfs := fs.VFSFilesystem()
+
+ dentry.InitRoot(&fs.Filesystem, fs.newRootInode(ctx, creds))
+ defer vfsfs.DecRef(ctx) // NewDisconnectedMount will obtain a ref on success.
+
+ mount, err := vfsObj.NewDisconnectedMount(vfsfs, dentry.VFSDentry(), &vfs.MountOptions{})
+ if err != nil {
+ return nil, err
+ }
+
+ return &RegistryImpl{
+ root: &dentry,
+ fs: fs,
+ mount: mount,
+ }, nil
+}
+
+// Get implements mq.RegistryImpl.Get.
+func (r *RegistryImpl) Get(ctx context.Context, name string, access mq.AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error) {
+ inode, err := r.lookup(ctx, name)
+ if err != nil {
+ return nil, false, nil
+ }
+
+ qInode := inode.(*queueInode)
+ if !qInode.queue.HasPermissions(auth.CredentialsFromContext(ctx), perm(access)) {
+ // "The queue exists, but the caller does not have permission to
+ // open it in the specified mode."
+ return nil, false, linuxerr.EACCES
+ }
+
+ fd, err := r.newFD(qInode.queue, qInode, access, block, flags)
+ if err != nil {
+ return nil, false, err
+ }
+ return fd, true, nil
+}
+
+// New implements mq.RegistryImpl.New.
+func (r *RegistryImpl) New(ctx context.Context, name string, q *mq.Queue, access mq.AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) {
+ root := r.root.Inode().(*rootInode)
+ qInode := r.fs.newQueueInode(ctx, auth.CredentialsFromContext(ctx), q, perm).(*queueInode)
+ err := root.Insert(name, qInode)
+ if err != nil {
+ return nil, err
+ }
+ return r.newFD(q, qInode, access, block, flags)
+}
+
+// Unlink implements mq.RegistryImpl.Unlink.
+func (r *RegistryImpl) Unlink(ctx context.Context, name string) error {
+ creds := auth.CredentialsFromContext(ctx)
+ if err := r.root.Inode().CheckPermissions(ctx, creds, vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+
+ root := r.root.Inode().(*rootInode)
+ inode, err := r.lookup(ctx, name)
+ if err != nil {
+ return err
+ }
+ return root.Unlink(ctx, name, inode)
+}
+
+// Destroy implements mq.RegistryImpl.Destroy.
+func (r *RegistryImpl) Destroy(ctx context.Context) {
+ r.root.DecRef(ctx)
+ r.mount.DecRef(ctx)
+}
+
+// lookup retreives a kernfs.Inode using a name.
+func (r *RegistryImpl) lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+ inode := r.root.Inode().(*rootInode)
+ lookup, err := inode.Lookup(ctx, name)
+ if err != nil {
+ return nil, err
+ }
+ return lookup, nil
+}
+
+// newFD returns a new file description created using the given queue and inode.
+func (r *RegistryImpl) newFD(q *mq.Queue, inode *queueInode, access mq.AccessType, block bool, flags uint32) (*vfs.FileDescription, error) {
+ view, err := mq.NewView(q, access, block)
+ if err != nil {
+ return nil, err
+ }
+
+ var dentry kernfs.Dentry
+ dentry.Init(&r.fs.Filesystem, inode)
+
+ fd := &queueFD{queue: view}
+ err = fd.Init(r.mount, &dentry, inode.queue, inode.Locks(), flags)
+ if err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// perm returns a permission mask created using given flags.
+func perm(access mq.AccessType) fs.PermMask {
+ switch access {
+ case mq.ReadWrite:
+ return fs.PermMask{Read: true, Write: true}
+ case mq.WriteOnly:
+ return fs.PermMask{Write: true}
+ case mq.ReadOnly:
+ return fs.PermMask{Read: true}
+ default:
+ return fs.PermMask{} // Can't happen, see NewView.
+ }
+}
diff --git a/pkg/sentry/fsimpl/mqfs/root.go b/pkg/sentry/fsimpl/mqfs/root.go
new file mode 100644
index 000000000..37b5749fb
--- /dev/null
+++ b/pkg/sentry/fsimpl/mqfs/root.go
@@ -0,0 +1,89 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mqfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// rootInode represents inode for filesystem's root directory (/dev/mqueue).
+//
+// +stateify savable
+type rootInode struct {
+ rootInodeRefs
+ kernfs.InodeAlwaysValid
+ kernfs.InodeAttrs
+ kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeNotSymlink
+ kernfs.InodeTemporary
+ kernfs.OrderedChildren
+ implStatFS
+
+ locks vfs.FileLocks
+}
+
+var _ kernfs.Inode = (*rootInode)(nil)
+
+// newRootInode returns a new, initialized rootInode.
+func (fs *filesystem) newRootInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
+ inode := &rootInode{}
+ inode.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555))
+ inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
+ inode.InitRefs()
+ return inode
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+ SeekEnd: kernfs.SeekEndZero,
+ })
+ if err != nil {
+ return nil, err
+ }
+ return fd.VFSFileDescription(), nil
+}
+
+// DecRef implements kernfs.Inode.DecRef.
+func (i *rootInode) DecRef(ctx context.Context) {
+ i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
+// Rename implements Inode.Rename and overrides OrderedChildren.Rename. mqueue
+// filesystem allows files to be unlinked, but not renamed.
+func (i *rootInode) Rename(ctx context.Context, oldname, newname string, child, dstDir kernfs.Inode) error {
+ return linuxerr.EPERM
+}
+
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
+func (*rootInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+ return linuxerr.EPERM
+}
+
+// implStatFS provides an implementation of kernfs.Inode.StatFS for message
+// queues to be embedded in inodes.
+//
+// +stateify savable
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+ return vfs.GenericStatFS(linux.MQUEUE_MAGIC), nil
+}
diff --git a/pkg/sentry/fsimpl/mqfs/root_inode_refs.go b/pkg/sentry/fsimpl/mqfs/root_inode_refs.go
new file mode 100644
index 000000000..7462467a5
--- /dev/null
+++ b/pkg/sentry/fsimpl/mqfs/root_inode_refs.go
@@ -0,0 +1,140 @@
+package mqfs
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/refsvfs2"
+)
+
+// enableLogging indicates whether reference-related events should be logged (with
+// stack traces). This is false by default and should only be set to true for
+// debugging purposes, as it can generate an extremely large amount of output
+// and drastically degrade performance.
+const rootInodeenableLogging = false
+
+// obj is used to customize logging. Note that we use a pointer to T so that
+// we do not copy the entire object when passed as a format parameter.
+var rootInodeobj *rootInode
+
+// Refs implements refs.RefCounter. It keeps a reference count using atomic
+// operations and calls the destructor when the count reaches zero.
+//
+// NOTE: Do not introduce additional fields to the Refs struct. It is used by
+// many filesystem objects, and we want to keep it as small as possible (i.e.,
+// the same size as using an int64 directly) to avoid taking up extra cache
+// space. In general, this template should not be extended at the cost of
+// performance. If it does not offer enough flexibility for a particular object
+// (example: b/187877947), we should implement the RefCounter/CheckedObject
+// interfaces manually.
+//
+// +stateify savable
+type rootInodeRefs struct {
+ // refCount is composed of two fields:
+ //
+ // [32-bit speculative references]:[32-bit real references]
+ //
+ // Speculative references are used for TryIncRef, to avoid a CompareAndSwap
+ // loop. See IncRef, DecRef and TryIncRef for details of how these fields are
+ // used.
+ refCount int64
+}
+
+// InitRefs initializes r with one reference and, if enabled, activates leak
+// checking.
+func (r *rootInodeRefs) InitRefs() {
+ atomic.StoreInt64(&r.refCount, 1)
+ refsvfs2.Register(r)
+}
+
+// RefType implements refsvfs2.CheckedObject.RefType.
+func (r *rootInodeRefs) RefType() string {
+ return fmt.Sprintf("%T", rootInodeobj)[1:]
+}
+
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
+func (r *rootInodeRefs) LeakMessage() string {
+ return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs())
+}
+
+// LogRefs implements refsvfs2.CheckedObject.LogRefs.
+func (r *rootInodeRefs) LogRefs() bool {
+ return rootInodeenableLogging
+}
+
+// ReadRefs returns the current number of references. The returned count is
+// inherently racy and is unsafe to use without external synchronization.
+func (r *rootInodeRefs) ReadRefs() int64 {
+ return atomic.LoadInt64(&r.refCount)
+}
+
+// IncRef implements refs.RefCounter.IncRef.
+//
+//go:nosplit
+func (r *rootInodeRefs) IncRef() {
+ v := atomic.AddInt64(&r.refCount, 1)
+ if rootInodeenableLogging {
+ refsvfs2.LogIncRef(r, v)
+ }
+ if v <= 1 {
+ panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType()))
+ }
+}
+
+// TryIncRef implements refs.TryRefCounter.TryIncRef.
+//
+// To do this safely without a loop, a speculative reference is first acquired
+// on the object. This allows multiple concurrent TryIncRef calls to distinguish
+// other TryIncRef calls from genuine references held.
+//
+//go:nosplit
+func (r *rootInodeRefs) TryIncRef() bool {
+ const speculativeRef = 1 << 32
+ if v := atomic.AddInt64(&r.refCount, speculativeRef); int32(v) == 0 {
+
+ atomic.AddInt64(&r.refCount, -speculativeRef)
+ return false
+ }
+
+ v := atomic.AddInt64(&r.refCount, -speculativeRef+1)
+ if rootInodeenableLogging {
+ refsvfs2.LogTryIncRef(r, v)
+ }
+ return true
+}
+
+// DecRef implements refs.RefCounter.DecRef.
+//
+// Note that speculative references are counted here. Since they were added
+// prior to real references reaching zero, they will successfully convert to
+// real references. In other words, we see speculative references only in the
+// following case:
+//
+// A: TryIncRef [speculative increase => sees non-negative references]
+// B: DecRef [real decrease]
+// A: TryIncRef [transform speculative to real]
+//
+//go:nosplit
+func (r *rootInodeRefs) DecRef(destroy func()) {
+ v := atomic.AddInt64(&r.refCount, -1)
+ if rootInodeenableLogging {
+ refsvfs2.LogDecRef(r, v)
+ }
+ switch {
+ case v < 0:
+ panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType()))
+
+ case v == 0:
+ refsvfs2.Unregister(r)
+
+ if destroy != nil {
+ destroy()
+ }
+ }
+}
+
+func (r *rootInodeRefs) afterLoad() {
+ if r.ReadRefs() > 0 {
+ refsvfs2.Register(r)
+ }
+}
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
index a8596410f..7e11c6580 100644
--- a/pkg/sentry/kernel/context.go
+++ b/pkg/sentry/kernel/context.go
@@ -16,6 +16,7 @@ package kernel
import (
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/ipc"
)
// contextID is the kernel package's type for context.Context.Value keys.
@@ -37,9 +38,6 @@ const (
// CtxUTSNamespace is a Context.Value key for a UTSNamespace.
CtxUTSNamespace
-
- // CtxIPCNamespace is a Context.Value key for a IPCNamespace.
- CtxIPCNamespace
)
// ContextCanTrace returns true if ctx is permitted to trace t, in the same sense
@@ -82,7 +80,7 @@ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace {
// or nil if there is no such IPC namespace. It takes a reference on the
// namespace.
func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace {
- if v := ctx.Value(CtxIPCNamespace); v != nil {
+ if v := ctx.Value(ipc.CtxIPCNamespace); v != nil {
return v.(*IPCNamespace)
}
return nil
diff --git a/pkg/sentry/kernel/ipc/ns.go b/pkg/sentry/kernel/ipc/ns.go
new file mode 100644
index 000000000..220c9eafb
--- /dev/null
+++ b/pkg/sentry/kernel/ipc/ns.go
@@ -0,0 +1,22 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipc
+
+type contextID int
+
+// CtxIPCNamespace is the context.Value key used to retreive an IPC namespace.
+// We define it here because it's needed in several packages, and is not
+// possible to use otherwise without causing a circular depenedency.
+const CtxIPCNamespace contextID = iota
diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go
index 0b101b1bb..50b4e015e 100644
--- a/pkg/sentry/kernel/ipc_namespace.go
+++ b/pkg/sentry/kernel/ipc_namespace.go
@@ -15,11 +15,16 @@
package kernel
import (
+ "fmt"
+
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/mqfs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/mq"
"gvisor.dev/gvisor/pkg/sentry/kernel/msgqueue"
"gvisor.dev/gvisor/pkg/sentry/kernel/semaphore"
"gvisor.dev/gvisor/pkg/sentry/kernel/shm"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
)
// IPCNamespace represents an IPC namespace.
@@ -31,9 +36,17 @@ type IPCNamespace struct {
// User namespace which owns this IPC namespace. Immutable.
userNS *auth.UserNamespace
+ // System V utilities.
queues *msgqueue.Registry
semaphores *semaphore.Registry
shms *shm.Registry
+
+ // posixQueues is a POSIX message queue registry.
+ //
+ // posixQueues is somewhat equivelant to Linux's ipc_namespace.mq_mnt.
+ // Unlike SysV utilities, mq.Registry is not map-based, but is backed by
+ // a virtual filesystem.
+ posixQueues *mq.Registry
}
// NewIPCNamespace creates a new IPC namespace.
@@ -63,10 +76,35 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry {
return i.shms
}
+// InitPosixQueues creates a new POSIX queue registry, and returns an error if
+// the registry was previously initialized.
+func (i *IPCNamespace) InitPosixQueues(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
+ if i.posixQueues != nil {
+ return fmt.Errorf("IPCNamespace.InitPosixQueues: already initialized")
+ }
+
+ impl, err := mqfs.NewRegistryImpl(ctx, vfsObj, creds)
+ if err != nil {
+ return err
+ }
+ i.posixQueues = mq.NewRegistry(i.userNS, impl)
+ return nil
+}
+
+// PosixQueues returns the posix message queue registry for this namespace.
+//
+// Precondition: i.InitPosixQueues must have been called.
+func (i *IPCNamespace) PosixQueues() *mq.Registry {
+ return i.posixQueues
+}
+
// DecRef implements refsvfs2.RefCounter.DecRef.
func (i *IPCNamespace) DecRef(ctx context.Context) {
i.IPCNamespaceRefs.DecRef(func() {
i.shms.Release(ctx)
+ if i.posixQueues != nil {
+ i.posixQueues.Destroy(ctx)
+ }
})
}
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 5dc821a48..d4851ccda 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -58,6 +58,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/ipc"
"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
@@ -407,6 +408,11 @@ func (k *Kernel) Init(args InitKernelArgs) error {
return fmt.Errorf("failed to initialize VFS: %v", err)
}
+ err := k.rootIPCNamespace.InitPosixQueues(ctx, &k.vfs, auth.CredentialsFromContext(ctx))
+ if err != nil {
+ return fmt.Errorf("failed to create mqfs filesystem: %v", err)
+ }
+
pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs)
if err != nil {
return fmt.Errorf("failed to create pipefs filesystem: %v", err)
@@ -837,7 +843,7 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
return ctx.args.PIDNamespace
case CtxUTSNamespace:
return ctx.args.UTSNamespace
- case CtxIPCNamespace:
+ case ipc.CtxIPCNamespace:
ipcns := ctx.args.IPCNamespace
ipcns.IncRef()
return ipcns
@@ -1665,7 +1671,7 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
return ctx.k.tasks.Root
case CtxUTSNamespace:
return ctx.k.rootUTSNamespace
- case CtxIPCNamespace:
+ case ipc.CtxIPCNamespace:
ipcns := ctx.k.rootIPCNamespace
ipcns.IncRef()
return ipcns
diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go
index ef8dd88f6..303ee51b1 100644
--- a/pkg/sentry/kernel/kernel_state_autogen.go
+++ b/pkg/sentry/kernel/kernel_state_autogen.go
@@ -346,6 +346,7 @@ func (i *IPCNamespace) StateFields() []string {
"queues",
"semaphores",
"shms",
+ "posixQueues",
}
}
@@ -359,6 +360,7 @@ func (i *IPCNamespace) StateSave(stateSinkObject state.Sink) {
stateSinkObject.Save(2, &i.queues)
stateSinkObject.Save(3, &i.semaphores)
stateSinkObject.Save(4, &i.shms)
+ stateSinkObject.Save(5, &i.posixQueues)
}
func (i *IPCNamespace) afterLoad() {}
@@ -370,6 +372,7 @@ func (i *IPCNamespace) StateLoad(stateSourceObject state.Source) {
stateSourceObject.Load(2, &i.queues)
stateSourceObject.Load(3, &i.semaphores)
stateSourceObject.Load(4, &i.shms)
+ stateSourceObject.Load(5, &i.posixQueues)
}
func (r *IPCNamespaceRefs) StateTypeName() string {
diff --git a/pkg/sentry/kernel/mq/message_list.go b/pkg/sentry/kernel/mq/message_list.go
new file mode 100644
index 000000000..a5874c5b1
--- /dev/null
+++ b/pkg/sentry/kernel/mq/message_list.go
@@ -0,0 +1,221 @@
+package mq
+
+// ElementMapper provides an identity mapping by default.
+//
+// This can be replaced to provide a struct that maps elements to linker
+// objects, if they are not the same. An ElementMapper is not typically
+// required if: Linker is left as is, Element is left as is, or Linker and
+// Element are the same type.
+type msgElementMapper struct{}
+
+// linkerFor maps an Element to a Linker.
+//
+// This default implementation should be inlined.
+//
+//go:nosplit
+func (msgElementMapper) linkerFor(elem *Message) *Message { return elem }
+
+// List is an intrusive list. Entries can be added to or removed from the list
+// in O(1) time and with no additional memory allocations.
+//
+// The zero value for List is an empty list ready to use.
+//
+// To iterate over a list (where l is a List):
+// for e := l.Front(); e != nil; e = e.Next() {
+// // do something with e.
+// }
+//
+// +stateify savable
+type msgList struct {
+ head *Message
+ tail *Message
+}
+
+// Reset resets list l to the empty state.
+func (l *msgList) Reset() {
+ l.head = nil
+ l.tail = nil
+}
+
+// Empty returns true iff the list is empty.
+//
+//go:nosplit
+func (l *msgList) Empty() bool {
+ return l.head == nil
+}
+
+// Front returns the first element of list l or nil.
+//
+//go:nosplit
+func (l *msgList) Front() *Message {
+ return l.head
+}
+
+// Back returns the last element of list l or nil.
+//
+//go:nosplit
+func (l *msgList) Back() *Message {
+ return l.tail
+}
+
+// Len returns the number of elements in the list.
+//
+// NOTE: This is an O(n) operation.
+//
+//go:nosplit
+func (l *msgList) Len() (count int) {
+ for e := l.Front(); e != nil; e = (msgElementMapper{}.linkerFor(e)).Next() {
+ count++
+ }
+ return count
+}
+
+// PushFront inserts the element e at the front of list l.
+//
+//go:nosplit
+func (l *msgList) PushFront(e *Message) {
+ linker := msgElementMapper{}.linkerFor(e)
+ linker.SetNext(l.head)
+ linker.SetPrev(nil)
+ if l.head != nil {
+ msgElementMapper{}.linkerFor(l.head).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+
+ l.head = e
+}
+
+// PushBack inserts the element e at the back of list l.
+//
+//go:nosplit
+func (l *msgList) PushBack(e *Message) {
+ linker := msgElementMapper{}.linkerFor(e)
+ linker.SetNext(nil)
+ linker.SetPrev(l.tail)
+ if l.tail != nil {
+ msgElementMapper{}.linkerFor(l.tail).SetNext(e)
+ } else {
+ l.head = e
+ }
+
+ l.tail = e
+}
+
+// PushBackList inserts list m at the end of list l, emptying m.
+//
+//go:nosplit
+func (l *msgList) PushBackList(m *msgList) {
+ if l.head == nil {
+ l.head = m.head
+ l.tail = m.tail
+ } else if m.head != nil {
+ msgElementMapper{}.linkerFor(l.tail).SetNext(m.head)
+ msgElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
+
+ l.tail = m.tail
+ }
+ m.head = nil
+ m.tail = nil
+}
+
+// InsertAfter inserts e after b.
+//
+//go:nosplit
+func (l *msgList) InsertAfter(b, e *Message) {
+ bLinker := msgElementMapper{}.linkerFor(b)
+ eLinker := msgElementMapper{}.linkerFor(e)
+
+ a := bLinker.Next()
+
+ eLinker.SetNext(a)
+ eLinker.SetPrev(b)
+ bLinker.SetNext(e)
+
+ if a != nil {
+ msgElementMapper{}.linkerFor(a).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+}
+
+// InsertBefore inserts e before a.
+//
+//go:nosplit
+func (l *msgList) InsertBefore(a, e *Message) {
+ aLinker := msgElementMapper{}.linkerFor(a)
+ eLinker := msgElementMapper{}.linkerFor(e)
+
+ b := aLinker.Prev()
+ eLinker.SetNext(a)
+ eLinker.SetPrev(b)
+ aLinker.SetPrev(e)
+
+ if b != nil {
+ msgElementMapper{}.linkerFor(b).SetNext(e)
+ } else {
+ l.head = e
+ }
+}
+
+// Remove removes e from l.
+//
+//go:nosplit
+func (l *msgList) Remove(e *Message) {
+ linker := msgElementMapper{}.linkerFor(e)
+ prev := linker.Prev()
+ next := linker.Next()
+
+ if prev != nil {
+ msgElementMapper{}.linkerFor(prev).SetNext(next)
+ } else if l.head == e {
+ l.head = next
+ }
+
+ if next != nil {
+ msgElementMapper{}.linkerFor(next).SetPrev(prev)
+ } else if l.tail == e {
+ l.tail = prev
+ }
+
+ linker.SetNext(nil)
+ linker.SetPrev(nil)
+}
+
+// Entry is a default implementation of Linker. Users can add anonymous fields
+// of this type to their structs to make them automatically implement the
+// methods needed by List.
+//
+// +stateify savable
+type msgEntry struct {
+ next *Message
+ prev *Message
+}
+
+// Next returns the entry that follows e in the list.
+//
+//go:nosplit
+func (e *msgEntry) Next() *Message {
+ return e.next
+}
+
+// Prev returns the entry that precedes e in the list.
+//
+//go:nosplit
+func (e *msgEntry) Prev() *Message {
+ return e.prev
+}
+
+// SetNext assigns 'entry' as the entry that follows e in the list.
+//
+//go:nosplit
+func (e *msgEntry) SetNext(elem *Message) {
+ e.next = elem
+}
+
+// SetPrev assigns 'entry' as the entry that precedes e in the list.
+//
+//go:nosplit
+func (e *msgEntry) SetPrev(elem *Message) {
+ e.prev = elem
+}
diff --git a/pkg/sentry/kernel/mq/mq.go b/pkg/sentry/kernel/mq/mq.go
new file mode 100644
index 000000000..50ca6d34a
--- /dev/null
+++ b/pkg/sentry/kernel/mq/mq.go
@@ -0,0 +1,457 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package mq provides an implementation for POSIX message queues.
+package mq
+
+import (
+ "bytes"
+ "fmt"
+ "strings"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/errors/linuxerr"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// AccessType is the access type passed to mq_open.
+type AccessType int
+
+// Possible access types.
+const (
+ ReadOnly AccessType = iota
+ WriteOnly
+ ReadWrite
+)
+
+// MaxName is the maximum size for a queue name.
+const MaxName = 255
+
+const (
+ maxPriority = linux.MQ_PRIO_MAX - 1 // Highest possible message priority.
+
+ maxQueuesDefault = linux.DFLT_QUEUESMAX // Default max number of queues.
+
+ maxMsgDefault = linux.DFLT_MSG // Default max number of messages per queue.
+ maxMsgMin = linux.MIN_MSGMAX // Min value for max number of messages per queue.
+ maxMsgLimit = linux.DFLT_MSGMAX // Limit for max number of messages per queue.
+ maxMsgHardLimit = linux.HARD_MSGMAX // Hard limit for max number of messages per queue.
+
+ msgSizeDefault = linux.DFLT_MSGSIZE // Default max message size.
+ msgSizeMin = linux.MIN_MSGSIZEMAX // Min value for max message size.
+ msgSizeLimit = linux.DFLT_MSGSIZEMAX // Limit for max message size.
+ msgSizeHardLimit = linux.HARD_MSGSIZEMAX // Hard limit for max message size.
+)
+
+// Registry is a POSIX message queue registry.
+//
+// Unlike SysV utilities, Registry is not map-based. It uses a provided
+// RegistryImpl backed by a virtual filesystem to implement registry operations.
+//
+// +stateify savable
+type Registry struct {
+ // userNS is the user namespace containing this registry. Immutable.
+ userNS *auth.UserNamespace
+
+ // mu protects all fields below.
+ mu sync.Mutex `state:"nosave"`
+
+ // impl is an implementation of several message queue utilities needed by
+ // the registry. impl should be provided by mqfs.
+ impl RegistryImpl
+}
+
+// RegistryImpl defines utilities needed by a Registry to provide actual
+// registry implementation. It works mainly as an abstraction layer used by
+// Registry to avoid dealing directly with the filesystem. RegistryImpl should
+// be implemented by mqfs and provided to Registry at initialization.
+type RegistryImpl interface {
+ // Get searchs for a queue with the given name, if it exists, the queue is
+ // used to create a new FD, return it and return true. If the queue doesn't
+ // exist, return false and no error. An error is returned if creation fails.
+ Get(ctx context.Context, name string, access AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error)
+
+ // New creates a new inode and file description using the given queue,
+ // inserts the inode into the filesystem tree using the given name, and
+ // returns the file description. An error is returned if creation fails, or
+ // if the name already exists.
+ New(ctx context.Context, name string, q *Queue, access AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error)
+
+ // Unlink removes the queue with given name from the registry, and returns
+ // an error if the name doesn't exist.
+ Unlink(ctx context.Context, name string) error
+
+ // Destroy destroys the registry.
+ Destroy(context.Context)
+}
+
+// NewRegistry returns a new, initialized message queue registry. NewRegistry
+// should be called when a new message queue filesystem is created, once per
+// IPCNamespace.
+func NewRegistry(userNS *auth.UserNamespace, impl RegistryImpl) *Registry {
+ return &Registry{
+ userNS: userNS,
+ impl: impl,
+ }
+}
+
+// OpenOpts holds the options passed to FindOrCreate.
+type OpenOpts struct {
+ Name string
+ Access AccessType
+ Create bool
+ Exclusive bool
+ Block bool
+}
+
+// FindOrCreate creates a new POSIX message queue or opens an existing queue.
+// See mq_open(2).
+func (r *Registry) FindOrCreate(ctx context.Context, opts OpenOpts, perm linux.FileMode, attr *linux.MqAttr) (*vfs.FileDescription, error) {
+ // mq_overview(7) mentions that: "Each message queue is identified by a name
+ // of the form '/somename'", but the mq_open(3) man pages mention:
+ // "The mq_open() library function is implemented on top of a system call
+ // of the same name. The library function performs the check that the
+ // name starts with a slash (/), giving the EINVAL error if it does not.
+ // The kernel system call expects name to contain no preceding slash, so
+ // the C library function passes name without the preceding slash (i.e.,
+ // name+1) to the system call."
+ // So we don't need to check it.
+
+ if len(opts.Name) == 0 {
+ return nil, linuxerr.ENOENT
+ }
+ if len(opts.Name) > MaxName {
+ return nil, linuxerr.ENAMETOOLONG
+ }
+ if strings.ContainsRune(opts.Name, '/') {
+ return nil, linuxerr.EACCES
+ }
+ if opts.Name == "." || opts.Name == ".." {
+ return nil, linuxerr.EINVAL
+ }
+
+ // Construct status flags.
+ var flags uint32
+ if opts.Block {
+ flags = linux.O_NONBLOCK
+ }
+ switch opts.Access {
+ case ReadOnly:
+ flags = flags | linux.O_RDONLY
+ case WriteOnly:
+ flags = flags | linux.O_WRONLY
+ case ReadWrite:
+ flags = flags | linux.O_RDWR
+ }
+
+ r.mu.Lock()
+ defer r.mu.Unlock()
+ fd, ok, err := r.impl.Get(ctx, opts.Name, opts.Access, opts.Block, flags)
+ if err != nil {
+ return nil, err
+ }
+
+ if ok {
+ if opts.Create && opts.Exclusive {
+ // "Both O_CREAT and O_EXCL were specified in oflag, but a queue
+ // with this name already exists."
+ return nil, linuxerr.EEXIST
+ }
+ return fd, nil
+ }
+
+ if !opts.Create {
+ // "The O_CREAT flag was not specified in oflag, and no queue with this name
+ // exists."
+ return nil, linuxerr.ENOENT
+ }
+
+ q, err := r.newQueueLocked(auth.CredentialsFromContext(ctx), fs.FileOwnerFromContext(ctx), fs.FilePermsFromMode(perm), attr)
+ if err != nil {
+ return nil, err
+ }
+ return r.impl.New(ctx, opts.Name, q, opts.Access, opts.Block, perm, flags)
+}
+
+// newQueueLocked creates a new queue using the given attributes. If attr is nil
+// return a queue with default values, otherwise use attr to create a new queue,
+// and return an error if attributes are invalid.
+func (r *Registry) newQueueLocked(creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions, attr *linux.MqAttr) (*Queue, error) {
+ if attr == nil {
+ return &Queue{
+ owner: owner,
+ perms: perms,
+ maxMessageCount: int64(maxMsgDefault),
+ maxMessageSize: uint64(msgSizeDefault),
+ }, nil
+ }
+
+ // "O_CREAT was specified in oflag, and attr was not NULL, but
+ // attr->mq_maxmsg or attr->mq_msqsize was invalid. Both of these fields
+ // these fields must be greater than zero. In a process that is
+ // unprivileged (does not have the CAP_SYS_RESOURCE capability),
+ // attr->mq_maxmsg must be less than or equal to the msg_max limit, and
+ // attr->mq_msgsize must be less than or equal to the msgsize_max limit.
+ // In addition, even in a privileged process, attr->mq_maxmsg cannot
+ // exceed the HARD_MAX limit." - man mq_open(3).
+ if attr.MqMaxmsg <= 0 || attr.MqMsgsize <= 0 {
+ return nil, linuxerr.EINVAL
+ }
+
+ if attr.MqMaxmsg > maxMsgHardLimit || (!creds.HasCapabilityIn(linux.CAP_SYS_RESOURCE, r.userNS) && (attr.MqMaxmsg > maxMsgLimit || attr.MqMsgsize > msgSizeLimit)) {
+ return nil, linuxerr.EINVAL
+ }
+
+ return &Queue{
+ owner: owner,
+ perms: perms,
+ maxMessageCount: attr.MqMaxmsg,
+ maxMessageSize: uint64(attr.MqMsgsize),
+ }, nil
+}
+
+// Remove removes the queue with the given name from the registry. See
+// mq_unlink(2).
+func (r *Registry) Remove(ctx context.Context, name string) error {
+ if len(name) > MaxName {
+ return linuxerr.ENAMETOOLONG
+ }
+
+ r.mu.Lock()
+ defer r.mu.Unlock()
+ return r.impl.Unlink(ctx, name)
+}
+
+// Destroy destroys the registry and releases all held references.
+func (r *Registry) Destroy(ctx context.Context) {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+ r.impl.Destroy(ctx)
+}
+
+// Impl returns RegistryImpl inside r.
+func (r *Registry) Impl() RegistryImpl {
+ return r.impl
+}
+
+// Queue represents a POSIX message queue.
+//
+// +stateify savable
+type Queue struct {
+ // owner is the registry's owner. Immutable.
+ owner fs.FileOwner
+
+ // perms is the registry's access permissions. Immutable.
+ perms fs.FilePermissions
+
+ // mu protects all the fields below.
+ mu sync.Mutex `state:"nosave"`
+
+ // senders is a queue of currently blocked senders. Senders are notified
+ // when space isi available in the queue for a new message.
+ senders waiter.Queue
+
+ // receivers is a queue of currently blocked receivers. Receivers are
+ // notified when a new message is inserted in the queue.
+ receivers waiter.Queue
+
+ // messages is a list of messages currently in the queue.
+ messages msgList
+
+ // subscriber represents a task registered to receive async notification
+ // from this queue.
+ subscriber *Subscriber
+
+ // messageCount is the number of messages currently in the queue.
+ messageCount int64
+
+ // maxMessageCount is the maximum number of messages that the queue can
+ // hold.
+ maxMessageCount int64
+
+ // maxMessageSize is the maximum size of a message held by the queue.
+ maxMessageSize uint64
+
+ // byteCount is the number of bytes of data in all messages in the queue.
+ byteCount uint64
+}
+
+// View is a view into a message queue. Views should only be used in file
+// descriptions, but not inodes, because we use inodes to retreive the actual
+// queue, and only FDs are responsible for providing user functionality.
+type View interface {
+ // TODO: Add Send and Receive when mq_timedsend(2) and mq_timedreceive(2)
+ // are implemented.
+
+ // Flush checks if the calling process has attached a notification request
+ // to this queue, if yes, then the request is removed, and another process
+ // can attach a request.
+ Flush(ctx context.Context)
+
+ waiter.Waitable
+}
+
+// ReaderWriter provides a send and receive view into a queue.
+type ReaderWriter struct {
+ *Queue
+
+ block bool
+}
+
+// Reader provides a send-only view into a queue.
+type Reader struct {
+ *Queue
+
+ block bool
+}
+
+// Writer provides a receive-only view into a queue.
+type Writer struct {
+ *Queue
+
+ block bool
+}
+
+// NewView creates a new view into a queue and returns it.
+func NewView(q *Queue, access AccessType, block bool) (View, error) {
+ switch access {
+ case ReadWrite:
+ return ReaderWriter{Queue: q, block: block}, nil
+ case WriteOnly:
+ return Writer{Queue: q, block: block}, nil
+ case ReadOnly:
+ return Reader{Queue: q, block: block}, nil
+ default:
+ // This case can't happen, due to O_RDONLY flag being 0 and O_WRONLY
+ // being 1, so one of them must be true.
+ return nil, linuxerr.EINVAL
+ }
+}
+
+// Message holds a message exchanged through a Queue via mq_timedsend(2) and
+// mq_timedreceive(2), and additional info relating to the message.
+//
+// +stateify savable
+type Message struct {
+ msgEntry
+
+ // Text is the message's sent content.
+ Text string
+
+ // Size is the message's size in bytes.
+ Size uint64
+
+ // Priority is the message's priority.
+ Priority uint32
+}
+
+// Subscriber represents a task registered for async notification from a Queue.
+//
+// +stateify savable
+type Subscriber struct {
+ // TODO: Add fields when mq_notify(2) is implemented.
+
+ // pid is the PID of the registered task.
+ pid int32
+}
+
+// Generate implements vfs.DynamicBytesSource.Generate. Queue is used as a
+// DynamicBytesSource for mqfs's queueInode.
+func (q *Queue) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+
+ var (
+ pid int32
+ method int
+ sigNumber int
+ )
+ if q.subscriber != nil {
+ pid = q.subscriber.pid
+ // TODO: add method and sigNumber when mq_notify(2) is implemented.
+ }
+
+ buf.WriteString(
+ fmt.Sprintf("QSIZE:%-10d NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
+ q.byteCount, method, sigNumber, pid),
+ )
+ return nil
+}
+
+// Flush implements View.Flush.
+func (q *Queue) Flush(ctx context.Context) {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+
+ pid, ok := context.ThreadGroupIDFromContext(ctx)
+ if ok {
+ if q.subscriber != nil && pid == q.subscriber.pid {
+ q.subscriber = nil
+ }
+ }
+}
+
+// Readiness implements Waitable.Readiness.
+func (q *Queue) Readiness(mask waiter.EventMask) waiter.EventMask {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+
+ events := waiter.EventMask(0)
+ if q.messageCount > 0 {
+ events |= waiter.ReadableEvents
+ }
+ if q.messageCount < q.maxMessageCount {
+ events |= waiter.WritableEvents
+ }
+ return events & mask
+}
+
+// EventRegister implements Waitable.EventRegister.
+func (q *Queue) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+
+ if mask&waiter.WritableEvents != 0 {
+ q.senders.EventRegister(e, waiter.EventOut)
+ }
+ if mask&waiter.ReadableEvents != 0 {
+ q.receivers.EventRegister(e, waiter.EventIn)
+ }
+}
+
+// EventUnregister implements Waitable.EventUnregister.
+func (q *Queue) EventUnregister(e *waiter.Entry) {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+
+ q.senders.EventUnregister(e)
+ q.receivers.EventUnregister(e)
+}
+
+// HasPermissions returns true if the given credentials meet the access
+// permissions required by the queue.
+func (q *Queue) HasPermissions(creds *auth.Credentials, req fs.PermMask) bool {
+ p := q.perms.Other
+ if q.owner.UID == creds.EffectiveKUID {
+ p = q.perms.User
+ } else if creds.InGroup(q.owner.GID) {
+ p = q.perms.Group
+ }
+ return p.SupersetOf(req)
+}
diff --git a/pkg/sentry/kernel/mq/mq_state_autogen.go b/pkg/sentry/kernel/mq/mq_state_autogen.go
new file mode 100644
index 000000000..4833c9b4e
--- /dev/null
+++ b/pkg/sentry/kernel/mq/mq_state_autogen.go
@@ -0,0 +1,211 @@
+// automatically generated by stateify.
+
+package mq
+
+import (
+ "gvisor.dev/gvisor/pkg/state"
+)
+
+func (l *msgList) StateTypeName() string {
+ return "pkg/sentry/kernel/mq.msgList"
+}
+
+func (l *msgList) StateFields() []string {
+ return []string{
+ "head",
+ "tail",
+ }
+}
+
+func (l *msgList) beforeSave() {}
+
+// +checklocksignore
+func (l *msgList) StateSave(stateSinkObject state.Sink) {
+ l.beforeSave()
+ stateSinkObject.Save(0, &l.head)
+ stateSinkObject.Save(1, &l.tail)
+}
+
+func (l *msgList) afterLoad() {}
+
+// +checklocksignore
+func (l *msgList) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &l.head)
+ stateSourceObject.Load(1, &l.tail)
+}
+
+func (e *msgEntry) StateTypeName() string {
+ return "pkg/sentry/kernel/mq.msgEntry"
+}
+
+func (e *msgEntry) StateFields() []string {
+ return []string{
+ "next",
+ "prev",
+ }
+}
+
+func (e *msgEntry) beforeSave() {}
+
+// +checklocksignore
+func (e *msgEntry) StateSave(stateSinkObject state.Sink) {
+ e.beforeSave()
+ stateSinkObject.Save(0, &e.next)
+ stateSinkObject.Save(1, &e.prev)
+}
+
+func (e *msgEntry) afterLoad() {}
+
+// +checklocksignore
+func (e *msgEntry) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &e.next)
+ stateSourceObject.Load(1, &e.prev)
+}
+
+func (r *Registry) StateTypeName() string {
+ return "pkg/sentry/kernel/mq.Registry"
+}
+
+func (r *Registry) StateFields() []string {
+ return []string{
+ "userNS",
+ "impl",
+ }
+}
+
+func (r *Registry) beforeSave() {}
+
+// +checklocksignore
+func (r *Registry) StateSave(stateSinkObject state.Sink) {
+ r.beforeSave()
+ stateSinkObject.Save(0, &r.userNS)
+ stateSinkObject.Save(1, &r.impl)
+}
+
+func (r *Registry) afterLoad() {}
+
+// +checklocksignore
+func (r *Registry) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &r.userNS)
+ stateSourceObject.Load(1, &r.impl)
+}
+
+func (q *Queue) StateTypeName() string {
+ return "pkg/sentry/kernel/mq.Queue"
+}
+
+func (q *Queue) StateFields() []string {
+ return []string{
+ "owner",
+ "perms",
+ "senders",
+ "receivers",
+ "messages",
+ "subscriber",
+ "messageCount",
+ "maxMessageCount",
+ "maxMessageSize",
+ "byteCount",
+ }
+}
+
+func (q *Queue) beforeSave() {}
+
+// +checklocksignore
+func (q *Queue) StateSave(stateSinkObject state.Sink) {
+ q.beforeSave()
+ stateSinkObject.Save(0, &q.owner)
+ stateSinkObject.Save(1, &q.perms)
+ stateSinkObject.Save(2, &q.senders)
+ stateSinkObject.Save(3, &q.receivers)
+ stateSinkObject.Save(4, &q.messages)
+ stateSinkObject.Save(5, &q.subscriber)
+ stateSinkObject.Save(6, &q.messageCount)
+ stateSinkObject.Save(7, &q.maxMessageCount)
+ stateSinkObject.Save(8, &q.maxMessageSize)
+ stateSinkObject.Save(9, &q.byteCount)
+}
+
+func (q *Queue) afterLoad() {}
+
+// +checklocksignore
+func (q *Queue) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &q.owner)
+ stateSourceObject.Load(1, &q.perms)
+ stateSourceObject.Load(2, &q.senders)
+ stateSourceObject.Load(3, &q.receivers)
+ stateSourceObject.Load(4, &q.messages)
+ stateSourceObject.Load(5, &q.subscriber)
+ stateSourceObject.Load(6, &q.messageCount)
+ stateSourceObject.Load(7, &q.maxMessageCount)
+ stateSourceObject.Load(8, &q.maxMessageSize)
+ stateSourceObject.Load(9, &q.byteCount)
+}
+
+func (m *Message) StateTypeName() string {
+ return "pkg/sentry/kernel/mq.Message"
+}
+
+func (m *Message) StateFields() []string {
+ return []string{
+ "msgEntry",
+ "Text",
+ "Size",
+ "Priority",
+ }
+}
+
+func (m *Message) beforeSave() {}
+
+// +checklocksignore
+func (m *Message) StateSave(stateSinkObject state.Sink) {
+ m.beforeSave()
+ stateSinkObject.Save(0, &m.msgEntry)
+ stateSinkObject.Save(1, &m.Text)
+ stateSinkObject.Save(2, &m.Size)
+ stateSinkObject.Save(3, &m.Priority)
+}
+
+func (m *Message) afterLoad() {}
+
+// +checklocksignore
+func (m *Message) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &m.msgEntry)
+ stateSourceObject.Load(1, &m.Text)
+ stateSourceObject.Load(2, &m.Size)
+ stateSourceObject.Load(3, &m.Priority)
+}
+
+func (s *Subscriber) StateTypeName() string {
+ return "pkg/sentry/kernel/mq.Subscriber"
+}
+
+func (s *Subscriber) StateFields() []string {
+ return []string{
+ "pid",
+ }
+}
+
+func (s *Subscriber) beforeSave() {}
+
+// +checklocksignore
+func (s *Subscriber) StateSave(stateSinkObject state.Sink) {
+ s.beforeSave()
+ stateSinkObject.Save(0, &s.pid)
+}
+
+func (s *Subscriber) afterLoad() {}
+
+// +checklocksignore
+func (s *Subscriber) StateLoad(stateSourceObject state.Source) {
+ stateSourceObject.Load(0, &s.pid)
+}
+
+func init() {
+ state.Register((*msgList)(nil))
+ state.Register((*msgEntry)(nil))
+ state.Register((*Registry)(nil))
+ state.Register((*Queue)(nil))
+ state.Register((*Message)(nil))
+ state.Register((*Subscriber)(nil))
+}
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index a6d8fb163..69a3227f0 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -103,6 +103,9 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
ipcns := t.IPCNamespace()
if args.Flags&linux.CLONE_NEWIPC != 0 {
ipcns = NewIPCNamespace(userns)
+ if VFS2Enabled {
+ ipcns.InitPosixQueues(t, t.k.VFS(), creds)
+ }
} else {
ipcns.IncRef()
}
@@ -464,6 +467,9 @@ func (t *Task) Unshare(flags int32) error {
// namespace"
t.ipcns.DecRef(t)
t.ipcns = NewIPCNamespace(creds.UserNamespace)
+ if VFS2Enabled {
+ t.ipcns.InitPosixQueues(t, t.k.VFS(), creds)
+ }
}
var oldFDTable *FDTable
if flags&linux.CLONE_FILES != 0 {
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index c82d9e82b..cb9bcd7c0 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/ipc"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
@@ -73,7 +74,7 @@ func (t *Task) contextValue(key interface{}, isTaskGoroutine bool) interface{} {
defer t.mu.Unlock()
}
return t.utsns
- case CtxIPCNamespace:
+ case ipc.CtxIPCNamespace:
if !isTaskGoroutine {
t.mu.Lock()
defer t.mu.Unlock()
diff --git a/pkg/sentry/syscalls/linux/vfs2/mq.go b/pkg/sentry/syscalls/linux/vfs2/mq.go
new file mode 100644
index 000000000..d5d81c6e2
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/mq.go
@@ -0,0 +1,98 @@
+// Copyright 2021 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/mq"
+)
+
+// MqOpen implements mq_open(2).
+func MqOpen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ nameAddr := args[0].Pointer()
+ flag := args[1].Int()
+ mode := args[2].ModeT()
+ attrAddr := args[3].Pointer()
+
+ name, err := t.CopyInString(nameAddr, mq.MaxName)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ rOnly := flag&linux.O_RDONLY == linux.O_RDONLY
+ wOnly := flag&linux.O_WRONLY == linux.O_WRONLY
+ readWrite := flag&linux.O_RDWR == linux.O_RDWR
+
+ create := flag&linux.O_CREAT == linux.O_CREAT
+ exclusive := flag&linux.O_EXCL == linux.O_EXCL
+ block := flag&linux.O_NONBLOCK != linux.O_NONBLOCK
+
+ var attr linux.MqAttr
+ var attrPtr *linux.MqAttr
+ if attrAddr != 0 {
+ if _, err := attr.CopyIn(t, attrAddr); err != nil {
+ return 0, nil, err
+ }
+ attrPtr = &attr
+ }
+
+ opts := openOpts(name, rOnly, wOnly, readWrite, create, exclusive, block)
+
+ r := t.IPCNamespace().PosixQueues()
+ queue, err := r.FindOrCreate(t, opts, linux.FileMode(mode), attrPtr)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ fd, err := t.NewFDFromVFS2(0, queue, kernel.FDFlags{
+ CloseOnExec: flag&linux.O_CLOEXEC != 0,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(fd), nil, nil
+}
+
+// MqUnlink implements mq_unlink(2).
+func MqUnlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ nameAddr := args[0].Pointer()
+ name, err := t.CopyInString(nameAddr, mq.MaxName)
+ if err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, t.IPCNamespace().PosixQueues().Remove(t, name)
+}
+
+func openOpts(name string, rOnly, wOnly, readWrite, create, exclusive, block bool) mq.OpenOpts {
+ var access mq.AccessType
+ switch {
+ case readWrite:
+ access = mq.ReadWrite
+ case wOnly:
+ access = mq.WriteOnly
+ case rOnly:
+ access = mq.ReadOnly
+ }
+
+ return mq.OpenOpts{
+ Name: name,
+ Access: access,
+ Create: create,
+ Exclusive: exclusive,
+ Block: block,
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
index 0fc81e694..4eb15a7f2 100644
--- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -112,6 +112,8 @@ func Override() {
s.Table[232] = syscalls.Supported("epoll_wait", EpollWait)
s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl)
s.Table[235] = syscalls.Supported("utimes", Utimes)
+ s.Table[240] = syscalls.Supported("mq_open", MqOpen)
+ s.Table[241] = syscalls.Supported("mq_unlink", MqUnlink)
s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil)
s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil)
s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil)
@@ -241,6 +243,8 @@ func Override() {
s.Table[86] = syscalls.Supported("timerfd_settime", TimerfdSettime)
s.Table[87] = syscalls.Supported("timerfd_gettime", TimerfdGettime)
s.Table[88] = syscalls.Supported("utimensat", Utimensat)
+ s.Table[180] = syscalls.Supported("mq_open", MqOpen)
+ s.Table[181] = syscalls.Supported("mq_unlink", MqUnlink)
s.Table[198] = syscalls.Supported("socket", Socket)
s.Table[199] = syscalls.Supported("socketpair", SocketPair)
s.Table[200] = syscalls.Supported("bind", Bind)
@@ -271,6 +275,5 @@ func Override() {
s.Table[287] = syscalls.Supported("pwritev2", Pwritev2)
s.Table[291] = syscalls.Supported("statx", Statx)
s.Table[441] = syscalls.Supported("epoll_pwait2", EpollPwait2)
-
s.Init()
}
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index ac1e5ac37..9f0d1ae36 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -36,6 +36,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/mqfs"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
@@ -95,6 +96,10 @@ func registerFilesystems(k *kernel.Kernel) error {
AllowUserList: true,
AllowUserMount: true,
})
+ vfsObj.MustRegisterFilesystemType(mqfs.Name, &mqfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ AllowUserList: true,
+ })
// Setup files in devtmpfs.
if err := memdev.Register(vfsObj); err != nil {