diff options
author | gVisor bot <gvisor-bot@google.com> | 2021-10-21 23:10:52 +0000 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2021-10-21 23:10:52 +0000 |
commit | eff88efe81e2a2dcc0ee541e90755d4d21374315 (patch) | |
tree | 967638a38d03d2c6e778cab057a6558d71e4ec2c | |
parent | ac2e48668b599b3d3b0b4f5c4453b61773cae6fd (diff) | |
parent | 14f4113924c8b7b8c161be7335b147106d0c4a26 (diff) |
Merge release-20211011.0-59-g14f411392 (automated)
23 files changed, 2196 insertions, 8 deletions
diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index cad24fcc7..edc90e54c 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -23,6 +23,7 @@ const ( DEVPTS_SUPER_MAGIC = 0x00001cd1 EXT_SUPER_MAGIC = 0xef53 FUSE_SUPER_MAGIC = 0x65735546 + MQUEUE_MAGIC = 0x19800202 OVERLAYFS_SUPER_MAGIC = 0x794c7630 PIPEFS_MAGIC = 0x50495045 PROC_SUPER_MAGIC = 0x9fa0 diff --git a/pkg/abi/linux/linux_abi_autogen_unsafe.go b/pkg/abi/linux/linux_abi_autogen_unsafe.go index cd5786d12..b71bd1432 100644 --- a/pkg/abi/linux/linux_abi_autogen_unsafe.go +++ b/pkg/abi/linux/linux_abi_autogen_unsafe.go @@ -79,6 +79,7 @@ var _ marshal.Marshallable = (*KernelIP6TGetEntries)(nil) var _ marshal.Marshallable = (*KernelIPTEntry)(nil) var _ marshal.Marshallable = (*KernelIPTGetEntries)(nil) var _ marshal.Marshallable = (*Linger)(nil) +var _ marshal.Marshallable = (*MqAttr)(nil) var _ marshal.Marshallable = (*MsgBuf)(nil) var _ marshal.Marshallable = (*MsgInfo)(nil) var _ marshal.Marshallable = (*MsqidDS)(nil) @@ -5266,6 +5267,112 @@ func (n *NumaPolicy) WriteTo(w io.Writer) (int64, error) { return int64(length), err } +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (m *MqAttr) SizeBytes() int { + return 32 + + 8*4 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (m *MqAttr) MarshalBytes(dst []byte) { + hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MqFlags)) + dst = dst[8:] + hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MqMaxmsg)) + dst = dst[8:] + hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MqMsgsize)) + dst = dst[8:] + hostarch.ByteOrder.PutUint64(dst[:8], uint64(m.MqCurmsgs)) + dst = dst[8:] + // Padding: dst[:sizeof(int64)*4] ~= [4]int64{0} + dst = dst[8*(4):] +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
+func (m *MqAttr) UnmarshalBytes(src []byte) { + m.MqFlags = int64(hostarch.ByteOrder.Uint64(src[:8])) + src = src[8:] + m.MqMaxmsg = int64(hostarch.ByteOrder.Uint64(src[:8])) + src = src[8:] + m.MqMsgsize = int64(hostarch.ByteOrder.Uint64(src[:8])) + src = src[8:] + m.MqCurmsgs = int64(hostarch.ByteOrder.Uint64(src[:8])) + src = src[8:] + // Padding: ~ copy([4]int64(m._), src[:sizeof(int64)*4]) + src = src[8*(4):] +} + +// Packed implements marshal.Marshallable.Packed. +//go:nosplit +func (m *MqAttr) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (m *MqAttr) MarshalUnsafe(dst []byte) { + gohacks.Memmove(unsafe.Pointer(&dst[0]), unsafe.Pointer(m), uintptr(m.SizeBytes())) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (m *MqAttr) UnmarshalUnsafe(src []byte) { + gohacks.Memmove(unsafe.Pointer(m), unsafe.Pointer(&src[0]), uintptr(m.SizeBytes())) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +//go:nosplit +func (m *MqAttr) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) + hdr.Len = m.SizeBytes() + hdr.Cap = m.SizeBytes() + + length, err := cc.CopyOutBytes(addr, buf[:limit]) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that m + // must live until the use above. + runtime.KeepAlive(m) // escapes: replaced by intrinsic. + return length, err +} + +// CopyOut implements marshal.Marshallable.CopyOut. +//go:nosplit +func (m *MqAttr) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { + return m.CopyOutN(cc, addr, m.SizeBytes()) +} + +// CopyIn implements marshal.Marshallable.CopyIn. 
+//go:nosplit +func (m *MqAttr) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) + hdr.Len = m.SizeBytes() + hdr.Cap = m.SizeBytes() + + length, err := cc.CopyInBytes(addr, buf) // escapes: okay. + // Since we bypassed the compiler's escape analysis, indicate that m + // must live until the use above. + runtime.KeepAlive(m) // escapes: replaced by intrinsic. + return length, err +} + +// WriteTo implements io.WriterTo.WriteTo. +func (m *MqAttr) WriteTo(writer io.Writer) (int64, error) { + // Construct a slice backed by dst's underlying memory. + var buf []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&buf)) + hdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(m))) + hdr.Len = m.SizeBytes() + hdr.Cap = m.SizeBytes() + + length, err := writer.Write(buf) + // Since we bypassed the compiler's escape analysis, indicate that m + // must live until the use above. + runtime.KeepAlive(m) // escapes: replaced by intrinsic. + return int64(length), err +} + // Packed implements marshal.Marshallable.Packed. //go:nosplit func (b *MsgBuf) Packed() bool { diff --git a/pkg/abi/linux/mqueue.go b/pkg/abi/linux/mqueue.go new file mode 100644 index 000000000..4988a2aa3 --- /dev/null +++ b/pkg/abi/linux/mqueue.go @@ -0,0 +1,55 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Default values for POSIX message queues. Source: +// include/linux/ipc_namespace.h +const ( + DFLT_QUEUESMAX = 256 + MIN_MSGMAX = 1 + DFLT_MSG uint = 10 + DFLT_MSGMAX = 10 + HARD_MSGMAX = 65536 + MIN_MSGSIZEMAX = 128 + DFLT_MSGSIZE uint = 8192 + DFLT_MSGSIZEMAX = 8192 + HARD_MSGSIZEMAX = (16 * 1024 * 1024) +) + +// Maximum values for a message queue. Source: include/uapi/linux/mqueue.h +const ( + MQ_PRIO_MAX = 32768 + MQ_BYTES_MAX = 819200 +) + +// Codes used by mq_notify. Source: include/uapi/linux/mqueue.h +const ( + NOTIFY_NONE = 0 + NOTIFY_WOKENUP = 1 + NOTIFY_REMOVED = 2 + + NOTIFY_COOKIE_LEN = 32 +) + +// MqAttr is equivalent to struct mq_attr. Source: include/uapi/linux/mqueue.h +// +// +marshal +type MqAttr struct { + MqFlags int64 // Message queue flags. + MqMaxmsg int64 // Maximum number of messages. + MqMsgsize int64 // Maximum message size. + MqCurmsgs int64 // Number of messages currently queued. + _ [4]int64 // Ignored for input, zeroed for output. +} diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 9d7526e47..652ade564 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -74,6 +74,11 @@ func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credent return linuxerr.EPERM } +// Locks returns the file locks for this file. +func (f *DynamicBytesFile) Locks() *vfs.FileLocks { + return &f.locks +} + // DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a // DynamicBytesFile. // diff --git a/pkg/sentry/fsimpl/mqfs/mqfs.go b/pkg/sentry/fsimpl/mqfs/mqfs.go new file mode 100644 index 000000000..c2b53c9d0 --- /dev/null +++ b/pkg/sentry/fsimpl/mqfs/mqfs.go @@ -0,0 +1,138 @@ +// Copyright 2021 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package mqfs provides a filesystem implementation to back POSIX message +// queues. +package mqfs + +import ( + "fmt" + "strconv" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" + "gvisor.dev/gvisor/pkg/sentry/kernel/mq" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +const ( + // Name is the user-visible filesystem name. + Name = "mqueue" + defaultMaxCachedDentries = uint64(1000) +) + +// FilesystemType implements vfs.FilesystemType. +// +// +stateify savable +type FilesystemType struct{} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + // mqfs is initialized only once per ipc namespace. Each ipc namespace has + // a POSIX message registry with a root dentry, filesystem, and a + // disconnected mount. 
We want the fs to be consistent for all processes in + // the same ipc namespace, so instead of creating a new fs and root dentry, + // we retrieve them using IPCNamespace.PosixQueues and use them. + + i := ipcNamespaceFromContext(ctx) + if i == nil { + return nil, nil, fmt.Errorf("mqfs.FilesystemType.GetFilesystem: ipc namespace doesn't exist") + } + defer i.DecRef(ctx) + + registry := i.PosixQueues() + if registry == nil { + return nil, nil, fmt.Errorf("mqfs.FilesystemType.GetFilesystem: ipc namespace doesn't have a POSIX registry") + } + impl := registry.Impl().(*RegistryImpl) + + maxCachedDentries, err := maxCachedDentries(ctx, vfs.GenericParseMountOptions(opts.Data)) + if err != nil { + return nil, nil, err + } + impl.fs.MaxCachedDentries = maxCachedDentries + + impl.fs.VFSFilesystem().IncRef() + return impl.fs.VFSFilesystem(), impl.root.VFSDentry(), nil +} + +// maxCachedDentries checks mopts for dentry_cache_limit. If a value is +// specified, parse it into uint64 and return it. Otherwise, return the default +// value. An error is returned if a value is found but can't be parsed. +func maxCachedDentries(ctx context.Context, mopts map[string]string) (_ uint64, err error) { + max := defaultMaxCachedDentries + if str, ok := mopts["dentry_cache_limit"]; ok { + delete(mopts, "dentry_cache_limit") + max, err = strconv.ParseUint(str, 10, 64) + if err != nil { + ctx.Warningf("mqfs.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) + return 0, linuxerr.EINVAL + } + } + return max, nil +} + +// filesystem implements kernfs.Filesystem. +// +// +stateify savable +type filesystem struct { + kernfs.Filesystem + devMinor uint32 + + // root is the filesystem's root dentry. Since we take a reference on it in + // GetFilesystem, we should release it when the fs is released. + root *kernfs.Dentry +} + +// Release implements vfs.FilesystemImpl.Release. 
+func (fs *filesystem) Release(ctx context.Context) { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release(ctx) +} + +// MountOptions implements vfs.FilesystemImpl.MountOptions. +func (fs *filesystem) MountOptions() string { + return fmt.Sprintf("dentry_cache_limit=%d", fs.MaxCachedDentries) +} + +// ipcNamespace defines functions we need from kernel.IPCNamespace. We redefine +// ipcNamespace along with ipcNamespaceFromContext to avoid circular dependency +// with package sentry/kernel. +type ipcNamespace interface { + // PosixQueues returns a POSIX message queue registry. + PosixQueues() *mq.Registry + + // DecRef decrements ipcNamespace's number of references. + DecRef(ctx context.Context) +} + +// ipcNamespaceFromContext returns the IPC namespace in which ctx is executing. +// Copied from package sentry/kernel. +func ipcNamespaceFromContext(ctx context.Context) ipcNamespace { + if v := ctx.Value(ipc.CtxIPCNamespace); v != nil { + return v.(ipcNamespace) + } + return nil +} diff --git a/pkg/sentry/fsimpl/mqfs/mqfs_state_autogen.go b/pkg/sentry/fsimpl/mqfs/mqfs_state_autogen.go new file mode 100644 index 000000000..d6154efc3 --- /dev/null +++ b/pkg/sentry/fsimpl/mqfs/mqfs_state_autogen.go @@ -0,0 +1,263 @@ +// automatically generated by stateify. 
+ +package mqfs + +import ( + "gvisor.dev/gvisor/pkg/state" +) + +func (ft *FilesystemType) StateTypeName() string { + return "pkg/sentry/fsimpl/mqfs.FilesystemType" +} + +func (ft *FilesystemType) StateFields() []string { + return []string{} +} + +func (ft *FilesystemType) beforeSave() {} + +// +checklocksignore +func (ft *FilesystemType) StateSave(stateSinkObject state.Sink) { + ft.beforeSave() +} + +func (ft *FilesystemType) afterLoad() {} + +// +checklocksignore +func (ft *FilesystemType) StateLoad(stateSourceObject state.Source) { +} + +func (fs *filesystem) StateTypeName() string { + return "pkg/sentry/fsimpl/mqfs.filesystem" +} + +func (fs *filesystem) StateFields() []string { + return []string{ + "Filesystem", + "devMinor", + "root", + } +} + +func (fs *filesystem) beforeSave() {} + +// +checklocksignore +func (fs *filesystem) StateSave(stateSinkObject state.Sink) { + fs.beforeSave() + stateSinkObject.Save(0, &fs.Filesystem) + stateSinkObject.Save(1, &fs.devMinor) + stateSinkObject.Save(2, &fs.root) +} + +func (fs *filesystem) afterLoad() {} + +// +checklocksignore +func (fs *filesystem) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &fs.Filesystem) + stateSourceObject.Load(1, &fs.devMinor) + stateSourceObject.Load(2, &fs.root) +} + +func (q *queueInode) StateTypeName() string { + return "pkg/sentry/fsimpl/mqfs.queueInode" +} + +func (q *queueInode) StateFields() []string { + return []string{ + "DynamicBytesFile", + "queue", + } +} + +func (q *queueInode) beforeSave() {} + +// +checklocksignore +func (q *queueInode) StateSave(stateSinkObject state.Sink) { + q.beforeSave() + stateSinkObject.Save(0, &q.DynamicBytesFile) + stateSinkObject.Save(1, &q.queue) +} + +func (q *queueInode) afterLoad() {} + +// +checklocksignore +func (q *queueInode) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &q.DynamicBytesFile) + stateSourceObject.Load(1, &q.queue) +} + +func (fd *queueFD) StateTypeName() string { + return 
"pkg/sentry/fsimpl/mqfs.queueFD" +} + +func (fd *queueFD) StateFields() []string { + return []string{ + "FileDescriptionDefaultImpl", + "DynamicBytesFileDescriptionImpl", + "LockFD", + "vfsfd", + "inode", + "queue", + } +} + +func (fd *queueFD) beforeSave() {} + +// +checklocksignore +func (fd *queueFD) StateSave(stateSinkObject state.Sink) { + fd.beforeSave() + stateSinkObject.Save(0, &fd.FileDescriptionDefaultImpl) + stateSinkObject.Save(1, &fd.DynamicBytesFileDescriptionImpl) + stateSinkObject.Save(2, &fd.LockFD) + stateSinkObject.Save(3, &fd.vfsfd) + stateSinkObject.Save(4, &fd.inode) + stateSinkObject.Save(5, &fd.queue) +} + +func (fd *queueFD) afterLoad() {} + +// +checklocksignore +func (fd *queueFD) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &fd.FileDescriptionDefaultImpl) + stateSourceObject.Load(1, &fd.DynamicBytesFileDescriptionImpl) + stateSourceObject.Load(2, &fd.LockFD) + stateSourceObject.Load(3, &fd.vfsfd) + stateSourceObject.Load(4, &fd.inode) + stateSourceObject.Load(5, &fd.queue) +} + +func (r *RegistryImpl) StateTypeName() string { + return "pkg/sentry/fsimpl/mqfs.RegistryImpl" +} + +func (r *RegistryImpl) StateFields() []string { + return []string{ + "root", + "fs", + "mount", + } +} + +func (r *RegistryImpl) beforeSave() {} + +// +checklocksignore +func (r *RegistryImpl) StateSave(stateSinkObject state.Sink) { + r.beforeSave() + stateSinkObject.Save(0, &r.root) + stateSinkObject.Save(1, &r.fs) + stateSinkObject.Save(2, &r.mount) +} + +func (r *RegistryImpl) afterLoad() {} + +// +checklocksignore +func (r *RegistryImpl) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &r.root) + stateSourceObject.Load(1, &r.fs) + stateSourceObject.Load(2, &r.mount) +} + +func (i *rootInode) StateTypeName() string { + return "pkg/sentry/fsimpl/mqfs.rootInode" +} + +func (i *rootInode) StateFields() []string { + return []string{ + "rootInodeRefs", + "InodeAlwaysValid", + "InodeAttrs", + 
"InodeDirectoryNoNewChildren", + "InodeNotSymlink", + "InodeTemporary", + "OrderedChildren", + "implStatFS", + "locks", + } +} + +func (i *rootInode) beforeSave() {} + +// +checklocksignore +func (i *rootInode) StateSave(stateSinkObject state.Sink) { + i.beforeSave() + stateSinkObject.Save(0, &i.rootInodeRefs) + stateSinkObject.Save(1, &i.InodeAlwaysValid) + stateSinkObject.Save(2, &i.InodeAttrs) + stateSinkObject.Save(3, &i.InodeDirectoryNoNewChildren) + stateSinkObject.Save(4, &i.InodeNotSymlink) + stateSinkObject.Save(5, &i.InodeTemporary) + stateSinkObject.Save(6, &i.OrderedChildren) + stateSinkObject.Save(7, &i.implStatFS) + stateSinkObject.Save(8, &i.locks) +} + +func (i *rootInode) afterLoad() {} + +// +checklocksignore +func (i *rootInode) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &i.rootInodeRefs) + stateSourceObject.Load(1, &i.InodeAlwaysValid) + stateSourceObject.Load(2, &i.InodeAttrs) + stateSourceObject.Load(3, &i.InodeDirectoryNoNewChildren) + stateSourceObject.Load(4, &i.InodeNotSymlink) + stateSourceObject.Load(5, &i.InodeTemporary) + stateSourceObject.Load(6, &i.OrderedChildren) + stateSourceObject.Load(7, &i.implStatFS) + stateSourceObject.Load(8, &i.locks) +} + +func (i *implStatFS) StateTypeName() string { + return "pkg/sentry/fsimpl/mqfs.implStatFS" +} + +func (i *implStatFS) StateFields() []string { + return []string{} +} + +func (i *implStatFS) beforeSave() {} + +// +checklocksignore +func (i *implStatFS) StateSave(stateSinkObject state.Sink) { + i.beforeSave() +} + +func (i *implStatFS) afterLoad() {} + +// +checklocksignore +func (i *implStatFS) StateLoad(stateSourceObject state.Source) { +} + +func (r *rootInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/mqfs.rootInodeRefs" +} + +func (r *rootInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (r *rootInodeRefs) beforeSave() {} + +// +checklocksignore +func (r *rootInodeRefs) StateSave(stateSinkObject state.Sink) 
{ + r.beforeSave() + stateSinkObject.Save(0, &r.refCount) +} + +// +checklocksignore +func (r *rootInodeRefs) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &r.refCount) + stateSourceObject.AfterLoad(r.afterLoad) +} + +func init() { + state.Register((*FilesystemType)(nil)) + state.Register((*filesystem)(nil)) + state.Register((*queueInode)(nil)) + state.Register((*queueFD)(nil)) + state.Register((*RegistryImpl)(nil)) + state.Register((*rootInode)(nil)) + state.Register((*implStatFS)(nil)) + state.Register((*rootInodeRefs)(nil)) +} diff --git a/pkg/sentry/fsimpl/mqfs/queue.go b/pkg/sentry/fsimpl/mqfs/queue.go new file mode 100644 index 000000000..933dbc6ed --- /dev/null +++ b/pkg/sentry/fsimpl/mqfs/queue.go @@ -0,0 +1,145 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mqfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/mq" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// queueInode represents an inode for a message queue (/dev/mqueue/[name]). +// +// +stateify savable +type queueInode struct { + kernfs.DynamicBytesFile + + // queue is the message queue backing this inode. 
+ queue *mq.Queue +} + +var _ kernfs.Inode = (*queueInode)(nil) + +// newQueueInode returns a new, initialized queueInode. +func (fs *filesystem) newQueueInode(ctx context.Context, creds *auth.Credentials, q *mq.Queue, perm linux.FileMode) kernfs.Inode { + inode := &queueInode{queue: q} + inode.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), q, perm) + return inode +} + +// Keep implements kernfs.Inode.Keep. +func (q *queueInode) Keep() bool { + // Return true so that the fs keeps newly created dentries. This is done + // because inodes returned by root.Lookup are not temporary, they exist + // in the fs, and refer to message queues. + return true +} + +// queueFD implements vfs.FileDescriptionImpl for FD backed by a POSIX message +// queue. It's mostly similar to DynamicBytesFD, but implements more operations. +// +// +stateify savable +type queueFD struct { + vfs.FileDescriptionDefaultImpl + vfs.DynamicBytesFileDescriptionImpl + vfs.LockFD + + vfsfd vfs.FileDescription + inode kernfs.Inode + + // queue is a view into the queue backing this fd. + queue mq.View +} + +// Init initializes a queueFD. Mostly copied from DynamicBytesFD.Init, but uses +// the queueFD as FileDescriptionImpl. +func (fd *queueFD) Init(m *vfs.Mount, d *kernfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error { + fd.LockFD.Init(locks) + if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { + return err + } + fd.inode = d.Inode() + fd.SetDataSource(data) + return nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *queueFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence) +} + +// Read implements vfs.FileDescriptionImpl.Read. 
+func (fd *queueFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *queueFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *queueFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Write(ctx, src, opts) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *queueFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.PWrite(ctx, src, offset, opts) +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *queueFD) Release(context.Context) {} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *queueFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() + return fd.inode.Stat(ctx, fs, opts) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *queueFD) SetStat(context.Context, vfs.SetStatOptions) error { + // DynamicBytesFiles are immutable. + return linuxerr.EPERM +} + +// OnClose implements FileDescriptionImpl.OnClose similar to +// ipc/mqueue.c::mqueue_flush_file. +func (fd *queueFD) OnClose(ctx context.Context) error { + fd.queue.Flush(ctx) + return nil +} + +// Readiness implements waiter.Waitable.Readiness similar to +// ipc/mqueue.c::mqueue_poll_file. +func (fd *queueFD) Readiness(mask waiter.EventMask) waiter.EventMask { + return fd.queue.Readiness(mask) +} + +// EventRegister implements Waitable.EventRegister. 
+func (fd *queueFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.queue.EventRegister(e, mask) +} + +// EventUnregister implements Waitable.EventUnregister. +func (fd *queueFD) EventUnregister(e *waiter.Entry) { + fd.queue.EventUnregister(e) +} diff --git a/pkg/sentry/fsimpl/mqfs/registry.go b/pkg/sentry/fsimpl/mqfs/registry.go new file mode 100644 index 000000000..c8fbe4d33 --- /dev/null +++ b/pkg/sentry/fsimpl/mqfs/registry.go @@ -0,0 +1,176 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mqfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/mq" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// RegistryImpl implements mq.RegistryImpl. It implements the interface using +// the message queue filesystem, and is provided to mq.Registry at +// initialization. +// +// RegistryImpl is not thread-safe, so it is the responsibility of the user +// (the containing mq.Registry) to protect using a lock. +// +// +stateify savable +type RegistryImpl struct { + // root is the root dentry of the mq filesystem. Its main usage is to + // retrieve the root inode, which we use to add, remove, and lookup message + // queues. 
+ // + // We hold a reference on root and release when the registry is destroyed. + root *kernfs.Dentry + + // fs is the filesystem backing this registry, used mainly to initialize + // new inodes. + fs *filesystem + + // mount is the mount point used for this filesystem. + mount *vfs.Mount +} + +// NewRegistryImpl returns a new, initialized RegistryImpl, and takes a +// reference on root. +func NewRegistryImpl(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*RegistryImpl, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, err + } + + var dentry kernfs.Dentry + fs := &filesystem{ + devMinor: devMinor, + root: &dentry, + } + fs.VFSFilesystem().Init(vfsObj, &FilesystemType{}, fs) + vfsfs := fs.VFSFilesystem() + + dentry.InitRoot(&fs.Filesystem, fs.newRootInode(ctx, creds)) + defer vfsfs.DecRef(ctx) // NewDisconnectedMount will obtain a ref on success. + + mount, err := vfsObj.NewDisconnectedMount(vfsfs, dentry.VFSDentry(), &vfs.MountOptions{}) + if err != nil { + return nil, err + } + + return &RegistryImpl{ + root: &dentry, + fs: fs, + mount: mount, + }, nil +} + +// Get implements mq.RegistryImpl.Get. +func (r *RegistryImpl) Get(ctx context.Context, name string, access mq.AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error) { + inode, err := r.lookup(ctx, name) + if err != nil { + return nil, false, nil + } + + qInode := inode.(*queueInode) + if !qInode.queue.HasPermissions(auth.CredentialsFromContext(ctx), perm(access)) { + // "The queue exists, but the caller does not have permission to + // open it in the specified mode." + return nil, false, linuxerr.EACCES + } + + fd, err := r.newFD(qInode.queue, qInode, access, block, flags) + if err != nil { + return nil, false, err + } + return fd, true, nil +} + +// New implements mq.RegistryImpl.New. 
+func (r *RegistryImpl) New(ctx context.Context, name string, q *mq.Queue, access mq.AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) { + root := r.root.Inode().(*rootInode) + qInode := r.fs.newQueueInode(ctx, auth.CredentialsFromContext(ctx), q, perm).(*queueInode) + err := root.Insert(name, qInode) + if err != nil { + return nil, err + } + return r.newFD(q, qInode, access, block, flags) +} + +// Unlink implements mq.RegistryImpl.Unlink. +func (r *RegistryImpl) Unlink(ctx context.Context, name string) error { + creds := auth.CredentialsFromContext(ctx) + if err := r.root.Inode().CheckPermissions(ctx, creds, vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + + root := r.root.Inode().(*rootInode) + inode, err := r.lookup(ctx, name) + if err != nil { + return err + } + return root.Unlink(ctx, name, inode) +} + +// Destroy implements mq.RegistryImpl.Destroy. +func (r *RegistryImpl) Destroy(ctx context.Context) { + r.root.DecRef(ctx) + r.mount.DecRef(ctx) +} + +// lookup retrieves a kernfs.Inode using a name. +func (r *RegistryImpl) lookup(ctx context.Context, name string) (kernfs.Inode, error) { + inode := r.root.Inode().(*rootInode) + lookup, err := inode.Lookup(ctx, name) + if err != nil { + return nil, err + } + return lookup, nil +} + +// newFD returns a new file description created using the given queue and inode. +func (r *RegistryImpl) newFD(q *mq.Queue, inode *queueInode, access mq.AccessType, block bool, flags uint32) (*vfs.FileDescription, error) { + view, err := mq.NewView(q, access, block) + if err != nil { + return nil, err + } + + var dentry kernfs.Dentry + dentry.Init(&r.fs.Filesystem, inode) + + fd := &queueFD{queue: view} + err = fd.Init(r.mount, &dentry, inode.queue, inode.Locks(), flags) + if err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// perm returns a permission mask created using given flags. 
+func perm(access mq.AccessType) fs.PermMask { + switch access { + case mq.ReadWrite: + return fs.PermMask{Read: true, Write: true} + case mq.WriteOnly: + return fs.PermMask{Write: true} + case mq.ReadOnly: + return fs.PermMask{Read: true} + default: + return fs.PermMask{} // Can't happen, see NewView. + } +} diff --git a/pkg/sentry/fsimpl/mqfs/root.go b/pkg/sentry/fsimpl/mqfs/root.go new file mode 100644 index 000000000..37b5749fb --- /dev/null +++ b/pkg/sentry/fsimpl/mqfs/root.go @@ -0,0 +1,89 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mqfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// rootInode represents inode for filesystem's root directory (/dev/mqueue). +// +// +stateify savable +type rootInode struct { + rootInodeRefs + kernfs.InodeAlwaysValid + kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.InodeTemporary + kernfs.OrderedChildren + implStatFS + + locks vfs.FileLocks +} + +var _ kernfs.Inode = (*rootInode)(nil) + +// newRootInode returns a new, initialized rootInode. 
+func (fs *filesystem) newRootInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode { + inode := &rootInode{} + inode.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555)) + inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) + inode.InitRefs() + return inode +} + +// Open implements kernfs.Inode.Open. +func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +// DecRef implements kernfs.Inode.DecRef. +func (i *rootInode) DecRef(ctx context.Context) { + i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) }) +} + +// Rename implements Inode.Rename and overrides OrderedChildren.Rename. mqueue +// filesystem allows files to be unlinked, but not renamed. +func (i *rootInode) Rename(ctx context.Context, oldname, newname string, child, dstDir kernfs.Inode) error { + return linuxerr.EPERM +} + +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. +func (*rootInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return linuxerr.EPERM +} + +// implStatFS provides an implementation of kernfs.Inode.StatFS for message +// queues to be embedded in inodes. +// +// +stateify savable +type implStatFS struct{} + +// StatFS implements kernfs.Inode.StatFS. 
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.MQUEUE_MAGIC), nil +} diff --git a/pkg/sentry/fsimpl/mqfs/root_inode_refs.go b/pkg/sentry/fsimpl/mqfs/root_inode_refs.go new file mode 100644 index 000000000..7462467a5 --- /dev/null +++ b/pkg/sentry/fsimpl/mqfs/root_inode_refs.go @@ -0,0 +1,140 @@ +package mqfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/refsvfs2" +) + +// enableLogging indicates whether reference-related events should be logged (with +// stack traces). This is false by default and should only be set to true for +// debugging purposes, as it can generate an extremely large amount of output +// and drastically degrade performance. +const rootInodeenableLogging = false + +// obj is used to customize logging. Note that we use a pointer to T so that +// we do not copy the entire object when passed as a format parameter. +var rootInodeobj *rootInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// NOTE: Do not introduce additional fields to the Refs struct. It is used by +// many filesystem objects, and we want to keep it as small as possible (i.e., +// the same size as using an int64 directly) to avoid taking up extra cache +// space. In general, this template should not be extended at the cost of +// performance. If it does not offer enough flexibility for a particular object +// (example: b/187877947), we should implement the RefCounter/CheckedObject +// interfaces manually. +// +// +stateify savable +type rootInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. 
+ refCount int64 +} + +// InitRefs initializes r with one reference and, if enabled, activates leak +// checking. +func (r *rootInodeRefs) InitRefs() { + atomic.StoreInt64(&r.refCount, 1) + refsvfs2.Register(r) +} + +// RefType implements refsvfs2.CheckedObject.RefType. +func (r *rootInodeRefs) RefType() string { + return fmt.Sprintf("%T", rootInodeobj)[1:] +} + +// LeakMessage implements refsvfs2.CheckedObject.LeakMessage. +func (r *rootInodeRefs) LeakMessage() string { + return fmt.Sprintf("[%s %p] reference count of %d instead of 0", r.RefType(), r, r.ReadRefs()) +} + +// LogRefs implements refsvfs2.CheckedObject.LogRefs. +func (r *rootInodeRefs) LogRefs() bool { + return rootInodeenableLogging +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *rootInodeRefs) ReadRefs() int64 { + return atomic.LoadInt64(&r.refCount) +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *rootInodeRefs) IncRef() { + v := atomic.AddInt64(&r.refCount, 1) + if rootInodeenableLogging { + refsvfs2.LogIncRef(r, v) + } + if v <= 1 { + panic(fmt.Sprintf("Incrementing non-positive count %p on %s", r, r.RefType())) + } +} + +// TryIncRef implements refs.TryRefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *rootInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + if v := atomic.AddInt64(&r.refCount, speculativeRef); int32(v) == 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + v := atomic.AddInt64(&r.refCount, -speculativeRef+1) + if rootInodeenableLogging { + refsvfs2.LogTryIncRef(r, v) + } + return true +} + +// DecRef implements refs.RefCounter.DecRef. 
+// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *rootInodeRefs) DecRef(destroy func()) { + v := atomic.AddInt64(&r.refCount, -1) + if rootInodeenableLogging { + refsvfs2.LogDecRef(r, v) + } + switch { + case v < 0: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %s", r, r.RefType())) + + case v == 0: + refsvfs2.Unregister(r) + + if destroy != nil { + destroy() + } + } +} + +func (r *rootInodeRefs) afterLoad() { + if r.ReadRefs() > 0 { + refsvfs2.Register(r) + } +} diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index a8596410f..7e11c6580 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -16,6 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ) // contextID is the kernel package's type for context.Context.Value keys. @@ -37,9 +38,6 @@ const ( // CtxUTSNamespace is a Context.Value key for a UTSNamespace. CtxUTSNamespace - - // CtxIPCNamespace is a Context.Value key for a IPCNamespace. - CtxIPCNamespace ) // ContextCanTrace returns true if ctx is permitted to trace t, in the same sense @@ -82,7 +80,7 @@ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace { // or nil if there is no such IPC namespace. It takes a reference on the // namespace. 
func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace { - if v := ctx.Value(CtxIPCNamespace); v != nil { + if v := ctx.Value(ipc.CtxIPCNamespace); v != nil { return v.(*IPCNamespace) } return nil diff --git a/pkg/sentry/kernel/ipc/ns.go b/pkg/sentry/kernel/ipc/ns.go new file mode 100644 index 000000000..220c9eafb --- /dev/null +++ b/pkg/sentry/kernel/ipc/ns.go @@ -0,0 +1,22 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ipc + +type contextID int + +// CtxIPCNamespace is the context.Value key used to retrieve an IPC namespace. +// We define it here because it's needed in several packages, and is not +// possible to use otherwise without causing a circular dependency. +const CtxIPCNamespace contextID = iota diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 0b101b1bb..50b4e015e 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -15,11 +15,16 @@ package kernel import ( + "fmt" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/mqfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/mq" "gvisor.dev/gvisor/pkg/sentry/kernel/msgqueue" "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" + "gvisor.dev/gvisor/pkg/sentry/vfs" ) // IPCNamespace represents an IPC namespace. 
@@ -31,9 +36,17 @@ type IPCNamespace struct { // User namespace which owns this IPC namespace. Immutable. userNS *auth.UserNamespace + // System V utilities. queues *msgqueue.Registry semaphores *semaphore.Registry shms *shm.Registry + + // posixQueues is a POSIX message queue registry. + // + // posixQueues is somewhat equivalent to Linux's ipc_namespace.mq_mnt. + // Unlike SysV utilities, mq.Registry is not map-based, but is backed by + // a virtual filesystem. + posixQueues *mq.Registry } // NewIPCNamespace creates a new IPC namespace. @@ -63,10 +76,35 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry { return i.shms } +// InitPosixQueues creates a new POSIX queue registry, and returns an error if +// the registry was previously initialized. +func (i *IPCNamespace) InitPosixQueues(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error { + if i.posixQueues != nil { + return fmt.Errorf("IPCNamespace.InitPosixQueues: already initialized") + } + + impl, err := mqfs.NewRegistryImpl(ctx, vfsObj, creds) + if err != nil { + return err + } + i.posixQueues = mq.NewRegistry(i.userNS, impl) + return nil +} + +// PosixQueues returns the posix message queue registry for this namespace. +// +// Precondition: i.InitPosixQueues must have been called. +func (i *IPCNamespace) PosixQueues() *mq.Registry { + return i.posixQueues +} + // DecRef implements refsvfs2.RefCounter.DecRef. 
func (i *IPCNamespace) DecRef(ctx context.Context) { i.IPCNamespaceRefs.DecRef(func() { i.shms.Release(ctx) + if i.posixQueues != nil { + i.posixQueues.Destroy(ctx) + } }) } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 5dc821a48..d4851ccda 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -58,6 +58,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" @@ -407,6 +408,11 @@ func (k *Kernel) Init(args InitKernelArgs) error { return fmt.Errorf("failed to initialize VFS: %v", err) } + err := k.rootIPCNamespace.InitPosixQueues(ctx, &k.vfs, auth.CredentialsFromContext(ctx)) + if err != nil { + return fmt.Errorf("failed to create mqfs filesystem: %v", err) + } + pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs) if err != nil { return fmt.Errorf("failed to create pipefs filesystem: %v", err) @@ -837,7 +843,7 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.args.PIDNamespace case CtxUTSNamespace: return ctx.args.UTSNamespace - case CtxIPCNamespace: + case ipc.CtxIPCNamespace: ipcns := ctx.args.IPCNamespace ipcns.IncRef() return ipcns @@ -1665,7 +1671,7 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return ctx.k.tasks.Root case CtxUTSNamespace: return ctx.k.rootUTSNamespace - case CtxIPCNamespace: + case ipc.CtxIPCNamespace: ipcns := ctx.k.rootIPCNamespace ipcns.IncRef() return ipcns diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go index ef8dd88f6..303ee51b1 100644 --- a/pkg/sentry/kernel/kernel_state_autogen.go +++ b/pkg/sentry/kernel/kernel_state_autogen.go @@ -346,6 +346,7 @@ func (i *IPCNamespace) StateFields() []string { "queues", "semaphores", "shms", 
+ "posixQueues", } } @@ -359,6 +360,7 @@ func (i *IPCNamespace) StateSave(stateSinkObject state.Sink) { stateSinkObject.Save(2, &i.queues) stateSinkObject.Save(3, &i.semaphores) stateSinkObject.Save(4, &i.shms) + stateSinkObject.Save(5, &i.posixQueues) } func (i *IPCNamespace) afterLoad() {} @@ -370,6 +372,7 @@ func (i *IPCNamespace) StateLoad(stateSourceObject state.Source) { stateSourceObject.Load(2, &i.queues) stateSourceObject.Load(3, &i.semaphores) stateSourceObject.Load(4, &i.shms) + stateSourceObject.Load(5, &i.posixQueues) } func (r *IPCNamespaceRefs) StateTypeName() string { diff --git a/pkg/sentry/kernel/mq/message_list.go b/pkg/sentry/kernel/mq/message_list.go new file mode 100644 index 000000000..a5874c5b1 --- /dev/null +++ b/pkg/sentry/kernel/mq/message_list.go @@ -0,0 +1,221 @@ +package mq + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type msgElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (msgElementMapper) linkerFor(elem *Message) *Message { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type msgList struct { + head *Message + tail *Message +} + +// Reset resets list l to the empty state. +func (l *msgList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. 
+// +//go:nosplit +func (l *msgList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +// +//go:nosplit +func (l *msgList) Front() *Message { + return l.head +} + +// Back returns the last element of list l or nil. +// +//go:nosplit +func (l *msgList) Back() *Message { + return l.tail +} + +// Len returns the number of elements in the list. +// +// NOTE: This is an O(n) operation. +// +//go:nosplit +func (l *msgList) Len() (count int) { + for e := l.Front(); e != nil; e = (msgElementMapper{}.linkerFor(e)).Next() { + count++ + } + return count +} + +// PushFront inserts the element e at the front of list l. +// +//go:nosplit +func (l *msgList) PushFront(e *Message) { + linker := msgElementMapper{}.linkerFor(e) + linker.SetNext(l.head) + linker.SetPrev(nil) + if l.head != nil { + msgElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +// +//go:nosplit +func (l *msgList) PushBack(e *Message) { + linker := msgElementMapper{}.linkerFor(e) + linker.SetNext(nil) + linker.SetPrev(l.tail) + if l.tail != nil { + msgElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. +// +//go:nosplit +func (l *msgList) PushBackList(m *msgList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + msgElementMapper{}.linkerFor(l.tail).SetNext(m.head) + msgElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. 
+// +//go:nosplit +func (l *msgList) InsertAfter(b, e *Message) { + bLinker := msgElementMapper{}.linkerFor(b) + eLinker := msgElementMapper{}.linkerFor(e) + + a := bLinker.Next() + + eLinker.SetNext(a) + eLinker.SetPrev(b) + bLinker.SetNext(e) + + if a != nil { + msgElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +// +//go:nosplit +func (l *msgList) InsertBefore(a, e *Message) { + aLinker := msgElementMapper{}.linkerFor(a) + eLinker := msgElementMapper{}.linkerFor(e) + + b := aLinker.Prev() + eLinker.SetNext(a) + eLinker.SetPrev(b) + aLinker.SetPrev(e) + + if b != nil { + msgElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +// +//go:nosplit +func (l *msgList) Remove(e *Message) { + linker := msgElementMapper{}.linkerFor(e) + prev := linker.Prev() + next := linker.Next() + + if prev != nil { + msgElementMapper{}.linkerFor(prev).SetNext(next) + } else if l.head == e { + l.head = next + } + + if next != nil { + msgElementMapper{}.linkerFor(next).SetPrev(prev) + } else if l.tail == e { + l.tail = prev + } + + linker.SetNext(nil) + linker.SetPrev(nil) +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type msgEntry struct { + next *Message + prev *Message +} + +// Next returns the entry that follows e in the list. +// +//go:nosplit +func (e *msgEntry) Next() *Message { + return e.next +} + +// Prev returns the entry that precedes e in the list. +// +//go:nosplit +func (e *msgEntry) Prev() *Message { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +// +//go:nosplit +func (e *msgEntry) SetNext(elem *Message) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. 
+// +//go:nosplit +func (e *msgEntry) SetPrev(elem *Message) { + e.prev = elem +} diff --git a/pkg/sentry/kernel/mq/mq.go b/pkg/sentry/kernel/mq/mq.go new file mode 100644 index 000000000..50ca6d34a --- /dev/null +++ b/pkg/sentry/kernel/mq/mq.go @@ -0,0 +1,457 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package mq provides an implementation for POSIX message queues. +package mq + +import ( + "bytes" + "fmt" + "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/waiter" +) + +// AccessType is the access type passed to mq_open. +type AccessType int + +// Possible access types. +const ( + ReadOnly AccessType = iota + WriteOnly + ReadWrite +) + +// MaxName is the maximum size for a queue name. +const MaxName = 255 + +const ( + maxPriority = linux.MQ_PRIO_MAX - 1 // Highest possible message priority. + + maxQueuesDefault = linux.DFLT_QUEUESMAX // Default max number of queues. + + maxMsgDefault = linux.DFLT_MSG // Default max number of messages per queue. + maxMsgMin = linux.MIN_MSGMAX // Min value for max number of messages per queue. + maxMsgLimit = linux.DFLT_MSGMAX // Limit for max number of messages per queue. 
+ maxMsgHardLimit = linux.HARD_MSGMAX // Hard limit for max number of messages per queue. + + msgSizeDefault = linux.DFLT_MSGSIZE // Default max message size. + msgSizeMin = linux.MIN_MSGSIZEMAX // Min value for max message size. + msgSizeLimit = linux.DFLT_MSGSIZEMAX // Limit for max message size. + msgSizeHardLimit = linux.HARD_MSGSIZEMAX // Hard limit for max message size. +) + +// Registry is a POSIX message queue registry. +// +// Unlike SysV utilities, Registry is not map-based. It uses a provided +// RegistryImpl backed by a virtual filesystem to implement registry operations. +// +// +stateify savable +type Registry struct { + // userNS is the user namespace containing this registry. Immutable. + userNS *auth.UserNamespace + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + + // impl is an implementation of several message queue utilities needed by + // the registry. impl should be provided by mqfs. + impl RegistryImpl +} + +// RegistryImpl defines utilities needed by a Registry to provide actual +// registry implementation. It works mainly as an abstraction layer used by +// Registry to avoid dealing directly with the filesystem. RegistryImpl should +// be implemented by mqfs and provided to Registry at initialization. +type RegistryImpl interface { + // Get searches for a queue with the given name. If it exists, the queue is + // used to create a new FD, return it and return true. If the queue doesn't + // exist, return false and no error. An error is returned if creation fails. + Get(ctx context.Context, name string, access AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error) + + // New creates a new inode and file description using the given queue, + // inserts the inode into the filesystem tree using the given name, and + // returns the file description. An error is returned if creation fails, or + // if the name already exists. 
+ New(ctx context.Context, name string, q *Queue, access AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) + + // Unlink removes the queue with given name from the registry, and returns + // an error if the name doesn't exist. + Unlink(ctx context.Context, name string) error + + // Destroy destroys the registry. + Destroy(context.Context) +} + +// NewRegistry returns a new, initialized message queue registry. NewRegistry +// should be called when a new message queue filesystem is created, once per +// IPCNamespace. +func NewRegistry(userNS *auth.UserNamespace, impl RegistryImpl) *Registry { + return &Registry{ + userNS: userNS, + impl: impl, + } +} + +// OpenOpts holds the options passed to FindOrCreate. +type OpenOpts struct { + Name string + Access AccessType + Create bool + Exclusive bool + Block bool +} + +// FindOrCreate creates a new POSIX message queue or opens an existing queue. +// See mq_open(2). +func (r *Registry) FindOrCreate(ctx context.Context, opts OpenOpts, perm linux.FileMode, attr *linux.MqAttr) (*vfs.FileDescription, error) { + // mq_overview(7) mentions that: "Each message queue is identified by a name + // of the form '/somename'", but the mq_open(3) man pages mention: + // "The mq_open() library function is implemented on top of a system call + // of the same name. The library function performs the check that the + // name starts with a slash (/), giving the EINVAL error if it does not. + // The kernel system call expects name to contain no preceding slash, so + // the C library function passes name without the preceding slash (i.e., + // name+1) to the system call." + // So we don't need to check it. + + if len(opts.Name) == 0 { + return nil, linuxerr.ENOENT + } + if len(opts.Name) > MaxName { + return nil, linuxerr.ENAMETOOLONG + } + if strings.ContainsRune(opts.Name, '/') { + return nil, linuxerr.EACCES + } + if opts.Name == "." || opts.Name == ".." 
{ + return nil, linuxerr.EINVAL + } + + // Construct status flags. + var flags uint32 + if opts.Block { + flags = linux.O_NONBLOCK + } + switch opts.Access { + case ReadOnly: + flags = flags | linux.O_RDONLY + case WriteOnly: + flags = flags | linux.O_WRONLY + case ReadWrite: + flags = flags | linux.O_RDWR + } + + r.mu.Lock() + defer r.mu.Unlock() + fd, ok, err := r.impl.Get(ctx, opts.Name, opts.Access, opts.Block, flags) + if err != nil { + return nil, err + } + + if ok { + if opts.Create && opts.Exclusive { + // "Both O_CREAT and O_EXCL were specified in oflag, but a queue + // with this name already exists." + return nil, linuxerr.EEXIST + } + return fd, nil + } + + if !opts.Create { + // "The O_CREAT flag was not specified in oflag, and no queue with this name + // exists." + return nil, linuxerr.ENOENT + } + + q, err := r.newQueueLocked(auth.CredentialsFromContext(ctx), fs.FileOwnerFromContext(ctx), fs.FilePermsFromMode(perm), attr) + if err != nil { + return nil, err + } + return r.impl.New(ctx, opts.Name, q, opts.Access, opts.Block, perm, flags) +} + +// newQueueLocked creates a new queue using the given attributes. If attr is nil +// return a queue with default values, otherwise use attr to create a new queue, +// and return an error if attributes are invalid. +func (r *Registry) newQueueLocked(creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions, attr *linux.MqAttr) (*Queue, error) { + if attr == nil { + return &Queue{ + owner: owner, + perms: perms, + maxMessageCount: int64(maxMsgDefault), + maxMessageSize: uint64(msgSizeDefault), + }, nil + } + + // "O_CREAT was specified in oflag, and attr was not NULL, but + // attr->mq_maxmsg or attr->mq_msgsize was invalid. Both of these + // fields must be greater than zero. 
In a process that is + // unprivileged (does not have the CAP_SYS_RESOURCE capability), + // attr->mq_maxmsg must be less than or equal to the msg_max limit, and + // attr->mq_msgsize must be less than or equal to the msgsize_max limit. + // In addition, even in a privileged process, attr->mq_maxmsg cannot + // exceed the HARD_MAX limit." - man mq_open(3). + if attr.MqMaxmsg <= 0 || attr.MqMsgsize <= 0 { + return nil, linuxerr.EINVAL + } + + if attr.MqMaxmsg > maxMsgHardLimit || (!creds.HasCapabilityIn(linux.CAP_SYS_RESOURCE, r.userNS) && (attr.MqMaxmsg > maxMsgLimit || attr.MqMsgsize > msgSizeLimit)) { + return nil, linuxerr.EINVAL + } + + return &Queue{ + owner: owner, + perms: perms, + maxMessageCount: attr.MqMaxmsg, + maxMessageSize: uint64(attr.MqMsgsize), + }, nil +} + +// Remove removes the queue with the given name from the registry. See +// mq_unlink(2). +func (r *Registry) Remove(ctx context.Context, name string) error { + if len(name) > MaxName { + return linuxerr.ENAMETOOLONG + } + + r.mu.Lock() + defer r.mu.Unlock() + return r.impl.Unlink(ctx, name) +} + +// Destroy destroys the registry and releases all held references. +func (r *Registry) Destroy(ctx context.Context) { + r.mu.Lock() + defer r.mu.Unlock() + r.impl.Destroy(ctx) +} + +// Impl returns RegistryImpl inside r. +func (r *Registry) Impl() RegistryImpl { + return r.impl +} + +// Queue represents a POSIX message queue. +// +// +stateify savable +type Queue struct { + // owner is the queue's owner. Immutable. + owner fs.FileOwner + + // perms is the queue's access permissions. Immutable. + perms fs.FilePermissions + + // mu protects all the fields below. + mu sync.Mutex `state:"nosave"` + + // senders is a queue of currently blocked senders. Senders are notified + // when space is available in the queue for a new message. + senders waiter.Queue + + // receivers is a queue of currently blocked receivers. Receivers are + // notified when a new message is inserted in the queue. 
+ receivers waiter.Queue + + // messages is a list of messages currently in the queue. + messages msgList + + // subscriber represents a task registered to receive async notification + // from this queue. + subscriber *Subscriber + + // messageCount is the number of messages currently in the queue. + messageCount int64 + + // maxMessageCount is the maximum number of messages that the queue can + // hold. + maxMessageCount int64 + + // maxMessageSize is the maximum size of a message held by the queue. + maxMessageSize uint64 + + // byteCount is the number of bytes of data in all messages in the queue. + byteCount uint64 +} + +// View is a view into a message queue. Views should only be used in file +// descriptions, but not inodes, because we use inodes to retrieve the actual +// queue, and only FDs are responsible for providing user functionality. +type View interface { + // TODO: Add Send and Receive when mq_timedsend(2) and mq_timedreceive(2) + // are implemented. + + // Flush checks if the calling process has attached a notification request + // to this queue, if yes, then the request is removed, and another process + // can attach a request. + Flush(ctx context.Context) + + waiter.Waitable +} + +// ReaderWriter provides a send and receive view into a queue. +type ReaderWriter struct { + *Queue + + block bool +} + +// Reader provides a receive-only view into a queue. +type Reader struct { + *Queue + + block bool +} + +// Writer provides a send-only view into a queue. +type Writer struct { + *Queue + + block bool +} + +// NewView creates a new view into a queue and returns it. 
+func NewView(q *Queue, access AccessType, block bool) (View, error) { + switch access { + case ReadWrite: + return ReaderWriter{Queue: q, block: block}, nil + case WriteOnly: + return Writer{Queue: q, block: block}, nil + case ReadOnly: + return Reader{Queue: q, block: block}, nil + default: + // This case can't happen, due to O_RDONLY flag being 0 and O_WRONLY + // being 1, so one of them must be true. + return nil, linuxerr.EINVAL + } +} + +// Message holds a message exchanged through a Queue via mq_timedsend(2) and +// mq_timedreceive(2), and additional info relating to the message. +// +// +stateify savable +type Message struct { + msgEntry + + // Text is the message's sent content. + Text string + + // Size is the message's size in bytes. + Size uint64 + + // Priority is the message's priority. + Priority uint32 +} + +// Subscriber represents a task registered for async notification from a Queue. +// +// +stateify savable +type Subscriber struct { + // TODO: Add fields when mq_notify(2) is implemented. + + // pid is the PID of the registered task. + pid int32 +} + +// Generate implements vfs.DynamicBytesSource.Generate. Queue is used as a +// DynamicBytesSource for mqfs's queueInode. +func (q *Queue) Generate(ctx context.Context, buf *bytes.Buffer) error { + q.mu.Lock() + defer q.mu.Unlock() + + var ( + pid int32 + method int + sigNumber int + ) + if q.subscriber != nil { + pid = q.subscriber.pid + // TODO: add method and sigNumber when mq_notify(2) is implemented. + } + + buf.WriteString( + fmt.Sprintf("QSIZE:%-10d NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n", + q.byteCount, method, sigNumber, pid), + ) + return nil +} + +// Flush implements View.Flush. +func (q *Queue) Flush(ctx context.Context) { + q.mu.Lock() + defer q.mu.Unlock() + + pid, ok := context.ThreadGroupIDFromContext(ctx) + if ok { + if q.subscriber != nil && pid == q.subscriber.pid { + q.subscriber = nil + } + } +} + +// Readiness implements Waitable.Readiness. 
+func (q *Queue) Readiness(mask waiter.EventMask) waiter.EventMask { + q.mu.Lock() + defer q.mu.Unlock() + + events := waiter.EventMask(0) + if q.messageCount > 0 { + events |= waiter.ReadableEvents + } + if q.messageCount < q.maxMessageCount { + events |= waiter.WritableEvents + } + return events & mask +} + +// EventRegister implements Waitable.EventRegister. +func (q *Queue) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + q.mu.Lock() + defer q.mu.Unlock() + + if mask&waiter.WritableEvents != 0 { + q.senders.EventRegister(e, waiter.EventOut) + } + if mask&waiter.ReadableEvents != 0 { + q.receivers.EventRegister(e, waiter.EventIn) + } +} + +// EventUnregister implements Waitable.EventUnregister. +func (q *Queue) EventUnregister(e *waiter.Entry) { + q.mu.Lock() + defer q.mu.Unlock() + + q.senders.EventUnregister(e) + q.receivers.EventUnregister(e) +} + +// HasPermissions returns true if the given credentials meet the access +// permissions required by the queue. +func (q *Queue) HasPermissions(creds *auth.Credentials, req fs.PermMask) bool { + p := q.perms.Other + if q.owner.UID == creds.EffectiveKUID { + p = q.perms.User + } else if creds.InGroup(q.owner.GID) { + p = q.perms.Group + } + return p.SupersetOf(req) +} diff --git a/pkg/sentry/kernel/mq/mq_state_autogen.go b/pkg/sentry/kernel/mq/mq_state_autogen.go new file mode 100644 index 000000000..4833c9b4e --- /dev/null +++ b/pkg/sentry/kernel/mq/mq_state_autogen.go @@ -0,0 +1,211 @@ +// automatically generated by stateify. 
+ +package mq + +import ( + "gvisor.dev/gvisor/pkg/state" +) + +func (l *msgList) StateTypeName() string { + return "pkg/sentry/kernel/mq.msgList" +} + +func (l *msgList) StateFields() []string { + return []string{ + "head", + "tail", + } +} + +func (l *msgList) beforeSave() {} + +// +checklocksignore +func (l *msgList) StateSave(stateSinkObject state.Sink) { + l.beforeSave() + stateSinkObject.Save(0, &l.head) + stateSinkObject.Save(1, &l.tail) +} + +func (l *msgList) afterLoad() {} + +// +checklocksignore +func (l *msgList) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &l.head) + stateSourceObject.Load(1, &l.tail) +} + +func (e *msgEntry) StateTypeName() string { + return "pkg/sentry/kernel/mq.msgEntry" +} + +func (e *msgEntry) StateFields() []string { + return []string{ + "next", + "prev", + } +} + +func (e *msgEntry) beforeSave() {} + +// +checklocksignore +func (e *msgEntry) StateSave(stateSinkObject state.Sink) { + e.beforeSave() + stateSinkObject.Save(0, &e.next) + stateSinkObject.Save(1, &e.prev) +} + +func (e *msgEntry) afterLoad() {} + +// +checklocksignore +func (e *msgEntry) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &e.next) + stateSourceObject.Load(1, &e.prev) +} + +func (r *Registry) StateTypeName() string { + return "pkg/sentry/kernel/mq.Registry" +} + +func (r *Registry) StateFields() []string { + return []string{ + "userNS", + "impl", + } +} + +func (r *Registry) beforeSave() {} + +// +checklocksignore +func (r *Registry) StateSave(stateSinkObject state.Sink) { + r.beforeSave() + stateSinkObject.Save(0, &r.userNS) + stateSinkObject.Save(1, &r.impl) +} + +func (r *Registry) afterLoad() {} + +// +checklocksignore +func (r *Registry) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &r.userNS) + stateSourceObject.Load(1, &r.impl) +} + +func (q *Queue) StateTypeName() string { + return "pkg/sentry/kernel/mq.Queue" +} + +func (q *Queue) StateFields() []string { + return 
[]string{ + "owner", + "perms", + "senders", + "receivers", + "messages", + "subscriber", + "messageCount", + "maxMessageCount", + "maxMessageSize", + "byteCount", + } +} + +func (q *Queue) beforeSave() {} + +// +checklocksignore +func (q *Queue) StateSave(stateSinkObject state.Sink) { + q.beforeSave() + stateSinkObject.Save(0, &q.owner) + stateSinkObject.Save(1, &q.perms) + stateSinkObject.Save(2, &q.senders) + stateSinkObject.Save(3, &q.receivers) + stateSinkObject.Save(4, &q.messages) + stateSinkObject.Save(5, &q.subscriber) + stateSinkObject.Save(6, &q.messageCount) + stateSinkObject.Save(7, &q.maxMessageCount) + stateSinkObject.Save(8, &q.maxMessageSize) + stateSinkObject.Save(9, &q.byteCount) +} + +func (q *Queue) afterLoad() {} + +// +checklocksignore +func (q *Queue) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &q.owner) + stateSourceObject.Load(1, &q.perms) + stateSourceObject.Load(2, &q.senders) + stateSourceObject.Load(3, &q.receivers) + stateSourceObject.Load(4, &q.messages) + stateSourceObject.Load(5, &q.subscriber) + stateSourceObject.Load(6, &q.messageCount) + stateSourceObject.Load(7, &q.maxMessageCount) + stateSourceObject.Load(8, &q.maxMessageSize) + stateSourceObject.Load(9, &q.byteCount) +} + +func (m *Message) StateTypeName() string { + return "pkg/sentry/kernel/mq.Message" +} + +func (m *Message) StateFields() []string { + return []string{ + "msgEntry", + "Text", + "Size", + "Priority", + } +} + +func (m *Message) beforeSave() {} + +// +checklocksignore +func (m *Message) StateSave(stateSinkObject state.Sink) { + m.beforeSave() + stateSinkObject.Save(0, &m.msgEntry) + stateSinkObject.Save(1, &m.Text) + stateSinkObject.Save(2, &m.Size) + stateSinkObject.Save(3, &m.Priority) +} + +func (m *Message) afterLoad() {} + +// +checklocksignore +func (m *Message) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &m.msgEntry) + stateSourceObject.Load(1, &m.Text) + stateSourceObject.Load(2, &m.Size) + 
stateSourceObject.Load(3, &m.Priority) +} + +func (s *Subscriber) StateTypeName() string { + return "pkg/sentry/kernel/mq.Subscriber" +} + +func (s *Subscriber) StateFields() []string { + return []string{ + "pid", + } +} + +func (s *Subscriber) beforeSave() {} + +// +checklocksignore +func (s *Subscriber) StateSave(stateSinkObject state.Sink) { + s.beforeSave() + stateSinkObject.Save(0, &s.pid) +} + +func (s *Subscriber) afterLoad() {} + +// +checklocksignore +func (s *Subscriber) StateLoad(stateSourceObject state.Source) { + stateSourceObject.Load(0, &s.pid) +} + +func init() { + state.Register((*msgList)(nil)) + state.Register((*msgEntry)(nil)) + state.Register((*Registry)(nil)) + state.Register((*Queue)(nil)) + state.Register((*Message)(nil)) + state.Register((*Subscriber)(nil)) +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index a6d8fb163..69a3227f0 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -103,6 +103,9 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { ipcns := t.IPCNamespace() if args.Flags&linux.CLONE_NEWIPC != 0 { ipcns = NewIPCNamespace(userns) + if VFS2Enabled { + ipcns.InitPosixQueues(t, t.k.VFS(), creds) + } } else { ipcns.IncRef() } @@ -464,6 +467,9 @@ func (t *Task) Unshare(flags int32) error { // namespace" t.ipcns.DecRef(t) t.ipcns = NewIPCNamespace(creds.UserNamespace) + if VFS2Enabled { + t.ipcns.InitPosixQueues(t, t.k.VFS(), creds) + } } var oldFDTable *FDTable if flags&linux.CLONE_FILES != 0 { diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index c82d9e82b..cb9bcd7c0 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" 
"gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -73,7 +74,7 @@ func (t *Task) contextValue(key interface{}, isTaskGoroutine bool) interface{} { defer t.mu.Unlock() } return t.utsns - case CtxIPCNamespace: + case ipc.CtxIPCNamespace: if !isTaskGoroutine { t.mu.Lock() defer t.mu.Unlock() diff --git a/pkg/sentry/syscalls/linux/vfs2/mq.go b/pkg/sentry/syscalls/linux/vfs2/mq.go new file mode 100644 index 000000000..d5d81c6e2 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/mq.go @@ -0,0 +1,98 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/mq" +) + +// MqOpen implements mq_open(2). 
+func MqOpen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nameAddr := args[0].Pointer() + flag := args[1].Int() + mode := args[2].ModeT() + attrAddr := args[3].Pointer() + + name, err := t.CopyInString(nameAddr, mq.MaxName) + if err != nil { + return 0, nil, err + } + + rOnly := flag&linux.O_RDONLY == linux.O_RDONLY + wOnly := flag&linux.O_WRONLY == linux.O_WRONLY + readWrite := flag&linux.O_RDWR == linux.O_RDWR + + create := flag&linux.O_CREAT == linux.O_CREAT + exclusive := flag&linux.O_EXCL == linux.O_EXCL + block := flag&linux.O_NONBLOCK != linux.O_NONBLOCK + + var attr linux.MqAttr + var attrPtr *linux.MqAttr + if attrAddr != 0 { + if _, err := attr.CopyIn(t, attrAddr); err != nil { + return 0, nil, err + } + attrPtr = &attr + } + + opts := openOpts(name, rOnly, wOnly, readWrite, create, exclusive, block) + + r := t.IPCNamespace().PosixQueues() + queue, err := r.FindOrCreate(t, opts, linux.FileMode(mode), attrPtr) + if err != nil { + return 0, nil, err + } + + fd, err := t.NewFDFromVFS2(0, queue, kernel.FDFlags{ + CloseOnExec: flag&linux.O_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil +} + +// MqUnlink implements mq_unlink(2). 
+func MqUnlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nameAddr := args[0].Pointer() + name, err := t.CopyInString(nameAddr, mq.MaxName) + if err != nil { + return 0, nil, err + } + return 0, nil, t.IPCNamespace().PosixQueues().Remove(t, name) +} + +func openOpts(name string, rOnly, wOnly, readWrite, create, exclusive, block bool) mq.OpenOpts { + var access mq.AccessType + switch { + case readWrite: + access = mq.ReadWrite + case wOnly: + access = mq.WriteOnly + case rOnly: + access = mq.ReadOnly + } + + return mq.OpenOpts{ + Name: name, + Access: access, + Create: create, + Exclusive: exclusive, + Block: block, + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 0fc81e694..4eb15a7f2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -112,6 +112,8 @@ func Override() { s.Table[232] = syscalls.Supported("epoll_wait", EpollWait) s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl) s.Table[235] = syscalls.Supported("utimes", Utimes) + s.Table[240] = syscalls.Supported("mq_open", MqOpen) + s.Table[241] = syscalls.Supported("mq_unlink", MqUnlink) s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil) s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil) s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil) @@ -241,6 +243,8 @@ func Override() { s.Table[86] = syscalls.Supported("timerfd_settime", TimerfdSettime) s.Table[87] = syscalls.Supported("timerfd_gettime", TimerfdGettime) s.Table[88] = syscalls.Supported("utimensat", Utimensat) + s.Table[180] = syscalls.Supported("mq_open", MqOpen) + s.Table[181] = syscalls.Supported("mq_unlink", MqUnlink) s.Table[198] = 
syscalls.Supported("socket", Socket) s.Table[199] = syscalls.Supported("socketpair", SocketPair) s.Table[200] = syscalls.Supported("bind", Bind) @@ -271,6 +275,5 @@ func Override() { s.Table[287] = syscalls.Supported("pwritev2", Pwritev2) s.Table[291] = syscalls.Supported("statx", Statx) s.Table[441] = syscalls.Supported("epoll_pwait2", EpollPwait2) - s.Init() } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index ac1e5ac37..9f0d1ae36 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -36,6 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse" "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/mqfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay" "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" @@ -95,6 +96,10 @@ func registerFilesystems(k *kernel.Kernel) error { AllowUserList: true, AllowUserMount: true, }) + vfsObj.MustRegisterFilesystemType(mqfs.Name, &mqfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) // Setup files in devtmpfs. if err := memdev.Register(vfsObj); err != nil { |