diff options
author | gVisor bot <gvisor-bot@google.com> | 2021-10-21 16:05:53 -0700 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2021-10-21 16:05:53 -0700 |
commit | 14f4113924c8b7b8c161be7335b147106d0c4a26 (patch) | |
tree | 55997b73a8b0f8959077b12bfff9418187e8e2ac /pkg/sentry/kernel/mq/mq.go | |
parent | b928a241efc838d378d531ba2446e8611d95c1ac (diff) | |
parent | f03dc73f0f46d0ff1ae209fefc98ee3d7fc725d2 (diff) |
Merge pull request #6345 from sudo-sturbia:mq/syscalls
PiperOrigin-RevId: 404901660
Diffstat (limited to 'pkg/sentry/kernel/mq/mq.go')
-rw-r--r-- | pkg/sentry/kernel/mq/mq.go | 457 |
1 files changed, 457 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/mq/mq.go b/pkg/sentry/kernel/mq/mq.go new file mode 100644 index 000000000..50ca6d34a --- /dev/null +++ b/pkg/sentry/kernel/mq/mq.go @@ -0,0 +1,457 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package mq provides an implementation for POSIX message queues. +package mq + +import ( + "bytes" + "fmt" + "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/waiter" +) + +// AccessType is the access type passed to mq_open. +type AccessType int + +// Possible access types. +const ( + ReadOnly AccessType = iota + WriteOnly + ReadWrite +) + +// MaxName is the maximum size for a queue name. +const MaxName = 255 + +const ( + maxPriority = linux.MQ_PRIO_MAX - 1 // Highest possible message priority. + + maxQueuesDefault = linux.DFLT_QUEUESMAX // Default max number of queues. + + maxMsgDefault = linux.DFLT_MSG // Default max number of messages per queue. + maxMsgMin = linux.MIN_MSGMAX // Min value for max number of messages per queue. + maxMsgLimit = linux.DFLT_MSGMAX // Limit for max number of messages per queue. + maxMsgHardLimit = linux.HARD_MSGMAX // Hard limit for max number of messages per queue. + + msgSizeDefault = linux.DFLT_MSGSIZE // Default max message size. + msgSizeMin = linux.MIN_MSGSIZEMAX // Min value for max message size. + msgSizeLimit = linux.DFLT_MSGSIZEMAX // Limit for max message size. + msgSizeHardLimit = linux.HARD_MSGSIZEMAX // Hard limit for max message size. +) + +// Registry is a POSIX message queue registry. +// +// Unlike SysV utilities, Registry is not map-based. It uses a provided +// RegistryImpl backed by a virtual filesystem to implement registry operations. +// +// +stateify savable +type Registry struct { + // userNS is the user namespace containing this registry. Immutable. + userNS *auth.UserNamespace + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + + // impl is an implementation of several message queue utilities needed by + // the registry. impl should be provided by mqfs. + impl RegistryImpl +} + +// RegistryImpl defines utilities needed by a Registry to provide actual +// registry implementation. It works mainly as an abstraction layer used by +// Registry to avoid dealing directly with the filesystem. RegistryImpl should +// be implemented by mqfs and provided to Registry at initialization. +type RegistryImpl interface { + // Get searchs for a queue with the given name, if it exists, the queue is + // used to create a new FD, return it and return true. If the queue doesn't + // exist, return false and no error. An error is returned if creation fails. + Get(ctx context.Context, name string, access AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error) + + // New creates a new inode and file description using the given queue, + // inserts the inode into the filesystem tree using the given name, and + // returns the file description. An error is returned if creation fails, or + // if the name already exists. + New(ctx context.Context, name string, q *Queue, access AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) + + // Unlink removes the queue with given name from the registry, and returns + // an error if the name doesn't exist. + Unlink(ctx context.Context, name string) error + + // Destroy destroys the registry. + Destroy(context.Context) +} + +// NewRegistry returns a new, initialized message queue registry. NewRegistry +// should be called when a new message queue filesystem is created, once per +// IPCNamespace. +func NewRegistry(userNS *auth.UserNamespace, impl RegistryImpl) *Registry { + return &Registry{ + userNS: userNS, + impl: impl, + } +} + +// OpenOpts holds the options passed to FindOrCreate. +type OpenOpts struct { + Name string + Access AccessType + Create bool + Exclusive bool + Block bool +} + +// FindOrCreate creates a new POSIX message queue or opens an existing queue. +// See mq_open(2). +func (r *Registry) FindOrCreate(ctx context.Context, opts OpenOpts, perm linux.FileMode, attr *linux.MqAttr) (*vfs.FileDescription, error) { + // mq_overview(7) mentions that: "Each message queue is identified by a name + // of the form '/somename'", but the mq_open(3) man pages mention: + // "The mq_open() library function is implemented on top of a system call + // of the same name. The library function performs the check that the + // name starts with a slash (/), giving the EINVAL error if it does not. + // The kernel system call expects name to contain no preceding slash, so + // the C library function passes name without the preceding slash (i.e., + // name+1) to the system call." + // So we don't need to check it. + + if len(opts.Name) == 0 { + return nil, linuxerr.ENOENT + } + if len(opts.Name) > MaxName { + return nil, linuxerr.ENAMETOOLONG + } + if strings.ContainsRune(opts.Name, '/') { + return nil, linuxerr.EACCES + } + if opts.Name == "." || opts.Name == ".." { + return nil, linuxerr.EINVAL + } + + // Construct status flags. + var flags uint32 + if opts.Block { + flags = linux.O_NONBLOCK + } + switch opts.Access { + case ReadOnly: + flags = flags | linux.O_RDONLY + case WriteOnly: + flags = flags | linux.O_WRONLY + case ReadWrite: + flags = flags | linux.O_RDWR + } + + r.mu.Lock() + defer r.mu.Unlock() + fd, ok, err := r.impl.Get(ctx, opts.Name, opts.Access, opts.Block, flags) + if err != nil { + return nil, err + } + + if ok { + if opts.Create && opts.Exclusive { + // "Both O_CREAT and O_EXCL were specified in oflag, but a queue + // with this name already exists." + return nil, linuxerr.EEXIST + } + return fd, nil + } + + if !opts.Create { + // "The O_CREAT flag was not specified in oflag, and no queue with this name + // exists." + return nil, linuxerr.ENOENT + } + + q, err := r.newQueueLocked(auth.CredentialsFromContext(ctx), fs.FileOwnerFromContext(ctx), fs.FilePermsFromMode(perm), attr) + if err != nil { + return nil, err + } + return r.impl.New(ctx, opts.Name, q, opts.Access, opts.Block, perm, flags) +} + +// newQueueLocked creates a new queue using the given attributes. If attr is nil +// return a queue with default values, otherwise use attr to create a new queue, +// and return an error if attributes are invalid. +func (r *Registry) newQueueLocked(creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions, attr *linux.MqAttr) (*Queue, error) { + if attr == nil { + return &Queue{ + owner: owner, + perms: perms, + maxMessageCount: int64(maxMsgDefault), + maxMessageSize: uint64(msgSizeDefault), + }, nil + } + + // "O_CREAT was specified in oflag, and attr was not NULL, but + // attr->mq_maxmsg or attr->mq_msqsize was invalid. Both of these fields + // these fields must be greater than zero. In a process that is + // unprivileged (does not have the CAP_SYS_RESOURCE capability), + // attr->mq_maxmsg must be less than or equal to the msg_max limit, and + // attr->mq_msgsize must be less than or equal to the msgsize_max limit. + // In addition, even in a privileged process, attr->mq_maxmsg cannot + // exceed the HARD_MAX limit." - man mq_open(3). + if attr.MqMaxmsg <= 0 || attr.MqMsgsize <= 0 { + return nil, linuxerr.EINVAL + } + + if attr.MqMaxmsg > maxMsgHardLimit || (!creds.HasCapabilityIn(linux.CAP_SYS_RESOURCE, r.userNS) && (attr.MqMaxmsg > maxMsgLimit || attr.MqMsgsize > msgSizeLimit)) { + return nil, linuxerr.EINVAL + } + + return &Queue{ + owner: owner, + perms: perms, + maxMessageCount: attr.MqMaxmsg, + maxMessageSize: uint64(attr.MqMsgsize), + }, nil +} + +// Remove removes the queue with the given name from the registry. See +// mq_unlink(2). +func (r *Registry) Remove(ctx context.Context, name string) error { + if len(name) > MaxName { + return linuxerr.ENAMETOOLONG + } + + r.mu.Lock() + defer r.mu.Unlock() + return r.impl.Unlink(ctx, name) +} + +// Destroy destroys the registry and releases all held references. +func (r *Registry) Destroy(ctx context.Context) { + r.mu.Lock() + defer r.mu.Unlock() + r.impl.Destroy(ctx) +} + +// Impl returns RegistryImpl inside r. +func (r *Registry) Impl() RegistryImpl { + return r.impl +} + +// Queue represents a POSIX message queue. +// +// +stateify savable +type Queue struct { + // owner is the registry's owner. Immutable. + owner fs.FileOwner + + // perms is the registry's access permissions. Immutable. + perms fs.FilePermissions + + // mu protects all the fields below. + mu sync.Mutex `state:"nosave"` + + // senders is a queue of currently blocked senders. Senders are notified + // when space isi available in the queue for a new message. + senders waiter.Queue + + // receivers is a queue of currently blocked receivers. Receivers are + // notified when a new message is inserted in the queue. + receivers waiter.Queue + + // messages is a list of messages currently in the queue. + messages msgList + + // subscriber represents a task registered to receive async notification + // from this queue. + subscriber *Subscriber + + // messageCount is the number of messages currently in the queue. + messageCount int64 + + // maxMessageCount is the maximum number of messages that the queue can + // hold. + maxMessageCount int64 + + // maxMessageSize is the maximum size of a message held by the queue. + maxMessageSize uint64 + + // byteCount is the number of bytes of data in all messages in the queue. + byteCount uint64 +} + +// View is a view into a message queue. Views should only be used in file +// descriptions, but not inodes, because we use inodes to retreive the actual +// queue, and only FDs are responsible for providing user functionality. +type View interface { + // TODO: Add Send and Receive when mq_timedsend(2) and mq_timedreceive(2) + // are implemented. + + // Flush checks if the calling process has attached a notification request + // to this queue, if yes, then the request is removed, and another process + // can attach a request. + Flush(ctx context.Context) + + waiter.Waitable +} + +// ReaderWriter provides a send and receive view into a queue. +type ReaderWriter struct { + *Queue + + block bool +} + +// Reader provides a send-only view into a queue. +type Reader struct { + *Queue + + block bool +} + +// Writer provides a receive-only view into a queue. +type Writer struct { + *Queue + + block bool +} + +// NewView creates a new view into a queue and returns it. +func NewView(q *Queue, access AccessType, block bool) (View, error) { + switch access { + case ReadWrite: + return ReaderWriter{Queue: q, block: block}, nil + case WriteOnly: + return Writer{Queue: q, block: block}, nil + case ReadOnly: + return Reader{Queue: q, block: block}, nil + default: + // This case can't happen, due to O_RDONLY flag being 0 and O_WRONLY + // being 1, so one of them must be true. + return nil, linuxerr.EINVAL + } +} + +// Message holds a message exchanged through a Queue via mq_timedsend(2) and +// mq_timedreceive(2), and additional info relating to the message. +// +// +stateify savable +type Message struct { + msgEntry + + // Text is the message's sent content. + Text string + + // Size is the message's size in bytes. + Size uint64 + + // Priority is the message's priority. + Priority uint32 +} + +// Subscriber represents a task registered for async notification from a Queue. +// +// +stateify savable +type Subscriber struct { + // TODO: Add fields when mq_notify(2) is implemented. + + // pid is the PID of the registered task. + pid int32 +} + +// Generate implements vfs.DynamicBytesSource.Generate. Queue is used as a +// DynamicBytesSource for mqfs's queueInode. +func (q *Queue) Generate(ctx context.Context, buf *bytes.Buffer) error { + q.mu.Lock() + defer q.mu.Unlock() + + var ( + pid int32 + method int + sigNumber int + ) + if q.subscriber != nil { + pid = q.subscriber.pid + // TODO: add method and sigNumber when mq_notify(2) is implemented. + } + + buf.WriteString( + fmt.Sprintf("QSIZE:%-10d NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n", + q.byteCount, method, sigNumber, pid), + ) + return nil +} + +// Flush implements View.Flush. +func (q *Queue) Flush(ctx context.Context) { + q.mu.Lock() + defer q.mu.Unlock() + + pid, ok := context.ThreadGroupIDFromContext(ctx) + if ok { + if q.subscriber != nil && pid == q.subscriber.pid { + q.subscriber = nil + } + } +} + +// Readiness implements Waitable.Readiness. +func (q *Queue) Readiness(mask waiter.EventMask) waiter.EventMask { + q.mu.Lock() + defer q.mu.Unlock() + + events := waiter.EventMask(0) + if q.messageCount > 0 { + events |= waiter.ReadableEvents + } + if q.messageCount < q.maxMessageCount { + events |= waiter.WritableEvents + } + return events & mask +} + +// EventRegister implements Waitable.EventRegister. +func (q *Queue) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + q.mu.Lock() + defer q.mu.Unlock() + + if mask&waiter.WritableEvents != 0 { + q.senders.EventRegister(e, waiter.EventOut) + } + if mask&waiter.ReadableEvents != 0 { + q.receivers.EventRegister(e, waiter.EventIn) + } +} + +// EventUnregister implements Waitable.EventUnregister. +func (q *Queue) EventUnregister(e *waiter.Entry) { + q.mu.Lock() + defer q.mu.Unlock() + + q.senders.EventUnregister(e) + q.receivers.EventUnregister(e) +} + +// HasPermissions returns true if the given credentials meet the access +// permissions required by the queue. +func (q *Queue) HasPermissions(creds *auth.Credentials, req fs.PermMask) bool { + p := q.perms.Other + if q.owner.UID == creds.EffectiveKUID { + p = q.perms.User + } else if creds.InGroup(q.owner.GID) { + p = q.perms.Group + } + return p.SupersetOf(req) +} |