// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package mq provides an implementation for POSIX message queues. package mq import ( "bytes" "fmt" "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) // AccessType is the access type passed to mq_open. type AccessType int // Possible access types. const ( ReadOnly AccessType = iota WriteOnly ReadWrite ) // MaxName is the maximum size for a queue name. const MaxName = 255 const ( maxPriority = linux.MQ_PRIO_MAX - 1 // Highest possible message priority. maxQueuesDefault = linux.DFLT_QUEUESMAX // Default max number of queues. maxMsgDefault = linux.DFLT_MSG // Default max number of messages per queue. maxMsgMin = linux.MIN_MSGMAX // Min value for max number of messages per queue. maxMsgLimit = linux.DFLT_MSGMAX // Limit for max number of messages per queue. maxMsgHardLimit = linux.HARD_MSGMAX // Hard limit for max number of messages per queue. msgSizeDefault = linux.DFLT_MSGSIZE // Default max message size. msgSizeMin = linux.MIN_MSGSIZEMAX // Min value for max message size. msgSizeLimit = linux.DFLT_MSGSIZEMAX // Limit for max message size. msgSizeHardLimit = linux.HARD_MSGSIZEMAX // Hard limit for max message size. ) // Registry is a POSIX message queue registry. // // Unlike SysV utilities, Registry is not map-based. It uses a provided // RegistryImpl backed by a virtual filesystem to implement registry operations. // // +stateify savable type Registry struct { // userNS is the user namespace containing this registry. Immutable. userNS *auth.UserNamespace // mu protects all fields below. mu sync.Mutex `state:"nosave"` // impl is an implementation of several message queue utilities needed by // the registry. impl should be provided by mqfs. impl RegistryImpl } // RegistryImpl defines utilities needed by a Registry to provide actual // registry implementation. It works mainly as an abstraction layer used by // Registry to avoid dealing directly with the filesystem. RegistryImpl should // be implemented by mqfs and provided to Registry at initialization. type RegistryImpl interface { // Get searchs for a queue with the given name, if it exists, the queue is // used to create a new FD, return it and return true. If the queue doesn't // exist, return false and no error. An error is returned if creation fails. Get(ctx context.Context, name string, access AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error) // New creates a new inode and file description using the given queue, // inserts the inode into the filesystem tree using the given name, and // returns the file description. An error is returned if creation fails, or // if the name already exists. New(ctx context.Context, name string, q *Queue, access AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) // Unlink removes the queue with given name from the registry, and returns // an error if the name doesn't exist. Unlink(ctx context.Context, name string) error // Destroy destroys the registry. Destroy(context.Context) } // NewRegistry returns a new, initialized message queue registry. NewRegistry // should be called when a new message queue filesystem is created, once per // IPCNamespace. func NewRegistry(userNS *auth.UserNamespace, impl RegistryImpl) *Registry { return &Registry{ userNS: userNS, impl: impl, } } // OpenOpts holds the options passed to FindOrCreate. type OpenOpts struct { Name string Access AccessType Create bool Exclusive bool Block bool } // FindOrCreate creates a new POSIX message queue or opens an existing queue. // See mq_open(2). func (r *Registry) FindOrCreate(ctx context.Context, opts OpenOpts, mode linux.FileMode, attr *linux.MqAttr) (*vfs.FileDescription, error) { // mq_overview(7) mentions that: "Each message queue is identified by a name // of the form '/somename'", but the mq_open(3) man pages mention: // "The mq_open() library function is implemented on top of a system call // of the same name. The library function performs the check that the // name starts with a slash (/), giving the EINVAL error if it does not. // The kernel system call expects name to contain no preceding slash, so // the C library function passes name without the preceding slash (i.e., // name+1) to the system call." // So we don't need to check it. if len(opts.Name) == 0 { return nil, linuxerr.ENOENT } if len(opts.Name) > MaxName { return nil, linuxerr.ENAMETOOLONG } if strings.ContainsRune(opts.Name, '/') { return nil, linuxerr.EACCES } if opts.Name == "." || opts.Name == ".." { return nil, linuxerr.EINVAL } // Construct status flags. var flags uint32 if opts.Block { flags = linux.O_NONBLOCK } switch opts.Access { case ReadOnly: flags = flags | linux.O_RDONLY case WriteOnly: flags = flags | linux.O_WRONLY case ReadWrite: flags = flags | linux.O_RDWR } r.mu.Lock() defer r.mu.Unlock() fd, ok, err := r.impl.Get(ctx, opts.Name, opts.Access, opts.Block, flags) if err != nil { return nil, err } if ok { if opts.Create && opts.Exclusive { // "Both O_CREAT and O_EXCL were specified in oflag, but a queue // with this name already exists." return nil, linuxerr.EEXIST } return fd, nil } if !opts.Create { // "The O_CREAT flag was not specified in oflag, and no queue with this name // exists." return nil, linuxerr.ENOENT } q, err := r.newQueueLocked(auth.CredentialsFromContext(ctx), fs.FileOwnerFromContext(ctx), fs.FilePermsFromMode(mode), attr) if err != nil { return nil, err } return r.impl.New(ctx, opts.Name, q, opts.Access, opts.Block, mode.Permissions(), flags) } // newQueueLocked creates a new queue using the given attributes. If attr is nil // return a queue with default values, otherwise use attr to create a new queue, // and return an error if attributes are invalid. func (r *Registry) newQueueLocked(creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions, attr *linux.MqAttr) (*Queue, error) { if attr == nil { return &Queue{ owner: owner, perms: perms, maxMessageCount: int64(maxMsgDefault), maxMessageSize: uint64(msgSizeDefault), }, nil } // "O_CREAT was specified in oflag, and attr was not NULL, but // attr->mq_maxmsg or attr->mq_msqsize was invalid. Both of these fields // these fields must be greater than zero. In a process that is // unprivileged (does not have the CAP_SYS_RESOURCE capability), // attr->mq_maxmsg must be less than or equal to the msg_max limit, and // attr->mq_msgsize must be less than or equal to the msgsize_max limit. // In addition, even in a privileged process, attr->mq_maxmsg cannot // exceed the HARD_MAX limit." - man mq_open(3). if attr.MqMaxmsg <= 0 || attr.MqMsgsize <= 0 { return nil, linuxerr.EINVAL } if attr.MqMaxmsg > maxMsgHardLimit || (!creds.HasCapabilityIn(linux.CAP_SYS_RESOURCE, r.userNS) && (attr.MqMaxmsg > maxMsgLimit || attr.MqMsgsize > msgSizeLimit)) { return nil, linuxerr.EINVAL } return &Queue{ owner: owner, perms: perms, maxMessageCount: attr.MqMaxmsg, maxMessageSize: uint64(attr.MqMsgsize), }, nil } // Remove removes the queue with the given name from the registry. See // mq_unlink(2). func (r *Registry) Remove(ctx context.Context, name string) error { if len(name) > MaxName { return linuxerr.ENAMETOOLONG } r.mu.Lock() defer r.mu.Unlock() return r.impl.Unlink(ctx, name) } // Destroy destroys the registry and releases all held references. func (r *Registry) Destroy(ctx context.Context) { r.mu.Lock() defer r.mu.Unlock() r.impl.Destroy(ctx) } // Impl returns RegistryImpl inside r. func (r *Registry) Impl() RegistryImpl { return r.impl } // Queue represents a POSIX message queue. // // +stateify savable type Queue struct { // owner is the registry's owner. Immutable. owner fs.FileOwner // perms is the registry's access permissions. Immutable. perms fs.FilePermissions // mu protects all the fields below. mu sync.Mutex `state:"nosave"` // senders is a queue of currently blocked senders. Senders are notified // when space isi available in the queue for a new message. senders waiter.Queue // receivers is a queue of currently blocked receivers. Receivers are // notified when a new message is inserted in the queue. receivers waiter.Queue // messages is a list of messages currently in the queue. messages msgList // subscriber represents a task registered to receive async notification // from this queue. subscriber *Subscriber // messageCount is the number of messages currently in the queue. messageCount int64 // maxMessageCount is the maximum number of messages that the queue can // hold. maxMessageCount int64 // maxMessageSize is the maximum size of a message held by the queue. maxMessageSize uint64 // byteCount is the number of bytes of data in all messages in the queue. byteCount uint64 } // View is a view into a message queue. Views should only be used in file // descriptions, but not inodes, because we use inodes to retreive the actual // queue, and only FDs are responsible for providing user functionality. type View interface { // TODO: Add Send and Receive when mq_timedsend(2) and mq_timedreceive(2) // are implemented. // Flush checks if the calling process has attached a notification request // to this queue, if yes, then the request is removed, and another process // can attach a request. Flush(ctx context.Context) waiter.Waitable } // ReaderWriter provides a send and receive view into a queue. type ReaderWriter struct { *Queue block bool } // Reader provides a send-only view into a queue. type Reader struct { *Queue block bool } // Writer provides a receive-only view into a queue. type Writer struct { *Queue block bool } // NewView creates a new view into a queue and returns it. func NewView(q *Queue, access AccessType, block bool) (View, error) { switch access { case ReadWrite: return ReaderWriter{Queue: q, block: block}, nil case WriteOnly: return Writer{Queue: q, block: block}, nil case ReadOnly: return Reader{Queue: q, block: block}, nil default: // This case can't happen, due to O_RDONLY flag being 0 and O_WRONLY // being 1, so one of them must be true. return nil, linuxerr.EINVAL } } // Message holds a message exchanged through a Queue via mq_timedsend(2) and // mq_timedreceive(2), and additional info relating to the message. // // +stateify savable type Message struct { msgEntry // Text is the message's sent content. Text string // Size is the message's size in bytes. Size uint64 // Priority is the message's priority. Priority uint32 } // Subscriber represents a task registered for async notification from a Queue. // // +stateify savable type Subscriber struct { // TODO: Add fields when mq_notify(2) is implemented. // pid is the PID of the registered task. pid int32 } // Generate implements vfs.DynamicBytesSource.Generate. Queue is used as a // DynamicBytesSource for mqfs's queueInode. func (q *Queue) Generate(ctx context.Context, buf *bytes.Buffer) error { q.mu.Lock() defer q.mu.Unlock() var ( pid int32 method int sigNumber int ) if q.subscriber != nil { pid = q.subscriber.pid // TODO: add method and sigNumber when mq_notify(2) is implemented. } buf.WriteString( fmt.Sprintf("QSIZE:%-10d NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n", q.byteCount, method, sigNumber, pid), ) return nil } // Flush implements View.Flush. func (q *Queue) Flush(ctx context.Context) { q.mu.Lock() defer q.mu.Unlock() pid, ok := auth.ThreadGroupIDFromContext(ctx) if ok { if q.subscriber != nil && pid == q.subscriber.pid { q.subscriber = nil } } } // Readiness implements Waitable.Readiness. func (q *Queue) Readiness(mask waiter.EventMask) waiter.EventMask { q.mu.Lock() defer q.mu.Unlock() events := waiter.EventMask(0) if q.messageCount > 0 { events |= waiter.ReadableEvents } if q.messageCount < q.maxMessageCount { events |= waiter.WritableEvents } return events & mask } // EventRegister implements Waitable.EventRegister. func (q *Queue) EventRegister(e *waiter.Entry, mask waiter.EventMask) { q.mu.Lock() defer q.mu.Unlock() if mask&waiter.WritableEvents != 0 { q.senders.EventRegister(e, waiter.EventOut) } if mask&waiter.ReadableEvents != 0 { q.receivers.EventRegister(e, waiter.EventIn) } } // EventUnregister implements Waitable.EventUnregister. func (q *Queue) EventUnregister(e *waiter.Entry) { q.mu.Lock() defer q.mu.Unlock() q.senders.EventUnregister(e) q.receivers.EventUnregister(e) } // HasPermissions returns true if the given credentials meet the access // permissions required by the queue. func (q *Queue) HasPermissions(creds *auth.Credentials, req fs.PermMask) bool { p := q.perms.Other if q.owner.UID == creds.EffectiveKUID { p = q.perms.User } else if creds.InGroup(q.owner.GID) { p = q.perms.Group } return p.SupersetOf(req) }