From fc8819f43cae94db8345060f0af00e013dc86098 Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Thu, 15 Jul 2021 15:23:28 +0200 Subject: Test creation and deletion of POSIX message queues. Updates #136 --- test/syscalls/BUILD | 4 + test/syscalls/linux/BUILD | 16 ++++ test/syscalls/linux/mq.cc | 234 ++++++++++++++++++++++++++++++++++++++++++++++ test/util/temp_path.cc | 26 +++--- test/util/temp_path.h | 3 + 5 files changed, 269 insertions(+), 14 deletions(-) create mode 100644 test/syscalls/linux/mq.cc diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 3b55112e6..43854e80b 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -325,6 +325,10 @@ syscall_test( test = "//test/syscalls/linux:mount_test", ) +syscall_test( + test = "//test/syscalls/linux:mq_test", +) + syscall_test( size = "medium", test = "//test/syscalls/linux:mremap_test", diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index a4efd9486..ed2adfbc9 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -4170,6 +4170,22 @@ cc_binary( ], ) +cc_binary( + name = "mq_test", + testonly = 1, + srcs = ["mq.cc"], + linkstatic = 1, + linkopts = [ + "-lrt", + ], + deps = [ + "//test/util:posix_error", + "//test/util:test_main", + "//test/util:temp_path", + "//test/util:test_util", + ], +) + cc_binary( name = "semaphore_test", testonly = 1, diff --git a/test/syscalls/linux/mq.cc b/test/syscalls/linux/mq.cc new file mode 100644 index 000000000..610d41fb6 --- /dev/null +++ b/test/syscalls/linux/mq.cc @@ -0,0 +1,234 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include + +#include "test/util/posix_error.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +#define NAME_MAX 255 + +namespace gvisor { +namespace testing { +namespace { + +using ::testing::_; + +// PosixQueue is a RAII class used to automatically clean POSIX message queues. +class PosixQueue { + public: + PosixQueue(mqd_t fd, std::string name) : fd_(fd), name_(std::string(name)) {} + PosixQueue(const PosixQueue&) = delete; + PosixQueue& operator=(const PosixQueue&) = delete; + + // Move constructor. + PosixQueue(PosixQueue&& q) : fd_(q.fd_), name_(q.name_) { + // Call PosixQueue::release, to prevent the object being released from + // unlinking the underlying queue. + q.release(); + } + + ~PosixQueue() { + if (fd_ != -1) { + EXPECT_THAT(mq_close(fd_), SyscallSucceeds()); + EXPECT_THAT(mq_unlink(name_.c_str()), SyscallSucceeds()); + } + } + + mqd_t fd() { return fd_; } + + const char* name() { return name_.c_str(); } + + mqd_t release() { + mqd_t old = fd_; + fd_ = -1; + return old; + } + + private: + mqd_t fd_; + std::string name_; +}; + +// MqOpen wraps mq_open(3) using a given name. +PosixErrorOr MqOpen(std::string name, int oflag) { + mqd_t fd = mq_open(name.c_str(), oflag); + if (fd == -1) { + return PosixError(errno, absl::StrFormat("mq_open(%s, %d)", name, oflag)); + } + return PosixQueue(fd, name); +} + +// MqOpen wraps mq_open(3) using a given name. +PosixErrorOr MqOpen(int oflag, mode_t mode, struct mq_attr* attr) { + auto name = "/" + NextTempBasename(); + mqd_t fd = mq_open(name.c_str(), oflag, mode, attr); + if (fd == -1) { + return PosixError(errno, absl::StrFormat("mq_open(%d)", oflag)); + } + return PosixQueue(fd, name); +} + +// MqOpen wraps mq_open(3) using a generated name. +PosixErrorOr MqOpen(std::string name, int oflag, mode_t mode, + struct mq_attr* attr) { + mqd_t fd = mq_open(name.c_str(), oflag, mode, attr); + if (fd == -1) { + return PosixError(errno, absl::StrFormat("mq_open(%d)", oflag)); + } + return PosixQueue(fd, name); +} + +// MqUnlink wraps mq_unlink(2). +PosixError MqUnlink(std::string name) { + int err = mq_unlink(name.c_str()); + if (err == -1) { + return PosixError(errno, absl::StrFormat("mq_unlink(%s)", name.c_str())); + } + return NoError(); +} + +// MqClose wraps mq_close(2). +PosixError MqClose(mqd_t fd) { + int err = mq_close(fd); + if (err == -1) { + return PosixError(errno, absl::StrFormat("mq_close(%d)", fd)); + } + return NoError(); +} + +// Test simple opening and closing of a message queue. +TEST(MqTest, Open) { + GTEST_SKIP(); + EXPECT_THAT(MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr), + IsPosixErrorOkAndHolds(_)); +} + +// Test mq_open(2) after mq_unlink(2). +TEST(MqTest, OpenAfterUnlink) { + GTEST_SKIP(); + + PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( + MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr)); + + ASSERT_NO_ERRNO(MqUnlink(queue.name())); + EXPECT_THAT(MqOpen(queue.name(), O_RDWR), PosixErrorIs(ENOENT)); + ASSERT_NO_ERRNO(MqClose(queue.release())); +} + +// Test using invalid args with mq_open. +TEST(MqTest, OpenInvalidArgs) { + GTEST_SKIP(); + + // Name must start with a slash. + EXPECT_THAT(MqOpen("test", O_RDWR), PosixErrorIs(EINVAL)); + + // Name can't contain more that one slash. + EXPECT_THAT(MqOpen("/test/name", O_RDWR), PosixErrorIs(EACCES)); + + // Both "." and ".." can't be used as queue names. + EXPECT_THAT(MqOpen(".", O_RDWR), PosixErrorIs(EINVAL)); + EXPECT_THAT(MqOpen("..", O_RDWR), PosixErrorIs(EINVAL)); + + // mq_attr's mq_maxmsg and mq_msgsize must be > 0. + struct mq_attr attr; + attr.mq_maxmsg = -1; + attr.mq_msgsize = 10; + + EXPECT_THAT(MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, &attr), + PosixErrorIs(EINVAL)); + + attr.mq_maxmsg = 10; + attr.mq_msgsize = -1; + + EXPECT_THAT(MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, &attr), + PosixErrorIs(EINVAL)); + + // Names should be shorter than NAME_MAX. + char max[NAME_MAX + 3]; + max[0] = '/'; + for (size_t i = 1; i < NAME_MAX + 2; i++) { + max[i] = 'a'; + } + max[NAME_MAX + 2] = '\0'; + + EXPECT_THAT(MqOpen(std::string(max), O_RDWR | O_CREAT | O_EXCL, 0777, &attr), + PosixErrorIs(ENAMETOOLONG)); +} + +// Test creating a queue that already exists. +TEST(MqTest, CreateAlreadyExists) { + GTEST_SKIP(); + + PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( + MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr)); + + EXPECT_THAT(MqOpen(queue.name(), O_RDWR | O_CREAT | O_EXCL, 0777, nullptr), + PosixErrorIs(EEXIST)); +} + +// Test opening a queue that doesn't exists. +TEST(MqTest, NoQueueExists) { + GTEST_SKIP(); + + // Choose a name to pass that's unlikely to exist if the test is run locally. + EXPECT_THAT(MqOpen("/gvisor-mq-test-nonexistent-queue", O_RDWR), + PosixErrorIs(ENOENT)); +} + +// Test trying to re-open a queue with invalid permissions. +TEST(MqTest, OpenNoAccess) { + SKIP_IF(IsRunningWithVFS1()); + + PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( + MqOpen(O_RDWR | O_CREAT | O_EXCL, 0000, nullptr)); + + EXPECT_THAT(MqOpen(queue.name(), O_RDONLY), PosixErrorIs(EACCES)); + EXPECT_THAT(MqOpen(queue.name(), O_WRONLY), PosixErrorIs(EACCES)); + EXPECT_THAT(MqOpen(queue.name(), O_RDWR), PosixErrorIs(EACCES)); +} + +// Test trying to re-open a read-only queue for write. +TEST(MqTest, OpenReadAccess) { + SKIP_IF(IsRunningWithVFS1()); + + PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( + MqOpen(O_RDWR | O_CREAT | O_EXCL, 0400, nullptr)); + + EXPECT_THAT(MqOpen(queue.name(), O_WRONLY), PosixErrorIs(EACCES)); + EXPECT_NO_ERRNO(MqOpen(queue.name(), O_RDONLY)); + queue.release(); +} + +// Test trying to re-open a write-only queue for read. +TEST(MqTest, OpenWriteAccess) { + SKIP_IF(IsRunningWithVFS1()); + + PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( + MqOpen(O_RDWR | O_CREAT | O_EXCL, 0200, nullptr)); + + EXPECT_THAT(MqOpen(queue.name(), O_RDONLY), PosixErrorIs(EACCES)); + EXPECT_NO_ERRNO(MqOpen(queue.name(), O_WRONLY)); + queue.release(); +} + +} // namespace +} // namespace testing +} // namespace gvisor diff --git a/test/util/temp_path.cc b/test/util/temp_path.cc index e1bdee7fd..11399f372 100644 --- a/test/util/temp_path.cc +++ b/test/util/temp_path.cc @@ -34,20 +34,6 @@ namespace { std::atomic global_temp_file_number = ATOMIC_VAR_INIT(1); -// Return a new temp filename, intended to be unique system-wide. -// -// The global file number helps maintain file naming consistency across -// different runs of a test. -// -// The timestamp is necessary because the test infrastructure invokes each -// test case in a separate process (resetting global_temp_file_number) and -// potentially in parallel, which allows for races between selecting and using a -// name. -std::string NextTempBasename() { - return absl::StrCat("gvisor_test_temp_", global_temp_file_number++, "_", - absl::ToUnixNanos(absl::Now())); -} - void TryDeleteRecursively(std::string const& path) { if (!path.empty()) { int undeleted_dirs = 0; @@ -85,6 +71,18 @@ std::string GetAbsoluteTestTmpdir() { return MakeAbsolute(tmp_dir, "").ValueOrDie(); } +// The global file number helps maintain file naming consistency across +// different runs of a test. +// +// The timestamp is necessary because the test infrastructure invokes each +// test case in a separate process (resetting global_temp_file_number) and +// potentially in parallel, which allows for races between selecting and using a +// name. +std::string NextTempBasename() { + return absl::StrCat("gvisor_test_temp_", global_temp_file_number++, "_", + absl::ToUnixNanos(absl::Now())); +} + PosixErrorOr TempPath::CreateFileWith(absl::string_view const parent, absl::string_view const content, mode_t const mode) { diff --git a/test/util/temp_path.h b/test/util/temp_path.h index 9e5ac11f4..6c8900b6b 100644 --- a/test/util/temp_path.h +++ b/test/util/temp_path.h @@ -27,6 +27,9 @@ namespace gvisor { namespace testing { +// Return a new temp filename, intended to be unique system-wide. +std::string NextTempBasename(); + // Returns an absolute path for a file in `dir` that does not yet exist. // Distinct calls to NewTempAbsPathInDir from the same process, even from // multiple threads, are guaranteed to return different paths. Distinct calls to -- cgit v1.2.3 From 2774b3ea3bcae10afa8c87e1fce46d60e7580fe1 Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Thu, 5 Aug 2021 22:12:16 +0200 Subject: Test read(2), poll(2), mount(2), and unshare(2) for message queues. Updates #136 --- test/syscalls/linux/BUILD | 3 + test/syscalls/linux/mq.cc | 152 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index ed2adfbc9..8d501a3a3 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -4179,6 +4179,9 @@ cc_binary( "-lrt", ], deps = [ + "//test/util:capability_util", + "//test/util:fs_util", + "//test/util:mount_util", "//test/util:posix_error", "//test/util:test_main", "//test/util:temp_path", diff --git a/test/syscalls/linux/mq.cc b/test/syscalls/linux/mq.cc index 610d41fb6..9a4a87dfe 100644 --- a/test/syscalls/linux/mq.cc +++ b/test/syscalls/linux/mq.cc @@ -14,11 +14,16 @@ #include #include +#include +#include #include #include #include +#include "test/util/capability_util.h" +#include "test/util/fs_util.h" +#include "test/util/mount_util.h" #include "test/util/posix_error.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" @@ -229,6 +234,153 @@ TEST(MqTest, OpenWriteAccess) { queue.release(); } +// Test changing IPC namespace. +TEST(MqTest, ChangeIpcNamespace) { + GTEST_SKIP(); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + // When changing IPC namespaces, Linux doesn't invalidate or close the + // previously opened file descriptions and allows operations to be performed + // on them normally, until they're closed. + // + // To test this we create a new queue, use unshare(CLONE_NEWIPC) to change + // into a new IPC namespace, and trying performing a read(2) on the queue. + PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( + MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr)); + + // As mq_unlink(2) uses queue's name, it should fail after changing IPC + // namespace. To clean the queue, we should unlink it now, this should not + // cause a problem, as the queue presists until the last mq_close(2). + ASSERT_NO_ERRNO(MqUnlink(queue.name())); + + ASSERT_THAT(unshare(CLONE_NEWIPC), SyscallSucceeds()); + + const size_t msgSize = 60; + char queueRead[msgSize]; + ASSERT_THAT(read(queue.fd(), &queueRead[0], msgSize - 1), SyscallSucceeds()); + + ASSERT_NO_ERRNO(MqClose(queue.release())); + + // Unlinking should fail now after changing IPC namespace. + EXPECT_THAT(MqUnlink(queue.name()), PosixErrorIs(ENOENT)); +} + +// Test mounting the mqueue filesystem. +TEST(MqTest, Mount) { + GTEST_SKIP(); + SKIP_IF(IsRunningWithVFS1() || + !ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( + MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr)); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + ASSERT_NO_ERRNO(Mount("none", dir.path(), "mqueue", 0, "", 0)); +} + +// Test mounting the mqueue filesystem to several places. +TEST(MqTest, MountSeveral) { + GTEST_SKIP(); + SKIP_IF(IsRunningWithVFS1() || + !ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( + MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr)); + + auto const dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + // Assign the pointer so it doesn't get destroyed before the second mount is + // created. + auto mnt = + ASSERT_NO_ERRNO_AND_VALUE(Mount("none", dir1.path(), "mqueue", 0, "", 0)); + + auto const dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + ASSERT_NO_ERRNO(Mount("none", dir2.path(), "mqueue", 0, "", 0)); +} + +// Test mounting mqueue and opening a queue as normal file. +TEST(MqTest, OpenAsFile) { + GTEST_SKIP(); + SKIP_IF(IsRunningWithVFS1() || + !ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( + MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr)); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto mnt = + ASSERT_NO_ERRNO_AND_VALUE(Mount("none", dir.path(), "mqueue", 0, "", 0)); + + // Open queue using open(2). + auto fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(JoinPath(dir.path(), queue.name()), O_RDONLY)); + + const size_t msgSize = 60; + char queueRead[msgSize]; + queueRead[msgSize - 1] = '\0'; + + ASSERT_THAT(read(fd.get(), &queueRead[0], msgSize - 1), SyscallSucceeds()); + + std::string want( + "QSIZE:0 NOTIFY:0 SIGNO:0 NOTIFY_PID:0 "); + std::string got(queueRead); + EXPECT_EQ(got, want); +} + +// Test removing a queue using unlink(2). +TEST(MqTest, UnlinkAsFile) { + GTEST_SKIP(); + SKIP_IF(IsRunningWithVFS1() || + !ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( + MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr)); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto mnt = + ASSERT_NO_ERRNO_AND_VALUE(Mount("none", dir.path(), "mqueue", 0, "", 0)); + + ASSERT_NO_ERRNO( + UnlinkAt(FileDescriptor(), JoinPath(dir.path(), queue.name()), 0)); + + // Trying to unlink again should fail. + EXPECT_THAT(MqUnlink(queue.name()), PosixErrorIs(ENOENT)); + queue.release(); +} + +// Test read(2) from an empty queue. +TEST(MqTest, ReadEmpty) { + GTEST_SKIP(); + + PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( + MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr)); + + const size_t msgSize = 60; + char queueRead[msgSize]; + queueRead[msgSize - 1] = '\0'; + + ASSERT_THAT(read(queue.fd(), &queueRead[0], msgSize - 1), SyscallSucceeds()); + + std::string want( + "QSIZE:0 NOTIFY:0 SIGNO:0 NOTIFY_PID:0 "); + std::string got(queueRead); + EXPECT_EQ(got, want); +} + +// Test poll(2) on an empty queue. +TEST(MqTest, PollEmpty) { + GTEST_SKIP(); + + PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( + MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr)); + + struct pollfd pfd; + pfd.fd = queue.fd(); + pfd.events = POLLOUT | POLLIN | POLLRDNORM | POLLWRNORM; + + ASSERT_THAT(poll(&pfd, 1, -1), SyscallSucceeds()); + ASSERT_EQ(pfd.revents, POLLOUT | POLLWRNORM); +} + } // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 61cbf77873ad5095f1a5f4726f486c883cf7e7dc Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Mon, 19 Jul 2021 18:22:31 +0200 Subject: Implement mqfs.rootInode. rootInode represents the root inode for mqueue filesystem (/dev/mqueue). Updates #136 --- pkg/abi/linux/fs.go | 1 + pkg/sentry/fsimpl/mqfs/BUILD | 33 +++++++++++++++ pkg/sentry/fsimpl/mqfs/inodes.go | 91 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+) create mode 100644 pkg/sentry/fsimpl/mqfs/BUILD create mode 100644 pkg/sentry/fsimpl/mqfs/inodes.go diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index cad24fcc7..edc90e54c 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -23,6 +23,7 @@ const ( DEVPTS_SUPER_MAGIC = 0x00001cd1 EXT_SUPER_MAGIC = 0xef53 FUSE_SUPER_MAGIC = 0x65735546 + MQUEUE_MAGIC = 0x19800202 OVERLAYFS_SUPER_MAGIC = 0x794c7630 PIPEFS_MAGIC = 0x50495045 PROC_SUPER_MAGIC = 0x9fa0 diff --git a/pkg/sentry/fsimpl/mqfs/BUILD b/pkg/sentry/fsimpl/mqfs/BUILD new file mode 100644 index 000000000..8a5925396 --- /dev/null +++ b/pkg/sentry/fsimpl/mqfs/BUILD @@ -0,0 +1,33 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "root_inode_refs", + out = "root_inode_refs.go", + package = "mqfs", + prefix = "rootInode", + template = "//pkg/refsvfs2:refs_template", + types = { + "T": "rootInode", + }, +) + +go_library( + name = "mqfs", + srcs = [ + "inodes.go", + "root_inode_refs.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/errors/linuxerr", + "//pkg/refsvfs2", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + ], +) diff --git a/pkg/sentry/fsimpl/mqfs/inodes.go b/pkg/sentry/fsimpl/mqfs/inodes.go new file mode 100644 index 000000000..702db59ee --- /dev/null +++ b/pkg/sentry/fsimpl/mqfs/inodes.go @@ -0,0 +1,91 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mqfs + +import ( + "bytes" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// rootInode represents inode for filesystem's root directory (/dev/mqueue). +// +// +stateify savable +type rootInode struct { + rootInodeRefs + kernfs.InodeAlwaysValid + kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.InodeTemporary + kernfs.OrderedChildren + implStatFS + + locks vfs.FileLocks +} + +var _ kernfs.Inode = (*rootInode)(nil) + +// newRootInode returns a new, initialized rootInode. +func (fs *filesystem) newRootInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode { + inode := &rootInode{} + inode.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555)) + inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) + inode.InitRefs() + return inode +} + +// Open implements kernfs.Inode.Open. +func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +// DecRef implements kernfs.Inode.DecRef. +func (i *rootInode) DecRef(ctx context.Context) { + i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) }) +} + +// Rename implements Inode.Rename and overrides OrderedChildren.Rename. mqueue +// filesystem allows files to be unlinked, but not renamed. +func (i *rootInode) Rename(ctx context.Context, oldname, newname string, child, dstDir kernfs.Inode) error { + return linuxerr.EPERM +} + +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. +func (*rootInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return linuxerr.EPERM +} + +// implStatFS provides an implementation of kernfs.Inode.StatFS for message +// queues to be embedded in inodes. +// +// +stateify savable +type implStatFS struct{} + +// StatFS implements kernfs.Inode.StatFS. +func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.MQUEUE_MAGIC), nil +} -- cgit v1.2.3 From fc5469af61148365c8c2baeb54f6664095ef6fd0 Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Mon, 19 Jul 2021 18:24:58 +0200 Subject: Implement vfs.FilesystemType and kernfs.Filesystem for mqfs. Updates #136 --- pkg/sentry/fsimpl/mqfs/BUILD | 1 + pkg/sentry/fsimpl/mqfs/mqfs.go | 107 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 pkg/sentry/fsimpl/mqfs/mqfs.go diff --git a/pkg/sentry/fsimpl/mqfs/BUILD b/pkg/sentry/fsimpl/mqfs/BUILD index 8a5925396..afe1f3cd5 100644 --- a/pkg/sentry/fsimpl/mqfs/BUILD +++ b/pkg/sentry/fsimpl/mqfs/BUILD @@ -17,6 +17,7 @@ go_template_instance( go_library( name = "mqfs", srcs = [ + "mqfs.go", "inodes.go", "root_inode_refs.go", ], diff --git a/pkg/sentry/fsimpl/mqfs/mqfs.go b/pkg/sentry/fsimpl/mqfs/mqfs.go new file mode 100644 index 000000000..18bc66134 --- /dev/null +++ b/pkg/sentry/fsimpl/mqfs/mqfs.go @@ -0,0 +1,107 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package mqfs provides a filesystem implementation to back POSIX message +// queues. +package mqfs + +import ( + "fmt" + "strconv" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +const ( + fsName = "mqueue" + defaultMaxCachedDentries = uint64(1000) +) + +// FilesystemType implements vfs.FilesystemType. +// +// +stateify savable +type FilesystemType struct{} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return fsName +} + +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + + maxCachedDentries, err := maxCachedDentries(ctx, vfs.GenericParseMountOptions(opts.Data)) + if err != nil { + return nil, nil, err + } + + fs := &filesystem{ + devMinor: devMinor, + Filesystem: kernfs.Filesystem{ + MaxCachedDentries: maxCachedDentries, + }, + } + fs.VFSFilesystem().Init(vfsObj, &ft, fs) + + var dentry kernfs.Dentry + dentry.InitRoot(&fs.Filesystem, fs.newRootInode(ctx, creds)) + + return fs.VFSFilesystem(), dentry.VFSDentry(), nil +} + +// maxCachedDentries checks mopts for dentry_cache_limit. If a value is +// specified, parse it into uint64 and return it. Otherwise, return the default +// value. An error is returned if a value is found but can't be parsed. +func maxCachedDentries(ctx context.Context, mopts map[string]string) (_ uint64, err error) { + max := defaultMaxCachedDentries + if str, ok := mopts["dentry_cache_limit"]; ok { + delete(mopts, "dentry_cache_limit") + max, err = strconv.ParseUint(str, 10, 64) + if err != nil { + ctx.Warningf("mqfs.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) + return 0, linuxerr.EINVAL + } + } + return max, nil +} + +// filesystem implements kernfs.Filesystem. +// +// +stateify savable +type filesystem struct { + kernfs.Filesystem + devMinor uint32 +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release(ctx context.Context) { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release(ctx) +} + +// MountOptions implements vfs.FilesystemImpl.MountOptions. +func (fs *filesystem) MountOptions() string { + return fmt.Sprintf("dentry_cache_limit=%d", fs.MaxCachedDentries) +} -- cgit v1.2.3 From 4fdfb178e7764a4f2ce761929cb59212427e3c24 Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Mon, 19 Jul 2021 18:49:10 +0200 Subject: Add abi definitions for POSIX message queues. Updates #136 --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/mqueue.go | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 pkg/abi/linux/mqueue.go diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 3576396c1..a7b271120 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -41,6 +41,7 @@ go_library( "linux.go", "membarrier.go", "mm.go", + "mqueue.go", "msgqueue.go", "netdevice.go", "netfilter.go", diff --git a/pkg/abi/linux/mqueue.go b/pkg/abi/linux/mqueue.go new file mode 100644 index 000000000..4988a2aa3 --- /dev/null +++ b/pkg/abi/linux/mqueue.go @@ -0,0 +1,55 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// Default values for POSIX message queues. Source: +// include/linux/ipc_namespace.h +const ( + DFLT_QUEUESMAX = 256 + MIN_MSGMAX = 1 + DFLT_MSG uint = 10 + DFLT_MSGMAX = 10 + HARD_MSGMAX = 65536 + MIN_MSGSIZEMAX = 128 + DFLT_MSGSIZE uint = 8192 + DFLT_MSGSIZEMAX = 8192 + HARD_MSGSIZEMAX = (16 * 1024 * 1024) +) + +// Maximum values for a message queue. Source: include/uapi/linux/mqueue.h +const ( + MQ_PRIO_MAX = 32768 + MQ_BYTES_MAX = 819200 +) + +// Codes used by mq_notify. Source: include/uapi/linux/mqueue.h +const ( + NOTIFY_NONE = 0 + NOTIFY_WOKENUP = 1 + NOTIFY_REMOVED = 2 + + NOTIFY_COOKIE_LEN = 32 +) + +// MqAttr is equivelant to struct mq_attr. Source: include/uapi/linux/mqueue.h +// +// +marshal +type MqAttr struct { + MqFlags int64 // Message queue flags. + MqMaxmsg int64 // Maximum number of messages. + MqMsgsize int64 // Maximum message size. + MqCurmsgs int64 // Number of messages currently queued. + _ [4]int64 // Ignored for input, zeroed for output. +} -- cgit v1.2.3 From 1c8a014e7e129d6a49c1280e28434354881ace94 Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Tue, 20 Jul 2021 18:51:39 +0200 Subject: Create mq package. Create package mq to implement POSIX message queues, and define initial struct definitions. Updates #136 --- pkg/sentry/kernel/mq/BUILD | 31 +++++++++++++++ pkg/sentry/kernel/mq/mq.go | 96 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 pkg/sentry/kernel/mq/BUILD create mode 100644 pkg/sentry/kernel/mq/mq.go diff --git a/pkg/sentry/kernel/mq/BUILD b/pkg/sentry/kernel/mq/BUILD new file mode 100644 index 000000000..ec9cd18a9 --- /dev/null +++ b/pkg/sentry/kernel/mq/BUILD @@ -0,0 +1,31 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "message_list", + out = "message_list.go", + package = "mq", + prefix = "msg", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Message", + "Linker": "*Message", + }, +) + +go_library( + name = "mq", + srcs = [ + "message_list.go", + "mq.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/fs", + "//pkg/sync", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/kernel/mq/mq.go b/pkg/sentry/kernel/mq/mq.go new file mode 100644 index 000000000..df9bdc267 --- /dev/null +++ b/pkg/sentry/kernel/mq/mq.go @@ -0,0 +1,96 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package mq provides an implementation for POSIX message queues. +package mq + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + maxPriority = linux.MQ_PRIO_MAX - 1 // Highest possible message priority. +) + +// Queue represents a POSIX message queue. +// +// +stateify savable +type Queue struct { + // owner is the registry's owner. Immutable. + owner fs.FileOwner + + // perms is the registry's access permissions. Immutable. + perms fs.FilePermissions + + // mu protects all the fields below. + mu sync.Mutex `state:"nosave"` + + // senders is a queue of currently blocked senders. Senders are notified + // when space isi available in the queue for a new message. + senders waiter.Queue + + // receivers is a queue of currently blocked receivers. Receivers are + // notified when a new message is inserted in the queue. + receivers waiter.Queue + + // messages is a list of messages currently in the queue. + messages msgList + + // subscriber represents a task registered to receive async notification + // from this queue. + subscriber Subscriber + + // nonBlock is true if this queue is non-blocking. + nonBlock bool + + // messageCount is the number of messages currently in the queue. + messageCount int64 + + // maxMessageCount is the maximum number of messages that the queue can + // hold. + maxMessageCount int64 + + // maxMessageSize is the maximum size of a message held by the queue. + maxMessageSize uint64 + + // byteCount is the number of bytes of data in all messages in the queue. + byteCount uint64 +} + +// Message holds a message exchanged through a Queue via mq_timedsend(2) and +// mq_timedreceive(2), and additional info relating to the message. +// +// +stateify savable +type Message struct { + msgEntry + + // Text is the message's sent content. + Text string + + // Size is the message's size in bytes. + Size uint64 + + // Priority is the message's priority. + Priority uint32 +} + +// Subscriber represents a task registered for async notification from a Queue. +// +// +stateify savable +type Subscriber struct { + // TODO: Add fields when mq_notify(2) is implemented. +} -- cgit v1.2.3 From 0061d0e4e5d74efce8af8d706437cba3d040cd5f Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Tue, 20 Jul 2021 20:31:46 +0200 Subject: Implement queueInode and queueFD in mqfs. Implement inode and file description representing a POSIX message queue, and other utilities needed to implement file operations. Updates #136 --- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 5 + pkg/sentry/fsimpl/mqfs/BUILD | 6 +- pkg/sentry/fsimpl/mqfs/inodes.go | 91 ---------------- pkg/sentry/fsimpl/mqfs/queue.go | 145 +++++++++++++++++++++++++ pkg/sentry/fsimpl/mqfs/root.go | 89 +++++++++++++++ pkg/sentry/kernel/mq/BUILD | 1 + pkg/sentry/kernel/mq/mq.go | 84 +++++++++++++- 7 files changed, 328 insertions(+), 93 deletions(-) delete mode 100644 pkg/sentry/fsimpl/mqfs/inodes.go create mode 100644 pkg/sentry/fsimpl/mqfs/queue.go create mode 100644 pkg/sentry/fsimpl/mqfs/root.go diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 9d7526e47..652ade564 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -74,6 +74,11 @@ func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credent return linuxerr.EPERM } +// Locks returns the file locks for this file. +func (f *DynamicBytesFile) Locks() *vfs.FileLocks { + return &f.locks +} + // DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a // DynamicBytesFile. // diff --git a/pkg/sentry/fsimpl/mqfs/BUILD b/pkg/sentry/fsimpl/mqfs/BUILD index afe1f3cd5..6b22ffabd 100644 --- a/pkg/sentry/fsimpl/mqfs/BUILD +++ b/pkg/sentry/fsimpl/mqfs/BUILD @@ -18,7 +18,8 @@ go_library( name = "mqfs", srcs = [ "mqfs.go", - "inodes.go", + "root.go", + "queue.go", "root_inode_refs.go", ], visibility = ["//pkg/sentry:internal"], @@ -29,6 +30,9 @@ go_library( "//pkg/refsvfs2", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/mq", "//pkg/sentry/vfs", + "//pkg/usermem", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fsimpl/mqfs/inodes.go b/pkg/sentry/fsimpl/mqfs/inodes.go deleted file mode 100644 index 702db59ee..000000000 --- a/pkg/sentry/fsimpl/mqfs/inodes.go +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2021 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package mqfs - -import ( - "bytes" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/errors/linuxerr" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" -) - -// rootInode represents inode for filesystem's root directory (/dev/mqueue). -// -// +stateify savable -type rootInode struct { - rootInodeRefs - kernfs.InodeAlwaysValid - kernfs.InodeAttrs - kernfs.InodeDirectoryNoNewChildren - kernfs.InodeNotSymlink - kernfs.InodeTemporary - kernfs.OrderedChildren - implStatFS - - locks vfs.FileLocks -} - -var _ kernfs.Inode = (*rootInode)(nil) - -// newRootInode returns a new, initialized rootInode. -func (fs *filesystem) newRootInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode { - inode := &rootInode{} - inode.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555)) - inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) - inode.InitRefs() - return inode -} - -// Open implements kernfs.Inode.Open. -func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ - SeekEnd: kernfs.SeekEndZero, - }) - if err != nil { - return nil, err - } - return fd.VFSFileDescription(), nil -} - -// DecRef implements kernfs.Inode.DecRef. -func (i *rootInode) DecRef(ctx context.Context) { - i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) }) -} - -// Rename implements Inode.Rename and overrides OrderedChildren.Rename. mqueue -// filesystem allows files to be unlinked, but not renamed. -func (i *rootInode) Rename(ctx context.Context, oldname, newname string, child, dstDir kernfs.Inode) error { - return linuxerr.EPERM -} - -// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. -func (*rootInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { - return linuxerr.EPERM -} - -// implStatFS provides an implementation of kernfs.Inode.StatFS for message -// queues to be embedded in inodes. -// -// +stateify savable -type implStatFS struct{} - -// StatFS implements kernfs.Inode.StatFS. -func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { - return vfs.GenericStatFS(linux.MQUEUE_MAGIC), nil -} diff --git a/pkg/sentry/fsimpl/mqfs/queue.go b/pkg/sentry/fsimpl/mqfs/queue.go new file mode 100644 index 000000000..a8e9bc722 --- /dev/null +++ b/pkg/sentry/fsimpl/mqfs/queue.go @@ -0,0 +1,145 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mqfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/mq" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// queueInode represents an inode for a message queue (/dev/mqueue/[name]). +// +// +stateify savable +type queueInode struct { + kernfs.DynamicBytesFile + + // queue is the message queue backing this inode. + queue *mq.Queue +} + +var _ kernfs.Inode = (*queueInode)(nil) + +// newQueueInode returns a new, initialized queueInode. +func (fs *filesystem) newQueueInode(ctx context.Context, creds *auth.Credentials, q *mq.Queue, perm linux.FileMode) kernfs.Inode { + inode := &queueInode{queue: q} + inode.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), q, perm) + return inode +} + +// Keep implements kernfs.Inode.Keep. +func (q *queueInode) Keep() bool { + // Return true so that the fs keeps newly created dentries. This is done + // because inodes returned by root.Lookup are not temporary, they exist + // in the fs, and refer to message queues. + return true +} + +// queueFD implements vfs.FileDescriptionImpl for FD backed by a POSIX message +// queue. It's mostly similar to DynamicBytesFD, but implements more operations. +// +// +stateify savable +type queueFD struct { + vfs.FileDescriptionDefaultImpl + vfs.DynamicBytesFileDescriptionImpl + vfs.LockFD + + vfsfd vfs.FileDescription + inode kernfs.Inode + + // queue is the queue backing this fd. + queue *mq.Queue +} + +// Init initializes a queueFD. Mostly copied from DynamicBytesFD.Init, but uses +// the queueFD as FileDescriptionImpl. +func (fd *queueFD) Init(m *vfs.Mount, d *kernfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error { + fd.LockFD.Init(locks) + if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { + return err + } + fd.inode = d.Inode() + fd.SetDataSource(data) + return nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *queueFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *queueFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *queueFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *queueFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Write(ctx, src, opts) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *queueFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.PWrite(ctx, src, offset, opts) +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *queueFD) Release(context.Context) {} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *queueFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() + return fd.inode.Stat(ctx, fs, opts) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *queueFD) SetStat(context.Context, vfs.SetStatOptions) error { + // DynamicBytesFiles are immutable. + return linuxerr.EPERM +} + +// OnClose implements FileDescriptionImpl.OnClose similar to +// ipc/mqueue.c::mqueue_flush_file. +func (fd *queueFD) OnClose(ctx context.Context) error { + fd.queue.Flush(ctx) + return nil +} + +// Readiness implements waiter.Waitable.Readiness similar to +// ipc/mqueue.c::mqueue_poll_file. +func (fd *queueFD) Readiness(mask waiter.EventMask) waiter.EventMask { + return fd.queue.Readiness(mask) +} + +// EventRegister implements Waitable.EventRegister. +func (fd *queueFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.queue.EventRegister(e, mask) +} + +// EventUnregister implements Waitable.EventUnregister. +func (fd *queueFD) EventUnregister(e *waiter.Entry) { + fd.queue.EventUnregister(e) +} diff --git a/pkg/sentry/fsimpl/mqfs/root.go b/pkg/sentry/fsimpl/mqfs/root.go new file mode 100644 index 000000000..37b5749fb --- /dev/null +++ b/pkg/sentry/fsimpl/mqfs/root.go @@ -0,0 +1,89 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mqfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// rootInode represents inode for filesystem's root directory (/dev/mqueue). +// +// +stateify savable +type rootInode struct { + rootInodeRefs + kernfs.InodeAlwaysValid + kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.InodeTemporary + kernfs.OrderedChildren + implStatFS + + locks vfs.FileLocks +} + +var _ kernfs.Inode = (*rootInode)(nil) + +// newRootInode returns a new, initialized rootInode. +func (fs *filesystem) newRootInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode { + inode := &rootInode{} + inode.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555)) + inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) + inode.InitRefs() + return inode +} + +// Open implements kernfs.Inode.Open. +func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ + SeekEnd: kernfs.SeekEndZero, + }) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +// DecRef implements kernfs.Inode.DecRef. +func (i *rootInode) DecRef(ctx context.Context) { + i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) }) +} + +// Rename implements Inode.Rename and overrides OrderedChildren.Rename. mqueue +// filesystem allows files to be unlinked, but not renamed. +func (i *rootInode) Rename(ctx context.Context, oldname, newname string, child, dstDir kernfs.Inode) error { + return linuxerr.EPERM +} + +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. +func (*rootInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { + return linuxerr.EPERM +} + +// implStatFS provides an implementation of kernfs.Inode.StatFS for message +// queues to be embedded in inodes. +// +// +stateify savable +type implStatFS struct{} + +// StatFS implements kernfs.Inode.StatFS. +func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { + return vfs.GenericStatFS(linux.MQUEUE_MAGIC), nil +} diff --git a/pkg/sentry/kernel/mq/BUILD b/pkg/sentry/kernel/mq/BUILD index ec9cd18a9..b4e17b582 100644 --- a/pkg/sentry/kernel/mq/BUILD +++ b/pkg/sentry/kernel/mq/BUILD @@ -24,6 +24,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/sentry/fs", "//pkg/sync", "//pkg/waiter", diff --git a/pkg/sentry/kernel/mq/mq.go b/pkg/sentry/kernel/mq/mq.go index df9bdc267..29a46e8a9 100644 --- a/pkg/sentry/kernel/mq/mq.go +++ b/pkg/sentry/kernel/mq/mq.go @@ -16,7 +16,11 @@ package mq import ( + "bytes" + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" @@ -52,7 +56,7 @@ type Queue struct { // subscriber represents a task registered to receive async notification // from this queue. - subscriber Subscriber + subscriber *Subscriber // nonBlock is true if this queue is non-blocking. nonBlock bool @@ -93,4 +97,82 @@ type Message struct { // +stateify savable type Subscriber struct { // TODO: Add fields when mq_notify(2) is implemented. + + // pid is the PID of the registered task. + pid int32 +} + +// Generate implements vfs.DynamicBytesSource.Generate. Queue is used as a +// dynamic bytes source for mqfs's queueInode. +func (q *Queue) Generate(ctx context.Context, buf *bytes.Buffer) error { + q.mu.Lock() + defer q.mu.Unlock() + + var ( + pid int32 + method int + sigNumber int + ) + if q.subscriber != nil { + pid = q.subscriber.pid + // TODO: add method and sigNumber when mq_notify(2) is implemented. + } + + buf.WriteString( + fmt.Sprintf("QSIZE:%-10d NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n", + q.byteCount, method, sigNumber, pid), + ) + return nil +} + +// Flush checks if the calling process has attached a notification request to +// this queue, if yes, then the request is removed, and another process can +// attach a request. +func (q *Queue) Flush(ctx context.Context) { + q.mu.Lock() + defer q.mu.Unlock() + + pid, ok := context.ThreadGroupIDFromContext(ctx) + if ok { + if q.subscriber != nil && pid == q.subscriber.pid { + q.subscriber = nil + } + } +} + +// Readiness implements Waitable.Readiness. +func (q *Queue) Readiness(mask waiter.EventMask) waiter.EventMask { + q.mu.Lock() + defer q.mu.Unlock() + + events := waiter.EventMask(0) + if q.messageCount > 0 { + events |= waiter.ReadableEvents + } + if q.messageCount < q.maxMessageCount { + events |= waiter.WritableEvents + } + return events & mask +} + +// EventRegister implements Waitable.EventRegister. +func (q *Queue) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + q.mu.Lock() + defer q.mu.Unlock() + + if mask&waiter.WritableEvents != 0 { + q.senders.EventRegister(e, waiter.EventOut) + } + if mask&waiter.ReadableEvents != 0 { + q.receivers.EventRegister(e, waiter.EventIn) + } +} + +// EventUnregister implements Waitable.EventUnregister. +func (q *Queue) EventUnregister(e *waiter.Entry) { + q.mu.Lock() + defer q.mu.Unlock() + + q.senders.EventUnregister(e) + q.receivers.EventUnregister(e) } -- cgit v1.2.3 From e452ecd49526f4a0bbacc462840fbc6e88781e36 Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Sat, 24 Jul 2021 19:15:54 +0200 Subject: Create mq.Registry and mqfs.RegistryImpl. Define a POSIX message queue Registry and RegistryImpl in mq package, implement RegistryImpl in mqfs, and add a Registry object to IPCNamespace initialized at filesystem creation. Updates #136 --- pkg/sentry/fsimpl/mqfs/BUILD | 2 + pkg/sentry/fsimpl/mqfs/mqfs.go | 4 +- pkg/sentry/fsimpl/mqfs/registry.go | 121 +++++++++++++++++++++++++++++++++++++ pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/ipc_namespace.go | 25 ++++++++ pkg/sentry/kernel/mq/BUILD | 1 + pkg/sentry/kernel/mq/mq.go | 51 +++++++++++++++- 7 files changed, 202 insertions(+), 3 deletions(-) create mode 100644 pkg/sentry/fsimpl/mqfs/registry.go diff --git a/pkg/sentry/fsimpl/mqfs/BUILD b/pkg/sentry/fsimpl/mqfs/BUILD index 6b22ffabd..6892c6c25 100644 --- a/pkg/sentry/fsimpl/mqfs/BUILD +++ b/pkg/sentry/fsimpl/mqfs/BUILD @@ -20,6 +20,7 @@ go_library( "mqfs.go", "root.go", "queue.go", + "registry.go", "root_inode_refs.go", ], visibility = ["//pkg/sentry:internal"], @@ -32,6 +33,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/mq", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/mqfs/mqfs.go b/pkg/sentry/fsimpl/mqfs/mqfs.go index 18bc66134..a92012deb 100644 --- a/pkg/sentry/fsimpl/mqfs/mqfs.go +++ b/pkg/sentry/fsimpl/mqfs/mqfs.go @@ -28,7 +28,7 @@ import ( ) const ( - fsName = "mqueue" + Name = "mqueue" defaultMaxCachedDentries = uint64(1000) ) @@ -39,7 +39,7 @@ type FilesystemType struct{} // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { - return fsName + return Name } // Release implements vfs.FilesystemType.Release. diff --git a/pkg/sentry/fsimpl/mqfs/registry.go b/pkg/sentry/fsimpl/mqfs/registry.go new file mode 100644 index 000000000..3875b39ee --- /dev/null +++ b/pkg/sentry/fsimpl/mqfs/registry.go @@ -0,0 +1,121 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mqfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/mq" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" +) + +// RegistryImpl implements mq.RegistryImpl. It implements the interface using +// the message queue filesystem, and is provided to mq.Registry at +// initialization. +// +// +stateify savable +type RegistryImpl struct { + // mu protects all fields below. + mu sync.Mutex + + // root is the root dentry of the mq filesystem. Its main usage is to + // retreive the root inode, which we use to add, remove, and lookup message + // queues. + // + // We hold a reference on root and release when the registry is destroyed. + root *kernfs.Dentry + + // fs is the filesystem backing this registry, used mainly to initialize + // new inodes. + fs *filesystem + + // mount is the mount point used for this filesystem. + mount *vfs.Mount +} + +// NewRegistryImpl returns a new, initialized RegistryImpl, and takes a +// reference on root. +func NewRegistryImpl(root *kernfs.Dentry, fs *filesystem) *RegistryImpl { + root.IncRef() + return &RegistryImpl{ + root: root, + fs: fs, + } +} + +// Lookup implements mq.RegistryImpl.Lookup. +func (r *RegistryImpl) Lookup(ctx context.Context, name string) *mq.Queue { + r.mu.Lock() + defer r.mu.Unlock() + + inode, err := r.lookup(ctx, name) + if err != nil { + return nil + } + return inode.(*queueInode).queue +} + +// New implements mq.RegistryImpl.New. +func (r *RegistryImpl) New(ctx context.Context, name string, q *mq.Queue, perm linux.FileMode) (*vfs.FileDescription, error) { + r.mu.Lock() + defer r.mu.Unlock() + + root := r.root.Inode().(*rootInode) + qInode := r.fs.newQueueInode(ctx, auth.CredentialsFromContext(ctx), q, perm).(*queueInode) + err := root.Insert(name, qInode) + if err != nil { + return nil, err + } + + fd := &queueFD{queue: q} + err = fd.Init(r.mount, r.root, qInode.data, &qInode.locks, 0 /* flags */) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} + +// Unlink implements mq.RegistryImpl.Unlink. +func (r *RegistryImpl) Unlink(ctx context.Context, name string) error { + r.mu.Lock() + defer r.mu.Unlock() + + root := r.root.Inode().(*rootInode) + inode, err := r.lookup(ctx, name) + if err != nil { + return err + } + return root.Unlink(ctx, name, inode) +} + +// lookup retreives a kernfs.Inode using a name. +// +// Precondition: r.mu must be held. +func (r *RegistryImpl) lookup(ctx context.Context, name string) (kernfs.Inode, error) { + inode := r.root.Inode().(*rootInode) + lookup, err := inode.Lookup(ctx, name) + if err != nil { + return nil, err + } + return lookup, nil +} + +// Destroy implements mq.RegistryImpl.Destroy. +func (r *RegistryImpl) Destroy(ctx context.Context) { + r.root.DecRef(ctx) +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index c0f13bf52..e91338da7 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -257,6 +257,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/epoll", "//pkg/sentry/kernel/futex", + "//pkg/sentry/kernel/mq", "//pkg/sentry/kernel/msgqueue", "//pkg/sentry/kernel/sched", "//pkg/sentry/kernel/semaphore", diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 0b101b1bb..aa9c3fb31 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/mq" "gvisor.dev/gvisor/pkg/sentry/kernel/msgqueue" "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" @@ -31,9 +32,17 @@ type IPCNamespace struct { // User namespace which owns this IPC namespace. Immutable. userNS *auth.UserNamespace + // System V utilities. queues *msgqueue.Registry semaphores *semaphore.Registry shms *shm.Registry + + // posixQueues is a POSIX message queue registry. + // + // posixQueues is somewhat equivelant to Linux's ipc_namespace.mq_mnt. + // Unlike SysV utilities, mq.Registry is not map-based, but is backed by + // a virtual filesystem. + posixQueues *mq.Registry } // NewIPCNamespace creates a new IPC namespace. @@ -63,10 +72,26 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry { return i.shms } +// SetPosixQueues sets value of posixQueues if the value is currently nil, +// otherwise returns without doing anything. +func (i *IPCNamespace) SetPosixQueues(r *mq.Registry) { + if i.posixQueues == nil { + i.posixQueues = r + } +} + +// PosixQueues returns the posix message queue registry for this namespace. +func (i *IPCNamespace) PosixQueues() *mq.Registry { + return i.posixQueues +} + // DecRef implements refsvfs2.RefCounter.DecRef. func (i *IPCNamespace) DecRef(ctx context.Context) { i.IPCNamespaceRefs.DecRef(func() { i.shms.Release(ctx) + if i.posixQueues != nil { + i.posixQueues.Destroy(ctx) + } }) } diff --git a/pkg/sentry/kernel/mq/BUILD b/pkg/sentry/kernel/mq/BUILD index b4e17b582..7b00b8346 100644 --- a/pkg/sentry/kernel/mq/BUILD +++ b/pkg/sentry/kernel/mq/BUILD @@ -26,6 +26,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/sentry/fs", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/mq/mq.go b/pkg/sentry/kernel/mq/mq.go index 29a46e8a9..be46f78c8 100644 --- a/pkg/sentry/kernel/mq/mq.go +++ b/pkg/sentry/kernel/mq/mq.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) @@ -30,6 +31,54 @@ const ( maxPriority = linux.MQ_PRIO_MAX - 1 // Highest possible message priority. ) +// Registry is a POSIX message queue registry. +// +// Unlike SysV utilities, Registry is not map-based. It uses a provided +// RegistryImpl backed by a virtual filesystem to implement registry operations. +// +// +stateify savable +type Registry struct { + // impl is an implementation of several message queue utilities needed by + // the registry. impl should be provided by mqfs. + impl RegistryImpl +} + +// RegistryImpl defines utilities needed by a Registry to provide actual +// registry implementation. It works mainly as an abstraction layer used by +// Registry to avoid dealing directly with the filesystem. RegistryImpl should +// be implemented by mqfs and provided to Registry at initialization. +type RegistryImpl interface { + // Lookup returns the queue with the given name, nil if non exists. + Lookup(context.Context, string) *Queue + + // New creates a new inode and file description using the given queue, + // inserts the inode into the filesystem tree with the given name, and + // returns the file description. An error is returned if creation fails, or + // if the name already exists. + New(context.Context, string, *Queue, linux.FileMode) (*vfs.FileDescription, error) + + // Unlink removes the queue with given name from the registry, and returns + // an error if the name doesn't exist. + Unlink(context.Context, string) error + + // Destroy destroys the registry. + Destroy(context.Context) +} + +// NewRegistry returns a new, initialized message queue registry. NewRegistry +// should be called when a new message queue filesystem is created, once per +// IPCNamespace. +func NewRegistry(impl RegistryImpl) *Registry { + return &Registry{ + impl: impl, + } +} + +// Destroy destroys the registry and releases all held references. +func (r *Registry) Destroy(ctx context.Context) { + r.impl.Destroy(ctx) +} + // Queue represents a POSIX message queue. // // +stateify savable @@ -103,7 +152,7 @@ type Subscriber struct { } // Generate implements vfs.DynamicBytesSource.Generate. Queue is used as a -// dynamic bytes source for mqfs's queueInode. +// DynamicBytesSource for mqfs's queueInode. func (q *Queue) Generate(ctx context.Context, buf *bytes.Buffer) error { q.mu.Lock() defer q.mu.Unlock() -- cgit v1.2.3 From bcef079ec24d56d37a670c4c4149c638be6fb110 Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Wed, 11 Aug 2021 21:24:10 +0200 Subject: Move CtxIPCNamespace to kernel/ipc package. CtxIPCNamespace is needed by mqfs package to be able to retreive an IPCNamespace using ctx.Value. As ctx.Value compares keys as interfaces, we need to use type kernel.contextID in package mqfs, which is not possible due to circular depenedency, so move it to kernel/ipc instead. Updates #136 --- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/context.go | 6 ++---- pkg/sentry/kernel/ipc/BUILD | 1 + pkg/sentry/kernel/ipc/ns.go | 22 ++++++++++++++++++++++ pkg/sentry/kernel/kernel.go | 5 +++-- pkg/sentry/kernel/task_context.go | 3 ++- 6 files changed, 31 insertions(+), 7 deletions(-) create mode 100644 pkg/sentry/kernel/ipc/ns.go diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index e91338da7..6ff3deb97 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -216,6 +216,7 @@ go_library( visibility = ["//:sandbox"], deps = [ ":uncaught_signal_go_proto", + "//pkg/sentry/kernel/ipc", "//pkg/abi", "//pkg/abi/linux", "//pkg/abi/linux/errno", diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index a8596410f..7e11c6580 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -16,6 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ) // contextID is the kernel package's type for context.Context.Value keys. @@ -37,9 +38,6 @@ const ( // CtxUTSNamespace is a Context.Value key for a UTSNamespace. CtxUTSNamespace - - // CtxIPCNamespace is a Context.Value key for a IPCNamespace. - CtxIPCNamespace ) // ContextCanTrace returns true if ctx is permitted to trace t, in the same sense @@ -82,7 +80,7 @@ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace { // or nil if there is no such IPC namespace. It takes a reference on the // namespace. func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace { - if v := ctx.Value(CtxIPCNamespace); v != nil { + if v := ctx.Value(ipc.CtxIPCNamespace); v != nil { return v.(*IPCNamespace) } return nil diff --git a/pkg/sentry/kernel/ipc/BUILD b/pkg/sentry/kernel/ipc/BUILD index e42a94e15..a5cbb2b51 100644 --- a/pkg/sentry/kernel/ipc/BUILD +++ b/pkg/sentry/kernel/ipc/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "object.go", "registry.go", + "ns.go", ], visibility = ["//pkg/sentry:internal"], deps = [ diff --git a/pkg/sentry/kernel/ipc/ns.go b/pkg/sentry/kernel/ipc/ns.go new file mode 100644 index 000000000..220c9eafb --- /dev/null +++ b/pkg/sentry/kernel/ipc/ns.go @@ -0,0 +1,22 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ipc + +type contextID int + +// CtxIPCNamespace is the context.Value key used to retreive an IPC namespace. +// We define it here because it's needed in several packages, and is not +// possible to use otherwise without causing a circular depenedency. +const CtxIPCNamespace contextID = iota diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index df5160b67..6ce3625d4 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -59,6 +59,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" @@ -861,7 +862,7 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.args.PIDNamespace case CtxUTSNamespace: return ctx.args.UTSNamespace - case CtxIPCNamespace: + case ipc.CtxIPCNamespace: ipcns := ctx.args.IPCNamespace ipcns.IncRef() return ipcns @@ -1689,7 +1690,7 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return ctx.k.tasks.Root case CtxUTSNamespace: return ctx.k.rootUTSNamespace - case CtxIPCNamespace: + case ipc.CtxIPCNamespace: ipcns := ctx.k.rootIPCNamespace ipcns.IncRef() return ipcns diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index c82d9e82b..cb9bcd7c0 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -73,7 +74,7 @@ func (t *Task) contextValue(key interface{}, isTaskGoroutine bool) interface{} { defer t.mu.Unlock() } return t.utsns - case CtxIPCNamespace: + case ipc.CtxIPCNamespace: if !isTaskGoroutine { t.mu.Lock() defer t.mu.Unlock() -- cgit v1.2.3 From 229c01552e2b819c2fa6bf1f5aa017cff366869e Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Thu, 29 Jul 2021 17:33:45 +0200 Subject: Move filesystem creation from GetFilesystem to RegistryImpl. Move root dentry and filesystem creation from GetFilesystem to NewRegistryImpl, create IPCNamespace.InitPosixQueues to create a new mqueue filesystem for each ipc namespace, and update GetFilesystem to retreive fs and root dentry from IPCNamespace and return them. Updates #136 --- pkg/sentry/fsimpl/mqfs/BUILD | 1 + pkg/sentry/fsimpl/mqfs/mqfs.go | 61 ++++++++++++++++++++++++++++---------- pkg/sentry/fsimpl/mqfs/registry.go | 34 ++++++++++++++++----- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/ipc_namespace.go | 21 +++++++++---- pkg/sentry/kernel/mq/mq.go | 5 ++++ 6 files changed, 96 insertions(+), 27 deletions(-) diff --git a/pkg/sentry/fsimpl/mqfs/BUILD b/pkg/sentry/fsimpl/mqfs/BUILD index 6892c6c25..e688b9b48 100644 --- a/pkg/sentry/fsimpl/mqfs/BUILD +++ b/pkg/sentry/fsimpl/mqfs/BUILD @@ -31,6 +31,7 @@ go_library( "//pkg/refsvfs2", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/ipc", "//pkg/sentry/kernel/mq", "//pkg/sentry/vfs", "//pkg/sync", diff --git a/pkg/sentry/fsimpl/mqfs/mqfs.go b/pkg/sentry/fsimpl/mqfs/mqfs.go index a92012deb..ed559cd13 100644 --- a/pkg/sentry/fsimpl/mqfs/mqfs.go +++ b/pkg/sentry/fsimpl/mqfs/mqfs.go @@ -24,6 +24,8 @@ import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/ipc" + "gvisor.dev/gvisor/pkg/sentry/kernel/mq" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -47,28 +49,32 @@ func (FilesystemType) Release(ctx context.Context) {} // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - devMinor, err := vfsObj.GetAnonBlockDevMinor() - if err != nil { - return nil, nil, err + // mqfs is initialized only once per ipc namespace. Each ipc namespace has + // a POSIX message registry with a root dentry, filesystem, and a + // disconnected mount. We want the fs to be consistent for all processes in + // the same ipc namespace, so instead of creating a new fs and root dentry, + // we retreive them using IPCNamespace.PosixQueues and use them. + + i := ipcNamespaceFromContext(ctx) + if i == nil { + return nil, nil, fmt.Errorf("mqfs.FilesystemType.GetFilesystem: ipc namespace doesn't exist") + } + defer i.DecRef(ctx) + + registry := i.PosixQueues() + if registry == nil { + return nil, nil, fmt.Errorf("mqfs.FilesystemType.GetFilesystem: ipc namespace doesn't have a POSIX registry") } + impl := registry.Impl().(*RegistryImpl) maxCachedDentries, err := maxCachedDentries(ctx, vfs.GenericParseMountOptions(opts.Data)) if err != nil { return nil, nil, err } + impl.fs.MaxCachedDentries = maxCachedDentries - fs := &filesystem{ - devMinor: devMinor, - Filesystem: kernfs.Filesystem{ - MaxCachedDentries: maxCachedDentries, - }, - } - fs.VFSFilesystem().Init(vfsObj, &ft, fs) - - var dentry kernfs.Dentry - dentry.InitRoot(&fs.Filesystem, fs.newRootInode(ctx, creds)) - - return fs.VFSFilesystem(), dentry.VFSDentry(), nil + impl.root.IncRef() + return impl.fs.VFSFilesystem(), impl.root.VFSDentry(), nil } // maxCachedDentries checks mopts for dentry_cache_limit. If a value is @@ -93,15 +99,40 @@ func maxCachedDentries(ctx context.Context, mopts map[string]string) (_ uint64, type filesystem struct { kernfs.Filesystem devMinor uint32 + + // root is the filesystem's root dentry. Since we take a reference on it in + // GetFilesystem, we should release it when the fs is released. + root *kernfs.Dentry } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) + fs.root.DecRef(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { return fmt.Sprintf("dentry_cache_limit=%d", fs.MaxCachedDentries) } + +// ipcNamespace defines functions we need from kernel.IPCNamespace. We redefine +// ipcNamespace along with ipcNamespaceFromContext to avoid circular dependency +// with package sentry/kernel. +type ipcNamespace interface { + // PosixQueues returns a POSIX message queue registry. + PosixQueues() *mq.Registry + + // DecRef decrements ipcNamespace's number of references. + DecRef(ctx context.Context) +} + +// ipcNamespaceFromContext returns the IPC namespace in which ctx is executing. +// Copied from package sentry/kernel. +func ipcNamespaceFromContext(ctx context.Context) ipcNamespace { + if v := ctx.Value(ipc.CtxIPCNamespace); v != nil { + return v.(ipcNamespace) + } + return nil +} diff --git a/pkg/sentry/fsimpl/mqfs/registry.go b/pkg/sentry/fsimpl/mqfs/registry.go index 3875b39ee..9361b7eb4 100644 --- a/pkg/sentry/fsimpl/mqfs/registry.go +++ b/pkg/sentry/fsimpl/mqfs/registry.go @@ -50,12 +50,32 @@ type RegistryImpl struct { // NewRegistryImpl returns a new, initialized RegistryImpl, and takes a // reference on root. -func NewRegistryImpl(root *kernfs.Dentry, fs *filesystem) *RegistryImpl { - root.IncRef() - return &RegistryImpl{ - root: root, - fs: fs, +func NewRegistryImpl(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*RegistryImpl, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, err + } + + var dentry kernfs.Dentry + fs := &filesystem{ + devMinor: devMinor, + root: &dentry, + } + fs.VFSFilesystem().Init(vfsObj, &FilesystemType{}, fs) + + dentry.InitRoot(&fs.Filesystem, fs.newRootInode(ctx, creds)) + dentry.IncRef() + + mount, err := vfsObj.NewDisconnectedMount(fs.VFSFilesystem(), dentry.VFSDentry(), &vfs.MountOptions{}) + if err != nil { + return nil, err } + + return &RegistryImpl{ + root: &dentry, + fs: fs, + mount: mount, + }, nil } // Lookup implements mq.RegistryImpl.Lookup. @@ -83,11 +103,11 @@ func (r *RegistryImpl) New(ctx context.Context, name string, q *mq.Queue, perm l } fd := &queueFD{queue: q} - err = fd.Init(r.mount, r.root, qInode.data, &qInode.locks, 0 /* flags */) + err = fd.Init(r.mount, r.root, q, qInode.Locks(), 0 /* flags */) if err != nil { return nil, err } - return fd.VFSFileDescription(), nil + return &fd.vfsfd, nil } // Unlink implements mq.RegistryImpl.Unlink. diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 6ff3deb97..9f30a7706 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -249,6 +249,7 @@ go_library( "//pkg/sentry/fs/timerfd", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/fsimpl/mqfs", "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/fsimpl/timerfd", diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index aa9c3fb31..11b4545c6 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -15,12 +15,16 @@ package kernel import ( + "fmt" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/mqfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/mq" "gvisor.dev/gvisor/pkg/sentry/kernel/msgqueue" "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" + "gvisor.dev/gvisor/pkg/sentry/vfs" ) // IPCNamespace represents an IPC namespace. @@ -72,12 +76,19 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry { return i.shms } -// SetPosixQueues sets value of posixQueues if the value is currently nil, -// otherwise returns without doing anything. -func (i *IPCNamespace) SetPosixQueues(r *mq.Registry) { - if i.posixQueues == nil { - i.posixQueues = r +// InitPosixQueues creates a new POSIX queue registry, and returns an error if +// the registry was previously initialized. +func (i *IPCNamespace) InitPosixQueues(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error { + if i.posixQueues != nil { + return fmt.Errorf("IPCNamespace.InitPosixQueues: already initialized") + } + + impl, err := mqfs.NewRegistryImpl(ctx, vfsObj, creds) + if err != nil { + return err } + i.posixQueues = mq.NewRegistry(impl) + return nil } // PosixQueues returns the posix message queue registry for this namespace. diff --git a/pkg/sentry/kernel/mq/mq.go b/pkg/sentry/kernel/mq/mq.go index be46f78c8..739ea2f1c 100644 --- a/pkg/sentry/kernel/mq/mq.go +++ b/pkg/sentry/kernel/mq/mq.go @@ -79,6 +79,11 @@ func (r *Registry) Destroy(ctx context.Context) { r.impl.Destroy(ctx) } +// Impl returns RegistryImpl inside r. +func (r *Registry) Impl() RegistryImpl { + return r.impl +} + // Queue represents a POSIX message queue. // // +stateify savable -- cgit v1.2.3 From 2c0d07959e866a711ab9293da0847a77304ed2ba Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Thu, 29 Jul 2021 21:06:24 +0200 Subject: Initialize POSIX queues' registry after creating a new IPCNamespace. Updates #136 --- pkg/sentry/kernel/ipc_namespace.go | 2 ++ pkg/sentry/kernel/kernel.go | 5 +++++ pkg/sentry/kernel/task_clone.go | 6 ++++++ 3 files changed, 13 insertions(+) diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 11b4545c6..429a4b983 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -92,6 +92,8 @@ func (i *IPCNamespace) InitPosixQueues(ctx context.Context, vfsObj *vfs.VirtualF } // PosixQueues returns the posix message queue registry for this namespace. +// +// Precondition: i.InitPosixQueues must have been called. func (i *IPCNamespace) PosixQueues() *mq.Registry { return i.posixQueues } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 6ce3625d4..04b24369a 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -401,6 +401,11 @@ func (k *Kernel) Init(args InitKernelArgs) error { return fmt.Errorf("failed to initialize VFS: %v", err) } + err := k.rootIPCNamespace.InitPosixQueues(ctx, &k.vfs, auth.CredentialsFromContext(ctx)) + if err != nil { + return fmt.Errorf("failed to create mqfs filesystem: %v", err) + } + pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs) if err != nil { return fmt.Errorf("failed to create pipefs filesystem: %v", err) diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 26a981f36..e174913d1 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -103,6 +103,9 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { ipcns := t.IPCNamespace() if args.Flags&linux.CLONE_NEWIPC != 0 { ipcns = NewIPCNamespace(userns) + if VFS2Enabled { + ipcns.InitPosixQueues(t, t.k.VFS(), creds) + } } else { ipcns.IncRef() } @@ -464,6 +467,9 @@ func (t *Task) Unshare(flags int32) error { // namespace" t.ipcns.DecRef(t) t.ipcns = NewIPCNamespace(creds.UserNamespace) + if VFS2Enabled { + t.ipcns.InitPosixQueues(t, t.k.VFS(), creds) + } } var oldFDTable *FDTable if flags&linux.CLONE_FILES != 0 { -- cgit v1.2.3 From b2be0611cd75b83eafc5ec5e7a0fbaeb1eaf9663 Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Thu, 29 Jul 2021 21:10:17 +0200 Subject: Register mqfs.FilesystemType Updates #136 --- runsc/boot/BUILD | 1 + runsc/boot/vfs.go | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index b8585c1e9..7028a465f 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -66,6 +66,7 @@ go_library( "//pkg/sentry/fsimpl/fuse", "//pkg/sentry/fsimpl/gofer", "//pkg/sentry/fsimpl/host", + "//pkg/sentry/fsimpl/mqfs", "//pkg/sentry/fsimpl/overlay", "//pkg/sentry/fsimpl/proc", "//pkg/sentry/fsimpl/sys", diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 346796d9c..4286fb978 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -36,6 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse" "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/mqfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay" "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" @@ -95,6 +96,10 @@ func registerFilesystems(k *kernel.Kernel) error { AllowUserList: true, AllowUserMount: true, }) + vfsObj.MustRegisterFilesystemType(mqfs.Name, &mqfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) // Setup files in devtmpfs. if err := memdev.Register(vfsObj); err != nil { -- cgit v1.2.3 From 7508a0efeeef19a3e08e06e80be8258743438412 Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Thu, 29 Jul 2021 22:17:26 +0200 Subject: Define mq.View and use it for mqfs.queueFD. View makes it easier to handle O_RDONLY, O_WRONLY, and ORDWR options in mq_open(2). Updates #136 --- pkg/sentry/fsimpl/mqfs/queue.go | 4 ++-- pkg/sentry/kernel/mq/mq.go | 19 ++++++++++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/pkg/sentry/fsimpl/mqfs/queue.go b/pkg/sentry/fsimpl/mqfs/queue.go index a8e9bc722..933dbc6ed 100644 --- a/pkg/sentry/fsimpl/mqfs/queue.go +++ b/pkg/sentry/fsimpl/mqfs/queue.go @@ -65,8 +65,8 @@ type queueFD struct { vfsfd vfs.FileDescription inode kernfs.Inode - // queue is the queue backing this fd. - queue *mq.Queue + // queue is a view into the queue backing this fd. + queue mq.View } // Init initializes a queueFD. Mostly copied from DynamicBytesFD.Init, but uses diff --git a/pkg/sentry/kernel/mq/mq.go b/pkg/sentry/kernel/mq/mq.go index 739ea2f1c..954883c5f 100644 --- a/pkg/sentry/kernel/mq/mq.go +++ b/pkg/sentry/kernel/mq/mq.go @@ -129,6 +129,21 @@ type Queue struct { byteCount uint64 } +// View is a view into a message queue. Views should only be used in file +// descriptions, but not inodes, because we use inodes to retreive the actual +// queue, and only FDs are responsible for providing user functionality. +type View interface { + // TODO: Add Send and Receive when mq_timedsend(2) and mq_timedreceive(2) + // are implemented. + + // Flush checks if the calling process has attached a notification request + // to this queue, if yes, then the request is removed, and another process + // can attach a request. + Flush(ctx context.Context) + + waiter.Waitable +} + // Message holds a message exchanged through a Queue via mq_timedsend(2) and // mq_timedreceive(2), and additional info relating to the message. // @@ -179,9 +194,7 @@ func (q *Queue) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } -// Flush checks if the calling process has attached a notification request to -// this queue, if yes, then the request is removed, and another process can -// attach a request. +// Flush implements View.Flush. func (q *Queue) Flush(ctx context.Context) { q.mu.Lock() defer q.mu.Unlock() -- cgit v1.2.3 From 13d36561b8a9cab6cf20b4b5053752955f451518 Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Fri, 30 Jul 2021 16:57:59 +0200 Subject: Return FDs in RegistryImpl functions and use Views. Update RegistryImpl functions to return file descriptions, instead of queues, and use Views in queue inodes. Updates #136 --- pkg/sentry/fsimpl/mqfs/BUILD | 1 + pkg/sentry/fsimpl/mqfs/registry.go | 75 ++++++++++++++++++++++++++++++-------- pkg/sentry/kernel/mq/BUILD | 2 + pkg/sentry/kernel/mq/mq.go | 64 +++++++++++++++++++++++++++++--- 4 files changed, 122 insertions(+), 20 deletions(-) diff --git a/pkg/sentry/fsimpl/mqfs/BUILD b/pkg/sentry/fsimpl/mqfs/BUILD index e688b9b48..ef843015d 100644 --- a/pkg/sentry/fsimpl/mqfs/BUILD +++ b/pkg/sentry/fsimpl/mqfs/BUILD @@ -29,6 +29,7 @@ go_library( "//pkg/context", "//pkg/errors/linuxerr", "//pkg/refsvfs2", + "//pkg/sentry/fs", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/ipc", diff --git a/pkg/sentry/fsimpl/mqfs/registry.go b/pkg/sentry/fsimpl/mqfs/registry.go index 9361b7eb4..89ffaaf04 100644 --- a/pkg/sentry/fsimpl/mqfs/registry.go +++ b/pkg/sentry/fsimpl/mqfs/registry.go @@ -17,6 +17,8 @@ package mqfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/mq" @@ -78,20 +80,32 @@ func NewRegistryImpl(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds * }, nil } -// Lookup implements mq.RegistryImpl.Lookup. -func (r *RegistryImpl) Lookup(ctx context.Context, name string) *mq.Queue { +// Get implements mq.RegistryImpl.Get. +func (r *RegistryImpl) Get(ctx context.Context, name string, rOnly, wOnly, readWrite, block bool, flags uint32) (*vfs.FileDescription, bool, error) { r.mu.Lock() defer r.mu.Unlock() inode, err := r.lookup(ctx, name) if err != nil { - return nil + return nil, false, nil } - return inode.(*queueInode).queue + + qInode := inode.(*queueInode) + if !qInode.queue.HasPermissions(auth.CredentialsFromContext(ctx), perm(rOnly, wOnly, readWrite)) { + // "The queue exists, but the caller does not have permission to + // open it in the specified mode." + return nil, false, linuxerr.EACCES + } + + fd, err := r.newFD(qInode.queue, qInode, rOnly, wOnly, readWrite, block, flags) + if err != nil { + return nil, false, err + } + return fd, true, nil } // New implements mq.RegistryImpl.New. -func (r *RegistryImpl) New(ctx context.Context, name string, q *mq.Queue, perm linux.FileMode) (*vfs.FileDescription, error) { +func (r *RegistryImpl) New(ctx context.Context, name string, q *mq.Queue, rOnly, wOnly, readWrite, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) { r.mu.Lock() defer r.mu.Unlock() @@ -101,13 +115,7 @@ func (r *RegistryImpl) New(ctx context.Context, name string, q *mq.Queue, perm l if err != nil { return nil, err } - - fd := &queueFD{queue: q} - err = fd.Init(r.mount, r.root, q, qInode.Locks(), 0 /* flags */) - if err != nil { - return nil, err - } - return &fd.vfsfd, nil + return r.newFD(q, qInode, rOnly, wOnly, readWrite, block, flags) } // Unlink implements mq.RegistryImpl.Unlink. @@ -115,6 +123,11 @@ func (r *RegistryImpl) Unlink(ctx context.Context, name string) error { r.mu.Lock() defer r.mu.Unlock() + creds := auth.CredentialsFromContext(ctx) + if err := r.root.Inode().CheckPermissions(ctx, creds, vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + root := r.root.Inode().(*rootInode) inode, err := r.lookup(ctx, name) if err != nil { @@ -123,6 +136,11 @@ func (r *RegistryImpl) Unlink(ctx context.Context, name string) error { return root.Unlink(ctx, name, inode) } +// Destroy implements mq.RegistryImpl.Destroy. +func (r *RegistryImpl) Destroy(ctx context.Context) { + r.root.DecRef(ctx) +} + // lookup retreives a kernfs.Inode using a name. // // Precondition: r.mu must be held. @@ -135,7 +153,34 @@ func (r *RegistryImpl) lookup(ctx context.Context, name string) (kernfs.Inode, e return lookup, nil } -// Destroy implements mq.RegistryImpl.Destroy. -func (r *RegistryImpl) Destroy(ctx context.Context) { - r.root.DecRef(ctx) +// newFD returns a new file description created using the given queue and inode. +func (r *RegistryImpl) newFD(q *mq.Queue, inode *queueInode, rOnly, wOnly, readWrite, block bool, flags uint32) (*vfs.FileDescription, error) { + view, err := mq.NewView(q, rOnly, wOnly, readWrite, block) + if err != nil { + return nil, err + } + + var dentry kernfs.Dentry + dentry.Init(&r.fs.Filesystem, inode) + + fd := &queueFD{queue: view} + err = fd.Init(r.mount, &dentry, inode.queue, inode.Locks(), flags) + if err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// perm returns a permission mask created using given flags. +func perm(rOnly, wOnly, readWrite bool) fs.PermMask { + switch { + case readWrite: + return fs.PermMask{Read: true, Write: true} + case wOnly: + return fs.PermMask{Write: true} + case rOnly: + return fs.PermMask{Read: true} + default: + return fs.PermMask{} // Can't happen, see NewView. + } } diff --git a/pkg/sentry/kernel/mq/BUILD b/pkg/sentry/kernel/mq/BUILD index 7b00b8346..fefac3ba5 100644 --- a/pkg/sentry/kernel/mq/BUILD +++ b/pkg/sentry/kernel/mq/BUILD @@ -25,7 +25,9 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/errors/linuxerr", "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/waiter", diff --git a/pkg/sentry/kernel/mq/mq.go b/pkg/sentry/kernel/mq/mq.go index 954883c5f..217478dca 100644 --- a/pkg/sentry/kernel/mq/mq.go +++ b/pkg/sentry/kernel/mq/mq.go @@ -21,13 +21,16 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) const ( + MaxName = 255 // Maximum size for a queue name. maxPriority = linux.MQ_PRIO_MAX - 1 // Highest possible message priority. ) @@ -48,18 +51,20 @@ type Registry struct { // Registry to avoid dealing directly with the filesystem. RegistryImpl should // be implemented by mqfs and provided to Registry at initialization. type RegistryImpl interface { - // Lookup returns the queue with the given name, nil if non exists. - Lookup(context.Context, string) *Queue + // Get searchs for a queue with the given name, if it exists, the queue is + // used to create a new FD, return it and return true. If the queue doesn't + // exist, return false and no error. An error is returned if creation fails. + Get(ctx context.Context, name string, rOnly, wOnly, readWrite, block bool, flags uint32) (*vfs.FileDescription, bool, error) // New creates a new inode and file description using the given queue, - // inserts the inode into the filesystem tree with the given name, and + // inserts the inode into the filesystem tree using the given name, and // returns the file description. An error is returned if creation fails, or // if the name already exists. - New(context.Context, string, *Queue, linux.FileMode) (*vfs.FileDescription, error) + New(ctx context.Context, name string, q *Queue, rOnly, wOnly, readWrite, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) // Unlink removes the queue with given name from the registry, and returns // an error if the name doesn't exist. - Unlink(context.Context, string) error + Unlink(ctx context.Context, name string) error // Destroy destroys the registry. Destroy(context.Context) @@ -144,6 +149,43 @@ type View interface { waiter.Waitable } +// ReaderWriter provides a send and receive view into a queue. +type ReaderWriter struct { + *Queue + + block bool +} + +// Reader provides a send-only view into a queue. +type Reader struct { + *Queue + + block bool +} + +// Writer provides a receive-only view into a queue. +type Writer struct { + *Queue + + block bool +} + +// NewView creates a new view into a queue and returns it. +func NewView(q *Queue, rOnly, wOnly, readWrite, block bool) (View, error) { + switch { + case readWrite: + return ReaderWriter{Queue: q, block: block}, nil + case wOnly: + return Writer{Queue: q, block: block}, nil + case rOnly: + return Reader{Queue: q, block: block}, nil + default: + // This case can't happen, due to O_RDONLY flag being 0 and O_WRONLY + // being 1, so one of them must be true. + return nil, linuxerr.EINVAL + } +} + // Message holds a message exchanged through a Queue via mq_timedsend(2) and // mq_timedreceive(2), and additional info relating to the message. // @@ -243,3 +285,15 @@ func (q *Queue) EventUnregister(e *waiter.Entry) { q.senders.EventUnregister(e) q.receivers.EventUnregister(e) } + +// HasPermissions returns true if the given credentials meet the access +// permissions required by the queue. +func (q *Queue) HasPermissions(creds *auth.Credentials, req fs.PermMask) bool { + p := q.perms.Other + if q.owner.UID == creds.EffectiveKUID { + p = q.perms.User + } else if creds.InGroup(q.owner.GID) { + p = q.perms.Group + } + return p.SupersetOf(req) +} -- cgit v1.2.3 From 7df562d47337b29e6ac53c06c120cd4d88dd5da3 Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Fri, 30 Jul 2021 22:15:29 +0200 Subject: Implement Registry.FindOrCreate. FindOrCreate implements the behaviour of mq_open(2). Updates #136 --- pkg/sentry/fsimpl/mqfs/registry.go | 24 +++--- pkg/sentry/kernel/ipc_namespace.go | 2 +- pkg/sentry/kernel/mq/mq.go | 166 ++++++++++++++++++++++++++++++++++--- 3 files changed, 167 insertions(+), 25 deletions(-) diff --git a/pkg/sentry/fsimpl/mqfs/registry.go b/pkg/sentry/fsimpl/mqfs/registry.go index 89ffaaf04..ea1f2981c 100644 --- a/pkg/sentry/fsimpl/mqfs/registry.go +++ b/pkg/sentry/fsimpl/mqfs/registry.go @@ -81,7 +81,7 @@ func NewRegistryImpl(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds * } // Get implements mq.RegistryImpl.Get. -func (r *RegistryImpl) Get(ctx context.Context, name string, rOnly, wOnly, readWrite, block bool, flags uint32) (*vfs.FileDescription, bool, error) { +func (r *RegistryImpl) Get(ctx context.Context, name string, access mq.AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error) { r.mu.Lock() defer r.mu.Unlock() @@ -91,13 +91,13 @@ func (r *RegistryImpl) Get(ctx context.Context, name string, rOnly, wOnly, readW } qInode := inode.(*queueInode) - if !qInode.queue.HasPermissions(auth.CredentialsFromContext(ctx), perm(rOnly, wOnly, readWrite)) { + if !qInode.queue.HasPermissions(auth.CredentialsFromContext(ctx), perm(access)) { // "The queue exists, but the caller does not have permission to // open it in the specified mode." return nil, false, linuxerr.EACCES } - fd, err := r.newFD(qInode.queue, qInode, rOnly, wOnly, readWrite, block, flags) + fd, err := r.newFD(qInode.queue, qInode, access, block, flags) if err != nil { return nil, false, err } @@ -105,7 +105,7 @@ func (r *RegistryImpl) Get(ctx context.Context, name string, rOnly, wOnly, readW } // New implements mq.RegistryImpl.New. -func (r *RegistryImpl) New(ctx context.Context, name string, q *mq.Queue, rOnly, wOnly, readWrite, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) { +func (r *RegistryImpl) New(ctx context.Context, name string, q *mq.Queue, access mq.AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) { r.mu.Lock() defer r.mu.Unlock() @@ -115,7 +115,7 @@ func (r *RegistryImpl) New(ctx context.Context, name string, q *mq.Queue, rOnly, if err != nil { return nil, err } - return r.newFD(q, qInode, rOnly, wOnly, readWrite, block, flags) + return r.newFD(q, qInode, access, block, flags) } // Unlink implements mq.RegistryImpl.Unlink. @@ -154,8 +154,8 @@ func (r *RegistryImpl) lookup(ctx context.Context, name string) (kernfs.Inode, e } // newFD returns a new file description created using the given queue and inode. -func (r *RegistryImpl) newFD(q *mq.Queue, inode *queueInode, rOnly, wOnly, readWrite, block bool, flags uint32) (*vfs.FileDescription, error) { - view, err := mq.NewView(q, rOnly, wOnly, readWrite, block) +func (r *RegistryImpl) newFD(q *mq.Queue, inode *queueInode, access mq.AccessType, block bool, flags uint32) (*vfs.FileDescription, error) { + view, err := mq.NewView(q, access, block) if err != nil { return nil, err } @@ -172,13 +172,13 @@ func (r *RegistryImpl) newFD(q *mq.Queue, inode *queueInode, rOnly, wOnly, readW } // perm returns a permission mask created using given flags. -func perm(rOnly, wOnly, readWrite bool) fs.PermMask { - switch { - case readWrite: +func perm(access mq.AccessType) fs.PermMask { + switch access { + case mq.ReadWrite: return fs.PermMask{Read: true, Write: true} - case wOnly: + case mq.WriteOnly: return fs.PermMask{Write: true} - case rOnly: + case mq.ReadOnly: return fs.PermMask{Read: true} default: return fs.PermMask{} // Can't happen, see NewView. diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 429a4b983..50b4e015e 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -87,7 +87,7 @@ func (i *IPCNamespace) InitPosixQueues(ctx context.Context, vfsObj *vfs.VirtualF if err != nil { return err } - i.posixQueues = mq.NewRegistry(impl) + i.posixQueues = mq.NewRegistry(i.userNS, impl) return nil } diff --git a/pkg/sentry/kernel/mq/mq.go b/pkg/sentry/kernel/mq/mq.go index 217478dca..c21cc9d47 100644 --- a/pkg/sentry/kernel/mq/mq.go +++ b/pkg/sentry/kernel/mq/mq.go @@ -18,6 +18,7 @@ package mq import ( "bytes" "fmt" + "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -29,9 +30,31 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// AccessType is the access type passed to mq_open. +type AccessType int + +// Possible access types. +const ( + ReadOnly AccessType = iota + WriteOnly + ReadWrite +) + const ( MaxName = 255 // Maximum size for a queue name. maxPriority = linux.MQ_PRIO_MAX - 1 // Highest possible message priority. + + maxQueuesDefault = linux.DFLT_QUEUESMAX // Default max number of queues. + + maxMsgDefault = linux.DFLT_MSG // Default max number of messages per queue. + maxMsgMin = linux.MIN_MSGMAX // Min value for max number of messages per queue. + maxMsgLimit = linux.DFLT_MSGMAX // Limit for max number of messages per queue. + maxMsgHardLimit = linux.HARD_MSGMAX // Hard limit for max number of messages per queue. + + msgSizeDefault = linux.DFLT_MSGSIZE // Default max message size. + msgSizeMin = linux.MIN_MSGSIZEMAX // Min value for max message size. + msgSizeLimit = linux.DFLT_MSGSIZEMAX // Limit for max message size. + msgSizeHardLimit = linux.HARD_MSGSIZEMAX // Hard limit for max message size. ) // Registry is a POSIX message queue registry. @@ -41,6 +64,12 @@ const ( // // +stateify savable type Registry struct { + // userNS is the user namespace containing this registry. Immutable. + userNS *auth.UserNamespace + + // mu protects all fields below. + mu sync.Mutex `state:"nosave"` + // impl is an implementation of several message queue utilities needed by // the registry. impl should be provided by mqfs. impl RegistryImpl @@ -54,13 +83,13 @@ type RegistryImpl interface { // Get searchs for a queue with the given name, if it exists, the queue is // used to create a new FD, return it and return true. If the queue doesn't // exist, return false and no error. An error is returned if creation fails. - Get(ctx context.Context, name string, rOnly, wOnly, readWrite, block bool, flags uint32) (*vfs.FileDescription, bool, error) + Get(ctx context.Context, name string, access AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error) // New creates a new inode and file description using the given queue, // inserts the inode into the filesystem tree using the given name, and // returns the file description. An error is returned if creation fails, or // if the name already exists. - New(ctx context.Context, name string, q *Queue, rOnly, wOnly, readWrite, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) + New(ctx context.Context, name string, q *Queue, access AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) // Unlink removes the queue with given name from the registry, and returns // an error if the name doesn't exist. @@ -73,12 +102,128 @@ type RegistryImpl interface { // NewRegistry returns a new, initialized message queue registry. NewRegistry // should be called when a new message queue filesystem is created, once per // IPCNamespace. -func NewRegistry(impl RegistryImpl) *Registry { +func NewRegistry(userNS *auth.UserNamespace, impl RegistryImpl) *Registry { return &Registry{ - impl: impl, + userNS: userNS, + impl: impl, } } +// OpenOpts holds the options passed to FindOrCreate. +type OpenOpts struct { + Name string + Access AccessType + Create bool + Exclusive bool + Block bool +} + +// FindOrCreate creates a new POSIX message queue or opens an existing queue. +// See mq_open(2). +func (r *Registry) FindOrCreate(ctx context.Context, opts OpenOpts, perm linux.FileMode, attr *linux.MqAttr) (*vfs.FileDescription, error) { + // mq_overview(7) mentions that: "Each message queue is identified by a name + // of the form '/somename'", but the mq_open(3) man pages mention: + // "The mq_open() library function is implemented on top of a system call + // of the same name. The library function performs the check that the + // name starts with a slash (/), giving the EINVAL error if it does not. + // The kernel system call expects name to contain no preceding slash, so + // the C library function passes name without the preceding slash (i.e., + // name+1) to the system call." + // So we don't need to check it. + + if len(opts.Name) == 0 { + return nil, linuxerr.ENOENT + } + if len(opts.Name) > MaxName { + return nil, linuxerr.ENAMETOOLONG + } + if strings.ContainsRune(opts.Name, '/') { + return nil, linuxerr.EACCES + } + if opts.Name == "." || opts.Name == ".." { + return nil, linuxerr.EINVAL + } + + // Construct status flags. + var flags uint32 + if opts.Block { + flags = linux.O_NONBLOCK + } + switch opts.Access { + case ReadOnly: + flags = flags | linux.O_RDONLY + case WriteOnly: + flags = flags | linux.O_WRONLY + case ReadWrite: + flags = flags | linux.O_RDWR + } + + r.mu.Lock() + defer r.mu.Unlock() + fd, ok, err := r.impl.Get(ctx, opts.Name, opts.Access, opts.Block, flags) + if err != nil { + return nil, err + } + + if ok { + if opts.Create && opts.Exclusive { + // "Both O_CREAT and O_EXCL were specified in oflag, but a queue + // with this name already exists." + return nil, linuxerr.EEXIST + } + return fd, nil + } + + if !opts.Create { + // "The O_CREAT flag was not specified in oflag, and no queue with this name + // exists." + return nil, linuxerr.ENOENT + } + + q, err := r.newQueueLocked(auth.CredentialsFromContext(ctx), fs.FileOwnerFromContext(ctx), fs.FilePermsFromMode(perm), attr) + if err != nil { + return nil, err + } + return r.impl.New(ctx, opts.Name, q, opts.Access, opts.Block, perm, flags) +} + +// newQueueLocked creates a new queue using the given attributes. If attr is nil +// return a queue with default values, otherwise use attr to create a new queue, +// and return an error if attributes are invalid. +func (r *Registry) newQueueLocked(creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions, attr *linux.MqAttr) (*Queue, error) { + if attr == nil { + return &Queue{ + owner: owner, + perms: perms, + maxMessageCount: int64(maxMsgDefault), + maxMessageSize: uint64(msgSizeDefault), + }, nil + } + + // "O_CREAT was specified in oflag, and attr was not NULL, but + // attr->mq_maxmsg or attr->mq_msqsize was invalid. Both of these fields + // these fields must be greater than zero. In a process that is + // unprivileged (does not have the CAP_SYS_RESOURCE capability), + // attr->mq_maxmsg must be less than or equal to the msg_max limit, and + // attr->mq_msgsize must be less than or equal to the msgsize_max limit. + // In addition, even in a privileged process, attr->mq_maxmsg cannot + // exceed the HARD_MAX limit." - man mq_open(3). + if attr.MqMaxmsg <= 0 || attr.MqMsgsize <= 0 { + return nil, linuxerr.EINVAL + } + + if attr.MqMaxmsg > maxMsgHardLimit || (!creds.HasCapabilityIn(linux.CAP_SYS_RESOURCE, r.userNS) && (attr.MqMaxmsg > maxMsgLimit || attr.MqMsgsize > msgSizeLimit)) { + return nil, linuxerr.EINVAL + } + + return &Queue{ + owner: owner, + perms: perms, + maxMessageCount: attr.MqMaxmsg, + maxMessageSize: uint64(attr.MqMsgsize), + }, nil +} + // Destroy destroys the registry and releases all held references. func (r *Registry) Destroy(ctx context.Context) { r.impl.Destroy(ctx) @@ -117,9 +262,6 @@ type Queue struct { // from this queue. subscriber *Subscriber - // nonBlock is true if this queue is non-blocking. - nonBlock bool - // messageCount is the number of messages currently in the queue. messageCount int64 @@ -171,13 +313,13 @@ type Writer struct { } // NewView creates a new view into a queue and returns it. -func NewView(q *Queue, rOnly, wOnly, readWrite, block bool) (View, error) { - switch { - case readWrite: +func NewView(q *Queue, access AccessType, block bool) (View, error) { + switch access { + case ReadWrite: return ReaderWriter{Queue: q, block: block}, nil - case wOnly: + case WriteOnly: return Writer{Queue: q, block: block}, nil - case rOnly: + case ReadOnly: return Reader{Queue: q, block: block}, nil default: // This case can't happen, due to O_RDONLY flag being 0 and O_WRONLY -- cgit v1.2.3 From 583e18501ee8ee263295edd6f5ccf30d0278d854 Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Fri, 30 Jul 2021 22:16:36 +0200 Subject: Use one mutex for both Registry and RegistryImpl. Updates #136 --- pkg/sentry/fsimpl/mqfs/BUILD | 1 - pkg/sentry/fsimpl/mqfs/registry.go | 18 +++--------------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/pkg/sentry/fsimpl/mqfs/BUILD b/pkg/sentry/fsimpl/mqfs/BUILD index ef843015d..e1a38686b 100644 --- a/pkg/sentry/fsimpl/mqfs/BUILD +++ b/pkg/sentry/fsimpl/mqfs/BUILD @@ -35,7 +35,6 @@ go_library( "//pkg/sentry/kernel/ipc", "//pkg/sentry/kernel/mq", "//pkg/sentry/vfs", - "//pkg/sync", "//pkg/usermem", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/mqfs/registry.go b/pkg/sentry/fsimpl/mqfs/registry.go index ea1f2981c..2c9c79f01 100644 --- a/pkg/sentry/fsimpl/mqfs/registry.go +++ b/pkg/sentry/fsimpl/mqfs/registry.go @@ -23,18 +23,17 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/mq" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sync" ) // RegistryImpl implements mq.RegistryImpl. It implements the interface using // the message queue filesystem, and is provided to mq.Registry at // initialization. // +// RegistryImpl is not thread-safe, so it is the responsibility of the user +// (the containing mq.Registry) to protect using a lock. +// // +stateify savable type RegistryImpl struct { - // mu protects all fields below. - mu sync.Mutex - // root is the root dentry of the mq filesystem. Its main usage is to // retreive the root inode, which we use to add, remove, and lookup message // queues. @@ -82,9 +81,6 @@ func NewRegistryImpl(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds * // Get implements mq.RegistryImpl.Get. func (r *RegistryImpl) Get(ctx context.Context, name string, access mq.AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error) { - r.mu.Lock() - defer r.mu.Unlock() - inode, err := r.lookup(ctx, name) if err != nil { return nil, false, nil @@ -106,9 +102,6 @@ func (r *RegistryImpl) Get(ctx context.Context, name string, access mq.AccessTyp // New implements mq.RegistryImpl.New. func (r *RegistryImpl) New(ctx context.Context, name string, q *mq.Queue, access mq.AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) { - r.mu.Lock() - defer r.mu.Unlock() - root := r.root.Inode().(*rootInode) qInode := r.fs.newQueueInode(ctx, auth.CredentialsFromContext(ctx), q, perm).(*queueInode) err := root.Insert(name, qInode) @@ -120,9 +113,6 @@ func (r *RegistryImpl) New(ctx context.Context, name string, q *mq.Queue, access // Unlink implements mq.RegistryImpl.Unlink. func (r *RegistryImpl) Unlink(ctx context.Context, name string) error { - r.mu.Lock() - defer r.mu.Unlock() - creds := auth.CredentialsFromContext(ctx) if err := r.root.Inode().CheckPermissions(ctx, creds, vfs.MayWrite|vfs.MayExec); err != nil { return err @@ -142,8 +132,6 @@ func (r *RegistryImpl) Destroy(ctx context.Context) { } // lookup retreives a kernfs.Inode using a name. -// -// Precondition: r.mu must be held. func (r *RegistryImpl) lookup(ctx context.Context, name string) (kernfs.Inode, error) { inode := r.root.Inode().(*rootInode) lookup, err := inode.Lookup(ctx, name) -- cgit v1.2.3 From 9bde727f4f2e5b7cf52211a3a4fe71c7a0e4f1ea Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Fri, 30 Jul 2021 22:24:54 +0200 Subject: Implement Registry.Remove. Remove implements the behaviour of mq_unlink(2). Updates #136 --- pkg/sentry/kernel/mq/mq.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pkg/sentry/kernel/mq/mq.go b/pkg/sentry/kernel/mq/mq.go index c21cc9d47..a7c787081 100644 --- a/pkg/sentry/kernel/mq/mq.go +++ b/pkg/sentry/kernel/mq/mq.go @@ -224,8 +224,22 @@ func (r *Registry) newQueueLocked(creds *auth.Credentials, owner fs.FileOwner, p }, nil } +// Remove removes the queue with the given name from the registry. See +// mq_unlink(2). +func (r *Registry) Remove(ctx context.Context, name string) error { + if len(name) > MaxName { + return linuxerr.ENAMETOOLONG + } + + r.mu.Lock() + defer r.mu.Unlock() + return r.impl.Unlink(ctx, name) +} + // Destroy destroys the registry and releases all held references. func (r *Registry) Destroy(ctx context.Context) { + r.mu.Lock() + defer r.mu.Unlock() r.impl.Destroy(ctx) } -- cgit v1.2.3 From f03dc73f0f46d0ff1ae209fefc98ee3d7fc725d2 Mon Sep 17 00:00:00 2001 From: "Zyad A. Ali" Date: Tue, 3 Aug 2021 07:48:24 +0200 Subject: Implement stubs for mq_open(2) and mq_unlink(2). Support mq_open and mq_unlink, and enable syscall tests. Updates #136 --- pkg/sentry/syscalls/linux/vfs2/BUILD | 2 + pkg/sentry/syscalls/linux/vfs2/mq.go | 98 ++++++++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 5 +- test/syscalls/linux/mq.cc | 22 ++++---- 4 files changed, 113 insertions(+), 14 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/vfs2/mq.go diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 1e3bd2a50..4a03008f8 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -19,6 +19,7 @@ go_library( "memfd.go", "mmap.go", "mount.go", + "mq.go", "path.go", "pipe.go", "poll.go", @@ -59,6 +60,7 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/fasync", + "//pkg/sentry/kernel/mq", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", diff --git a/pkg/sentry/syscalls/linux/vfs2/mq.go b/pkg/sentry/syscalls/linux/vfs2/mq.go new file mode 100644 index 000000000..d5d81c6e2 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/mq.go @@ -0,0 +1,98 @@ +// Copyright 2021 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/mq" +) + +// MqOpen implements mq_open(2). +func MqOpen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nameAddr := args[0].Pointer() + flag := args[1].Int() + mode := args[2].ModeT() + attrAddr := args[3].Pointer() + + name, err := t.CopyInString(nameAddr, mq.MaxName) + if err != nil { + return 0, nil, err + } + + rOnly := flag&linux.O_RDONLY == linux.O_RDONLY + wOnly := flag&linux.O_WRONLY == linux.O_WRONLY + readWrite := flag&linux.O_RDWR == linux.O_RDWR + + create := flag&linux.O_CREAT == linux.O_CREAT + exclusive := flag&linux.O_EXCL == linux.O_EXCL + block := flag&linux.O_NONBLOCK != linux.O_NONBLOCK + + var attr linux.MqAttr + var attrPtr *linux.MqAttr + if attrAddr != 0 { + if _, err := attr.CopyIn(t, attrAddr); err != nil { + return 0, nil, err + } + attrPtr = &attr + } + + opts := openOpts(name, rOnly, wOnly, readWrite, create, exclusive, block) + + r := t.IPCNamespace().PosixQueues() + queue, err := r.FindOrCreate(t, opts, linux.FileMode(mode), attrPtr) + if err != nil { + return 0, nil, err + } + + fd, err := t.NewFDFromVFS2(0, queue, kernel.FDFlags{ + CloseOnExec: flag&linux.O_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil +} + +// MqUnlink implements mq_unlink(2). +func MqUnlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nameAddr := args[0].Pointer() + name, err := t.CopyInString(nameAddr, mq.MaxName) + if err != nil { + return 0, nil, err + } + return 0, nil, t.IPCNamespace().PosixQueues().Remove(t, name) +} + +func openOpts(name string, rOnly, wOnly, readWrite, create, exclusive, block bool) mq.OpenOpts { + var access mq.AccessType + switch { + case readWrite: + access = mq.ReadWrite + case wOnly: + access = mq.WriteOnly + case rOnly: + access = mq.ReadOnly + } + + return mq.OpenOpts{ + Name: name, + Access: access, + Create: create, + Exclusive: exclusive, + Block: block, + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 0fc81e694..4eb15a7f2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -112,6 +112,8 @@ func Override() { s.Table[232] = syscalls.Supported("epoll_wait", EpollWait) s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl) s.Table[235] = syscalls.Supported("utimes", Utimes) + s.Table[240] = syscalls.Supported("mq_open", MqOpen) + s.Table[241] = syscalls.Supported("mq_unlink", MqUnlink) s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil) s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil) s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil) @@ -241,6 +243,8 @@ func Override() { s.Table[86] = syscalls.Supported("timerfd_settime", TimerfdSettime) s.Table[87] = syscalls.Supported("timerfd_gettime", TimerfdGettime) s.Table[88] = syscalls.Supported("utimensat", Utimensat) + s.Table[180] = syscalls.Supported("mq_open", MqOpen) + s.Table[181] = syscalls.Supported("mq_unlink", MqUnlink) s.Table[198] = syscalls.Supported("socket", Socket) s.Table[199] = syscalls.Supported("socketpair", SocketPair) s.Table[200] = syscalls.Supported("bind", Bind) @@ -271,6 +275,5 @@ func Override() { s.Table[287] = syscalls.Supported("pwritev2", Pwritev2) s.Table[291] = syscalls.Supported("statx", Statx) s.Table[441] = syscalls.Supported("epoll_pwait2", EpollPwait2) - s.Init() } diff --git a/test/syscalls/linux/mq.cc b/test/syscalls/linux/mq.cc index 9a4a87dfe..b3cd6d1b9 100644 --- a/test/syscalls/linux/mq.cc +++ b/test/syscalls/linux/mq.cc @@ -121,14 +121,14 @@ PosixError MqClose(mqd_t fd) { // Test simple opening and closing of a message queue. TEST(MqTest, Open) { - GTEST_SKIP(); + SKIP_IF(IsRunningWithVFS1()); EXPECT_THAT(MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr), IsPosixErrorOkAndHolds(_)); } // Test mq_open(2) after mq_unlink(2). TEST(MqTest, OpenAfterUnlink) { - GTEST_SKIP(); + SKIP_IF(IsRunningWithVFS1()); PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr)); @@ -140,7 +140,7 @@ TEST(MqTest, OpenAfterUnlink) { // Test using invalid args with mq_open. TEST(MqTest, OpenInvalidArgs) { - GTEST_SKIP(); + SKIP_IF(IsRunningWithVFS1()); // Name must start with a slash. EXPECT_THAT(MqOpen("test", O_RDWR), PosixErrorIs(EINVAL)); @@ -180,7 +180,7 @@ TEST(MqTest, OpenInvalidArgs) { // Test creating a queue that already exists. TEST(MqTest, CreateAlreadyExists) { - GTEST_SKIP(); + SKIP_IF(IsRunningWithVFS1()); PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr)); @@ -191,7 +191,7 @@ TEST(MqTest, CreateAlreadyExists) { // Test opening a queue that doesn't exists. TEST(MqTest, NoQueueExists) { - GTEST_SKIP(); + SKIP_IF(IsRunningWithVFS1()); // Choose a name to pass that's unlikely to exist if the test is run locally. EXPECT_THAT(MqOpen("/gvisor-mq-test-nonexistent-queue", O_RDWR), @@ -236,8 +236,8 @@ TEST(MqTest, OpenWriteAccess) { // Test changing IPC namespace. TEST(MqTest, ChangeIpcNamespace) { - GTEST_SKIP(); - SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + SKIP_IF(IsRunningWithVFS1() || + !ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); // When changing IPC namespaces, Linux doesn't invalidate or close the // previously opened file descriptions and allows operations to be performed @@ -267,7 +267,6 @@ TEST(MqTest, ChangeIpcNamespace) { // Test mounting the mqueue filesystem. TEST(MqTest, Mount) { - GTEST_SKIP(); SKIP_IF(IsRunningWithVFS1() || !ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); @@ -280,7 +279,6 @@ TEST(MqTest, Mount) { // Test mounting the mqueue filesystem to several places. TEST(MqTest, MountSeveral) { - GTEST_SKIP(); SKIP_IF(IsRunningWithVFS1() || !ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); @@ -299,7 +297,6 @@ TEST(MqTest, MountSeveral) { // Test mounting mqueue and opening a queue as normal file. TEST(MqTest, OpenAsFile) { - GTEST_SKIP(); SKIP_IF(IsRunningWithVFS1() || !ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); @@ -328,7 +325,6 @@ TEST(MqTest, OpenAsFile) { // Test removing a queue using unlink(2). TEST(MqTest, UnlinkAsFile) { - GTEST_SKIP(); SKIP_IF(IsRunningWithVFS1() || !ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); @@ -349,7 +345,7 @@ TEST(MqTest, UnlinkAsFile) { // Test read(2) from an empty queue. TEST(MqTest, ReadEmpty) { - GTEST_SKIP(); + SKIP_IF(IsRunningWithVFS1()); PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr)); @@ -368,7 +364,7 @@ TEST(MqTest, ReadEmpty) { // Test poll(2) on an empty queue. TEST(MqTest, PollEmpty) { - GTEST_SKIP(); + SKIP_IF(IsRunningWithVFS1()); PosixQueue queue = ASSERT_NO_ERRNO_AND_VALUE( MqOpen(O_RDWR | O_CREAT | O_EXCL, 0777, nullptr)); -- cgit v1.2.3