diff options
-rw-r--r-- | pkg/abi/linux/file.go | 18 | ||||
-rw-r--r-- | pkg/sentry/fs/tmpfs/inode_file.go | 151 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/BUILD | 1 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/linux64.go | 2 | ||||
-rw-r--r-- | pkg/sentry/syscalls/linux/sys_file.go | 59 | ||||
-rw-r--r-- | test/syscalls/linux/BUILD | 17 | ||||
-rw-r--r-- | test/syscalls/linux/memfd.cc | 546 |
7 files changed, 793 insertions, 1 deletions
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index e5a51a9fd..46b10ca97 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -236,3 +236,21 @@ var fileType = abi.ValueSet{ ModeCharacterDevice: "S_IFCHR", ModeNamedPipe: "S_IFIFO", } + +// Constants for memfd_create(2). Source: include/uapi/linux/memfd.h +const ( + MFD_CLOEXEC = 0x0001 + MFD_ALLOW_SEALING = 0x0002 +) + +// Constants related to file seals. Source: include/uapi/{asm-generic,linux}/fcntl.h +const ( + F_LINUX_SPECIFIC_BASE = 1024 + F_ADD_SEALS = F_LINUX_SPECIFIC_BASE + 9 + F_GET_SEALS = F_LINUX_SPECIFIC_BASE + 10 + + F_SEAL_SEAL = 0x0001 // Prevent further seals from being set. + F_SEAL_SHRINK = 0x0002 // Prevent file from shrinking. + F_SEAL_GROW = 0x0004 // Prevent file from growing. + F_SEAL_WRITE = 0x0008 // Prevent writes. +) diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 25bf2b9dd..7c80d711b 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -15,10 +15,12 @@ package tmpfs import ( + "fmt" "io" "sync" "time" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/metric" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -29,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" ) var ( @@ -42,6 +45,8 @@ var ( // These files are backed by pages allocated from a platform.Memory, and may be // directly mapped. // +// Lock order: attrMu -> mapsMu -> dataMu. +// // +stateify savable type fileInodeOperations struct { fsutil.InodeGenericChecker `state:"nosave"` @@ -74,6 +79,17 @@ type fileInodeOperations struct { // mappings is protected by mapsMu. 
mappings memmap.MappingSet + // writableMappingPages tracks how many pages of virtual memory are mapped + // as potentially writable from this file. If a page has multiple mappings, + // each mapping is counted separately. + // + // This counter is susceptible to overflow as we can potentially count + // mappings from many VMAs. We count pages rather than bytes to slightly + // mitigate this. + // + // Protected by mapsMu. + writableMappingPages uint64 + dataMu sync.RWMutex `state:"nosave"` // data maps offsets into the file to offsets into platform.Memory() that @@ -81,6 +97,11 @@ type fileInodeOperations struct { // // data is protected by dataMu. data fsutil.FileRangeSet + + // seals represents file seals on this inode. + // + // Protected by dataMu. + seals uint32 } var _ fs.InodeOperations = (*fileInodeOperations)(nil) @@ -91,9 +112,30 @@ func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.Unsta attr: uattr, kernel: kernel.KernelFromContext(ctx), memUsage: usage, + seals: linux.F_SEAL_SEAL, } } +// NewMemfdInode creates a new inode backing a memfd. Memory used by the memfd +// is backed by platform memory. +func NewMemfdInode(ctx context.Context, allowSeals bool) *fs.Inode { + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with + // S_IRWXUGO. + perms := fs.PermMask{Read: true, Write: true, Execute: true} + iops := NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{ + Owner: fs.FileOwnerFromContext(ctx), + Perms: fs.FilePermissions{User: perms, Group: perms, Other: perms}}).(*fileInodeOperations) + if allowSeals { + iops.seals = 0 + } + return fs.NewInode(iops, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + Type: fs.RegularFile, + DeviceID: tmpfsDevice.DeviceID(), + InodeID: tmpfsDevice.NextIno(), + BlockSize: usermem.PageSize, + }) +} + // Release implements fs.InodeOperations.Release. 
func (f *fileInodeOperations) Release(context.Context) { f.dataMu.Lock() @@ -170,6 +212,16 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size in f.dataMu.Lock() oldSize := f.attr.Size + + // Check if current seals allow truncation. + switch { + case size > oldSize && f.seals&linux.F_SEAL_GROW != 0: // Grow sealed + fallthrough + case oldSize > size && f.seals&linux.F_SEAL_SHRINK != 0: // Shrink sealed + f.dataMu.Unlock() + return syserror.EPERM + } + if oldSize != size { f.attr.Size = size // Update mtime and ctime. @@ -370,6 +422,34 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) return 0, nil } + // Check if seals prevent either file growth or all writes. + switch { + case rw.f.seals&linux.F_SEAL_WRITE != 0: // Write sealed + return 0, syserror.EPERM + case end > rw.f.attr.Size && rw.f.seals&linux.F_SEAL_GROW != 0: // Grow sealed + // When growth is sealed, Linux effectively allows writes which would + // normally grow the file to partially succeed up to the current EOF, + // rounded down to the page boundary before the EOF. + // + // This happens because writes (and thus the growth check) for tmpfs + // files proceed page-by-page on Linux, and the final write to the page + // containing EOF fails, resulting in a partial write up to the start of + // that page. + // + // To emulate this behaviour, artificially truncate the write to the + // start of the page containing the current EOF. + // + // See Linux, mm/filemap.c:generic_perform_write() and + // mm/shmem.c:shmem_write_begin(). + if pgstart := int64(usermem.Addr(rw.f.attr.Size).RoundDown()); end > pgstart { + end = pgstart + } + if end <= rw.offset { + // Truncation would result in no data being written. + return 0, syserror.EPERM + } + } + defer func() { // If the write ends beyond the file's previous size, it causes the // file to grow. 
@@ -431,7 +511,27 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { f.mapsMu.Lock() defer f.mapsMu.Unlock() + + f.dataMu.RLock() + defer f.dataMu.RUnlock() + + // Reject writable mapping if F_SEAL_WRITE is set. + if f.seals&linux.F_SEAL_WRITE != 0 && writable { + return syserror.EPERM + } + f.mappings.AddMapping(ms, ar, offset, writable) + if writable { + pagesBefore := f.writableMappingPages + + // ar is guaranteed to be page aligned per memmap.Mappable. + f.writableMappingPages += uint64(ar.Length() / usermem.PageSize) + + if f.writableMappingPages < pagesBefore { + panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, f.writableMappingPages)) + } + } + return nil } @@ -439,7 +539,19 @@ func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingS func (f *fileInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { f.mapsMu.Lock() defer f.mapsMu.Unlock() + f.mappings.RemoveMapping(ms, ar, offset, writable) + + if writable { + pagesBefore := f.writableMappingPages + + // ar is guaranteed to be page aligned per memmap.Mappable. + f.writableMappingPages -= uint64(ar.Length() / usermem.PageSize) + + if f.writableMappingPages > pagesBefore { + panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, f.writableMappingPages)) + } + } } // CopyMapping implements memmap.Mappable.CopyMapping. @@ -501,3 +613,42 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional func (f *fileInodeOperations) InvalidateUnsavable(ctx context.Context) error { return nil } + +// GetSeals returns the current set of seals on a memfd inode. 
+func GetSeals(inode *fs.Inode) (uint32, error) { + if f, ok := inode.InodeOperations.(*fileInodeOperations); ok { + f.dataMu.RLock() + defer f.dataMu.RUnlock() + return f.seals, nil + } + // Not a memfd inode. + return 0, syserror.EINVAL +} + +// AddSeals adds new file seals to a memfd inode. +func AddSeals(inode *fs.Inode, val uint32) error { + if f, ok := inode.InodeOperations.(*fileInodeOperations); ok { + f.mapsMu.Lock() + defer f.mapsMu.Unlock() + f.dataMu.Lock() + defer f.dataMu.Unlock() + + if f.seals&linux.F_SEAL_SEAL != 0 { + // Seal applied which prevents addition of any new seals. + return syserror.EPERM + } + + // F_SEAL_WRITE can only be added if there are no active writable maps. + if f.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 { + if f.writableMappingPages > 0 { + return syserror.EBUSY + } + } + + // Seals can only be added, never removed. + f.seals |= val + return nil + } + // Not a memfd inode. + return syserror.EINVAL +} diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 846601881..6e2843b36 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -63,6 +63,7 @@ go_library( "//pkg/sentry/fs/anon", "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", + "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/epoll", diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go index e855590e6..888b5aa9f 100644 --- a/pkg/sentry/syscalls/linux/linux64.go +++ b/pkg/sentry/syscalls/linux/linux64.go @@ -367,7 +367,7 @@ var AMD64 = &kernel.SyscallTable{ // 316: Renameat2, TODO 317: Seccomp, 318: GetRandom, - // 319: MemfdCreate, TODO + 319: MemfdCreate, 320: syscalls.CapError(linux.CAP_SYS_BOOT), // KexecFileLoad, infeasible to support 321: syscalls.CapError(linux.CAP_SYS_ADMIN), // Bpf, requires cap_sys_admin for all commands // 322: Execveat, TODO diff --git a/pkg/sentry/syscalls/linux/sys_file.go 
b/pkg/sentry/syscalls/linux/sys_file.go index cf6fdc190..3193718b5 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -23,6 +23,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/fasync" @@ -933,6 +934,15 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.F_SETOWN: fSetOwn(t, file, args[2].Int()) return 0, nil, nil + case linux.F_GET_SEALS: + val, err := tmpfs.GetSeals(file.Dirent.Inode) + return uintptr(val), nil, err + case linux.F_ADD_SEALS: + if !file.Flags().Write { + return 0, nil, syserror.EPERM + } + err := tmpfs.AddSeals(file.Dirent.Inode, args[2].Uint()) + return 0, nil, err default: // Everything else is not yet supported. return 0, nil, syserror.EINVAL @@ -2066,3 +2076,52 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // arbitrarily. return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "sendfile", inFile) } + +const ( + memfdPrefix = "/memfd:" + memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING) + memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + 1 +) + +// MemfdCreate implements the linux syscall memfd_create(2). +func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Uint() + + if flags&^memfdAllFlags != 0 { + // Unknown bits in flags. 
+ return 0, nil, syserror.EINVAL + } + + allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 + cloExec := flags&linux.MFD_CLOEXEC != 0 + + name, err := t.CopyInString(addr, syscall.PathMax-len(memfdPrefix)) + if err != nil { + return 0, nil, err + } + if len(name) > memfdMaxNameLen { + return 0, nil, syserror.EINVAL + } + name = memfdPrefix + name + + inode := tmpfs.NewMemfdInode(t, allowSeals) + dirent := fs.NewDirent(inode, name) + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with + // FMODE_READ | FMODE_WRITE. + file, err := inode.GetFile(t, dirent, fs.FileFlags{Read: true, Write: true}) + if err != nil { + return 0, nil, err + } + + defer dirent.DecRef() + defer file.DecRef() + + fdFlags := kernel.FDFlags{CloseOnExec: cloExec} + newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits()) + if err != nil { + return 0, nil, err + } + + return uintptr(newFD), nil, nil +} diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 2c214925e..7dd63dd0a 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -3261,3 +3261,20 @@ cc_binary( "@com_google_googletest//:gtest", ], ) + +cc_binary( + name = "memfd_test", + testonly = 1, + srcs = ["memfd.cc"], + linkstatic = 1, + deps = [ + "//test/util:file_descriptor", + "//test/util:fs_util", + "//test/util:memory_util", + "//test/util:multiprocess_util", + "//test/util:temp_path", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_googletest//:gtest", + ], +) diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc new file mode 100644 index 000000000..ccdddd4e5 --- /dev/null +++ b/test/syscalls/linux/memfd.cc @@ -0,0 +1,546 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <errno.h> +#include <fcntl.h> +#include <linux/magic.h> +#include <linux/memfd.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/statfs.h> +#include <sys/syscall.h> + +#include <vector> + +#include "gtest/gtest.h" +#include "test/util/file_descriptor.h" +#include "test/util/fs_util.h" +#include "test/util/memory_util.h" +#include "test/util/multiprocess_util.h" +#include "test/util/temp_path.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { +namespace { + +// The header sys/memfd.h isn't available on all systems, so redefining some of +// the constants here. +#define F_LINUX_SPECIFIC_BASE 1024 +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) +#define F_SEAL_SEAL 0x0001 +#define F_SEAL_SHRINK 0x0002 +#define F_SEAL_GROW 0x0004 +#define F_SEAL_WRITE 0x0008 + +using ::testing::StartsWith; + +const std::string kMemfdName = "some-memfd"; + +int memfd_create(const std::string& name, unsigned int flags) { + return syscall(__NR_memfd_create, name.c_str(), flags); +} + +PosixErrorOr<FileDescriptor> MemfdCreate(const std::string& name, uint32_t flags) { + int fd = memfd_create(name, flags); + if (fd < 0) { + return PosixError( + errno, absl::StrFormat("memfd_create(\"%s\", %#x)", name, flags)); + } + MaybeSave(); + return FileDescriptor(fd); +} + +// Procfs entries for memfds display the appropriate name. 
+TEST(MemfdTest, Name) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0)); + const std::string proc_name = ASSERT_NO_ERRNO_AND_VALUE( + ReadLink(absl::StrFormat("/proc/self/fd/%d", memfd.get()))); + EXPECT_THAT(proc_name, StartsWith("/memfd:" + kMemfdName)); +} + +// Memfds support read/write syscalls. +TEST(MemfdTest, WriteRead) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0)); + + // Write a random page of data to the memfd via write(2). + std::vector<char> buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Read back the same data and verify. + std::vector<char> buf2(kPageSize); + ASSERT_THAT(lseek(memfd.get(), 0, SEEK_SET), SyscallSucceeds()); + EXPECT_THAT(read(memfd.get(), buf2.data(), buf2.size()), + SyscallSucceedsWithValue(kPageSize)); + EXPECT_EQ(buf, buf2); +} + +// Memfds can be mapped and used as usual. +TEST(MemfdTest, Mmap) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0)); + const Mapping m1 = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0)); + + // Write a random page of data to the memfd via mmap m1. + std::vector<char> buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds()); + memcpy(m1.ptr(), buf.data(), buf.size()); + + // Read the data back via a read syscall on the memfd. + std::vector<char> buf2(kPageSize); + EXPECT_THAT(read(memfd.get(), buf2.data(), buf2.size()), + SyscallSucceedsWithValue(kPageSize)); + EXPECT_EQ(buf, buf2); + + // The same data should be accessible via a new mapping m2. 
+ const Mapping m2 = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0)); + EXPECT_EQ(0, memcmp(m1.ptr(), m2.ptr(), kPageSize)); +} + +TEST(MemfdTest, DuplicateFDsShareContent) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0)); + const Mapping m1 = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0)); + const FileDescriptor memfd2 = ASSERT_NO_ERRNO_AND_VALUE(memfd.Dup()); + + // Write a random page of data to the memfd via mmap m1. + std::vector<char> buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds()); + memcpy(m1.ptr(), buf.data(), buf.size()); + + // Read the data back via a read syscall on a duplicate fd. + std::vector<char> buf2(kPageSize); + EXPECT_THAT(read(memfd2.get(), buf2.data(), buf2.size()), + SyscallSucceedsWithValue(kPageSize)); + EXPECT_EQ(buf, buf2); +} + +// File seals are disabled by default on memfds. +TEST(MemfdTest, SealingDisabledByDefault) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0)); + EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS), + SyscallSucceedsWithValue(F_SEAL_SEAL)); + // Attempting to set any seal should fail. + EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), + SyscallFailsWithErrno(EPERM)); +} + +// Seals can be retrieved and updated for memfds. +TEST(MemfdTest, SealsGetSet) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + int seals; + ASSERT_THAT(seals = fcntl(memfd.get(), F_GET_SEALS), SyscallSucceeds()); + // No seals are set yet. + EXPECT_EQ(0, seals); + + // Set a seal and check that we can get it back. 
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds()); + EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS), + SyscallSucceedsWithValue(F_SEAL_WRITE)); + + // Set some more seals and verify. + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK), + SyscallSucceeds()); + EXPECT_THAT( + fcntl(memfd.get(), F_GET_SEALS), + SyscallSucceedsWithValue(F_SEAL_WRITE | F_SEAL_GROW | F_SEAL_SHRINK)); + + // Attempting to set a seal that is already set is a no-op. + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds()); + EXPECT_THAT( + fcntl(memfd.get(), F_GET_SEALS), + SyscallSucceedsWithValue(F_SEAL_WRITE | F_SEAL_GROW | F_SEAL_SHRINK)); + + // Add remaining seals and verify. + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_SEAL), SyscallSucceeds()); + EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS), + SyscallSucceedsWithValue(F_SEAL_WRITE | F_SEAL_GROW | + F_SEAL_SHRINK | F_SEAL_SEAL)); +} + +// F_SEAL_GROW prevents a memfd from being grown using ftruncate. +TEST(MemfdTest, SealGrowWithTruncate) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds()); + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds()); + + // Try grow the memfd by 1 page. + ASSERT_THAT(ftruncate(memfd.get(), kPageSize * 2), + SyscallFailsWithErrno(EPERM)); + + // Ftruncate calls that don't actually grow the memfd are allowed. + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds()); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize / 2), SyscallSucceeds()); + + // After shrinking, growing back is not allowed. + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallFailsWithErrno(EPERM)); +} + +// F_SEAL_GROW prevents a memfd from being grown using the write syscall. 
+TEST(MemfdTest, SealGrowWithWrite) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + + // Initially, writing to the memfd succeeds. + const std::vector<char> buf(kPageSize); + EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Apply F_SEAL_GROW, subsequent writes which extend the memfd should fail. + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds()); + EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallFailsWithErrno(EPERM)); + + // However, zero-length writes are ok since they don't grow the memfd. + EXPECT_THAT(write(memfd.get(), buf.data(), 0), SyscallSucceeds()); + + // Writing to existing parts of the memfd is also ok. + ASSERT_THAT(lseek(memfd.get(), 0, SEEK_SET), SyscallSucceeds()); + EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Now back at the end of the file, writing is still not allowed. + EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallFailsWithErrno(EPERM)); +} + +// F_SEAL_GROW causes writes which partially extend off the current EOF to +// partially succeed, up to the page containing the EOF. +TEST(MemfdTest, SealGrowPartialWriteTruncated) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds()); + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds()); + + // FD offset: 0 (ftruncate does not move the file offset), EOF: 1 page. + + ASSERT_THAT(lseek(memfd.get(), kPageSize * 3 / 4, SEEK_SET), + SyscallSucceeds()); + + // FD offset: 3/4 page. Writing a full page now should only write 1/4 page + // worth of data. This partially succeeds because the first page is entirely + // within the file and requires no growth, but attempting to write the final + // 3/4 page would require growing the file. 
+ const std::vector<char> buf(kPageSize); + EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize / 4)); +} + +// F_SEAL_GROW causes writes which partially extend off the current EOF to fail +// in its entirety if the only data written would be to the page containing the +// EOF. +TEST(MemfdTest, SealGrowPartialWriteTruncatedSamePage) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize * 3 / 4), SyscallSucceeds()); + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds()); + + // EOF: 3/4 page, writing 1/2 page starting at 1/2 page would cause the file + // to grow. Since this would require only the page containing the EOF to be + // modified, the write is rejected entirely. + const std::vector<char> buf(kPageSize / 2); + EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size(), kPageSize / 2), + SyscallFailsWithErrno(EPERM)); + + // However, writing up to EOF is fine. + EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size() / 2, kPageSize / 2), + SyscallSucceedsWithValue(kPageSize / 4)); +} + +// F_SEAL_SHRINK prevents a memfd from being shrunk using ftruncate. +TEST(MemfdTest, SealShrink) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds()); + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_SHRINK), + SyscallSucceeds()); + + // Shrink by half a page. + ASSERT_THAT(ftruncate(memfd.get(), kPageSize / 2), + SyscallFailsWithErrno(EPERM)); + + // Ftruncate calls that don't actually shrink the file are allowed. + ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds()); + ASSERT_THAT(ftruncate(memfd.get(), kPageSize * 2), SyscallSucceeds()); + + // After growing, shrinking is still not allowed. 
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallFailsWithErrno(EPERM)); +} + +// F_SEAL_WRITE prevents a memfd from being written to through a write +// syscall. +TEST(MemfdTest, SealWriteWithWrite) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + const std::vector<char> buf(kPageSize); + ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds()); + + // Attempting to write at the end of the file fails. + EXPECT_THAT(write(memfd.get(), buf.data(), 1), SyscallFailsWithErrno(EPERM)); + + // Attempting to overwrite an existing part of the memfd fails. + EXPECT_THAT(pwrite(memfd.get(), buf.data(), 1, 0), + SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size() / 2, kPageSize / 2), + SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size(), kPageSize / 2), + SyscallFailsWithErrno(EPERM)); + + // Zero-length writes however do not fail. + EXPECT_THAT(write(memfd.get(), buf.data(), 0), SyscallSucceeds()); +} + +// F_SEAL_WRITE prevents a memfd from being written to through an mmap. +TEST(MemfdTest, SealWriteWithMmap) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + const std::vector<char> buf(kPageSize); + ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds()); + + // Can't create a shared mapping with writes sealed. + void* ret = mmap(nullptr, kPageSize, PROT_WRITE, MAP_SHARED, memfd.get(), 0); + EXPECT_EQ(ret, MAP_FAILED); + EXPECT_EQ(errno, EPERM); + ret = mmap(nullptr, kPageSize, PROT_READ, MAP_SHARED, memfd.get(), 0); + EXPECT_EQ(ret, MAP_FAILED); + EXPECT_EQ(errno, EPERM); + + // However, private mappings are ok. 
+ EXPECT_NO_ERRNO(Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, + memfd.get(), 0)); +} + +// Adding F_SEAL_WRITE fails when there are outstanding writable mappings to a +// memfd. +TEST(MemfdTest, SealWriteWithOutstandingWritbleMapping) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + const std::vector<char> buf(kPageSize); + ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Attempting to add F_SEAL_WRITE with active shared mapping with any set of + // permissions fails. + + // Read-only shared mapping. + { + const Mapping m = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kPageSize, PROT_READ, MAP_SHARED, memfd.get(), 0)); + EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), + SyscallFailsWithErrno(EBUSY)); + } + + // Write-only shared mapping. + { + const Mapping m = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kPageSize, PROT_WRITE, MAP_SHARED, memfd.get(), 0)); + EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), + SyscallFailsWithErrno(EBUSY)); + } + + // Read-write shared mapping. + { + const Mapping m = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, + memfd.get(), 0)); + EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), + SyscallFailsWithErrno(EBUSY)); + } + + // F_SEAL_WRITE can be set with private mappings with any permissions. + { + const Mapping m = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, + memfd.get(), 0)); + EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), + SyscallSucceeds()); + } +} + +// When applying F_SEAL_WRITE fails due to outstanding writable mappings, any +// additional seals passed to the same add seal call are also rejected. 
+TEST(MemfdTest, NoPartialSealApplicationWhenWriteSealRejected) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0)); + + // Try add some seals along with F_SEAL_WRITE. The seal application should + // fail since there exists an active shared mapping. + EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE | F_SEAL_GROW), + SyscallFailsWithErrno(EBUSY)); + + // None of the seals should be applied. + EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS), SyscallSucceedsWithValue(0)); +} + +// Seals are inode level properties, and apply to all file descriptors referring +// to a memfd. +TEST(MemfdTest, SealsAreInodeLevelProperties) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + const FileDescriptor memfd2 = ASSERT_NO_ERRNO_AND_VALUE(memfd.Dup()); + + // Add seal through the original memfd, and verify that it appears on the + // dupped fd. + ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds()); + EXPECT_THAT(fcntl(memfd2.get(), F_GET_SEALS), + SyscallSucceedsWithValue(F_SEAL_WRITE)); + + // Verify the seal actually applies to both fds. + std::vector<char> buf(kPageSize); + EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(write(memfd2.get(), buf.data(), buf.size()), + SyscallFailsWithErrno(EPERM)); + + // Seals are enforced on new FDs that are dupped after the seal is already + // applied. + const FileDescriptor memfd3 = ASSERT_NO_ERRNO_AND_VALUE(memfd2.Dup()); + EXPECT_THAT(write(memfd3.get(), buf.data(), buf.size()), + SyscallFailsWithErrno(EPERM)); + + // Try a new seal applied to one of the dupped fds. 
+ ASSERT_THAT(fcntl(memfd3.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds()); + EXPECT_THAT(ftruncate(memfd.get(), kPageSize), SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(ftruncate(memfd2.get(), kPageSize), SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(ftruncate(memfd3.get(), kPageSize), SyscallFailsWithErrno(EPERM)); +} + +PosixErrorOr<bool> IsTmpfs(const std::string& path) { + struct statfs stat; + if (statfs(path.c_str(), &stat)) { + if (errno == ENOENT) { + // Nothing at path, don't raise this as an error. Instead, just report no + // tmpfs at path. + return false; + } + return PosixError(errno, + absl::StrFormat("statfs(\"%s\", %#p)", path, &stat)); + } + return stat.f_type == TMPFS_MAGIC; +} + +// Tmpfs files also support seals, but are created with F_SEAL_SEAL. +TEST(MemfdTest, TmpfsFilesHaveSealSeal) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs("/tmp"))); + const TempPath tmpfs_file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn("/tmp")); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfs_file.path(), O_RDWR, 0644)); + EXPECT_THAT(fcntl(fd.get(), F_GET_SEALS), + SyscallSucceedsWithValue(F_SEAL_SEAL)); +} + +// Can open a memfd from procfs and use as normal. +TEST(MemfdTest, CanOpenFromProcfs) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + + // Write a random page of data to the memfd via write(2). + std::vector<char> buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Read back the same data from the fd obtained from procfs and verify. 
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(absl::StrFormat("/proc/self/fd/%d", memfd.get()), O_RDWR)); + std::vector<char> buf2(kPageSize); + EXPECT_THAT(pread(fd.get(), buf2.data(), buf2.size(), 0), + SyscallSucceedsWithValue(kPageSize)); + EXPECT_EQ(buf, buf2); +} + +// Test that memfd permissions are set up correctly to allow another process to +// open it from procfs. +TEST(MemfdTest, OtherProcessCanOpenFromProcfs) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + pid_t pid = getpid(); + const auto rest = [&] { + ASSERT_NO_ERRNO( + Open(absl::StrFormat("/proc/self/%d/%d", pid, memfd.get()), O_RDWR)); + }; + EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0)); +} + +// Test that only files opened as writable can have seals applied to them. +// Normally there's no way to specify file permissions on memfds, but we can +// obtain a read-only memfd by opening the corresponding procfs fd entry as +// read-only. +TEST(MemfdTest, MemfdMustBeWritableToModifySeals) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING)); + + // Initially adding a seal works. + EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds()); + + // Re-open the memfd as read-only from procfs. + const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE( + Open(absl::StrFormat("/proc/self/fd/%d", memfd.get()), O_RDONLY)); + + // Can't add seals through an unwritable fd. + EXPECT_THAT(fcntl(fd.get(), F_ADD_SEALS, F_SEAL_GROW), + SyscallFailsWithErrno(EPERM)); +} + +// Test that the memfd implementation internally tracks potentially writable +// maps correctly. +TEST(MemfdTest, MultipleWritableAndNonWritableRefsToSameFileRegion) { + const FileDescriptor memfd = + ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0)); + + // Populate with a random page of data. 
+ std::vector<char> buf(kPageSize); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(kPageSize)); + + // Read-only map to the page. This should cause an initial mapping to be + // created. + Mapping m1 = ASSERT_NO_ERRNO_AND_VALUE( + Mmap(nullptr, kPageSize, PROT_READ, MAP_PRIVATE, memfd.get(), 0)); + + // Create a shared writable map to the page. This should cause the internal + // mapping to become potentially writable. + Mapping m2 = ASSERT_NO_ERRNO_AND_VALUE(Mmap( + nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0)); + + // Drop the read-only mapping first. If writable-ness isn't tracked correctly, + // this can cause some misaccounting, which can trigger asserts internally. + m1.reset(); + m2.reset(); +} + +} // namespace +} // namespace testing +} // namespace gvisor |