summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRahat Mahmood <rahat@google.com>2019-03-26 16:15:55 -0700
committerShentubot <shentubot@google.com>2019-03-26 16:16:57 -0700
commit06ec97a3f823f1f5d928fc9c2beb3a11c2c88487 (patch)
treea2b501718c82aede761d7235527492782ef65cc2
parent79aca14a0cd70720e8a8f8bd6c1499ab1ffbd8d3 (diff)
Implement memfd_create.
Memfds are simply anonymous tmpfs files with no associated mounts. Also implementing file seals, which Linux only implements for memfds at the moment. PiperOrigin-RevId: 240450031 Change-Id: I31de78b950101ae8d7a13d0e93fe52d98ea06f2f
-rw-r--r--pkg/abi/linux/file.go18
-rw-r--r--pkg/sentry/fs/tmpfs/inode_file.go151
-rw-r--r--pkg/sentry/syscalls/linux/BUILD1
-rw-r--r--pkg/sentry/syscalls/linux/linux64.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_file.go59
-rw-r--r--test/syscalls/linux/BUILD17
-rw-r--r--test/syscalls/linux/memfd.cc546
7 files changed, 793 insertions, 1 deletions
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index e5a51a9fd..46b10ca97 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -236,3 +236,21 @@ var fileType = abi.ValueSet{
ModeCharacterDevice: "S_IFCHR",
ModeNamedPipe: "S_IFIFO",
}
+
+// Constants for memfd_create(2). Source: include/uapi/linux/memfd.h
+const (
+ MFD_CLOEXEC = 0x0001
+ MFD_ALLOW_SEALING = 0x0002
+)
+
+// Constants related to file seals. Source: include/uapi/{asm-generic,linux}/fcntl.h
+const (
+ F_LINUX_SPECIFIC_BASE = 1024
+ F_ADD_SEALS = F_LINUX_SPECIFIC_BASE + 9
+ F_GET_SEALS = F_LINUX_SPECIFIC_BASE + 10
+
+ F_SEAL_SEAL = 0x0001 // Prevent further seals from being set.
+ F_SEAL_SHRINK = 0x0002 // Prevent file from shrinking.
+ F_SEAL_GROW = 0x0004 // Prevent file from growing.
+ F_SEAL_WRITE = 0x0008 // Prevent writes.
+)
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index 25bf2b9dd..7c80d711b 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -15,10 +15,12 @@
package tmpfs
import (
+ "fmt"
"io"
"sync"
"time"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/metric"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
@@ -29,6 +31,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
)
var (
@@ -42,6 +45,8 @@ var (
// These files are backed by pages allocated from a platform.Memory, and may be
// directly mapped.
//
+// Lock order: attrMu -> mapsMu -> dataMu.
+//
// +stateify savable
type fileInodeOperations struct {
fsutil.InodeGenericChecker `state:"nosave"`
@@ -74,6 +79,17 @@ type fileInodeOperations struct {
// mappings is protected by mapsMu.
mappings memmap.MappingSet
+ // writableMappingPages tracks how many pages of virtual memory are mapped
+ // as potentially writable from this file. If a page has multiple mappings,
+ // each mapping is counted separately.
+ //
+ // This counter is susceptible to overflow as we can potentially count
+ // mappings from many VMAs. We count pages rather than bytes to slightly
+ // mitigate this.
+ //
+ // Protected by mapsMu.
+ writableMappingPages uint64
+
dataMu sync.RWMutex `state:"nosave"`
// data maps offsets into the file to offsets into platform.Memory() that
@@ -81,6 +97,11 @@ type fileInodeOperations struct {
//
// data is protected by dataMu.
data fsutil.FileRangeSet
+
+ // seals represents file seals on this inode.
+ //
+ // Protected by dataMu.
+ seals uint32
}
var _ fs.InodeOperations = (*fileInodeOperations)(nil)
@@ -91,9 +112,30 @@ func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.Unsta
attr: uattr,
kernel: kernel.KernelFromContext(ctx),
memUsage: usage,
+ seals: linux.F_SEAL_SEAL,
}
}
+// NewMemfdInode builds the inode that backs a memfd. The file contents live in
+// platform memory and are accounted as usage.Tmpfs. When allowSeals is true
+// the inode starts with an empty seal set; otherwise it keeps the seal set
+// that NewInMemoryFile installs.
+func NewMemfdInode(ctx context.Context, allowSeals bool) *fs.Inode {
+	// Linux's mm/shmem.c:__shmem_file_setup() creates memfd inodes with
+	// S_IRWXUGO, i.e. rwx for user, group and other.
+	rwx := fs.PermMask{Read: true, Write: true, Execute: true}
+	uattr := fs.UnstableAttr{
+		Owner: fs.FileOwnerFromContext(ctx),
+		Perms: fs.FilePermissions{User: rwx, Group: rwx, Other: rwx},
+	}
+
+	memfdOps := NewInMemoryFile(ctx, usage.Tmpfs, uattr).(*fileInodeOperations)
+	if allowSeals {
+		// Start with no seals so that seals can be added later.
+		memfdOps.seals = 0
+	}
+
+	msrc := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{})
+	sattr := fs.StableAttr{
+		Type:      fs.RegularFile,
+		DeviceID:  tmpfsDevice.DeviceID(),
+		InodeID:   tmpfsDevice.NextIno(),
+		BlockSize: usermem.PageSize,
+	}
+	return fs.NewInode(memfdOps, msrc, sattr)
+}
+
// Release implements fs.InodeOperations.Release.
func (f *fileInodeOperations) Release(context.Context) {
f.dataMu.Lock()
@@ -170,6 +212,16 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size in
f.dataMu.Lock()
oldSize := f.attr.Size
+
+ // Check if current seals allow truncation.
+ switch {
+ case size > oldSize && f.seals&linux.F_SEAL_GROW != 0: // Grow sealed
+ fallthrough
+ case oldSize > size && f.seals&linux.F_SEAL_SHRINK != 0: // Shrink sealed
+ f.dataMu.Unlock()
+ return syserror.EPERM
+ }
+
if oldSize != size {
f.attr.Size = size
// Update mtime and ctime.
@@ -370,6 +422,34 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error)
return 0, nil
}
+ // Check if seals prevent either file growth or all writes.
+ switch {
+ case rw.f.seals&linux.F_SEAL_WRITE != 0: // Write sealed
+ return 0, syserror.EPERM
+ case end > rw.f.attr.Size && rw.f.seals&linux.F_SEAL_GROW != 0: // Grow sealed
+ // When growth is sealed, Linux effectively allows writes which would
+ // normally grow the file to partially succeed up to the current EOF,
+ // rounded down to the page boundary before the EOF.
+ //
+ // This happens because writes (and thus the growth check) for tmpfs
+ // files proceed page-by-page on Linux, and the final write to the page
+ // containing EOF fails, resulting in a partial write up to the start of
+ // that page.
+ //
+ // To emulate this behaviour, artificially truncate the write to the
+ // start of the page containing the current EOF.
+ //
+ // See Linux, mm/filemap.c:generic_perform_write() and
+ // mm/shmem.c:shmem_write_begin().
+ if pgstart := int64(usermem.Addr(rw.f.attr.Size).RoundDown()); end > pgstart {
+ end = pgstart
+ }
+ if end <= rw.offset {
+ // Truncation would result in no data being written.
+ return 0, syserror.EPERM
+ }
+ }
+
defer func() {
// If the write ends beyond the file's previous size, it causes the
// file to grow.
@@ -431,7 +511,27 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error)
func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
f.mapsMu.Lock()
defer f.mapsMu.Unlock()
+
+ // seals is protected by dataMu. It is only read here, so a read lock
+ // suffices. dataMu is acquired after mapsMu.
+ f.dataMu.RLock()
+ defer f.dataMu.RUnlock()
+
+ // Reject writable mapping if F_SEAL_WRITE is set.
+ if f.seals&linux.F_SEAL_WRITE != 0 && writable {
+ return syserror.EPERM
+ }
+
f.mappings.AddMapping(ms, ar, offset, writable)
+ if writable {
+ // Account for the newly mapped pages in the writable-page counter.
+ pagesBefore := f.writableMappingPages
+
+ // ar is guaranteed to be page aligned per memmap.Mappable.
+ f.writableMappingPages += uint64(ar.Length() / usermem.PageSize)
+
+ // Unsigned addition wraps around, so a result smaller than the previous
+ // value means the counter overflowed.
+ if f.writableMappingPages < pagesBefore {
+ panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, f.writableMappingPages))
+ }
+ }
+
return nil
}
@@ -439,7 +539,19 @@ func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingS
func (f *fileInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
f.mapsMu.Lock()
defer f.mapsMu.Unlock()
+
f.mappings.RemoveMapping(ms, ar, offset, writable)
+
+ if writable {
+ // Remove the unmapped pages from the writable-page counter.
+ pagesBefore := f.writableMappingPages
+
+ // ar is guaranteed to be page aligned per memmap.Mappable.
+ f.writableMappingPages -= uint64(ar.Length() / usermem.PageSize)
+
+ // Unsigned subtraction wraps around, so a result larger than the
+ // previous value means more pages were removed than were counted.
+ if f.writableMappingPages > pagesBefore {
+ panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, f.writableMappingPages))
+ }
+ }
}
// CopyMapping implements memmap.Mappable.CopyMapping.
@@ -501,3 +613,42 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional
func (f *fileInodeOperations) InvalidateUnsavable(ctx context.Context) error {
return nil
}
+
+// GetSeals reports the seals currently set on inode. It fails with EINVAL when
+// inode is not backed by this package's in-memory file implementation.
+func GetSeals(inode *fs.Inode) (uint32, error) {
+	memFile, isMemFile := inode.InodeOperations.(*fileInodeOperations)
+	if !isMemFile {
+		// Not a memfd inode.
+		return 0, syserror.EINVAL
+	}
+
+	// seals is protected by dataMu; a read lock is enough for a snapshot.
+	memFile.dataMu.RLock()
+	current := memFile.seals
+	memFile.dataMu.RUnlock()
+	return current, nil
+}
+
+// AddSeals adds new file seals to a memfd inode.
+func AddSeals(inode *fs.Inode, val uint32) error {
+ if f, ok := inode.InodeOperations.(*fileInodeOperations); ok {
+ f.mapsMu.Lock()
+ defer f.mapsMu.Unlock()
+ f.dataMu.Lock()
+ defer f.dataMu.Unlock()
+
+ if f.seals&linux.F_SEAL_SEAL != 0 {
+ // Seal applied which prevents addition of any new seals.
+ return syserror.EPERM
+ }
+
+ // F_SEAL_WRITE can only be added if there are no active writable maps.
+ if f.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 {
+ if f.writableMappingPages > 0 {
+ return syserror.EBUSY
+ }
+ }
+
+ // Seals can only be added, never removed.
+ f.seals |= val
+ return nil
+ }
+ // Not a memfd inode.
+ return syserror.EINVAL
+}
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 846601881..6e2843b36 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -63,6 +63,7 @@ go_library(
"//pkg/sentry/fs/anon",
"//pkg/sentry/fs/lock",
"//pkg/sentry/fs/timerfd",
+ "//pkg/sentry/fs/tmpfs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/epoll",
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index e855590e6..888b5aa9f 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -367,7 +367,7 @@ var AMD64 = &kernel.SyscallTable{
// 316: Renameat2, TODO
317: Seccomp,
318: GetRandom,
- // 319: MemfdCreate, TODO
+ 319: MemfdCreate,
320: syscalls.CapError(linux.CAP_SYS_BOOT), // KexecFileLoad, infeasible to support
321: syscalls.CapError(linux.CAP_SYS_ADMIN), // Bpf, requires cap_sys_admin for all commands
// 322: Execveat, TODO
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index cf6fdc190..3193718b5 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -23,6 +23,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/fasync"
@@ -933,6 +934,15 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.F_SETOWN:
fSetOwn(t, file, args[2].Int())
return 0, nil, nil
+ case linux.F_GET_SEALS:
+ val, err := tmpfs.GetSeals(file.Dirent.Inode)
+ return uintptr(val), nil, err
+ case linux.F_ADD_SEALS:
+ if !file.Flags().Write {
+ return 0, nil, syserror.EPERM
+ }
+ err := tmpfs.AddSeals(file.Dirent.Inode, args[2].Uint())
+ return 0, nil, err
default:
// Everything else is not yet supported.
return 0, nil, syserror.EINVAL
@@ -2066,3 +2076,52 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// arbitrarily.
return uintptr(n), nil, handleIOError(t, n != 0, err, kernel.ERESTARTSYS, "sendfile", inFile)
}
+
+const (
+ memfdPrefix = "/memfd:"
+ memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING)
+ memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + 1
+)
+
+// MemfdCreate implements the linux syscall memfd_create(2).
+//
+// args[0] is the user address of the name string and args[1] is the MFD_*
+// flag mask. On success the new file descriptor is returned. EINVAL is
+// returned for unknown flag bits or a name longer than memfdMaxNameLen;
+// errors from copying in the name, opening the file, or allocating the
+// descriptor are returned as-is.
+func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+	addr := args[0].Pointer()
+	flags := args[1].Uint()
+
+	if flags&^memfdAllFlags != 0 {
+		// Unknown bits in flags.
+		return 0, nil, syserror.EINVAL
+	}
+
+	allowSeals := flags&linux.MFD_ALLOW_SEALING != 0
+	cloExec := flags&linux.MFD_CLOEXEC != 0
+
+	name, err := t.CopyInString(addr, syscall.PathMax-len(memfdPrefix))
+	if err != nil {
+		return 0, nil, err
+	}
+	if len(name) > memfdMaxNameLen {
+		return 0, nil, syserror.EINVAL
+	}
+	name = memfdPrefix + name
+
+	inode := tmpfs.NewMemfdInode(t, allowSeals)
+	dirent := fs.NewDirent(inode, name)
+	// Release this function's dirent reference on every exit path, including
+	// the GetFile failure path below.
+	defer dirent.DecRef()
+
+	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
+	// FMODE_READ | FMODE_WRITE.
+	file, err := inode.GetFile(t, dirent, fs.FileFlags{Read: true, Write: true})
+	if err != nil {
+		return 0, nil, err
+	}
+	// Deferred after the dirent release, so the file reference is dropped
+	// first on return.
+	defer file.DecRef()
+
+	fdFlags := kernel.FDFlags{CloseOnExec: cloExec}
+	newFD, err := t.FDMap().NewFDFrom(0, file, fdFlags, t.ThreadGroup().Limits())
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return uintptr(newFD), nil, nil
+}
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 2c214925e..7dd63dd0a 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -3261,3 +3261,20 @@ cc_binary(
"@com_google_googletest//:gtest",
],
)
+
+cc_binary(
+ name = "memfd_test",
+ testonly = 1,
+ srcs = ["memfd.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "//test/util:memory_util",
+ "//test/util:multiprocess_util",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc
new file mode 100644
index 000000000..ccdddd4e5
--- /dev/null
+++ b/test/syscalls/linux/memfd.cc
@@ -0,0 +1,546 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/magic.h>
+#include <linux/memfd.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/statfs.h>
+#include <sys/syscall.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/memory_util.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+// The header sys/memfd.h isn't available on all systems, so redefining some of
+// the constants here.
+#define F_LINUX_SPECIFIC_BASE 1024
+#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
+#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
+#define F_SEAL_SEAL 0x0001
+#define F_SEAL_SHRINK 0x0002
+#define F_SEAL_GROW 0x0004
+#define F_SEAL_WRITE 0x0008
+
+using ::testing::StartsWith;
+
+const std::string kMemfdName = "some-memfd";
+
+int memfd_create(const std::string& name, unsigned int flags) {
+ return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+PosixErrorOr<FileDescriptor> MemfdCreate(const std::string& name, uint32_t flags) {
+ int fd = memfd_create(name, flags);
+ if (fd < 0) {
+ return PosixError(
+ errno, absl::StrFormat("memfd_create(\"%s\", %#x)", name, flags));
+ }
+ MaybeSave();
+ return FileDescriptor(fd);
+}
+
+// Procfs entries for memfds display the appropriate name.
+TEST(MemfdTest, Name) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0));
+ const std::string proc_name = ASSERT_NO_ERRNO_AND_VALUE(
+ ReadLink(absl::StrFormat("/proc/self/fd/%d", memfd.get())));
+ EXPECT_THAT(proc_name, StartsWith("/memfd:" + kMemfdName));
+}
+
+// Memfds support read/write syscalls.
+TEST(MemfdTest, WriteRead) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0));
+
+ // Write a random page of data to the memfd via write(2).
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Read back the same data and verify.
+ std::vector<char> buf2(kPageSize);
+ ASSERT_THAT(lseek(memfd.get(), 0, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(read(memfd.get(), buf2.data(), buf2.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(buf, buf2);
+}
+
+// Memfds can be mapped and used as usual.
+TEST(MemfdTest, Mmap) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0));
+ const Mapping m1 = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0));
+
+ // Write a random page of data to the memfd via mmap m1.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds());
+ memcpy(m1.ptr(), buf.data(), buf.size());
+
+ // Read the data back via a read syscall on the memfd.
+ std::vector<char> buf2(kPageSize);
+ EXPECT_THAT(read(memfd.get(), buf2.data(), buf2.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(buf, buf2);
+
+ // The same data should be accessible via a new mapping m2.
+ const Mapping m2 = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0));
+ EXPECT_EQ(0, memcmp(m1.ptr(), m2.ptr(), kPageSize));
+}
+
+TEST(MemfdTest, DuplicateFDsShareContent) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0));
+ const Mapping m1 = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0));
+ const FileDescriptor memfd2 = ASSERT_NO_ERRNO_AND_VALUE(memfd.Dup());
+
+ // Write a random page of data to the memfd via mmap m1.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds());
+ memcpy(m1.ptr(), buf.data(), buf.size());
+
+ // Read the data back via a read syscall on a duplicate fd.
+ std::vector<char> buf2(kPageSize);
+ EXPECT_THAT(read(memfd2.get(), buf2.data(), buf2.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(buf, buf2);
+}
+
+// File seals are disabled by default on memfds.
+TEST(MemfdTest, SealingDisabledByDefault) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0));
+ EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS),
+ SyscallSucceedsWithValue(F_SEAL_SEAL));
+ // Attempting to set any seal should fail.
+ EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE),
+ SyscallFailsWithErrno(EPERM));
+}
+
+// Seals can be retrieved and updated for memfds.
+TEST(MemfdTest, SealsGetSet) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ int seals;
+ ASSERT_THAT(seals = fcntl(memfd.get(), F_GET_SEALS), SyscallSucceeds());
+ // No seals are set yet.
+ EXPECT_EQ(0, seals);
+
+ // Set a seal and check that we can get it back.
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds());
+ EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS),
+ SyscallSucceedsWithValue(F_SEAL_WRITE));
+
+ // Set some more seals and verify.
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK),
+ SyscallSucceeds());
+ EXPECT_THAT(
+ fcntl(memfd.get(), F_GET_SEALS),
+ SyscallSucceedsWithValue(F_SEAL_WRITE | F_SEAL_GROW | F_SEAL_SHRINK));
+
+ // Attempting to set a seal that is already set is a no-op.
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds());
+ EXPECT_THAT(
+ fcntl(memfd.get(), F_GET_SEALS),
+ SyscallSucceedsWithValue(F_SEAL_WRITE | F_SEAL_GROW | F_SEAL_SHRINK));
+
+ // Add remaining seals and verify.
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_SEAL), SyscallSucceeds());
+ EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS),
+ SyscallSucceedsWithValue(F_SEAL_WRITE | F_SEAL_GROW |
+ F_SEAL_SHRINK | F_SEAL_SEAL));
+}
+
+// F_SEAL_GROW prevents a memfd from being grown using ftruncate.
+TEST(MemfdTest, SealGrowWithTruncate) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds());
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds());
+
+ // Try to grow the memfd by 1 page.
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize * 2),
+ SyscallFailsWithErrno(EPERM));
+
+ // Ftruncate calls that don't actually grow the memfd are allowed.
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds());
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize / 2), SyscallSucceeds());
+
+ // After shrinking, growing back is not allowed.
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallFailsWithErrno(EPERM));
+}
+
+// F_SEAL_GROW prevents a memfd from being grown using the write syscall.
+TEST(MemfdTest, SealGrowWithWrite) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+
+ // Initially, writing to the memfd succeeds.
+ const std::vector<char> buf(kPageSize);
+ EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Apply F_SEAL_GROW, subsequent writes which extend the memfd should fail.
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds());
+ EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallFailsWithErrno(EPERM));
+
+ // However, zero-length writes are ok since they don't grow the memfd.
+ EXPECT_THAT(write(memfd.get(), buf.data(), 0), SyscallSucceeds());
+
+ // Writing to existing parts of the memfd is also ok.
+ ASSERT_THAT(lseek(memfd.get(), 0, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Returning to the end of the file and writing is still not allowed.
+ EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallFailsWithErrno(EPERM));
+}
+
+// F_SEAL_GROW causes writes which partially extend off the current EOF to
+// partially succeed, up to the page containing the EOF.
+TEST(MemfdTest, SealGrowPartialWriteTruncated) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds());
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds());
+
+ // FD offset: 1 page, EOF: 1 page.
+
+ ASSERT_THAT(lseek(memfd.get(), kPageSize * 3 / 4, SEEK_SET),
+ SyscallSucceeds());
+
+ // FD offset: 3/4 page. Writing a full page now should only write 1/4 page
+ // worth of data. This partially succeeds because the first page is entirely
+ // within the file and requires no growth, but attempting to write the final
+ // 3/4 page would require growing the file.
+ const std::vector<char> buf(kPageSize);
+ EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize / 4));
+}
+
+// F_SEAL_GROW causes writes which partially extend off the current EOF to fail
+// in its entirety if the only data written would be to the page containing the
+// EOF.
+TEST(MemfdTest, SealGrowPartialWriteTruncatedSamePage) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize * 3 / 4), SyscallSucceeds());
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds());
+
+ // EOF: 3/4 page, writing 1/2 page starting at 1/2 page would cause the file
+ // to grow. Since this would require only the page containing the EOF to be
+ // modified, the write is rejected entirely.
+ const std::vector<char> buf(kPageSize / 2);
+ EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size(), kPageSize / 2),
+ SyscallFailsWithErrno(EPERM));
+
+ // However, writing up to EOF is fine.
+ EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size() / 2, kPageSize / 2),
+ SyscallSucceedsWithValue(kPageSize / 4));
+}
+
+// F_SEAL_SHRINK prevents a memfd from being shrunk using ftruncate.
+TEST(MemfdTest, SealShrink) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds());
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_SHRINK),
+ SyscallSucceeds());
+
+ // Shrink by half a page.
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize / 2),
+ SyscallFailsWithErrno(EPERM));
+
+ // Ftruncate calls that don't actually shrink the file are allowed.
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds());
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize * 2), SyscallSucceeds());
+
+ // After growing, shrinking is still not allowed.
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallFailsWithErrno(EPERM));
+}
+
+// F_SEAL_WRITE prevents a memfd from being written to through a write
+// syscall.
+TEST(MemfdTest, SealWriteWithWrite) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ const std::vector<char> buf(kPageSize);
+ ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds());
+
+ // Attempting to write at the end of the file fails.
+ EXPECT_THAT(write(memfd.get(), buf.data(), 1), SyscallFailsWithErrno(EPERM));
+
+ // Attempting to overwrite an existing part of the memfd fails.
+ EXPECT_THAT(pwrite(memfd.get(), buf.data(), 1, 0),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size() / 2, kPageSize / 2),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size(), kPageSize / 2),
+ SyscallFailsWithErrno(EPERM));
+
+ // Zero-length writes however do not fail.
+ EXPECT_THAT(write(memfd.get(), buf.data(), 0), SyscallSucceeds());
+}
+
+// F_SEAL_WRITE prevents a memfd from being written to through an mmap.
+TEST(MemfdTest, SealWriteWithMmap) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ const std::vector<char> buf(kPageSize);
+ ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds());
+
+ // Can't create a shared mapping with writes sealed.
+ void* ret = mmap(nullptr, kPageSize, PROT_WRITE, MAP_SHARED, memfd.get(), 0);
+ EXPECT_EQ(ret, MAP_FAILED);
+ EXPECT_EQ(errno, EPERM);
+ ret = mmap(nullptr, kPageSize, PROT_READ, MAP_SHARED, memfd.get(), 0);
+ EXPECT_EQ(ret, MAP_FAILED);
+ EXPECT_EQ(errno, EPERM);
+
+ // However, private mappings are ok.
+ EXPECT_NO_ERRNO(Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
+ memfd.get(), 0));
+}
+
+// Adding F_SEAL_WRITE fails when there are outstanding writable mappings to a
+// memfd.
+TEST(MemfdTest, SealWriteWithOutstandingWritbleMapping) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ const std::vector<char> buf(kPageSize);
+ ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Attempting to add F_SEAL_WRITE with active shared mapping with any set of
+ // permissions fails.
+
+ // Read-only shared mapping.
+ {
+ const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+ Mmap(nullptr, kPageSize, PROT_READ, MAP_SHARED, memfd.get(), 0));
+ EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE),
+ SyscallFailsWithErrno(EBUSY));
+ }
+
+ // Write-only shared mapping.
+ {
+ const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+ Mmap(nullptr, kPageSize, PROT_WRITE, MAP_SHARED, memfd.get(), 0));
+ EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE),
+ SyscallFailsWithErrno(EBUSY));
+ }
+
+ // Read-write shared mapping.
+ {
+ const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+ Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ memfd.get(), 0));
+ EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE),
+ SyscallFailsWithErrno(EBUSY));
+ }
+
+ // F_SEAL_WRITE can be set with private mappings with any permissions.
+ {
+ const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+ Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
+ memfd.get(), 0));
+ EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE),
+ SyscallSucceeds());
+ }
+}
+
+// When applying F_SEAL_WRITE fails due to outstanding writable mappings, any
+// additional seals passed to the same add seal call are also rejected.
+TEST(MemfdTest, NoPartialSealApplicationWhenWriteSealRejected) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0));
+
+ // Try to add some seals along with F_SEAL_WRITE. The seal application should
+ // fail since there exists an active shared mapping.
+ EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE | F_SEAL_GROW),
+ SyscallFailsWithErrno(EBUSY));
+
+ // None of the seals should be applied.
+ EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS), SyscallSucceedsWithValue(0));
+}
+
+// Seals are inode level properties, and apply to all file descriptors referring
+// to a memfd.
+TEST(MemfdTest, SealsAreInodeLevelProperties) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ const FileDescriptor memfd2 = ASSERT_NO_ERRNO_AND_VALUE(memfd.Dup());
+
+ // Add seal through the original memfd, and verify that it appears on the
+ // dupped fd.
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds());
+ EXPECT_THAT(fcntl(memfd2.get(), F_GET_SEALS),
+ SyscallSucceedsWithValue(F_SEAL_WRITE));
+
+ // Verify the seal actually applies to both fds.
+ std::vector<char> buf(kPageSize);
+ EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(write(memfd2.get(), buf.data(), buf.size()),
+ SyscallFailsWithErrno(EPERM));
+
+ // Seals are enforced on new FDs that are dupped after the seal is already
+ // applied.
+ const FileDescriptor memfd3 = ASSERT_NO_ERRNO_AND_VALUE(memfd2.Dup());
+ EXPECT_THAT(write(memfd3.get(), buf.data(), buf.size()),
+ SyscallFailsWithErrno(EPERM));
+
+ // Try a new seal applied to one of the dupped fds.
+ ASSERT_THAT(fcntl(memfd3.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds());
+ EXPECT_THAT(ftruncate(memfd.get(), kPageSize), SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(ftruncate(memfd2.get(), kPageSize), SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(ftruncate(memfd3.get(), kPageSize), SyscallFailsWithErrno(EPERM));
+}
+
+// Reports whether the filesystem containing path identifies itself as tmpfs,
+// i.e. statfs(2) reports an f_type of TMPFS_MAGIC. Returns false when nothing
+// exists at path (ENOENT), and a PosixError for any other statfs(2) failure.
+PosixErrorOr<bool> IsTmpfs(const std::string& path) {
+ struct statfs stat;
+ if (statfs(path.c_str(), &stat)) {
+ if (errno == ENOENT) {
+ // Nothing at path, don't raise this as an error. Instead, just report no
+ // tmpfs at path.
+ return false;
+ }
+ return PosixError(errno,
+ absl::StrFormat("statfs(\"%s\", %#p)", path, &stat));
+ }
+ return stat.f_type == TMPFS_MAGIC;
+}
+
+// Tmpfs files also support seals, but are created with F_SEAL_SEAL.
+TEST(MemfdTest, TmpfsFilesHaveSealSeal) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs("/tmp")));
+ const TempPath tmpfs_file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn("/tmp"));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfs_file.path(), O_RDWR, 0644));
+ EXPECT_THAT(fcntl(fd.get(), F_GET_SEALS),
+ SyscallSucceedsWithValue(F_SEAL_SEAL));
+}
+
+// Can open a memfd from procfs and use as normal.
+TEST(MemfdTest, CanOpenFromProcfs) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+
+ // Write a random page of data to the memfd via write(2).
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Read back the same data from the fd obtained from procfs and verify.
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(absl::StrFormat("/proc/self/fd/%d", memfd.get()), O_RDWR));
+ std::vector<char> buf2(kPageSize);
+ EXPECT_THAT(pread(fd.get(), buf2.data(), buf2.size(), 0),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(buf, buf2);
+}
+
+// Test that memfd permissions are set up correctly to allow another process to
+// open it from procfs.
+TEST(MemfdTest, OtherProcessCanOpenFromProcfs) {
+  const FileDescriptor memfd =
+      ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+  // Capture the parent's pid before forking so that the child opens the
+  // parent's fd table entry, /proc/<parent pid>/fd/<fd>.
+  pid_t pid = getpid();
+  const auto rest = [&] {
+    ASSERT_NO_ERRNO(
+        Open(absl::StrFormat("/proc/%d/fd/%d", pid, memfd.get()), O_RDWR));
+  };
+  EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+// Test that only files opened as writable can have seals applied to them.
+// Normally there's no way to specify file permissions on memfds, but we can
+// obtain a read-only memfd by opening the corresponding procfs fd entry as
+// read-only.
+TEST(MemfdTest, MemfdMustBeWritableToModifySeals) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+
+ // Initially adding a seal works.
+ EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds());
+
+ // Re-open the memfd as read-only from procfs.
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(absl::StrFormat("/proc/self/fd/%d", memfd.get()), O_RDONLY));
+
+ // Can't add seals through an unwritable fd.
+ EXPECT_THAT(fcntl(fd.get(), F_ADD_SEALS, F_SEAL_GROW),
+ SyscallFailsWithErrno(EPERM));
+}
+
+// Test that the memfd implementation internally tracks potentially writable
+// maps correctly.
+TEST(MemfdTest, MultipleWritableAndNonWritableRefsToSameFileRegion) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0));
+
+ // Populate with a random page of data.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Read-only map to the page. This should cause an initial mapping to be
+ // created.
+ Mapping m1 = ASSERT_NO_ERRNO_AND_VALUE(
+ Mmap(nullptr, kPageSize, PROT_READ, MAP_PRIVATE, memfd.get(), 0));
+
+ // Create a shared writable map to the page. This should cause the internal
+ // mapping to become potentially writable.
+ Mapping m2 = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0));
+
+ // Drop the read-only mapping first. If writable-ness isn't tracked correctly,
+ // this can cause some misaccounting, which can trigger asserts internally.
+ m1.reset();
+ m2.reset();
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor