diff options
author | Rahat Mahmood <rahat@google.com> | 2019-03-26 16:15:55 -0700 |
---|---|---|
committer | Shentubot <shentubot@google.com> | 2019-03-26 16:16:57 -0700 |
commit | 06ec97a3f823f1f5d928fc9c2beb3a11c2c88487 (patch) | |
tree | a2b501718c82aede761d7235527492782ef65cc2 /pkg/sentry/fs/tmpfs | |
parent | 79aca14a0cd70720e8a8f8bd6c1499ab1ffbd8d3 (diff) |
Implement memfd_create.
Memfds are simply anonymous tmpfs files with no associated
mounts. Also implementing file seals, which Linux only implements for
memfds at the moment.
PiperOrigin-RevId: 240450031
Change-Id: I31de78b950101ae8d7a13d0e93fe52d98ea06f2f
Diffstat (limited to 'pkg/sentry/fs/tmpfs')
-rw-r--r-- | pkg/sentry/fs/tmpfs/inode_file.go | 151 |
1 files changed, 151 insertions, 0 deletions
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index 25bf2b9dd..7c80d711b 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -15,10 +15,12 @@ package tmpfs import ( + "fmt" "io" "sync" "time" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/metric" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" @@ -29,6 +31,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" ) var ( @@ -42,6 +45,8 @@ var ( // These files are backed by pages allocated from a platform.Memory, and may be // directly mapped. // +// Lock order: attrMu -> mapsMu -> dataMu. +// // +stateify savable type fileInodeOperations struct { fsutil.InodeGenericChecker `state:"nosave"` @@ -74,6 +79,17 @@ type fileInodeOperations struct { // mappings is protected by mapsMu. mappings memmap.MappingSet + // writableMappingPages tracks how many pages of virtual memory are mapped + // as potentially writable from this file. If a page has multiple mappings, + // each mapping is counted separately. + // + // This counter is susceptible to overflow as we can potentially count + // mappings from many VMAs. We count pages rather than bytes to slightly + // mitigate this. + // + // Protected by mapsMu. + writableMappingPages uint64 + dataMu sync.RWMutex `state:"nosave"` // data maps offsets into the file to offsets into platform.Memory() that @@ -81,6 +97,11 @@ type fileInodeOperations struct { // // data is protected by dataMu. data fsutil.FileRangeSet + + // seals represents file seals on this inode. + // + // Protected by dataMu. 
+ seals uint32 } var _ fs.InodeOperations = (*fileInodeOperations)(nil) @@ -91,9 +112,30 @@ func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.Unsta attr: uattr, kernel: kernel.KernelFromContext(ctx), memUsage: usage, + seals: linux.F_SEAL_SEAL, } } +// NewMemfdInode creates a new inode backing a memfd. Memory used by the memfd +// is backed by platform memory. +func NewMemfdInode(ctx context.Context, allowSeals bool) *fs.Inode { + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with + // S_IRWXUGO. + perms := fs.PermMask{Read: true, Write: true, Execute: true} + iops := NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{ + Owner: fs.FileOwnerFromContext(ctx), + Perms: fs.FilePermissions{User: perms, Group: perms, Other: perms}}).(*fileInodeOperations) + if allowSeals { + iops.seals = 0 + } + return fs.NewInode(iops, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + Type: fs.RegularFile, + DeviceID: tmpfsDevice.DeviceID(), + InodeID: tmpfsDevice.NextIno(), + BlockSize: usermem.PageSize, + }) +} + // Release implements fs.InodeOperations.Release. func (f *fileInodeOperations) Release(context.Context) { f.dataMu.Lock() @@ -170,6 +212,16 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size in f.dataMu.Lock() oldSize := f.attr.Size + + // Check if current seals allow truncation. + switch { + case size > oldSize && f.seals&linux.F_SEAL_GROW != 0: // Grow sealed + fallthrough + case oldSize > size && f.seals&linux.F_SEAL_SHRINK != 0: // Shrink sealed + f.dataMu.Unlock() + return syserror.EPERM + } + if oldSize != size { f.attr.Size = size // Update mtime and ctime. @@ -370,6 +422,34 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) return 0, nil } + // Check if seals prevent either file growth or all writes. 
+ switch { + case rw.f.seals&linux.F_SEAL_WRITE != 0: // Write sealed + return 0, syserror.EPERM + case end > rw.f.attr.Size && rw.f.seals&linux.F_SEAL_GROW != 0: // Grow sealed + // When growth is sealed, Linux effectively allows writes which would + // normally grow the file to partially succeed up to the current EOF, + // rounded down to the page boundary before the EOF. + // + // This happens because writes (and thus the growth check) for tmpfs + // files proceed page-by-page on Linux, and the final write to the page + // containing EOF fails, resulting in a partial write up to the start of + // that page. + // + // To emulate this behaviour, artificially truncate the write to the + // start of the page containing the current EOF. + // + // See Linux, mm/filemap.c:generic_perform_write() and + // mm/shmem.c:shmem_write_begin(). + if pgstart := int64(usermem.Addr(rw.f.attr.Size).RoundDown()); end > pgstart { + end = pgstart + } + if end <= rw.offset { + // Truncation would result in no data being written. + return 0, syserror.EPERM + } + } + defer func() { // If the write ends beyond the file's previous size, it causes the // file to grow. @@ -431,7 +511,27 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { f.mapsMu.Lock() defer f.mapsMu.Unlock() + + f.dataMu.RLock() + defer f.dataMu.RUnlock() + + // Reject writable mapping if F_SEAL_WRITE is set. + if f.seals&linux.F_SEAL_WRITE != 0 && writable { + return syserror.EPERM + } + f.mappings.AddMapping(ms, ar, offset, writable) + if writable { + pagesBefore := f.writableMappingPages + + // ar is guaranteed to be page aligned per memmap.Mappable. 
+ f.writableMappingPages += uint64(ar.Length() / usermem.PageSize) + + if f.writableMappingPages < pagesBefore { + panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, f.writableMappingPages)) + } + } + return nil } @@ -439,7 +539,19 @@ func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingS func (f *fileInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { f.mapsMu.Lock() defer f.mapsMu.Unlock() + f.mappings.RemoveMapping(ms, ar, offset, writable) + + if writable { + pagesBefore := f.writableMappingPages + + // ar is guaranteed to be page aligned per memmap.Mappable. + f.writableMappingPages -= uint64(ar.Length() / usermem.PageSize) + + if f.writableMappingPages > pagesBefore { + panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, f.writableMappingPages)) + } + } } // CopyMapping implements memmap.Mappable.CopyMapping. @@ -501,3 +613,42 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional func (f *fileInodeOperations) InvalidateUnsavable(ctx context.Context) error { return nil } + +// GetSeals returns the current set of seals on a memfd inode. +func GetSeals(inode *fs.Inode) (uint32, error) { + if f, ok := inode.InodeOperations.(*fileInodeOperations); ok { + f.dataMu.RLock() + defer f.dataMu.RUnlock() + return f.seals, nil + } + // Not a memfd inode. + return 0, syserror.EINVAL +} + +// AddSeals adds new file seals to a memfd inode. +func AddSeals(inode *fs.Inode, val uint32) error { + if f, ok := inode.InodeOperations.(*fileInodeOperations); ok { + f.mapsMu.Lock() + defer f.mapsMu.Unlock() + f.dataMu.Lock() + defer f.dataMu.Unlock() + + if f.seals&linux.F_SEAL_SEAL != 0 { + // Seal applied which prevents addition of any new seals. 
+ return syserror.EPERM + } + + // F_SEAL_WRITE can only be added if there are no active writable maps. + if f.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 { + if f.writableMappingPages > 0 { + return syserror.EBUSY + } + } + + // Seals can only be added, never removed. + f.seals |= val + return nil + } + // Not a memfd inode. + return syserror.EINVAL +} |