summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fs
diff options
context:
space:
mode:
authorRahat Mahmood <rahat@google.com>2019-03-26 16:15:55 -0700
committerShentubot <shentubot@google.com>2019-03-26 16:16:57 -0700
commit06ec97a3f823f1f5d928fc9c2beb3a11c2c88487 (patch)
treea2b501718c82aede761d7235527492782ef65cc2 /pkg/sentry/fs
parent79aca14a0cd70720e8a8f8bd6c1499ab1ffbd8d3 (diff)
Implement memfd_create.
Memfds are simply anonymous tmpfs files with no associated mounts. Also implementing file seals, which Linux only implements for memfds at the moment. PiperOrigin-RevId: 240450031 Change-Id: I31de78b950101ae8d7a13d0e93fe52d98ea06f2f
Diffstat (limited to 'pkg/sentry/fs')
-rw-r--r--pkg/sentry/fs/tmpfs/inode_file.go151
1 files changed, 151 insertions, 0 deletions
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index 25bf2b9dd..7c80d711b 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -15,10 +15,12 @@
package tmpfs
import (
+ "fmt"
"io"
"sync"
"time"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/metric"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
@@ -29,6 +31,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
)
var (
@@ -42,6 +45,8 @@ var (
// These files are backed by pages allocated from a platform.Memory, and may be
// directly mapped.
//
+// Lock order: attrMu -> mapsMu -> dataMu.
+//
// +stateify savable
type fileInodeOperations struct {
fsutil.InodeGenericChecker `state:"nosave"`
@@ -74,6 +79,17 @@ type fileInodeOperations struct {
// mappings is protected by mapsMu.
mappings memmap.MappingSet
+ // writableMappingPages tracks how many pages of virtual memory are mapped
+ // as potentially writable from this file. If a page has multiple mappings,
+ // each mapping is counted separately.
+ //
+ // This counter is susceptible to overflow as we can potentially count
+ // mappings from many VMAs. We count pages rather than bytes to slightly
+ // mitigate this.
+ //
+ // Protected by mapsMu.
+ writableMappingPages uint64
+
dataMu sync.RWMutex `state:"nosave"`
// data maps offsets into the file to offsets into platform.Memory() that
@@ -81,6 +97,11 @@ type fileInodeOperations struct {
//
// data is protected by dataMu.
data fsutil.FileRangeSet
+
+ // seals represents file seals on this inode.
+ //
+ // Protected by dataMu.
+ seals uint32
}
var _ fs.InodeOperations = (*fileInodeOperations)(nil)
@@ -91,9 +112,30 @@ func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.Unsta
attr: uattr,
kernel: kernel.KernelFromContext(ctx),
memUsage: usage,
+ seals: linux.F_SEAL_SEAL,
}
}
+// NewMemfdInode creates a new inode backing a memfd. Memory used by the memfd
+// is backed by platform memory.
+func NewMemfdInode(ctx context.Context, allowSeals bool) *fs.Inode {
+ // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
+ // S_IRWXUGO.
+ perms := fs.PermMask{Read: true, Write: true, Execute: true}
+ iops := NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{
+ Owner: fs.FileOwnerFromContext(ctx),
+ Perms: fs.FilePermissions{User: perms, Group: perms, Other: perms}}).(*fileInodeOperations)
+ if allowSeals {
+ iops.seals = 0
+ }
+ return fs.NewInode(iops, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{
+ Type: fs.RegularFile,
+ DeviceID: tmpfsDevice.DeviceID(),
+ InodeID: tmpfsDevice.NextIno(),
+ BlockSize: usermem.PageSize,
+ })
+}
+
// Release implements fs.InodeOperations.Release.
func (f *fileInodeOperations) Release(context.Context) {
f.dataMu.Lock()
@@ -170,6 +212,16 @@ func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size in
f.dataMu.Lock()
oldSize := f.attr.Size
+
+ // Check if current seals allow truncation.
+ switch {
+ case size > oldSize && f.seals&linux.F_SEAL_GROW != 0: // Grow sealed
+ fallthrough
+ case oldSize > size && f.seals&linux.F_SEAL_SHRINK != 0: // Shrink sealed
+ f.dataMu.Unlock()
+ return syserror.EPERM
+ }
+
if oldSize != size {
f.attr.Size = size
// Update mtime and ctime.
@@ -370,6 +422,34 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error)
return 0, nil
}
+ // Check if seals prevent either file growth or all writes.
+ switch {
+ case rw.f.seals&linux.F_SEAL_WRITE != 0: // Write sealed
+ return 0, syserror.EPERM
+ case end > rw.f.attr.Size && rw.f.seals&linux.F_SEAL_GROW != 0: // Grow sealed
+ // When growth is sealed, Linux effectively allows writes which would
+ // normally grow the file to partially succeed up to the current EOF,
+ // rounded down to the page boundary before the EOF.
+ //
+ // This happens because writes (and thus the growth check) for tmpfs
+ // files proceed page-by-page on Linux, and the final write to the page
+ // containing EOF fails, resulting in a partial write up to the start of
+ // that page.
+ //
+ // To emulate this behaviour, artifically truncate the write to the
+ // start of the page containing the current EOF.
+ //
+ // See Linux, mm/filemap.c:generic_perform_write() and
+ // mm/shmem.c:shmem_write_begin().
+ if pgstart := int64(usermem.Addr(rw.f.attr.Size).RoundDown()); end > pgstart {
+ end = pgstart
+ }
+ if end <= rw.offset {
+ // Truncation would result in no data being written.
+ return 0, syserror.EPERM
+ }
+ }
+
defer func() {
// If the write ends beyond the file's previous size, it causes the
// file to grow.
@@ -431,7 +511,27 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error)
func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
f.mapsMu.Lock()
defer f.mapsMu.Unlock()
+
+ f.dataMu.RLock()
+ defer f.dataMu.RUnlock()
+
+ // Reject writable mapping if F_SEAL_WRITE is set.
+ if f.seals&linux.F_SEAL_WRITE != 0 && writable {
+ return syserror.EPERM
+ }
+
f.mappings.AddMapping(ms, ar, offset, writable)
+ if writable {
+ pagesBefore := f.writableMappingPages
+
+ // ar is guaranteed to be page aligned per memmap.Mappable.
+ f.writableMappingPages += uint64(ar.Length() / usermem.PageSize)
+
+ if f.writableMappingPages < pagesBefore {
+ panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, f.writableMappingPages))
+ }
+ }
+
return nil
}
@@ -439,7 +539,19 @@ func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingS
func (f *fileInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
f.mapsMu.Lock()
defer f.mapsMu.Unlock()
+
f.mappings.RemoveMapping(ms, ar, offset, writable)
+
+ if writable {
+ pagesBefore := f.writableMappingPages
+
+ // ar is guaranteed to be page aligned per memmap.Mappable.
+ f.writableMappingPages -= uint64(ar.Length() / usermem.PageSize)
+
+ if f.writableMappingPages > pagesBefore {
+ panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, f.writableMappingPages))
+ }
+ }
}
// CopyMapping implements memmap.Mappable.CopyMapping.
@@ -501,3 +613,42 @@ func (f *fileInodeOperations) Translate(ctx context.Context, required, optional
func (f *fileInodeOperations) InvalidateUnsavable(ctx context.Context) error {
return nil
}
+
+// GetSeals returns the current set of seals on a memfd inode.
+func GetSeals(inode *fs.Inode) (uint32, error) {
+ if f, ok := inode.InodeOperations.(*fileInodeOperations); ok {
+ f.dataMu.RLock()
+ defer f.dataMu.RUnlock()
+ return f.seals, nil
+ }
+ // Not a memfd inode.
+ return 0, syserror.EINVAL
+}
+
+// AddSeals adds new file seals to a memfd inode.
+func AddSeals(inode *fs.Inode, val uint32) error {
+ if f, ok := inode.InodeOperations.(*fileInodeOperations); ok {
+ f.mapsMu.Lock()
+ defer f.mapsMu.Unlock()
+ f.dataMu.Lock()
+ defer f.dataMu.Unlock()
+
+ if f.seals&linux.F_SEAL_SEAL != 0 {
+ // Seal applied which prevents addition of any new seals.
+ return syserror.EPERM
+ }
+
+ // F_SEAL_WRITE can only be added if there are no active writable maps.
+ if f.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 {
+ if f.writableMappingPages > 0 {
+ return syserror.EBUSY
+ }
+ }
+
+ // Seals can only be added, never removed.
+ f.seals |= val
+ return nil
+ }
+ // Not a memfd inode.
+ return syserror.EINVAL
+}