diff options
Diffstat (limited to 'pkg/sentry/fsimpl/tmpfs')
-rw-r--r-- | pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 8 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/tmpfs/device_file.go | 1 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/tmpfs/directory.go | 9 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/tmpfs/filesystem.go | 78 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/tmpfs/named_pipe.go | 5 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/tmpfs/pipe_test.go | 3 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/tmpfs/regular_file.go | 80 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/tmpfs/socket_file.go | 2 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/tmpfs/symlink.go | 1 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/tmpfs/tmpfs.go | 213 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/tmpfs/tmpfs_test.go | 3 |
11 files changed, 263 insertions, 140 deletions
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index d263147c2..3cc63e732 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -182,7 +182,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } @@ -193,6 +193,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { // Create nested directories with given depth. root := mntns.Root() + root.IncRef() defer root.DecRef(ctx) vd := root vd.IncRef() @@ -376,7 +377,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } @@ -387,6 +388,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { // Create the mount point. root := mntns.Root() + root.IncRef() defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, @@ -405,7 +407,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { } defer mountPoint.DecRef(ctx) // Create and mount the submount. - if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil { + if _, err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) } filePathBuilder.WriteString(mountPointName) diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go index ac54d420d..9129d35b7 100644 --- a/pkg/sentry/fsimpl/tmpfs/device_file.go +++ b/pkg/sentry/fsimpl/tmpfs/device_file.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" ) +// +stateify savable type deviceFile struct { inode inode kind vfs.DeviceKind diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 78b4fc5be..e90669cf0 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// +stateify savable type directory struct { // Since directories can't be hard-linked, each directory can only be // associated with a single dentry, which we can store in the directory @@ -44,7 +45,7 @@ type directory struct { // (with inode == nil) that represent the iteration position of // directoryFDs. childList is used to support directoryFD.IterDirents() // efficiently. childList is protected by iterMu. - iterMu sync.Mutex + iterMu sync.Mutex `state:"nosave"` childList dentryList } @@ -57,8 +58,9 @@ func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.Fi return dir } -// Preconditions: filesystem.mu must be locked for writing. dir must not -// already contain a child with the given name. +// Preconditions: +// * filesystem.mu must be locked for writing. +// * dir must not already contain a child with the given name. func (dir *directory) insertChildLocked(child *dentry, name string) { child.parent = &dir.dentry child.name = name @@ -85,6 +87,7 @@ func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error { return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid))) } +// +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index b0ec177e6..e39cd305b 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" ) // Sync implements vfs.FilesystemImpl.Sync. @@ -39,7 +38,9 @@ func (fs *filesystem) Sync(ctx context.Context) error { // // stepLocked is loosely analogous to fs/namei.c:walk_component(). // -// Preconditions: filesystem.mu must be locked. !rp.Done(). +// Preconditions: +// * filesystem.mu must be locked. +// * !rp.Done(). func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { dir, ok := d.inode.impl.(*directory) if !ok { @@ -97,7 +98,9 @@ afterSymlink: // walkParentDirLocked is loosely analogous to Linux's // fs/namei.c:path_parentat(). // -// Preconditions: filesystem.mu must be locked. !rp.Done(). +// Preconditions: +// * filesystem.mu must be locked. +// * !rp.Done(). func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) { for !rp.Final() { next, err := stepLocked(ctx, rp, d) @@ -139,8 +142,9 @@ func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) // doCreateAt is loosely analogous to a conjunction of Linux's // fs/namei.c:filename_create() and done_path_create(). // -// Preconditions: !rp.Done(). For the final path component in rp, -// !rp.ShouldFollowSymlink(). +// Preconditions: +// * !rp.Done(). +// * For the final path component in rp, !rp.ShouldFollowSymlink(). func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { fs.mu.Lock() defer fs.mu.Unlock() @@ -669,11 +673,11 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.mu.RUnlock() return err } - if err := d.inode.setStat(ctx, rp.Credentials(), &opts); err != nil { - fs.mu.RUnlock() + err = d.inode.setStat(ctx, rp.Credentials(), &opts) + fs.mu.RUnlock() + if err != nil { return err } - fs.mu.RUnlock() if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) @@ -701,16 +705,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if _, err := resolveLocked(ctx, rp); err != nil { return linux.Statfs{}, err } - statfs := linux.Statfs{ - Type: linux.TMPFS_MAGIC, - BlockSize: usermem.PageSize, - FragmentSize: usermem.PageSize, - NameLength: linux.NAME_MAX, - // TODO(b/29637826): Allow configuring a tmpfs size and enforce it. - Blocks: 0, - BlocksFree: 0, - } - return statfs, nil + return globalStatfs, nil } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. @@ -775,7 +770,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return nil } -// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() defer fs.mu.RUnlock() @@ -788,65 +783,68 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath } switch impl := d.inode.impl.(type) { case *socketFile: + if impl.ep == nil { + return nil, syserror.ECONNREFUSED + } return impl.ep, nil default: return nil, syserror.ECONNREFUSED } } -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. +func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } - return d.inode.listxattr(size) + return d.inode.listXattr(size) } -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { +// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. +func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return "", err } - return d.inode.getxattr(rp.Credentials(), &opts) + return d.inode.getXattr(rp.Credentials(), &opts) } -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { +// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. +func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { fs.mu.RLock() d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err } - if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil { - fs.mu.RUnlock() + err = d.inode.setXattr(rp.Credentials(), &opts) + fs.mu.RUnlock() + if err != nil { return err } - fs.mu.RUnlock() d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { +// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. +func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err } - if err := d.inode.removexattr(rp.Credentials(), name); err != nil { - fs.mu.RUnlock() + err = d.inode.removeXattr(rp.Credentials(), name) + fs.mu.RUnlock() + if err != nil { return err } - fs.mu.RUnlock() d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil @@ -867,8 +865,16 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe } if d.parent == nil { if d.name != "" { - // This must be an anonymous memfd file. + // This file must have been created by + // newUnlinkedRegularFileDescription(). In Linux, + // mm/shmem.c:__shmem_file_setup() => + // fs/file_table.c:alloc_file_pseudo() sets the created + // dentry's dentry_operations to anon_ops, for which d_dname == + // simple_dname. fs/d_path.c:simple_dname() defines the + // dentry's pathname to be its name, prefixed with "/" and + // suffixed with " (deleted)". b.PrependComponent("/" + d.name) + b.AppendString(" (deleted)") return vfs.PrependPathSyntheticError{} } return vfs.PrependPathAtNonMountRootError{} diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 739350cf0..d772db9e9 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// +stateify savable type namedPipe struct { inode inode @@ -28,8 +29,8 @@ type namedPipe struct { } // Preconditions: -// * fs.mu must be locked. -// * rp.Mount().CheckBeginWrite() has been called successfully. +// * fs.mu must be locked. +// * rp.Mount().CheckBeginWrite() has been called successfully. func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode { file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)} file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode) diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index ec2701d8b..2f856ce36 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -158,13 +158,14 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } // Create the pipe. root := mntns.Root() + root.IncRef() pop := vfs.PathOperation{ Root: root, Start: root, diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 0710b65db..ce4e3eda7 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -36,12 +36,18 @@ import ( ) // regularFile is a regular (=S_IFREG) tmpfs file. +// +// +stateify savable type regularFile struct { inode inode // memFile is a platform.File used to allocate pages to this regularFile. memFile *pgalloc.MemoryFile + // memoryUsageKind is the memory accounting category under which pages backing + // this regularFile's contents are accounted. + memoryUsageKind usage.MemoryKind + // mapsMu protects mappings. mapsMu sync.Mutex `state:"nosave"` @@ -62,7 +68,7 @@ type regularFile struct { writableMappingPages uint64 // dataMu protects the fields below. - dataMu sync.RWMutex + dataMu sync.RWMutex `state:"nosave"` // data maps offsets into the file to offsets into memFile that store // the file's data. @@ -86,14 +92,75 @@ type regularFile struct { func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode { file := ®ularFile{ - memFile: fs.memFile, - seals: linux.F_SEAL_SEAL, + memFile: fs.memFile, + memoryUsageKind: usage.Tmpfs, + seals: linux.F_SEAL_SEAL, } file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode) file.inode.nlink = 1 // from parent directory return &file.inode } +// newUnlinkedRegularFileDescription creates a regular file on the tmpfs +// filesystem represented by mount and returns an FD representing that file. +// The new file is not reachable by path traversal from any other file. +// +// newUnlinkedRegularFileDescription is analogous to Linux's +// mm/shmem.c:__shmem_file_setup(). +// +// Preconditions: mount must be a tmpfs mount. +func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) { + fs, ok := mount.Filesystem().Impl().(*filesystem) + if !ok { + panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount") + } + + inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777) + d := fs.newDentry(inode) + defer d.DecRef(ctx) + d.name = name + + fd := ®ularFileFD{} + fd.Init(&inode.locks) + flags := uint32(linux.O_RDWR) + if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return fd, nil +} + +// NewZeroFile creates a new regular file and file description as for +// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is +// initially (implicitly) filled with zeroes. +// +// Preconditions: mount must be a tmpfs mount. +func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) { + // Compare mm/shmem.c:shmem_zero_setup(). + fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero") + if err != nil { + return nil, err + } + rf := fd.inode().impl.(*regularFile) + rf.memoryUsageKind = usage.Anonymous + rf.size = size + return &fd.vfsfd, err +} + +// NewMemfd creates a new regular file and file description as for +// memfd_create. +// +// Preconditions: mount must be a tmpfs mount. +func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) { + fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name) + if err != nil { + return nil, err + } + if allowSeals { + fd.inode().impl.(*regularFile).seals = 0 + } + return &fd.vfsfd, nil +} + // truncate grows or shrinks the file to the given size. It returns true if the // file size was updated. func (rf *regularFile) truncate(newSize uint64) (bool, error) { @@ -226,7 +293,7 @@ func (rf *regularFile) Translate(ctx context.Context, required, optional memmap. optional.End = pgend } - cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { + cerr := rf.data.Fill(ctx, required, optional, rf.size, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { // Newly-allocated pages are zeroed, so we don't need to do anything. return dsts.NumBytes(), nil }) @@ -260,13 +327,14 @@ func (*regularFile) InvalidateUnsavable(context.Context) error { return nil } +// +stateify savable type regularFileFD struct { fileDescription // off is the file offset. off is accessed using atomic memory operations. // offMu serializes operations that may mutate off. off int64 - offMu sync.Mutex + offMu sync.Mutex `state:"nosave"` } // Release implements vfs.FileDescriptionImpl.Release. @@ -575,7 +643,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, case gap.Ok(): // Allocate memory for the write. gapMR := gap.Range().Intersect(pgMR) - fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs) + fr, err := rw.file.memFile.Allocate(gapMR.Length(), rw.file.memoryUsageKind) if err != nil { retErr = err goto exitLoop diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go index 3ed650474..5699d5975 100644 --- a/pkg/sentry/fsimpl/tmpfs/socket_file.go +++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go @@ -21,6 +21,8 @@ import ( ) // socketFile is a socket (=S_IFSOCK) tmpfs file. +// +// +stateify savable type socketFile struct { inode inode ep transport.BoundEndpoint diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go index b0de5fabe..a102a2ee2 100644 --- a/pkg/sentry/fsimpl/tmpfs/symlink.go +++ b/pkg/sentry/fsimpl/tmpfs/symlink.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) +// +stateify savable type symlink struct { inode inode target string // immutable diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index de2af6d01..e2a0aac69 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -51,9 +51,13 @@ import ( const Name = "tmpfs" // FilesystemType implements vfs.FilesystemType. +// +// +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. +// +// +stateify savable type filesystem struct { vfsfs vfs.Filesystem @@ -67,9 +71,11 @@ type filesystem struct { devMinor uint32 // mu serializes changes to the Dentry tree. - mu sync.RWMutex + mu sync.RWMutex `state:"nosave"` nextInoMinusOne uint64 // accessed using atomic memory operations + + root *dentry } // Name implements vfs.FilesystemType.Name. @@ -77,7 +83,12 @@ func (FilesystemType) Name() string { return Name } +// Release implements vfs.FilesystemType.Release. +func (FilesystemType) Release(ctx context.Context) {} + // FilesystemOpts is used to pass configuration data to tmpfs. +// +// +stateify savable type FilesystemOpts struct { // RootFileType is the FileType of the filesystem root. Valid values // are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR. @@ -188,6 +199,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs.vfsfs.DecRef(ctx) return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } + fs.root = root return &fs.vfsfs, &root.vfsd, nil } @@ -199,9 +211,61 @@ func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *au // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.mu.Lock() + if fs.root.inode.isDir() { + fs.root.releaseChildrenLocked(ctx) + } + fs.mu.Unlock() +} + +// releaseChildrenLocked is called on the mount point by filesystem.Release() to +// destroy all objects in the mount. It performs a depth-first walk of the +// filesystem and "unlinks" everything by decrementing link counts +// appropriately. There should be no open file descriptors when this is called, +// so each inode should only have one outstanding reference that is removed once +// its link count hits zero. +// +// Note that we do not update filesystem state precisely while tearing down (for +// instance, the child maps are ignored)--we only care to remove all remaining +// references so that every filesystem object gets destroyed. Also note that we +// do not need to trigger DecRef on the mount point itself or any child mount; +// these are taken care of by the destructor of the enclosing MountNamespace. +// +// Precondition: filesystem.mu is held. +func (d *dentry) releaseChildrenLocked(ctx context.Context) { + dir := d.inode.impl.(*directory) + for _, child := range dir.childMap { + if child.inode.isDir() { + child.releaseChildrenLocked(ctx) + child.inode.decLinksLocked(ctx) // link for child/. + dir.inode.decLinksLocked(ctx) // link for child/.. + } + child.inode.decLinksLocked(ctx) // link for child + } +} + +// immutable +var globalStatfs = linux.Statfs{ + Type: linux.TMPFS_MAGIC, + BlockSize: usermem.PageSize, + FragmentSize: usermem.PageSize, + NameLength: linux.NAME_MAX, + + // tmpfs currently does not support configurable size limits. In Linux, + // such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from + // statfs(2). However, many applications treat this as having a size limit + // of 0. To work around this, claim to have a very large but non-zero size, + // chosen to ensure that BlockSize * Blocks does not overflow int64 (which + // applications may also handle incorrectly). + // TODO(b/29637826): allow configuring a tmpfs size and enforce it. + Blocks: math.MaxInt64 / usermem.PageSize, + BlocksFree: math.MaxInt64 / usermem.PageSize, + BlocksAvailable: math.MaxInt64 / usermem.PageSize, } // dentry implements vfs.DentryImpl. +// +// +stateify savable type dentry struct { vfsd vfs.Dentry @@ -281,6 +345,8 @@ func (d *dentry) Watches() *vfs.Watches { func (d *dentry) OnZeroWatches(context.Context) {} // inode represents a filesystem object. +// +// +stateify savable type inode struct { // fs is the owning filesystem. fs is immutable. fs *filesystem @@ -297,12 +363,12 @@ type inode struct { // Inode metadata. Writing multiple fields atomically requires holding // mu, othewise atomic operations can be used. - mu sync.Mutex - mode uint32 // file type and mode - nlink uint32 // protected by filesystem.mu instead of inode.mu - uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic - gid uint32 // auth.KGID, but ... - ino uint64 // immutable + mu sync.Mutex `state:"nosave"` + mode uint32 // file type and mode + nlink uint32 // protected by filesystem.mu instead of inode.mu + uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid uint32 // auth.KGID, but ... + ino uint64 // immutable // Linux's tmpfs has no concept of btime. atime int64 // nanoseconds @@ -340,8 +406,10 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth // incLinksLocked increments i's link count. // -// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. -// i.nlink < maxLinks. +// Preconditions: +// * filesystem.mu must be locked for writing. +// * i.nlink != 0. +// * i.nlink < maxLinks. func (i *inode) incLinksLocked() { if i.nlink == 0 { panic("tmpfs.inode.incLinksLocked() called with no existing links") @@ -355,7 +423,9 @@ func (i *inode) incLinksLocked() { // decLinksLocked decrements i's link count. If the link count reaches 0, we // remove a reference on i as well. // -// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. +// Preconditions: +// * filesystem.mu must be locked for writing. +// * i.nlink != 0. func (i *inode) decLinksLocked(ctx context.Context) { if i.nlink == 0 { panic("tmpfs.inode.decLinksLocked() called with no existing links") @@ -594,66 +664,59 @@ func (i *inode) touchCMtime() { i.mu.Unlock() } -// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds -// inode.mu. +// Preconditions: +// * The caller has called vfs.Mount.CheckBeginWrite(). +// * inode.mu must be locked. func (i *inode) touchCMtimeLocked() { now := i.fs.clock.Now().Nanoseconds() atomic.StoreInt64(&i.mtime, now) atomic.StoreInt64(&i.ctime, now) } -func (i *inode) listxattr(size uint64) ([]string, error) { - return i.xattrs.Listxattr(size) +func (i *inode) listXattr(size uint64) ([]string, error) { + return i.xattrs.ListXattr(size) } -func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { - if err := i.checkPermissions(creds, vfs.MayRead); err != nil { +func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { + if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { return "", err } - if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { - return "", syserror.EOPNOTSUPP - } - if !i.userXattrSupported() { - return "", syserror.ENODATA - } - return i.xattrs.Getxattr(opts) + return i.xattrs.GetXattr(opts) } -func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error { - if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { +func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error { + if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { return err } - if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { - return syserror.EOPNOTSUPP - } - if !i.userXattrSupported() { - return syserror.EPERM - } - return i.xattrs.Setxattr(opts) + return i.xattrs.SetXattr(opts) } -func (i *inode) removexattr(creds *auth.Credentials, name string) error { - if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { +func (i *inode) removeXattr(creds *auth.Credentials, name string) error { + if err := i.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err } - if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return i.xattrs.RemoveXattr(name) +} + +func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { + // We currently only support extended attributes in the user.* and + // trusted.* namespaces. See b/148380782. + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) && !strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { return syserror.EOPNOTSUPP } - if !i.userXattrSupported() { - return syserror.EPERM + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + kuid := auth.KUID(atomic.LoadUint32(&i.uid)) + kgid := auth.KGID(atomic.LoadUint32(&i.gid)) + if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { + return err } - return i.xattrs.Removexattr(name) -} - -// Extended attributes in the user.* namespace are only supported for regular -// files and directories. -func (i *inode) userXattrSupported() bool { - filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode) - return filetype == linux.S_IFREG || filetype == linux.S_IFDIR + return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) } // fileDescription is embedded by tmpfs implementations of // vfs.FileDescriptionImpl. +// +// +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -693,20 +756,25 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) return nil } -// Listxattr implements vfs.FileDescriptionImpl.Listxattr. -func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { - return fd.inode().listxattr(size) +// StatFS implements vfs.FileDescriptionImpl.StatFS. +func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { + return globalStatfs, nil } -// Getxattr implements vfs.FileDescriptionImpl.Getxattr. -func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { - return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts) +// ListXattr implements vfs.FileDescriptionImpl.ListXattr. +func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { + return fd.inode().listXattr(size) } -// Setxattr implements vfs.FileDescriptionImpl.Setxattr. -func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { +// GetXattr implements vfs.FileDescriptionImpl.GetXattr. +func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { + return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts) +} + +// SetXattr implements vfs.FileDescriptionImpl.SetXattr. +func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { d := fd.dentry() - if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil { + if err := d.inode.setXattr(auth.CredentialsFromContext(ctx), &opts); err != nil { return err } @@ -715,10 +783,10 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption return nil } -// Removexattr implements vfs.FileDescriptionImpl.Removexattr. -func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { +// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. +func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { d := fd.dentry() - if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil { + if err := d.inode.removeXattr(auth.CredentialsFromContext(ctx), name); err != nil { return err } @@ -727,37 +795,6 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { return nil } -// NewMemfd creates a new tmpfs regular file and file description that can back -// an anonymous fd created by memfd_create. -func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) { - fs, ok := mount.Filesystem().Impl().(*filesystem) - if !ok { - panic("NewMemfd() called with non-tmpfs mount") - } - - // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with - // S_IRWXUGO. - inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777) - rf := inode.impl.(*regularFile) - if allowSeals { - rf.seals = 0 - } - - d := fs.newDentry(inode) - defer d.DecRef(ctx) - d.name = name - - // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with - // FMODE_READ | FMODE_WRITE. - var fd regularFileFD - fd.Init(&inode.locks) - flags := uint32(linux.O_RDWR) - if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { - return nil, err - } - return &fd.vfsfd, nil -} - // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go index 6f3e3ae6f..fc5323abc 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go @@ -41,11 +41,12 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{}) if err != nil { return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) } root := mntns.Root() + root.IncRef() return vfsObj, root, func() { root.DecRef(ctx) mntns.DecRef(ctx) |