From b72e1b3c0873ea29d031db42e39ca053923eecff Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 25 Nov 2019 18:09:15 -0800 Subject: Minor VFS2 interface changes. - Remove the Filesystem argument from DentryImpl.*Ref(); in general DentryImpls that need the Filesystem for reference counting will probably also need it for other interface methods that don't plumb Filesystem, so it's easier to just store a pointer to the filesystem in the DentryImpl. - Add a pointer to the VirtualFilesystem to Filesystem, which is needed by the gofer client to disown dentries for cache eviction triggered by dentry reference count changes. - Rename FilesystemType.NewFilesystem to GetFilesystem; in some cases (e.g. sysfs, cgroupfs) it's much cleaner for there to be only one Filesystem that is used by all mounts, and in at least one case (devtmpfs) it's visibly incorrect not to do so, so NewFilesystem doesn't always actually create and return a *new* Filesystem. - Require callers of FileDescription.Init() to increment Mount/Dentry references. This is because the gofer client may, in the OpenAt() path, take a reference on a dentry with 0 references, which is safe due to synchronization that is outside the scope of this CL, and it would be safer to still have its implementation of DentryImpl.IncRef() check for an increment for 0 references in other cases. - Add FileDescription.TryIncRef. This is used by the gofer client to take references on "special file descriptions" (FDs for files such as pipes, sockets, and devices), which use per-FD handles (fids) instead of dentry-shared handles, for sync() and syncfs(). PiperOrigin-RevId: 282473364 --- pkg/sentry/fsimpl/ext/inode.go | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'pkg/sentry/fsimpl/ext/inode.go') diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index e6c847a71..24249525c 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -16,7 +16,6 @@ package ext import ( "fmt" - "io" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -42,13 +41,13 @@ type inode struct { // refs is a reference count. refs is accessed using atomic memory operations. refs int64 + // fs is the containing filesystem. + fs *filesystem + // inodeNum is the inode number of this inode on disk. This is used to // identify inodes within the ext filesystem. inodeNum uint32 - // dev represents the underlying device. Same as filesystem.dev. - dev io.ReaderAt - // blkSize is the fs data block size. Same as filesystem.sb.BlockSize(). blkSize uint64 @@ -81,10 +80,10 @@ func (in *inode) tryIncRef() bool { // decRef decrements the inode ref count and releases the inode resources if // the ref count hits 0. // -// Precondition: Must have locked fs.mu. -func (in *inode) decRef(fs *filesystem) { +// Precondition: Must have locked filesystem.mu. +func (in *inode) decRef() { if refs := atomic.AddInt64(&in.refs, -1); refs == 0 { - delete(fs.inodeCache, in.inodeNum) + delete(in.fs.inodeCache, in.inodeNum) } else if refs < 0 { panic("ext.inode.decRef() called without holding a reference") } @@ -117,8 +116,8 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { // Build the inode based on its type. inode := inode{ + fs: fs, inodeNum: inodeNum, - dev: fs.dev, blkSize: blkSize, diskInode: diskInode, } @@ -154,11 +153,14 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v if err := in.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } + mnt := rp.Mount() switch in.impl.(type) { case *regularFile: var fd regularFileFD fd.flags = flags - fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) return &fd.vfsfd, nil case *directory: // Can't open directories writably. This check is not necessary for a read @@ -167,8 +169,10 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.EISDIR } var fd directoryFD - fd.vfsfd.Init(&fd, rp.Mount(), vfsd) fd.flags = flags + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) return &fd.vfsfd, nil case *symlink: if flags&linux.O_PATH == 0 { @@ -177,7 +181,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v } var fd symlinkFD fd.flags = flags - fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) return &fd.vfsfd, nil default: panic(fmt.Sprintf("unknown inode type: %T", in.impl)) -- cgit v1.2.3 From 3eb489ed6c67b069bc135ab92cb031ce80b40d8f Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 20 Dec 2019 11:52:24 -0800 Subject: Move VFS2 file description status flags to vfs.FileDescription. PiperOrigin-RevId: 286616668 --- pkg/sentry/fsimpl/ext/file_description.go | 19 --- pkg/sentry/fsimpl/ext/inode.go | 9 +- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 16 +-- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 16 +-- pkg/sentry/fsimpl/memfs/filesystem.go | 11 +- pkg/sentry/fsimpl/memfs/memfs.go | 14 --- pkg/sentry/fsimpl/memfs/named_pipe.go | 2 +- pkg/sentry/vfs/file_description.go | 141 +++++++++++++++------- pkg/sentry/vfs/file_description_impl_util_test.go | 2 +- 9 files changed, 107 insertions(+), 123 deletions(-) (limited to 'pkg/sentry/fsimpl/ext/inode.go') diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go index 5eca2b83f..841274daf 100644 --- a/pkg/sentry/fsimpl/ext/file_description.go +++ b/pkg/sentry/fsimpl/ext/file_description.go @@ -26,13 +26,6 @@ import ( type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl - - // flags is the same as vfs.OpenOptions.Flags which are passed to - // vfs.FilesystemImpl.OpenAt. - // TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2), - // fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set. - // Only close(2), fstat(2), fstatfs(2) should work. - flags uint32 } func (fd *fileDescription) filesystem() *filesystem { @@ -43,18 +36,6 @@ func (fd *fileDescription) inode() *inode { return fd.vfsfd.Dentry().Impl().(*dentry).inode } -// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. -func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) { - return fd.flags, nil -} - -// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. -func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { - // None of the flags settable by fcntl(F_SETFL) are supported, so this is a - // no-op. - return nil -} - // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 24249525c..b2cc826c7 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -157,10 +157,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v switch in.impl.(type) { case *regularFile: var fd regularFileFD - fd.flags = flags mnt.IncRef() vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *directory: // Can't open directories writably. This check is not necessary for a read @@ -169,10 +168,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.EISDIR } var fd directoryFD - fd.flags = flags mnt.IncRef() vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *symlink: if flags&linux.O_PATH == 0 { @@ -180,10 +178,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.ELOOP } var fd symlinkFD - fd.flags = flags mnt.IncRef() vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil default: panic(fmt.Sprintf("unknown inode type: %T", in.impl)) diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 30c06baf0..51102ce48 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -65,17 +65,15 @@ type DynamicBytesFD struct { vfsfd vfs.FileDescription inode Inode - flags uint32 } // Init initializes a DynamicBytesFD. func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) { m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. - fd.flags = flags fd.inode = d.Impl().(*Dentry).inode fd.SetDataSource(data) - fd.vfsfd.Init(fd, m, d) + fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) } // Seek implements vfs.FileDescriptionImpl.Seek. @@ -117,15 +115,3 @@ func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { // DynamicBytesFiles are immutable. return syserror.EPERM } - -// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. -func (fd *DynamicBytesFD) StatusFlags(ctx context.Context) (uint32, error) { - return fd.flags, nil -} - -// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. -func (fd *DynamicBytesFD) SetStatusFlags(ctx context.Context, flags uint32) error { - // None of the flags settable by fcntl(F_SETFL) are supported, so this is a - // no-op. - return nil -} diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index d6c18937a..bd402330f 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -39,7 +39,6 @@ type GenericDirectoryFD struct { vfsfd vfs.FileDescription children *OrderedChildren - flags uint32 off int64 } @@ -48,8 +47,7 @@ func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *Ordere m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. fd.children = children - fd.flags = flags - fd.vfsfd.Init(fd, m, d) + fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) } // VFSFileDescription returns a pointer to the vfs.FileDescription representing @@ -180,18 +178,6 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int return offset, nil } -// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. -func (fd *GenericDirectoryFD) StatusFlags(ctx context.Context) (uint32, error) { - return fd.flags, nil -} - -// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. -func (fd *GenericDirectoryFD) SetStatusFlags(ctx context.Context, flags uint32) error { - // None of the flags settable by fcntl(F_SETFL) are supported, so this is a - // no-op. - return nil -} - // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.filesystem() diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go index 22f1e811f..af4389459 100644 --- a/pkg/sentry/fsimpl/memfs/filesystem.go +++ b/pkg/sentry/fsimpl/memfs/filesystem.go @@ -282,9 +282,8 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Filter out flags that are not supported by memfs. O_DIRECTORY and // O_NOFOLLOW have no effect here (they're handled by VFS by setting - // appropriate bits in rp), but are returned by - // FileDescriptionImpl.StatusFlags(). O_NONBLOCK is supported only by - // pipes. + // appropriate bits in rp), but are visible in FD status flags. O_NONBLOCK + // is supported only by pipes. opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK if opts.Flags&linux.O_CREAT == 0 { @@ -384,7 +383,6 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr switch impl := i.impl.(type) { case *regularFile: var fd regularFileFD - fd.flags = flags fd.readable = vfs.MayReadFileWithOpenFlags(flags) fd.writable = vfs.MayWriteFileWithOpenFlags(flags) if fd.writable { @@ -395,7 +393,7 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr } mnt.IncRef() vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) if flags&linux.O_TRUNC != 0 { impl.mu.Lock() impl.data = impl.data[:0] @@ -411,8 +409,7 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr var fd directoryFD mnt.IncRef() vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) - fd.flags = flags + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *symlink: // Can't open symlinks without O_PATH (which is unimplemented). diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go index 4cb2a4e0f..9d509f6e4 100644 --- a/pkg/sentry/fsimpl/memfs/memfs.go +++ b/pkg/sentry/fsimpl/memfs/memfs.go @@ -261,8 +261,6 @@ func (i *inode) direntType() uint8 { type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl - - flags uint32 // status flags; immutable } func (fd *fileDescription) filesystem() *filesystem { @@ -273,18 +271,6 @@ func (fd *fileDescription) inode() *inode { return fd.vfsfd.Dentry().Impl().(*dentry).inode } -// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. -func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) { - return fd.flags, nil -} - -// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. -func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { - // None of the flags settable by fcntl(F_SETFL) are supported, so this is a - // no-op. - return nil -} - // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go index 91cb4b1fc..d5060850e 100644 --- a/pkg/sentry/fsimpl/memfs/named_pipe.go +++ b/pkg/sentry/fsimpl/memfs/named_pipe.go @@ -57,6 +57,6 @@ func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, v mnt := rp.Mount() mnt.IncRef() vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil } diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index c5a9adca3..df03886c3 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" @@ -39,49 +40,43 @@ type FileDescription struct { // operations. refs int64 + // statusFlags contains status flags, "initialized by open(2) and possibly + // modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic + // memory operations. + statusFlags uint32 + // vd is the filesystem location at which this FileDescription was opened. // A reference is held on vd. vd is immutable. vd VirtualDentry + opts FileDescriptionOptions + // impl is the FileDescriptionImpl associated with this Filesystem. impl is // immutable. This should be the last field in FileDescription. impl FileDescriptionImpl } +// FileDescriptionOptions contains options to FileDescription.Init(). +type FileDescriptionOptions struct { + // If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is + // usually only the case if O_DIRECT would actually have an effect. + AllowDirectIO bool +} + // Init must be called before first use of fd. It takes ownership of references -// on mnt and d held by the caller. -func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) { +// on mnt and d held by the caller. statusFlags is the initial file description +// status flags, which is usually the full set of flags passed to open(2). +func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) { fd.refs = 1 + fd.statusFlags = statusFlags | linux.O_LARGEFILE fd.vd = VirtualDentry{ mount: mnt, dentry: d, } + fd.opts = *opts fd.impl = impl } -// Impl returns the FileDescriptionImpl associated with fd. -func (fd *FileDescription) Impl() FileDescriptionImpl { - return fd.impl -} - -// Mount returns the mount on which fd was opened. It does not take a reference -// on the returned Mount. -func (fd *FileDescription) Mount() *Mount { - return fd.vd.mount -} - -// Dentry returns the dentry at which fd was opened. It does not take a -// reference on the returned Dentry. -func (fd *FileDescription) Dentry() *Dentry { - return fd.vd.dentry -} - -// VirtualDentry returns the location at which fd was opened. It does not take -// a reference on the returned VirtualDentry. -func (fd *FileDescription) VirtualDentry() VirtualDentry { - return fd.vd -} - // IncRef increments fd's reference count. func (fd *FileDescription) IncRef() { atomic.AddInt64(&fd.refs, 1) @@ -113,6 +108,82 @@ func (fd *FileDescription) DecRef() { } } +// Mount returns the mount on which fd was opened. It does not take a reference +// on the returned Mount. +func (fd *FileDescription) Mount() *Mount { + return fd.vd.mount +} + +// Dentry returns the dentry at which fd was opened. It does not take a +// reference on the returned Dentry. +func (fd *FileDescription) Dentry() *Dentry { + return fd.vd.dentry +} + +// VirtualDentry returns the location at which fd was opened. It does not take +// a reference on the returned VirtualDentry. +func (fd *FileDescription) VirtualDentry() VirtualDentry { + return fd.vd +} + +// StatusFlags returns file description status flags, as for fcntl(F_GETFL). +func (fd *FileDescription) StatusFlags() uint32 { + return atomic.LoadUint32(&fd.statusFlags) +} + +// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL). +func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error { + // Compare Linux's fs/fcntl.c:setfl(). + oldFlags := fd.StatusFlags() + // Linux documents this check as "O_APPEND cannot be cleared if the file is + // marked as append-only and the file is open for write", which would make + // sense. However, the check as actually implemented seems to be "O_APPEND + // cannot be changed if the file is marked as append-only". + if (flags^oldFlags)&linux.O_APPEND != 0 { + stat, err := fd.impl.Stat(ctx, StatOptions{ + // There is no mask bit for stx_attributes. + Mask: 0, + // Linux just reads inode::i_flags directly. + Sync: linux.AT_STATX_DONT_SYNC, + }) + if err != nil { + return err + } + if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) { + return syserror.EPERM + } + } + if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) { + stat, err := fd.impl.Stat(ctx, StatOptions{ + Mask: linux.STATX_UID, + // Linux's inode_owner_or_capable() just reads inode::i_uid + // directly. + Sync: linux.AT_STATX_DONT_SYNC, + }) + if err != nil { + return err + } + if stat.Mask&linux.STATX_UID == 0 { + return syserror.EPERM + } + if !CanActAsOwner(creds, auth.KUID(stat.UID)) { + return syserror.EPERM + } + } + if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO { + return syserror.EINVAL + } + // TODO(jamieliu): FileDescriptionImpl.SetOAsync()? + const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK + atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags)) + return nil +} + +// Impl returns the FileDescriptionImpl associated with fd. +func (fd *FileDescription) Impl() FileDescriptionImpl { + return fd.impl +} + // FileDescriptionImpl contains implementation details for an FileDescription. // Implementations of FileDescriptionImpl should contain their associated // FileDescription by value as their first field. @@ -132,14 +203,6 @@ type FileDescriptionImpl interface { // prevent the file descriptor from being closed. OnClose(ctx context.Context) error - // StatusFlags returns file description status flags, as for - // fcntl(F_GETFL). - StatusFlags(ctx context.Context) (uint32, error) - - // SetStatusFlags sets file description status flags, as for - // fcntl(F_SETFL). - SetStatusFlags(ctx context.Context, flags uint32) error - // Stat returns metadata for the file represented by the FileDescription. Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) @@ -264,18 +327,6 @@ func (fd *FileDescription) OnClose(ctx context.Context) error { return fd.impl.OnClose(ctx) } -// StatusFlags returns file description status flags, as for fcntl(F_GETFL). -func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) { - flags, err := fd.impl.StatusFlags(ctx) - flags |= linux.O_LARGEFILE - return flags, err -} - -// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL). -func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { - return fd.impl.SetStatusFlags(ctx, flags) -} - // Stat returns metadata for the file represented by fd. func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { return fd.impl.Stat(ctx, opts) diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index ac7799296..678be07fe 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -48,7 +48,7 @@ type genCountFD struct { func newGenCountFD(mnt *Mount, vfsd *Dentry) *FileDescription { var fd genCountFD - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, 0 /* statusFlags */, mnt, vfsd, &FileDescriptionOptions{}) fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd) return &fd.vfsfd } -- cgit v1.2.3 From 1f384ac42b9ee8b52000dc2bff79d975853519ed Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 30 Dec 2019 11:35:06 -0800 Subject: Add VFS2 support for device special files. - Add FileDescriptionOptions.UseDentryMetadata, which reduces the amount of boilerplate needed for device FDs and the like between filesystems. - Switch back to having FileDescription.Init() take references on the Mount and Dentry; otherwise managing refcounts around failed calls to OpenDeviceSpecialFile() / Device.Open() is tricky. PiperOrigin-RevId: 287575574 --- pkg/sentry/fsimpl/ext/inode.go | 6 -- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 2 - pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 2 - pkg/sentry/fsimpl/memfs/filesystem.go | 4 - pkg/sentry/fsimpl/memfs/named_pipe.go | 2 - pkg/sentry/vfs/BUILD | 1 + pkg/sentry/vfs/device.go | 100 ++++++++++++++++++++++++ pkg/sentry/vfs/file_description.go | 101 +++++++++++++++++++++++-- pkg/sentry/vfs/file_description_impl_util.go | 15 ++++ pkg/sentry/vfs/filesystem.go | 21 +++++ pkg/sentry/vfs/vfs.go | 6 ++ 11 files changed, 236 insertions(+), 24 deletions(-) create mode 100644 pkg/sentry/vfs/device.go (limited to 'pkg/sentry/fsimpl/ext/inode.go') diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index b2cc826c7..8608805bf 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -157,8 +157,6 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v switch in.impl.(type) { case *regularFile: var fd regularFileFD - mnt.IncRef() - vfsd.IncRef() fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *directory: @@ -168,8 +166,6 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.EISDIR } var fd directoryFD - mnt.IncRef() - vfsd.IncRef() fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *symlink: @@ -178,8 +174,6 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.ELOOP } var fd symlinkFD - mnt.IncRef() - vfsd.IncRef() fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil default: diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index c5fe65722..606ca692d 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -81,8 +81,6 @@ type DynamicBytesFD struct { // Init initializes a DynamicBytesFD. func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) { - m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. - d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. fd.inode = d.Impl().(*Dentry).inode fd.SetDataSource(data) fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 77975583b..bcf069b5f 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -44,8 +44,6 @@ type GenericDirectoryFD struct { // Init initializes a GenericDirectoryFD. func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) { - m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. - d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. fd.children = children fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) } diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go index 4a83f310c..b063e09a3 100644 --- a/pkg/sentry/fsimpl/memfs/filesystem.go +++ b/pkg/sentry/fsimpl/memfs/filesystem.go @@ -348,8 +348,6 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, } // mnt.EndWrite() is called by regularFileFD.Release(). } - mnt.IncRef() - d.IncRef() fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) if flags&linux.O_TRUNC != 0 { impl.mu.Lock() @@ -364,8 +362,6 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, return nil, syserror.EISDIR } var fd directoryFD - mnt.IncRef() - d.IncRef() fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *symlink: diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go index d5060850e..b5a204438 100644 --- a/pkg/sentry/fsimpl/memfs/named_pipe.go +++ b/pkg/sentry/fsimpl/memfs/named_pipe.go @@ -55,8 +55,6 @@ func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, v return nil, err } mnt := rp.Mount() - mnt.IncRef() - vfsd.IncRef() fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil } diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index e3e554b88..4c6aa04a1 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -9,6 +9,7 @@ go_library( "context.go", "debug.go", "dentry.go", + "device.go", "file_description.go", "file_description_impl_util.go", "filesystem.go", diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go new file mode 100644 index 000000000..cb672e36f --- /dev/null +++ b/pkg/sentry/vfs/device.go @@ -0,0 +1,100 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/syserror" +) + +// DeviceKind indicates whether a device is a block or character device. +type DeviceKind uint32 + +const ( + // BlockDevice indicates a block device. + BlockDevice DeviceKind = iota + + // CharDevice indicates a character device. + CharDevice +) + +// String implements fmt.Stringer.String. +func (kind DeviceKind) String() string { + switch kind { + case BlockDevice: + return "block" + case CharDevice: + return "character" + default: + return fmt.Sprintf("invalid device kind %d", kind) + } +} + +type devTuple struct { + kind DeviceKind + major uint32 + minor uint32 +} + +// A Device backs device special files. +type Device interface { + // Open returns a FileDescription representing this device. + Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error) +} + +type registeredDevice struct { + dev Device + opts RegisterDeviceOptions +} + +// RegisterDeviceOptions contains options to +// VirtualFilesystem.RegisterDevice(). +type RegisterDeviceOptions struct { + // GroupName is the name shown for this device registration in + // /proc/devices. If GroupName is empty, this registration will not be + // shown in /proc/devices. + GroupName string +} + +// RegisterDevice registers the given Device in vfs with the given major and +// minor device numbers. +func (vfs *VirtualFilesystem) RegisterDevice(kind DeviceKind, major, minor uint32, dev Device, opts *RegisterDeviceOptions) error { + tup := devTuple{kind, major, minor} + vfs.devicesMu.Lock() + defer vfs.devicesMu.Unlock() + if existing, ok := vfs.devices[tup]; ok { + return fmt.Errorf("%s device number (%d, %d) is already registered to device type %T", kind, major, minor, existing.dev) + } + vfs.devices[tup] = ®isteredDevice{ + dev: dev, + opts: *opts, + } + return nil +} + +// OpenDeviceSpecialFile returns a FileDescription representing the given +// device. +func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mount, d *Dentry, kind DeviceKind, major, minor uint32, opts *OpenOptions) (*FileDescription, error) { + tup := devTuple{kind, major, minor} + vfs.devicesMu.RLock() + defer vfs.devicesMu.RUnlock() + rd, ok := vfs.devices[tup] + if !ok { + return nil, syserror.ENXIO + } + return rd.dev.Open(ctx, mnt, d, *opts) +} diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 0b053201a..6afe280bc 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -61,11 +61,25 @@ type FileDescriptionOptions struct { // If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is // usually only the case if O_DIRECT would actually have an effect. AllowDirectIO bool + + // If UseDentryMetadata is true, calls to FileDescription methods that + // interact with file and filesystem metadata (Stat, SetStat, StatFS, + // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling + // the corresponding FilesystemImpl methods instead of the corresponding + // FileDescriptionImpl methods. + // + // UseDentryMetadata is intended for file descriptions that are implemented + // outside of individual filesystems, such as pipes, sockets, and device + // special files. FileDescriptions for which UseDentryMetadata is true may + // embed DentryMetadataFileDescriptionImpl to obtain appropriate + // implementations of FileDescriptionImpl methods that should not be + // called. + UseDentryMetadata bool } -// Init must be called before first use of fd. It takes ownership of references -// on mnt and d held by the caller. statusFlags is the initial file description -// status flags, which is usually the full set of flags passed to open(2). +// Init must be called before first use of fd. It takes references on mnt and +// d. statusFlags is the initial file description status flags, which is +// usually the full set of flags passed to open(2). func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) { fd.refs = 1 fd.statusFlags = statusFlags | linux.O_LARGEFILE @@ -73,6 +87,7 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn mount: mnt, dentry: d, } + fd.vd.IncRef() fd.opts = *opts fd.impl = impl } @@ -140,7 +155,7 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede // sense. However, the check as actually implemented seems to be "O_APPEND // cannot be changed if the file is marked as append-only". if (flags^oldFlags)&linux.O_APPEND != 0 { - stat, err := fd.impl.Stat(ctx, StatOptions{ + stat, err := fd.Stat(ctx, StatOptions{ // There is no mask bit for stx_attributes. Mask: 0, // Linux just reads inode::i_flags directly. @@ -154,7 +169,7 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede } } if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) { - stat, err := fd.impl.Stat(ctx, StatOptions{ + stat, err := fd.Stat(ctx, StatOptions{ Mask: linux.STATX_UID, // Linux's inode_owner_or_capable() just reads inode::i_uid // directly. @@ -348,17 +363,47 @@ func (fd *FileDescription) OnClose(ctx context.Context) error { // Stat returns metadata for the file represented by fd. func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts) + vfsObj.putResolvingPath(rp) + return stat, err + } return fd.impl.Stat(ctx, opts) } // SetStat updates metadata for the file represented by fd. func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts) + vfsObj.putResolvingPath(rp) + return err + } return fd.impl.SetStat(ctx, opts) } // StatFS returns metadata for the filesystem containing the file represented // by fd. func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp) + vfsObj.putResolvingPath(rp) + return statfs, err + } return fd.impl.StatFS(ctx) } @@ -417,6 +462,16 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch. // Listxattr returns all extended attribute names for the file represented by // fd. func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp) + vfsObj.putResolvingPath(rp) + return names, err + } names, err := fd.impl.Listxattr(ctx) if err == syserror.ENOTSUP { // Linux doesn't actually return ENOTSUP in this case; instead, @@ -431,18 +486,48 @@ func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) { // Getxattr returns the value associated with the given extended attribute for // the file represented by fd. func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name) + vfsObj.putResolvingPath(rp) + return val, err + } return fd.impl.Getxattr(ctx, name) } // Setxattr changes the value associated with the given extended attribute for // the file represented by fd. func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts) + vfsObj.putResolvingPath(rp) + return err + } return fd.impl.Setxattr(ctx, opts) } // Removexattr removes the given extended attribute from the file represented // by fd. func (fd *FileDescription) Removexattr(ctx context.Context, name string) error { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name) + vfsObj.putResolvingPath(rp) + return err + } return fd.impl.Removexattr(ctx, name) } @@ -464,7 +549,7 @@ func (fd *FileDescription) MappedName(ctx context.Context) string { // DeviceID implements memmap.MappingIdentity.DeviceID. func (fd *FileDescription) DeviceID() uint64 { - stat, err := fd.impl.Stat(context.Background(), StatOptions{ + stat, err := fd.Stat(context.Background(), StatOptions{ // There is no STATX_DEV; we assume that Stat will return it if it's // available regardless of mask. Mask: 0, @@ -480,7 +565,7 @@ func (fd *FileDescription) DeviceID() uint64 { // InodeID implements memmap.MappingIdentity.InodeID. func (fd *FileDescription) InodeID() uint64 { - stat, err := fd.impl.Stat(context.Background(), StatOptions{ + stat, err := fd.Stat(context.Background(), StatOptions{ Mask: linux.STATX_INO, // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly. Sync: linux.AT_STATX_DONT_SYNC, @@ -493,5 +578,5 @@ func (fd *FileDescription) InodeID() uint64 { // Msync implements memmap.MappingIdentity.Msync. func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error { - return fd.impl.Sync(ctx) + return fd.Sync(ctx) } diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index de782e577..66eb57bc2 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -177,6 +177,21 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme return 0, syserror.EISDIR } +// DentryMetadataFileDescriptionImpl may be embedded by implementations of +// FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is +// true to obtain implementations of Stat and SetStat that panic. +type DentryMetadataFileDescriptionImpl struct{} + +// Stat implements FileDescriptionImpl.Stat. +func (DentryMetadataFileDescriptionImpl) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { + panic("illegal call to DentryMetadataFileDescriptionImpl.Stat") +} + +// SetStat implements FileDescriptionImpl.SetStat. +func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetStatOptions) error { + panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat") +} + // DynamicBytesFileDescriptionImpl may be embedded by implementations of // FileDescriptionImpl that represent read-only regular files whose contents // are backed by a bytes.Buffer that is regenerated when necessary, consistent diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 89bd58864..ea78f555b 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -418,17 +418,38 @@ type FilesystemImpl interface { UnlinkAt(ctx context.Context, rp *ResolvingPath) error // ListxattrAt returns all extended attribute names for the file at rp. + // + // Errors: + // + // - If extended attributes are not supported by the filesystem, + // ListxattrAt returns nil. (See FileDescription.Listxattr for an + // explanation.) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) // GetxattrAt returns the value associated with the given extended // attribute for the file at rp. + // + // Errors: + // + // - If extended attributes are not supported by the filesystem, GetxattrAt + // returns ENOTSUP. GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) // SetxattrAt changes the value associated with the given extended // attribute for the file at rp. + // + // Errors: + // + // - If extended attributes are not supported by the filesystem, SetxattrAt + // returns ENOTSUP. SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error // RemovexattrAt removes the given extended attribute from the file at rp. + // + // Errors: + // + // - If extended attributes are not supported by the filesystem, + // RemovexattrAt returns ENOTSUP. RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error // PrependPath prepends a path from vd to vd.Mount().Root() to b. diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index a3bdb5805..ea2db7031 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -75,6 +75,11 @@ type VirtualFilesystem struct { // mountpoints is analogous to Linux's mountpoint_hashtable. mountpoints map[*Dentry]map[*Mount]struct{} + // devices contains all registered Devices. devices is protected by + // devicesMu. + devicesMu sync.RWMutex + devices map[devTuple]*registeredDevice + // fsTypes contains all registered FilesystemTypes. fsTypes is protected by // fsTypesMu. fsTypesMu sync.RWMutex @@ -90,6 +95,7 @@ type VirtualFilesystem struct { func New() *VirtualFilesystem { vfs := &VirtualFilesystem{ mountpoints: make(map[*Dentry]map[*Mount]struct{}), + devices: make(map[devTuple]*registeredDevice), fsTypes: make(map[string]*registeredFilesystemType), filesystems: make(map[*Filesystem]struct{}), } -- cgit v1.2.3 From 5ab1213a6c405071546c783d6d93b4e9af52842e Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 22 Jan 2020 12:27:16 -0800 Subject: Move VFS2 handling of FD readability/writability to vfs.FileDescription. PiperOrigin-RevId: 291006713 --- pkg/sentry/fsimpl/ext/inode.go | 8 +++- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 11 +++-- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 11 ++++- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 18 +++++-- pkg/sentry/fsimpl/tmpfs/filesystem.go | 15 ++---- pkg/sentry/fsimpl/tmpfs/named_pipe.go | 5 +- pkg/sentry/fsimpl/tmpfs/regular_file.go | 14 +----- pkg/sentry/kernel/pipe/vfs.go | 12 ++--- pkg/sentry/vfs/file_description.go | 66 ++++++++++++++++++++++++-- pkg/sentry/vfs/permissions.go | 5 +- 10 files changed, 111 insertions(+), 54 deletions(-) (limited to 'pkg/sentry/fsimpl/ext/inode.go') diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 8608805bf..191b39970 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -157,7 +157,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v switch in.impl.(type) { case *regularFile: var fd regularFileFD - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } return &fd.vfsfd, nil case *directory: // Can't open directories writably. This check is not necessary for a read @@ -166,7 +168,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.EISDIR } var fd directoryFD - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } return &fd.vfsfd, nil case *symlink: if flags&linux.O_PATH == 0 { diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 606ca692d..75624e0b1 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -55,7 +55,9 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.Dy // Open implements Inode.Open. func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} - fd.Init(rp.Mount(), vfsd, f.data, flags) + if err := fd.Init(rp.Mount(), vfsd, f.data, flags); err != nil { + return nil, err + } return &fd.vfsfd, nil } @@ -80,10 +82,13 @@ type DynamicBytesFD struct { } // Init initializes a DynamicBytesFD. -func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) { +func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) error { + if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + return err + } fd.inode = d.Impl().(*Dentry).inode fd.SetDataSource(data) - fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) + return nil } // Seek implements vfs.FileDescriptionImpl.Seek. diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index bcf069b5f..5fa1fa67b 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -43,9 +43,16 @@ type GenericDirectoryFD struct { } // Init initializes a GenericDirectoryFD. -func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) { +func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) error { + if vfs.AccessTypesForOpenFlags(flags)&vfs.MayWrite != 0 { + // Can't open directories for writing. + return syserror.EISDIR + } + if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + return err + } fd.children = children - fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) + return nil } // VFSFileDescription returns a pointer to the vfs.FileDescription representing diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index a5fdfbde5..aa3fe76ee 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -115,7 +115,9 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags); err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } @@ -225,7 +227,9 @@ func TestReadStaticFile(t *testing.T) { defer sys.Destroy() pop := sys.PathOpAtRoot("file1") - fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{}) + fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) if err != nil { t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } @@ -258,7 +262,9 @@ func TestCreateNewFileInStaticDir(t *testing.T) { // Close the file. The file should persist. fd.DecRef() - fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{}) + fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) if err != nil { t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) } @@ -272,7 +278,9 @@ func TestDirFDReadWrite(t *testing.T) { defer sys.Destroy() pop := sys.PathOpAtRoot("/") - fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{}) + fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) if err != nil { t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } @@ -282,7 +290,7 @@ func TestDirFDReadWrite(t *testing.T) { if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR { t.Fatalf("Read for directory FD failed with unexpected error: %v", err) } - if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EISDIR { + if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EBADF { t.Fatalf("Write for directory FD failed with unexpected error: %v", err) } } diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 4cd7e9aea..a9f66a42a 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -337,19 +337,12 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, return nil, err } } - mnt := rp.Mount() switch impl := d.inode.impl.(type) { case *regularFile: var fd regularFileFD - fd.readable = vfs.MayReadFileWithOpenFlags(flags) - fd.writable = vfs.MayWriteFileWithOpenFlags(flags) - if fd.writable { - if err := mnt.CheckBeginWrite(); err != nil { - return nil, err - } - // mnt.EndWrite() is called by regularFileFD.Release(). + if err := fd.vfsfd.Init(&fd, flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err } - fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) if flags&linux.O_TRUNC != 0 { impl.mu.Lock() impl.data.Truncate(0, impl.memFile) @@ -363,7 +356,9 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, return nil, syserror.EISDIR } var fd directoryFD - fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) + if err := fd.vfsfd.Init(&fd, flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } return &fd.vfsfd, nil case *symlink: // Can't open symlinks without O_PATH (which is unimplemented). diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 40bde54de..482aabd52 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -50,11 +50,10 @@ type namedPipeFD struct { func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { var err error var fd namedPipeFD - fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, rp, vfsd, &fd.vfsfd, flags) + fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, vfsd, &fd.vfsfd, flags) if err != nil { return nil, err } - mnt := rp.Mount() - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + fd.vfsfd.Init(&fd, flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 5fa70cc6d..7c633c1b0 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -101,10 +101,6 @@ func (rf *regularFile) truncate(size uint64) (bool, error) { type regularFileFD struct { fileDescription - // These are immutable. - readable bool - writable bool - // off is the file offset. off is accessed using atomic memory operations. // offMu serializes operations that may mutate off. off int64 @@ -113,16 +109,11 @@ type regularFileFD struct { // Release implements vfs.FileDescriptionImpl.Release. func (fd *regularFileFD) Release() { - if fd.writable { - fd.vfsfd.VirtualDentry().Mount().EndWrite() - } + // noop } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - if !fd.readable { - return 0, syserror.EINVAL - } if offset < 0 { return 0, syserror.EINVAL } @@ -147,9 +138,6 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - if !fd.writable { - return 0, syserror.EINVAL - } if offset < 0 { return 0, syserror.EINVAL } diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index bf7461cbb..6f83e3cee 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -66,7 +66,7 @@ func NewVFSPipe(sizeBytes, atomicIOBytes int64) *VFSPipe { // for read and write will succeed both in blocking and nonblocking mode. POSIX // leaves this behavior undefined. This can be used to open a FIFO for writing // while there are no readers available." - fifo(7) -func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { +func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { vp.mu.Lock() defer vp.mu.Unlock() @@ -76,7 +76,7 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd return nil, syserror.EINVAL } - vfd, err := vp.open(rp, vfsd, vfsfd, flags) + vfd, err := vp.open(vfsd, vfsfd, flags) if err != nil { return nil, err } @@ -118,19 +118,13 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd } // Preconditions: vp.mu must be held. -func (vp *VFSPipe) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { +func (vp *VFSPipe) open(vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { var fd VFSPipeFD fd.flags = flags fd.readable = vfs.MayReadFileWithOpenFlags(flags) fd.writable = vfs.MayWriteFileWithOpenFlags(flags) fd.vfsfd = vfsfd fd.pipe = &vp.pipe - if fd.writable { - // The corresponding Mount.EndWrite() is in VFSPipe.Release(). - if err := rp.Mount().CheckBeginWrite(); err != nil { - return nil, err - } - } switch { case fd.readable && fd.writable: diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 6afe280bc..51c95c2d9 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -49,8 +49,23 @@ type FileDescription struct { // A reference is held on vd. vd is immutable. vd VirtualDentry + // opts contains options passed to FileDescription.Init(). opts is + // immutable. opts FileDescriptionOptions + // readable is MayReadFileWithOpenFlags(statusFlags). readable is + // immutable. + // + // readable is analogous to Linux's FMODE_READ. + readable bool + + // writable is MayWriteFileWithOpenFlags(statusFlags). If writable is true, + // the FileDescription holds a write count on vd.mount. writable is + // immutable. + // + // writable is analogous to Linux's FMODE_WRITE. + writable bool + // impl is the FileDescriptionImpl associated with this Filesystem. impl is // immutable. This should be the last field in FileDescription. impl FileDescriptionImpl @@ -77,10 +92,17 @@ type FileDescriptionOptions struct { UseDentryMetadata bool } -// Init must be called before first use of fd. It takes references on mnt and -// d. statusFlags is the initial file description status flags, which is -// usually the full set of flags passed to open(2). -func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) { +// Init must be called before first use of fd. If it succeeds, it takes +// references on mnt and d. statusFlags is the initial file description status +// flags, which is usually the full set of flags passed to open(2). +func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error { + writable := MayWriteFileWithOpenFlags(statusFlags) + if writable { + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + } + fd.refs = 1 fd.statusFlags = statusFlags | linux.O_LARGEFILE fd.vd = VirtualDentry{ @@ -89,7 +111,10 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn } fd.vd.IncRef() fd.opts = *opts + fd.readable = MayReadFileWithOpenFlags(statusFlags) + fd.writable = writable fd.impl = impl + return nil } // IncRef increments fd's reference count. @@ -117,6 +142,9 @@ func (fd *FileDescription) TryIncRef() bool { func (fd *FileDescription) DecRef() { if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { fd.impl.Release() + if fd.writable { + fd.vd.mount.EndWrite() + } fd.vd.DecRef() } else if refs < 0 { panic("FileDescription.DecRef() called without holding a reference") @@ -194,6 +222,16 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede return nil } +// IsReadable returns true if fd was opened for reading. +func (fd *FileDescription) IsReadable() bool { + return fd.readable +} + +// IsWritable returns true if fd was opened for writing. +func (fd *FileDescription) IsWritable() bool { + return fd.writable +} + // Impl returns the FileDescriptionImpl associated with fd. func (fd *FileDescription) Impl() FileDescriptionImpl { return fd.impl @@ -241,6 +279,8 @@ type FileDescriptionImpl interface { // Errors: // // - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for reading. PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) // Read is similar to PRead, but does not specify an offset. @@ -254,6 +294,8 @@ type FileDescriptionImpl interface { // Errors: // // - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for reading. Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) // PWrite writes src to the file, starting at the given offset, and returns @@ -268,6 +310,8 @@ type FileDescriptionImpl interface { // // - If opts.Flags specifies unsupported options, PWrite returns // EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for writing. PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) // Write is similar to PWrite, but does not specify an offset, which is @@ -281,6 +325,8 @@ type FileDescriptionImpl interface { // Errors: // // - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for writing. Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) // IterDirents invokes cb on each entry in the directory represented by the @@ -411,11 +457,17 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { // offset, and returns the number of bytes read. PRead is permitted to return // partial reads with a nil error. func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + if !fd.readable { + return 0, syserror.EBADF + } return fd.impl.PRead(ctx, dst, offset, opts) } // Read is similar to PRead, but does not specify an offset. func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + if !fd.readable { + return 0, syserror.EBADF + } return fd.impl.Read(ctx, dst, opts) } @@ -423,11 +475,17 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt // offset, and returns the number of bytes written. PWrite is permitted to // return partial writes with a nil error. func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + if !fd.writable { + return 0, syserror.EBADF + } return fd.impl.PWrite(ctx, src, offset, opts) } // Write is similar to PWrite, but does not specify an offset. func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + if !fd.writable { + return 0, syserror.EBADF + } return fd.impl.Write(ctx, src, opts) } diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index d279d05ca..f664581f4 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -94,14 +94,13 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo // the set of accesses permitted for the opened file: // // - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it -// mutates the file), but does not permit the opened to write to the file +// mutates the file), but does not permit writing to the open file description // thereafter. // // - "Linux reserves the special, nonstandard access mode 3 (binary 11) in // flags to mean: check for read and write permission on the file and return a // file descriptor that can't be used for reading or writing." - open(2). Thus -// AccessTypesForOpenFlags returns MayRead|MayWrite in this case, but -// filesystems are responsible for ensuring that access is denied. +// AccessTypesForOpenFlags returns MayRead|MayWrite in this case. // // Use May{Read,Write}FileWithOpenFlags() for these checks instead. func AccessTypesForOpenFlags(flags uint32) AccessTypes { -- cgit v1.2.3 From a6024f7f5f6f438c11e30be0f93657b1956fd5ba Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Thu, 13 Feb 2020 17:56:34 -0800 Subject: Add FileExec flag to OpenOptions This allow callers to say whether the file is being opened to be executed, so that the proper checks can be done from FilesystemImpl.OpenAt() Updates #1623 PiperOrigin-RevId: 295042595 --- pkg/sentry/fsimpl/ext/filesystem.go | 2 +- pkg/sentry/fsimpl/ext/inode.go | 12 ++++++------ pkg/sentry/fsimpl/gofer/filesystem.go | 24 ++++++++++++------------ pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 4 ++-- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 6 +++--- pkg/sentry/fsimpl/kernfs/filesystem.go | 10 +++++----- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 6 +++--- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 8 ++++---- pkg/sentry/fsimpl/proc/subtasks.go | 4 ++-- pkg/sentry/fsimpl/proc/task.go | 4 ++-- pkg/sentry/fsimpl/proc/tasks.go | 4 ++-- pkg/sentry/fsimpl/sys/sys.go | 4 ++-- pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- pkg/sentry/vfs/options.go | 5 +++++ pkg/sentry/vfs/permissions.go | 19 ++++++++++++------- pkg/sentry/vfs/vfs.go | 19 +++++++++++++++++++ 17 files changed, 82 insertions(+), 53 deletions(-) (limited to 'pkg/sentry/fsimpl/ext/inode.go') diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 07bf58953..e05429d41 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -296,7 +296,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 { return nil, syserror.EROFS } - return inode.open(rp, vfsd, opts.Flags) + return inode.open(rp, vfsd, &opts) } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 191b39970..6962083f5 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -148,8 +148,8 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { } // open creates and returns a file description for the dentry passed in. -func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { - ats := vfs.AccessTypesForOpenFlags(flags) +func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) if err := in.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } @@ -157,7 +157,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v switch in.impl.(type) { case *regularFile: var fd regularFileFD - if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil @@ -168,17 +168,17 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.EISDIR } var fd directoryFD - if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil case *symlink: - if flags&linux.O_PATH == 0 { + if opts.Flags&linux.O_PATH == 0 { // Can't open symlinks without O_PATH. return nil, syserror.ELOOP } var fd symlinkFD - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil default: panic(fmt.Sprintf("unknown inode type: %T", in.impl)) diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 8eb61debf..138adb9f7 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -593,7 +593,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf } } if rp.Done() { - return start.openLocked(ctx, rp, opts.Flags) + return start.openLocked(ctx, rp, &opts) } afterTrailingSymlink: @@ -633,12 +633,12 @@ afterTrailingSymlink: start = parent goto afterTrailingSymlink } - return child.openLocked(ctx, rp, opts.Flags) + return child.openLocked(ctx, rp, &opts) } // Preconditions: fs.renameMu must be locked. -func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags uint32) (*vfs.FileDescription, error) { - ats := vfs.AccessTypesForOpenFlags(flags) +func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil { return nil, err } @@ -646,11 +646,11 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui filetype := d.fileType() switch { case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD: - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0); err != nil { + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil { return nil, err } fd := ®ularFileFD{} - if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ AllowDirectIO: true, }); err != nil { return nil, err @@ -658,21 +658,21 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui return &fd.vfsfd, nil case filetype == linux.S_IFDIR: // Can't open directories with O_CREAT. - if flags&linux.O_CREAT != 0 { + if opts.Flags&linux.O_CREAT != 0 { return nil, syserror.EISDIR } // Can't open directories writably. if ats&vfs.MayWrite != 0 { return nil, syserror.EISDIR } - if flags&linux.O_DIRECT != 0 { + if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL } if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { return nil, err } fd := &directoryFD{} - if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil @@ -680,17 +680,17 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui // Can't open symlinks without O_PATH (which is unimplemented). return nil, syserror.ELOOP default: - if flags&linux.O_DIRECT != 0 { + if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL } - h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0) + h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0) if err != nil { return nil, err } fd := &specialFileFD{ handle: h, } - if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { h.close(ctx) return nil, err } diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 733792c78..d092ccb2a 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -53,9 +53,9 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.Dy } // Open implements Inode.Open. -func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} - if err := fd.Init(rp.Mount(), vfsd, f.data, flags); err != nil { + if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil { return nil, err } return &fd.vfsfd, nil diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 6104751c8..eda781155 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -43,12 +43,12 @@ type GenericDirectoryFD struct { } // Init initializes a GenericDirectoryFD. -func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) error { - if vfs.AccessTypesForOpenFlags(flags)&vfs.MayWrite != 0 { +func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) error { + if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. return syserror.EISDIR } - if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { return err } fd.children = children diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index e49303c26..ee98eb66a 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -365,7 +365,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // appropriate bits in rp), but are returned by // FileDescriptionImpl.StatusFlags(). opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW - ats := vfs.AccessTypesForOpenFlags(opts.Flags) + ats := vfs.AccessTypesForOpenFlags(&opts) // Do not create new file. if opts.Flags&linux.O_CREAT == 0 { @@ -379,7 +379,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return inode.Open(rp, vfsd, opts.Flags) + return inode.Open(rp, vfsd, opts) } // May create new file. @@ -398,7 +398,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return inode.Open(rp, vfsd, opts.Flags) + return inode.Open(rp, vfsd, opts) } afterTrailingSymlink: parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) @@ -438,7 +438,7 @@ afterTrailingSymlink: return nil, err } parentVFSD.Impl().(*Dentry).InsertChild(pc, child) - return child.Impl().(*Dentry).inode.Open(rp, child, opts.Flags) + return child.Impl().(*Dentry).inode.Open(rp, child, opts) } // Open existing file or follow symlink. if mustCreate { @@ -463,7 +463,7 @@ afterTrailingSymlink: if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return childInode.Open(rp, childVFSD, opts.Flags) + return childInode.Open(rp, childVFSD, opts) } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index adca2313f..099d70a16 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -507,7 +507,7 @@ type InodeSymlink struct { } // Open implements Inode.Open. -func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ELOOP } @@ -549,8 +549,8 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.F } // Open implements kernfs.Inode. -func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 79ebea8a5..c74fa999b 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -303,7 +303,7 @@ type Inode interface { // inode for its lifetime. // // Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry. - Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) + Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) } type inodeRefs interface { diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index ee65cf491..96a16e654 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -113,9 +113,9 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod return &dir.dentry } -func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags); err != nil { + if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts); err != nil { return nil, err } return fd.VFSFileDescription(), nil @@ -143,9 +143,9 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte return &dir.dentry } -func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 353e37195..102af0e93 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -114,9 +114,9 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb } // Open implements kernfs.Inode. -func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index eb5bc62c0..2d814668a 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -98,9 +98,9 @@ func (i *taskInode) Valid(ctx context.Context) bool { } // Open implements kernfs.Inode. -func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 14bd334e8..ebe21630c 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -205,9 +205,9 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback } // Open implements kernfs.Inode. -func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index e35d52d17..d693fceae 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -97,9 +97,9 @@ func (d *dir) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error { } // Open implements kernfs.Inode.Open. -func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 72bc15264..8785452b6 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -334,7 +334,7 @@ afterTrailingSymlink: } func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) { - ats := vfs.AccessTypesForOpenFlags(opts.Flags) + ats := vfs.AccessTypesForOpenFlags(opts) if !afterCreate { if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil { return nil, err diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index b7774bf28..fdf8be157 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -72,6 +72,11 @@ type OpenOptions struct { // If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the // created file. Mode linux.FileMode + + // FileExec is set when the file is being opened to be executed. + // VirtualFilesystem.OpenAt() checks that the caller has execute permissions + // on the file, and that the file is a regular file. + FileExec bool } // ReadOptions contains options to FileDescription.PRead(), diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index f664581f4..8e250998a 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -103,17 +103,22 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo // AccessTypesForOpenFlags returns MayRead|MayWrite in this case. // // Use May{Read,Write}FileWithOpenFlags() for these checks instead. -func AccessTypesForOpenFlags(flags uint32) AccessTypes { - switch flags & linux.O_ACCMODE { +func AccessTypesForOpenFlags(opts *OpenOptions) AccessTypes { + ats := AccessTypes(0) + if opts.FileExec { + ats |= MayExec + } + + switch opts.Flags & linux.O_ACCMODE { case linux.O_RDONLY: - if flags&linux.O_TRUNC != 0 { - return MayRead | MayWrite + if opts.Flags&linux.O_TRUNC != 0 { + return ats | MayRead | MayWrite } - return MayRead + return ats | MayRead case linux.O_WRONLY: - return MayWrite + return ats | MayWrite default: - return MayRead | MayWrite + return ats | MayRead | MayWrite } } diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 908c69f91..9629afee9 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -379,6 +379,25 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) if err == nil { vfs.putResolvingPath(rp) + + // TODO(gvisor.dev/issue/1193): Move inside fsimpl to avoid another call + // to FileDescription.Stat(). + if opts.FileExec { + // Only a regular file can be executed. + stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) + if err != nil { + return nil, err + } + if stat.Mask&linux.STATX_TYPE != 0 { + // This shouldn't happen, but if type can't be retrieved, file can't + // be executed. + return nil, syserror.EACCES + } + if linux.FileMode(stat.Mode).FileType() != linux.ModeRegular { + return nil, syserror.EACCES + } + } + return fd, nil } if !rp.handleError(err) { -- cgit v1.2.3 From de694e5484502d53166d70b36141e62fcdf07803 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 25 Mar 2020 19:12:25 -0700 Subject: Combine file mode and isDir arguments Updates #1035 PiperOrigin-RevId: 303021328 --- pkg/abi/linux/file.go | 5 +++++ pkg/sentry/fsimpl/ext/inode.go | 2 +- pkg/sentry/fsimpl/gofer/filesystem.go | 22 +++++++++++----------- pkg/sentry/fsimpl/gofer/gofer.go | 7 ++++--- pkg/sentry/fsimpl/host/host.go | 6 +++--- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 3 +-- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 6 ++---- pkg/sentry/fsimpl/proc/task.go | 9 +-------- pkg/sentry/fsimpl/tmpfs/filesystem.go | 24 ++++++++++++------------ pkg/sentry/fsimpl/tmpfs/tmpfs.go | 8 +++++--- pkg/sentry/vfs/anonfs.go | 2 +- pkg/sentry/vfs/permissions.go | 23 ++++++++++------------- 12 files changed, 56 insertions(+), 61 deletions(-) (limited to 'pkg/sentry/fsimpl/ext/inode.go') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index dbe58acbe..055ac1d7c 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -287,6 +287,11 @@ func (m FileMode) ExtraBits() FileMode { return m &^ (PermissionsMask | FileTypeMask) } +// IsDir returns true if file type represents a directory. +func (m FileMode) IsDir() bool { + return m.FileType() == S_IFDIR +} + // String returns a string representation of m. func (m FileMode) String() string { var s []string diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 6962083f5..a39a37318 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -186,7 +186,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt } func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { - return vfs.GenericCheckPermissions(creds, ats, in.isDir(), uint16(in.diskInode.Mode()), in.diskInode.UID(), in.diskInode.GID()) + return vfs.GenericCheckPermissions(creds, ats, in.diskInode.Mode(), in.diskInode.UID(), in.diskInode.GID()) } // statTo writes the statx fields to the output parameter. diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 26b492185..1e43df9ec 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -119,7 +119,7 @@ func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d * if !d.isDir() { return nil, syserror.ENOTDIR } - if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } afterSymlink: @@ -314,7 +314,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err != nil { return err } - if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } if parent.isDeleted() { @@ -378,7 +378,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b if err != nil { return err } - if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -512,7 +512,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds if err != nil { return err } - return d.checkPermissions(creds, ats, d.isDir()) + return d.checkPermissions(creds, ats) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. @@ -528,7 +528,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op if !d.isDir() { return nil, syserror.ENOTDIR } - if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } @@ -624,7 +624,7 @@ afterTrailingSymlink: return nil, err } // Check for search permission in the parent directory. - if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Determine whether or not we need to create a file. @@ -661,7 +661,7 @@ afterTrailingSymlink: // Preconditions: fs.renameMu must be locked. func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) - if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil { + if err := d.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } mnt := rp.Mount() @@ -722,7 +722,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { - if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if d.isDeleted() { @@ -884,7 +884,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return err } } - if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } vfsObj := rp.VirtualFilesystem() @@ -904,7 +904,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return syserror.EINVAL } if oldParent != newParent { - if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return err } } @@ -915,7 +915,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if oldParent != newParent { - if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } newParent.dirMu.Lock() diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 13928ce36..cf276a417 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -721,7 +721,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(ctx, creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err } if err := mnt.CheckBeginWrite(); err != nil { @@ -843,8 +844,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin return nil } -func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { - return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&d.mode))&0777, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) } // IncRef implements vfs.DentryImpl.IncRef. diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 1f735628f..a54985ef5 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -167,8 +167,8 @@ func fileFlagsFromHostFD(fd int) (int, error) { } // CheckPermissions implements kernfs.Inode. -func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error { - return vfs.GenericCheckPermissions(creds, atx, false /* isDir */, uint16(i.mode), i.uid, i.gid) +func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, i.mode, i.uid, i.gid) } // Mode implements kernfs.Inode. @@ -306,7 +306,7 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(ctx, creds, &s, uint16(i.Mode().Permissions()), i.uid, i.gid); err != nil { + if err := vfs.CheckSetStat(ctx, creds, &s, i.Mode(), i.uid, i.gid); err != nil { return err } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 75c4bab1a..bfa786c88 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -206,8 +206,7 @@ func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (l // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - fs := fd.filesystem() creds := auth.CredentialsFromContext(ctx) inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode - return inode.SetStat(ctx, fs, creds, opts) + return inode.SetStat(ctx, fd.filesystem(), creds, opts) } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index c612dcf07..5c84b10c9 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -241,7 +241,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, uint16(a.Mode().Permissions()), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { return err } @@ -273,12 +273,10 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut // CheckPermissions implements Inode.CheckPermissions. func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { - mode := a.Mode() return vfs.GenericCheckPermissions( creds, ats, - mode.FileType() == linux.ModeDirectory, - uint16(mode), + a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid)), ) diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 49d6efb0e..aee2a4392 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -172,14 +172,7 @@ func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.S func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { mode := i.Mode() uid, gid := i.getOwner(mode) - return vfs.GenericCheckPermissions( - creds, - ats, - mode.FileType() == linux.ModeDirectory, - uint16(mode), - uid, - gid, - ) + return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid) } func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) { diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 75d01b853..12cc64385 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -41,7 +41,7 @@ func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { if !d.inode.isDir() { return nil, syserror.ENOTDIR } - if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } afterSymlink: @@ -125,7 +125,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() @@ -163,7 +163,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds if err != nil { return err } - return d.inode.checkPermissions(creds, ats, d.inode.isDir()) + return d.inode.checkPermissions(creds, ats) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. @@ -178,7 +178,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op if !d.inode.isDir() { return nil, syserror.ENOTDIR } - if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil { + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } @@ -301,7 +301,7 @@ afterTrailingSymlink: return nil, err } // Check for search permission in the parent directory. - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. @@ -316,7 +316,7 @@ afterTrailingSymlink: child, err := stepLocked(rp, parent) if err == syserror.ENOENT { // Already checked for searchability above; now check for writability. - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -347,7 +347,7 @@ afterTrailingSymlink: func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) if !afterCreate { - if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil { + if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } } @@ -428,7 +428,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer mnt.EndWrite() oldParent := oldParentVD.Dentry().Impl().(*dentry) - if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } // Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(), @@ -445,7 +445,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if oldParent != newParent { // Writability is needed to change renamed's "..". - if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil { + if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return err } } @@ -455,7 +455,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } - if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } replacedVFSD := newParent.vfsd.Child(newName) @@ -528,7 +528,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() @@ -621,7 +621,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 2d5070a46..2f9e6c876 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -245,8 +245,9 @@ func (i *inode) decRef() { } } -func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { - return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) +func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) } // Go won't inline this function, and returning linux.Statx (which is quite @@ -299,7 +300,8 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(ctx, creds, stat, uint16(atomic.LoadUint32(&i.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { return err } i.mu.Lock() diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index 925996517..a62e43589 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -83,7 +83,7 @@ func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds if !rp.Done() { return syserror.ENOTDIR } - return GenericCheckPermissions(creds, ats, false /* isDir */, anonFileMode, anonFileUID, anonFileGID) + return GenericCheckPermissions(creds, ats, anonFileMode, anonFileUID, anonFileGID) } // GetDentryAt implements FilesystemImpl.GetDentryAt. diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index 2c8f23f55..f9647f90e 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -29,9 +29,9 @@ type AccessTypes uint16 // Bits in AccessTypes. const ( + MayExec AccessTypes = 1 + MayWrite AccessTypes = 2 MayRead AccessTypes = 4 - MayWrite = 2 - MayExec = 1 ) // OnlyRead returns true if access _only_ allows read. @@ -56,16 +56,17 @@ func (a AccessTypes) MayExec() bool { // GenericCheckPermissions checks that creds has the given access rights on a // file with the given permissions, UID, and GID, subject to the rules of -// fs/namei.c:generic_permission(). isDir is true if the file is a directory. -func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir bool, mode uint16, kuid auth.KUID, kgid auth.KGID) error { +// fs/namei.c:generic_permission(). +func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { // Check permission bits. - perms := mode + perms := uint16(mode.Permissions()) if creds.EffectiveKUID == kuid { perms >>= 6 } else if creds.InGroup(kgid) { perms >>= 3 } if uint16(ats)&perms == uint16(ats) { + // All permission bits match, access granted. return nil } @@ -77,7 +78,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo } // CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary // directories, and read arbitrary non-directory files. - if (isDir && !ats.MayWrite()) || ats.OnlyRead() { + if (mode.IsDir() && !ats.MayWrite()) || ats.OnlyRead() { if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) { return nil } @@ -85,7 +86,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo // CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write // access to non-directory files, and execute access to non-directory files // for which at least one execute bit is set. - if isDir || !ats.MayExec() || (mode&0111 != 0) { + if mode.IsDir() || !ats.MayExec() || (mode.Permissions()&0111 != 0) { if creds.HasCapability(linux.CAP_DAC_OVERRIDE) { return nil } @@ -151,7 +152,7 @@ func MayWriteFileWithOpenFlags(flags uint32) bool { // CheckSetStat checks that creds has permission to change the metadata of a // file with the given permissions, UID, and GID as specified by stat, subject // to the rules of Linux's fs/attr.c:setattr_prepare(). -func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error { +func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { if stat.Mask&linux.STATX_SIZE != 0 { limit, err := CheckLimit(ctx, 0, int64(stat.Size)) if err != nil { @@ -190,11 +191,7 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat (stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) { return syserror.EPERM } - // isDir is irrelevant in the following call to - // GenericCheckPermissions since ats == MayWrite means that - // CAP_DAC_READ_SEARCH does not apply, and CAP_DAC_OVERRIDE - // applies, regardless of isDir. - if err := GenericCheckPermissions(creds, MayWrite, false /* isDir */, mode, kuid, kgid); err != nil { + if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil { return err } } -- cgit v1.2.3 From 9b5e305e05ef3ad51778981062d6152cea1cd4fb Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 21 Apr 2020 12:16:42 -0700 Subject: Remove filesystem structure from vfs.Dentry. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change: - Drastically simplifies the synchronization model: filesystem structure is both implementation-defined and implementation-synchronized. - Allows implementations of vfs.DentryImpl to use implementation-specific dentry types, reducing casts during path traversal. - Doesn't require dentries representing non-directory files to waste space on a map of children. - Allows dentry revalidation and mount lookup to be correctly ordered (fixed FIXME in fsimpl/gofer/filesystem.go). - Removes the need to have two separate maps in gofer.dentry (dentry.vfsd.children and dentry.negativeChildren) for positive and negative lookups respectively. //pkg/sentry/fsimpl/tmpfs/benchmark_test.go: name old time/op new time/op delta VFS2TmpfsStat/1-112 172ns ± 4% 165ns ± 3% -4.08% (p=0.002 n=9+9) VFS2TmpfsStat/2-112 199ns ± 3% 195ns ±10% ~ (p=0.132 n=8+9) VFS2TmpfsStat/3-112 230ns ± 2% 216ns ± 2% -6.15% (p=0.000 n=8+8) VFS2TmpfsStat/8-112 390ns ± 2% 358ns ± 4% -8.33% (p=0.000 n=9+8) VFS2TmpfsStat/64-112 2.20µs ± 3% 2.01µs ± 3% -8.48% (p=0.000 n=10+8) VFS2TmpfsStat/100-112 3.42µs ± 9% 3.08µs ± 2% -9.82% (p=0.000 n=9+8) VFS2TmpfsMountStat/1-112 278ns ± 1% 286ns ±15% ~ (p=0.712 n=8+10) VFS2TmpfsMountStat/2-112 311ns ± 4% 298ns ± 2% -4.27% (p=0.000 n=9+8) VFS2TmpfsMountStat/3-112 339ns ± 3% 330ns ± 9% ~ (p=0.070 n=8+9) VFS2TmpfsMountStat/8-112 503ns ± 3% 466ns ± 3% -7.38% (p=0.000 n=8+8) VFS2TmpfsMountStat/64-112 2.53µs ±16% 2.17µs ± 7% -14.19% (p=0.000 n=10+9) VFS2TmpfsMountStat/100-112 3.60µs ± 4% 3.30µs ± 8% -8.33% (p=0.001 n=8+9) Updates #1035 PiperOrigin-RevId: 307655892 --- pkg/sentry/fsimpl/ext/BUILD | 12 ++ pkg/sentry/fsimpl/ext/dentry.go | 4 + pkg/sentry/fsimpl/ext/directory.go | 21 ++- pkg/sentry/fsimpl/ext/filesystem.go | 54 ++++-- pkg/sentry/fsimpl/ext/inode.go | 2 +- pkg/sentry/fsimpl/gofer/BUILD | 12 ++ pkg/sentry/fsimpl/gofer/directory.go | 55 +++--- pkg/sentry/fsimpl/gofer/filesystem.go | 202 +++++++++++--------- pkg/sentry/fsimpl/gofer/gofer.go | 66 ++++--- pkg/sentry/fsimpl/gofer/gofer_test.go | 3 +- pkg/sentry/fsimpl/kernfs/BUILD | 12 ++ pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 2 +- pkg/sentry/fsimpl/kernfs/filesystem.go | 159 +++++++++------- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs.go | 33 ++-- pkg/sentry/fsimpl/proc/tasks_test.go | 16 +- pkg/sentry/fsimpl/tmpfs/BUILD | 12 ++ pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 7 - pkg/sentry/fsimpl/tmpfs/directory.go | 84 ++++++--- pkg/sentry/fsimpl/tmpfs/filesystem.go | 249 +++++++++++++------------ pkg/sentry/fsimpl/tmpfs/stat_test.go | 12 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 82 ++++---- pkg/sentry/vfs/dentry.go | 259 +++++--------------------- pkg/sentry/vfs/file_description.go | 3 +- pkg/sentry/vfs/filesystem.go | 5 +- pkg/sentry/vfs/filesystem_impl_util.go | 26 --- pkg/sentry/vfs/genericfstree/BUILD | 16 ++ pkg/sentry/vfs/genericfstree/genericfstree.go | 80 ++++++++ pkg/sentry/vfs/mount.go | 9 +- pkg/sentry/vfs/pathname.go | 6 +- pkg/sentry/vfs/resolving_path.go | 85 +++------ 31 files changed, 836 insertions(+), 754 deletions(-) create mode 100644 pkg/sentry/vfs/genericfstree/BUILD create mode 100644 pkg/sentry/vfs/genericfstree/genericfstree.go (limited to 'pkg/sentry/fsimpl/ext/inode.go') diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index d83d75b3d..a4947c480 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -15,6 +15,17 @@ go_template_instance( }, ) +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "ext", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + go_library( name = "ext", srcs = [ @@ -26,6 +37,7 @@ go_library( "extent_file.go", "file_description.go", "filesystem.go", + "fstree.go", "inode.go", "regular_file.go", "symlink.go", diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index a080cb189..bfbd7c3d4 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -22,6 +22,10 @@ import ( type dentry struct { vfsd vfs.Dentry + // Protected by filesystem.mu. + parent *dentry + name string + // inode is the inode represented by this dentry. Multiple Dentries may // share a single non-directory Inode (with hard links). inode is // immutable. diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index bd6ede995..12b875c8f 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -31,6 +30,10 @@ import ( type directory struct { inode inode + // childCache maps filenames to dentries for children for which dentries + // have been instantiated. childCache is protected by filesystem.mu. + childCache map[string]*dentry + // mu serializes the changes to childList. // Lock Order (outermost locks must be taken first): // directory.mu @@ -50,9 +53,13 @@ type directory struct { childMap map[string]*dirent } -// newDirectroy is the directory constructor. -func newDirectroy(inode inode, newDirent bool) (*directory, error) { - file := &directory{inode: inode, childMap: make(map[string]*dirent)} +// newDirectory is the directory constructor. +func newDirectory(inode inode, newDirent bool) (*directory, error) { + file := &directory{ + inode: inode, + childCache: make(map[string]*dentry), + childMap: make(map[string]*dirent), + } file.inode.impl = file // Initialize childList by reading dirents from the underlying file. @@ -299,9 +306,3 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in fd.off = offset return offset, nil } - -// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. -func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - // mmap(2) specifies that EACCESS should be returned for non-regular file fds. - return syserror.EACCES -} diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index afea58f65..2c22a04af 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -89,14 +89,33 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo } for { - nextVFSD, err := rp.ResolveComponent(vfsd) - if err != nil { - return nil, nil, err + name := rp.Component() + if name == "." { + rp.Advance() + return vfsd, inode, nil } - if nextVFSD == nil { - // Since the Dentry tree is not the sole source of truth for extfs, if it's - // not in the Dentry tree, it might need to be pulled from disk. - childDirent, ok := inode.impl.(*directory).childMap[rp.Component()] + d := vfsd.Impl().(*dentry) + if name == ".." { + isRoot, err := rp.CheckRoot(vfsd) + if err != nil { + return nil, nil, err + } + if isRoot || d.parent == nil { + rp.Advance() + return vfsd, inode, nil + } + if err := rp.CheckMount(&d.parent.vfsd); err != nil { + return nil, nil, err + } + rp.Advance() + return &d.parent.vfsd, d.parent.inode, nil + } + + dir := inode.impl.(*directory) + child, ok := dir.childCache[name] + if !ok { + // We may need to instantiate a new dentry for this child. + childDirent, ok := dir.childMap[name] if !ok { // The underlying inode does not exist on disk. return nil, nil, syserror.ENOENT @@ -115,21 +134,22 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo } // incRef because this is being added to the dentry tree. childInode.incRef() - child := newDentry(childInode) - vfsd.InsertChild(&child.vfsd, rp.Component()) - - // Continue as usual now that nextVFSD is not nil. - nextVFSD = &child.vfsd + child = newDentry(childInode) + child.parent = d + child.name = name + dir.childCache[name] = child + } + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, nil, err } - nextInode := nextVFSD.Impl().(*dentry).inode - if nextInode.isSymlink() && rp.ShouldFollowSymlink() { - if err := rp.HandleSymlink(inode.impl.(*symlink).target); err != nil { + if child.inode.isSymlink() && rp.ShouldFollowSymlink() { + if err := rp.HandleSymlink(child.inode.impl.(*symlink).target); err != nil { return nil, nil, err } continue } rp.Advance() - return nextVFSD, nextInode, nil + return &child.vfsd, child.inode, nil } } @@ -515,5 +535,5 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index a39a37318..a98512350 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -136,7 +136,7 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { } return &f.inode, nil case linux.ModeDirectory: - f, err := newDirectroy(inode, fs.sb.IncompatibleFeatures().DirentFileType) + f, err := newDirectory(inode, fs.sb.IncompatibleFeatures().DirentFileType) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 99d1e3f8f..acd061905 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -15,12 +15,24 @@ go_template_instance( }, ) +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "gofer", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + go_library( name = "gofer", srcs = [ "dentry_list.go", "directory.go", "filesystem.go", + "fstree.go", "gofer.go", "handle.go", "handle_unsafe.go", diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 49d9f859b..d02691232 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -29,13 +29,25 @@ func (d *dentry) isDir() bool { return d.fileType() == linux.S_IFDIR } +// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked. +// d.isDir(). child must be a newly-created dentry that has never had a parent. +func (d *dentry) cacheNewChildLocked(child *dentry, name string) { + d.IncRef() // reference held by child on its parent + child.parent = d + child.name = name + if d.children == nil { + d.children = make(map[string]*dentry) + } + d.children[name] = child +} + // Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop != // InteropModeShared. func (d *dentry) cacheNegativeChildLocked(name string) { - if d.negativeChildren == nil { - d.negativeChildren = make(map[string]struct{}) + if d.children == nil { + d.children = make(map[string]*dentry) } - d.negativeChildren[name] = struct{}{} + d.children[name] = nil } type directoryFD struct { @@ -80,34 +92,32 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba // Preconditions: d.isDir(). There exists at least one directoryFD representing d. func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { - // 9P2000.L's readdir does not specify behavior in the presence of - // concurrent mutation of an iterated directory, so implementations may - // duplicate or omit entries in this case, which violates POSIX semantics. - // Thus we read all directory entries while holding d.dirMu to exclude - // directory mutations. (Note that it is impossible for the client to - // exclude concurrent mutation from other remote filesystem users. Since - // there is no way to detect if the server has incorrectly omitted - // directory entries, we simply assume that the server is well-behaved - // under InteropModeShared.) This is inconsistent with Linux (which appears - // to assume that directory fids have the correct semantics, and translates - // struct file_operations::readdir calls directly to readdir RPCs), but is - // consistent with VFS1. - // - // NOTE(b/135560623): In particular, some gofer implementations may not - // retain state between calls to Readdir, so may not provide a coherent - // directory stream across in the presence of mutation. - + // NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the + // presence of concurrent mutation of an iterated directory, so + // implementations may duplicate or omit entries in this case, which + // violates POSIX semantics. Thus we read all directory entries while + // holding d.dirMu to exclude directory mutations. (Note that it is + // impossible for the client to exclude concurrent mutation from other + // remote filesystem users. Since there is no way to detect if the server + // has incorrectly omitted directory entries, we simply assume that the + // server is well-behaved under InteropModeShared.) This is inconsistent + // with Linux (which appears to assume that directory fids have the correct + // semantics, and translates struct file_operations::readdir calls directly + // to readdir RPCs), but is consistent with VFS1. + + // filesystem.renameMu is needed for d.parent, and must be locked before + // dentry.dirMu. d.fs.renameMu.RLock() - defer d.fs.renameMu.RUnlock() d.dirMu.Lock() defer d.dirMu.Unlock() if d.dirents != nil { + d.fs.renameMu.RUnlock() return d.dirents, nil } // It's not clear if 9P2000.L's readdir is expected to return "." and "..", // so we generate them here. - parent := d.vfsd.ParentOrSelf().Impl().(*dentry) + parent := genericParentOrSelf(d) dirents := []vfs.Dirent{ { Name: ".", @@ -122,6 +132,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { NextOff: 2, }, } + d.fs.renameMu.RUnlock() off := uint64(0) const count = 64 * 1024 // for consistency with the vfs1 client d.handleMu.RLock() diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index cd744bf5e..43e863c61 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -116,6 +116,8 @@ func putDentrySlice(ds *[]*dentry) { // Preconditions: fs.renameMu must be locked. d.dirMu must be locked. // !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached // metadata must be up to date. +// +// Postconditions: The returned dentry's cached metadata is up to date. func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { if !d.isDir() { return nil, syserror.ENOTDIR @@ -130,39 +132,42 @@ afterSymlink: return d, nil } if name == ".." { - parentVFSD, err := rp.ResolveParent(&d.vfsd) - if err != nil { + if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return d, nil + } + // We must assume that d.parent is correct, because if d has been moved + // elsewhere in the remote filesystem so that its parent has changed, + // we have no way of determining its new parent's location in the + // filesystem. + // + // Call rp.CheckMount() before updating d.parent's metadata, since if + // we traverse to another mount then d.parent's metadata is irrelevant. + if err := rp.CheckMount(&d.parent.vfsd); err != nil { return nil, err } - parent := parentVFSD.Impl().(*dentry) - if fs.opts.interop == InteropModeShared { - // We must assume that parentVFSD is correct, because if d has been - // moved elsewhere in the remote filesystem so that its parent has - // changed, we have no way of determining its new parent's location - // in the filesystem. Get updated metadata for parentVFSD. - _, attrMask, attr, err := parent.file.getAttr(ctx, dentryAttrMask()) + if fs.opts.interop == InteropModeShared && d != d.parent { + _, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask()) if err != nil { return nil, err } - parent.updateFromP9Attrs(attrMask, &attr) + d.parent.updateFromP9Attrs(attrMask, &attr) } rp.Advance() - return parent, nil + return d.parent, nil } - childVFSD, err := rp.ResolveChild(&d.vfsd, name) - if err != nil { - return nil, err - } - // FIXME(jamieliu): Linux performs revalidation before mount lookup - // (fs/namei.c:lookup_fast() => __d_lookup_rcu(), d_revalidate(), - // __follow_mount_rcu()). - child, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, childVFSD, ds) + child, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), d, name, ds) if err != nil { return nil, err } if child == nil { return nil, syserror.ENOENT } + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, err + } if child.isSymlink() && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx, rp.Mount()) if err != nil { @@ -177,38 +182,37 @@ afterSymlink: return child, nil } -// revalidateChildLocked must be called after a call to parent.vfsd.Child(name) -// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be -// nil) to verify that the returned child (or lack thereof) is correct. If no file -// exists at name, revalidateChildLocked returns (nil, nil). +// getChildLocked returns a dentry representing the child of parent with the +// given name. If no such child exists, getChildLocked returns (nil, nil). // // Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. // parent.isDir(). name is not "." or "..". // -// Postconditions: If revalidateChildLocked returns a non-nil dentry, its -// cached metadata is up to date. -func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, childVFSD *vfs.Dentry, ds **[]*dentry) (*dentry, error) { - if childVFSD != nil && fs.opts.interop != InteropModeShared { - // We have a cached dentry that is assumed to be correct. - return childVFSD.Impl().(*dentry), nil - } - // We either don't have a cached dentry or need to verify that it's still - // correct, either of which requires a remote lookup. Check if this name is - // valid before performing the lookup. +// Postconditions: If getChildLocked returns a non-nil dentry, its cached +// metadata is up to date. +func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if len(name) > maxFilenameLen { return nil, syserror.ENAMETOOLONG } - // Check if we've already cached this lookup with a negative result. - if _, ok := parent.negativeChildren[name]; ok { - return nil, nil + child, ok := parent.children[name] + if ok && fs.opts.interop != InteropModeShared { + // Whether child is nil or not, it is cached information that is + // assumed to be correct. + return child, nil } - // Perform the remote lookup. + // We either don't have cached information or need to verify that it's + // still correct, either of which requires a remote lookup. Check if this + // name is valid before performing the lookup. + return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds) +} + +// Preconditions: As for getChildLocked. +func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) { qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) if err != nil && err != syserror.ENOENT { return nil, err } - if childVFSD != nil { - child := childVFSD.Impl().(*dentry) + if child != nil { if !file.isNil() && qid.Path == child.ino { // The file at this path hasn't changed. Just update cached // metadata. @@ -219,9 +223,8 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // The file at this path has changed or no longer exists. Remove // the stale dentry from the tree, and re-evaluate its caching // status (i.e. if it has 0 references, drop it). - vfsObj.ForceDeleteDentry(childVFSD) + vfsObj.InvalidateDentry(&child.vfsd) *ds = appendDentry(*ds, child) - childVFSD = nil } if file.isNil() { // No file exists at this path now. Cache the negative lookup if @@ -232,13 +235,12 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir return nil, nil } // Create a new dentry representing the file. - child, err := fs.newDentry(ctx, file, qid, attrMask, &attr) + child, err = fs.newDentry(ctx, file, qid, attrMask, &attr) if err != nil { file.close(ctx) return nil, err } - parent.IncRef() // reference held by child on its parent - parent.vfsd.InsertChild(&child.vfsd, name) + parent.cacheNewChildLocked(child, name) // For now, child has 0 references, so our caller should call // child.checkCachingLocked(). *ds = appendDentry(*ds, child) @@ -318,9 +320,6 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } - if parent.isDeleted() { - return syserror.ENOENT - } name := rp.Component() if name == "." || name == ".." { return syserror.EEXIST @@ -331,6 +330,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if !dir && rp.MustBeDir() { return syserror.ENOENT } + if parent.isDeleted() { + return syserror.ENOENT + } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return err @@ -348,7 +350,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir // it's used. return create(parent, name) } - if parent.vfsd.Child(name) != nil { + if child := parent.children[name]; child != nil { return syserror.EEXIST } // No cached dentry exists; however, there might still be an existing file @@ -356,10 +358,11 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err := create(parent, name); err != nil { return err } - if fs.opts.interop != InteropModeShared { - parent.touchCMtime() - } - delete(parent.negativeChildren, name) + parent.touchCMtime() + // Either parent.children[name] doesn't exist (in which case this is a + // no-op) or is nil (in which case this erases the now-stale information + // that the file doesn't exist). + delete(parent.children, name) parent.dirents = nil return nil } @@ -407,56 +410,55 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b defer mntns.DecRef() parent.dirMu.Lock() defer parent.dirMu.Unlock() - childVFSD := parent.vfsd.Child(name) - var child *dentry + child, ok := parent.children[name] + if ok && child == nil { + return syserror.ENOENT + } // We only need a dentry representing the file at name if it can be a mount - // point. If childVFSD is nil, then it can't be a mount point. If childVFSD - // is non-nil but stale, the actual file can't be a mount point either; we + // point. If child is nil, then it can't be a mount point. If child is + // non-nil but stale, the actual file can't be a mount point either; we // detect this case by just speculatively calling PrepareDeleteDentry and // only revalidating the dentry if that fails (indicating that the existing // dentry is a mount point). - if childVFSD != nil { - child = childVFSD.Impl().(*dentry) - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { - child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, childVFSD, &ds) + if child != nil { + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { + if fs.opts.interop != InteropModeShared { + return err + } + child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds) if err != nil { return err } if child != nil { - childVFSD = &child.vfsd - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } - } else { - childVFSD = nil } } - } else if _, ok := parent.negativeChildren[name]; ok { - return syserror.ENOENT } flags := uint32(0) if dir { if child != nil && !child.isDir() { - vfsObj.AbortDeleteDentry(childVFSD) + vfsObj.AbortDeleteDentry(&child.vfsd) return syserror.ENOTDIR } flags = linux.AT_REMOVEDIR } else { if child != nil && child.isDir() { - vfsObj.AbortDeleteDentry(childVFSD) + vfsObj.AbortDeleteDentry(&child.vfsd) return syserror.EISDIR } if rp.MustBeDir() { - if childVFSD != nil { - vfsObj.AbortDeleteDentry(childVFSD) + if child != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) } return syserror.ENOTDIR } } err = parent.file.unlinkAt(ctx, name, flags) if err != nil { - if childVFSD != nil { - vfsObj.AbortDeleteDentry(childVFSD) + if child != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) } return err } @@ -467,10 +469,12 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } parent.cacheNegativeChildLocked(name) parent.dirents = nil + } else { + delete(parent.children, name) } if child != nil { child.setDeleted() - vfsObj.CommitDeleteDentry(childVFSD) + vfsObj.CommitDeleteDentry(&child.vfsd) ds = appendDentry(ds, child) } return nil @@ -806,16 +810,14 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving // eligible for caching yet, so we don't need to append to a dentry slice.) child.refs = 1 // Insert the dentry into the tree. - d.IncRef() // reference held by child on its parent d - d.vfsd.InsertChild(&child.vfsd, name) + d.cacheNewChildLocked(child, name) if d.fs.opts.interop != InteropModeShared { - delete(d.negativeChildren, name) + d.touchCMtime() d.dirents = nil } // Finally, construct a file description representing the created file. var childVFSFD *vfs.FileDescription - mnt.IncRef() if useRegularFileFD { fd := ®ularFileFD{} if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ @@ -840,9 +842,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } - if d.fs.opts.interop != InteropModeShared { - d.touchCMtime() - } return childVFSFD, nil } @@ -902,7 +901,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // directory, we need to check for write permission on it. oldParent.dirMu.Lock() defer oldParent.dirMu.Unlock() - renamed, err := fs.revalidateChildLocked(ctx, vfsObj, oldParent, oldName, oldParent.vfsd.Child(oldName), &ds) + renamed, err := fs.getChildLocked(ctx, vfsObj, oldParent, oldName, &ds) if err != nil { return err } @@ -910,7 +909,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return syserror.ENOENT } if renamed.isDir() { - if renamed == newParent || renamed.vfsd.IsAncestorOf(&newParent.vfsd) { + if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { return syserror.EINVAL } if oldParent != newParent { @@ -934,16 +933,17 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if newParent.isDeleted() { return syserror.ENOENT } - replacedVFSD := newParent.vfsd.Child(newName) - var replaced *dentry + replaced := newParent.children[newName] // This is similar to unlinkAt, except: // - // - We revalidate the replaced dentry unconditionally for simplicity. + // - If a dentry exists for the file to be replaced, we revalidate it + // unconditionally (instead of only if PrepareRenameDentry fails) for + // simplicity. // // - If rp.MustBeDir(), then we need a dentry representing the replaced // file regardless to confirm that it's a directory. - if replacedVFSD != nil || rp.MustBeDir() { - replaced, err = fs.revalidateChildLocked(ctx, vfsObj, newParent, newName, replacedVFSD, &ds) + if replaced != nil || rp.MustBeDir() { + replaced, err = fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds) if err != nil { return err } @@ -957,11 +957,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return syserror.ENOTDIR } } - replacedVFSD = &replaced.vfsd - } else { - replacedVFSD = nil } } + var replacedVFSD *vfs.Dentry + if replaced != nil { + replacedVFSD = &replaced.vfsd + } if oldParent == newParent && oldName == newName { return nil @@ -978,7 +979,6 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if fs.opts.interop != InteropModeShared { oldParent.cacheNegativeChildLocked(oldName) oldParent.dirents = nil - delete(newParent.negativeChildren, newName) newParent.dirents = nil if renamed.isDir() { oldParent.decLinks() @@ -987,8 +987,24 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa oldParent.touchCMtime() newParent.touchCMtime() renamed.touchCtime() + } else { + delete(oldParent.children, oldName) + } + if oldParent != newParent { + appendDentry(ds, oldParent) + newParent.IncRef() + } + renamed.parent = newParent + renamed.name = newName + if newParent.children == nil { + newParent.children = make(map[string]*dentry) + } + newParent.children[newName] = renamed + if replaced != nil { + replaced.setDeleted() + appendDentry(ds, replaced) } - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD) + vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) return nil } @@ -1131,5 +1147,5 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.renameMu.RLock() defer fs.renameMu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 2485cdb53..293df2545 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -452,6 +452,16 @@ type dentry struct { // fs is the owning filesystem. fs is immutable. fs *filesystem + // parent is this dentry's parent directory. Each dentry holds a reference + // on its parent. If this dentry is a filesystem root, parent is nil. + // parent is protected by filesystem.renameMu. + parent *dentry + + // name is the name of this dentry in its parent. If this dentry is a + // filesystem root, name is the empty string. name is protected by + // filesystem.renameMu. + name string + // We don't support hard links, so each dentry maps 1:1 to an inode. // file is the unopened p9.File that backs this dentry. file is immutable. @@ -469,10 +479,15 @@ type dentry struct { dirMu sync.Mutex - // If this dentry represents a directory, and InteropModeShared is not in - // effect, negativeChildren is a set of child names in this directory that - // are known not to exist. negativeChildren is protected by dirMu. - negativeChildren map[string]struct{} + // If this dentry represents a directory, children contains: + // + // - Mappings of child filenames to dentries representing those children. + // + // - Mappings of child filenames that are known not to exist to nil + // dentries (only if InteropModeShared is not in effect). + // + // children is protected by dirMu. + children map[string]*dentry // If this dentry represents a directory, InteropModeShared is not in // effect, and dirents is not nil, it is a cache of all entries in the @@ -910,9 +925,9 @@ func (d *dentry) checkCachingLocked() { // Dentry has already been destroyed. return } - // Non-child dentries with zero references are no longer reachable by path - // resolution and should be dropped immediately. - if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() { + // Deleted and invalidated dentries with zero references are no longer + // reachable by path resolution and should be dropped immediately. + if d.vfsd.IsDead() { if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- @@ -937,28 +952,26 @@ func (d *dentry) checkCachingLocked() { d.fs.cachedDentries.Remove(victim) d.fs.cachedDentriesLen-- victim.cached = false - // victim.refs may have become non-zero from an earlier path - // resolution since it was inserted into fs.cachedDentries; see - // dentry.incRefLocked(). Either way, we brought - // fs.cachedDentriesLen back down to fs.opts.maxCachedDentries, so - // we don't loop. + // victim.refs may have become non-zero from an earlier path resolution + // since it was inserted into fs.cachedDentries. if atomic.LoadInt64(&victim.refs) == 0 { - if victimParentVFSD := victim.vfsd.Parent(); victimParentVFSD != nil { - victimParent := victimParentVFSD.Impl().(*dentry) - victimParent.dirMu.Lock() - if !victim.vfsd.IsDisowned() { - // victim can't be a mount point (in any mount - // namespace), since VFS holds references on mount - // points. - d.fs.vfsfs.VirtualFilesystem().ForceDeleteDentry(&victim.vfsd) + if victim.parent != nil { + victim.parent.dirMu.Lock() + if !victim.vfsd.IsDead() { + // Note that victim can't be a mount point (in any mount + // namespace), since VFS holds references on mount points. + d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(&victim.vfsd) + delete(victim.parent.children, victim.name) // We're only deleting the dentry, not the file it // represents, so we don't need to update // victimParent.dirents etc. } - victimParent.dirMu.Unlock() + victim.parent.dirMu.Unlock() } victim.destroyLocked() } + // Whether or not victim was destroyed, we brought fs.cachedDentriesLen + // back down to fs.opts.maxCachedDentries, so we don't loop. } } @@ -1005,12 +1018,11 @@ func (d *dentry) destroyLocked() { d.fs.syncMu.Lock() delete(d.fs.dentries, d) d.fs.syncMu.Unlock() - // Drop the reference held by d on its parent. - if parentVFSD := d.vfsd.Parent(); parentVFSD != nil { - parent := parentVFSD.Impl().(*dentry) - // This is parent.DecRef() without recursive locking of d.fs.renameMu. - if refs := atomic.AddInt64(&parent.refs, -1); refs == 0 { - parent.checkCachingLocked() + // Drop the reference held by d on its parent without recursively locking + // d.fs.renameMu. + if d.parent != nil { + if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { + d.parent.checkCachingLocked() } else if refs < 0 { panic("gofer.dentry.DecRef() called without holding a reference") } diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go index 82bc239db..4041fb252 100644 --- a/pkg/sentry/fsimpl/gofer/gofer_test.go +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -48,8 +48,7 @@ func TestDestroyIdempotent(t *testing.T) { if err != nil { t.Fatalf("fs.newDentry(): %v", err) } - parent.IncRef() // reference held by child on its parent. - parent.vfsd.InsertChild(&child.vfsd, "child") + parent.cacheNewChildLocked(child, "child") child.checkCachingLocked() if got := atomic.LoadInt64(&child.refs); got != -1 { diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index b3d6299d0..ef34cb28a 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -3,6 +3,17 @@ load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "kernfs", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "Dentry", + }, +) + go_template_instance( name = "slot_list", out = "slot_list.go", @@ -21,6 +32,7 @@ go_library( "dynamic_bytes_file.go", "fd_impl_util.go", "filesystem.go", + "fstree.go", "inode_impl_util.go", "kernfs.go", "slot_list.go", diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index bfa786c88..e8a4670b8 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -129,7 +129,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // Handle "..". if fd.off == 1 { - parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode + parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode stat, err := parentInode.Stat(vfsFS, opts) if err != nil { return err diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index baf81b4db..01c23d192 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -56,25 +56,28 @@ afterSymlink: return vfsd, nil } if name == ".." { - nextVFSD, err := rp.ResolveParent(vfsd) - if err != nil { + if isRoot, err := rp.CheckRoot(vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return vfsd, nil + } + if err := rp.CheckMount(&d.parent.vfsd); err != nil { return nil, err } rp.Advance() - return nextVFSD, nil + return &d.parent.vfsd, nil } if len(name) > linux.NAME_MAX { return nil, syserror.ENAMETOOLONG } d.dirMu.Lock() - nextVFSD, err := rp.ResolveChild(vfsd, name) + next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, d.children[name]) + d.dirMu.Unlock() if err != nil { - d.dirMu.Unlock() return nil, err } - next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, nextVFSD) - d.dirMu.Unlock() - if err != nil { + if err := rp.CheckMount(&next.vfsd); err != nil { return nil, err } // Resolve any symlink at current path component. @@ -108,17 +111,17 @@ afterSymlink: // parent.dirMu must be locked. parent.isDir(). name is not "." or "..". // // Postconditions: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, childVFSD *vfs.Dentry) (*Dentry, error) { - if childVFSD != nil { +func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) { + if child != nil { // Cached dentry exists, revalidate. - child := childVFSD.Impl().(*Dentry) if !child.inode.Valid(ctx) { - vfsObj.ForceDeleteDentry(childVFSD) - fs.deferDecRef(childVFSD) // Reference from Lookup. - childVFSD = nil + delete(parent.children, name) + vfsObj.InvalidateDentry(&child.vfsd) + fs.deferDecRef(&child.vfsd) // Reference from Lookup. + child = nil } } - if childVFSD == nil { + if child == nil { // Dentry isn't cached; it either doesn't exist or failed // revalidation. Attempt to resolve it via Lookup. // @@ -126,15 +129,15 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes // that all dentries in the filesystem are (kernfs.)Dentry and performs // vfs.DentryImpl casts accordingly. - var err error - childVFSD, err = parent.inode.Lookup(ctx, name) + childVFSD, err := parent.inode.Lookup(ctx, name) if err != nil { return nil, err } // Reference on childVFSD dropped by a corresponding Valid. - parent.insertChildLocked(name, childVFSD) + child = childVFSD.Impl().(*Dentry) + parent.insertChildLocked(name, child) } - return childVFSD.Impl().(*Dentry), nil + return child, nil } // walkExistingLocked resolves rp to an existing file. @@ -203,14 +206,11 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v if len(pc) > linux.NAME_MAX { return "", syserror.ENAMETOOLONG } - childVFSD, err := rp.ResolveChild(parentVFSD, pc) - if err != nil { - return "", err - } - if childVFSD != nil { + // FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu. + if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok { return "", syserror.EEXIST } - if parentVFSD.IsDisowned() { + if parentVFSD.IsDead() { return "", syserror.ENOENT } return pc, nil @@ -220,14 +220,14 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v // // Preconditions: Filesystem.mu must be locked for at least reading. func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { - parentVFSD := vfsd.Parent() - if parentVFSD == nil { + parent := vfsd.Impl().(*Dentry).parent + if parent == nil { return syserror.EBUSY } - if parentVFSD.IsDisowned() { + if parent.vfsd.IsDead() { return syserror.ENOENT } - if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } return nil @@ -321,11 +321,11 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EPERM } - child, err := parentInode.NewLink(ctx, pc, d.inode) + childVFSD, err := parentInode.NewLink(ctx, pc, d.inode) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) return nil } @@ -349,11 +349,11 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - child, err := parentInode.NewDir(ctx, pc, opts) + childVFSD, err := parentInode.NewDir(ctx, pc, opts) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) return nil } @@ -377,11 +377,11 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - new, err := parentInode.NewNode(ctx, pc, opts) + newVFSD, err := parentInode.NewNode(ctx, pc, opts) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, new) + parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry)) return nil } @@ -449,11 +449,8 @@ afterTrailingSymlink: return nil, syserror.ENAMETOOLONG } // Determine whether or not we need to create a file. - childVFSD, err := rp.ResolveChild(parentVFSD, pc) - if err != nil { - return nil, err - } - if childVFSD == nil { + childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD) + if err == syserror.ENOENT { // Already checked for searchability above; now check for writability. if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err @@ -463,21 +460,24 @@ afterTrailingSymlink: } defer rp.Mount().EndWrite() // Create and open the child. - child, err := parentInode.NewFile(ctx, pc, opts) + childVFSD, err = parentInode.NewFile(ctx, pc, opts) if err != nil { return nil, err } + child := childVFSD.Impl().(*Dentry) parentVFSD.Impl().(*Dentry).InsertChild(pc, child) - return child.Impl().(*Dentry).inode.Open(rp, child, opts) + return child.inode.Open(rp, childVFSD, opts) + } + if err != nil { + return nil, err } // Open existing file or follow symlink. if mustCreate { return nil, syserror.EEXIST } - childDentry := childVFSD.Impl().(*Dentry) - childInode := childDentry.inode - if rp.ShouldFollowSymlink() && childDentry.isSymlink() { - targetVD, targetPathname, err := childInode.Getlink(ctx) + child := childVFSD.Impl().(*Dentry) + if rp.ShouldFollowSymlink() && child.isSymlink() { + targetVD, targetPathname, err := child.inode.Getlink(ctx) if err != nil { return nil, err } @@ -496,10 +496,10 @@ afterTrailingSymlink: // symlink target. goto afterTrailingSymlink } - if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return childInode.Open(rp, childVFSD, opts) + return child.inode.Open(rp, &child.vfsd, opts) } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. @@ -526,15 +526,16 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 fs.mu.Lock() - defer fs.mu.Lock() + defer fs.processDeferredDecRefsLocked() + defer fs.mu.Unlock() // Resolve the destination directory first to verify that it's on this // Mount. dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() if err != nil { return err } + dstDir := dstDirVFSD.Impl().(*Dentry) mnt := rp.Mount() if mnt != oldParentVD.Mount() { return syserror.EXDEV @@ -547,9 +548,8 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa srcDirVFSD := oldParentVD.Dentry() srcDir := srcDirVFSD.Impl().(*Dentry) srcDir.dirMu.Lock() - src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDirVFSD.Child(oldName)) + src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDir.children[oldName]) srcDir.dirMu.Unlock() - fs.processDeferredDecRefsLocked() if err != nil { return err } @@ -561,7 +561,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } // Can we create the dst dentry? - var dstVFSD *vfs.Dentry + var dst *Dentry pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode) switch err { case nil: @@ -571,38 +571,51 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Won't overwrite existing node since RENAME_NOREPLACE was requested. return syserror.EEXIST } - dstVFSD, err = rp.ResolveChild(dstDirVFSD, pc) - if err != nil { + dst = dstDir.children[pc] + if dst == nil { panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD)) } default: return err } + var dstVFSD *vfs.Dentry + if dst != nil { + dstVFSD = &dst.vfsd + } mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef() virtfs := rp.VirtualFilesystem() - srcDirDentry := srcDirVFSD.Impl().(*Dentry) - dstDirDentry := dstDirVFSD.Impl().(*Dentry) - // We can't deadlock here due to lock ordering because we're protected from // concurrent renames by fs.mu held for writing. - srcDirDentry.dirMu.Lock() - defer srcDirDentry.dirMu.Unlock() - dstDirDentry.dirMu.Lock() - defer dstDirDentry.dirMu.Unlock() + srcDir.dirMu.Lock() + defer srcDir.dirMu.Unlock() + if srcDir != dstDir { + dstDir.dirMu.Lock() + defer dstDir.dirMu.Unlock() + } if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { return err } - srcDirInode := srcDirDentry.inode - replaced, err := srcDirInode.Rename(ctx, srcVFSD.Name(), pc, srcVFSD, dstDirVFSD) + replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD) if err != nil { virtfs.AbortRenameDentry(srcVFSD, dstVFSD) return err } - virtfs.CommitRenameReplaceDentry(srcVFSD, dstDirVFSD, pc, replaced) + delete(srcDir.children, src.name) + if srcDir != dstDir { + fs.deferDecRef(srcDirVFSD) + dstDir.IncRef() + } + src.parent = dstDir + src.name = pc + if dstDir.children == nil { + dstDir.children = make(map[string]*Dentry) + } + dstDir.children[pc] = src + virtfs.CommitRenameReplaceDentry(srcVFSD, replaced) return nil } @@ -622,14 +635,15 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { return err } - if !vfsd.Impl().(*Dentry).isDir() { + d := vfsd.Impl().(*Dentry) + if !d.isDir() { return syserror.ENOTDIR } if inode.HasChildren() { return syserror.ENOTEMPTY } virtfs := rp.VirtualFilesystem() - parentDentry := vfsd.Parent().Impl().(*Dentry) + parentDentry := d.parent parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() @@ -706,11 +720,11 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return err } defer rp.Mount().EndWrite() - child, err := parentInode.NewSymlink(ctx, pc, target) + childVFSD, err := parentInode.NewSymlink(ctx, pc, target) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) return nil } @@ -730,11 +744,12 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { return err } - if vfsd.Impl().(*Dentry).isDir() { + d := vfsd.Impl().(*Dentry) + if d.isDir() { return syserror.EISDIR } virtfs := rp.VirtualFilesystem() - parentDentry := vfsd.Parent().Impl().(*Dentry) + parentDentry := d.parent parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) @@ -818,5 +833,5 @@ func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b) } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 65f09af5d..9f526359e 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -370,7 +370,7 @@ func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint3 if err := o.Insert(name, child.VFSDentry()); err != nil { panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d)) } - d.InsertChild(name, child.VFSDentry()) + d.InsertChild(name, child) } return links } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index ad76b9f64..f5041824f 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -168,17 +168,22 @@ const ( // // Must be initialized by Init prior to first use. type Dentry struct { - refs.AtomicRefCount + vfsd vfs.Dentry - vfsd vfs.Dentry - inode Inode + refs.AtomicRefCount // flags caches useful information about the dentry from the inode. See the // dflags* consts above. Must be accessed by atomic ops. flags uint32 - // dirMu protects vfsd.children for directory dentries. - dirMu sync.Mutex + parent *Dentry + name string + + // dirMu protects children and the names of child Dentries. + dirMu sync.Mutex + children map[string]*Dentry + + inode Inode } // Init initializes this dentry. @@ -222,8 +227,8 @@ func (d *Dentry) DecRef() { func (d *Dentry) destroy() { d.inode.DecRef() // IncRef from Init. d.inode = nil - if parent := d.vfsd.Parent(); parent != nil { - parent.DecRef() // IncRef from Dentry.InsertChild. + if d.parent != nil { + d.parent.DecRef() // IncRef from Dentry.InsertChild. } } @@ -233,7 +238,7 @@ func (d *Dentry) destroy() { // updates the link count on d if required. // // Precondition: d must represent a directory inode. -func (d *Dentry) InsertChild(name string, child *vfs.Dentry) { +func (d *Dentry) InsertChild(name string, child *Dentry) { d.dirMu.Lock() d.insertChildLocked(name, child) d.dirMu.Unlock() @@ -243,13 +248,17 @@ func (d *Dentry) InsertChild(name string, child *vfs.Dentry) { // preconditions. // // Precondition: d.dirMu must be locked. -func (d *Dentry) insertChildLocked(name string, child *vfs.Dentry) { +func (d *Dentry) insertChildLocked(name string, child *Dentry) { if !d.isDir() { panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d)) } - vfsDentry := d.VFSDentry() - vfsDentry.IncRef() // DecRef in child's Dentry.destroy. - vfsDentry.InsertChild(child, name) + d.IncRef() // DecRef in child's Dentry.destroy. + child.parent = d + child.name = name + if d.children == nil { + d.children = make(map[string]*Dentry) + } + d.children[name] = child } // The Inode interface maps filesystem-level operations that operate on paths to diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index d0f97c137..19abb5034 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -415,36 +415,36 @@ func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.F if d.Name == "." || d.Name == ".." { continue } - childPath := path.Join(fd.MappedName(ctx), d.Name) + absPath := path.Join(fd.MappedName(ctx), d.Name) if d.Type == linux.DT_LNK { link, err := s.VFS.ReadlinkAt( ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(childPath)}, + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(absPath)}, ) if err != nil { - t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", childPath, err) + t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", absPath, err) } else { - t.Logf("Skipping symlink: /proc%s => %s", childPath, link) + t.Logf("Skipping symlink: %s => %s", absPath, link) } continue } - t.Logf("Opening: /proc%s", childPath) + t.Logf("Opening: %s", absPath) child, err := s.VFS.OpenAt( ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(childPath)}, + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(absPath)}, &vfs.OpenOptions{}, ) if err != nil { - t.Errorf("vfsfs.OpenAt(%v) failed: %v", childPath, err) + t.Errorf("vfsfs.OpenAt(%v) failed: %v", absPath, err) continue } defer child.DecRef() stat, err := child.Stat(ctx, vfs.StatOptions{}) if err != nil { - t.Errorf("Stat(%v) failed: %v", childPath, err) + t.Errorf("Stat(%v) failed: %v", absPath, err) } if got := linux.FileMode(stat.Mode).DirentType(); got != d.Type { t.Errorf("wrong file mode, stat: %v, dirent: %v", got, d.Type) diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 4e6cd3491..a2d9649e7 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -15,6 +15,17 @@ go_template_instance( }, ) +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "tmpfs", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + go_library( name = "tmpfs", srcs = [ @@ -22,6 +33,7 @@ go_library( "device_file.go", "directory.go", "filesystem.go", + "fstree.go", "named_pipe.go", "regular_file.go", "socket_file.go", diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 651912169..2fb5c4d84 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -438,13 +438,6 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { filePathBuilder.WriteByte('/') } - // Verify that we didn't create any directories under the mount - // point (i.e. they were all created on the submount). - firstDirName := fmt.Sprintf("%d", depth) - if child := mountPoint.Dentry().Child(firstDirName); child != nil { - b.Fatalf("created directory %q under root mount, not submount", firstDirName) - } - // Create the file that will be stat'd. fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ Root: root, diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 45712c9b9..f2399981b 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -15,35 +15,77 @@ package tmpfs import ( + "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) type directory struct { - inode inode + // Since directories can't be hard-linked, each directory can only be + // associated with a single dentry, which we can store in the directory + // struct. + dentry dentry + inode inode + + // childMap maps the names of the directory's children to their dentries. + // childMap is protected by filesystem.mu. + childMap map[string]*dentry - // childList is a list containing (1) child Dentries and (2) fake Dentries + // numChildren is len(childMap), but accessed using atomic memory + // operations to avoid locking in inode.statTo(). + numChildren int64 + + // childList is a list containing (1) child dentries and (2) fake dentries // (with inode == nil) that represent the iteration position of // directoryFDs. childList is used to support directoryFD.IterDirents() - // efficiently. childList is protected by filesystem.mu. + // efficiently. childList is protected by iterMu. + iterMu sync.Mutex childList dentryList } -func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode { +func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *directory { dir := &directory{} dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode) dir.inode.nlink = 2 // from "." and parent directory or ".." for root - return &dir.inode + dir.dentry.inode = &dir.inode + dir.dentry.vfsd.Init(&dir.dentry) + return dir +} + +// Preconditions: filesystem.mu must be locked for writing. dir must not +// already contain a child with the given name. +func (dir *directory) insertChildLocked(child *dentry, name string) { + child.parent = &dir.dentry + child.name = name + if dir.childMap == nil { + dir.childMap = make(map[string]*dentry) + } + dir.childMap[name] = child + atomic.AddInt64(&dir.numChildren, 1) + dir.iterMu.Lock() + dir.childList.PushBack(child) + dir.iterMu.Unlock() +} + +// Preconditions: filesystem.mu must be locked for writing. +func (dir *directory) removeChildLocked(child *dentry) { + delete(dir.childMap, child.name) + atomic.AddInt64(&dir.numChildren, -1) + dir.iterMu.Lock() + dir.childList.Remove(child) + dir.iterMu.Unlock() } type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl - // Protected by filesystem.mu. + // Protected by directory.iterMu. iter *dentry off int64 } @@ -51,11 +93,10 @@ type directoryFD struct { // Release implements vfs.FileDescriptionImpl.Release. func (fd *directoryFD) Release() { if fd.iter != nil { - fs := fd.filesystem() dir := fd.inode().impl.(*directory) - fs.mu.Lock() + dir.iterMu.Lock() dir.childList.Remove(fd.iter) - fs.mu.Unlock() + dir.iterMu.Unlock() fd.iter = nil } } @@ -63,10 +104,13 @@ func (fd *directoryFD) Release() { // IterDirents implements vfs.FileDescriptionImpl.IterDirents. func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { fs := fd.filesystem() - vfsd := fd.vfsfd.VirtualDentry().Dentry() + dir := fd.inode().impl.(*directory) - fs.mu.Lock() - defer fs.mu.Unlock() + // fs.mu is required to read d.parent and dentry.name. + fs.mu.RLock() + defer fs.mu.RUnlock() + dir.iterMu.Lock() + defer dir.iterMu.Unlock() fd.inode().touchAtime(fd.vfsfd.Mount()) @@ -74,15 +118,16 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba if err := cb.Handle(vfs.Dirent{ Name: ".", Type: linux.DT_DIR, - Ino: vfsd.Impl().(*dentry).inode.ino, + Ino: dir.inode.ino, NextOff: 1, }); err != nil { return err } fd.off++ } + if fd.off == 1 { - parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode + parentInode := genericParentOrSelf(&dir.dentry).inode if err := cb.Handle(vfs.Dirent{ Name: "..", Type: parentInode.direntType(), @@ -94,7 +139,6 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.off++ } - dir := vfsd.Impl().(*dentry).inode.impl.(*directory) var child *dentry if fd.iter == nil { // Start iteration at the beginning of dir. @@ -109,7 +153,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba // Skip other directoryFD iterators. if child.inode != nil { if err := cb.Handle(vfs.Dirent{ - Name: child.vfsd.Name(), + Name: child.name, Type: child.inode.direntType(), Ino: child.inode.ino, NextOff: fd.off + 1, @@ -127,9 +171,9 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - fs := fd.filesystem() - fs.mu.Lock() - defer fs.mu.Unlock() + dir := fd.inode().impl.(*directory) + dir.iterMu.Lock() + defer dir.iterMu.Unlock() switch whence { case linux.SEEK_SET: @@ -157,8 +201,6 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in remChildren = offset - 2 } - dir := fd.inode().impl.(*directory) - // Ensure that fd.iter exists and is not linked into dir.childList. if fd.iter == nil { fd.iter = &dentry{} diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 452c4e2e0..5b62f9ebb 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -39,27 +39,43 @@ func (fs *filesystem) Sync(ctx context.Context) error { // // Preconditions: filesystem.mu must be locked. !rp.Done(). func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { - if !d.inode.isDir() { + dir, ok := d.inode.impl.(*directory) + if !ok { return nil, syserror.ENOTDIR } if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } afterSymlink: - if len(rp.Component()) > linux.NAME_MAX { - return nil, syserror.ENAMETOOLONG + name := rp.Component() + if name == "." { + rp.Advance() + return d, nil } - nextVFSD, err := rp.ResolveComponent(&d.vfsd) - if err != nil { - return nil, err + if name == ".." { + if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return d, nil + } + if err := rp.CheckMount(&d.parent.vfsd); err != nil { + return nil, err + } + rp.Advance() + return d.parent, nil } - if nextVFSD == nil { - // Since the Dentry tree is the sole source of truth for tmpfs, if it's - // not in the Dentry tree, it doesn't exist. + if len(name) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } + child, ok := dir.childMap[name] + if !ok { return nil, syserror.ENOENT } - next := nextVFSD.Impl().(*dentry) - if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, err + } + if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { // TODO(gvisor.dev/issue/1197): Symlink traversals updates // access time. if err := rp.HandleSymlink(symlink.target); err != nil { @@ -68,7 +84,7 @@ afterSymlink: goto afterSymlink // don't check the current directory again } rp.Advance() - return next, nil + return child, nil } // walkParentDirLocked resolves all but the last path component of rp to an @@ -80,7 +96,7 @@ afterSymlink: // fs/namei.c:path_parentat(). // // Preconditions: filesystem.mu must be locked. !rp.Done(). -func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { +func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) { for !rp.Final() { next, err := stepLocked(rp, d) if err != nil { @@ -88,10 +104,11 @@ func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { } d = next } - if !d.inode.isDir() { + dir, ok := d.inode.impl.(*directory) + if !ok { return nil, syserror.ENOTDIR } - return d, nil + return dir, nil } // resolveLocked resolves rp to an existing file. @@ -122,14 +139,14 @@ func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { // // Preconditions: !rp.Done(). For the final path component in rp, // !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { +func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { fs.mu.Lock() defer fs.mu.Unlock() - parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() @@ -139,19 +156,15 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if len(name) > linux.NAME_MAX { return syserror.ENAMETOOLONG } - // Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(), - // because if the child exists we want to return EEXIST immediately instead - // of attempting symlink/mount traversal. - if parent.vfsd.Child(name) != nil { + if _, ok := parentDir.childMap[name]; ok { return syserror.EEXIST } if !dir && rp.MustBeDir() { return syserror.ENOENT } - // In tmpfs, the only way to cause a dentry to be disowned is by removing - // it from the filesystem, so this check is equivalent to checking if - // parent has been removed. - if parent.vfsd.IsDisowned() { + // tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only + // be dead if it was deleted. + if parentDir.dentry.vfsd.IsDead() { return syserror.ENOENT } mnt := rp.Mount() @@ -159,10 +172,10 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa return err } defer mnt.EndWrite() - if err := create(parent, name); err != nil { + if err := create(parentDir, name); err != nil { return err } - parent.inode.touchCMtime() + parentDir.inode.touchCMtime() return nil } @@ -201,17 +214,17 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + dir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return nil, err } - d.IncRef() - return &d.vfsd, nil + dir.dentry.IncRef() + return &dir.dentry.vfsd, nil } // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { if rp.Mount() != vd.Mount() { return syserror.EXDEV } @@ -226,30 +239,27 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EMLINK } d.inode.incLinksLocked() - child := fs.newDentry(d.inode) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) + parentDir.insertChildLocked(fs.newDentry(d.inode), name) return nil }) } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error { - if parent.inode.nlink == maxLinks { + return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error { + if parentDir.inode.nlink == maxLinks { return syserror.EMLINK } - parent.inode.incLinksLocked() // from child's ".." - child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) + parentDir.inode.incLinksLocked() // from child's ".." + childDir := fs.newDirectory(rp.Credentials(), opts.Mode) + parentDir.insertChildLocked(&childDir.dentry, name) return nil }) } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { var childInode *inode switch opts.Mode.FileType() { case 0, linux.S_IFREG: @@ -266,8 +276,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EINVAL } child := fs.newDentry(childInode) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) + parentDir.insertChildLocked(child, name) return nil }) } @@ -306,12 +315,12 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf return start.open(ctx, rp, &opts, false /* afterCreate */) } afterTrailingSymlink: - parent, err := walkParentDirLocked(rp, start) + parentDir, err := walkParentDirLocked(rp, start) if err != nil { return nil, err } // Check for search permission in the parent directory. - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. @@ -322,11 +331,14 @@ afterTrailingSymlink: if name == "." || name == ".." { return nil, syserror.EISDIR } + if len(name) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } // Determine whether or not we need to create a file. - child, err := stepLocked(rp, parent) - if err == syserror.ENOENT { + child, ok := parentDir.childMap[name] + if !ok { // Already checked for searchability above; now check for writability. - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -335,21 +347,26 @@ afterTrailingSymlink: defer rp.Mount().EndWrite() // Create and open the child. child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) + parentDir.insertChildLocked(child, name) fd, err := child.open(ctx, rp, &opts, true) if err != nil { return nil, err } - parent.inode.touchCMtime() + parentDir.inode.touchCMtime() return fd, nil } - if err != nil { + // Is the file mounted over? + if err := rp.CheckMount(&child.vfsd); err != nil { return nil, err } // Do we need to resolve a trailing symlink? - if !rp.Done() { - start = parent + if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + // TODO(gvisor.dev/issue/1197): Symlink traversals updates + // access time. + if err := rp.HandleSymlink(symlink.target); err != nil { + return nil, err + } + start = &parentDir.dentry goto afterTrailingSymlink } // Open existing file. @@ -428,7 +445,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Resolve newParent first to verify that it's on this Mount. fs.mu.Lock() defer fs.mu.Unlock() - newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + newParentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -445,23 +462,22 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } defer mnt.EndWrite() - oldParent := oldParentVD.Dentry().Impl().(*dentry) - if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory) + if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } - // Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(), - // because if the existing child is a symlink or mount point then we want - // to rename over it rather than follow it. - renamedVFSD := oldParent.vfsd.Child(oldName) - if renamedVFSD == nil { + renamed, ok := oldParentDir.childMap[oldName] + if !ok { return syserror.ENOENT } - renamed := renamedVFSD.Impl().(*dentry) + // Note that we don't need to call rp.CheckMount(), since if renamed is a + // mount point then we want to rename the mount point, not anything in the + // mounted filesystem. if renamed.inode.isDir() { - if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) { + if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) { return syserror.EINVAL } - if oldParent != newParent { + if oldParentDir != newParentDir { // Writability is needed to change renamed's "..". if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return err @@ -473,18 +489,17 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } - if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } - replacedVFSD := newParent.vfsd.Child(newName) - var replaced *dentry - if replacedVFSD != nil { - replaced = replacedVFSD.Impl().(*dentry) - if replaced.inode.isDir() { + replaced, ok := newParentDir.childMap[newName] + if ok { + replacedDir, ok := replaced.inode.impl.(*directory) + if ok { if !renamed.inode.isDir() { return syserror.EISDIR } - if replaced.vfsd.HasChildren() { + if len(replacedDir.childMap) != 0 { return syserror.ENOTEMPTY } } else { @@ -496,11 +511,13 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } } else { - if renamed.inode.isDir() && newParent.inode.nlink == maxLinks { + if renamed.inode.isDir() && newParentDir.inode.nlink == maxLinks { return syserror.EMLINK } } - if newParent.vfsd.IsDisowned() { + // tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can + // only be dead if it was deleted. + if newParentDir.dentry.vfsd.IsDead() { return syserror.ENOENT } @@ -508,36 +525,38 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // simplicity, under the assumption that applications are not intentionally // doing noop renames expecting them to succeed where non-noop renames // would fail. - if renamedVFSD == replacedVFSD { + if renamed == replaced { return nil } vfsObj := rp.VirtualFilesystem() - oldParentDir := oldParent.inode.impl.(*directory) - newParentDir := newParent.inode.impl.(*directory) mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef() - if err := vfsObj.PrepareRenameDentry(mntns, renamedVFSD, replacedVFSD); err != nil { + var replacedVFSD *vfs.Dentry + if replaced != nil { + replacedVFSD = &replaced.vfsd + } + if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } if replaced != nil { - newParentDir.childList.Remove(replaced) + newParentDir.removeChildLocked(replaced) if replaced.inode.isDir() { - newParent.inode.decLinksLocked() // from replaced's ".." + newParentDir.inode.decLinksLocked() // from replaced's ".." } replaced.inode.decLinksLocked() } - oldParentDir.childList.Remove(renamed) - newParentDir.childList.PushBack(renamed) - if renamed.inode.isDir() { - oldParent.inode.decLinksLocked() - newParent.inode.incLinksLocked() + oldParentDir.removeChildLocked(renamed) + newParentDir.insertChildLocked(renamed, newName) + vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + oldParentDir.inode.touchCMtime() + if oldParentDir != newParentDir { + if renamed.inode.isDir() { + oldParentDir.inode.decLinksLocked() + newParentDir.inode.incLinksLocked() + } + newParentDir.inode.touchCMtime() } - oldParent.inode.touchCMtime() - newParent.inode.touchCMtime() renamed.inode.touchCtime() - // TODO(gvisor.dev/issue/1197): Update timestamps and parent directory - // sizes. - vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD) return nil } @@ -545,11 +564,11 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() @@ -559,15 +578,15 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if name == ".." { return syserror.ENOTEMPTY } - childVFSD := parent.vfsd.Child(name) - if childVFSD == nil { + child, ok := parentDir.childMap[name] + if !ok { return syserror.ENOENT } - child := childVFSD.Impl().(*dentry) - if !child.inode.isDir() { + childDir, ok := child.inode.impl.(*directory) + if !ok { return syserror.ENOTDIR } - if childVFSD.HasChildren() { + if len(childDir.childMap) != 0 { return syserror.ENOTEMPTY } mnt := rp.Mount() @@ -578,14 +597,14 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef() - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } - parent.inode.impl.(*directory).childList.Remove(child) - parent.inode.decLinksLocked() // from child's ".." + parentDir.removeChildLocked(child) + parentDir.inode.decLinksLocked() // from child's ".." child.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(childVFSD) - parent.inode.touchCMtime() + vfsObj.CommitDeleteDentry(&child.vfsd) + parentDir.inode.touchCMtime() return nil } @@ -627,10 +646,9 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { child := fs.newDentry(fs.newSymlink(rp.Credentials(), target)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) + parentDir.insertChildLocked(child, name) return nil }) } @@ -639,22 +657,21 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." || name == ".." { return syserror.EISDIR } - childVFSD := parent.vfsd.Child(name) - if childVFSD == nil { + child, ok := parentDir.childMap[name] + if !ok { return syserror.ENOENT } - child := childVFSD.Impl().(*dentry) if child.inode.isDir() { return syserror.EISDIR } @@ -669,13 +686,13 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef() - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } - parent.inode.impl.(*directory).childList.Remove(child) + parentDir.removeChildLocked(child) child.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(childVFSD) - parent.inode.touchCMtime() + vfsObj.CommitDeleteDentry(&child.vfsd) + parentDir.inode.touchCMtime() return nil } @@ -743,5 +760,5 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go index d4f59ee5b..60c2c980e 100644 --- a/pkg/sentry/fsimpl/tmpfs/stat_test.go +++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go @@ -71,9 +71,15 @@ func TestStatAfterCreate(t *testing.T) { t.Errorf("got btime %d, want 0", got.Btime.ToNsec()) } - // Size should be 0. - if got.Size != 0 { - t.Errorf("got size %d, want 0", got.Size) + // Size should be 0 (except for directories, which make up a size + // of 20 per entry, including the "." and ".." entries present in + // otherwise-empty directories). + wantSize := uint64(0) + if typ == "dir" { + wantSize = 40 + } + if got.Size != wantSize { + t.Errorf("got size %d, want %d", got.Size, wantSize) } // Nlink should be 1 for files, 2 for dirs. diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 82c709b43..efc931468 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -12,16 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package tmpfs provides a filesystem implementation that behaves like tmpfs: -// the Dentry tree is the sole source of truth for the state of the filesystem. +// Package tmpfs provides an in-memory filesystem whose contents are +// application-mutable, consistent with Linux's tmpfs. // // Lock order: // // filesystem.mu // inode.mu // regularFileFD.offMu +// *** "memmap.Mappable locks" below this point // regularFile.mapsMu +// *** "memmap.Mappable locks taken by Translate" below this point // regularFile.dataMu +// directory.iterMu package tmpfs import ( @@ -41,6 +44,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Name is the default filesystem name. @@ -112,18 +116,18 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs.vfsfs.Init(vfsObj, newFSType, &fs) - var root *inode + var root *dentry switch rootFileType { case linux.S_IFREG: - root = fs.newRegularFile(creds, 0777) + root = fs.newDentry(fs.newRegularFile(creds, 0777)) case linux.S_IFLNK: - root = fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget) + root = fs.newDentry(fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget)) case linux.S_IFDIR: - root = fs.newDirectory(creds, 01777) + root = &fs.newDirectory(creds, 01777).dentry default: return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } - return &fs.vfsfs, &fs.newDentry(root).vfsd, nil + return &fs.vfsfs, &root.vfsd, nil } // Release implements vfs.FilesystemImpl.Release. @@ -134,20 +138,29 @@ func (fs *filesystem) Release() { type dentry struct { vfsd vfs.Dentry + // parent is this dentry's parent directory. Each referenced dentry holds a + // reference on parent.dentry. If this dentry is a filesystem root, parent + // is nil. parent is protected by filesystem.mu. + parent *dentry + + // name is the name of this dentry in its parent. If this dentry is a + // filesystem root, name is the empty string. name is protected by + // filesystem.mu. + name string + + // dentryEntry (ugh) links dentries into their parent directory.childList. + dentryEntry + // inode is the inode represented by this dentry. Multiple Dentries may // share a single non-directory inode (with hard links). inode is // immutable. - inode *inode - + // // tmpfs doesn't count references on dentries; because the dentry tree is // the sole source of truth, it is by definition always consistent with the // state of the filesystem. However, it does count references on inodes, // because inode resources are released when all references are dropped. - // (tmpfs doesn't really have resources to release, but we implement - // reference counting because tmpfs regular files will.) - - // dentryEntry (ugh) links dentries into their parent directory.childList. - dentryEntry + // dentry therefore forwards reference counting directly to inode. + inode *inode } func (fs *filesystem) newDentry(inode *inode) *dentry { @@ -207,10 +220,6 @@ type inode struct { ctime int64 // nanoseconds mtime int64 // nanoseconds - // Only meaningful for device special files. - rdevMajor uint32 - rdevMinor uint32 - // Advisory file locks, which lock at the inode level. locks lock.FileLocks @@ -230,7 +239,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, i.gid = uint32(creds.EffectiveKGID) i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) // Tmpfs creation sets atime, ctime, and mtime to current time. - now := i.clock.Now().Nanoseconds() + now := fs.clock.Now().Nanoseconds() i.atime = now i.ctime = now i.mtime = now @@ -283,14 +292,10 @@ func (i *inode) tryIncRef() bool { func (i *inode) decRef() { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { if regFile, ok := i.impl.(*regularFile); ok { - // Hold inode.mu and regFile.dataMu while mutating - // size. - i.mu.Lock() - regFile.dataMu.Lock() + // Release memory used by regFile to store data. Since regFile is + // no longer usable, we don't need to grab any locks or update any + // metadata. regFile.data.DropAll(regFile.memFile) - atomic.StoreUint64(®File.size, 0) - regFile.dataMu.Unlock() - i.mu.Unlock() } } else if refs < 0 { panic("tmpfs.inode.decRef() called without holding a reference") @@ -310,15 +315,15 @@ func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) e // a concurrent modification), so we do not require holding inode.mu. func (i *inode) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | - linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_ATIME | - linux.STATX_BTIME | linux.STATX_CTIME | linux.STATX_MTIME - stat.Blksize = 1 // usermem.PageSize in tmpfs + linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | + linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME | + linux.STATX_MTIME + stat.Blksize = usermem.PageSize stat.Nlink = atomic.LoadUint32(&i.nlink) stat.UID = atomic.LoadUint32(&i.uid) stat.GID = atomic.LoadUint32(&i.gid) stat.Mode = uint16(atomic.LoadUint32(&i.mode)) stat.Ino = i.ino - // Linux's tmpfs has no concept of btime, so zero-value is returned. stat.Atime = linux.NsecToStatxTimestamp(i.atime) stat.Ctime = linux.NsecToStatxTimestamp(i.ctime) stat.Mtime = linux.NsecToStatxTimestamp(i.mtime) @@ -327,19 +332,22 @@ func (i *inode) statTo(stat *linux.Statx) { case *regularFile: stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS stat.Size = uint64(atomic.LoadUint64(&impl.size)) - // In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in - // a uint64 accessed using atomic memory operations to avoid taking - // locks). + // TODO(jamieliu): This should be impl.data.Span() / 512, but this is + // too expensive to compute here. Cache it in regularFile. stat.Blocks = allocatedBlocksForSize(stat.Size) + case *directory: + // "20" is mm/shmem.c:BOGO_DIRENT_SIZE. + stat.Size = 20 * (2 + uint64(atomic.LoadInt64(&impl.numChildren))) + // stat.Blocks is 0. case *symlink: - stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS stat.Size = uint64(len(impl.target)) - stat.Blocks = allocatedBlocksForSize(stat.Size) + // stat.Blocks is 0. + case *namedPipe, *socketFile: + // stat.Size and stat.Blocks are 0. case *deviceFile: + // stat.Size and stat.Blocks are 0. stat.RdevMajor = impl.major stat.RdevMinor = impl.minor - case *socketFile, *directory, *namedPipe: - // Nothing to do. default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 35b208721..8624dbd5d 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -15,34 +15,17 @@ package vfs import ( - "fmt" "sync/atomic" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) -// Dentry represents a node in a Filesystem tree which may represent a file. +// Dentry represents a node in a Filesystem tree at which a file exists. // // Dentries are reference-counted. Unless otherwise specified, all Dentry // methods require that a reference is held. // -// A Dentry transitions through up to 3 different states through its lifetime: -// -// - Dentries are initially "independent". Independent Dentries have no parent, -// and consequently no name. -// -// - Dentry.InsertChild() causes an independent Dentry to become a "child" of -// another Dentry. A child node has a parent node, and a name in that parent, -// both of which are mutable by DentryMoveChild(). Each child Dentry's name is -// unique within its parent. -// -// - Dentry.RemoveChild() causes a child Dentry to become "disowned". A -// disowned Dentry can still refer to its former parent and its former name in -// said parent, but the disowned Dentry is no longer reachable from its parent, -// and a new Dentry with the same name may become a child of the parent. (This -// is analogous to a struct dentry being "unhashed" in Linux.) -// // Dentry is loosely analogous to Linux's struct dentry, but: // // - VFS does not associate Dentries with inodes. gVisor interacts primarily @@ -57,9 +40,6 @@ import ( // and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do // support inodes may store appropriate state in implementations of DentryImpl. // -// - VFS does not provide synchronization for mutable Dentry fields, other than -// mount-related ones. -// // - VFS does not require that Dentries are instantiated for all paths accessed // through VFS, only those that are tracked beyond the scope of a single // Filesystem operation. This includes file descriptions, mount points, mount @@ -67,6 +47,10 @@ import ( // of Dentries for operations on mutable remote filesystems that can't actually // cache any state in the Dentry. // +// - VFS does not track filesystem structure (i.e. relationships between +// Dentries), since both the relevant state and synchronization are +// filesystem-specific. +// // - For the reasons above, VFS is not directly responsible for managing Dentry // lifetime. Dentry reference counts only indicate the extent to which VFS // requires Dentries to exist; Filesystems may elect to cache or discard @@ -74,36 +58,23 @@ import ( // // +stateify savable type Dentry struct { - // parent is this Dentry's parent in this Filesystem. If this Dentry is - // independent, parent is nil. - parent *Dentry - - // name is this Dentry's name in parent. - name string + // mu synchronizes deletion/invalidation and mounting over this Dentry. + mu sync.Mutex `state:"nosave"` - flags uint32 + // dead is true if the file represented by this Dentry has been deleted (by + // CommitDeleteDentry or CommitRenameReplaceDentry) or invalidated (by + // InvalidateDentry). dead is protected by mu. + dead bool // mounts is the number of Mounts for which this Dentry is Mount.point. // mounts is accessed using atomic memory operations. mounts uint32 - // children are child Dentries. - children map[string]*Dentry - - // mu synchronizes disowning and mounting over this Dentry. - mu sync.Mutex `state:"nosave"` - // impl is the DentryImpl associated with this Dentry. impl is immutable. // This should be the last field in Dentry. impl DentryImpl } -const ( - // dflagsDisownedMask is set in Dentry.flags if the Dentry has been - // disowned. - dflagsDisownedMask = 1 << iota -) - // Init must be called before first use of d. func (d *Dentry) Init(impl DentryImpl) { d.impl = impl @@ -134,20 +105,6 @@ type DentryImpl interface { DecRef() } -// IsDisowned returns true if d is disowned. -func (d *Dentry) IsDisowned() bool { - return atomic.LoadUint32(&d.flags)&dflagsDisownedMask != 0 -} - -// Preconditions: !d.IsDisowned(). -func (d *Dentry) setDisowned() { - atomic.AddUint32(&d.flags, dflagsDisownedMask) -} - -func (d *Dentry) isMounted() bool { - return atomic.LoadUint32(&d.mounts) != 0 -} - // IncRef increments d's reference count. func (d *Dentry) IncRef() { d.impl.IncRef() @@ -164,104 +121,26 @@ func (d *Dentry) DecRef() { d.impl.DecRef() } -// These functions are exported so that filesystem implementations can use -// them. The vfs package, and users of VFS, should not call these functions. -// Unless otherwise specified, these methods require that there are no -// concurrent mutators of d. - -// Name returns d's name in its parent in its owning Filesystem. If d is -// independent, Name returns an empty string. -func (d *Dentry) Name() string { - return d.name -} - -// Parent returns d's parent in its owning Filesystem. It does not take a -// reference on the returned Dentry. If d is independent, Parent returns nil. -func (d *Dentry) Parent() *Dentry { - return d.parent -} - -// ParentOrSelf is equivalent to Parent, but returns d if d is independent. -func (d *Dentry) ParentOrSelf() *Dentry { - if d.parent == nil { - return d - } - return d.parent -} - -// Child returns d's child with the given name in its owning Filesystem. It -// does not take a reference on the returned Dentry. If no such child exists, -// Child returns nil. -func (d *Dentry) Child(name string) *Dentry { - return d.children[name] -} - -// HasChildren returns true if d has any children. -func (d *Dentry) HasChildren() bool { - return len(d.children) != 0 -} - -// Children returns a map containing all of d's children. -func (d *Dentry) Children() map[string]*Dentry { - if !d.HasChildren() { - return nil - } - m := make(map[string]*Dentry) - for name, child := range d.children { - m[name] = child - } - return m +// IsDead returns true if d has been deleted or invalidated by its owning +// filesystem. +func (d *Dentry) IsDead() bool { + d.mu.Lock() + defer d.mu.Unlock() + return d.dead } -// InsertChild makes child a child of d with the given name. -// -// InsertChild is a mutator of d and child. -// -// Preconditions: child must be an independent Dentry. d and child must be from -// the same Filesystem. d must not already have a child with the given name. -func (d *Dentry) InsertChild(child *Dentry, name string) { - if checkInvariants { - if _, ok := d.children[name]; ok { - panic(fmt.Sprintf("parent already contains a child named %q", name)) - } - if child.parent != nil || child.name != "" { - panic(fmt.Sprintf("child is not independent: parent = %v, name = %q", child.parent, child.name)) - } - } - if d.children == nil { - d.children = make(map[string]*Dentry) - } - d.children[name] = child - child.parent = d - child.name = name +func (d *Dentry) isMounted() bool { + return atomic.LoadUint32(&d.mounts) != 0 } -// IsAncestorOf returns true if d is an ancestor of d2; that is, d is either -// d2's parent or an ancestor of d2's parent. -func (d *Dentry) IsAncestorOf(d2 *Dentry) bool { - for d2.parent != nil { - if d2.parent == d { - return true - } - d2 = d2.parent - } - return false -} +// The following functions are exported so that filesystem implementations can +// use them. The vfs package, and users of VFS, should not call these +// functions. // PrepareDeleteDentry must be called before attempting to delete the file // represented by d. If PrepareDeleteDentry succeeds, the caller must call // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. -// -// Preconditions: d is a child Dentry. func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error { - if checkInvariants { - if d.parent == nil { - panic("d is independent") - } - if d.IsDisowned() { - panic("d is already disowned") - } - } vfs.mountMu.Lock() if mntns.mountpoints[d] != 0 { vfs.mountMu.Unlock() @@ -280,42 +159,27 @@ func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { d.mu.Unlock() } -// CommitDeleteDentry must be called after the file represented by d is -// deleted, and causes d to become disowned. -// -// CommitDeleteDentry is a mutator of d and d.Parent(). -// -// Preconditions: PrepareDeleteDentry was previously called on d. +// CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion +// succeeds. func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { - if d.parent != nil { - delete(d.parent.children, d.name) - } - d.setDisowned() + d.dead = true d.mu.Unlock() if d.isMounted() { - vfs.forgetDisownedMountpoint(d) + vfs.forgetDeadMountpoint(d) } } -// ForceDeleteDentry causes d to become disowned. It should only be used in -// cases where VFS has no ability to stop the deletion (e.g. d represents the -// local state of a file on a remote filesystem on which the file has already -// been deleted). -// -// ForceDeleteDentry is a mutator of d and d.Parent(). -// -// Preconditions: d is a child Dentry. -func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) { - if checkInvariants { - if d.parent == nil { - panic("d is independent") - } - if d.IsDisowned() { - panic("d is already disowned") - } - } +// InvalidateDentry is called when d ceases to represent the file it formerly +// did for reasons outside of VFS' control (e.g. d represents the local state +// of a file on a remote filesystem on which the file has already been +// deleted). +func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) { d.mu.Lock() - vfs.CommitDeleteDentry(d) + d.dead = true + d.mu.Unlock() + if d.isMounted() { + vfs.forgetDeadMountpoint(d) + } } // PrepareRenameDentry must be called before attempting to rename the file @@ -324,25 +188,9 @@ func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) { // caller must call AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry depending on the rename's outcome. // -// Preconditions: from is a child Dentry. If to is not nil, it must be a child -// Dentry from the same Filesystem. from != to. +// Preconditions: If to is not nil, it must be a child Dentry from the same +// Filesystem. from != to. func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { - if checkInvariants { - if from.parent == nil { - panic("from is independent") - } - if from.IsDisowned() { - panic("from is already disowned") - } - if to != nil { - if to.parent == nil { - panic("to is independent") - } - if to.IsDisowned() { - panic("to is already disowned") - } - } - } vfs.mountMu.Lock() if mntns.mountpoints[from] != 0 { vfs.mountMu.Unlock() @@ -376,24 +224,14 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { // is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file // that was replaced by from. // -// CommitRenameReplaceDentry is a mutator of from, to, from.Parent(), and -// to.Parent(). -// // Preconditions: PrepareRenameDentry was previously called on from and to. -// newParent.Child(newName) == to. -func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) { - if newParent.children == nil { - newParent.children = make(map[string]*Dentry) - } - newParent.children[newName] = from - from.parent = newParent - from.name = newName +func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, to *Dentry) { from.mu.Unlock() if to != nil { - to.setDisowned() + to.dead = true to.mu.Unlock() if to.isMounted() { - vfs.forgetDisownedMountpoint(to) + vfs.forgetDeadMountpoint(to) } } } @@ -401,25 +239,18 @@ func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, // CommitRenameExchangeDentry must be called after the files represented by // from and to are exchanged by rename(RENAME_EXCHANGE). // -// CommitRenameExchangeDentry is a mutator of from, to, from.Parent(), and -// to.Parent(). -// // Preconditions: PrepareRenameDentry was previously called on from and to. func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { - from.parent, to.parent = to.parent, from.parent - from.name, to.name = to.name, from.name - from.parent.children[from.name] = from - to.parent.children[to.name] = to from.mu.Unlock() to.mu.Unlock() } -// forgetDisownedMountpoint is called when a mount point is deleted to umount -// all mounts using it in all other mount namespaces. +// forgetDeadMountpoint is called when a mount point is deleted or invalidated +// to umount all mounts using it in all other mount namespaces. // -// forgetDisownedMountpoint is analogous to Linux's +// forgetDeadMountpoint is analogous to Linux's // fs/namespace.c:__detach_mounts(). -func (vfs *VirtualFilesystem) forgetDisownedMountpoint(d *Dentry) { +func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) { var ( vdsToDecRef []VirtualDentry mountsToDecRef []*Mount diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 5976b5ccd..15cc091e2 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -127,7 +127,8 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn mount: mnt, dentry: d, } - fd.vd.IncRef() + mnt.IncRef() + d.IncRef() fd.opts = *opts fd.readable = MayReadFileWithOpenFlags(statusFlags) fd.writable = writable diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index a537a29d1..74577bc2f 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -346,7 +346,10 @@ type FilesystemImpl interface { // ENOTEMPTY. // // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). oldName is not "." or "..". + // !rp.ShouldFollowSymlink(). oldParentVD.Dentry() was obtained from a + // previous call to + // oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). oldName is + // not "." or "..". // // Postconditions: If RenameAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go index 7315a588e..465e610e0 100644 --- a/pkg/sentry/vfs/filesystem_impl_util.go +++ b/pkg/sentry/vfs/filesystem_impl_util.go @@ -16,8 +16,6 @@ package vfs import ( "strings" - - "gvisor.dev/gvisor/pkg/fspath" ) // GenericParseMountOptions parses a comma-separated list of options of the @@ -43,27 +41,3 @@ func GenericParseMountOptions(str string) map[string]string { } return m } - -// GenericPrependPath may be used by implementations of -// FilesystemImpl.PrependPath() for which a single statically-determined lock -// or set of locks is sufficient to ensure its preconditions (as opposed to -// e.g. per-Dentry locks). -// -// Preconditions: Dentry.Name() and Dentry.Parent() must be held constant for -// vd.Dentry() and all of its ancestors. -func GenericPrependPath(vfsroot, vd VirtualDentry, b *fspath.Builder) error { - mnt, d := vd.mount, vd.dentry - for { - if mnt == vfsroot.mount && d == vfsroot.dentry { - return PrependPathAtVFSRootError{} - } - if d == mnt.root { - return nil - } - if d.parent == nil { - return PrependPathAtNonMountRootError{} - } - b.PrependComponent(d.name) - d = d.parent - } -} diff --git a/pkg/sentry/vfs/genericfstree/BUILD b/pkg/sentry/vfs/genericfstree/BUILD new file mode 100644 index 000000000..d8fd92677 --- /dev/null +++ b/pkg/sentry/vfs/genericfstree/BUILD @@ -0,0 +1,16 @@ +load("//tools/go_generics:defs.bzl", "go_template") + +package( + default_visibility = ["//:sandbox"], + licenses = ["notice"], +) + +go_template( + name = "generic_fstree", + srcs = [ + "genericfstree.go", + ], + types = [ + "Dentry", + ], +) diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go new file mode 100644 index 000000000..286510195 --- /dev/null +++ b/pkg/sentry/vfs/genericfstree/genericfstree.go @@ -0,0 +1,80 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package genericfstree provides tools for implementing vfs.FilesystemImpls +// where a single statically-determined lock or set of locks is sufficient to +// ensure that a Dentry's name and parent are contextually immutable. +// +// Clients using this package must use the go_template_instance rule in +// tools/go_generics/defs.bzl to create an instantiation of this template +// package, providing types to use in place of Dentry. +package genericfstree + +import ( + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// Dentry is a required type parameter that is a struct with the given fields. +type Dentry struct { + // vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl. + vfsd vfs.Dentry + + // parent is the parent of this Dentry in the filesystem's tree. If this + // Dentry is a filesystem root, parent is nil. + parent *Dentry + + // name is the name of this Dentry in its parent. If this Dentry is a + // filesystem root, name is unspecified. + name string +} + +// IsAncestorDentry returns true if d is an ancestor of d2; that is, d is +// either d2's parent or an ancestor of d2's parent. +func IsAncestorDentry(d, d2 *Dentry) bool { + for { + if d2.parent == d { + return true + } + if d2.parent == d2 { + return false + } + d2 = d2.parent + } +} + +// ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d. +func ParentOrSelf(d *Dentry) *Dentry { + if d.parent != nil { + return d.parent + } + return d +} + +// PrependPath is a generic implementation of FilesystemImpl.PrependPath(). +func PrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath.Builder) error { + for { + if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { + return vfs.PrependPathAtVFSRootError{} + } + if &d.vfsd == mnt.Root() { + return nil + } + if d.parent == nil { + return vfs.PrependPathAtNonMountRootError{} + } + b.PrependComponent(d.name) + d = d.parent + } +} diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index f06946103..02850b65c 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -188,6 +188,7 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia if err != nil { return err } + // We can't hold vfs.mountMu while calling FilesystemImpl methods due to // lock ordering. vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) @@ -199,7 +200,7 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia vfs.mountMu.Lock() vd.dentry.mu.Lock() for { - if vd.dentry.IsDisowned() { + if vd.dentry.dead { vd.dentry.mu.Unlock() vfs.mountMu.Unlock() vd.DecRef() @@ -665,6 +666,12 @@ func (mnt *Mount) submountsLocked() []*Mount { return mounts } +// Root returns the mount's root. It does not take a reference on the returned +// Dentry. +func (mnt *Mount) Root() *Dentry { + return mnt.root +} + // Root returns mntns' root. A reference is taken on the returned // VirtualDentry. func (mntns *MountNamespace) Root() VirtualDentry { diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go index f21a88034..cd78d66bc 100644 --- a/pkg/sentry/vfs/pathname.go +++ b/pkg/sentry/vfs/pathname.go @@ -58,7 +58,7 @@ loop: switch err.(type) { case nil: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { - // GenericPrependPath() will have returned + // genericfstree.PrependPath() will have returned // PrependPathAtVFSRootError in this case since it checks // against vfsroot before mnt.root, but other implementations // of FilesystemImpl.PrependPath() may return nil instead. @@ -84,7 +84,7 @@ loop: } } b.PrependByte('/') - if origD.IsDisowned() { + if origD.IsDead() { b.AppendString(" (deleted)") } return b.String(), nil @@ -136,7 +136,7 @@ loop: // PathnameForGetcwd returns an absolute pathname to vd, consistent with // Linux's sys_getcwd(). func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { - if vd.dentry.IsDisowned() { + if vd.dentry.IsDead() { return "", syserror.ENOENT } diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 8f31495da..9d047ff88 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -29,7 +29,9 @@ import ( // // From the perspective of FilesystemImpl methods, a ResolvingPath represents a // starting Dentry on the associated Filesystem (on which a reference is -// already held) and a stream of path components relative to that Dentry. +// already held), a stream of path components relative to that Dentry, and +// elements of the invoking Context that are commonly required by +// FilesystemImpl methods. // // ResolvingPath is loosely analogous to Linux's struct nameidata. type ResolvingPath struct { @@ -251,18 +253,17 @@ func (rp *ResolvingPath) relpathCommit() { rp.origParts[rp.curPart] = rp.pit } -// ResolveParent returns the VFS parent of d. It does not take a reference on -// the returned Dentry. -// -// Preconditions: There are no concurrent mutators of d. -// -// Postconditions: If the returned error is nil, then the returned Dentry is -// not nil. -func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) { - var parent *Dentry +// CheckRoot is called before resolving the parent of the Dentry d. If the +// Dentry is contextually a VFS root, such that path resolution should treat +// d's parent as itself, CheckRoot returns (true, nil). If the Dentry is the +// root of a non-root mount, such that path resolution should switch to another +// Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path +// resolution should resolve d's parent normally, and CheckRoot returns (false, +// nil). +func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) { if d == rp.root.dentry && rp.mount == rp.root.mount { - // At contextual VFS root. - parent = d + // At contextual VFS root (due to e.g. chroot(2)). + return true, nil } else if d == rp.mount.root { // At mount root ... vd := rp.vfs.getMountpointAt(rp.mount, rp.root) @@ -270,59 +271,27 @@ func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) { // ... of non-root mount. rp.nextMount = vd.mount rp.nextStart = vd.dentry - return nil, resolveMountRootOrJumpError{} + return false, resolveMountRootOrJumpError{} } // ... of root mount. - parent = d - } else if d.parent == nil { - // At filesystem root. - parent = d - } else { - parent = d.parent + return true, nil } - if parent.isMounted() { - if mnt := rp.vfs.getMountAt(rp.mount, parent); mnt != nil { - rp.nextMount = mnt - return nil, resolveMountPointError{} - } - } - return parent, nil + return false, nil } -// ResolveChild returns the VFS child of d with the given name. It does not -// take a reference on the returned Dentry. If no such child exists, -// ResolveChild returns (nil, nil). -// -// Preconditions: There are no concurrent mutators of d. -func (rp *ResolvingPath) ResolveChild(d *Dentry, name string) (*Dentry, error) { - child := d.children[name] - if child == nil { - return nil, nil +// CheckMount is called after resolving the parent or child of another Dentry +// to d. If d is a mount point, such that path resolution should switch to +// another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount +// returns nil. +func (rp *ResolvingPath) CheckMount(d *Dentry) error { + if !d.isMounted() { + return nil } - if child.isMounted() { - if mnt := rp.vfs.getMountAt(rp.mount, child); mnt != nil { - rp.nextMount = mnt - return nil, resolveMountPointError{} - } - } - return child, nil -} - -// ResolveComponent returns the Dentry reached by starting at d and resolving -// the current path component in the stream represented by rp. It does not -// advance the stream. It does not take a reference on the returned Dentry. If -// no such Dentry exists, ResolveComponent returns (nil, nil). -// -// Preconditions: !rp.Done(). There are no concurrent mutators of d. -func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) { - switch pc := rp.Component(); pc { - case ".": - return d, nil - case "..": - return rp.ResolveParent(d) - default: - return rp.ResolveChild(d, pc) + if mnt := rp.vfs.getMountAt(rp.mount, d); mnt != nil { + rp.nextMount = mnt + return resolveMountPointError{} } + return nil } // ShouldFollowSymlink returns true if, supposing that the current path -- cgit v1.2.3