// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ext import ( "errors" "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) var ( // errResolveDirent indicates that the vfs.ResolvingPath.Component() does // not exist on the dentry tree but does exist on disk. So it has to be read in // using the in-memory dirent and added to the dentry tree. Usually indicates // the need to lock filesystem.mu for writing. errResolveDirent = errors.New("resolve path component using dirent") ) // filesystem implements vfs.FilesystemImpl. type filesystem struct { vfsfs vfs.Filesystem // mu serializes changes to the Dentry tree. mu sync.RWMutex // dev represents the underlying fs device. It does not require protection // because io.ReaderAt permits concurrent read calls to it. It translates to // the pread syscall which passes on the read request directly to the device // driver. Device drivers are intelligent in serving multiple concurrent read // requests in the optimal order (taking locality into consideration). dev io.ReaderAt // inodeCache maps absolute inode numbers to the corresponding Inode struct. // Inodes should be removed from this once their reference count hits 0. // // Protected by mu because most additions (see IterDirents) and all removals // from this corresponds to a change in the dentry tree. inodeCache map[uint32]*inode // sb represents the filesystem superblock. Immutable after initialization. sb disklayout.SuperBlock // bgs represents all the block group descriptors for the filesystem. // Immutable after initialization. bgs []disklayout.BlockGroup } // Compiles only if filesystem implements vfs.FilesystemImpl. var _ vfs.FilesystemImpl = (*filesystem)(nil) // stepLocked resolves rp.Component() in parent directory vfsd. The write // parameter passed tells if the caller has acquired filesystem.mu for writing // or not. If set to true, an existing inode on disk can be added to the dentry // tree if not present already. // // stepLocked is loosely analogous to fs/namei.c:walk_component(). // // Preconditions: // - filesystem.mu must be locked (for writing if write param is true). // - !rp.Done(). // - inode == vfsd.Impl().(*Dentry).inode. func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { if !inode.isDir() { return nil, nil, syserror.ENOTDIR } if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, nil, err } for { name := rp.Component() if name == "." { rp.Advance() return vfsd, inode, nil } d := vfsd.Impl().(*dentry) if name == ".." { isRoot, err := rp.CheckRoot(vfsd) if err != nil { return nil, nil, err } if isRoot || d.parent == nil { rp.Advance() return vfsd, inode, nil } if err := rp.CheckMount(&d.parent.vfsd); err != nil { return nil, nil, err } rp.Advance() return &d.parent.vfsd, d.parent.inode, nil } dir := inode.impl.(*directory) child, ok := dir.childCache[name] if !ok { // We may need to instantiate a new dentry for this child. childDirent, ok := dir.childMap[name] if !ok { // The underlying inode does not exist on disk. return nil, nil, syserror.ENOENT } if !write { // filesystem.mu must be held for writing to add to the dentry tree. return nil, nil, errResolveDirent } // Create and add the component's dirent to the dentry tree. fs := rp.Mount().Filesystem().Impl().(*filesystem) childInode, err := fs.getOrCreateInodeLocked(childDirent.diskDirent.Inode()) if err != nil { return nil, nil, err } // incRef because this is being added to the dentry tree. childInode.incRef() child = newDentry(childInode) child.parent = d child.name = name dir.childCache[name] = child } if err := rp.CheckMount(&child.vfsd); err != nil { return nil, nil, err } if child.inode.isSymlink() && rp.ShouldFollowSymlink() { if err := rp.HandleSymlink(child.inode.impl.(*symlink).target); err != nil { return nil, nil, err } continue } rp.Advance() return &child.vfsd, child.inode, nil } } // walkLocked resolves rp to an existing file. The write parameter // passed tells if the caller has acquired filesystem.mu for writing or not. // If set to true, additions can be made to the dentry tree while walking. // If errResolveDirent is returned, the walk needs to be continued with an // upgraded filesystem.mu. // // walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). // // Preconditions: // - filesystem.mu must be locked (for writing if write param is true). func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { vfsd := rp.Start() inode := vfsd.Impl().(*dentry).inode for !rp.Done() { var err error vfsd, inode, err = stepLocked(rp, vfsd, inode, write) if err != nil { return nil, nil, err } } if rp.MustBeDir() && !inode.isDir() { return nil, nil, syserror.ENOTDIR } return vfsd, inode, nil } // walkParentLocked resolves all but the last path component of rp to an // existing directory. It does not check that the returned directory is // searchable by the provider of rp. The write parameter passed tells if the // caller has acquired filesystem.mu for writing or not. If set to true, // additions can be made to the dentry tree while walking. // If errResolveDirent is returned, the walk needs to be continued with an // upgraded filesystem.mu. // // walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat(). // // Preconditions: // - filesystem.mu must be locked (for writing if write param is true). // - !rp.Done(). func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { vfsd := rp.Start() inode := vfsd.Impl().(*dentry).inode for !rp.Final() { var err error vfsd, inode, err = stepLocked(rp, vfsd, inode, write) if err != nil { return nil, nil, err } } if !inode.isDir() { return nil, nil, syserror.ENOTDIR } return vfsd, inode, nil } // walk resolves rp to an existing file. If parent is set to true, it resolves // the rp till the parent of the last component which should be an existing // directory. If parent is false then resolves rp entirely. Attemps to resolve // the path as far as it can with a read lock and upgrades the lock if needed. func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) { var ( vfsd *vfs.Dentry inode *inode err error ) // Try walking with the hopes that all dentries have already been pulled out // of disk. This reduces congestion (allows concurrent walks). fs.mu.RLock() if parent { vfsd, inode, err = walkParentLocked(rp, false) } else { vfsd, inode, err = walkLocked(rp, false) } fs.mu.RUnlock() if err == errResolveDirent { // Upgrade lock and continue walking. Lock upgrading in the middle of the // walk is fine as this is a read only filesystem. fs.mu.Lock() if parent { vfsd, inode, err = walkParentLocked(rp, true) } else { vfsd, inode, err = walkLocked(rp, true) } fs.mu.Unlock() } return vfsd, inode, err } // getOrCreateInodeLocked gets the inode corresponding to the inode number passed in. // It creates a new one with the given inode number if one does not exist. // The caller must increment the ref count if adding this to the dentry tree. // // Precondition: must be holding fs.mu for writing. func (fs *filesystem) getOrCreateInodeLocked(inodeNum uint32) (*inode, error) { if in, ok := fs.inodeCache[inodeNum]; ok { return in, nil } in, err := newInode(fs, inodeNum) if err != nil { return nil, err } fs.inodeCache[inodeNum] = in return in, nil } // statTo writes the statfs fields to the output parameter. func (fs *filesystem) statTo(stat *linux.Statfs) { stat.Type = uint64(fs.sb.Magic()) stat.BlockSize = int64(fs.sb.BlockSize()) stat.Blocks = fs.sb.BlocksCount() stat.BlocksFree = fs.sb.FreeBlocksCount() stat.BlocksAvailable = fs.sb.FreeBlocksCount() stat.Files = uint64(fs.sb.InodesCount()) stat.FilesFree = uint64(fs.sb.FreeInodesCount()) stat.NameLength = disklayout.MaxFileName stat.FragmentSize = int64(fs.sb.BlockSize()) // TODO(b/134676337): Set Statfs.Flags and Statfs.FSID. } // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { _, inode, err := fs.walk(rp, false) if err != nil { return err } return inode.checkPermissions(rp.Credentials(), ats) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { vfsd, inode, err := fs.walk(rp, false) if err != nil { return nil, err } if opts.CheckSearchable { if !inode.isDir() { return nil, syserror.ENOTDIR } if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } inode.incRef() return vfsd, nil } // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { vfsd, inode, err := fs.walk(rp, true) if err != nil { return nil, err } inode.incRef() return vfsd, nil } // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { vfsd, inode, err := fs.walk(rp, false) if err != nil { return nil, err } // EROFS is returned if write access is needed. if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 { return nil, syserror.EROFS } return inode.open(rp, vfsd, &opts) } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { _, inode, err := fs.walk(rp, false) if err != nil { return "", err } symlink, ok := inode.impl.(*symlink) if !ok { return "", syserror.EINVAL } return symlink.target, nil } // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { _, inode, err := fs.walk(rp, false) if err != nil { return linux.Statx{}, err } var stat linux.Statx inode.statTo(&stat) return stat, nil } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { if _, _, err := fs.walk(rp, false); err != nil { return linux.Statfs{}, err } var stat linux.Statfs fs.statTo(&stat) return stat, nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release() {} // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { // This is a readonly filesystem for now. return nil } // The vfs.FilesystemImpl functions below return EROFS because their respective // man pages say that EROFS must be returned if the path resolves to a file on // this read-only filesystem. // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { if rp.Done() { return syserror.EEXIST } if _, _, err := fs.walk(rp, true); err != nil { return err } return syserror.EROFS } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { if rp.Done() { return syserror.EEXIST } if _, _, err := fs.walk(rp, true); err != nil { return err } return syserror.EROFS } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { if rp.Done() { return syserror.EEXIST } _, _, err := fs.walk(rp, true) if err != nil { return err } return syserror.EROFS } // RenameAt implements vfs.FilesystemImpl.RenameAt. func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { if rp.Done() { return syserror.ENOENT } _, _, err := fs.walk(rp, false) if err != nil { return err } return syserror.EROFS } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { _, inode, err := fs.walk(rp, false) if err != nil { return err } if !inode.isDir() { return syserror.ENOTDIR } return syserror.EROFS } // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { _, _, err := fs.walk(rp, false) if err != nil { return err } return syserror.EROFS } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { if rp.Done() { return syserror.EEXIST } _, _, err := fs.walk(rp, true) if err != nil { return err } return syserror.EROFS } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { _, inode, err := fs.walk(rp, false) if err != nil { return err } if inode.isDir() { return syserror.EISDIR } return syserror.EROFS } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { _, _, err := fs.walk(rp, false) if err != nil { return nil, err } // TODO(b/134676337): Support sockets. return nil, syserror.ECONNREFUSED } // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { _, _, err := fs.walk(rp, false) if err != nil { return nil, err } return nil, syserror.ENOTSUP } // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { _, _, err := fs.walk(rp, false) if err != nil { return "", err } return "", syserror.ENOTSUP } // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { _, _, err := fs.walk(rp, false) if err != nil { return err } return syserror.ENOTSUP } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { _, _, err := fs.walk(rp, false) if err != nil { return err } return syserror.ENOTSUP } // PrependPath implements vfs.FilesystemImpl.PrependPath. func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) }