// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package tmpfs provides a filesystem implementation that behaves like tmpfs: // the Dentry tree is the sole source of truth for the state of the filesystem. // // Lock order: // // filesystem.mu // inode.mu // regularFileFD.offMu // regularFile.mapsMu // regularFile.dataMu package tmpfs import ( "fmt" "math" "strings" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) // Name is the default filesystem name. const Name = "tmpfs" // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. type filesystem struct { vfsfs vfs.Filesystem // memFile is used to allocate pages to for regular files. memFile *pgalloc.MemoryFile // clock is a realtime clock used to set timestamps in file operations. clock time.Clock // mu serializes changes to the Dentry tree. mu sync.RWMutex nextInoMinusOne uint64 // accessed using atomic memory operations } // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } // FilesystemOpts is used to pass configuration data to tmpfs. type FilesystemOpts struct { // RootFileType is the FileType of the filesystem root. Valid values // are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR. RootFileType uint16 // RootSymlinkTarget is the target of the root symlink. Only valid if // RootFileType == S_IFLNK. RootSymlinkTarget string // FilesystemType allows setting a different FilesystemType for this // tmpfs filesystem. This allows tmpfs to "impersonate" other // filesystems, like ramdiskfs and cgroupfs. FilesystemType vfs.FilesystemType } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx) if memFileProvider == nil { panic("MemoryFileProviderFromContext returned nil") } clock := time.RealtimeClockFromContext(ctx) fs := filesystem{ memFile: memFileProvider.MemoryFile(), clock: clock, } rootFileType := uint16(linux.S_IFDIR) newFSType := vfs.FilesystemType(&fstype) tmpfsOpts, ok := opts.InternalData.(FilesystemOpts) if ok { if tmpfsOpts.RootFileType != 0 { rootFileType = tmpfsOpts.RootFileType } if tmpfsOpts.FilesystemType != nil { newFSType = tmpfsOpts.FilesystemType } } fs.vfsfs.Init(vfsObj, newFSType, &fs) var root *inode switch rootFileType { case linux.S_IFREG: root = fs.newRegularFile(creds, 0777) case linux.S_IFLNK: root = fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget) case linux.S_IFDIR: root = fs.newDirectory(creds, 01777) default: return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } return &fs.vfsfs, &fs.newDentry(root).vfsd, nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release() { } // dentry implements vfs.DentryImpl. type dentry struct { vfsd vfs.Dentry // inode is the inode represented by this dentry. Multiple Dentries may // share a single non-directory inode (with hard links). inode is // immutable. inode *inode // tmpfs doesn't count references on dentries; because the dentry tree is // the sole source of truth, it is by definition always consistent with the // state of the filesystem. However, it does count references on inodes, // because inode resources are released when all references are dropped. // (tmpfs doesn't really have resources to release, but we implement // reference counting because tmpfs regular files will.) // dentryEntry (ugh) links dentries into their parent directory.childList. dentryEntry } func (fs *filesystem) newDentry(inode *inode) *dentry { d := &dentry{ inode: inode, } d.vfsd.Init(d) return d } // IncRef implements vfs.DentryImpl.IncRef. func (d *dentry) IncRef() { d.inode.incRef() } // TryIncRef implements vfs.DentryImpl.TryIncRef. func (d *dentry) TryIncRef() bool { return d.inode.tryIncRef() } // DecRef implements vfs.DentryImpl.DecRef. func (d *dentry) DecRef() { d.inode.decRef() } // inode represents a filesystem object. type inode struct { // clock is a realtime clock used to set timestamps in file operations. clock time.Clock // refs is a reference count. refs is accessed using atomic memory // operations. // // A reference is held on all inodes that are reachable in the filesystem // tree. For non-directories (which may have multiple hard links), this // means that a reference is dropped when nlink reaches 0. For directories, // nlink never reaches 0 due to the "." entry; instead, // filesystem.RmdirAt() drops the reference. refs int64 // xattrs implements extended attributes. // // TODO(b/148380782): Support xattrs other than user.* xattrs memxattr.SimpleExtendedAttributes // Inode metadata. Writing multiple fields atomically requires holding // mu, othewise atomic operations can be used. mu sync.Mutex mode uint32 // file type and mode nlink uint32 // protected by filesystem.mu instead of inode.mu uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic gid uint32 // auth.KGID, but ... ino uint64 // immutable // Linux's tmpfs has no concept of btime. atime int64 // nanoseconds ctime int64 // nanoseconds mtime int64 // nanoseconds // Only meaningful for device special files. rdevMajor uint32 rdevMinor uint32 // Advisory file locks, which lock at the inode level. locks lock.FileLocks impl interface{} // immutable } const maxLinks = math.MaxUint32 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { if mode.FileType() == 0 { panic("file type is required in FileMode") } i.clock = fs.clock i.refs = 1 i.mode = uint32(mode) i.uid = uint32(creds.EffectiveKUID) i.gid = uint32(creds.EffectiveKGID) i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) // Tmpfs creation sets atime, ctime, and mtime to current time. now := i.clock.Now().Nanoseconds() i.atime = now i.ctime = now i.mtime = now // i.nlink initialized by caller i.impl = impl } // incLinksLocked increments i's link count. // // Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. // i.nlink < maxLinks. func (i *inode) incLinksLocked() { if i.nlink == 0 { panic("tmpfs.inode.incLinksLocked() called with no existing links") } if i.nlink == maxLinks { panic("tmpfs.inode.incLinksLocked() called with maximum link count") } atomic.AddUint32(&i.nlink, 1) } // decLinksLocked decrements i's link count. // // Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. func (i *inode) decLinksLocked() { if i.nlink == 0 { panic("tmpfs.inode.decLinksLocked() called with no existing links") } atomic.AddUint32(&i.nlink, ^uint32(0)) } func (i *inode) incRef() { if atomic.AddInt64(&i.refs, 1) <= 1 { panic("tmpfs.inode.incRef() called without holding a reference") } } func (i *inode) tryIncRef() bool { for { refs := atomic.LoadInt64(&i.refs) if refs == 0 { return false } if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) { return true } } } func (i *inode) decRef() { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { if regFile, ok := i.impl.(*regularFile); ok { // Hold inode.mu and regFile.dataMu while mutating // size. i.mu.Lock() regFile.dataMu.Lock() regFile.data.DropAll(regFile.memFile) atomic.StoreUint64(®File.size, 0) regFile.dataMu.Unlock() i.mu.Unlock() } } else if refs < 0 { panic("tmpfs.inode.decRef() called without holding a reference") } } func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { mode := linux.FileMode(atomic.LoadUint32(&i.mode)) return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) } // Go won't inline this function, and returning linux.Statx (which is quite // big) means spending a lot of time in runtime.duffcopy(), so instead it's an // output parameter. // // Note that Linux does not guarantee to return consistent data (in the case of // a concurrent modification), so we do not require holding inode.mu. func (i *inode) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_ATIME | linux.STATX_BTIME | linux.STATX_CTIME | linux.STATX_MTIME stat.Blksize = 1 // usermem.PageSize in tmpfs stat.Nlink = atomic.LoadUint32(&i.nlink) stat.UID = atomic.LoadUint32(&i.uid) stat.GID = atomic.LoadUint32(&i.gid) stat.Mode = uint16(atomic.LoadUint32(&i.mode)) stat.Ino = i.ino // Linux's tmpfs has no concept of btime, so zero-value is returned. stat.Atime = linux.NsecToStatxTimestamp(i.atime) stat.Ctime = linux.NsecToStatxTimestamp(i.ctime) stat.Mtime = linux.NsecToStatxTimestamp(i.mtime) // TODO(gvisor.dev/issue/1197): Device number. switch impl := i.impl.(type) { case *regularFile: stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS stat.Size = uint64(atomic.LoadUint64(&impl.size)) // In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in // a uint64 accessed using atomic memory operations to avoid taking // locks). stat.Blocks = allocatedBlocksForSize(stat.Size) case *symlink: stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS stat.Size = uint64(len(impl.target)) stat.Blocks = allocatedBlocksForSize(stat.Size) case *deviceFile: stat.RdevMajor = impl.major stat.RdevMinor = impl.minor case *socketFile, *directory, *namedPipe: // Nothing to do. default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } } func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx) error { if stat.Mask == 0 { return nil } if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 { return syserror.EPERM } mode := linux.FileMode(atomic.LoadUint32(&i.mode)) if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { return err } i.mu.Lock() defer i.mu.Unlock() var ( needsMtimeBump bool needsCtimeBump bool ) mask := stat.Mask if mask&linux.STATX_MODE != 0 { ft := atomic.LoadUint32(&i.mode) & linux.S_IFMT atomic.StoreUint32(&i.mode, ft|uint32(stat.Mode&^linux.S_IFMT)) needsCtimeBump = true } if mask&linux.STATX_UID != 0 { atomic.StoreUint32(&i.uid, stat.UID) needsCtimeBump = true } if mask&linux.STATX_GID != 0 { atomic.StoreUint32(&i.gid, stat.GID) needsCtimeBump = true } if mask&linux.STATX_SIZE != 0 { switch impl := i.impl.(type) { case *regularFile: updated, err := impl.truncateLocked(stat.Size) if err != nil { return err } if updated { needsMtimeBump = true needsCtimeBump = true } case *directory: return syserror.EISDIR default: return syserror.EINVAL } } now := i.clock.Now().Nanoseconds() if mask&linux.STATX_ATIME != 0 { if stat.Atime.Nsec == linux.UTIME_NOW { atomic.StoreInt64(&i.atime, now) } else { atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped()) } needsCtimeBump = true } if mask&linux.STATX_MTIME != 0 { if stat.Mtime.Nsec == linux.UTIME_NOW { atomic.StoreInt64(&i.mtime, now) } else { atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) } needsCtimeBump = true // Ignore the mtime bump, since we just set it ourselves. needsMtimeBump = false } if mask&linux.STATX_CTIME != 0 { if stat.Ctime.Nsec == linux.UTIME_NOW { atomic.StoreInt64(&i.ctime, now) } else { atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped()) } // Ignore the ctime bump, since we just set it ourselves. needsCtimeBump = false } if needsMtimeBump { atomic.StoreInt64(&i.mtime, now) } if needsCtimeBump { atomic.StoreInt64(&i.ctime, now) } return nil } // TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { switch i.impl.(type) { case *regularFile: return i.locks.LockBSD(uid, t, block) } return syserror.EBADF } // TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. func (i *inode) unlockBSD(uid fslock.UniqueID) error { switch i.impl.(type) { case *regularFile: i.locks.UnlockBSD(uid) return nil } return syserror.EBADF } // TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { switch i.impl.(type) { case *regularFile: return i.locks.LockPOSIX(uid, t, rng, block) } return syserror.EBADF } // TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error { switch i.impl.(type) { case *regularFile: i.locks.UnlockPOSIX(uid, rng) return nil } return syserror.EBADF } // allocatedBlocksForSize returns the number of 512B blocks needed to // accommodate the given size in bytes, as appropriate for struct // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block // size is independent of the "preferred block size for I/O", struct // stat::st_blksize and struct statx::stx_blksize.) func allocatedBlocksForSize(size uint64) uint64 { return (size + 511) / 512 } func (i *inode) direntType() uint8 { switch impl := i.impl.(type) { case *regularFile: return linux.DT_REG case *directory: return linux.DT_DIR case *symlink: return linux.DT_LNK case *socketFile: return linux.DT_SOCK case *deviceFile: switch impl.kind { case vfs.BlockDevice: return linux.DT_BLK case vfs.CharDevice: return linux.DT_CHR default: panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind)) } default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } } func (i *inode) isDir() bool { return linux.FileMode(i.mode).FileType() == linux.S_IFDIR } func (i *inode) touchAtime(mnt *vfs.Mount) { if err := mnt.CheckBeginWrite(); err != nil { return } now := i.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.atime, now) i.mu.Unlock() mnt.EndWrite() } // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). func (i *inode) touchCtime() { now := i.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.ctime, now) i.mu.Unlock() } // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). func (i *inode) touchCMtime() { now := i.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.mtime, now) atomic.StoreInt64(&i.ctime, now) i.mu.Unlock() } // Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds // inode.mu. func (i *inode) touchCMtimeLocked() { now := i.clock.Now().Nanoseconds() atomic.StoreInt64(&i.mtime, now) atomic.StoreInt64(&i.ctime, now) } func (i *inode) listxattr(size uint64) ([]string, error) { return i.xattrs.Listxattr(size) } func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { if err := i.checkPermissions(creds, vfs.MayRead); err != nil { return "", err } if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { return "", syserror.EOPNOTSUPP } if !i.userXattrSupported() { return "", syserror.ENODATA } return i.xattrs.Getxattr(opts) } func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error { if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { return err } if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { return syserror.EOPNOTSUPP } if !i.userXattrSupported() { return syserror.EPERM } return i.xattrs.Setxattr(opts) } func (i *inode) removexattr(creds *auth.Credentials, name string) error { if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { return err } if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { return syserror.EOPNOTSUPP } if !i.userXattrSupported() { return syserror.EPERM } return i.xattrs.Removexattr(name) } // Extended attributes in the user.* namespace are only supported for regular // files and directories. func (i *inode) userXattrSupported() bool { filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode) return filetype == linux.S_IFREG || filetype == linux.S_IFDIR } // fileDescription is embedded by tmpfs implementations of // vfs.FileDescriptionImpl. type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl } func (fd *fileDescription) filesystem() *filesystem { return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } func (fd *fileDescription) inode() *inode { return fd.vfsfd.Dentry().Impl().(*dentry).inode } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx fd.inode().statTo(&stat) return stat, nil } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) return fd.inode().setStat(ctx, creds, &opts.Stat) } // Listxattr implements vfs.FileDescriptionImpl.Listxattr. func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { return fd.inode().listxattr(size) } // Getxattr implements vfs.FileDescriptionImpl.Getxattr. func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts) } // Setxattr implements vfs.FileDescriptionImpl.Setxattr. func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts) } // Removexattr implements vfs.FileDescriptionImpl.Removexattr. func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name) }