summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fsimpl/tmpfs
diff options
context:
space:
mode:
authorJamie Liu <jamieliu@google.com>2020-04-21 12:16:42 -0700
committergVisor bot <gvisor-bot@google.com>2020-04-21 12:18:07 -0700
commit9b5e305e05ef3ad51778981062d6152cea1cd4fb (patch)
tree7fa4a4f665e482efd9435ef7b0df2ccce505c70a /pkg/sentry/fsimpl/tmpfs
parent639c8dd80870133f61465588e717b725417a0c41 (diff)
Remove filesystem structure from vfs.Dentry.
This change: - Drastically simplifies the synchronization model: filesystem structure is both implementation-defined and implementation-synchronized. - Allows implementations of vfs.DentryImpl to use implementation-specific dentry types, reducing casts during path traversal. - Doesn't require dentries representing non-directory files to waste space on a map of children. - Allows dentry revalidation and mount lookup to be correctly ordered (fixed FIXME in fsimpl/gofer/filesystem.go). - Removes the need to have two separate maps in gofer.dentry (dentry.vfsd.children and dentry.negativeChildren) for positive and negative lookups respectively. //pkg/sentry/fsimpl/tmpfs/benchmark_test.go: name old time/op new time/op delta VFS2TmpfsStat/1-112 172ns ± 4% 165ns ± 3% -4.08% (p=0.002 n=9+9) VFS2TmpfsStat/2-112 199ns ± 3% 195ns ±10% ~ (p=0.132 n=8+9) VFS2TmpfsStat/3-112 230ns ± 2% 216ns ± 2% -6.15% (p=0.000 n=8+8) VFS2TmpfsStat/8-112 390ns ± 2% 358ns ± 4% -8.33% (p=0.000 n=9+8) VFS2TmpfsStat/64-112 2.20µs ± 3% 2.01µs ± 3% -8.48% (p=0.000 n=10+8) VFS2TmpfsStat/100-112 3.42µs ± 9% 3.08µs ± 2% -9.82% (p=0.000 n=9+8) VFS2TmpfsMountStat/1-112 278ns ± 1% 286ns ±15% ~ (p=0.712 n=8+10) VFS2TmpfsMountStat/2-112 311ns ± 4% 298ns ± 2% -4.27% (p=0.000 n=9+8) VFS2TmpfsMountStat/3-112 339ns ± 3% 330ns ± 9% ~ (p=0.070 n=8+9) VFS2TmpfsMountStat/8-112 503ns ± 3% 466ns ± 3% -7.38% (p=0.000 n=8+8) VFS2TmpfsMountStat/64-112 2.53µs ±16% 2.17µs ± 7% -14.19% (p=0.000 n=10+9) VFS2TmpfsMountStat/100-112 3.60µs ± 4% 3.30µs ± 8% -8.33% (p=0.001 n=8+9) Updates #1035 PiperOrigin-RevId: 307655892
Diffstat (limited to 'pkg/sentry/fsimpl/tmpfs')
-rw-r--r--pkg/sentry/fsimpl/tmpfs/BUILD12
-rw-r--r--pkg/sentry/fsimpl/tmpfs/benchmark_test.go7
-rw-r--r--pkg/sentry/fsimpl/tmpfs/directory.go84
-rw-r--r--pkg/sentry/fsimpl/tmpfs/filesystem.go249
-rw-r--r--pkg/sentry/fsimpl/tmpfs/stat_test.go12
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs.go82
6 files changed, 262 insertions, 184 deletions
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 4e6cd3491..a2d9649e7 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -15,6 +15,17 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "fstree",
+ out = "fstree.go",
+ package = "tmpfs",
+ prefix = "generic",
+ template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+ types = {
+ "Dentry": "dentry",
+ },
+)
+
go_library(
name = "tmpfs",
srcs = [
@@ -22,6 +33,7 @@ go_library(
"device_file.go",
"directory.go",
"filesystem.go",
+ "fstree.go",
"named_pipe.go",
"regular_file.go",
"socket_file.go",
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index 651912169..2fb5c4d84 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -438,13 +438,6 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
filePathBuilder.WriteByte('/')
}
- // Verify that we didn't create any directories under the mount
- // point (i.e. they were all created on the submount).
- firstDirName := fmt.Sprintf("%d", depth)
- if child := mountPoint.Dentry().Child(firstDirName); child != nil {
- b.Fatalf("created directory %q under root mount, not submount", firstDirName)
- }
-
// Create the file that will be stat'd.
fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
Root: root,
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 45712c9b9..f2399981b 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -15,35 +15,77 @@
package tmpfs
import (
+ "sync/atomic"
+
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
type directory struct {
- inode inode
+ // Since directories can't be hard-linked, each directory can only be
+ // associated with a single dentry, which we can store in the directory
+ // struct.
+ dentry dentry
+ inode inode
+
+ // childMap maps the names of the directory's children to their dentries.
+ // childMap is protected by filesystem.mu.
+ childMap map[string]*dentry
- // childList is a list containing (1) child Dentries and (2) fake Dentries
+ // numChildren is len(childMap), but accessed using atomic memory
+ // operations to avoid locking in inode.statTo().
+ numChildren int64
+
+ // childList is a list containing (1) child dentries and (2) fake dentries
// (with inode == nil) that represent the iteration position of
// directoryFDs. childList is used to support directoryFD.IterDirents()
- // efficiently. childList is protected by filesystem.mu.
+ // efficiently. childList is protected by iterMu.
+ iterMu sync.Mutex
childList dentryList
}
-func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode {
+func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *directory {
dir := &directory{}
dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode)
dir.inode.nlink = 2 // from "." and parent directory or ".." for root
- return &dir.inode
+ dir.dentry.inode = &dir.inode
+ dir.dentry.vfsd.Init(&dir.dentry)
+ return dir
+}
+
+// Preconditions: filesystem.mu must be locked for writing. dir must not
+// already contain a child with the given name.
+func (dir *directory) insertChildLocked(child *dentry, name string) {
+ child.parent = &dir.dentry
+ child.name = name
+ if dir.childMap == nil {
+ dir.childMap = make(map[string]*dentry)
+ }
+ dir.childMap[name] = child
+ atomic.AddInt64(&dir.numChildren, 1)
+ dir.iterMu.Lock()
+ dir.childList.PushBack(child)
+ dir.iterMu.Unlock()
+}
+
+// Preconditions: filesystem.mu must be locked for writing.
+func (dir *directory) removeChildLocked(child *dentry) {
+ delete(dir.childMap, child.name)
+ atomic.AddInt64(&dir.numChildren, -1)
+ dir.iterMu.Lock()
+ dir.childList.Remove(child)
+ dir.iterMu.Unlock()
}
type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
- // Protected by filesystem.mu.
+ // Protected by directory.iterMu.
iter *dentry
off int64
}
@@ -51,11 +93,10 @@ type directoryFD struct {
// Release implements vfs.FileDescriptionImpl.Release.
func (fd *directoryFD) Release() {
if fd.iter != nil {
- fs := fd.filesystem()
dir := fd.inode().impl.(*directory)
- fs.mu.Lock()
+ dir.iterMu.Lock()
dir.childList.Remove(fd.iter)
- fs.mu.Unlock()
+ dir.iterMu.Unlock()
fd.iter = nil
}
}
@@ -63,10 +104,13 @@ func (fd *directoryFD) Release() {
// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
fs := fd.filesystem()
- vfsd := fd.vfsfd.VirtualDentry().Dentry()
+ dir := fd.inode().impl.(*directory)
- fs.mu.Lock()
- defer fs.mu.Unlock()
+ // fs.mu is required to read d.parent and dentry.name.
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ dir.iterMu.Lock()
+ defer dir.iterMu.Unlock()
fd.inode().touchAtime(fd.vfsfd.Mount())
@@ -74,15 +118,16 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
if err := cb.Handle(vfs.Dirent{
Name: ".",
Type: linux.DT_DIR,
- Ino: vfsd.Impl().(*dentry).inode.ino,
+ Ino: dir.inode.ino,
NextOff: 1,
}); err != nil {
return err
}
fd.off++
}
+
if fd.off == 1 {
- parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
+ parentInode := genericParentOrSelf(&dir.dentry).inode
if err := cb.Handle(vfs.Dirent{
Name: "..",
Type: parentInode.direntType(),
@@ -94,7 +139,6 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
fd.off++
}
- dir := vfsd.Impl().(*dentry).inode.impl.(*directory)
var child *dentry
if fd.iter == nil {
// Start iteration at the beginning of dir.
@@ -109,7 +153,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
// Skip other directoryFD iterators.
if child.inode != nil {
if err := cb.Handle(vfs.Dirent{
- Name: child.vfsd.Name(),
+ Name: child.name,
Type: child.inode.direntType(),
Ino: child.inode.ino,
NextOff: fd.off + 1,
@@ -127,9 +171,9 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
- fs := fd.filesystem()
- fs.mu.Lock()
- defer fs.mu.Unlock()
+ dir := fd.inode().impl.(*directory)
+ dir.iterMu.Lock()
+ defer dir.iterMu.Unlock()
switch whence {
case linux.SEEK_SET:
@@ -157,8 +201,6 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
remChildren = offset - 2
}
- dir := fd.inode().impl.(*directory)
-
// Ensure that fd.iter exists and is not linked into dir.childList.
if fd.iter == nil {
fd.iter = &dentry{}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 452c4e2e0..5b62f9ebb 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -39,27 +39,43 @@ func (fs *filesystem) Sync(ctx context.Context) error {
//
// Preconditions: filesystem.mu must be locked. !rp.Done().
func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
- if !d.inode.isDir() {
+ dir, ok := d.inode.impl.(*directory)
+ if !ok {
return nil, syserror.ENOTDIR
}
if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
return nil, err
}
afterSymlink:
- if len(rp.Component()) > linux.NAME_MAX {
- return nil, syserror.ENAMETOOLONG
+ name := rp.Component()
+ if name == "." {
+ rp.Advance()
+ return d, nil
}
- nextVFSD, err := rp.ResolveComponent(&d.vfsd)
- if err != nil {
- return nil, err
+ if name == ".." {
+ if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil {
+ return nil, err
+ } else if isRoot || d.parent == nil {
+ rp.Advance()
+ return d, nil
+ }
+ if err := rp.CheckMount(&d.parent.vfsd); err != nil {
+ return nil, err
+ }
+ rp.Advance()
+ return d.parent, nil
}
- if nextVFSD == nil {
- // Since the Dentry tree is the sole source of truth for tmpfs, if it's
- // not in the Dentry tree, it doesn't exist.
+ if len(name) > linux.NAME_MAX {
+ return nil, syserror.ENAMETOOLONG
+ }
+ child, ok := dir.childMap[name]
+ if !ok {
return nil, syserror.ENOENT
}
- next := nextVFSD.Impl().(*dentry)
- if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+ if err := rp.CheckMount(&child.vfsd); err != nil {
+ return nil, err
+ }
+ if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
// TODO(gvisor.dev/issue/1197): Symlink traversals updates
// access time.
if err := rp.HandleSymlink(symlink.target); err != nil {
@@ -68,7 +84,7 @@ afterSymlink:
goto afterSymlink // don't check the current directory again
}
rp.Advance()
- return next, nil
+ return child, nil
}
// walkParentDirLocked resolves all but the last path component of rp to an
@@ -80,7 +96,7 @@ afterSymlink:
// fs/namei.c:path_parentat().
//
// Preconditions: filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
for !rp.Final() {
next, err := stepLocked(rp, d)
if err != nil {
@@ -88,10 +104,11 @@ func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
}
d = next
}
- if !d.inode.isDir() {
+ dir, ok := d.inode.impl.(*directory)
+ if !ok {
return nil, syserror.ENOTDIR
}
- return d, nil
+ return dir, nil
}
// resolveLocked resolves rp to an existing file.
@@ -122,14 +139,14 @@ func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
//
// Preconditions: !rp.Done(). For the final path component in rp,
// !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
fs.mu.Lock()
defer fs.mu.Unlock()
- parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
if err != nil {
return err
}
- if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
return err
}
name := rp.Component()
@@ -139,19 +156,15 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
if len(name) > linux.NAME_MAX {
return syserror.ENAMETOOLONG
}
- // Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(),
- // because if the child exists we want to return EEXIST immediately instead
- // of attempting symlink/mount traversal.
- if parent.vfsd.Child(name) != nil {
+ if _, ok := parentDir.childMap[name]; ok {
return syserror.EEXIST
}
if !dir && rp.MustBeDir() {
return syserror.ENOENT
}
- // In tmpfs, the only way to cause a dentry to be disowned is by removing
- // it from the filesystem, so this check is equivalent to checking if
- // parent has been removed.
- if parent.vfsd.IsDisowned() {
+ // tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only
+ // be dead if it was deleted.
+ if parentDir.dentry.vfsd.IsDead() {
return syserror.ENOENT
}
mnt := rp.Mount()
@@ -159,10 +172,10 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
return err
}
defer mnt.EndWrite()
- if err := create(parent, name); err != nil {
+ if err := create(parentDir, name); err != nil {
return err
}
- parent.inode.touchCMtime()
+ parentDir.inode.touchCMtime()
return nil
}
@@ -201,17 +214,17 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ dir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
if err != nil {
return nil, err
}
- d.IncRef()
- return &d.vfsd, nil
+ dir.dentry.IncRef()
+ return &dir.dentry.vfsd, nil
}
// LinkAt implements vfs.FilesystemImpl.LinkAt.
func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
- return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+ return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
if rp.Mount() != vd.Mount() {
return syserror.EXDEV
}
@@ -226,30 +239,27 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return syserror.EMLINK
}
d.inode.incLinksLocked()
- child := fs.newDentry(d.inode)
- parent.vfsd.InsertChild(&child.vfsd, name)
- parent.inode.impl.(*directory).childList.PushBack(child)
+ parentDir.insertChildLocked(fs.newDentry(d.inode), name)
return nil
})
}
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
- return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error {
- if parent.inode.nlink == maxLinks {
+ return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error {
+ if parentDir.inode.nlink == maxLinks {
return syserror.EMLINK
}
- parent.inode.incLinksLocked() // from child's ".."
- child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
- parent.vfsd.InsertChild(&child.vfsd, name)
- parent.inode.impl.(*directory).childList.PushBack(child)
+ parentDir.inode.incLinksLocked() // from child's ".."
+ childDir := fs.newDirectory(rp.Credentials(), opts.Mode)
+ parentDir.insertChildLocked(&childDir.dentry, name)
return nil
})
}
// MknodAt implements vfs.FilesystemImpl.MknodAt.
func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
- return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+ return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
var childInode *inode
switch opts.Mode.FileType() {
case 0, linux.S_IFREG:
@@ -266,8 +276,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
return syserror.EINVAL
}
child := fs.newDentry(childInode)
- parent.vfsd.InsertChild(&child.vfsd, name)
- parent.inode.impl.(*directory).childList.PushBack(child)
+ parentDir.insertChildLocked(child, name)
return nil
})
}
@@ -306,12 +315,12 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
return start.open(ctx, rp, &opts, false /* afterCreate */)
}
afterTrailingSymlink:
- parent, err := walkParentDirLocked(rp, start)
+ parentDir, err := walkParentDirLocked(rp, start)
if err != nil {
return nil, err
}
// Check for search permission in the parent directory.
- if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
return nil, err
}
// Reject attempts to open directories with O_CREAT.
@@ -322,11 +331,14 @@ afterTrailingSymlink:
if name == "." || name == ".." {
return nil, syserror.EISDIR
}
+ if len(name) > linux.NAME_MAX {
+ return nil, syserror.ENAMETOOLONG
+ }
// Determine whether or not we need to create a file.
- child, err := stepLocked(rp, parent)
- if err == syserror.ENOENT {
+ child, ok := parentDir.childMap[name]
+ if !ok {
// Already checked for searchability above; now check for writability.
- if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+ if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
return nil, err
}
if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -335,21 +347,26 @@ afterTrailingSymlink:
defer rp.Mount().EndWrite()
// Create and open the child.
child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
- parent.vfsd.InsertChild(&child.vfsd, name)
- parent.inode.impl.(*directory).childList.PushBack(child)
+ parentDir.insertChildLocked(child, name)
fd, err := child.open(ctx, rp, &opts, true)
if err != nil {
return nil, err
}
- parent.inode.touchCMtime()
+ parentDir.inode.touchCMtime()
return fd, nil
}
- if err != nil {
+ // Is the file mounted over?
+ if err := rp.CheckMount(&child.vfsd); err != nil {
return nil, err
}
// Do we need to resolve a trailing symlink?
- if !rp.Done() {
- start = parent
+ if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+ // TODO(gvisor.dev/issue/1197): Symlink traversals updates
+ // access time.
+ if err := rp.HandleSymlink(symlink.target); err != nil {
+ return nil, err
+ }
+ start = &parentDir.dentry
goto afterTrailingSymlink
}
// Open existing file.
@@ -428,7 +445,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// Resolve newParent first to verify that it's on this Mount.
fs.mu.Lock()
defer fs.mu.Unlock()
- newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ newParentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
if err != nil {
return err
}
@@ -445,23 +462,22 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
defer mnt.EndWrite()
- oldParent := oldParentVD.Dentry().Impl().(*dentry)
- if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory)
+ if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
return err
}
- // Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(),
- // because if the existing child is a symlink or mount point then we want
- // to rename over it rather than follow it.
- renamedVFSD := oldParent.vfsd.Child(oldName)
- if renamedVFSD == nil {
+ renamed, ok := oldParentDir.childMap[oldName]
+ if !ok {
return syserror.ENOENT
}
- renamed := renamedVFSD.Impl().(*dentry)
+ // Note that we don't need to call rp.CheckMount(), since if renamed is a
+ // mount point then we want to rename the mount point, not anything in the
+ // mounted filesystem.
if renamed.inode.isDir() {
- if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) {
+ if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) {
return syserror.EINVAL
}
- if oldParent != newParent {
+ if oldParentDir != newParentDir {
// Writability is needed to change renamed's "..".
if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
return err
@@ -473,18 +489,17 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
}
- if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
return err
}
- replacedVFSD := newParent.vfsd.Child(newName)
- var replaced *dentry
- if replacedVFSD != nil {
- replaced = replacedVFSD.Impl().(*dentry)
- if replaced.inode.isDir() {
+ replaced, ok := newParentDir.childMap[newName]
+ if ok {
+ replacedDir, ok := replaced.inode.impl.(*directory)
+ if ok {
if !renamed.inode.isDir() {
return syserror.EISDIR
}
- if replaced.vfsd.HasChildren() {
+ if len(replacedDir.childMap) != 0 {
return syserror.ENOTEMPTY
}
} else {
@@ -496,11 +511,13 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
}
} else {
- if renamed.inode.isDir() && newParent.inode.nlink == maxLinks {
+ if renamed.inode.isDir() && newParentDir.inode.nlink == maxLinks {
return syserror.EMLINK
}
}
- if newParent.vfsd.IsDisowned() {
+ // tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can
+ // only be dead if it was deleted.
+ if newParentDir.dentry.vfsd.IsDead() {
return syserror.ENOENT
}
@@ -508,36 +525,38 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// simplicity, under the assumption that applications are not intentionally
// doing noop renames expecting them to succeed where non-noop renames
// would fail.
- if renamedVFSD == replacedVFSD {
+ if renamed == replaced {
return nil
}
vfsObj := rp.VirtualFilesystem()
- oldParentDir := oldParent.inode.impl.(*directory)
- newParentDir := newParent.inode.impl.(*directory)
mntns := vfs.MountNamespaceFromContext(ctx)
defer mntns.DecRef()
- if err := vfsObj.PrepareRenameDentry(mntns, renamedVFSD, replacedVFSD); err != nil {
+ var replacedVFSD *vfs.Dentry
+ if replaced != nil {
+ replacedVFSD = &replaced.vfsd
+ }
+ if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
return err
}
if replaced != nil {
- newParentDir.childList.Remove(replaced)
+ newParentDir.removeChildLocked(replaced)
if replaced.inode.isDir() {
- newParent.inode.decLinksLocked() // from replaced's ".."
+ newParentDir.inode.decLinksLocked() // from replaced's ".."
}
replaced.inode.decLinksLocked()
}
- oldParentDir.childList.Remove(renamed)
- newParentDir.childList.PushBack(renamed)
- if renamed.inode.isDir() {
- oldParent.inode.decLinksLocked()
- newParent.inode.incLinksLocked()
+ oldParentDir.removeChildLocked(renamed)
+ newParentDir.insertChildLocked(renamed, newName)
+ vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD)
+ oldParentDir.inode.touchCMtime()
+ if oldParentDir != newParentDir {
+ if renamed.inode.isDir() {
+ oldParentDir.inode.decLinksLocked()
+ newParentDir.inode.incLinksLocked()
+ }
+ newParentDir.inode.touchCMtime()
}
- oldParent.inode.touchCMtime()
- newParent.inode.touchCMtime()
renamed.inode.touchCtime()
- // TODO(gvisor.dev/issue/1197): Update timestamps and parent directory
- // sizes.
- vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
return nil
}
@@ -545,11 +564,11 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
- parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
if err != nil {
return err
}
- if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
return err
}
name := rp.Component()
@@ -559,15 +578,15 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
if name == ".." {
return syserror.ENOTEMPTY
}
- childVFSD := parent.vfsd.Child(name)
- if childVFSD == nil {
+ child, ok := parentDir.childMap[name]
+ if !ok {
return syserror.ENOENT
}
- child := childVFSD.Impl().(*dentry)
- if !child.inode.isDir() {
+ childDir, ok := child.inode.impl.(*directory)
+ if !ok {
return syserror.ENOTDIR
}
- if childVFSD.HasChildren() {
+ if len(childDir.childMap) != 0 {
return syserror.ENOTEMPTY
}
mnt := rp.Mount()
@@ -578,14 +597,14 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
vfsObj := rp.VirtualFilesystem()
mntns := vfs.MountNamespaceFromContext(ctx)
defer mntns.DecRef()
- if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
+ if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
return err
}
- parent.inode.impl.(*directory).childList.Remove(child)
- parent.inode.decLinksLocked() // from child's ".."
+ parentDir.removeChildLocked(child)
+ parentDir.inode.decLinksLocked() // from child's ".."
child.inode.decLinksLocked()
- vfsObj.CommitDeleteDentry(childVFSD)
- parent.inode.touchCMtime()
+ vfsObj.CommitDeleteDentry(&child.vfsd)
+ parentDir.inode.touchCMtime()
return nil
}
@@ -627,10 +646,9 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
- return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+ return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error {
child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
- parent.vfsd.InsertChild(&child.vfsd, name)
- parent.inode.impl.(*directory).childList.PushBack(child)
+ parentDir.insertChildLocked(child, name)
return nil
})
}
@@ -639,22 +657,21 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
- parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
if err != nil {
return err
}
- if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
return err
}
name := rp.Component()
if name == "." || name == ".." {
return syserror.EISDIR
}
- childVFSD := parent.vfsd.Child(name)
- if childVFSD == nil {
+ child, ok := parentDir.childMap[name]
+ if !ok {
return syserror.ENOENT
}
- child := childVFSD.Impl().(*dentry)
if child.inode.isDir() {
return syserror.EISDIR
}
@@ -669,13 +686,13 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
vfsObj := rp.VirtualFilesystem()
mntns := vfs.MountNamespaceFromContext(ctx)
defer mntns.DecRef()
- if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil {
+ if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
return err
}
- parent.inode.impl.(*directory).childList.Remove(child)
+ parentDir.removeChildLocked(child)
child.inode.decLinksLocked()
- vfsObj.CommitDeleteDentry(childVFSD)
- parent.inode.touchCMtime()
+ vfsObj.CommitDeleteDentry(&child.vfsd)
+ parentDir.inode.touchCMtime()
return nil
}
@@ -743,5 +760,5 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
fs.mu.RLock()
defer fs.mu.RUnlock()
- return vfs.GenericPrependPath(vfsroot, vd, b)
+ return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
}
diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go
index d4f59ee5b..60c2c980e 100644
--- a/pkg/sentry/fsimpl/tmpfs/stat_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go
@@ -71,9 +71,15 @@ func TestStatAfterCreate(t *testing.T) {
t.Errorf("got btime %d, want 0", got.Btime.ToNsec())
}
- // Size should be 0.
- if got.Size != 0 {
- t.Errorf("got size %d, want 0", got.Size)
+ // Size should be 0 (except for directories, which make up a size
+ // of 20 per entry, including the "." and ".." entries present in
+ // otherwise-empty directories).
+ wantSize := uint64(0)
+ if typ == "dir" {
+ wantSize = 40
+ }
+ if got.Size != wantSize {
+ t.Errorf("got size %d, want %d", got.Size, wantSize)
}
// Nlink should be 1 for files, 2 for dirs.
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 82c709b43..efc931468 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -12,16 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Package tmpfs provides a filesystem implementation that behaves like tmpfs:
-// the Dentry tree is the sole source of truth for the state of the filesystem.
+// Package tmpfs provides an in-memory filesystem whose contents are
+// application-mutable, consistent with Linux's tmpfs.
//
// Lock order:
//
// filesystem.mu
// inode.mu
// regularFileFD.offMu
+// *** "memmap.Mappable locks" below this point
// regularFile.mapsMu
+// *** "memmap.Mappable locks taken by Translate" below this point
// regularFile.dataMu
+// directory.iterMu
package tmpfs
import (
@@ -41,6 +44,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// Name is the default filesystem name.
@@ -112,18 +116,18 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
fs.vfsfs.Init(vfsObj, newFSType, &fs)
- var root *inode
+ var root *dentry
switch rootFileType {
case linux.S_IFREG:
- root = fs.newRegularFile(creds, 0777)
+ root = fs.newDentry(fs.newRegularFile(creds, 0777))
case linux.S_IFLNK:
- root = fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget)
+ root = fs.newDentry(fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget))
case linux.S_IFDIR:
- root = fs.newDirectory(creds, 01777)
+ root = &fs.newDirectory(creds, 01777).dentry
default:
return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
}
- return &fs.vfsfs, &fs.newDentry(root).vfsd, nil
+ return &fs.vfsfs, &root.vfsd, nil
}
// Release implements vfs.FilesystemImpl.Release.
@@ -134,20 +138,29 @@ func (fs *filesystem) Release() {
type dentry struct {
vfsd vfs.Dentry
+ // parent is this dentry's parent directory. Each referenced dentry holds a
+ // reference on parent.dentry. If this dentry is a filesystem root, parent
+ // is nil. parent is protected by filesystem.mu.
+ parent *dentry
+
+ // name is the name of this dentry in its parent. If this dentry is a
+ // filesystem root, name is the empty string. name is protected by
+ // filesystem.mu.
+ name string
+
+ // dentryEntry (ugh) links dentries into their parent directory.childList.
+ dentryEntry
+
// inode is the inode represented by this dentry. Multiple Dentries may
// share a single non-directory inode (with hard links). inode is
// immutable.
- inode *inode
-
+ //
// tmpfs doesn't count references on dentries; because the dentry tree is
// the sole source of truth, it is by definition always consistent with the
// state of the filesystem. However, it does count references on inodes,
// because inode resources are released when all references are dropped.
- // (tmpfs doesn't really have resources to release, but we implement
- // reference counting because tmpfs regular files will.)
-
- // dentryEntry (ugh) links dentries into their parent directory.childList.
- dentryEntry
+ // dentry therefore forwards reference counting directly to inode.
+ inode *inode
}
func (fs *filesystem) newDentry(inode *inode) *dentry {
@@ -207,10 +220,6 @@ type inode struct {
ctime int64 // nanoseconds
mtime int64 // nanoseconds
- // Only meaningful for device special files.
- rdevMajor uint32
- rdevMinor uint32
-
// Advisory file locks, which lock at the inode level.
locks lock.FileLocks
@@ -230,7 +239,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials,
i.gid = uint32(creds.EffectiveKGID)
i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
// Tmpfs creation sets atime, ctime, and mtime to current time.
- now := i.clock.Now().Nanoseconds()
+ now := fs.clock.Now().Nanoseconds()
i.atime = now
i.ctime = now
i.mtime = now
@@ -283,14 +292,10 @@ func (i *inode) tryIncRef() bool {
func (i *inode) decRef() {
if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
if regFile, ok := i.impl.(*regularFile); ok {
- // Hold inode.mu and regFile.dataMu while mutating
- // size.
- i.mu.Lock()
- regFile.dataMu.Lock()
+ // Release memory used by regFile to store data. Since regFile is
+ // no longer usable, we don't need to grab any locks or update any
+ // metadata.
regFile.data.DropAll(regFile.memFile)
- atomic.StoreUint64(&regFile.size, 0)
- regFile.dataMu.Unlock()
- i.mu.Unlock()
}
} else if refs < 0 {
panic("tmpfs.inode.decRef() called without holding a reference")
@@ -310,15 +315,15 @@ func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) e
// a concurrent modification), so we do not require holding inode.mu.
func (i *inode) statTo(stat *linux.Statx) {
stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
- linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_ATIME |
- linux.STATX_BTIME | linux.STATX_CTIME | linux.STATX_MTIME
- stat.Blksize = 1 // usermem.PageSize in tmpfs
+ linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
+ linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME |
+ linux.STATX_MTIME
+ stat.Blksize = usermem.PageSize
stat.Nlink = atomic.LoadUint32(&i.nlink)
stat.UID = atomic.LoadUint32(&i.uid)
stat.GID = atomic.LoadUint32(&i.gid)
stat.Mode = uint16(atomic.LoadUint32(&i.mode))
stat.Ino = i.ino
- // Linux's tmpfs has no concept of btime, so zero-value is returned.
stat.Atime = linux.NsecToStatxTimestamp(i.atime)
stat.Ctime = linux.NsecToStatxTimestamp(i.ctime)
stat.Mtime = linux.NsecToStatxTimestamp(i.mtime)
@@ -327,19 +332,22 @@ func (i *inode) statTo(stat *linux.Statx) {
case *regularFile:
stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
stat.Size = uint64(atomic.LoadUint64(&impl.size))
- // In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
- // a uint64 accessed using atomic memory operations to avoid taking
- // locks).
+ // TODO(jamieliu): This should be impl.data.Span() / 512, but this is
+ // too expensive to compute here. Cache it in regularFile.
stat.Blocks = allocatedBlocksForSize(stat.Size)
+ case *directory:
+ // "20" is mm/shmem.c:BOGO_DIRENT_SIZE.
+ stat.Size = 20 * (2 + uint64(atomic.LoadInt64(&impl.numChildren)))
+ // stat.Blocks is 0.
case *symlink:
- stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
stat.Size = uint64(len(impl.target))
- stat.Blocks = allocatedBlocksForSize(stat.Size)
+ // stat.Blocks is 0.
+ case *namedPipe, *socketFile:
+ // stat.Size and stat.Blocks are 0.
case *deviceFile:
+ // stat.Size and stat.Blocks are 0.
stat.RdevMajor = impl.major
stat.RdevMinor = impl.minor
- case *socketFile, *directory, *namedPipe:
- // Nothing to do.
default:
panic(fmt.Sprintf("unknown inode type: %T", i.impl))
}