summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorJamie Liu <jamieliu@google.com>2020-04-23 15:35:56 -0700
committergVisor bot <gvisor-bot@google.com>2020-04-23 15:37:10 -0700
commit5042ea7e2cbdc0c04fd454583589a3b1e152f95d (patch)
treecc1c58e4f0b1176205a86358badfca4aa031610b
parentcc5de905e628c5e9aca7e7a333d6dd9638719b6a (diff)
Add vfs.MkdirOptions.ForSyntheticMountpoint.
PiperOrigin-RevId: 308143529
-rw-r--r--pkg/sentry/fsimpl/gofer/directory.go147
-rw-r--r--pkg/sentry/fsimpl/gofer/filesystem.go323
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer.go177
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer_test.go2
-rw-r--r--pkg/sentry/vfs/filesystem.go3
-rw-r--r--pkg/sentry/vfs/options.go19
-rw-r--r--runsc/boot/vfs.go6
7 files changed, 461 insertions, 216 deletions
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index d02691232..c67766ab2 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -21,8 +21,10 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
func (d *dentry) isDir() bool {
@@ -41,15 +43,46 @@ func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
d.children[name] = child
}
-// Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop !=
-// InteropModeShared.
-func (d *dentry) cacheNegativeChildLocked(name string) {
+// Preconditions: d.dirMu must be locked. d.isDir().
+func (d *dentry) cacheNegativeLookupLocked(name string) {
+ // Don't cache negative lookups if InteropModeShared is in effect (since
+ // this makes remote lookup unavoidable), or if d.isSynthetic() (in which
+ // case the only files in the directory are those for which a dentry exists
+ // in d.children). Instead, just delete any previously-cached dentry.
+ if d.fs.opts.interop == InteropModeShared || d.isSynthetic() {
+ delete(d.children, name)
+ return
+ }
if d.children == nil {
d.children = make(map[string]*dentry)
}
d.children[name] = nil
}
+// createSyntheticDirectory creates a synthetic directory with the given name
+// in d.
+//
+// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain
+// a child with the given name.
+func (d *dentry) createSyntheticDirectoryLocked(name string, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) {
+ d2 := &dentry{
+ refs: 1, // held by d
+ fs: d.fs,
+ mode: uint32(mode) | linux.S_IFDIR,
+ uid: uint32(kuid),
+ gid: uint32(kgid),
+ blockSize: usermem.PageSize, // arbitrary
+ handle: handle{
+ fd: -1,
+ },
+ }
+ d2.pf.dentry = d2
+ d2.vfsd.Init(d2)
+
+ d.cacheNewChildLocked(d2, name)
+ d.syntheticChildren++
+}
+
type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
@@ -77,7 +110,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
fd.dirents = ds
}
- if d.fs.opts.interop != InteropModeShared {
+ if d.cachedMetadataAuthoritative() {
d.touchAtime(fd.vfsfd.Mount())
}
@@ -108,10 +141,10 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
// filesystem.renameMu is needed for d.parent, and must be locked before
// dentry.dirMu.
d.fs.renameMu.RLock()
+ defer d.fs.renameMu.RUnlock()
d.dirMu.Lock()
defer d.dirMu.Unlock()
if d.dirents != nil {
- d.fs.renameMu.RUnlock()
return d.dirents, nil
}
@@ -132,51 +165,81 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
NextOff: 2,
},
}
- d.fs.renameMu.RUnlock()
- off := uint64(0)
- const count = 64 * 1024 // for consistency with the vfs1 client
- d.handleMu.RLock()
- defer d.handleMu.RUnlock()
- if !d.handleReadable {
- // This should not be possible because a readable handle should have
- // been opened when the calling directoryFD was opened.
- panic("gofer.dentry.getDirents called without a readable handle")
- }
- for {
- p9ds, err := d.handle.file.readdir(ctx, off, count)
- if err != nil {
- return nil, err
+ var realChildren map[string]struct{}
+ if !d.isSynthetic() {
+ if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared {
+ // Record the set of children d actually has so that we don't emit
+ // duplicate entries for synthetic children.
+ realChildren = make(map[string]struct{})
}
- if len(p9ds) == 0 {
- // Cache dirents for future directoryFDs if permitted.
- if d.fs.opts.interop != InteropModeShared {
- d.dirents = dirents
+ off := uint64(0)
+ const count = 64 * 1024 // for consistency with the vfs1 client
+ d.handleMu.RLock()
+ if !d.handleReadable {
+ // This should not be possible because a readable handle should
+ // have been opened when the calling directoryFD was opened.
+ d.handleMu.RUnlock()
+ panic("gofer.dentry.getDirents called without a readable handle")
+ }
+ for {
+ p9ds, err := d.handle.file.readdir(ctx, off, count)
+ if err != nil {
+ d.handleMu.RUnlock()
+ return nil, err
+ }
+ if len(p9ds) == 0 {
+ d.handleMu.RUnlock()
+ break
+ }
+ for _, p9d := range p9ds {
+ if p9d.Name == "." || p9d.Name == ".." {
+ continue
+ }
+ dirent := vfs.Dirent{
+ Name: p9d.Name,
+ Ino: p9d.QID.Path,
+ NextOff: int64(len(dirents) + 1),
+ }
+ // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
+ // DMSOCKET.
+ switch p9d.Type {
+ case p9.TypeSymlink:
+ dirent.Type = linux.DT_LNK
+ case p9.TypeDir:
+ dirent.Type = linux.DT_DIR
+ default:
+ dirent.Type = linux.DT_REG
+ }
+ dirents = append(dirents, dirent)
+ if realChildren != nil {
+ realChildren[p9d.Name] = struct{}{}
+ }
}
- return dirents, nil
+ off = p9ds[len(p9ds)-1].Offset
}
- for _, p9d := range p9ds {
- if p9d.Name == "." || p9d.Name == ".." {
+ }
+ // Emit entries for synthetic children.
+ if d.syntheticChildren != 0 {
+ for _, child := range d.children {
+ if child == nil || !child.isSynthetic() {
continue
}
- dirent := vfs.Dirent{
- Name: p9d.Name,
- Ino: p9d.QID.Path,
- NextOff: int64(len(dirents) + 1),
- }
- // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
- // DMSOCKET.
- switch p9d.Type {
- case p9.TypeSymlink:
- dirent.Type = linux.DT_LNK
- case p9.TypeDir:
- dirent.Type = linux.DT_DIR
- default:
- dirent.Type = linux.DT_REG
+ if _, ok := realChildren[child.name]; ok {
+ continue
}
- dirents = append(dirents, dirent)
+ dirents = append(dirents, vfs.Dirent{
+ Name: child.name,
+ Type: uint8(atomic.LoadUint32(&child.mode) >> 12),
+ Ino: child.ino,
+ NextOff: int64(len(dirents) + 1),
+ })
}
- off = p9ds[len(p9ds)-1].Offset
}
+ // Cache dirents for future directoryFDs if permitted.
+ if d.cachedMetadataAuthoritative() {
+ d.dirents = dirents
+ }
+ return dirents, nil
}
// Seek implements vfs.FileDescriptionImpl.Seek.
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index eba4aabe8..98ccb42fd 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -29,14 +29,16 @@ import (
// Sync implements vfs.FilesystemImpl.Sync.
func (fs *filesystem) Sync(ctx context.Context) error {
- // Snapshot current dentries and special files.
+ // Snapshot current syncable dentries and special files.
fs.syncMu.Lock()
- ds := make([]*dentry, 0, len(fs.dentries))
- for d := range fs.dentries {
+ ds := make([]*dentry, 0, len(fs.syncableDentries))
+ for d := range fs.syncableDentries {
+ d.IncRef()
ds = append(ds, d)
}
sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs))
for sffd := range fs.specialFileFDs {
+ sffd.vfsfd.IncRef()
sffds = append(sffds, sffd)
}
fs.syncMu.Unlock()
@@ -47,9 +49,6 @@ func (fs *filesystem) Sync(ctx context.Context) error {
// Sync regular files.
for _, d := range ds {
- if !d.TryIncRef() {
- continue
- }
err := d.syncSharedHandle(ctx)
d.DecRef()
if err != nil && retErr == nil {
@@ -60,9 +59,6 @@ func (fs *filesystem) Sync(ctx context.Context) error {
// Sync special files, which may be writable but do not use dentry shared
// handles (so they won't be synced by the above).
for _, sffd := range sffds {
- if !sffd.vfsfd.TryIncRef() {
- continue
- }
err := sffd.Sync(ctx)
sffd.vfsfd.DecRef()
if err != nil && retErr == nil {
@@ -114,8 +110,8 @@ func putDentrySlice(ds *[]*dentry) {
// to *ds.
//
// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
-// !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached
-// metadata must be up to date.
+// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata
+// must be up to date.
//
// Postconditions: The returned dentry's cached metadata is up to date.
func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
@@ -148,7 +144,7 @@ afterSymlink:
if err := rp.CheckMount(&d.parent.vfsd); err != nil {
return nil, err
}
- if fs.opts.interop == InteropModeShared && d != d.parent {
+ if d != d.parent && !d.cachedMetadataAuthoritative() {
_, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask())
if err != nil {
return nil, err
@@ -195,7 +191,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil
return nil, syserror.ENAMETOOLONG
}
child, ok := parent.children[name]
- if ok && fs.opts.interop != InteropModeShared {
+ if (ok && fs.opts.interop != InteropModeShared) || parent.isSynthetic() {
// Whether child is nil or not, it is cached information that is
// assumed to be correct.
return child, nil
@@ -206,7 +202,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil
return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds)
}
-// Preconditions: As for getChildLocked.
+// Preconditions: As for getChildLocked. !parent.isSynthetic().
func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
if err != nil && err != syserror.ENOENT {
@@ -220,24 +216,41 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
child.updateFromP9Attrs(attrMask, &attr)
return child, nil
}
- // The file at this path has changed or no longer exists. Remove
- // the stale dentry from the tree, and re-evaluate its caching
- // status (i.e. if it has 0 references, drop it).
+ if file.isNil() && child.isSynthetic() {
+ // We have a synthetic file, and no remote file has arisen to
+ // replace it.
+ return child, nil
+ }
+ // The file at this path has changed or no longer exists. Mark the
+ // dentry invalidated, and re-evaluate its caching status (i.e. if it
+ // has 0 references, drop it). Wait to update parent.children until we
+ // know what to replace the existing dentry with (i.e. one of the
+ // returns below), to avoid a redundant map access.
vfsObj.InvalidateDentry(&child.vfsd)
+ if child.isSynthetic() {
+ // Normally we don't mark invalidated dentries as deleted since
+ // they may still exist (but at a different path), and also for
+ // consistency with Linux. However, synthetic files are guaranteed
+ // to become unreachable if their dentries are invalidated, so
+ // treat their invalidation as deletion.
+ child.setDeleted()
+ parent.syntheticChildren--
+ child.decRefLocked()
+ parent.dirents = nil
+ }
*ds = appendDentry(*ds, child)
}
if file.isNil() {
// No file exists at this path now. Cache the negative lookup if
// allowed.
- if fs.opts.interop != InteropModeShared {
- parent.cacheNegativeChildLocked(name)
- }
+ parent.cacheNegativeLookupLocked(name)
return nil, nil
}
// Create a new dentry representing the file.
child, err = fs.newDentry(ctx, file, qid, attrMask, &attr)
if err != nil {
file.close(ctx)
+ delete(parent.children, name)
return nil, err
}
parent.cacheNewChildLocked(child, name)
@@ -252,8 +265,9 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
// rp.Start().Impl().(*dentry)). It does not check that the returned directory
// is searchable by the provider of rp.
//
-// Preconditions: fs.renameMu must be locked. !rp.Done(). If fs.opts.interop ==
-// InteropModeShared, then d's cached metadata must be up to date.
+// Preconditions: fs.renameMu must be locked. !rp.Done(). If
+// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to
+// date.
func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
for !rp.Final() {
d.dirMu.Lock()
@@ -275,7 +289,7 @@ func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
// Preconditions: fs.renameMu must be locked.
func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
d := rp.Start().Impl().(*dentry)
- if fs.opts.interop == InteropModeShared {
+ if !d.cachedMetadataAuthoritative() {
// Get updated metadata for rp.Start() as required by fs.stepLocked().
if err := d.updateFromGetattr(ctx); err != nil {
return nil, err
@@ -297,16 +311,17 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
}
// doCreateAt checks that creating a file at rp is permitted, then invokes
-// create to do so.
+// createInRemoteDir (if the parent directory is a real remote directory) or
+// createInSyntheticDir (if the parent directory is synthetic) to do so.
//
// Preconditions: !rp.Done(). For the final path component in rp,
// !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error {
var ds *[]*dentry
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckCaching(&ds)
start := rp.Start().Impl().(*dentry)
- if fs.opts.interop == InteropModeShared {
+ if !start.cachedMetadataAuthoritative() {
// Get updated metadata for start as required by
// fs.walkParentDirLocked().
if err := start.updateFromGetattr(ctx); err != nil {
@@ -340,6 +355,20 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
defer mnt.EndWrite()
parent.dirMu.Lock()
defer parent.dirMu.Unlock()
+ if parent.isSynthetic() {
+ if child := parent.children[name]; child != nil {
+ return syserror.EEXIST
+ }
+ if createInSyntheticDir == nil {
+ return syserror.EPERM
+ }
+ if err := createInSyntheticDir(parent, name); err != nil {
+ return err
+ }
+ parent.touchCMtime()
+ parent.dirents = nil
+ return nil
+ }
if fs.opts.interop == InteropModeShared {
// The existence of a dentry at name would be inconclusive because the
// file it represents may have been deleted from the remote filesystem,
@@ -348,21 +377,21 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
// will fail with EEXIST like we would have. If the RPC succeeds, and a
// stale dentry exists, the dentry will fail revalidation next time
// it's used.
- return create(parent, name)
+ return createInRemoteDir(parent, name)
}
if child := parent.children[name]; child != nil {
return syserror.EEXIST
}
// No cached dentry exists; however, there might still be an existing file
// at name. As above, we attempt the file creation RPC anyway.
- if err := create(parent, name); err != nil {
+ if err := createInRemoteDir(parent, name); err != nil {
return err
}
+ if child, ok := parent.children[name]; ok && child == nil {
+ // Delete the now-stale negative dentry.
+ delete(parent.children, name)
+ }
parent.touchCMtime()
- // Either parent.children[name] doesn't exist (in which case this is a
- // no-op) or is nil (in which case this erases the now-stale information
- // that the file doesn't exist).
- delete(parent.children, name)
parent.dirents = nil
return nil
}
@@ -373,7 +402,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckCaching(&ds)
start := rp.Start().Impl().(*dentry)
- if fs.opts.interop == InteropModeShared {
+ if !start.cachedMetadataAuthoritative() {
// Get updated metadata for start as required by
// fs.walkParentDirLocked().
if err := start.updateFromGetattr(ctx); err != nil {
@@ -421,8 +450,10 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
// only revalidating the dentry if that fails (indicating that the existing
// dentry is a mount point).
if child != nil {
+ child.dirMu.Lock()
+ defer child.dirMu.Unlock()
if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
- if fs.opts.interop != InteropModeShared {
+ if parent.cachedMetadataAuthoritative() {
return err
}
child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
@@ -437,13 +468,37 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
}
}
flags := uint32(0)
+ // If a dentry exists, use it for best-effort checks on its deletability.
if dir {
- if child != nil && !child.isDir() {
- vfsObj.AbortDeleteDentry(&child.vfsd)
- return syserror.ENOTDIR
+ if child != nil {
+ // child must be an empty directory.
+ if child.syntheticChildren != 0 {
+ // This is definitely not an empty directory, irrespective of
+ // fs.opts.interop.
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ return syserror.ENOTEMPTY
+ }
+ // If InteropModeShared is in effect and the first call to
+ // PrepareDeleteDentry above succeeded, then child wasn't
+ // revalidated (so we can't expect its file type to be correct) and
+ // individually revalidating its children (to confirm that they
+ // still exist) would be a waste of time.
+ if child.cachedMetadataAuthoritative() {
+ if !child.isDir() {
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ return syserror.ENOTDIR
+ }
+ for _, grandchild := range child.children {
+ if grandchild != nil {
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ return syserror.ENOTEMPTY
+ }
+ }
+ }
}
flags = linux.AT_REMOVEDIR
} else {
+ // child must be a non-directory file.
if child != nil && child.isDir() {
vfsObj.AbortDeleteDentry(&child.vfsd)
return syserror.EISDIR
@@ -455,28 +510,36 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b
return syserror.ENOTDIR
}
}
- err = parent.file.unlinkAt(ctx, name, flags)
- if err != nil {
- if child != nil {
- vfsObj.AbortDeleteDentry(&child.vfsd)
- }
- return err
- }
- if fs.opts.interop != InteropModeShared {
- parent.touchCMtime()
- if dir {
- parent.decLinks()
+ if parent.isSynthetic() {
+ if child == nil {
+ return syserror.ENOENT
}
- parent.cacheNegativeChildLocked(name)
- parent.dirents = nil
} else {
- delete(parent.children, name)
+ err = parent.file.unlinkAt(ctx, name, flags)
+ if err != nil {
+ if child != nil {
+ vfsObj.AbortDeleteDentry(&child.vfsd)
+ }
+ return err
+ }
}
if child != nil {
- child.setDeleted()
vfsObj.CommitDeleteDentry(&child.vfsd)
+ child.setDeleted()
+ if child.isSynthetic() {
+ parent.syntheticChildren--
+ child.decRefLocked()
+ }
ds = appendDentry(ds, child)
}
+ parent.cacheNegativeLookupLocked(name)
+ if parent.cachedMetadataAuthoritative() {
+ parent.dirents = nil
+ parent.touchCMtime()
+ if dir {
+ parent.decLinks()
+ }
+ }
return nil
}
@@ -554,7 +617,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
fs.renameMu.RLock()
defer fs.renameMuRUnlockAndCheckCaching(&ds)
start := rp.Start().Impl().(*dentry)
- if fs.opts.interop == InteropModeShared {
+ if !start.cachedMetadataAuthoritative() {
// Get updated metadata for start as required by
// fs.walkParentDirLocked().
if err := start.updateFromGetattr(ctx); err != nil {
@@ -577,20 +640,32 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
}
// 9P2000.L supports hard links, but we don't.
return syserror.EPERM
- })
+ }, nil)
}
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+ creds := rp.Credentials()
return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
- creds := rp.Credentials()
if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
- return err
+ if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
+ return err
+ }
+ ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err)
+ parent.createSyntheticDirectoryLocked(name, opts.Mode, creds.EffectiveKUID, creds.EffectiveKGID)
}
if fs.opts.interop != InteropModeShared {
parent.incLinks()
}
return nil
+ }, func(parent *dentry, name string) error {
+ if !opts.ForSyntheticMountpoint {
+ // Can't create non-synthetic files in synthetic directories.
+ return syserror.EPERM
+ }
+ parent.createSyntheticDirectoryLocked(name, opts.Mode, creds.EffectiveKUID, creds.EffectiveKGID)
+ parent.incLinks()
+ return nil
})
}
@@ -600,7 +675,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
creds := rp.Credentials()
_, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
return err
- })
+ }, nil)
}
// OpenAt implements vfs.FilesystemImpl.OpenAt.
@@ -620,7 +695,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
defer fs.renameMuRUnlockAndCheckCaching(&ds)
start := rp.Start().Impl().(*dentry)
- if fs.opts.interop == InteropModeShared {
+ if !start.cachedMetadataAuthoritative() {
// Get updated metadata for start as required by fs.stepLocked().
if err := start.updateFromGetattr(ctx); err != nil {
return nil, err
@@ -643,6 +718,10 @@ afterTrailingSymlink:
parent.dirMu.Lock()
child, err := fs.stepLocked(ctx, rp, parent, &ds)
if err == syserror.ENOENT && mayCreate {
+ if parent.isSynthetic() {
+ parent.dirMu.Unlock()
+ return nil, syserror.EPERM
+ }
fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts)
parent.dirMu.Unlock()
return fd, err
@@ -702,8 +781,10 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
if opts.Flags&linux.O_DIRECT != 0 {
return nil, syserror.EINVAL
}
- if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
- return nil, err
+ if !d.isSynthetic() {
+ if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
+ return nil, err
+ }
}
fd := &directoryFD{}
if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
@@ -733,6 +814,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
}
// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
+// !d.isSynthetic().
func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
return nil, err
@@ -811,7 +893,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
child.refs = 1
// Insert the dentry into the tree.
d.cacheNewChildLocked(child, name)
- if d.fs.opts.interop != InteropModeShared {
+ if d.cachedMetadataAuthoritative() {
d.touchCMtime()
d.dirents = nil
}
@@ -888,7 +970,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
defer mnt.EndWrite()
oldParent := oldParentVD.Dentry().Impl().(*dentry)
- if fs.opts.interop == InteropModeShared {
+ if !oldParent.cachedMetadataAuthoritative() {
if err := oldParent.updateFromGetattr(ctx); err != nil {
return err
}
@@ -933,35 +1015,22 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if newParent.isDeleted() {
return syserror.ENOENT
}
- replaced := newParent.children[newName]
- // This is similar to unlinkAt, except:
- //
- // - If a dentry exists for the file to be replaced, we revalidate it
- // unconditionally (instead of only if PrepareRenameDentry fails) for
- // simplicity.
- //
- // - If rp.MustBeDir(), then we need a dentry representing the replaced
- // file regardless to confirm that it's a directory.
- if replaced != nil || rp.MustBeDir() {
- replaced, err = fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds)
- if err != nil {
- return err
- }
- if replaced != nil {
- if replaced.isDir() {
- if !renamed.isDir() {
- return syserror.EISDIR
- }
- } else {
- if rp.MustBeDir() || renamed.isDir() {
- return syserror.ENOTDIR
- }
- }
- }
+ replaced, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds)
+ if err != nil {
+ return err
}
var replacedVFSD *vfs.Dentry
if replaced != nil {
replacedVFSD = &replaced.vfsd
+ if replaced.isDir() {
+ if !renamed.isDir() {
+ return syserror.EISDIR
+ }
+ } else {
+ if rp.MustBeDir() || renamed.isDir() {
+ return syserror.ENOTDIR
+ }
+ }
}
if oldParent == newParent && oldName == newName {
@@ -972,27 +1041,47 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
return err
}
- if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
- vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
- return err
+
+ // Update the remote filesystem.
+ if !renamed.isSynthetic() {
+ if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+ } else if replaced != nil && !replaced.isSynthetic() {
+ // We are replacing an existing real file with a synthetic one, so we
+ // need to unlink the former.
+ flags := uint32(0)
+ if replaced.isDir() {
+ flags = linux.AT_REMOVEDIR
+ }
+ if err := newParent.file.unlinkAt(ctx, newName, flags); err != nil {
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
}
- if fs.opts.interop != InteropModeShared {
- oldParent.cacheNegativeChildLocked(oldName)
- oldParent.dirents = nil
- newParent.dirents = nil
- if renamed.isDir() {
- oldParent.decLinks()
- newParent.incLinks()
+
+ // Update the dentry tree.
+ vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD)
+ if replaced != nil {
+ replaced.setDeleted()
+ if replaced.isSynthetic() {
+ newParent.syntheticChildren--
+ replaced.decRefLocked()
}
- oldParent.touchCMtime()
- newParent.touchCMtime()
- renamed.touchCtime()
- } else {
- delete(oldParent.children, oldName)
+ ds = appendDentry(ds, replaced)
}
+ oldParent.cacheNegativeLookupLocked(oldName)
+ // We don't use newParent.cacheNewChildLocked() since we don't want to mess
+ // with reference counts and queue oldParent for checkCachingLocked if the
+ // parent isn't actually changing.
if oldParent != newParent {
- appendDentry(ds, oldParent)
+ ds = appendDentry(ds, oldParent)
newParent.IncRef()
+ if renamed.isSynthetic() {
+ oldParent.syntheticChildren--
+ newParent.syntheticChildren++
+ }
}
renamed.parent = newParent
renamed.name = newName
@@ -1000,11 +1089,25 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
newParent.children = make(map[string]*dentry)
}
newParent.children[newName] = renamed
- if replaced != nil {
- replaced.setDeleted()
- appendDentry(ds, replaced)
+
+ // Update metadata.
+ if renamed.cachedMetadataAuthoritative() {
+ renamed.touchCtime()
+ }
+ if oldParent.cachedMetadataAuthoritative() {
+ oldParent.dirents = nil
+ oldParent.touchCMtime()
+ if renamed.isDir() {
+ oldParent.decLinks()
+ }
+ }
+ if newParent.cachedMetadataAuthoritative() {
+ newParent.dirents = nil
+ newParent.touchCMtime()
+ if renamed.isDir() {
+ newParent.incLinks()
+ }
}
- vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD)
return nil
}
@@ -1051,6 +1154,10 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
if err != nil {
return linux.Statfs{}, err
}
+ // If d is synthetic, invoke statfs on the first ancestor of d that isn't.
+ for d.isSynthetic() {
+ d = d.parent
+ }
fsstat, err := d.file.statFS(ctx)
if err != nil {
return linux.Statfs{}, err
@@ -1080,7 +1187,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
creds := rp.Credentials()
_, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
return err
- })
+ }, nil)
}
// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 293df2545..8b4e91d17 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -27,8 +27,9 @@
// dentry.handleMu
// dentry.dataMu
//
-// Locking dentry.dirMu in multiple dentries requires holding
-// filesystem.renameMu for writing.
+// Locking dentry.dirMu in multiple dentries requires that either ancestor
+// dentries are locked before descendant dentries, or that filesystem.renameMu
+// is locked for writing.
package gofer
import (
@@ -102,11 +103,12 @@ type filesystem struct {
cachedDentries dentryList
cachedDentriesLen uint64
- // dentries contains all dentries in this filesystem. specialFileFDs
- // contains all open specialFileFDs. These fields are protected by syncMu.
- syncMu sync.Mutex
- dentries map[*dentry]struct{}
- specialFileFDs map[*specialFileFD]struct{}
+ // syncableDentries contains all dentries in this filesystem for which
+ // !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs.
+ // These fields are protected by syncMu.
+ syncMu sync.Mutex
+ syncableDentries map[*dentry]struct{}
+ specialFileFDs map[*specialFileFD]struct{}
}
type filesystemOptions struct {
@@ -187,7 +189,8 @@ const (
// InteropModeShared is appropriate when there are users of the remote
// filesystem that may mutate its state other than the client.
//
- // - The client must verify cached filesystem state before using it.
+ // - The client must verify ("revalidate") cached filesystem state before
+ // using it.
//
// - Client changes to filesystem state must be sent to the remote
// filesystem synchronously.
@@ -376,14 +379,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
// Construct the filesystem object.
fs := &filesystem{
- mfp: mfp,
- opts: fsopts,
- uid: creds.EffectiveKUID,
- gid: creds.EffectiveKGID,
- client: client,
- clock: ktime.RealtimeClockFromContext(ctx),
- dentries: make(map[*dentry]struct{}),
- specialFileFDs: make(map[*specialFileFD]struct{}),
+ mfp: mfp,
+ opts: fsopts,
+ uid: creds.EffectiveKUID,
+ gid: creds.EffectiveKGID,
+ client: client,
+ clock: ktime.RealtimeClockFromContext(ctx),
+ syncableDentries: make(map[*dentry]struct{}),
+ specialFileFDs: make(map[*specialFileFD]struct{}),
}
fs.vfsfs.Init(vfsObj, &fstype, fs)
@@ -409,7 +412,7 @@ func (fs *filesystem) Release() {
mf := fs.mfp.MemoryFile()
fs.syncMu.Lock()
- for d := range fs.dentries {
+ for d := range fs.syncableDentries {
d.handleMu.Lock()
d.dataMu.Lock()
if d.handleWritable {
@@ -444,9 +447,11 @@ type dentry struct {
vfsd vfs.Dentry
// refs is the reference count. Each dentry holds a reference on its
- // parent, even if disowned. refs is accessed using atomic memory
- // operations. When refs reaches 0, the dentry may be added to the cache or
- // destroyed. If refs==-1 the dentry has already been destroyed.
+ // parent, even if disowned. An additional reference is held on all
+ // synthetic dentries until they are unlinked or invalidated. When refs
+ // reaches 0, the dentry may be added to the cache or destroyed. If refs ==
+ // -1, the dentry has already been destroyed. refs is accessed using atomic
+ // memory operations.
refs int64
// fs is the owning filesystem. fs is immutable.
@@ -465,6 +470,12 @@ type dentry struct {
// We don't support hard links, so each dentry maps 1:1 to an inode.
// file is the unopened p9.File that backs this dentry. file is immutable.
+ //
+ // If file.isNil(), this dentry represents a synthetic file, i.e. a file
+ // that does not exist on the remote filesystem. As of this writing, this
+ // is only possible for a directory created with
+ // MkdirOptions.ForSyntheticMountpoint == true.
+ // TODO(gvisor.dev/issue/1476): Support synthetic sockets (and pipes).
file p9file
// If deleted is non-zero, the file represented by this dentry has been
@@ -484,15 +495,21 @@ type dentry struct {
// - Mappings of child filenames to dentries representing those children.
//
// - Mappings of child filenames that are known not to exist to nil
- // dentries (only if InteropModeShared is not in effect).
+ // dentries (only if InteropModeShared is not in effect and the directory
+ // is not synthetic).
//
// children is protected by dirMu.
children map[string]*dentry
- // If this dentry represents a directory, InteropModeShared is not in
- // effect, and dirents is not nil, it is a cache of all entries in the
- // directory, in the order they were returned by the server. dirents is
- // protected by dirMu.
+ // If this dentry represents a directory, syntheticChildren is the number
+ // of child dentries for which dentry.isSynthetic() == true.
+ // syntheticChildren is protected by dirMu.
+ syntheticChildren int
+
+ // If this dentry represents a directory,
+ // dentry.cachedMetadataAuthoritative() == true, and dirents is not nil, it
+ // is a cache of all entries in the directory, in the order they were
+ // returned by the server. dirents is protected by dirMu.
dirents []vfs.Dirent
// Cached metadata; protected by metadataMu and accessed using atomic
@@ -589,6 +606,8 @@ func dentryAttrMask() p9.AttrMask {
// initially has no references, but is not cached; it is the caller's
// responsibility to set the dentry's reference count and/or call
// dentry.checkCachingLocked() as appropriate.
+//
+// Preconditions: !file.isNil().
func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) {
if !mask.Mode {
ctx.Warningf("can't create gofer.dentry without file type")
@@ -612,10 +631,10 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
},
}
d.pf.dentry = d
- if mask.UID {
+ if mask.UID && attr.UID != auth.NoID {
d.uid = uint32(attr.UID)
}
- if mask.GID {
+ if mask.GID && attr.GID != auth.NoID {
d.gid = uint32(attr.GID)
}
if mask.Size {
@@ -642,11 +661,19 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
d.vfsd.Init(d)
fs.syncMu.Lock()
- fs.dentries[d] = struct{}{}
+ fs.syncableDentries[d] = struct{}{}
fs.syncMu.Unlock()
return d, nil
}
+func (d *dentry) isSynthetic() bool {
+ return d.file.isNil()
+}
+
+func (d *dentry) cachedMetadataAuthoritative() bool {
+ return d.fs.opts.interop != InteropModeShared || d.isSynthetic()
+}
+
// updateFromP9Attrs is called to update d's metadata after an update from the
// remote filesystem.
func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
@@ -691,6 +718,7 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) {
d.metadataMu.Unlock()
}
+// Preconditions: !d.isSynthetic()
func (d *dentry) updateFromGetattr(ctx context.Context) error {
// Use d.handle.file, which represents a 9P fid that has been opened, in
// preference to d.file, which represents a 9P fid that has not. This may
@@ -758,7 +786,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
defer mnt.EndWrite()
setLocalAtime := false
setLocalMtime := false
- if d.fs.opts.interop != InteropModeShared {
+ if d.cachedMetadataAuthoritative() {
// Timestamp updates will be handled locally.
setLocalAtime = stat.Mask&linux.STATX_ATIME != 0
setLocalMtime = stat.Mask&linux.STATX_MTIME != 0
@@ -771,35 +799,37 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
}
d.metadataMu.Lock()
defer d.metadataMu.Unlock()
- if stat.Mask != 0 {
- if err := d.file.setAttr(ctx, p9.SetAttrMask{
- Permissions: stat.Mask&linux.STATX_MODE != 0,
- UID: stat.Mask&linux.STATX_UID != 0,
- GID: stat.Mask&linux.STATX_GID != 0,
- Size: stat.Mask&linux.STATX_SIZE != 0,
- ATime: stat.Mask&linux.STATX_ATIME != 0,
- MTime: stat.Mask&linux.STATX_MTIME != 0,
- ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW,
- MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW,
- }, p9.SetAttr{
- Permissions: p9.FileMode(stat.Mode),
- UID: p9.UID(stat.UID),
- GID: p9.GID(stat.GID),
- Size: stat.Size,
- ATimeSeconds: uint64(stat.Atime.Sec),
- ATimeNanoSeconds: uint64(stat.Atime.Nsec),
- MTimeSeconds: uint64(stat.Mtime.Sec),
- MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
- }); err != nil {
- return err
+ if !d.isSynthetic() {
+ if stat.Mask != 0 {
+ if err := d.file.setAttr(ctx, p9.SetAttrMask{
+ Permissions: stat.Mask&linux.STATX_MODE != 0,
+ UID: stat.Mask&linux.STATX_UID != 0,
+ GID: stat.Mask&linux.STATX_GID != 0,
+ Size: stat.Mask&linux.STATX_SIZE != 0,
+ ATime: stat.Mask&linux.STATX_ATIME != 0,
+ MTime: stat.Mask&linux.STATX_MTIME != 0,
+ ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW,
+ MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW,
+ }, p9.SetAttr{
+ Permissions: p9.FileMode(stat.Mode),
+ UID: p9.UID(stat.UID),
+ GID: p9.GID(stat.GID),
+ Size: stat.Size,
+ ATimeSeconds: uint64(stat.Atime.Sec),
+ ATimeNanoSeconds: uint64(stat.Atime.Nsec),
+ MTimeSeconds: uint64(stat.Mtime.Sec),
+ MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
+ }); err != nil {
+ return err
+ }
+ }
+ if d.fs.opts.interop == InteropModeShared {
+ // There's no point to updating d's metadata in this case since
+ // it'll be overwritten by revalidation before the next time it's
+ // used anyway. (InteropModeShared inhibits client caching of
+ // regular file data, so there's no cache to truncate either.)
+ return nil
}
- }
- if d.fs.opts.interop == InteropModeShared {
- // There's no point to updating d's metadata in this case since it'll
- // be overwritten by revalidation before the next time it's used
- // anyway. (InteropModeShared inhibits client caching of regular file
- // data, so there's no cache to truncate either.)
- return nil
}
now := d.fs.clock.Now().Nanoseconds()
if stat.Mask&linux.STATX_MODE != 0 {
@@ -897,6 +927,15 @@ func (d *dentry) DecRef() {
}
}
+// decRefLocked decrements d's reference count without calling
+// d.checkCachingLocked, even if d's reference count reaches 0; callers are
+// responsible for ensuring that d.checkCachingLocked will be called later.
+func (d *dentry) decRefLocked() {
+ if refs := atomic.AddInt64(&d.refs, -1); refs < 0 {
+ panic("gofer.dentry.decRefLocked() called without holding a reference")
+ }
+}
+
// checkCachingLocked should be called after d's reference count becomes 0 or it
// becomes disowned.
//
@@ -1013,11 +1052,11 @@ func (d *dentry) destroyLocked() {
if !d.file.isNil() {
d.file.close(ctx)
d.file = p9file{}
+ // Remove d from the set of syncable dentries.
+ d.fs.syncMu.Lock()
+ delete(d.fs.syncableDentries, d)
+ d.fs.syncMu.Unlock()
}
- // Remove d from the set of all dentries.
- d.fs.syncMu.Lock()
- delete(d.fs.dentries, d)
- d.fs.syncMu.Unlock()
// Drop the reference held by d on its parent without recursively locking
// d.fs.renameMu.
if d.parent != nil {
@@ -1040,6 +1079,9 @@ func (d *dentry) setDeleted() {
// We only support xattrs prefixed with "user." (see b/148380782). Currently,
// there is no need to expose any other xattrs through a gofer.
func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
+ if d.file.isNil() {
+ return nil, nil
+ }
xattrMap, err := d.file.listXattr(ctx, size)
if err != nil {
return nil, err
@@ -1054,6 +1096,9 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui
}
func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+ if d.file.isNil() {
+ return "", syserror.ENODATA
+ }
if err := d.checkPermissions(creds, vfs.MayRead); err != nil {
return "", err
}
@@ -1064,6 +1109,9 @@ func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vf
}
func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+ if d.file.isNil() {
+ return syserror.EPERM
+ }
if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
return err
}
@@ -1074,6 +1122,9 @@ func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vf
}
func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error {
+ if d.file.isNil() {
+ return syserror.EPERM
+ }
if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
return err
}
@@ -1083,7 +1134,7 @@ func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name
return d.file.removeXattr(ctx, name)
}
-// Preconditions: d.isRegularFile() || d.isDirectory().
+// Preconditions: !d.file.isNil(). d.isRegularFile() || d.isDirectory().
func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
// O_TRUNC unconditionally requires us to obtain a new handle (opened with
// O_TRUNC).
@@ -1213,7 +1264,7 @@ func (fd *fileDescription) dentry() *dentry {
func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
d := fd.dentry()
const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME)
- if d.fs.opts.interop == InteropModeShared && opts.Mask&(validMask) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
+ if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
// TODO(jamieliu): Use specialFileFD.handle.file for the getattr if
// available?
if err := d.updateFromGetattr(ctx); err != nil {
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
index 4041fb252..adff39490 100644
--- a/pkg/sentry/fsimpl/gofer/gofer_test.go
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -24,7 +24,7 @@ import (
func TestDestroyIdempotent(t *testing.T) {
fs := filesystem{
- dentries: make(map[*dentry]struct{}),
+ syncableDentries: make(map[*dentry]struct{}),
opts: filesystemOptions{
// Test relies on no dentry being held in the cache.
maxCachedDentries: 0,
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 74577bc2f..20e5bb072 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -443,8 +443,7 @@ type FilesystemImpl interface {
// Errors:
//
// - If extended attributes are not supported by the filesystem,
- // ListxattrAt returns nil. (See FileDescription.Listxattr for an
- // explanation.)
+ // ListxattrAt returns ENOTSUP.
//
// - If the size of the list (including a NUL terminating byte after every
// entry) would exceed size, ERANGE may be returned. Note that
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 534528ce6..022bac127 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -33,6 +33,25 @@ type GetDentryOptions struct {
type MkdirOptions struct {
// Mode is the file mode bits for the created directory.
Mode linux.FileMode
+
+ // If ForSyntheticMountpoint is true, FilesystemImpl.MkdirAt() may create
+ // the given directory in memory only (as opposed to persistent storage).
+ // The created directory should be able to support the creation of
+ // subdirectories with ForSyntheticMountpoint == true. It does not need to
+ // support the creation of subdirectories with ForSyntheticMountpoint ==
+ // false, or files of other types.
+ //
+ // FilesystemImpls are permitted to ignore the ForSyntheticMountpoint
+ // option.
+ //
+ // The ForSyntheticMountpoint option exists because, unlike mount(2), the
+ // OCI Runtime Specification permits the specification of mount points that
+ // do not exist, under the expectation that container runtimes will create
+ // them. (More accurately, the OCI Runtime Specification completely fails
+ // to document this feature, but it's implemented by runc.)
+ // ForSyntheticMountpoint allows such mount points to be created even when
+ // the underlying persistent filesystem is immutable.
+ ForSyntheticMountpoint bool
}
// MknodOptions contains options to VirtualFilesystem.MknodAt() and
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index 82083c57d..bce3a3593 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -251,6 +251,12 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
// All writes go to upper, be paranoid and make lower readonly.
opts.ReadOnly = useOverlay
+ if err := c.k.VFS().MkdirAt(ctx, creds, target, &vfs.MkdirOptions{
+ ForSyntheticMountpoint: true,
+ }); err != nil && err != syserror.EEXIST {
+ // Log a warning, but attempt the mount anyway.
+ log.Warningf("Failed to create mount point at %q: %v", submount.Destination, err)
+ }
if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil {
return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
}