From 303c913c5e0e6f0bac766970d9ed19c08aabb980 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 24 Feb 2021 12:59:08 -0800 Subject: Use mapped device number + topmost inode number for all files in VFS2 overlay. Before this CL, VFS2's overlayfs uses a single private device number and an autoincrementing generated inode number for directories; this is consistent with Linux's overlayfs in the non-samefs non-xino case. However, this breaks some applications more consistently than on Linux due to more aggressive caching of Linux overlayfs dentries. Switch from using mapped device numbers + the topmost layer's inode number for just non-copied-up non-directory files, to doing so for all files. This still allows directory dev/ino numbers to change across copy-up, but otherwise keeps them consistent. Fixes #5545: ``` $ docker run --runtime=runsc-vfs2-overlay --rm ubuntu:focal bash -c "mkdir -p 1/2/3/4/5/6/7/8 && rm -rf 1 && echo done" done ``` PiperOrigin-RevId: 359350716 --- pkg/sentry/fsimpl/overlay/filesystem.go | 24 ++++----- pkg/sentry/fsimpl/overlay/overlay.go | 91 +++++++++++++-------------------- 2 files changed, 44 insertions(+), 71 deletions(-) diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index 890b01cdb..f7f795b10 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -313,22 +313,16 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str return nil, topLookupLayer, syserror.ENOENT } - // Device and inode numbers were copied from the topmost layer above; - // override them if necessary. - if child.isDir() { - child.devMajor = linux.UNNAMED_MAJOR - child.devMinor = fs.dirDevMinor - child.ino = fs.newDirIno() - } else if !child.upperVD.Ok() { - childDevMinor, err := fs.getLowerDevMinor(child.devMajor, child.devMinor) - if err != nil { - ctx.Infof("overlay.filesystem.lookupLocked: failed to map lower layer device number (%d, %d) to an overlay-specific device number: %v", child.devMajor, child.devMinor, err) - child.destroyLocked(ctx) - return nil, topLookupLayer, err - } - child.devMajor = linux.UNNAMED_MAJOR - child.devMinor = childDevMinor + // Device and inode numbers were copied from the topmost layer above. Remap + // the device number to an appropriate overlay-private one. + childDevMinor, err := fs.getPrivateDevMinor(child.devMajor, child.devMinor) + if err != nil { + ctx.Infof("overlay.filesystem.lookupLocked: failed to map layer device number (%d, %d) to an overlay-specific device number: %v", child.devMajor, child.devMinor, err) + child.destroyLocked(ctx) + return nil, topLookupLayer, err } + child.devMajor = linux.UNNAMED_MAJOR + child.devMinor = childDevMinor parent.IncRef() child.parent = parent diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index acd3684c6..58680bc80 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -97,28 +97,29 @@ type filesystem struct { // used for accesses to the filesystem's layers. creds is immutable. creds *auth.Credentials - // dirDevMinor is the device minor number used for directories. dirDevMinor - // is immutable. - dirDevMinor uint32 - - // lowerDevMinors maps device numbers from lower layer filesystems to - // device minor numbers assigned to non-directory files originating from - // that filesystem. (This remapping is necessary for lower layers because a - // file on a lower layer, and that same file on an overlay, are - // distinguishable because they will diverge after copy-up; this isn't true - // for non-directory files already on the upper layer.) lowerDevMinors is - // protected by devMu. - devMu sync.Mutex `state:"nosave"` - lowerDevMinors map[layerDevNumber]uint32 + // privateDevMinors maps device numbers from layer filesystems to device + // minor numbers assigned to files originating from that filesystem. + // + // For non-directory files, this remapping is necessary for lower layers + // because a file on a lower layer, and that same file on an overlay, are + // distinguishable because they will diverge after copy-up. (Once a + // non-directory file has been copied up, its contents on the upper layer + // completely determine its contents in the overlay, so this is no longer + // true; but we still do the mapping for consistency.) + // + // For directories, this remapping may be necessary even if the directory + // exists on the upper layer due to directory merging; rather than make the + // mapping conditional on whether the directory is opaque, we again + // unconditionally apply the mapping unconditionally. + // + // privateDevMinors is protected by devMu. + devMu sync.Mutex `state:"nosave"` + privateDevMinors map[layerDevNumber]uint32 // renameMu synchronizes renaming with non-renaming operations in order to // ensure consistent lock ordering between dentry.dirMu in different // dentries. renameMu sync.RWMutex `state:"nosave"` - - // lastDirIno is the last inode number assigned to a directory. lastDirIno - // is accessed using atomic memory operations. - lastDirIno uint64 } // +stateify savable @@ -232,12 +233,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, syserror.EINVAL } - // Allocate dirDevMinor. lowerDevMinors are allocated dynamically. - dirDevMinor, err := vfsObj.GetAnonBlockDevMinor() - if err != nil { - return nil, nil, err - } - // Take extra references held by the filesystem. if fsopts.UpperRoot.Ok() { fsopts.UpperRoot.IncRef() @@ -247,10 +242,9 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } fs := &filesystem{ - opts: fsopts, - creds: creds.Fork(), - dirDevMinor: dirDevMinor, - lowerDevMinors: make(map[layerDevNumber]uint32), + opts: fsopts, + creds: creds.Fork(), + privateDevMinors: make(map[layerDevNumber]uint32), } fs.vfsfs.Init(vfsObj, &fstype, fs) @@ -294,26 +288,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt root.mode = uint32(rootStat.Mode) root.uid = rootStat.UID root.gid = rootStat.GID - if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR { - root.devMajor = linux.UNNAMED_MAJOR - root.devMinor = fs.dirDevMinor - root.ino = fs.newDirIno() - } else if !root.upperVD.Ok() { - root.devMajor = linux.UNNAMED_MAJOR - rootDevMinor, err := fs.getLowerDevMinor(rootStat.DevMajor, rootStat.DevMinor) - if err != nil { - ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err) - root.destroyLocked(ctx) - fs.vfsfs.DecRef(ctx) - return nil, nil, err - } - root.devMinor = rootDevMinor - root.ino = rootStat.Ino - } else { - root.devMajor = rootStat.DevMajor - root.devMinor = rootStat.DevMinor - root.ino = rootStat.Ino + root.devMajor = linux.UNNAMED_MAJOR + rootDevMinor, err := fs.getPrivateDevMinor(rootStat.DevMajor, rootStat.DevMinor) + if err != nil { + ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err) + root.destroyLocked(ctx) + fs.vfsfs.DecRef(ctx) + return nil, nil, err } + root.devMinor = rootDevMinor + root.ino = rootStat.Ino return &fs.vfsfs, &root.vfsd, nil } @@ -344,9 +328,8 @@ func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forc // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { vfsObj := fs.vfsfs.VirtualFilesystem() - vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor) - for _, lowerDevMinor := range fs.lowerDevMinors { - vfsObj.PutAnonBlockDevMinor(lowerDevMinor) + for _, devMinor := range fs.privateDevMinors { + vfsObj.PutAnonBlockDevMinor(devMinor) } if fs.opts.UpperRoot.Ok() { fs.opts.UpperRoot.DecRef(ctx) @@ -376,22 +359,18 @@ func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) { return fsstat, nil } -func (fs *filesystem) newDirIno() uint64 { - return atomic.AddUint64(&fs.lastDirIno, 1) -} - -func (fs *filesystem) getLowerDevMinor(layerMajor, layerMinor uint32) (uint32, error) { +func (fs *filesystem) getPrivateDevMinor(layerMajor, layerMinor uint32) (uint32, error) { fs.devMu.Lock() defer fs.devMu.Unlock() orig := layerDevNumber{layerMajor, layerMinor} - if minor, ok := fs.lowerDevMinors[orig]; ok { + if minor, ok := fs.privateDevMinors[orig]; ok { return minor, nil } minor, err := fs.vfsfs.VirtualFilesystem().GetAnonBlockDevMinor() if err != nil { return 0, err } - fs.lowerDevMinors[orig] = minor + fs.privateDevMinors[orig] = minor return minor, nil } -- cgit v1.2.3