summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorJamie Liu <jamieliu@google.com>2021-02-24 12:59:08 -0800
committergVisor bot <gvisor-bot@google.com>2021-02-24 13:00:59 -0800
commit303c913c5e0e6f0bac766970d9ed19c08aabb980 (patch)
treed5df28fd5b511141e23afa92af6dcd4efc29887e
parentfcd4ff4fca31e3c594b30aff6b008e7af4c7c1a5 (diff)
Use mapped device number + topmost inode number for all files in VFS2 overlay.
Before this CL, VFS2's overlayfs uses a single private device number and an autoincrementing generated inode number for directories; this is consistent with Linux's overlayfs in the non-samefs non-xino case. However, this breaks some applications more consistently than on Linux due to more aggressive caching of Linux overlayfs dentries. Switch from using mapped device numbers + the topmost layer's inode number for just non-copied-up non-directory files, to doing so for all files. This still allows directory dev/ino numbers to change across copy-up, but otherwise keeps them consistent. Fixes #5545: ``` $ docker run --runtime=runsc-vfs2-overlay --rm ubuntu:focal bash -c "mkdir -p 1/2/3/4/5/6/7/8 && rm -rf 1 && echo done" done ``` PiperOrigin-RevId: 359350716
-rw-r--r--pkg/sentry/fsimpl/overlay/filesystem.go24
-rw-r--r--pkg/sentry/fsimpl/overlay/overlay.go91
2 files changed, 44 insertions, 71 deletions
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index 890b01cdb..f7f795b10 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -313,22 +313,16 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
return nil, topLookupLayer, syserror.ENOENT
}
- // Device and inode numbers were copied from the topmost layer above;
- // override them if necessary.
- if child.isDir() {
- child.devMajor = linux.UNNAMED_MAJOR
- child.devMinor = fs.dirDevMinor
- child.ino = fs.newDirIno()
- } else if !child.upperVD.Ok() {
- childDevMinor, err := fs.getLowerDevMinor(child.devMajor, child.devMinor)
- if err != nil {
- ctx.Infof("overlay.filesystem.lookupLocked: failed to map lower layer device number (%d, %d) to an overlay-specific device number: %v", child.devMajor, child.devMinor, err)
- child.destroyLocked(ctx)
- return nil, topLookupLayer, err
- }
- child.devMajor = linux.UNNAMED_MAJOR
- child.devMinor = childDevMinor
+ // Device and inode numbers were copied from the topmost layer above. Remap
+ // the device number to an appropriate overlay-private one.
+ childDevMinor, err := fs.getPrivateDevMinor(child.devMajor, child.devMinor)
+ if err != nil {
+ ctx.Infof("overlay.filesystem.lookupLocked: failed to map layer device number (%d, %d) to an overlay-specific device number: %v", child.devMajor, child.devMinor, err)
+ child.destroyLocked(ctx)
+ return nil, topLookupLayer, err
}
+ child.devMajor = linux.UNNAMED_MAJOR
+ child.devMinor = childDevMinor
parent.IncRef()
child.parent = parent
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index acd3684c6..58680bc80 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -97,28 +97,29 @@ type filesystem struct {
// used for accesses to the filesystem's layers. creds is immutable.
creds *auth.Credentials
- // dirDevMinor is the device minor number used for directories. dirDevMinor
- // is immutable.
- dirDevMinor uint32
-
- // lowerDevMinors maps device numbers from lower layer filesystems to
- // device minor numbers assigned to non-directory files originating from
- // that filesystem. (This remapping is necessary for lower layers because a
- // file on a lower layer, and that same file on an overlay, are
- // distinguishable because they will diverge after copy-up; this isn't true
- // for non-directory files already on the upper layer.) lowerDevMinors is
- // protected by devMu.
- devMu sync.Mutex `state:"nosave"`
- lowerDevMinors map[layerDevNumber]uint32
+ // privateDevMinors maps device numbers from layer filesystems to device
+ // minor numbers assigned to files originating from that filesystem.
+ //
+ // For non-directory files, this remapping is necessary for lower layers
+ // because a file on a lower layer, and that same file on an overlay, are
+ // distinguishable because they will diverge after copy-up. (Once a
+ // non-directory file has been copied up, its contents on the upper layer
+ // completely determine its contents in the overlay, so this is no longer
+ // true; but we still do the mapping for consistency.)
+ //
+ // For directories, this remapping may be necessary even if the directory
+ // exists on the upper layer due to directory merging; rather than make the
+ // mapping conditional on whether the directory is opaque, we again
+ // unconditionally apply the mapping unconditionally.
+ //
+ // privateDevMinors is protected by devMu.
+ devMu sync.Mutex `state:"nosave"`
+ privateDevMinors map[layerDevNumber]uint32
// renameMu synchronizes renaming with non-renaming operations in order to
// ensure consistent lock ordering between dentry.dirMu in different
// dentries.
renameMu sync.RWMutex `state:"nosave"`
-
- // lastDirIno is the last inode number assigned to a directory. lastDirIno
- // is accessed using atomic memory operations.
- lastDirIno uint64
}
// +stateify savable
@@ -232,12 +233,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
return nil, nil, syserror.EINVAL
}
- // Allocate dirDevMinor. lowerDevMinors are allocated dynamically.
- dirDevMinor, err := vfsObj.GetAnonBlockDevMinor()
- if err != nil {
- return nil, nil, err
- }
-
// Take extra references held by the filesystem.
if fsopts.UpperRoot.Ok() {
fsopts.UpperRoot.IncRef()
@@ -247,10 +242,9 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
fs := &filesystem{
- opts: fsopts,
- creds: creds.Fork(),
- dirDevMinor: dirDevMinor,
- lowerDevMinors: make(map[layerDevNumber]uint32),
+ opts: fsopts,
+ creds: creds.Fork(),
+ privateDevMinors: make(map[layerDevNumber]uint32),
}
fs.vfsfs.Init(vfsObj, &fstype, fs)
@@ -294,26 +288,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
root.mode = uint32(rootStat.Mode)
root.uid = rootStat.UID
root.gid = rootStat.GID
- if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR {
- root.devMajor = linux.UNNAMED_MAJOR
- root.devMinor = fs.dirDevMinor
- root.ino = fs.newDirIno()
- } else if !root.upperVD.Ok() {
- root.devMajor = linux.UNNAMED_MAJOR
- rootDevMinor, err := fs.getLowerDevMinor(rootStat.DevMajor, rootStat.DevMinor)
- if err != nil {
- ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err)
- root.destroyLocked(ctx)
- fs.vfsfs.DecRef(ctx)
- return nil, nil, err
- }
- root.devMinor = rootDevMinor
- root.ino = rootStat.Ino
- } else {
- root.devMajor = rootStat.DevMajor
- root.devMinor = rootStat.DevMinor
- root.ino = rootStat.Ino
+ root.devMajor = linux.UNNAMED_MAJOR
+ rootDevMinor, err := fs.getPrivateDevMinor(rootStat.DevMajor, rootStat.DevMinor)
+ if err != nil {
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err)
+ root.destroyLocked(ctx)
+ fs.vfsfs.DecRef(ctx)
+ return nil, nil, err
}
+ root.devMinor = rootDevMinor
+ root.ino = rootStat.Ino
return &fs.vfsfs, &root.vfsd, nil
}
@@ -344,9 +328,8 @@ func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forc
// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
vfsObj := fs.vfsfs.VirtualFilesystem()
- vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor)
- for _, lowerDevMinor := range fs.lowerDevMinors {
- vfsObj.PutAnonBlockDevMinor(lowerDevMinor)
+ for _, devMinor := range fs.privateDevMinors {
+ vfsObj.PutAnonBlockDevMinor(devMinor)
}
if fs.opts.UpperRoot.Ok() {
fs.opts.UpperRoot.DecRef(ctx)
@@ -376,22 +359,18 @@ func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) {
return fsstat, nil
}
-func (fs *filesystem) newDirIno() uint64 {
- return atomic.AddUint64(&fs.lastDirIno, 1)
-}
-
-func (fs *filesystem) getLowerDevMinor(layerMajor, layerMinor uint32) (uint32, error) {
+func (fs *filesystem) getPrivateDevMinor(layerMajor, layerMinor uint32) (uint32, error) {
fs.devMu.Lock()
defer fs.devMu.Unlock()
orig := layerDevNumber{layerMajor, layerMinor}
- if minor, ok := fs.lowerDevMinors[orig]; ok {
+ if minor, ok := fs.privateDevMinors[orig]; ok {
return minor, nil
}
minor, err := fs.vfsfs.VirtualFilesystem().GetAnonBlockDevMinor()
if err != nil {
return 0, err
}
- fs.lowerDevMinors[orig] = minor
+ fs.privateDevMinors[orig] = minor
return minor, nil
}