diff options
author | Fabricio Voznika <fvoznika@google.com> | 2021-07-12 18:27:59 -0700 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2021-07-12 18:30:46 -0700 |
commit | 520795aaad701854e9ffe84de1108954cf2b77f8 (patch) | |
tree | 9f2aa17263053f5fd3edd7614679341202129c08 /pkg/sentry | |
parent | 275932bf0852431b6f307ba9c45f26073d20ac62 (diff) |
Fix deadlock in procfs
Kernfs provides an internal mechanism to defer calls to `DecRef()` because
on the last reference `Filesystem.mu` must be held and most places that
need to call `DecRef()` are inside the lock. The same can be true for
filesystems that extend kernfs. procfs needs to look up files and `DecRef()`
them inside the `kernfs.Filesystem.mu`. If the files happen to be procfs
files, it can deadlock trying to decrement if it's the last reference.
This change extends the mechanism to external callers to defer DecRefs
to `vfs.FileDescription` and `vfs.VirtualDentries`.
PiperOrigin-RevId: 384361647
Diffstat (limited to 'pkg/sentry')
-rw-r--r-- | pkg/sentry/fsimpl/kernfs/kernfs.go | 54 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/proc/task.go | 4 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/proc/task_fds.go | 24 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/proc/task_files.go | 27 | ||||
-rw-r--r-- | pkg/sentry/kernel/abstract_socket_namespace.go | 8 |
5 files changed, 78 insertions, 39 deletions
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 6f699c9cd..0e2867d49 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -52,7 +52,7 @@ // vfs.VirtualFilesystem.mountMu // vfs.Dentry.mu // (inode implementation locks, if any) -// kernfs.Filesystem.droppedDentriesMu +// kernfs.Filesystem.deferredDecRefsMu package kernfs import ( @@ -76,12 +76,12 @@ import ( type Filesystem struct { vfsfs vfs.Filesystem - droppedDentriesMu sync.Mutex `state:"nosave"` + deferredDecRefsMu sync.Mutex `state:"nosave"` - // droppedDentries is a list of dentries waiting to be DecRef()ed. This is + // deferredDecRefs is a list of dentries waiting to be DecRef()ed. This is // used to defer dentry destruction until mu can be acquired for - // writing. Protected by droppedDentriesMu. - droppedDentries []*Dentry + // writing. Protected by deferredDecRefsMu. + deferredDecRefs []refsvfs2.RefCounter // mu synchronizes the lifetime of Dentries on this filesystem. Holding it // for reading guarantees continued existence of any resolved dentries, but @@ -131,25 +131,49 @@ type Filesystem struct { // deferDecRef defers dropping a dentry ref until the next call to // processDeferredDecRefs{,Locked}. See comment on Filesystem.mu. // This may be called while Filesystem.mu or Dentry.dirMu is locked. -func (fs *Filesystem) deferDecRef(d *Dentry) { - fs.droppedDentriesMu.Lock() - fs.droppedDentries = append(fs.droppedDentries, d) - fs.droppedDentriesMu.Unlock() +func (fs *Filesystem) deferDecRef(d refsvfs2.RefCounter) { + fs.deferredDecRefsMu.Lock() + fs.deferredDecRefs = append(fs.deferredDecRefs, d) + fs.deferredDecRefsMu.Unlock() +} + +// SafeDecRefFD safely DecRef the FileDescription making sure DecRef is deferred +// in case Filesystem.mu is held. See comment on Filesystem.mu. +func (fs *Filesystem) SafeDecRefFD(ctx context.Context, fd *vfs.FileDescription) { + if d, ok := fd.Dentry().Impl().(*Dentry); ok && d.fs == fs { + // Only defer if dentry belongs to this filesystem, since locks cannot cross + // filesystems. + fs.deferDecRef(fd) + return + } + fd.DecRef(ctx) +} + +// SafeDecRef safely DecRef the virtual dentry making sure DecRef is deferred +// in case Filesystem.mu is held. See comment on Filesystem.mu. +func (fs *Filesystem) SafeDecRef(ctx context.Context, vd vfs.VirtualDentry) { + if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs { + // Only defer if dentry belongs to this filesystem, since locks cannot cross + // filesystems. + fs.deferDecRef(&vd) + return + } + vd.DecRef(ctx) } // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the -// droppedDentries list. See comment on Filesystem.mu. +// deferredDecRefs list. See comment on Filesystem.mu. // // Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked. func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) { - fs.droppedDentriesMu.Lock() - for _, d := range fs.droppedDentries { - // Defer the DecRef call so that we are not holding droppedDentriesMu + fs.deferredDecRefsMu.Lock() + for _, d := range fs.deferredDecRefs { + // Defer the DecRef call so that we are not holding deferredDecRefsMu // when DecRef is called. defer d.DecRef(ctx) } - fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse. - fs.droppedDentriesMu.Unlock() + fs.deferredDecRefs = fs.deferredDecRefs[:0] // Keep slice memory for reuse. + fs.deferredDecRefsMu.Unlock() } // VFSFilesystem returns the generic vfs filesystem object. diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 2717e1359..cbbc0935a 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -65,8 +65,8 @@ func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns "io": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), "maps": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mapsData{task: task}), "mem": fs.newMemInode(ctx, task, fs.NextIno(), 0400), - "mountinfo": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountInfoData{task: task}), - "mounts": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountsData{task: task}), + "mountinfo": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountInfoData{fs: fs, task: task}), + "mounts": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountsData{fs: fs, task: task}), "net": fs.newTaskNetDir(ctx, task), "ns": fs.newTaskOwnedDir(ctx, task, fs.NextIno(), 0511, map[string]kernfs.Inode{ "net": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), "net"), diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 4718fac7a..dfc0a924e 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -42,12 +42,12 @@ func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) return file, flags } -func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool { +func taskFDExists(ctx context.Context, fs *filesystem, t *kernel.Task, fd int32) bool { file, _ := getTaskFD(t, fd) if file == nil { return false } - file.DecRef(ctx) + fs.SafeDecRefFD(ctx, file) return true } @@ -145,7 +145,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, err return nil, syserror.ENOENT } fd := int32(fdInt) - if !taskFDExists(ctx, i.task, fd) { + if !taskFDExists(ctx, i.fs, i.task, fd) { return nil, syserror.ENOENT } return i.fs.newFDSymlink(ctx, i.task, fd, i.fs.NextIno()), nil @@ -198,6 +198,7 @@ type fdSymlink struct { kernfs.InodeNoopRefCount kernfs.InodeSymlink + fs *filesystem task *kernel.Task fd int32 } @@ -206,6 +207,7 @@ var _ kernfs.Inode = (*fdSymlink)(nil) func (fs *filesystem) newFDSymlink(ctx context.Context, task *kernel.Task, fd int32, ino uint64) kernfs.Inode { inode := &fdSymlink{ + fs: fs, task: task, fd: fd, } @@ -218,9 +220,9 @@ func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) if file == nil { return "", syserror.ENOENT } - defer file.DecRef(ctx) + defer s.fs.SafeDecRefFD(ctx, file) root := vfs.RootFromContext(ctx) - defer root.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, root) // Note: it's safe to reenter kernfs from Readlink if needed to resolve path. return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry()) @@ -231,7 +233,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen if file == nil { return vfs.VirtualDentry{}, "", syserror.ENOENT } - defer file.DecRef(ctx) + defer s.fs.SafeDecRefFD(ctx, file) vd := file.VirtualDentry() vd.IncRef() return vd, "", nil @@ -239,7 +241,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen // Valid implements kernfs.Inode.Valid. func (s *fdSymlink) Valid(ctx context.Context) bool { - return taskFDExists(ctx, s.task, s.fd) + return taskFDExists(ctx, s.fs, s.task, s.fd) } // fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory. @@ -279,10 +281,11 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, return nil, syserror.ENOENT } fd := int32(fdInt) - if !taskFDExists(ctx, i.task, fd) { + if !taskFDExists(ctx, i.fs, i.task, fd) { return nil, syserror.ENOENT } data := &fdInfoData{ + fs: i.fs, task: i.task, fd: fd, } @@ -316,6 +319,7 @@ func (i *fdInfoDirInode) DecRef(ctx context.Context) { type fdInfoData struct { kernfs.DynamicBytesFile + fs *filesystem task *kernel.Task fd int32 } @@ -328,7 +332,7 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { if file == nil { return syserror.ENOENT } - defer file.DecRef(ctx) + defer d.fs.SafeDecRefFD(ctx, file) // TODO(b/121266871): Include pos, locks, and other data. For now we only // have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt @@ -339,5 +343,5 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Valid implements kernfs.Inode.Valid. func (d *fdInfoData) Valid(ctx context.Context) bool { - return taskFDExists(ctx, d.task, d.fd) + return taskFDExists(ctx, d.fs, d.task, d.fd) } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 5526cac1e..5bb6bc372 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -803,13 +803,17 @@ type exeSymlink struct { kernfs.InodeNoopRefCount kernfs.InodeSymlink + fs *filesystem task *kernel.Task } var _ kernfs.Inode = (*exeSymlink)(nil) func (fs *filesystem) newExeSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { - inode := &exeSymlink{task: task} + inode := &exeSymlink{ + fs: fs, + task: task, + } inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) return inode } @@ -820,14 +824,14 @@ func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) if err != nil { return "", err } - defer exec.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, exec) root := vfs.RootFromContext(ctx) if !root.Ok() { // It could have raced with process deletion. return "", linuxerr.ESRCH } - defer root.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, root) vfsObj := exec.Mount().Filesystem().VirtualFilesystem() name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec) @@ -879,13 +883,17 @@ type cwdSymlink struct { kernfs.InodeNoopRefCount kernfs.InodeSymlink + fs *filesystem task *kernel.Task } var _ kernfs.Inode = (*cwdSymlink)(nil) func (fs *filesystem) newCwdSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { - inode := &cwdSymlink{task: task} + inode := &cwdSymlink{ + fs: fs, + task: task, + } inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) return inode } @@ -896,14 +904,14 @@ func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) if err != nil { return "", err } - defer cwd.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, cwd) root := vfs.RootFromContext(ctx) if !root.Ok() { // It could have raced with process deletion. return "", linuxerr.ESRCH } - defer root.DecRef(ctx) + defer s.fs.SafeDecRef(ctx, root) vfsObj := cwd.Mount().Filesystem().VirtualFilesystem() name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd) @@ -923,6 +931,7 @@ func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent // It could have raced with process deletion. return vfs.VirtualDentry{}, "", linuxerr.ESRCH } + // The reference is transferred to the caller. return cwd, "", nil } @@ -932,6 +941,7 @@ func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent type mountInfoData struct { kernfs.DynamicBytesFile + fs *filesystem task *kernel.Task } @@ -952,7 +962,7 @@ func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Root has been destroyed. Don't try to read mounts. return nil } - defer rootDir.DecRef(ctx) + defer i.fs.SafeDecRef(ctx, rootDir) i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) return nil } @@ -963,6 +973,7 @@ func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { type mountsData struct { kernfs.DynamicBytesFile + fs *filesystem task *kernel.Task } @@ -983,7 +994,7 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Root has been destroyed. Don't try to read mounts. return nil } - defer rootDir.DecRef(ctx) + defer i.fs.SafeDecRef(ctx, rootDir) i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) return nil } diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index d100e58d7..5d86a04f3 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -27,7 +27,7 @@ import ( // +stateify savable type abstractEndpoint struct { ep transport.BoundEndpoint - socket refsvfs2.RefCounter + socket refsvfs2.TryRefCounter name string ns *AbstractSocketNamespace } @@ -57,7 +57,7 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace { // its backing socket. type boundEndpoint struct { transport.BoundEndpoint - socket refsvfs2.RefCounter + socket refsvfs2.TryRefCounter } // Release implements transport.BoundEndpoint.Release. @@ -89,7 +89,7 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp // // When the last reference managed by socket is dropped, ep may be removed from the // namespace. -func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.RefCounter) error { +func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.TryRefCounter) error { a.mu.Lock() defer a.mu.Unlock() @@ -109,7 +109,7 @@ func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep tran // Remove removes the specified socket at name from the abstract socket // namespace, if it has not yet been replaced. -func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.RefCounter) { +func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.TryRefCounter) { a.mu.Lock() defer a.mu.Unlock() |