summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry
diff options
context:
space:
mode:
authorFabricio Voznika <fvoznika@google.com>2021-07-12 18:27:59 -0700
committergVisor bot <gvisor-bot@google.com>2021-07-12 18:30:46 -0700
commit520795aaad701854e9ffe84de1108954cf2b77f8 (patch)
tree9f2aa17263053f5fd3edd7614679341202129c08 /pkg/sentry
parent275932bf0852431b6f307ba9c45f26073d20ac62 (diff)
Fix deadlock in procfs
Kernfs provides an internal mechanism to defer calls to `DecRef()` because on the last reference `Filesystem.mu` must be held and most places that need to call `DecRef()` are inside the lock. The same can be true for filesystems that extend kernfs. procfs needs to look up files and `DecRef()` them inside the `kernfs.Filesystem.mu`. If the files happen to be procfs files, it can deadlock trying to decrement if it's the last reference. This change extends the mechanism to external callers to defer DecRefs to `vfs.FileDescription` and `vfs.VirtualDentries`. PiperOrigin-RevId: 384361647
Diffstat (limited to 'pkg/sentry')
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs.go54
-rw-r--r--pkg/sentry/fsimpl/proc/task.go4
-rw-r--r--pkg/sentry/fsimpl/proc/task_fds.go24
-rw-r--r--pkg/sentry/fsimpl/proc/task_files.go27
-rw-r--r--pkg/sentry/kernel/abstract_socket_namespace.go8
5 files changed, 78 insertions, 39 deletions
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 6f699c9cd..0e2867d49 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -52,7 +52,7 @@
// vfs.VirtualFilesystem.mountMu
// vfs.Dentry.mu
// (inode implementation locks, if any)
-// kernfs.Filesystem.droppedDentriesMu
+// kernfs.Filesystem.deferredDecRefsMu
package kernfs
import (
@@ -76,12 +76,12 @@ import (
type Filesystem struct {
vfsfs vfs.Filesystem
- droppedDentriesMu sync.Mutex `state:"nosave"`
+ deferredDecRefsMu sync.Mutex `state:"nosave"`
- // droppedDentries is a list of dentries waiting to be DecRef()ed. This is
+ // deferredDecRefs is a list of dentries waiting to be DecRef()ed. This is
// used to defer dentry destruction until mu can be acquired for
- // writing. Protected by droppedDentriesMu.
- droppedDentries []*Dentry
+ // writing. Protected by deferredDecRefsMu.
+ deferredDecRefs []refsvfs2.RefCounter
// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
// for reading guarantees continued existence of any resolved dentries, but
@@ -131,25 +131,49 @@ type Filesystem struct {
// deferDecRef defers dropping a dentry ref until the next call to
// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
// This may be called while Filesystem.mu or Dentry.dirMu is locked.
-func (fs *Filesystem) deferDecRef(d *Dentry) {
- fs.droppedDentriesMu.Lock()
- fs.droppedDentries = append(fs.droppedDentries, d)
- fs.droppedDentriesMu.Unlock()
+func (fs *Filesystem) deferDecRef(d refsvfs2.RefCounter) {
+ fs.deferredDecRefsMu.Lock()
+ fs.deferredDecRefs = append(fs.deferredDecRefs, d)
+ fs.deferredDecRefsMu.Unlock()
+}
+
+// SafeDecRefFD safely DecRef the FileDescription making sure DecRef is deferred
+// in case Filesystem.mu is held. See comment on Filesystem.mu.
+func (fs *Filesystem) SafeDecRefFD(ctx context.Context, fd *vfs.FileDescription) {
+ if d, ok := fd.Dentry().Impl().(*Dentry); ok && d.fs == fs {
+ // Only defer if dentry belongs to this filesystem, since locks cannot cross
+ // filesystems.
+ fs.deferDecRef(fd)
+ return
+ }
+ fd.DecRef(ctx)
+}
+
+// SafeDecRef safely DecRef the virtual dentry making sure DecRef is deferred
+// in case Filesystem.mu is held. See comment on Filesystem.mu.
+func (fs *Filesystem) SafeDecRef(ctx context.Context, vd vfs.VirtualDentry) {
+ if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs {
+ // Only defer if dentry belongs to this filesystem, since locks cannot cross
+ // filesystems.
+ fs.deferDecRef(&vd)
+ return
+ }
+ vd.DecRef(ctx)
}
// processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the
-// droppedDentries list. See comment on Filesystem.mu.
+// deferredDecRefs list. See comment on Filesystem.mu.
//
// Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked.
func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) {
- fs.droppedDentriesMu.Lock()
- for _, d := range fs.droppedDentries {
- // Defer the DecRef call so that we are not holding droppedDentriesMu
+ fs.deferredDecRefsMu.Lock()
+ for _, d := range fs.deferredDecRefs {
+ // Defer the DecRef call so that we are not holding deferredDecRefsMu
// when DecRef is called.
defer d.DecRef(ctx)
}
- fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse.
- fs.droppedDentriesMu.Unlock()
+ fs.deferredDecRefs = fs.deferredDecRefs[:0] // Keep slice memory for reuse.
+ fs.deferredDecRefsMu.Unlock()
}
// VFSFilesystem returns the generic vfs filesystem object.
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 2717e1359..cbbc0935a 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -65,8 +65,8 @@ func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns
"io": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0400, newIO(task, isThreadGroup)),
"maps": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mapsData{task: task}),
"mem": fs.newMemInode(ctx, task, fs.NextIno(), 0400),
- "mountinfo": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountInfoData{task: task}),
- "mounts": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountsData{task: task}),
+ "mountinfo": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountInfoData{fs: fs, task: task}),
+ "mounts": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountsData{fs: fs, task: task}),
"net": fs.newTaskNetDir(ctx, task),
"ns": fs.newTaskOwnedDir(ctx, task, fs.NextIno(), 0511, map[string]kernfs.Inode{
"net": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), "net"),
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 4718fac7a..dfc0a924e 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -42,12 +42,12 @@ func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags)
return file, flags
}
-func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool {
+func taskFDExists(ctx context.Context, fs *filesystem, t *kernel.Task, fd int32) bool {
file, _ := getTaskFD(t, fd)
if file == nil {
return false
}
- file.DecRef(ctx)
+ fs.SafeDecRefFD(ctx, file)
return true
}
@@ -145,7 +145,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, err
return nil, syserror.ENOENT
}
fd := int32(fdInt)
- if !taskFDExists(ctx, i.task, fd) {
+ if !taskFDExists(ctx, i.fs, i.task, fd) {
return nil, syserror.ENOENT
}
return i.fs.newFDSymlink(ctx, i.task, fd, i.fs.NextIno()), nil
@@ -198,6 +198,7 @@ type fdSymlink struct {
kernfs.InodeNoopRefCount
kernfs.InodeSymlink
+ fs *filesystem
task *kernel.Task
fd int32
}
@@ -206,6 +207,7 @@ var _ kernfs.Inode = (*fdSymlink)(nil)
func (fs *filesystem) newFDSymlink(ctx context.Context, task *kernel.Task, fd int32, ino uint64) kernfs.Inode {
inode := &fdSymlink{
+ fs: fs,
task: task,
fd: fd,
}
@@ -218,9 +220,9 @@ func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error)
if file == nil {
return "", syserror.ENOENT
}
- defer file.DecRef(ctx)
+ defer s.fs.SafeDecRefFD(ctx, file)
root := vfs.RootFromContext(ctx)
- defer root.DecRef(ctx)
+ defer s.fs.SafeDecRef(ctx, root)
// Note: it's safe to reenter kernfs from Readlink if needed to resolve path.
return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry())
@@ -231,7 +233,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen
if file == nil {
return vfs.VirtualDentry{}, "", syserror.ENOENT
}
- defer file.DecRef(ctx)
+ defer s.fs.SafeDecRefFD(ctx, file)
vd := file.VirtualDentry()
vd.IncRef()
return vd, "", nil
@@ -239,7 +241,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen
// Valid implements kernfs.Inode.Valid.
func (s *fdSymlink) Valid(ctx context.Context) bool {
- return taskFDExists(ctx, s.task, s.fd)
+ return taskFDExists(ctx, s.fs, s.task, s.fd)
}
// fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory.
@@ -279,10 +281,11 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode,
return nil, syserror.ENOENT
}
fd := int32(fdInt)
- if !taskFDExists(ctx, i.task, fd) {
+ if !taskFDExists(ctx, i.fs, i.task, fd) {
return nil, syserror.ENOENT
}
data := &fdInfoData{
+ fs: i.fs,
task: i.task,
fd: fd,
}
@@ -316,6 +319,7 @@ func (i *fdInfoDirInode) DecRef(ctx context.Context) {
type fdInfoData struct {
kernfs.DynamicBytesFile
+ fs *filesystem
task *kernel.Task
fd int32
}
@@ -328,7 +332,7 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
if file == nil {
return syserror.ENOENT
}
- defer file.DecRef(ctx)
+ defer d.fs.SafeDecRefFD(ctx, file)
// TODO(b/121266871): Include pos, locks, and other data. For now we only
// have flags.
// See https://www.kernel.org/doc/Documentation/filesystems/proc.txt
@@ -339,5 +343,5 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// Valid implements kernfs.Inode.Valid.
func (d *fdInfoData) Valid(ctx context.Context) bool {
- return taskFDExists(ctx, d.task, d.fd)
+ return taskFDExists(ctx, d.fs, d.task, d.fd)
}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 5526cac1e..5bb6bc372 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -803,13 +803,17 @@ type exeSymlink struct {
kernfs.InodeNoopRefCount
kernfs.InodeSymlink
+ fs *filesystem
task *kernel.Task
}
var _ kernfs.Inode = (*exeSymlink)(nil)
func (fs *filesystem) newExeSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode {
- inode := &exeSymlink{task: task}
+ inode := &exeSymlink{
+ fs: fs,
+ task: task,
+ }
inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
return inode
}
@@ -820,14 +824,14 @@ func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error)
if err != nil {
return "", err
}
- defer exec.DecRef(ctx)
+ defer s.fs.SafeDecRef(ctx, exec)
root := vfs.RootFromContext(ctx)
if !root.Ok() {
// It could have raced with process deletion.
return "", linuxerr.ESRCH
}
- defer root.DecRef(ctx)
+ defer s.fs.SafeDecRef(ctx, root)
vfsObj := exec.Mount().Filesystem().VirtualFilesystem()
name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec)
@@ -879,13 +883,17 @@ type cwdSymlink struct {
kernfs.InodeNoopRefCount
kernfs.InodeSymlink
+ fs *filesystem
task *kernel.Task
}
var _ kernfs.Inode = (*cwdSymlink)(nil)
func (fs *filesystem) newCwdSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode {
- inode := &cwdSymlink{task: task}
+ inode := &cwdSymlink{
+ fs: fs,
+ task: task,
+ }
inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
return inode
}
@@ -896,14 +904,14 @@ func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error)
if err != nil {
return "", err
}
- defer cwd.DecRef(ctx)
+ defer s.fs.SafeDecRef(ctx, cwd)
root := vfs.RootFromContext(ctx)
if !root.Ok() {
// It could have raced with process deletion.
return "", linuxerr.ESRCH
}
- defer root.DecRef(ctx)
+ defer s.fs.SafeDecRef(ctx, root)
vfsObj := cwd.Mount().Filesystem().VirtualFilesystem()
name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd)
@@ -923,6 +931,7 @@ func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent
// It could have raced with process deletion.
return vfs.VirtualDentry{}, "", linuxerr.ESRCH
}
+ // The reference is transferred to the caller.
return cwd, "", nil
}
@@ -932,6 +941,7 @@ func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent
type mountInfoData struct {
kernfs.DynamicBytesFile
+ fs *filesystem
task *kernel.Task
}
@@ -952,7 +962,7 @@ func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// Root has been destroyed. Don't try to read mounts.
return nil
}
- defer rootDir.DecRef(ctx)
+ defer i.fs.SafeDecRef(ctx, rootDir)
i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf)
return nil
}
@@ -963,6 +973,7 @@ func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
type mountsData struct {
kernfs.DynamicBytesFile
+ fs *filesystem
task *kernel.Task
}
@@ -983,7 +994,7 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
// Root has been destroyed. Don't try to read mounts.
return nil
}
- defer rootDir.DecRef(ctx)
+ defer i.fs.SafeDecRef(ctx, rootDir)
i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf)
return nil
}
diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go
index d100e58d7..5d86a04f3 100644
--- a/pkg/sentry/kernel/abstract_socket_namespace.go
+++ b/pkg/sentry/kernel/abstract_socket_namespace.go
@@ -27,7 +27,7 @@ import (
// +stateify savable
type abstractEndpoint struct {
ep transport.BoundEndpoint
- socket refsvfs2.RefCounter
+ socket refsvfs2.TryRefCounter
name string
ns *AbstractSocketNamespace
}
@@ -57,7 +57,7 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace {
// its backing socket.
type boundEndpoint struct {
transport.BoundEndpoint
- socket refsvfs2.RefCounter
+ socket refsvfs2.TryRefCounter
}
// Release implements transport.BoundEndpoint.Release.
@@ -89,7 +89,7 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp
//
// When the last reference managed by socket is dropped, ep may be removed from the
// namespace.
-func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.RefCounter) error {
+func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.TryRefCounter) error {
a.mu.Lock()
defer a.mu.Unlock()
@@ -109,7 +109,7 @@ func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep tran
// Remove removes the specified socket at name from the abstract socket
// namespace, if it has not yet been replaced.
-func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.RefCounter) {
+func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.TryRefCounter) {
a.mu.Lock()
defer a.mu.Unlock()