diff options
Diffstat (limited to 'pkg/sentry')
59 files changed, 3365 insertions, 357 deletions
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 3f3a099bd..0eaff9087 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -83,6 +83,7 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds } root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555) root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + root.EnableLeakCheck() root.dentry.Init(root) // Construct the pts master inode and dentry. Linux always uses inode @@ -110,6 +111,7 @@ func (fs *filesystem) Release(ctx context.Context) { // rootInode is the root directory inode for the devpts mounts. type rootInode struct { + rootInodeRefs kernfs.AlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren @@ -233,3 +235,8 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, } return offset, nil } + +// DecRef implements kernfs.Inode. +func (i *rootInode) DecRef(context.Context) { + i.rootInodeRefs.DecRef(i.Destroy) +} diff --git a/pkg/sentry/fsimpl/devpts/devpts_state_autogen.go b/pkg/sentry/fsimpl/devpts/devpts_state_autogen.go index 5942f7bac..adc184d1b 100644 --- a/pkg/sentry/fsimpl/devpts/devpts_state_autogen.go +++ b/pkg/sentry/fsimpl/devpts/devpts_state_autogen.go @@ -120,6 +120,29 @@ func (x *queue) StateLoad(m state.Source) { m.Load(4, &x.transformer) } +func (x *rootInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/devpts.rootInodeRefs" +} + +func (x *rootInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *rootInodeRefs) beforeSave() {} + +func (x *rootInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *rootInodeRefs) afterLoad() {} + +func (x *rootInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *Terminal) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.Terminal" } @@ -157,5 +180,6 @@ func init() { state.Register((*outputQueueTransformer)(nil)) state.Register((*inputQueueTransformer)(nil)) state.Register((*queue)(nil)) + state.Register((*rootInodeRefs)(nil)) state.Register((*Terminal)(nil)) } diff --git a/pkg/sentry/fsimpl/devpts/root_inode_refs.go b/pkg/sentry/fsimpl/devpts/root_inode_refs.go new file mode 100644 index 000000000..051801202 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/root_inode_refs.go @@ -0,0 +1,118 @@ +package devpts + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var rootInodeownerType *rootInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type rootInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *rootInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, rootInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *rootInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*rootInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *rootInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *rootInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, rootInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *rootInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *rootInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, rootInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go b/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go index e4ce04322..f72fe342e 100644 --- a/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go +++ b/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go @@ -99,6 +99,29 @@ func (x *futureResponse) StateLoad(m state.Source) { m.Load(3, &x.data) } +func (x *inodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.inodeRefs" +} + +func (x *inodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *inodeRefs) beforeSave() {} + +func (x *inodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *inodeRefs) afterLoad() {} + +func (x *inodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *requestList) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.requestList" } @@ -155,6 +178,7 @@ func init() { state.Register((*Request)(nil)) state.Register((*Response)(nil)) state.Register((*futureResponse)(nil)) + state.Register((*inodeRefs)(nil)) state.Register((*requestList)(nil)) state.Register((*requestEntry)(nil)) } diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 44021ee4b..9717c0e15 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -198,6 +198,7 @@ func (fs *filesystem) Release(ctx context.Context) { // inode implements kernfs.Inode. type inode struct { + inodeRefs kernfs.InodeAttrs kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink @@ -213,6 +214,7 @@ func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *ke i := &inode{} i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + i.EnableLeakCheck() i.dentry.Init(i) return &i.dentry @@ -324,3 +326,8 @@ func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptio return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil } + +// DecRef implements kernfs.Inode. +func (i *inode) DecRef(context.Context) { + i.inodeRefs.DecRef(i.Destroy) +} diff --git a/pkg/sentry/fsimpl/fuse/inode_refs.go b/pkg/sentry/fsimpl/fuse/inode_refs.go new file mode 100644 index 000000000..6b9456e1d --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/inode_refs.go @@ -0,0 +1,118 @@ +package fuse + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var inodeownerType *inode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type inodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *inodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, inodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *inodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*inodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *inodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *inodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, inodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *inodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *inodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, inodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/host/connected_endpoint_refs.go b/pkg/sentry/fsimpl/host/connected_endpoint_refs.go index 3b7bf599e..babb3f664 100644 --- a/pkg/sentry/fsimpl/host/connected_endpoint_refs.go +++ b/pkg/sentry/fsimpl/host/connected_endpoint_refs.go @@ -1,6 +1,7 @@ package host import ( + "fmt" "runtime" "sync/atomic" @@ -18,6 +19,11 @@ var ConnectedEndpointownerType *ConnectedEndpoint // Note that the number of references is actually refCount + 1 so that a default // zero-value Refs object contains one reference. // +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// // +stateify savable type ConnectedEndpointRefs struct { // refCount is composed of two fields: @@ -62,7 +68,7 @@ func (r *ConnectedEndpointRefs) ReadRefs() int64 { //go:nosplit func (r *ConnectedEndpointRefs) IncRef() { if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { - panic("Incrementing non-positive ref count") + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, ConnectedEndpointownerType)) } } @@ -101,7 +107,7 @@ func (r *ConnectedEndpointRefs) TryIncRef() bool { func (r *ConnectedEndpointRefs) DecRef(destroy func()) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: - panic("Decrementing non-positive ref count") + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, ConnectedEndpointownerType)) case v == -1: diff --git a/pkg/sentry/fsimpl/host/inode_refs.go b/pkg/sentry/fsimpl/host/inode_refs.go index 55c0fb3a9..17f90ce4a 100644 --- a/pkg/sentry/fsimpl/host/inode_refs.go +++ b/pkg/sentry/fsimpl/host/inode_refs.go @@ -1,6 +1,7 @@ package host import ( + "fmt" "runtime" "sync/atomic" @@ -18,6 +19,11 @@ var inodeownerType *inode // Note that the number of references is actually refCount + 1 so that a default // zero-value Refs object contains one reference. // +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// // +stateify savable type inodeRefs struct { // refCount is composed of two fields: @@ -62,7 +68,7 @@ func (r *inodeRefs) ReadRefs() int64 { //go:nosplit func (r *inodeRefs) IncRef() { if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { - panic("Incrementing non-positive ref count") + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, inodeownerType)) } } @@ -101,7 +107,7 @@ func (r *inodeRefs) TryIncRef() bool { func (r *inodeRefs) DecRef(destroy func()) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: - panic("Decrementing non-positive ref count") + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, inodeownerType)) case v == -1: diff --git a/pkg/sentry/fsimpl/kernfs/dentry_refs.go b/pkg/sentry/fsimpl/kernfs/dentry_refs.go new file mode 100644 index 000000000..79863b3bc --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/dentry_refs.go @@ -0,0 +1,118 @@ +package kernfs + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var DentryownerType *Dentry + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type DentryRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *DentryRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, DentryownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *DentryRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*DentryRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *DentryRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *DentryRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, DentryownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *DentryRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *DentryRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, DentryownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 885856868..f442a5606 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -344,8 +343,6 @@ type OrderedChildrenOptions struct { // // Must be initialize with Init before first use. type OrderedChildren struct { - refs.AtomicRefCount - // Can children be modified by user syscalls? It set to false, interface // methods that would modify the children return EPERM. Immutable. writable bool @@ -361,14 +358,14 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { o.set = make(map[string]*slot) } -// DecRef implements Inode.DecRef. -func (o *OrderedChildren) DecRef(ctx context.Context) { - o.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { - o.mu.Lock() - defer o.mu.Unlock() - o.order.Reset() - o.set = nil - }) +// Destroy clears the children stored in o. It should be called by structs +// embedding OrderedChildren upon destruction, i.e. when their reference count +// reaches zero. +func (o *OrderedChildren) Destroy() { + o.mu.Lock() + defer o.mu.Unlock() + o.order.Reset() + o.set = nil } // Populate inserts children into this OrderedChildren, and d's dentry @@ -549,6 +546,7 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D // // +stateify savable type StaticDirectory struct { + StaticDirectoryRefs InodeNotSymlink InodeDirectoryNoNewChildren InodeAttrs @@ -594,11 +592,16 @@ func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd return fd.VFSFileDescription(), nil } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } +// DecRef implements kernfs.Inode. +func (s *StaticDirectory) DecRef(context.Context) { + s.StaticDirectoryRefs.DecRef(s.Destroy) +} + // AlwaysValid partially implements kernfs.inodeDynamicLookup. type AlwaysValid struct{} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 51dbc050c..ca3685800 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -57,7 +57,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -161,9 +160,9 @@ const ( // // Must be initialized by Init prior to first use. type Dentry struct { - vfsd vfs.Dentry + DentryRefs - refs.AtomicRefCount + vfsd vfs.Dentry // flags caches useful information about the dentry from the inode. See the // dflags* consts above. Must be accessed by atomic ops. @@ -194,6 +193,7 @@ func (d *Dentry) Init(inode Inode) { if ftype == linux.ModeSymlink { d.flags |= dflagsIsSymlink } + d.EnableLeakCheck() } // VFSDentry returns the generic vfs dentry for this kernfs dentry. @@ -213,16 +213,14 @@ func (d *Dentry) isSymlink() bool { // DecRef implements vfs.DentryImpl.DecRef. func (d *Dentry) DecRef(ctx context.Context) { - d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy) -} - -// Precondition: Dentry must be removed from VFS' dentry cache. -func (d *Dentry) destroy(ctx context.Context) { - d.inode.DecRef(ctx) // IncRef from Init. - d.inode = nil - if d.parent != nil { - d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild. - } + // Before the destructor is called, Dentry must be removed from VFS' dentry cache. + d.DentryRefs.DecRef(func() { + d.inode.DecRef(ctx) // IncRef from Init. + d.inode = nil + if d.parent != nil { + d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild. + } + }) } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_state_autogen.go b/pkg/sentry/fsimpl/kernfs/kernfs_state_autogen.go index f396affb0..12aaf797f 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_state_autogen.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_state_autogen.go @@ -6,6 +6,29 @@ import ( "gvisor.dev/gvisor/pkg/state" ) +func (x *DentryRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/kernfs.DentryRefs" +} + +func (x *DentryRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *DentryRefs) beforeSave() {} + +func (x *DentryRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *DentryRefs) afterLoad() {} + +func (x *DentryRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *DynamicBytesFile) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.DynamicBytesFile" } @@ -85,6 +108,7 @@ func (x *StaticDirectory) StateTypeName() string { func (x *StaticDirectory) StateFields() []string { return []string{ + "StaticDirectoryRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeAttrs", @@ -99,25 +123,27 @@ func (x *StaticDirectory) beforeSave() {} func (x *StaticDirectory) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeAttrs) - m.Save(3, &x.InodeNoDynamicLookup) - m.Save(4, &x.OrderedChildren) - m.Save(5, &x.locks) - m.Save(6, &x.fdOpts) + m.Save(0, &x.StaticDirectoryRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeAttrs) + m.Save(4, &x.InodeNoDynamicLookup) + m.Save(5, &x.OrderedChildren) + m.Save(6, &x.locks) + m.Save(7, &x.fdOpts) } func (x *StaticDirectory) afterLoad() {} func (x *StaticDirectory) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeAttrs) - m.Load(3, &x.InodeNoDynamicLookup) - m.Load(4, &x.OrderedChildren) - m.Load(5, &x.locks) - m.Load(6, &x.fdOpts) + m.Load(0, &x.StaticDirectoryRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeAttrs) + m.Load(4, &x.InodeNoDynamicLookup) + m.Load(5, &x.OrderedChildren) + m.Load(6, &x.locks) + m.Load(7, &x.fdOpts) } func (x *slotList) StateTypeName() string { @@ -172,10 +198,35 @@ func (x *slotEntry) StateLoad(m state.Source) { m.Load(1, &x.prev) } +func (x *StaticDirectoryRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/kernfs.StaticDirectoryRefs" +} + +func (x *StaticDirectoryRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *StaticDirectoryRefs) beforeSave() {} + +func (x *StaticDirectoryRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *StaticDirectoryRefs) afterLoad() {} + +func (x *StaticDirectoryRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func init() { + state.Register((*DentryRefs)(nil)) state.Register((*DynamicBytesFile)(nil)) state.Register((*DynamicBytesFD)(nil)) state.Register((*StaticDirectory)(nil)) state.Register((*slotList)(nil)) state.Register((*slotEntry)(nil)) + state.Register((*StaticDirectoryRefs)(nil)) } diff --git a/pkg/sentry/fsimpl/kernfs/static_directory_refs.go b/pkg/sentry/fsimpl/kernfs/static_directory_refs.go new file mode 100644 index 000000000..478b04bdd --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/static_directory_refs.go @@ -0,0 +1,118 @@ +package kernfs + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var StaticDirectoryownerType *StaticDirectory + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type StaticDirectoryRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *StaticDirectoryRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, StaticDirectoryownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *StaticDirectoryRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*StaticDirectoryRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *StaticDirectoryRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *StaticDirectoryRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, StaticDirectoryownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *StaticDirectoryRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *StaticDirectoryRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, StaticDirectoryownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go b/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go new file mode 100644 index 000000000..9431c1506 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var fdDirInodeownerType *fdDirInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type fdDirInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *fdDirInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, fdDirInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *fdDirInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*fdDirInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *fdDirInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *fdDirInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, fdDirInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *fdDirInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *fdDirInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, fdDirInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go b/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go new file mode 100644 index 000000000..872b20eb0 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var fdInfoDirInodeownerType *fdInfoDirInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type fdInfoDirInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *fdInfoDirInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, fdInfoDirInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *fdInfoDirInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*fdInfoDirInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *fdInfoDirInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *fdInfoDirInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, fdInfoDirInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *fdInfoDirInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *fdInfoDirInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, fdInfoDirInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/proc/proc_state_autogen.go b/pkg/sentry/fsimpl/proc/proc_state_autogen.go index a7e43b7a7..907ef38e0 100644 --- a/pkg/sentry/fsimpl/proc/proc_state_autogen.go +++ b/pkg/sentry/fsimpl/proc/proc_state_autogen.go @@ -6,6 +6,52 @@ import ( "gvisor.dev/gvisor/pkg/state" ) +func (x *fdDirInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.fdDirInodeRefs" +} + +func (x *fdDirInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *fdDirInodeRefs) beforeSave() {} + +func (x *fdDirInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *fdDirInodeRefs) afterLoad() {} + +func (x *fdDirInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + +func (x *fdInfoDirInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.fdInfoDirInodeRefs" +} + +func (x *fdInfoDirInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *fdInfoDirInodeRefs) beforeSave() {} + +func (x *fdInfoDirInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *fdInfoDirInodeRefs) afterLoad() {} + +func (x *fdInfoDirInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/proc.FilesystemType" } @@ -31,6 +77,7 @@ func (x *subtasksInode) StateTypeName() string { func (x *subtasksInode) StateFields() []string { return []string{ + "subtasksInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeAttrs", @@ -48,31 +95,56 @@ func (x *subtasksInode) beforeSave() {} func (x *subtasksInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeAttrs) - m.Save(3, &x.OrderedChildren) - m.Save(4, &x.AlwaysValid) - m.Save(5, &x.locks) - m.Save(6, &x.fs) - m.Save(7, &x.task) - m.Save(8, &x.pidns) - m.Save(9, &x.cgroupControllers) + m.Save(0, &x.subtasksInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeAttrs) + m.Save(4, &x.OrderedChildren) + m.Save(5, &x.AlwaysValid) + m.Save(6, &x.locks) + m.Save(7, &x.fs) + m.Save(8, &x.task) + m.Save(9, &x.pidns) + m.Save(10, &x.cgroupControllers) } func (x *subtasksInode) afterLoad() {} func (x *subtasksInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeAttrs) - m.Load(3, &x.OrderedChildren) - m.Load(4, &x.AlwaysValid) - m.Load(5, &x.locks) - m.Load(6, &x.fs) - m.Load(7, &x.task) - m.Load(8, &x.pidns) - m.Load(9, &x.cgroupControllers) + m.Load(0, &x.subtasksInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeAttrs) + m.Load(4, &x.OrderedChildren) + m.Load(5, &x.AlwaysValid) + m.Load(6, &x.locks) + m.Load(7, &x.fs) + m.Load(8, &x.task) + m.Load(9, &x.pidns) + m.Load(10, &x.cgroupControllers) +} + +func (x *subtasksInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.subtasksInodeRefs" +} + +func (x *subtasksInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *subtasksInodeRefs) beforeSave() {} + +func (x *subtasksInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *subtasksInodeRefs) afterLoad() {} + +func (x *subtasksInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) } func (x *taskInode) StateTypeName() string { @@ -81,6 +153,7 @@ func (x *taskInode) StateTypeName() string { func (x *taskInode) StateFields() []string { return []string{ + "taskInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeNoDynamicLookup", @@ -95,25 +168,27 @@ func (x *taskInode) beforeSave() {} func (x *taskInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeNoDynamicLookup) - m.Save(3, &x.InodeAttrs) - m.Save(4, &x.OrderedChildren) - m.Save(5, &x.locks) - m.Save(6, &x.task) + m.Save(0, &x.taskInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeNoDynamicLookup) + m.Save(4, &x.InodeAttrs) + m.Save(5, &x.OrderedChildren) + m.Save(6, &x.locks) + m.Save(7, &x.task) } func (x *taskInode) afterLoad() {} func (x *taskInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeNoDynamicLookup) - m.Load(3, &x.InodeAttrs) - m.Load(4, &x.OrderedChildren) - m.Load(5, &x.locks) - m.Load(6, &x.task) + m.Load(0, &x.taskInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeNoDynamicLookup) + m.Load(4, &x.InodeAttrs) + m.Load(5, &x.OrderedChildren) + m.Load(6, &x.locks) + m.Load(7, &x.task) } func (x *fdDirInode) StateTypeName() string { @@ -122,6 +197,7 @@ func (x *fdDirInode) StateTypeName() string { func (x *fdDirInode) StateFields() []string { return []string{ + "fdDirInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeAttrs", @@ -135,23 +211,25 @@ func (x *fdDirInode) beforeSave() {} func (x *fdDirInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeAttrs) - m.Save(3, &x.OrderedChildren) - m.Save(4, &x.AlwaysValid) - m.Save(5, &x.fdDir) + m.Save(0, &x.fdDirInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeAttrs) + m.Save(4, &x.OrderedChildren) + m.Save(5, &x.AlwaysValid) + m.Save(6, &x.fdDir) } func (x *fdDirInode) afterLoad() {} func (x *fdDirInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeAttrs) - m.Load(3, &x.OrderedChildren) - m.Load(4, &x.AlwaysValid) - m.Load(5, &x.fdDir) + m.Load(0, &x.fdDirInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeAttrs) + m.Load(4, &x.OrderedChildren) + m.Load(5, &x.AlwaysValid) + m.Load(6, &x.fdDir) } func (x *fdSymlink) StateTypeName() string { @@ -195,6 +273,7 @@ func (x *fdInfoDirInode) StateTypeName() string { func (x *fdInfoDirInode) StateFields() []string { return []string{ + "fdInfoDirInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeAttrs", @@ -208,23 +287,25 @@ func (x *fdInfoDirInode) beforeSave() {} func (x *fdInfoDirInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeAttrs) - m.Save(3, &x.OrderedChildren) - m.Save(4, &x.AlwaysValid) - m.Save(5, &x.fdDir) + m.Save(0, &x.fdInfoDirInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeAttrs) + m.Save(4, &x.OrderedChildren) + m.Save(5, &x.AlwaysValid) + m.Save(6, &x.fdDir) } func (x *fdInfoDirInode) afterLoad() {} func (x *fdInfoDirInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeAttrs) - m.Load(3, &x.OrderedChildren) - m.Load(4, &x.AlwaysValid) - m.Load(5, &x.fdDir) + m.Load(0, &x.fdInfoDirInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeAttrs) + m.Load(4, &x.OrderedChildren) + m.Load(5, &x.AlwaysValid) + m.Load(6, &x.fdDir) } func (x *fdInfoData) StateTypeName() string { @@ -234,7 +315,6 @@ func (x *fdInfoData) StateTypeName() string { func (x *fdInfoData) StateFields() []string { return []string{ "DynamicBytesFile", - "AtomicRefCount", "task", "fd", } @@ -245,18 +325,16 @@ func (x *fdInfoData) beforeSave() {} func (x *fdInfoData) StateSave(m state.Sink) { x.beforeSave() m.Save(0, &x.DynamicBytesFile) - m.Save(1, &x.AtomicRefCount) - m.Save(2, &x.task) - m.Save(3, &x.fd) + m.Save(1, &x.task) + m.Save(2, &x.fd) } func (x *fdInfoData) afterLoad() {} func (x *fdInfoData) StateLoad(m state.Source) { m.Load(0, &x.DynamicBytesFile) - m.Load(1, &x.AtomicRefCount) - m.Load(2, &x.task) - m.Load(3, &x.fd) + m.Load(1, &x.task) + m.Load(2, &x.fd) } func (x *auxvData) StateTypeName() string { @@ -670,6 +748,29 @@ func (x *mountsData) StateLoad(m state.Source) { m.Load(1, &x.task) } +func (x *taskInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.taskInodeRefs" +} + +func (x *taskInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *taskInodeRefs) beforeSave() {} + +func (x *taskInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *taskInodeRefs) afterLoad() {} + +func (x *taskInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *ifinet6) StateTypeName() string { return "pkg/sentry/fsimpl/proc.ifinet6" } @@ -910,6 +1011,7 @@ func (x *tasksInode) StateTypeName() string { func (x *tasksInode) StateFields() []string { return []string{ + "tasksInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeAttrs", @@ -928,33 +1030,35 @@ func (x *tasksInode) beforeSave() {} func (x *tasksInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeAttrs) - m.Save(3, &x.OrderedChildren) - m.Save(4, &x.AlwaysValid) - m.Save(5, &x.locks) - m.Save(6, &x.fs) - m.Save(7, &x.pidns) - m.Save(8, &x.selfSymlink) - m.Save(9, &x.threadSelfSymlink) - m.Save(10, &x.cgroupControllers) + m.Save(0, &x.tasksInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeAttrs) + m.Save(4, &x.OrderedChildren) + m.Save(5, &x.AlwaysValid) + m.Save(6, &x.locks) + m.Save(7, &x.fs) + m.Save(8, &x.pidns) + m.Save(9, &x.selfSymlink) + m.Save(10, &x.threadSelfSymlink) + m.Save(11, &x.cgroupControllers) } func (x *tasksInode) afterLoad() {} func (x *tasksInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeAttrs) - m.Load(3, &x.OrderedChildren) - m.Load(4, &x.AlwaysValid) - m.Load(5, &x.locks) - m.Load(6, &x.fs) - m.Load(7, &x.pidns) - m.Load(8, &x.selfSymlink) - m.Load(9, &x.threadSelfSymlink) - m.Load(10, &x.cgroupControllers) + m.Load(0, &x.tasksInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeAttrs) + m.Load(4, &x.OrderedChildren) + m.Load(5, &x.AlwaysValid) + m.Load(6, &x.locks) + m.Load(7, &x.fs) + m.Load(8, &x.pidns) + m.Load(9, &x.selfSymlink) + m.Load(10, &x.threadSelfSymlink) + m.Load(11, &x.cgroupControllers) } func (x *statData) StateTypeName() string { @@ -1095,6 +1199,29 @@ func (x *filesystemsData) StateLoad(m state.Source) { m.Load(0, &x.DynamicBytesFile) } +func (x *tasksInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.tasksInodeRefs" +} + +func (x *tasksInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *tasksInodeRefs) beforeSave() {} + +func (x *tasksInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *tasksInodeRefs) afterLoad() {} + +func (x *tasksInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *mmapMinAddrData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.mmapMinAddrData" } @@ -1229,8 +1356,11 @@ func (x *tcpMemData) StateLoad(m state.Source) { } func init() { + state.Register((*fdDirInodeRefs)(nil)) + state.Register((*fdInfoDirInodeRefs)(nil)) state.Register((*FilesystemType)(nil)) state.Register((*subtasksInode)(nil)) + state.Register((*subtasksInodeRefs)(nil)) state.Register((*taskInode)(nil)) state.Register((*fdDirInode)(nil)) state.Register((*fdSymlink)(nil)) @@ -1251,6 +1381,7 @@ func init() { state.Register((*exeSymlink)(nil)) state.Register((*mountInfoData)(nil)) state.Register((*mountsData)(nil)) + state.Register((*taskInodeRefs)(nil)) state.Register((*ifinet6)(nil)) state.Register((*netDevData)(nil)) state.Register((*netUnixData)(nil)) @@ -1267,6 +1398,7 @@ func init() { state.Register((*uptimeData)(nil)) state.Register((*versionData)(nil)) state.Register((*filesystemsData)(nil)) + state.Register((*tasksInodeRefs)(nil)) state.Register((*mmapMinAddrData)(nil)) state.Register((*hostnameData)(nil)) state.Register((*tcpSackData)(nil)) diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index f25747da3..01c0efb3a 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -31,6 +31,7 @@ import ( // // +stateify savable type subtasksInode struct { + subtasksInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -57,6 +58,7 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, // Note: credentials are overridden by taskOwnedInode. subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + subInode.EnableLeakCheck() inode := &taskOwnedInode{Inode: subInode, owner: task} dentry := &kernfs.Dentry{} @@ -182,3 +184,8 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } + +// DecRef implements kernfs.Inode. +func (i *subtasksInode) DecRef(context.Context) { + i.subtasksInodeRefs.DecRef(i.Destroy) +} diff --git a/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go b/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go new file mode 100644 index 000000000..c6d9b3522 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var subtasksInodeownerType *subtasksInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type subtasksInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *subtasksInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, subtasksInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *subtasksInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*subtasksInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *subtasksInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *subtasksInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, subtasksInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *subtasksInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *subtasksInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, subtasksInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 109b31b4c..66b557abd 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -32,6 +32,7 @@ import ( // // +stateify savable type taskInode struct { + taskInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeNoDynamicLookup @@ -84,6 +85,7 @@ func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace taskInode := &taskInode{task: task} // Note: credentials are overridden by taskOwnedInode. taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + taskInode.EnableLeakCheck() inode := &taskOwnedInode{Inode: taskInode, owner: task} dentry := &kernfs.Dentry{} @@ -119,6 +121,11 @@ func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, v return syserror.EPERM } +// DecRef implements kernfs.Inode. +func (i *taskInode) DecRef(context.Context) { + i.taskInodeRefs.DecRef(i.Destroy) +} + // taskOwnedInode implements kernfs.Inode and overrides inode owner with task // effective user and group. type taskOwnedInode struct { @@ -147,6 +154,7 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux. dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) + dir.EnableLeakCheck() inode := &taskOwnedInode{Inode: dir, owner: task} d := &kernfs.Dentry{} diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index e8fcb9aa1..0527b2de8 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -101,6 +100,7 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off // // +stateify savable type fdDirInode struct { + fdDirInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -120,6 +120,7 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { }, } inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -175,6 +176,11 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia return err } +// DecRef implements kernfs.Inode. +func (i *fdDirInode) DecRef(context.Context) { + i.fdDirInodeRefs.DecRef(i.Destroy) +} + // fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file. // // +stateify savable @@ -227,6 +233,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen // // +stateify savable type fdInfoDirInode struct { + fdInfoDirInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -245,6 +252,7 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry { }, } inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -282,12 +290,16 @@ func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd * return fd.VFSFileDescription(), nil } +// DecRef implements kernfs.Inode. +func (i *fdInfoDirInode) DecRef(context.Context) { + i.fdInfoDirInodeRefs.DecRef(i.Destroy) +} + // fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd]. // // +stateify savable type fdInfoData struct { kernfs.DynamicBytesFile - refs.AtomicRefCount task *kernel.Task fd int32 diff --git a/pkg/sentry/fsimpl/proc/task_inode_refs.go b/pkg/sentry/fsimpl/proc/task_inode_refs.go new file mode 100644 index 000000000..714488450 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var taskInodeownerType *taskInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type taskInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *taskInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, taskInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *taskInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*taskInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *taskInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *taskInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, taskInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *taskInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *taskInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, taskInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index a4c884bf9..4e69782c7 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -262,7 +262,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { // For now, we always redact this pointer. fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d", (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. - s.Refs()-1, // RefCount, don't count our own ref. + s.ReadRefs()-1, // RefCount, don't count our own ref. 0, // Protocol, always 0 for UDS. sockFlags, // Flags. sops.Endpoint().Type(), // Type. @@ -430,7 +430,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, // Field: refcount. Don't count the ref we obtain while deferencing // the weakref to this socket. - fmt.Fprintf(buf, "%d ", s.Refs()-1) + fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. @@ -589,7 +589,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Field: ref; reference count on the socket inode. Don't count the ref // we obtain while deferencing the weakref to this socket. - fmt.Fprintf(buf, "%d ", s.Refs()-1) + fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 1391992b7..863c4467e 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -37,6 +37,7 @@ const ( // // +stateify savable type tasksInode struct { + tasksInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -84,6 +85,7 @@ func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace cgroupControllers: cgroupControllers, } inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -226,6 +228,11 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St return stat, nil } +// DecRef implements kernfs.Inode. +func (i *tasksInode) DecRef(context.Context) { + i.tasksInodeRefs.DecRef(i.Destroy) +} + // staticFileSetStat implements a special static file that allows inode // attributes to be set. This is to support /proc files that are readonly, but // allow attributes to be set. diff --git a/pkg/sentry/fsimpl/proc/tasks_inode_refs.go b/pkg/sentry/fsimpl/proc/tasks_inode_refs.go new file mode 100644 index 000000000..22d9cc488 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var tasksInodeownerType *tasksInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type tasksInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *tasksInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, tasksInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *tasksInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*tasksInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *tasksInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *tasksInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, tasksInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *tasksInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *tasksInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, tasksInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/sys/dir_refs.go b/pkg/sentry/fsimpl/sys/dir_refs.go new file mode 100644 index 000000000..89609b198 --- /dev/null +++ b/pkg/sentry/fsimpl/sys/dir_refs.go @@ -0,0 +1,118 @@ +package sys + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var dirownerType *dir + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type dirRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *dirRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, dirownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *dirRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*dirRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *dirRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *dirRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, dirownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *dirRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *dirRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, dirownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 1f042d9f7..ea30a4ec2 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -118,6 +118,7 @@ func (fs *filesystem) Release(ctx context.Context) { // dir implements kernfs.Inode. type dir struct { + dirRefs kernfs.InodeAttrs kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink @@ -133,6 +134,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte d := &dir{} d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + d.EnableLeakCheck() d.dentry.Init(d) d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents)) @@ -140,7 +142,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte return &d.dentry } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } @@ -156,6 +158,11 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, return fd.VFSFileDescription(), nil } +// DecRef implements kernfs.Inode.DecRef. +func (d *dir) DecRef(context.Context) { + d.dirRefs.DecRef(d.Destroy) +} + // cpuFile implements kernfs.Inode. type cpuFile struct { kernfs.DynamicBytesFile diff --git a/pkg/sentry/fsimpl/sys/sys_state_autogen.go b/pkg/sentry/fsimpl/sys/sys_state_autogen.go index 8866ddff9..347a46318 100644 --- a/pkg/sentry/fsimpl/sys/sys_state_autogen.go +++ b/pkg/sentry/fsimpl/sys/sys_state_autogen.go @@ -1,3 +1,34 @@ // automatically generated by stateify. package sys + +import ( + "gvisor.dev/gvisor/pkg/state" +) + +func (x *dirRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/sys.dirRefs" +} + +func (x *dirRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *dirRefs) beforeSave() {} + +func (x *dirRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *dirRefs) afterLoad() {} + +func (x *dirRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + +func init() { + state.Register((*dirRefs)(nil)) +} diff --git a/pkg/sentry/fsimpl/tmpfs/inode_refs.go b/pkg/sentry/fsimpl/tmpfs/inode_refs.go index 8b7ff185f..dbf0b2766 100644 --- a/pkg/sentry/fsimpl/tmpfs/inode_refs.go +++ b/pkg/sentry/fsimpl/tmpfs/inode_refs.go @@ -1,6 +1,7 @@ package tmpfs import ( + "fmt" "runtime" "sync/atomic" @@ -18,6 +19,11 @@ var inodeownerType *inode // Note that the number of references is actually refCount + 1 so that a default // zero-value Refs object contains one reference. // +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// // +stateify savable type inodeRefs struct { // refCount is composed of two fields: @@ -62,7 +68,7 @@ func (r *inodeRefs) ReadRefs() int64 { //go:nosplit func (r *inodeRefs) IncRef() { if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { - panic("Incrementing non-positive ref count") + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, inodeownerType)) } } @@ -101,7 +107,7 @@ func (r *inodeRefs) TryIncRef() bool { func (r *inodeRefs) DecRef(destroy func()) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: - panic("Decrementing non-positive ref count") + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, inodeownerType)) case v == -1: diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index ce53af69b..5773244ac 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" @@ -78,7 +77,8 @@ type descriptor struct { // // +stateify savable type FDTable struct { - refs.AtomicRefCount + FDTableRefs + k *Kernel // mu protects below. @@ -176,16 +176,15 @@ func (k *Kernel) NewFDTable() *FDTable { return f } -// destroy removes all of the file descriptors from the map. -func (f *FDTable) destroy(ctx context.Context) { - f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool { - return true - }) -} - -// DecRef implements RefCounter.DecRef with destructor f.destroy. +// DecRef implements RefCounter.DecRef. +// +// If f reaches zero references, all of its file descriptors are removed. func (f *FDTable) DecRef(ctx context.Context) { - f.DecRefWithDestructor(ctx, f.destroy) + f.FDTableRefs.DecRef(func() { + f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool { + return true + }) + }) } // Size returns the number of file descriptor slots currently allocated. diff --git a/pkg/sentry/kernel/fd_table_refs.go b/pkg/sentry/kernel/fd_table_refs.go new file mode 100644 index 000000000..ecba138ac --- /dev/null +++ b/pkg/sentry/kernel/fd_table_refs.go @@ -0,0 +1,118 @@ +package kernel + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var FDTableownerType *FDTable + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type FDTableRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *FDTableRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, FDTableownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *FDTableRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*FDTableRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *FDTableRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *FDTableRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, FDTableownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *FDTableRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *FDTableRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, FDTableownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index 7fd97dc53..6b8feb107 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -31,6 +31,8 @@ type descriptorTable struct { } // init initializes the table. +// +// TODO(gvisor.dev/1486): Enable leak check for FDTable. func (f *FDTable) init() { var slice []unsafe.Pointer // Empty slice. atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 8f2d36d5a..d46d1e1c1 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -18,7 +18,6 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -30,7 +29,7 @@ import ( // // +stateify savable type FSContext struct { - refs.AtomicRefCount + FSContextRefs // mu protects below. mu sync.Mutex `state:"nosave"` @@ -64,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { cwd: cwd, umask: umask, } - f.EnableLeakCheck("kernel.FSContext") + f.EnableLeakCheck() return &f } @@ -77,54 +76,56 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { cwdVFS2: cwd, umask: umask, } - f.EnableLeakCheck("kernel.FSContext") + f.EnableLeakCheck() return &f } -// destroy is the destructor for an FSContext. +// DecRef implements RefCounter.DecRef. // -// This will call DecRef on both root and cwd Dirents. If either call to -// DecRef returns an error, then it will be propagated. If both calls to -// DecRef return an error, then the one from root.DecRef will be propagated. +// When f reaches zero references, DecRef will be called on both root and cwd +// Dirents. // // Note that there may still be calls to WorkingDirectory() or RootDirectory() // (that return nil). This is because valid references may still be held via // proc files or other mechanisms. -func (f *FSContext) destroy(ctx context.Context) { - // Hold f.mu so that we don't race with RootDirectory() and - // WorkingDirectory(). - f.mu.Lock() - defer f.mu.Unlock() - - if VFS2Enabled { - f.rootVFS2.DecRef(ctx) - f.rootVFS2 = vfs.VirtualDentry{} - f.cwdVFS2.DecRef(ctx) - f.cwdVFS2 = vfs.VirtualDentry{} - } else { - f.root.DecRef(ctx) - f.root = nil - f.cwd.DecRef(ctx) - f.cwd = nil - } -} - -// DecRef implements RefCounter.DecRef with destructor f.destroy. func (f *FSContext) DecRef(ctx context.Context) { - f.DecRefWithDestructor(ctx, f.destroy) + f.FSContextRefs.DecRef(func() { + // Hold f.mu so that we don't race with RootDirectory() and + // WorkingDirectory(). + f.mu.Lock() + defer f.mu.Unlock() + + if VFS2Enabled { + f.rootVFS2.DecRef(ctx) + f.rootVFS2 = vfs.VirtualDentry{} + f.cwdVFS2.DecRef(ctx) + f.cwdVFS2 = vfs.VirtualDentry{} + } else { + f.root.DecRef(ctx) + f.root = nil + f.cwd.DecRef(ctx) + f.cwd = nil + } + }) } // Fork forks this FSContext. // -// This is not a valid call after destroy. +// This is not a valid call after f is destroyed. func (f *FSContext) Fork() *FSContext { f.mu.Lock() defer f.mu.Unlock() if VFS2Enabled { + if !f.cwdVFS2.Ok() { + panic("FSContext.Fork() called after destroy") + } f.cwdVFS2.IncRef() f.rootVFS2.IncRef() } else { + if f.cwd == nil { + panic("FSContext.Fork() called after destroy") + } f.cwd.IncRef() f.root.IncRef() } @@ -140,8 +141,8 @@ func (f *FSContext) Fork() *FSContext { // WorkingDirectory returns the current working directory. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) WorkingDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() @@ -152,8 +153,8 @@ func (f *FSContext) WorkingDirectory() *fs.Dirent { // WorkingDirectoryVFS2 returns the current working directory. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() @@ -165,7 +166,7 @@ func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { // SetWorkingDirectory sets the current working directory. // This will take an extra reference on the Dirent. // -// This is not a valid call after destroy. +// This is not a valid call after f is destroyed. func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetWorkingDirectory called with nil dirent") @@ -187,11 +188,15 @@ func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) { // SetWorkingDirectoryVFS2 sets the current working directory. // This will take an extra reference on the VirtualDentry. // -// This is not a valid call after destroy. +// This is not a valid call after f is destroyed. func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) { f.mu.Lock() defer f.mu.Unlock() + if !f.cwdVFS2.Ok() { + panic(fmt.Sprintf("FSContext.SetWorkingDirectoryVFS2(%v)) called after destroy", d)) + } + old := f.cwdVFS2 f.cwdVFS2 = d d.IncRef() @@ -200,8 +205,8 @@ func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDe // RootDirectory returns the current filesystem root. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) RootDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() @@ -213,8 +218,8 @@ func (f *FSContext) RootDirectory() *fs.Dirent { // RootDirectoryVFS2 returns the current filesystem root. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() @@ -226,7 +231,7 @@ func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { // SetRootDirectory sets the root directory. // This will take an extra reference on the Dirent. // -// This is not a valid call after free. +// This is not a valid call after f is destroyed. func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetRootDirectory called with nil dirent") @@ -247,7 +252,7 @@ func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) { // SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. // -// This is not a valid call after free. +// This is not a valid call after f is destroyed. func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) { if !vd.Ok() { panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") diff --git a/pkg/sentry/kernel/fs_context_refs.go b/pkg/sentry/kernel/fs_context_refs.go new file mode 100644 index 000000000..fb2fde971 --- /dev/null +++ b/pkg/sentry/kernel/fs_context_refs.go @@ -0,0 +1,118 @@ +package kernel + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var FSContextownerType *FSContext + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type FSContextRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *FSContextRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, FSContextownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *FSContextRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*FSContextRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *FSContextRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *FSContextRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, FSContextownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *FSContextRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *FSContextRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, FSContextownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go index e514b07d6..d0ff135d7 100644 --- a/pkg/sentry/kernel/kernel_state_autogen.go +++ b/pkg/sentry/kernel/kernel_state_autogen.go @@ -122,7 +122,7 @@ func (x *FDTable) StateTypeName() string { func (x *FDTable) StateFields() []string { return []string{ - "AtomicRefCount", + "FDTableRefs", "k", "next", "used", @@ -136,7 +136,7 @@ func (x *FDTable) StateSave(m state.Sink) { x.beforeSave() var descriptorTable map[int32]descriptor = x.saveDescriptorTable() m.SaveValue(4, descriptorTable) - m.Save(0, &x.AtomicRefCount) + m.Save(0, &x.FDTableRefs) m.Save(1, &x.k) m.Save(2, &x.next) m.Save(3, &x.used) @@ -145,20 +145,43 @@ func (x *FDTable) StateSave(m state.Sink) { func (x *FDTable) afterLoad() {} func (x *FDTable) StateLoad(m state.Source) { - m.Load(0, &x.AtomicRefCount) + m.Load(0, &x.FDTableRefs) m.Load(1, &x.k) m.Load(2, &x.next) m.Load(3, &x.used) m.LoadValue(4, new(map[int32]descriptor), func(y interface{}) { x.loadDescriptorTable(y.(map[int32]descriptor)) }) } +func (x *FDTableRefs) StateTypeName() string { + return "pkg/sentry/kernel.FDTableRefs" +} + +func (x *FDTableRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *FDTableRefs) beforeSave() {} + +func (x *FDTableRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *FDTableRefs) afterLoad() {} + +func (x *FDTableRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *FSContext) StateTypeName() string { return "pkg/sentry/kernel.FSContext" } func (x *FSContext) StateFields() []string { return []string{ - "AtomicRefCount", + "FSContextRefs", "root", "rootVFS2", "cwd", @@ -171,7 +194,7 @@ func (x *FSContext) beforeSave() {} func (x *FSContext) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.AtomicRefCount) + m.Save(0, &x.FSContextRefs) m.Save(1, &x.root) m.Save(2, &x.rootVFS2) m.Save(3, &x.cwd) @@ -182,7 +205,7 @@ func (x *FSContext) StateSave(m state.Sink) { func (x *FSContext) afterLoad() {} func (x *FSContext) StateLoad(m state.Source) { - m.Load(0, &x.AtomicRefCount) + m.Load(0, &x.FSContextRefs) m.Load(1, &x.root) m.Load(2, &x.rootVFS2) m.Load(3, &x.cwd) @@ -190,6 +213,29 @@ func (x *FSContext) StateLoad(m state.Source) { m.Load(5, &x.umask) } +func (x *FSContextRefs) StateTypeName() string { + return "pkg/sentry/kernel.FSContextRefs" +} + +func (x *FSContextRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *FSContextRefs) beforeSave() {} + +func (x *FSContextRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *FSContextRefs) afterLoad() {} + +func (x *FSContextRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *IPCNamespace) StateTypeName() string { return "pkg/sentry/kernel.IPCNamespace" } @@ -643,6 +689,29 @@ func (x *processGroupEntry) StateLoad(m state.Source) { m.Load(1, &x.prev) } +func (x *ProcessGroupRefs) StateTypeName() string { + return "pkg/sentry/kernel.ProcessGroupRefs" +} + +func (x *ProcessGroupRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *ProcessGroupRefs) beforeSave() {} + +func (x *ProcessGroupRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *ProcessGroupRefs) afterLoad() {} + +func (x *ProcessGroupRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *ptraceOptions) StateTypeName() string { return "pkg/sentry/kernel.ptraceOptions" } @@ -794,13 +863,36 @@ func (x *sessionEntry) StateLoad(m state.Source) { m.Load(1, &x.prev) } +func (x *SessionRefs) StateTypeName() string { + return "pkg/sentry/kernel.SessionRefs" +} + +func (x *SessionRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *SessionRefs) beforeSave() {} + +func (x *SessionRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *SessionRefs) afterLoad() {} + +func (x *SessionRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *Session) StateTypeName() string { return "pkg/sentry/kernel.Session" } func (x *Session) StateFields() []string { return []string{ - "refs", + "SessionRefs", "leader", "id", "foreground", @@ -813,7 +905,7 @@ func (x *Session) beforeSave() {} func (x *Session) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.refs) + m.Save(0, &x.SessionRefs) m.Save(1, &x.leader) m.Save(2, &x.id) m.Save(3, &x.foreground) @@ -824,7 +916,7 @@ func (x *Session) StateSave(m state.Sink) { func (x *Session) afterLoad() {} func (x *Session) StateLoad(m state.Source) { - m.Load(0, &x.refs) + m.Load(0, &x.SessionRefs) m.Load(1, &x.leader) m.Load(2, &x.id) m.Load(3, &x.foreground) @@ -2167,7 +2259,9 @@ func init() { state.Register((*FDFlags)(nil)) state.Register((*descriptor)(nil)) state.Register((*FDTable)(nil)) + state.Register((*FDTableRefs)(nil)) state.Register((*FSContext)(nil)) + state.Register((*FSContextRefs)(nil)) state.Register((*IPCNamespace)(nil)) state.Register((*Kernel)(nil)) state.Register((*SocketEntry)(nil)) @@ -2180,11 +2274,13 @@ func init() { state.Register((*IntervalTimer)(nil)) state.Register((*processGroupList)(nil)) state.Register((*processGroupEntry)(nil)) + state.Register((*ProcessGroupRefs)(nil)) state.Register((*ptraceOptions)(nil)) state.Register((*ptraceStop)(nil)) state.Register((*OldRSeqCriticalRegion)(nil)) state.Register((*sessionList)(nil)) state.Register((*sessionEntry)(nil)) + state.Register((*SessionRefs)(nil)) state.Register((*Session)(nil)) state.Register((*ProcessGroup)(nil)) state.Register((*SignalHandlers)(nil)) diff --git a/pkg/sentry/kernel/process_group_refs.go b/pkg/sentry/kernel/process_group_refs.go new file mode 100644 index 000000000..4ed6e6458 --- /dev/null +++ b/pkg/sentry/kernel/process_group_refs.go @@ -0,0 +1,118 @@ +package kernel + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var ProcessGroupownerType *ProcessGroup + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type ProcessGroupRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *ProcessGroupRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, ProcessGroupownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *ProcessGroupRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*ProcessGroupRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *ProcessGroupRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *ProcessGroupRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, ProcessGroupownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *ProcessGroupRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *ProcessGroupRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, ProcessGroupownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/kernel/session_refs.go b/pkg/sentry/kernel/session_refs.go new file mode 100644 index 000000000..f2e1bb797 --- /dev/null +++ b/pkg/sentry/kernel/session_refs.go @@ -0,0 +1,118 @@ +package kernel + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var SessionownerType *Session + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type SessionRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *SessionRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, SessionownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *SessionRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*SessionRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *SessionRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *SessionRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, SessionownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *SessionRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *SessionRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, SessionownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 5c4c622c2..df5c8421b 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -16,8 +16,6 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" ) @@ -32,7 +30,7 @@ type ProcessGroupID ThreadID // // +stateify savable type Session struct { - refs refs.AtomicRefCount + SessionRefs // leader is the originator of the Session. // @@ -62,16 +60,11 @@ type Session struct { sessionEntry } -// incRef grabs a reference. -func (s *Session) incRef() { - s.refs.IncRef() -} - -// decRef drops a reference. +// DecRef drops a reference. // // Precondition: callers must hold TaskSet.mu for writing. -func (s *Session) decRef() { - s.refs.DecRefWithDestructor(nil, func(context.Context) { +func (s *Session) DecRef() { + s.SessionRefs.DecRef(func() { // Remove translations from the leader. for ns := s.leader.pidns; ns != nil; ns = ns.parent { id := ns.sids[s] @@ -88,7 +81,7 @@ func (s *Session) decRef() { // // +stateify savable type ProcessGroup struct { - refs refs.AtomicRefCount // not exported. + refs ProcessGroupRefs // originator is the originator of the group. // @@ -163,7 +156,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { } alive := true - pg.refs.DecRefWithDestructor(nil, func(context.Context) { + pg.refs.DecRef(func() { alive = false // don't bother with handleOrphan. // Remove translations from the originator. @@ -175,7 +168,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { // Remove the list of process groups. pg.session.processGroups.Remove(pg) - pg.session.decRef() + pg.session.DecRef() }) if alive { pg.handleOrphan() @@ -302,7 +295,7 @@ func (tg *ThreadGroup) createSession() error { id: SessionID(id), leader: tg, } - s.refs.EnableLeakCheck("kernel.Session") + s.EnableLeakCheck() // Create a new ProcessGroup, belonging to that Session. // This also has a single reference (assigned below). @@ -316,7 +309,7 @@ func (tg *ThreadGroup) createSession() error { session: s, ancestors: 0, } - pg.refs.EnableLeakCheck("kernel.ProcessGroup") + pg.refs.EnableLeakCheck() // Tie them and return the result. s.processGroups.PushBack(pg) @@ -396,13 +389,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // // We manually adjust the ancestors if the parent is in the same // session. - tg.processGroup.session.incRef() + tg.processGroup.session.IncRef() pg := ProcessGroup{ id: ProcessGroupID(id), originator: tg, session: tg.processGroup.session, } - pg.refs.EnableLeakCheck("kernel.ProcessGroup") + pg.refs.EnableLeakCheck() if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { pg.ancestors++ diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 13ec7afe0..00c03585e 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -39,7 +39,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -252,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } - shm.EnableLeakCheck("kernel.Shm") + shm.EnableLeakCheck() // Find the next available ID. for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { @@ -337,14 +336,14 @@ func (r *Registry) remove(s *Shm) { // // +stateify savable type Shm struct { - // AtomicRefCount tracks the number of references to this segment. + // ShmRefs tracks the number of references to this segment. // // A segment holds a reference to itself until it is marked for // destruction. // // In addition to direct users, the MemoryManager will hold references // via MappingIdentity. - refs.AtomicRefCount + ShmRefs mfp pgalloc.MemoryFileProvider @@ -428,11 +427,14 @@ func (s *Shm) InodeID() uint64 { return uint64(s.ID) } -// DecRef overrides refs.RefCount.DecRef with a destructor. +// DecRef drops a reference on s. // // Precondition: Caller must not hold s.mu. func (s *Shm) DecRef(ctx context.Context) { - s.DecRefWithDestructor(ctx, s.destroy) + s.ShmRefs.DecRef(func() { + s.mfp.MemoryFile().DecRef(s.fr) + s.registry.remove(s) + }) } // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm @@ -642,11 +644,6 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { return nil } -func (s *Shm) destroy(context.Context) { - s.mfp.MemoryFile().DecRef(s.fr) - s.registry.remove(s) -} - // MarkDestroyed marks a segment for destruction. The segment is actually // destroyed once it has no references. MarkDestroyed may be called multiple // times, and is safe to call after a segment has already been destroyed. See diff --git a/pkg/sentry/kernel/shm/shm_refs.go b/pkg/sentry/kernel/shm/shm_refs.go new file mode 100644 index 000000000..51e07d0b3 --- /dev/null +++ b/pkg/sentry/kernel/shm/shm_refs.go @@ -0,0 +1,118 @@ +package shm + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var ShmownerType *Shm + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type ShmRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *ShmRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, ShmownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *ShmRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*ShmRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *ShmRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *ShmRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, ShmownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *ShmRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *ShmRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, ShmownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/kernel/shm/shm_state_autogen.go b/pkg/sentry/kernel/shm/shm_state_autogen.go index c98632087..fa80353b6 100644 --- a/pkg/sentry/kernel/shm/shm_state_autogen.go +++ b/pkg/sentry/kernel/shm/shm_state_autogen.go @@ -47,7 +47,7 @@ func (x *Shm) StateTypeName() string { func (x *Shm) StateFields() []string { return []string{ - "AtomicRefCount", + "ShmRefs", "mfp", "registry", "ID", @@ -71,7 +71,7 @@ func (x *Shm) beforeSave() {} func (x *Shm) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.AtomicRefCount) + m.Save(0, &x.ShmRefs) m.Save(1, &x.mfp) m.Save(2, &x.registry) m.Save(3, &x.ID) @@ -93,7 +93,7 @@ func (x *Shm) StateSave(m state.Sink) { func (x *Shm) afterLoad() {} func (x *Shm) StateLoad(m state.Source) { - m.Load(0, &x.AtomicRefCount) + m.Load(0, &x.ShmRefs) m.Load(1, &x.mfp) m.Load(2, &x.registry) m.Load(3, &x.ID) @@ -112,7 +112,31 @@ func (x *Shm) StateLoad(m state.Source) { m.Load(16, &x.pendingDestruction) } +func (x *ShmRefs) StateTypeName() string { + return "pkg/sentry/kernel/shm.ShmRefs" +} + +func (x *ShmRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *ShmRefs) beforeSave() {} + +func (x *ShmRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *ShmRefs) afterLoad() {} + +func (x *ShmRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func init() { state.Register((*Registry)(nil)) state.Register((*Shm)(nil)) + state.Register((*ShmRefs)(nil)) } diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 16fea53c4..7bf48cb2c 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -17,7 +17,6 @@ package mm import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" @@ -239,7 +238,7 @@ func (ctx *AIOContext) Drain() { // // +stateify savable type aioMappable struct { - refs.AtomicRefCount + aioMappableRefs mfp pgalloc.MemoryFileProvider fr memmap.FileRange @@ -253,13 +252,13 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { return nil, err } m := aioMappable{mfp: mfp, fr: fr} - m.EnableLeakCheck("mm.aioMappable") + m.EnableLeakCheck() return &m, nil } // DecRef implements refs.RefCounter.DecRef. func (m *aioMappable) DecRef(ctx context.Context) { - m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { + m.aioMappableRefs.DecRef(func() { m.mfp.MemoryFile().DecRef(m.fr) }) } diff --git a/pkg/sentry/mm/aio_mappable_refs.go b/pkg/sentry/mm/aio_mappable_refs.go new file mode 100644 index 000000000..b99909f07 --- /dev/null +++ b/pkg/sentry/mm/aio_mappable_refs.go @@ -0,0 +1,118 @@ +package mm + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var aioMappableownerType *aioMappable + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type aioMappableRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *aioMappableRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, aioMappableownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *aioMappableRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*aioMappableRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *aioMappableRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *aioMappableRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, aioMappableownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *aioMappableRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *aioMappableRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, aioMappableownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/mm/mm_state_autogen.go b/pkg/sentry/mm/mm_state_autogen.go index 2f81a8240..8ab51450c 100644 --- a/pkg/sentry/mm/mm_state_autogen.go +++ b/pkg/sentry/mm/mm_state_autogen.go @@ -92,7 +92,7 @@ func (x *aioMappable) StateTypeName() string { func (x *aioMappable) StateFields() []string { return []string{ - "AtomicRefCount", + "aioMappableRefs", "mfp", "fr", } @@ -102,7 +102,7 @@ func (x *aioMappable) beforeSave() {} func (x *aioMappable) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.AtomicRefCount) + m.Save(0, &x.aioMappableRefs) m.Save(1, &x.mfp) m.Save(2, &x.fr) } @@ -110,11 +110,34 @@ func (x *aioMappable) StateSave(m state.Sink) { func (x *aioMappable) afterLoad() {} func (x *aioMappable) StateLoad(m state.Source) { - m.Load(0, &x.AtomicRefCount) + m.Load(0, &x.aioMappableRefs) m.Load(1, &x.mfp) m.Load(2, &x.fr) } +func (x *aioMappableRefs) StateTypeName() string { + return "pkg/sentry/mm.aioMappableRefs" +} + +func (x *aioMappableRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *aioMappableRefs) beforeSave() {} + +func (x *aioMappableRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *aioMappableRefs) afterLoad() {} + +func (x *aioMappableRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *fileRefcountSet) StateTypeName() string { return "pkg/sentry/mm.fileRefcountSet" } @@ -565,7 +588,7 @@ func (x *SpecialMappable) StateTypeName() string { func (x *SpecialMappable) StateFields() []string { return []string{ - "AtomicRefCount", + "SpecialMappableRefs", "mfp", "fr", "name", @@ -576,7 +599,7 @@ func (x *SpecialMappable) beforeSave() {} func (x *SpecialMappable) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.AtomicRefCount) + m.Save(0, &x.SpecialMappableRefs) m.Save(1, &x.mfp) m.Save(2, &x.fr) m.Save(3, &x.name) @@ -585,12 +608,35 @@ func (x *SpecialMappable) StateSave(m state.Sink) { func (x *SpecialMappable) afterLoad() {} func (x *SpecialMappable) StateLoad(m state.Source) { - m.Load(0, &x.AtomicRefCount) + m.Load(0, &x.SpecialMappableRefs) m.Load(1, &x.mfp) m.Load(2, &x.fr) m.Load(3, &x.name) } +func (x *SpecialMappableRefs) StateTypeName() string { + return "pkg/sentry/mm.SpecialMappableRefs" +} + +func (x *SpecialMappableRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *SpecialMappableRefs) beforeSave() {} + +func (x *SpecialMappableRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *SpecialMappableRefs) afterLoad() {} + +func (x *SpecialMappableRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *vmaSet) StateTypeName() string { return "pkg/sentry/mm.vmaSet" } @@ -693,6 +739,7 @@ func init() { state.Register((*ioResult)(nil)) state.Register((*AIOContext)(nil)) state.Register((*aioMappable)(nil)) + state.Register((*aioMappableRefs)(nil)) state.Register((*fileRefcountSet)(nil)) state.Register((*fileRefcountnode)(nil)) state.Register((*fileRefcountSegmentDataSlices)(nil)) @@ -706,6 +753,7 @@ func init() { state.Register((*pmanode)(nil)) state.Register((*pmaSegmentDataSlices)(nil)) state.Register((*SpecialMappable)(nil)) + state.Register((*SpecialMappableRefs)(nil)) state.Register((*vmaSet)(nil)) state.Register((*vmanode)(nil)) state.Register((*vmaSegmentDataSlices)(nil)) diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 4cdb52eb6..f4c93baeb 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -16,7 +16,6 @@ package mm import ( "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" @@ -31,7 +30,7 @@ import ( // // +stateify savable type SpecialMappable struct { - refs.AtomicRefCount + SpecialMappableRefs mfp pgalloc.MemoryFileProvider fr memmap.FileRange @@ -45,13 +44,13 @@ type SpecialMappable struct { // Preconditions: fr.Length() != 0. func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *SpecialMappable { m := SpecialMappable{mfp: mfp, fr: fr, name: name} - m.EnableLeakCheck("mm.SpecialMappable") + m.EnableLeakCheck() return &m } // DecRef implements refs.RefCounter.DecRef. func (m *SpecialMappable) DecRef(ctx context.Context) { - m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { + m.SpecialMappableRefs.DecRef(func() { m.mfp.MemoryFile().DecRef(m.fr) }) } diff --git a/pkg/sentry/mm/special_mappable_refs.go b/pkg/sentry/mm/special_mappable_refs.go new file mode 100644 index 000000000..035bbe690 --- /dev/null +++ b/pkg/sentry/mm/special_mappable_refs.go @@ -0,0 +1,118 @@ +package mm + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var SpecialMappableownerType *SpecialMappable + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type SpecialMappableRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *SpecialMappableRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, SpecialMappableownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *SpecialMappableRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*SpecialMappableRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *SpecialMappableRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *SpecialMappableRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, SpecialMappableownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *SpecialMappableRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *SpecialMappableRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, SpecialMappableownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/platform/ring0/defs_impl_arm64.go b/pkg/sentry/platform/ring0/defs_impl_arm64.go index 2dac9ad14..424b66f76 100644 --- a/pkg/sentry/platform/ring0/defs_impl_arm64.go +++ b/pkg/sentry/platform/ring0/defs_impl_arm64.go @@ -3,11 +3,11 @@ package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "reflect" "fmt" "gvisor.dev/gvisor/pkg/usermem" "io" + "reflect" ) // Useful bits. diff --git a/pkg/sentry/socket/unix/socket_refs.go b/pkg/sentry/socket/unix/socket_refs.go index 4c6ec186b..dababb85f 100644 --- a/pkg/sentry/socket/unix/socket_refs.go +++ b/pkg/sentry/socket/unix/socket_refs.go @@ -1,6 +1,7 @@ package unix import ( + "fmt" "runtime" "sync/atomic" @@ -18,6 +19,11 @@ var socketOpsCommonownerType *socketOpsCommon // Note that the number of references is actually refCount + 1 so that a default // zero-value Refs object contains one reference. // +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// // +stateify savable type socketOpsCommonRefs struct { // refCount is composed of two fields: @@ -62,7 +68,7 @@ func (r *socketOpsCommonRefs) ReadRefs() int64 { //go:nosplit func (r *socketOpsCommonRefs) IncRef() { if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { - panic("Incrementing non-positive ref count") + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, socketOpsCommonownerType)) } } @@ -101,7 +107,7 @@ func (r *socketOpsCommonRefs) TryIncRef() bool { func (r *socketOpsCommonRefs) DecRef(destroy func()) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: - panic("Decrementing non-positive ref count") + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, socketOpsCommonownerType)) case v == -1: diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index c67b602f0..e3a75b519 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -142,9 +142,9 @@ func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (E } q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit} - q1.EnableLeakCheck("transport.queue") + q1.EnableLeakCheck() q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit} - q2.EnableLeakCheck("transport.queue") + q2.EnableLeakCheck() if stype == linux.SOCK_STREAM { a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}} @@ -300,14 +300,14 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn } readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit} - readQueue.EnableLeakCheck("transport.queue") + readQueue.EnableLeakCheck() ne.connected = &connectedEndpoint{ endpoint: ce, writeQueue: readQueue, } writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit} - writeQueue.EnableLeakCheck("transport.queue") + writeQueue.EnableLeakCheck() if e.stype == linux.SOCK_STREAM { ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} } else { diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 70ee8f9b8..4751b2fd8 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -42,7 +42,7 @@ var ( func NewConnectionless(ctx context.Context) Endpoint { ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}} q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit} - q.EnableLeakCheck("transport.queue") + q.EnableLeakCheck() ep.receiver = &queueReceiver{readQueue: &q} return ep } diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index ef6043e19..342def28f 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -16,7 +16,6 @@ package transport import ( "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" @@ -28,7 +27,7 @@ import ( // // +stateify savable type queue struct { - refs.AtomicRefCount + queueRefs ReaderQueue *waiter.Queue WriterQueue *waiter.Queue @@ -68,11 +67,13 @@ func (q *queue) Reset(ctx context.Context) { q.mu.Unlock() } -// DecRef implements RefCounter.DecRef with destructor q.Reset. +// DecRef implements RefCounter.DecRef. func (q *queue) DecRef(ctx context.Context) { - q.DecRefWithDestructor(ctx, q.Reset) - // We don't need to notify after resetting because no one cares about - // this queue after all references have been dropped. + q.queueRefs.DecRef(func() { + // We don't need to notify after resetting because no one cares about + // this queue after all references have been dropped. + q.Reset(ctx) + }) } // IsReadable determines if q is currently readable. diff --git a/pkg/sentry/socket/unix/transport/queue_refs.go b/pkg/sentry/socket/unix/transport/queue_refs.go new file mode 100644 index 000000000..0d4e34988 --- /dev/null +++ b/pkg/sentry/socket/unix/transport/queue_refs.go @@ -0,0 +1,118 @@ +package transport + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var queueownerType *queue + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type queueRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *queueRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, queueownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *queueRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*queueRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *queueRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *queueRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, queueownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *queueRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *queueRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, queueownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/socket/unix/transport/transport_state_autogen.go b/pkg/sentry/socket/unix/transport/transport_state_autogen.go index 4b88ea3ae..91e632833 100644 --- a/pkg/sentry/socket/unix/transport/transport_state_autogen.go +++ b/pkg/sentry/socket/unix/transport/transport_state_autogen.go @@ -71,7 +71,7 @@ func (x *queue) StateTypeName() string { func (x *queue) StateFields() []string { return []string{ - "AtomicRefCount", + "queueRefs", "ReaderQueue", "WriterQueue", "closed", @@ -86,7 +86,7 @@ func (x *queue) beforeSave() {} func (x *queue) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.AtomicRefCount) + m.Save(0, &x.queueRefs) m.Save(1, &x.ReaderQueue) m.Save(2, &x.WriterQueue) m.Save(3, &x.closed) @@ -99,7 +99,7 @@ func (x *queue) StateSave(m state.Sink) { func (x *queue) afterLoad() {} func (x *queue) StateLoad(m state.Source) { - m.Load(0, &x.AtomicRefCount) + m.Load(0, &x.queueRefs) m.Load(1, &x.ReaderQueue) m.Load(2, &x.WriterQueue) m.Load(3, &x.closed) @@ -109,6 +109,29 @@ func (x *queue) StateLoad(m state.Source) { m.Load(7, &x.dataList) } +func (x *queueRefs) StateTypeName() string { + return "pkg/sentry/socket/unix/transport.queueRefs" +} + +func (x *queueRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *queueRefs) beforeSave() {} + +func (x *queueRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *queueRefs) afterLoad() {} + +func (x *queueRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *messageList) StateTypeName() string { return "pkg/sentry/socket/unix/transport.messageList" } @@ -339,6 +362,7 @@ func init() { state.Register((*connectionedEndpoint)(nil)) state.Register((*connectionlessEndpoint)(nil)) state.Register((*queue)(nil)) + state.Register((*queueRefs)(nil)) state.Register((*messageList)(nil)) state.Register((*messageEntry)(nil)) state.Register((*ControlMessages)(nil)) diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 3219a9e13..22a54fa48 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -38,9 +38,7 @@ import ( // // FileDescription is analogous to Linux's struct file. type FileDescription struct { - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 + FileDescriptionRefs // flagsMu protects statusFlags and asyncHandler below. flagsMu sync.Mutex @@ -131,7 +129,7 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou } } - fd.refs = 1 + fd.EnableLeakCheck() // Remove "file creation flags" to mirror the behavior from file.f_flags in // fs/open.c:do_dentry_open. @@ -149,30 +147,9 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou return nil } -// IncRef increments fd's reference count. -func (fd *FileDescription) IncRef() { - atomic.AddInt64(&fd.refs, 1) -} - -// TryIncRef increments fd's reference count and returns true. If fd's -// reference count is already zero, TryIncRef does nothing and returns false. -// -// TryIncRef does not require that a reference is held on fd. -func (fd *FileDescription) TryIncRef() bool { - for { - refs := atomic.LoadInt64(&fd.refs) - if refs <= 0 { - return false - } - if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) { - return true - } - } -} - // DecRef decrements fd's reference count. func (fd *FileDescription) DecRef(ctx context.Context) { - if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { + fd.FileDescriptionRefs.DecRef(func() { // Unregister fd from all epoll instances. fd.epollMu.Lock() epolls := fd.epolls @@ -208,15 +185,7 @@ func (fd *FileDescription) DecRef(ctx context.Context) { } fd.asyncHandler = nil fd.flagsMu.Unlock() - } else if refs < 0 { - panic("FileDescription.DecRef() called without holding a reference") - } -} - -// Refs returns the current number of references. The returned count -// is inherently racy and is unsafe to use without external synchronization. -func (fd *FileDescription) Refs() int64 { - return atomic.LoadInt64(&fd.refs) + }) } // Mount returns the mount on which fd was opened. It does not take a reference diff --git a/pkg/sentry/vfs/file_description_refs.go b/pkg/sentry/vfs/file_description_refs.go new file mode 100644 index 000000000..bdd7e6554 --- /dev/null +++ b/pkg/sentry/vfs/file_description_refs.go @@ -0,0 +1,118 @@ +package vfs + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var FileDescriptionownerType *FileDescription + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type FileDescriptionRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *FileDescriptionRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, FileDescriptionownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *FileDescriptionRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*FileDescriptionRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *FileDescriptionRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *FileDescriptionRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, FileDescriptionownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *FileDescriptionRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *FileDescriptionRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, FileDescriptionownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 2c60cfab2..46851f638 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -15,8 +15,6 @@ package vfs import ( - "sync/atomic" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" @@ -34,9 +32,7 @@ import ( // // +stateify savable type Filesystem struct { - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 + FilesystemRefs // vfs is the VirtualFilesystem that uses this Filesystem. vfs is // immutable. @@ -52,7 +48,7 @@ type Filesystem struct { // Init must be called before first use of fs. func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) { - fs.refs = 1 + fs.EnableLeakCheck() fs.vfs = vfsObj fs.fsType = fsType fs.impl = impl @@ -76,39 +72,14 @@ func (fs *Filesystem) Impl() FilesystemImpl { return fs.impl } -// IncRef increments fs' reference count. -func (fs *Filesystem) IncRef() { - if atomic.AddInt64(&fs.refs, 1) <= 1 { - panic("Filesystem.IncRef() called without holding a reference") - } -} - -// TryIncRef increments fs' reference count and returns true. If fs' reference -// count is zero, TryIncRef does nothing and returns false. -// -// TryIncRef does not require that a reference is held on fs. -func (fs *Filesystem) TryIncRef() bool { - for { - refs := atomic.LoadInt64(&fs.refs) - if refs <= 0 { - return false - } - if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) { - return true - } - } -} - // DecRef decrements fs' reference count. func (fs *Filesystem) DecRef(ctx context.Context) { - if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { + fs.FilesystemRefs.DecRef(func() { fs.vfs.filesystemsMu.Lock() delete(fs.vfs.filesystems, fs) fs.vfs.filesystemsMu.Unlock() fs.impl.Release(ctx) - } else if refs < 0 { - panic("Filesystem.decRef() called without holding a reference") - } + }) } // FilesystemImpl contains implementation details for a Filesystem. diff --git a/pkg/sentry/vfs/filesystem_refs.go b/pkg/sentry/vfs/filesystem_refs.go new file mode 100644 index 000000000..38a9a986f --- /dev/null +++ b/pkg/sentry/vfs/filesystem_refs.go @@ -0,0 +1,118 @@ +package vfs + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var FilesystemownerType *Filesystem + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type FilesystemRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *FilesystemRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, FilesystemownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *FilesystemRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*FilesystemRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *FilesystemRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *FilesystemRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, FilesystemownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *FilesystemRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *FilesystemRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, FilesystemownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index cd5456eef..db5fb3bb1 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -128,16 +128,14 @@ func (mnt *Mount) Options() MountOptions { // // +stateify savable type MountNamespace struct { + MountNamespaceRefs + // Owner is the usernamespace that owns this mount namespace. Owner *auth.UserNamespace // root is the MountNamespace's root mount. root is immutable. root *Mount - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 - // mountpoints maps all Dentries which are mount points in this namespace // to the number of Mounts for which they are mount points. mountpoints is // protected by VirtualFilesystem.mountMu. @@ -168,9 +166,9 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth } mntns := &MountNamespace{ Owner: creds.UserNamespace, - refs: 1, mountpoints: make(map[*Dentry]uint32), } + mntns.EnableLeakCheck() mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{}) return mntns, nil } @@ -509,17 +507,10 @@ func (mnt *Mount) DecRef(ctx context.Context) { } } -// IncRef increments mntns' reference count. -func (mntns *MountNamespace) IncRef() { - if atomic.AddInt64(&mntns.refs, 1) <= 1 { - panic("MountNamespace.IncRef() called without holding a reference") - } -} - // DecRef decrements mntns' reference count. func (mntns *MountNamespace) DecRef(ctx context.Context) { vfs := mntns.root.fs.VirtualFilesystem() - if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { + mntns.MountNamespaceRefs.DecRef(func() { vfs.mountMu.Lock() vfs.mounts.seq.BeginWrite() vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{ @@ -533,9 +524,7 @@ func (mntns *MountNamespace) DecRef(ctx context.Context) { for _, mnt := range mountsToDecRef { mnt.DecRef(ctx) } - } else if refs < 0 { - panic("MountNamespace.DecRef() called without holding a reference") - } + }) } // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes diff --git a/pkg/sentry/vfs/mount_namespace_refs.go b/pkg/sentry/vfs/mount_namespace_refs.go new file mode 100644 index 000000000..63285fb8e --- /dev/null +++ b/pkg/sentry/vfs/mount_namespace_refs.go @@ -0,0 +1,118 @@ +package vfs + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var MountNamespaceownerType *MountNamespace + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type MountNamespaceRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *MountNamespaceRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, MountNamespaceownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *MountNamespaceRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*MountNamespaceRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *MountNamespaceRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *MountNamespaceRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, MountNamespaceownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *MountNamespaceRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *MountNamespaceRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, MountNamespaceownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/vfs/vfs_state_autogen.go b/pkg/sentry/vfs/vfs_state_autogen.go index 7bd988336..be649f60a 100644 --- a/pkg/sentry/vfs/vfs_state_autogen.go +++ b/pkg/sentry/vfs/vfs_state_autogen.go @@ -188,13 +188,36 @@ func (x *eventEntry) StateLoad(m state.Source) { m.Load(1, &x.prev) } +func (x *FileDescriptionRefs) StateTypeName() string { + return "pkg/sentry/vfs.FileDescriptionRefs" +} + +func (x *FileDescriptionRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *FileDescriptionRefs) beforeSave() {} + +func (x *FileDescriptionRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *FileDescriptionRefs) afterLoad() {} + +func (x *FileDescriptionRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *Filesystem) StateTypeName() string { return "pkg/sentry/vfs.Filesystem" } func (x *Filesystem) StateFields() []string { return []string{ - "refs", + "FilesystemRefs", "vfs", "fsType", "impl", @@ -205,7 +228,7 @@ func (x *Filesystem) beforeSave() {} func (x *Filesystem) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.refs) + m.Save(0, &x.FilesystemRefs) m.Save(1, &x.vfs) m.Save(2, &x.fsType) m.Save(3, &x.impl) @@ -214,12 +237,35 @@ func (x *Filesystem) StateSave(m state.Sink) { func (x *Filesystem) afterLoad() {} func (x *Filesystem) StateLoad(m state.Source) { - m.Load(0, &x.refs) + m.Load(0, &x.FilesystemRefs) m.Load(1, &x.vfs) m.Load(2, &x.fsType) m.Load(3, &x.impl) } +func (x *FilesystemRefs) StateTypeName() string { + return "pkg/sentry/vfs.FilesystemRefs" +} + +func (x *FilesystemRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *FilesystemRefs) beforeSave() {} + +func (x *FilesystemRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *FilesystemRefs) afterLoad() {} + +func (x *FilesystemRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *registeredFilesystemType) StateTypeName() string { return "pkg/sentry/vfs.registeredFilesystemType" } @@ -448,9 +494,9 @@ func (x *MountNamespace) StateTypeName() string { func (x *MountNamespace) StateFields() []string { return []string{ + "MountNamespaceRefs", "Owner", "root", - "refs", "mountpoints", } } @@ -459,21 +505,44 @@ func (x *MountNamespace) beforeSave() {} func (x *MountNamespace) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.Owner) - m.Save(1, &x.root) - m.Save(2, &x.refs) + m.Save(0, &x.MountNamespaceRefs) + m.Save(1, &x.Owner) + m.Save(2, &x.root) m.Save(3, &x.mountpoints) } func (x *MountNamespace) afterLoad() {} func (x *MountNamespace) StateLoad(m state.Source) { - m.Load(0, &x.Owner) - m.Load(1, &x.root) - m.Load(2, &x.refs) + m.Load(0, &x.MountNamespaceRefs) + m.Load(1, &x.Owner) + m.Load(2, &x.root) m.Load(3, &x.mountpoints) } +func (x *MountNamespaceRefs) StateTypeName() string { + return "pkg/sentry/vfs.MountNamespaceRefs" +} + +func (x *MountNamespaceRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *MountNamespaceRefs) beforeSave() {} + +func (x *MountNamespaceRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *MountNamespaceRefs) afterLoad() {} + +func (x *MountNamespaceRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *VirtualFilesystem) StateTypeName() string { return "pkg/sentry/vfs.VirtualFilesystem" } @@ -555,7 +624,9 @@ func init() { state.Register((*epollInterestEntry)(nil)) state.Register((*eventList)(nil)) state.Register((*eventEntry)(nil)) + state.Register((*FileDescriptionRefs)(nil)) state.Register((*Filesystem)(nil)) + state.Register((*FilesystemRefs)(nil)) state.Register((*registeredFilesystemType)(nil)) state.Register((*Inotify)(nil)) state.Register((*Watches)(nil)) @@ -563,6 +634,7 @@ func init() { state.Register((*Event)(nil)) state.Register((*Mount)(nil)) state.Register((*MountNamespace)(nil)) + state.Register((*MountNamespaceRefs)(nil)) state.Register((*VirtualFilesystem)(nil)) state.Register((*VirtualDentry)(nil)) } |