diff options
Diffstat (limited to 'pkg/sentry/fsimpl')
28 files changed, 1654 insertions, 138 deletions
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 3f3a099bd..0eaff9087 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -83,6 +83,7 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds } root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555) root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + root.EnableLeakCheck() root.dentry.Init(root) // Construct the pts master inode and dentry. Linux always uses inode @@ -110,6 +111,7 @@ func (fs *filesystem) Release(ctx context.Context) { // rootInode is the root directory inode for the devpts mounts. type rootInode struct { + rootInodeRefs kernfs.AlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren @@ -233,3 +235,8 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, } return offset, nil } + +// DecRef implements kernfs.Inode. +func (i *rootInode) DecRef(context.Context) { + i.rootInodeRefs.DecRef(i.Destroy) +} diff --git a/pkg/sentry/fsimpl/devpts/devpts_state_autogen.go b/pkg/sentry/fsimpl/devpts/devpts_state_autogen.go index 5942f7bac..adc184d1b 100644 --- a/pkg/sentry/fsimpl/devpts/devpts_state_autogen.go +++ b/pkg/sentry/fsimpl/devpts/devpts_state_autogen.go @@ -120,6 +120,29 @@ func (x *queue) StateLoad(m state.Source) { m.Load(4, &x.transformer) } +func (x *rootInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/devpts.rootInodeRefs" +} + +func (x *rootInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *rootInodeRefs) beforeSave() {} + +func (x *rootInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *rootInodeRefs) afterLoad() {} + +func (x *rootInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *Terminal) StateTypeName() string { return "pkg/sentry/fsimpl/devpts.Terminal" } @@ -157,5 +180,6 @@ func init() { state.Register((*outputQueueTransformer)(nil)) state.Register((*inputQueueTransformer)(nil)) state.Register((*queue)(nil)) + state.Register((*rootInodeRefs)(nil)) state.Register((*Terminal)(nil)) } diff --git a/pkg/sentry/fsimpl/devpts/root_inode_refs.go b/pkg/sentry/fsimpl/devpts/root_inode_refs.go new file mode 100644 index 000000000..051801202 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/root_inode_refs.go @@ -0,0 +1,118 @@ +package devpts + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var rootInodeownerType *rootInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type rootInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *rootInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, rootInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *rootInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*rootInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *rootInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *rootInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, rootInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *rootInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *rootInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, rootInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go b/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go index e4ce04322..f72fe342e 100644 --- a/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go +++ b/pkg/sentry/fsimpl/fuse/fuse_state_autogen.go @@ -99,6 +99,29 @@ func (x *futureResponse) StateLoad(m state.Source) { m.Load(3, &x.data) } +func (x *inodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/fuse.inodeRefs" +} + +func (x *inodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *inodeRefs) beforeSave() {} + +func (x *inodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *inodeRefs) afterLoad() {} + +func (x *inodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *requestList) StateTypeName() string { return "pkg/sentry/fsimpl/fuse.requestList" } @@ -155,6 +178,7 @@ func init() { state.Register((*Request)(nil)) state.Register((*Response)(nil)) state.Register((*futureResponse)(nil)) + state.Register((*inodeRefs)(nil)) state.Register((*requestList)(nil)) state.Register((*requestEntry)(nil)) } diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 44021ee4b..9717c0e15 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -198,6 +198,7 @@ func (fs *filesystem) Release(ctx context.Context) { // inode implements kernfs.Inode. type inode struct { + inodeRefs kernfs.InodeAttrs kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink @@ -213,6 +214,7 @@ func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *ke i := &inode{} i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + i.EnableLeakCheck() i.dentry.Init(i) return &i.dentry @@ -324,3 +326,8 @@ func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptio return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil } + +// DecRef implements kernfs.Inode. +func (i *inode) DecRef(context.Context) { + i.inodeRefs.DecRef(i.Destroy) +} diff --git a/pkg/sentry/fsimpl/fuse/inode_refs.go b/pkg/sentry/fsimpl/fuse/inode_refs.go new file mode 100644 index 000000000..6b9456e1d --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/inode_refs.go @@ -0,0 +1,118 @@ +package fuse + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var inodeownerType *inode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type inodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *inodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, inodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *inodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*inodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *inodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *inodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, inodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *inodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *inodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, inodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/host/connected_endpoint_refs.go b/pkg/sentry/fsimpl/host/connected_endpoint_refs.go index 3b7bf599e..babb3f664 100644 --- a/pkg/sentry/fsimpl/host/connected_endpoint_refs.go +++ b/pkg/sentry/fsimpl/host/connected_endpoint_refs.go @@ -1,6 +1,7 @@ package host import ( + "fmt" "runtime" "sync/atomic" @@ -18,6 +19,11 @@ var ConnectedEndpointownerType *ConnectedEndpoint // Note that the number of references is actually refCount + 1 so that a default // zero-value Refs object contains one reference. // +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// // +stateify savable type ConnectedEndpointRefs struct { // refCount is composed of two fields: @@ -62,7 +68,7 @@ func (r *ConnectedEndpointRefs) ReadRefs() int64 { //go:nosplit func (r *ConnectedEndpointRefs) IncRef() { if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { - panic("Incrementing non-positive ref count") + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, ConnectedEndpointownerType)) } } @@ -101,7 +107,7 @@ func (r *ConnectedEndpointRefs) TryIncRef() bool { func (r *ConnectedEndpointRefs) DecRef(destroy func()) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: - panic("Decrementing non-positive ref count") + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, ConnectedEndpointownerType)) case v == -1: diff --git a/pkg/sentry/fsimpl/host/inode_refs.go b/pkg/sentry/fsimpl/host/inode_refs.go index 55c0fb3a9..17f90ce4a 100644 --- a/pkg/sentry/fsimpl/host/inode_refs.go +++ b/pkg/sentry/fsimpl/host/inode_refs.go @@ -1,6 +1,7 @@ package host import ( + "fmt" "runtime" "sync/atomic" @@ -18,6 +19,11 @@ var inodeownerType *inode // Note that the number of references is actually refCount + 1 so that a default // zero-value Refs object contains one reference. // +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// // +stateify savable type inodeRefs struct { // refCount is composed of two fields: @@ -62,7 +68,7 @@ func (r *inodeRefs) ReadRefs() int64 { //go:nosplit func (r *inodeRefs) IncRef() { if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { - panic("Incrementing non-positive ref count") + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, inodeownerType)) } } @@ -101,7 +107,7 @@ func (r *inodeRefs) TryIncRef() bool { func (r *inodeRefs) DecRef(destroy func()) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: - panic("Decrementing non-positive ref count") + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, inodeownerType)) case v == -1: diff --git a/pkg/sentry/fsimpl/kernfs/dentry_refs.go b/pkg/sentry/fsimpl/kernfs/dentry_refs.go new file mode 100644 index 000000000..79863b3bc --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/dentry_refs.go @@ -0,0 +1,118 @@ +package kernfs + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var DentryownerType *Dentry + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type DentryRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *DentryRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, DentryownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *DentryRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*DentryRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *DentryRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *DentryRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, DentryownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *DentryRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *DentryRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, DentryownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 885856868..f442a5606 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -344,8 +343,6 @@ type OrderedChildrenOptions struct { // // Must be initialize with Init before first use. type OrderedChildren struct { - refs.AtomicRefCount - // Can children be modified by user syscalls? It set to false, interface // methods that would modify the children return EPERM. Immutable. writable bool @@ -361,14 +358,14 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { o.set = make(map[string]*slot) } -// DecRef implements Inode.DecRef. -func (o *OrderedChildren) DecRef(ctx context.Context) { - o.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { - o.mu.Lock() - defer o.mu.Unlock() - o.order.Reset() - o.set = nil - }) +// Destroy clears the children stored in o. It should be called by structs +// embedding OrderedChildren upon destruction, i.e. when their reference count +// reaches zero. +func (o *OrderedChildren) Destroy() { + o.mu.Lock() + defer o.mu.Unlock() + o.order.Reset() + o.set = nil } // Populate inserts children into this OrderedChildren, and d's dentry @@ -549,6 +546,7 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D // // +stateify savable type StaticDirectory struct { + StaticDirectoryRefs InodeNotSymlink InodeDirectoryNoNewChildren InodeAttrs @@ -594,11 +592,16 @@ func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd return fd.VFSFileDescription(), nil } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } +// DecRef implements kernfs.Inode. +func (s *StaticDirectory) DecRef(context.Context) { + s.StaticDirectoryRefs.DecRef(s.Destroy) +} + // AlwaysValid partially implements kernfs.inodeDynamicLookup. type AlwaysValid struct{} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 51dbc050c..ca3685800 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -57,7 +57,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -161,9 +160,9 @@ const ( // // Must be initialized by Init prior to first use. type Dentry struct { - vfsd vfs.Dentry + DentryRefs - refs.AtomicRefCount + vfsd vfs.Dentry // flags caches useful information about the dentry from the inode. See the // dflags* consts above. Must be accessed by atomic ops. @@ -194,6 +193,7 @@ func (d *Dentry) Init(inode Inode) { if ftype == linux.ModeSymlink { d.flags |= dflagsIsSymlink } + d.EnableLeakCheck() } // VFSDentry returns the generic vfs dentry for this kernfs dentry. @@ -213,16 +213,14 @@ func (d *Dentry) isSymlink() bool { // DecRef implements vfs.DentryImpl.DecRef. func (d *Dentry) DecRef(ctx context.Context) { - d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy) -} - -// Precondition: Dentry must be removed from VFS' dentry cache. -func (d *Dentry) destroy(ctx context.Context) { - d.inode.DecRef(ctx) // IncRef from Init. - d.inode = nil - if d.parent != nil { - d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild. - } + // Before the destructor is called, Dentry must be removed from VFS' dentry cache. + d.DentryRefs.DecRef(func() { + d.inode.DecRef(ctx) // IncRef from Init. + d.inode = nil + if d.parent != nil { + d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild. + } + }) } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_state_autogen.go b/pkg/sentry/fsimpl/kernfs/kernfs_state_autogen.go index f396affb0..12aaf797f 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_state_autogen.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_state_autogen.go @@ -6,6 +6,29 @@ import ( "gvisor.dev/gvisor/pkg/state" ) +func (x *DentryRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/kernfs.DentryRefs" +} + +func (x *DentryRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *DentryRefs) beforeSave() {} + +func (x *DentryRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *DentryRefs) afterLoad() {} + +func (x *DentryRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *DynamicBytesFile) StateTypeName() string { return "pkg/sentry/fsimpl/kernfs.DynamicBytesFile" } @@ -85,6 +108,7 @@ func (x *StaticDirectory) StateTypeName() string { func (x *StaticDirectory) StateFields() []string { return []string{ + "StaticDirectoryRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeAttrs", @@ -99,25 +123,27 @@ func (x *StaticDirectory) beforeSave() {} func (x *StaticDirectory) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeAttrs) - m.Save(3, &x.InodeNoDynamicLookup) - m.Save(4, &x.OrderedChildren) - m.Save(5, &x.locks) - m.Save(6, &x.fdOpts) + m.Save(0, &x.StaticDirectoryRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeAttrs) + m.Save(4, &x.InodeNoDynamicLookup) + m.Save(5, &x.OrderedChildren) + m.Save(6, &x.locks) + m.Save(7, &x.fdOpts) } func (x *StaticDirectory) afterLoad() {} func (x *StaticDirectory) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeAttrs) - m.Load(3, &x.InodeNoDynamicLookup) - m.Load(4, &x.OrderedChildren) - m.Load(5, &x.locks) - m.Load(6, &x.fdOpts) + m.Load(0, &x.StaticDirectoryRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeAttrs) + m.Load(4, &x.InodeNoDynamicLookup) + m.Load(5, &x.OrderedChildren) + m.Load(6, &x.locks) + m.Load(7, &x.fdOpts) } func (x *slotList) StateTypeName() string { @@ -172,10 +198,35 @@ func (x *slotEntry) StateLoad(m state.Source) { m.Load(1, &x.prev) } +func (x *StaticDirectoryRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/kernfs.StaticDirectoryRefs" +} + +func (x *StaticDirectoryRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *StaticDirectoryRefs) beforeSave() {} + +func (x *StaticDirectoryRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *StaticDirectoryRefs) afterLoad() {} + +func (x *StaticDirectoryRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func init() { + state.Register((*DentryRefs)(nil)) state.Register((*DynamicBytesFile)(nil)) state.Register((*DynamicBytesFD)(nil)) state.Register((*StaticDirectory)(nil)) state.Register((*slotList)(nil)) state.Register((*slotEntry)(nil)) + state.Register((*StaticDirectoryRefs)(nil)) } diff --git a/pkg/sentry/fsimpl/kernfs/static_directory_refs.go b/pkg/sentry/fsimpl/kernfs/static_directory_refs.go new file mode 100644 index 000000000..478b04bdd --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/static_directory_refs.go @@ -0,0 +1,118 @@ +package kernfs + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var StaticDirectoryownerType *StaticDirectory + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type StaticDirectoryRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *StaticDirectoryRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, StaticDirectoryownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *StaticDirectoryRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*StaticDirectoryRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *StaticDirectoryRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *StaticDirectoryRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, StaticDirectoryownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *StaticDirectoryRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *StaticDirectoryRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, StaticDirectoryownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go b/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go new file mode 100644 index 000000000..9431c1506 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var fdDirInodeownerType *fdDirInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type fdDirInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *fdDirInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, fdDirInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *fdDirInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*fdDirInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *fdDirInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *fdDirInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, fdDirInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *fdDirInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *fdDirInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, fdDirInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go b/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go new file mode 100644 index 000000000..872b20eb0 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var fdInfoDirInodeownerType *fdInfoDirInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type fdInfoDirInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *fdInfoDirInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, fdInfoDirInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *fdInfoDirInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*fdInfoDirInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *fdInfoDirInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *fdInfoDirInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, fdInfoDirInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *fdInfoDirInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *fdInfoDirInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, fdInfoDirInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/proc/proc_state_autogen.go b/pkg/sentry/fsimpl/proc/proc_state_autogen.go index a7e43b7a7..907ef38e0 100644 --- a/pkg/sentry/fsimpl/proc/proc_state_autogen.go +++ b/pkg/sentry/fsimpl/proc/proc_state_autogen.go @@ -6,6 +6,52 @@ import ( "gvisor.dev/gvisor/pkg/state" ) +func (x *fdDirInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.fdDirInodeRefs" +} + +func (x *fdDirInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *fdDirInodeRefs) beforeSave() {} + +func (x *fdDirInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *fdDirInodeRefs) afterLoad() {} + +func (x *fdDirInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + +func (x *fdInfoDirInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.fdInfoDirInodeRefs" +} + +func (x *fdInfoDirInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *fdInfoDirInodeRefs) beforeSave() {} + +func (x *fdInfoDirInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *fdInfoDirInodeRefs) afterLoad() {} + +func (x *fdInfoDirInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/proc.FilesystemType" } @@ -31,6 +77,7 @@ func (x *subtasksInode) StateTypeName() string { func (x *subtasksInode) StateFields() []string { return []string{ + "subtasksInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeAttrs", @@ -48,31 +95,56 @@ func (x *subtasksInode) beforeSave() {} func (x *subtasksInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeAttrs) - m.Save(3, &x.OrderedChildren) - m.Save(4, &x.AlwaysValid) - m.Save(5, &x.locks) - m.Save(6, &x.fs) - m.Save(7, &x.task) - m.Save(8, &x.pidns) - m.Save(9, &x.cgroupControllers) + m.Save(0, &x.subtasksInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeAttrs) + m.Save(4, &x.OrderedChildren) + m.Save(5, &x.AlwaysValid) + m.Save(6, &x.locks) + m.Save(7, &x.fs) + m.Save(8, &x.task) + m.Save(9, &x.pidns) + m.Save(10, &x.cgroupControllers) } func (x *subtasksInode) afterLoad() {} func (x *subtasksInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeAttrs) - m.Load(3, &x.OrderedChildren) - m.Load(4, &x.AlwaysValid) - m.Load(5, &x.locks) - m.Load(6, &x.fs) - m.Load(7, &x.task) - m.Load(8, &x.pidns) - m.Load(9, &x.cgroupControllers) + m.Load(0, &x.subtasksInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeAttrs) + m.Load(4, &x.OrderedChildren) + m.Load(5, &x.AlwaysValid) + m.Load(6, &x.locks) + m.Load(7, &x.fs) + m.Load(8, &x.task) + m.Load(9, &x.pidns) + m.Load(10, &x.cgroupControllers) +} + +func (x *subtasksInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.subtasksInodeRefs" +} + +func (x *subtasksInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *subtasksInodeRefs) beforeSave() {} + +func (x *subtasksInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *subtasksInodeRefs) afterLoad() {} + +func (x *subtasksInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) } func (x *taskInode) StateTypeName() string { @@ -81,6 +153,7 @@ func (x *taskInode) StateTypeName() string { func (x *taskInode) StateFields() []string { return []string{ + "taskInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeNoDynamicLookup", @@ -95,25 +168,27 @@ func (x *taskInode) beforeSave() {} func (x *taskInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeNoDynamicLookup) - m.Save(3, &x.InodeAttrs) - m.Save(4, &x.OrderedChildren) - m.Save(5, &x.locks) - m.Save(6, &x.task) + m.Save(0, &x.taskInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeNoDynamicLookup) + m.Save(4, &x.InodeAttrs) + m.Save(5, &x.OrderedChildren) + m.Save(6, &x.locks) + m.Save(7, &x.task) } func (x *taskInode) afterLoad() {} func (x *taskInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeNoDynamicLookup) - m.Load(3, &x.InodeAttrs) - m.Load(4, &x.OrderedChildren) - m.Load(5, &x.locks) - m.Load(6, &x.task) + m.Load(0, &x.taskInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeNoDynamicLookup) + m.Load(4, &x.InodeAttrs) + m.Load(5, &x.OrderedChildren) + m.Load(6, &x.locks) + m.Load(7, &x.task) } func (x *fdDirInode) StateTypeName() string { @@ -122,6 +197,7 @@ func (x *fdDirInode) StateTypeName() string { func (x *fdDirInode) StateFields() []string { return []string{ + "fdDirInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeAttrs", @@ -135,23 +211,25 @@ func (x *fdDirInode) beforeSave() {} func (x *fdDirInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeAttrs) - m.Save(3, &x.OrderedChildren) - m.Save(4, &x.AlwaysValid) - m.Save(5, &x.fdDir) + m.Save(0, &x.fdDirInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeAttrs) + m.Save(4, &x.OrderedChildren) + m.Save(5, &x.AlwaysValid) + m.Save(6, &x.fdDir) } func (x *fdDirInode) afterLoad() {} func (x *fdDirInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeAttrs) - m.Load(3, &x.OrderedChildren) - m.Load(4, &x.AlwaysValid) - m.Load(5, &x.fdDir) + m.Load(0, &x.fdDirInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeAttrs) + m.Load(4, &x.OrderedChildren) + m.Load(5, &x.AlwaysValid) + m.Load(6, &x.fdDir) } func (x *fdSymlink) StateTypeName() string { @@ -195,6 +273,7 @@ func (x *fdInfoDirInode) StateTypeName() string { func (x *fdInfoDirInode) StateFields() []string { return []string{ + "fdInfoDirInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeAttrs", @@ -208,23 +287,25 @@ func (x *fdInfoDirInode) beforeSave() {} func (x *fdInfoDirInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeAttrs) - m.Save(3, &x.OrderedChildren) - m.Save(4, &x.AlwaysValid) - m.Save(5, &x.fdDir) + m.Save(0, &x.fdInfoDirInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeAttrs) + m.Save(4, &x.OrderedChildren) + m.Save(5, &x.AlwaysValid) + m.Save(6, &x.fdDir) } func (x *fdInfoDirInode) afterLoad() {} func (x *fdInfoDirInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeAttrs) - m.Load(3, &x.OrderedChildren) - m.Load(4, &x.AlwaysValid) - m.Load(5, &x.fdDir) + m.Load(0, &x.fdInfoDirInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeAttrs) + m.Load(4, &x.OrderedChildren) + m.Load(5, &x.AlwaysValid) + m.Load(6, &x.fdDir) } func (x *fdInfoData) StateTypeName() string { @@ -234,7 +315,6 @@ func (x *fdInfoData) StateTypeName() string { func (x *fdInfoData) StateFields() []string { return []string{ "DynamicBytesFile", - "AtomicRefCount", "task", "fd", } @@ -245,18 +325,16 @@ func (x *fdInfoData) beforeSave() {} func (x *fdInfoData) StateSave(m state.Sink) { x.beforeSave() m.Save(0, &x.DynamicBytesFile) - m.Save(1, &x.AtomicRefCount) - m.Save(2, &x.task) - m.Save(3, &x.fd) + m.Save(1, &x.task) + m.Save(2, &x.fd) } func (x *fdInfoData) afterLoad() {} func (x *fdInfoData) StateLoad(m state.Source) { m.Load(0, &x.DynamicBytesFile) - m.Load(1, &x.AtomicRefCount) - m.Load(2, &x.task) - m.Load(3, &x.fd) + m.Load(1, &x.task) + m.Load(2, &x.fd) } func (x *auxvData) StateTypeName() string { @@ -670,6 +748,29 @@ func (x *mountsData) StateLoad(m state.Source) { m.Load(1, &x.task) } +func (x *taskInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.taskInodeRefs" +} + +func (x *taskInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *taskInodeRefs) beforeSave() {} + +func (x *taskInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *taskInodeRefs) afterLoad() {} + +func (x *taskInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *ifinet6) StateTypeName() string { return "pkg/sentry/fsimpl/proc.ifinet6" } @@ -910,6 +1011,7 @@ func (x *tasksInode) StateTypeName() string { func (x *tasksInode) StateFields() []string { return []string{ + "tasksInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeAttrs", @@ -928,33 +1030,35 @@ func (x *tasksInode) beforeSave() {} func (x *tasksInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeAttrs) - m.Save(3, &x.OrderedChildren) - m.Save(4, &x.AlwaysValid) - m.Save(5, &x.locks) - m.Save(6, &x.fs) - m.Save(7, &x.pidns) - m.Save(8, &x.selfSymlink) - m.Save(9, &x.threadSelfSymlink) - m.Save(10, &x.cgroupControllers) + m.Save(0, &x.tasksInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeAttrs) + m.Save(4, &x.OrderedChildren) + m.Save(5, &x.AlwaysValid) + m.Save(6, &x.locks) + m.Save(7, &x.fs) + m.Save(8, &x.pidns) + m.Save(9, &x.selfSymlink) + m.Save(10, &x.threadSelfSymlink) + m.Save(11, &x.cgroupControllers) } func (x *tasksInode) afterLoad() {} func (x *tasksInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeAttrs) - m.Load(3, &x.OrderedChildren) - m.Load(4, &x.AlwaysValid) - m.Load(5, &x.locks) - m.Load(6, &x.fs) - m.Load(7, &x.pidns) - m.Load(8, &x.selfSymlink) - m.Load(9, &x.threadSelfSymlink) - m.Load(10, &x.cgroupControllers) + m.Load(0, &x.tasksInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeAttrs) + m.Load(4, &x.OrderedChildren) + m.Load(5, &x.AlwaysValid) + m.Load(6, &x.locks) + m.Load(7, &x.fs) + m.Load(8, &x.pidns) + m.Load(9, &x.selfSymlink) + m.Load(10, &x.threadSelfSymlink) + m.Load(11, &x.cgroupControllers) } func (x *statData) StateTypeName() string { @@ -1095,6 +1199,29 @@ func (x *filesystemsData) StateLoad(m state.Source) { m.Load(0, &x.DynamicBytesFile) } +func (x *tasksInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.tasksInodeRefs" +} + +func (x *tasksInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *tasksInodeRefs) beforeSave() {} + +func (x *tasksInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *tasksInodeRefs) afterLoad() {} + +func (x *tasksInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *mmapMinAddrData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.mmapMinAddrData" } @@ -1229,8 +1356,11 @@ func (x *tcpMemData) StateLoad(m state.Source) { } func init() { + state.Register((*fdDirInodeRefs)(nil)) + state.Register((*fdInfoDirInodeRefs)(nil)) state.Register((*FilesystemType)(nil)) state.Register((*subtasksInode)(nil)) + state.Register((*subtasksInodeRefs)(nil)) state.Register((*taskInode)(nil)) state.Register((*fdDirInode)(nil)) state.Register((*fdSymlink)(nil)) @@ -1251,6 +1381,7 @@ func init() { state.Register((*exeSymlink)(nil)) state.Register((*mountInfoData)(nil)) state.Register((*mountsData)(nil)) + state.Register((*taskInodeRefs)(nil)) state.Register((*ifinet6)(nil)) state.Register((*netDevData)(nil)) state.Register((*netUnixData)(nil)) @@ -1267,6 +1398,7 @@ func init() { state.Register((*uptimeData)(nil)) state.Register((*versionData)(nil)) state.Register((*filesystemsData)(nil)) + state.Register((*tasksInodeRefs)(nil)) state.Register((*mmapMinAddrData)(nil)) state.Register((*hostnameData)(nil)) state.Register((*tcpSackData)(nil)) diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index f25747da3..01c0efb3a 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -31,6 +31,7 @@ import ( // // +stateify savable type subtasksInode struct { + subtasksInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -57,6 +58,7 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, // Note: credentials are overridden by taskOwnedInode. subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + subInode.EnableLeakCheck() inode := &taskOwnedInode{Inode: subInode, owner: task} dentry := &kernfs.Dentry{} @@ -182,3 +184,8 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } + +// DecRef implements kernfs.Inode. +func (i *subtasksInode) DecRef(context.Context) { + i.subtasksInodeRefs.DecRef(i.Destroy) +} diff --git a/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go b/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go new file mode 100644 index 000000000..c6d9b3522 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var subtasksInodeownerType *subtasksInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type subtasksInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *subtasksInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, subtasksInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *subtasksInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*subtasksInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *subtasksInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *subtasksInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, subtasksInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *subtasksInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *subtasksInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, subtasksInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 109b31b4c..66b557abd 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -32,6 +32,7 @@ import ( // // +stateify savable type taskInode struct { + taskInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeNoDynamicLookup @@ -84,6 +85,7 @@ func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace taskInode := &taskInode{task: task} // Note: credentials are overridden by taskOwnedInode. taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + taskInode.EnableLeakCheck() inode := &taskOwnedInode{Inode: taskInode, owner: task} dentry := &kernfs.Dentry{} @@ -119,6 +121,11 @@ func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, v return syserror.EPERM } +// DecRef implements kernfs.Inode. +func (i *taskInode) DecRef(context.Context) { + i.taskInodeRefs.DecRef(i.Destroy) +} + // taskOwnedInode implements kernfs.Inode and overrides inode owner with task // effective user and group. type taskOwnedInode struct { @@ -147,6 +154,7 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux. dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) + dir.EnableLeakCheck() inode := &taskOwnedInode{Inode: dir, owner: task} d := &kernfs.Dentry{} diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index e8fcb9aa1..0527b2de8 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -101,6 +100,7 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off // // +stateify savable type fdDirInode struct { + fdDirInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -120,6 +120,7 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { }, } inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -175,6 +176,11 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia return err } +// DecRef implements kernfs.Inode. +func (i *fdDirInode) DecRef(context.Context) { + i.fdDirInodeRefs.DecRef(i.Destroy) +} + // fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file. // // +stateify savable @@ -227,6 +233,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen // // +stateify savable type fdInfoDirInode struct { + fdInfoDirInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -245,6 +252,7 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry { }, } inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -282,12 +290,16 @@ func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd * return fd.VFSFileDescription(), nil } +// DecRef implements kernfs.Inode. +func (i *fdInfoDirInode) DecRef(context.Context) { + i.fdInfoDirInodeRefs.DecRef(i.Destroy) +} + // fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd]. // // +stateify savable type fdInfoData struct { kernfs.DynamicBytesFile - refs.AtomicRefCount task *kernel.Task fd int32 diff --git a/pkg/sentry/fsimpl/proc/task_inode_refs.go b/pkg/sentry/fsimpl/proc/task_inode_refs.go new file mode 100644 index 000000000..714488450 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var taskInodeownerType *taskInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type taskInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *taskInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, taskInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *taskInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*taskInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *taskInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *taskInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, taskInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *taskInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *taskInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, taskInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index a4c884bf9..4e69782c7 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -262,7 +262,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { // For now, we always redact this pointer. fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d", (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. - s.Refs()-1, // RefCount, don't count our own ref. + s.ReadRefs()-1, // RefCount, don't count our own ref. 0, // Protocol, always 0 for UDS. sockFlags, // Flags. sops.Endpoint().Type(), // Type. @@ -430,7 +430,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, // Field: refcount. Don't count the ref we obtain while deferencing // the weakref to this socket. - fmt.Fprintf(buf, "%d ", s.Refs()-1) + fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. @@ -589,7 +589,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Field: ref; reference count on the socket inode. Don't count the ref // we obtain while deferencing the weakref to this socket. - fmt.Fprintf(buf, "%d ", s.Refs()-1) + fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 1391992b7..863c4467e 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -37,6 +37,7 @@ const ( // // +stateify savable type tasksInode struct { + tasksInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -84,6 +85,7 @@ func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace cgroupControllers: cgroupControllers, } inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -226,6 +228,11 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St return stat, nil } +// DecRef implements kernfs.Inode. +func (i *tasksInode) DecRef(context.Context) { + i.tasksInodeRefs.DecRef(i.Destroy) +} + // staticFileSetStat implements a special static file that allows inode // attributes to be set. This is to support /proc files that are readonly, but // allow attributes to be set. diff --git a/pkg/sentry/fsimpl/proc/tasks_inode_refs.go b/pkg/sentry/fsimpl/proc/tasks_inode_refs.go new file mode 100644 index 000000000..22d9cc488 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var tasksInodeownerType *tasksInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type tasksInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *tasksInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, tasksInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *tasksInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*tasksInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *tasksInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *tasksInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, tasksInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *tasksInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *tasksInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, tasksInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/sys/dir_refs.go b/pkg/sentry/fsimpl/sys/dir_refs.go new file mode 100644 index 000000000..89609b198 --- /dev/null +++ b/pkg/sentry/fsimpl/sys/dir_refs.go @@ -0,0 +1,118 @@ +package sys + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var dirownerType *dir + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type dirRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *dirRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, dirownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *dirRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*dirRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *dirRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *dirRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, dirownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *dirRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *dirRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, dirownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 1f042d9f7..ea30a4ec2 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -118,6 +118,7 @@ func (fs *filesystem) Release(ctx context.Context) { // dir implements kernfs.Inode. type dir struct { + dirRefs kernfs.InodeAttrs kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink @@ -133,6 +134,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte d := &dir{} d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + d.EnableLeakCheck() d.dentry.Init(d) d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents)) @@ -140,7 +142,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte return &d.dentry } -// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } @@ -156,6 +158,11 @@ func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, return fd.VFSFileDescription(), nil } +// DecRef implements kernfs.Inode.DecRef. +func (d *dir) DecRef(context.Context) { + d.dirRefs.DecRef(d.Destroy) +} + // cpuFile implements kernfs.Inode. type cpuFile struct { kernfs.DynamicBytesFile diff --git a/pkg/sentry/fsimpl/sys/sys_state_autogen.go b/pkg/sentry/fsimpl/sys/sys_state_autogen.go index 8866ddff9..347a46318 100644 --- a/pkg/sentry/fsimpl/sys/sys_state_autogen.go +++ b/pkg/sentry/fsimpl/sys/sys_state_autogen.go @@ -1,3 +1,34 @@ // automatically generated by stateify. package sys + +import ( + "gvisor.dev/gvisor/pkg/state" +) + +func (x *dirRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/sys.dirRefs" +} + +func (x *dirRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *dirRefs) beforeSave() {} + +func (x *dirRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *dirRefs) afterLoad() {} + +func (x *dirRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + +func init() { + state.Register((*dirRefs)(nil)) +} diff --git a/pkg/sentry/fsimpl/tmpfs/inode_refs.go b/pkg/sentry/fsimpl/tmpfs/inode_refs.go index 8b7ff185f..dbf0b2766 100644 --- a/pkg/sentry/fsimpl/tmpfs/inode_refs.go +++ b/pkg/sentry/fsimpl/tmpfs/inode_refs.go @@ -1,6 +1,7 @@ package tmpfs import ( + "fmt" "runtime" "sync/atomic" @@ -18,6 +19,11 @@ var inodeownerType *inode // Note that the number of references is actually refCount + 1 so that a default // zero-value Refs object contains one reference. // +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// // +stateify savable type inodeRefs struct { // refCount is composed of two fields: @@ -62,7 +68,7 @@ func (r *inodeRefs) ReadRefs() int64 { //go:nosplit func (r *inodeRefs) IncRef() { if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { - panic("Incrementing non-positive ref count") + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, inodeownerType)) } } @@ -101,7 +107,7 @@ func (r *inodeRefs) TryIncRef() bool { func (r *inodeRefs) DecRef(destroy func()) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: - panic("Decrementing non-positive ref count") + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, inodeownerType)) case v == -1: |