diff options
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r-- | pkg/sentry/kernel/fd_table.go | 21 | ||||
-rw-r--r-- | pkg/sentry/kernel/fd_table_refs.go | 118 | ||||
-rw-r--r-- | pkg/sentry/kernel/fd_table_unsafe.go | 2 | ||||
-rw-r--r-- | pkg/sentry/kernel/fs_context.go | 89 | ||||
-rw-r--r-- | pkg/sentry/kernel/fs_context_refs.go | 118 | ||||
-rw-r--r-- | pkg/sentry/kernel/kernel_state_autogen.go | 114 | ||||
-rw-r--r-- | pkg/sentry/kernel/process_group_refs.go | 118 | ||||
-rw-r--r-- | pkg/sentry/kernel/session_refs.go | 118 | ||||
-rw-r--r-- | pkg/sentry/kernel/sessions.go | 29 | ||||
-rw-r--r-- | pkg/sentry/kernel/shm/shm.go | 19 | ||||
-rw-r--r-- | pkg/sentry/kernel/shm/shm_refs.go | 118 | ||||
-rw-r--r-- | pkg/sentry/kernel/shm/shm_state_autogen.go | 30 |
12 files changed, 800 insertions, 94 deletions
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index ce53af69b..5773244ac 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" @@ -78,7 +77,8 @@ type descriptor struct { // // +stateify savable type FDTable struct { - refs.AtomicRefCount + FDTableRefs + k *Kernel // mu protects below. @@ -176,16 +176,15 @@ func (k *Kernel) NewFDTable() *FDTable { return f } -// destroy removes all of the file descriptors from the map. -func (f *FDTable) destroy(ctx context.Context) { - f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool { - return true - }) -} - -// DecRef implements RefCounter.DecRef with destructor f.destroy. +// DecRef implements RefCounter.DecRef. +// +// If f reaches zero references, all of its file descriptors are removed. func (f *FDTable) DecRef(ctx context.Context) { - f.DecRefWithDestructor(ctx, f.destroy) + f.FDTableRefs.DecRef(func() { + f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool { + return true + }) + }) } // Size returns the number of file descriptor slots currently allocated. diff --git a/pkg/sentry/kernel/fd_table_refs.go b/pkg/sentry/kernel/fd_table_refs.go new file mode 100644 index 000000000..ecba138ac --- /dev/null +++ b/pkg/sentry/kernel/fd_table_refs.go @@ -0,0 +1,118 @@ +package kernel + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var FDTableownerType *FDTable + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type FDTableRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *FDTableRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, FDTableownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *FDTableRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*FDTableRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *FDTableRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *FDTableRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, FDTableownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *FDTableRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *FDTableRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, FDTableownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go index 7fd97dc53..6b8feb107 100644 --- a/pkg/sentry/kernel/fd_table_unsafe.go +++ b/pkg/sentry/kernel/fd_table_unsafe.go @@ -31,6 +31,8 @@ type descriptorTable struct { } // init initializes the table. +// +// TODO(gvisor.dev/1486): Enable leak check for FDTable. func (f *FDTable) init() { var slice []unsafe.Pointer // Empty slice. atomic.StorePointer(&f.slice, unsafe.Pointer(&slice)) diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 8f2d36d5a..d46d1e1c1 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -18,7 +18,6 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -30,7 +29,7 @@ import ( // // +stateify savable type FSContext struct { - refs.AtomicRefCount + FSContextRefs // mu protects below. mu sync.Mutex `state:"nosave"` @@ -64,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { cwd: cwd, umask: umask, } - f.EnableLeakCheck("kernel.FSContext") + f.EnableLeakCheck() return &f } @@ -77,54 +76,56 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { cwdVFS2: cwd, umask: umask, } - f.EnableLeakCheck("kernel.FSContext") + f.EnableLeakCheck() return &f } -// destroy is the destructor for an FSContext. +// DecRef implements RefCounter.DecRef. // -// This will call DecRef on both root and cwd Dirents. If either call to -// DecRef returns an error, then it will be propagated. If both calls to -// DecRef return an error, then the one from root.DecRef will be propagated. +// When f reaches zero references, DecRef will be called on both root and cwd +// Dirents. // // Note that there may still be calls to WorkingDirectory() or RootDirectory() // (that return nil). This is because valid references may still be held via // proc files or other mechanisms. -func (f *FSContext) destroy(ctx context.Context) { - // Hold f.mu so that we don't race with RootDirectory() and - // WorkingDirectory(). - f.mu.Lock() - defer f.mu.Unlock() - - if VFS2Enabled { - f.rootVFS2.DecRef(ctx) - f.rootVFS2 = vfs.VirtualDentry{} - f.cwdVFS2.DecRef(ctx) - f.cwdVFS2 = vfs.VirtualDentry{} - } else { - f.root.DecRef(ctx) - f.root = nil - f.cwd.DecRef(ctx) - f.cwd = nil - } -} - -// DecRef implements RefCounter.DecRef with destructor f.destroy. func (f *FSContext) DecRef(ctx context.Context) { - f.DecRefWithDestructor(ctx, f.destroy) + f.FSContextRefs.DecRef(func() { + // Hold f.mu so that we don't race with RootDirectory() and + // WorkingDirectory(). + f.mu.Lock() + defer f.mu.Unlock() + + if VFS2Enabled { + f.rootVFS2.DecRef(ctx) + f.rootVFS2 = vfs.VirtualDentry{} + f.cwdVFS2.DecRef(ctx) + f.cwdVFS2 = vfs.VirtualDentry{} + } else { + f.root.DecRef(ctx) + f.root = nil + f.cwd.DecRef(ctx) + f.cwd = nil + } + }) } // Fork forks this FSContext. // -// This is not a valid call after destroy. +// This is not a valid call after f is destroyed. func (f *FSContext) Fork() *FSContext { f.mu.Lock() defer f.mu.Unlock() if VFS2Enabled { + if !f.cwdVFS2.Ok() { + panic("FSContext.Fork() called after destroy") + } f.cwdVFS2.IncRef() f.rootVFS2.IncRef() } else { + if f.cwd == nil { + panic("FSContext.Fork() called after destroy") + } f.cwd.IncRef() f.root.IncRef() } @@ -140,8 +141,8 @@ func (f *FSContext) Fork() *FSContext { // WorkingDirectory returns the current working directory. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) WorkingDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() @@ -152,8 +153,8 @@ func (f *FSContext) WorkingDirectory() *fs.Dirent { // WorkingDirectoryVFS2 returns the current working directory. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() @@ -165,7 +166,7 @@ func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { // SetWorkingDirectory sets the current working directory. // This will take an extra reference on the Dirent. // -// This is not a valid call after destroy. +// This is not a valid call after f is destroyed. func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetWorkingDirectory called with nil dirent") @@ -187,11 +188,15 @@ func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) { // SetWorkingDirectoryVFS2 sets the current working directory. // This will take an extra reference on the VirtualDentry. // -// This is not a valid call after destroy. +// This is not a valid call after f is destroyed. func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) { f.mu.Lock() defer f.mu.Unlock() + if !f.cwdVFS2.Ok() { + panic(fmt.Sprintf("FSContext.SetWorkingDirectoryVFS2(%v)) called after destroy", d)) + } + old := f.cwdVFS2 f.cwdVFS2 = d d.IncRef() @@ -200,8 +205,8 @@ func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDe // RootDirectory returns the current filesystem root. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) RootDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() @@ -213,8 +218,8 @@ func (f *FSContext) RootDirectory() *fs.Dirent { // RootDirectoryVFS2 returns the current filesystem root. // -// This will return nil if called after destroy(), otherwise it will return a -// Dirent with a reference taken. +// This will return nil if called after f is destroyed, otherwise it will return +// a Dirent with a reference taken. func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() @@ -226,7 +231,7 @@ func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { // SetRootDirectory sets the root directory. // This will take an extra reference on the Dirent. // -// This is not a valid call after free. +// This is not a valid call after f is destroyed. func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetRootDirectory called with nil dirent") @@ -247,7 +252,7 @@ func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) { // SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. // -// This is not a valid call after free. +// This is not a valid call after f is destroyed. func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) { if !vd.Ok() { panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") diff --git a/pkg/sentry/kernel/fs_context_refs.go b/pkg/sentry/kernel/fs_context_refs.go new file mode 100644 index 000000000..fb2fde971 --- /dev/null +++ b/pkg/sentry/kernel/fs_context_refs.go @@ -0,0 +1,118 @@ +package kernel + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var FSContextownerType *FSContext + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type FSContextRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *FSContextRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, FSContextownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *FSContextRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*FSContextRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *FSContextRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *FSContextRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, FSContextownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *FSContextRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *FSContextRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, FSContextownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go index e514b07d6..d0ff135d7 100644 --- a/pkg/sentry/kernel/kernel_state_autogen.go +++ b/pkg/sentry/kernel/kernel_state_autogen.go @@ -122,7 +122,7 @@ func (x *FDTable) StateTypeName() string { func (x *FDTable) StateFields() []string { return []string{ - "AtomicRefCount", + "FDTableRefs", "k", "next", "used", @@ -136,7 +136,7 @@ func (x *FDTable) StateSave(m state.Sink) { x.beforeSave() var descriptorTable map[int32]descriptor = x.saveDescriptorTable() m.SaveValue(4, descriptorTable) - m.Save(0, &x.AtomicRefCount) + m.Save(0, &x.FDTableRefs) m.Save(1, &x.k) m.Save(2, &x.next) m.Save(3, &x.used) @@ -145,20 +145,43 @@ func (x *FDTable) StateSave(m state.Sink) { func (x *FDTable) afterLoad() {} func (x *FDTable) StateLoad(m state.Source) { - m.Load(0, &x.AtomicRefCount) + m.Load(0, &x.FDTableRefs) m.Load(1, &x.k) m.Load(2, &x.next) m.Load(3, &x.used) m.LoadValue(4, new(map[int32]descriptor), func(y interface{}) { x.loadDescriptorTable(y.(map[int32]descriptor)) }) } +func (x *FDTableRefs) StateTypeName() string { + return "pkg/sentry/kernel.FDTableRefs" +} + +func (x *FDTableRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *FDTableRefs) beforeSave() {} + +func (x *FDTableRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *FDTableRefs) afterLoad() {} + +func (x *FDTableRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *FSContext) StateTypeName() string { return "pkg/sentry/kernel.FSContext" } func (x *FSContext) StateFields() []string { return []string{ - "AtomicRefCount", + "FSContextRefs", "root", "rootVFS2", "cwd", @@ -171,7 +194,7 @@ func (x *FSContext) beforeSave() {} func (x *FSContext) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.AtomicRefCount) + m.Save(0, &x.FSContextRefs) m.Save(1, &x.root) m.Save(2, &x.rootVFS2) m.Save(3, &x.cwd) @@ -182,7 +205,7 @@ func (x *FSContext) StateSave(m state.Sink) { func (x *FSContext) afterLoad() {} func (x *FSContext) StateLoad(m state.Source) { - m.Load(0, &x.AtomicRefCount) + m.Load(0, &x.FSContextRefs) m.Load(1, &x.root) m.Load(2, &x.rootVFS2) m.Load(3, &x.cwd) @@ -190,6 +213,29 @@ func (x *FSContext) StateLoad(m state.Source) { m.Load(5, &x.umask) } +func (x *FSContextRefs) StateTypeName() string { + return "pkg/sentry/kernel.FSContextRefs" +} + +func (x *FSContextRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *FSContextRefs) beforeSave() {} + +func (x *FSContextRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *FSContextRefs) afterLoad() {} + +func (x *FSContextRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *IPCNamespace) StateTypeName() string { return "pkg/sentry/kernel.IPCNamespace" } @@ -643,6 +689,29 @@ func (x *processGroupEntry) StateLoad(m state.Source) { m.Load(1, &x.prev) } +func (x *ProcessGroupRefs) StateTypeName() string { + return "pkg/sentry/kernel.ProcessGroupRefs" +} + +func (x *ProcessGroupRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *ProcessGroupRefs) beforeSave() {} + +func (x *ProcessGroupRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *ProcessGroupRefs) afterLoad() {} + +func (x *ProcessGroupRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *ptraceOptions) StateTypeName() string { return "pkg/sentry/kernel.ptraceOptions" } @@ -794,13 +863,36 @@ func (x *sessionEntry) StateLoad(m state.Source) { m.Load(1, &x.prev) } +func (x *SessionRefs) StateTypeName() string { + return "pkg/sentry/kernel.SessionRefs" +} + +func (x *SessionRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *SessionRefs) beforeSave() {} + +func (x *SessionRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *SessionRefs) afterLoad() {} + +func (x *SessionRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *Session) StateTypeName() string { return "pkg/sentry/kernel.Session" } func (x *Session) StateFields() []string { return []string{ - "refs", + "SessionRefs", "leader", "id", "foreground", @@ -813,7 +905,7 @@ func (x *Session) beforeSave() {} func (x *Session) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.refs) + m.Save(0, &x.SessionRefs) m.Save(1, &x.leader) m.Save(2, &x.id) m.Save(3, &x.foreground) @@ -824,7 +916,7 @@ func (x *Session) StateSave(m state.Sink) { func (x *Session) afterLoad() {} func (x *Session) StateLoad(m state.Source) { - m.Load(0, &x.refs) + m.Load(0, &x.SessionRefs) m.Load(1, &x.leader) m.Load(2, &x.id) m.Load(3, &x.foreground) @@ -2167,7 +2259,9 @@ func init() { state.Register((*FDFlags)(nil)) state.Register((*descriptor)(nil)) state.Register((*FDTable)(nil)) + state.Register((*FDTableRefs)(nil)) state.Register((*FSContext)(nil)) + state.Register((*FSContextRefs)(nil)) state.Register((*IPCNamespace)(nil)) state.Register((*Kernel)(nil)) state.Register((*SocketEntry)(nil)) @@ -2180,11 +2274,13 @@ func init() { state.Register((*IntervalTimer)(nil)) state.Register((*processGroupList)(nil)) state.Register((*processGroupEntry)(nil)) + state.Register((*ProcessGroupRefs)(nil)) state.Register((*ptraceOptions)(nil)) state.Register((*ptraceStop)(nil)) state.Register((*OldRSeqCriticalRegion)(nil)) state.Register((*sessionList)(nil)) state.Register((*sessionEntry)(nil)) + state.Register((*SessionRefs)(nil)) state.Register((*Session)(nil)) state.Register((*ProcessGroup)(nil)) state.Register((*SignalHandlers)(nil)) diff --git a/pkg/sentry/kernel/process_group_refs.go b/pkg/sentry/kernel/process_group_refs.go new file mode 100644 index 000000000..4ed6e6458 --- /dev/null +++ b/pkg/sentry/kernel/process_group_refs.go @@ -0,0 +1,118 @@ +package kernel + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var ProcessGroupownerType *ProcessGroup + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type ProcessGroupRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *ProcessGroupRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, ProcessGroupownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *ProcessGroupRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*ProcessGroupRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *ProcessGroupRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *ProcessGroupRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, ProcessGroupownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *ProcessGroupRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *ProcessGroupRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, ProcessGroupownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/kernel/session_refs.go b/pkg/sentry/kernel/session_refs.go new file mode 100644 index 000000000..f2e1bb797 --- /dev/null +++ b/pkg/sentry/kernel/session_refs.go @@ -0,0 +1,118 @@ +package kernel + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var SessionownerType *Session + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type SessionRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *SessionRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, SessionownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *SessionRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*SessionRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *SessionRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *SessionRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, SessionownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *SessionRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *SessionRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, SessionownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 5c4c622c2..df5c8421b 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -16,8 +16,6 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" ) @@ -32,7 +30,7 @@ type ProcessGroupID ThreadID // // +stateify savable type Session struct { - refs refs.AtomicRefCount + SessionRefs // leader is the originator of the Session. // @@ -62,16 +60,11 @@ type Session struct { sessionEntry } -// incRef grabs a reference. -func (s *Session) incRef() { - s.refs.IncRef() -} - -// decRef drops a reference. +// DecRef drops a reference. // // Precondition: callers must hold TaskSet.mu for writing. -func (s *Session) decRef() { - s.refs.DecRefWithDestructor(nil, func(context.Context) { +func (s *Session) DecRef() { + s.SessionRefs.DecRef(func() { // Remove translations from the leader. for ns := s.leader.pidns; ns != nil; ns = ns.parent { id := ns.sids[s] @@ -88,7 +81,7 @@ func (s *Session) decRef() { // // +stateify savable type ProcessGroup struct { - refs refs.AtomicRefCount // not exported. + refs ProcessGroupRefs // originator is the originator of the group. // @@ -163,7 +156,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { } alive := true - pg.refs.DecRefWithDestructor(nil, func(context.Context) { + pg.refs.DecRef(func() { alive = false // don't bother with handleOrphan. // Remove translations from the originator. @@ -175,7 +168,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { // Remove the list of process groups. pg.session.processGroups.Remove(pg) - pg.session.decRef() + pg.session.DecRef() }) if alive { pg.handleOrphan() @@ -302,7 +295,7 @@ func (tg *ThreadGroup) createSession() error { id: SessionID(id), leader: tg, } - s.refs.EnableLeakCheck("kernel.Session") + s.EnableLeakCheck() // Create a new ProcessGroup, belonging to that Session. // This also has a single reference (assigned below). @@ -316,7 +309,7 @@ func (tg *ThreadGroup) createSession() error { session: s, ancestors: 0, } - pg.refs.EnableLeakCheck("kernel.ProcessGroup") + pg.refs.EnableLeakCheck() // Tie them and return the result. s.processGroups.PushBack(pg) @@ -396,13 +389,13 @@ func (tg *ThreadGroup) CreateProcessGroup() error { // // We manually adjust the ancestors if the parent is in the same // session. - tg.processGroup.session.incRef() + tg.processGroup.session.IncRef() pg := ProcessGroup{ id: ProcessGroupID(id), originator: tg, session: tg.processGroup.session, } - pg.refs.EnableLeakCheck("kernel.ProcessGroup") + pg.refs.EnableLeakCheck() if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session { pg.ancestors++ diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 13ec7afe0..00c03585e 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -39,7 +39,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -252,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } - shm.EnableLeakCheck("kernel.Shm") + shm.EnableLeakCheck() // Find the next available ID. for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { @@ -337,14 +336,14 @@ func (r *Registry) remove(s *Shm) { // // +stateify savable type Shm struct { - // AtomicRefCount tracks the number of references to this segment. + // ShmRefs tracks the number of references to this segment. // // A segment holds a reference to itself until it is marked for // destruction. // // In addition to direct users, the MemoryManager will hold references // via MappingIdentity. - refs.AtomicRefCount + ShmRefs mfp pgalloc.MemoryFileProvider @@ -428,11 +427,14 @@ func (s *Shm) InodeID() uint64 { return uint64(s.ID) } -// DecRef overrides refs.RefCount.DecRef with a destructor. +// DecRef drops a reference on s. // // Precondition: Caller must not hold s.mu. func (s *Shm) DecRef(ctx context.Context) { - s.DecRefWithDestructor(ctx, s.destroy) + s.ShmRefs.DecRef(func() { + s.mfp.MemoryFile().DecRef(s.fr) + s.registry.remove(s) + }) } // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm @@ -642,11 +644,6 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { return nil } -func (s *Shm) destroy(context.Context) { - s.mfp.MemoryFile().DecRef(s.fr) - s.registry.remove(s) -} - // MarkDestroyed marks a segment for destruction. The segment is actually // destroyed once it has no references. MarkDestroyed may be called multiple // times, and is safe to call after a segment has already been destroyed. See diff --git a/pkg/sentry/kernel/shm/shm_refs.go b/pkg/sentry/kernel/shm/shm_refs.go new file mode 100644 index 000000000..51e07d0b3 --- /dev/null +++ b/pkg/sentry/kernel/shm/shm_refs.go @@ -0,0 +1,118 @@ +package shm + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var ShmownerType *Shm + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type ShmRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *ShmRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, ShmownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *ShmRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*ShmRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *ShmRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *ShmRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, ShmownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *ShmRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *ShmRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, ShmownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/kernel/shm/shm_state_autogen.go b/pkg/sentry/kernel/shm/shm_state_autogen.go index c98632087..fa80353b6 100644 --- a/pkg/sentry/kernel/shm/shm_state_autogen.go +++ b/pkg/sentry/kernel/shm/shm_state_autogen.go @@ -47,7 +47,7 @@ func (x *Shm) StateTypeName() string { func (x *Shm) StateFields() []string { return []string{ - "AtomicRefCount", + "ShmRefs", "mfp", "registry", "ID", @@ -71,7 +71,7 @@ func (x *Shm) beforeSave() {} func (x *Shm) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.AtomicRefCount) + m.Save(0, &x.ShmRefs) m.Save(1, &x.mfp) m.Save(2, &x.registry) m.Save(3, &x.ID) @@ -93,7 +93,7 @@ func (x *Shm) StateSave(m state.Sink) { func (x *Shm) afterLoad() {} func (x *Shm) StateLoad(m state.Source) { - m.Load(0, &x.AtomicRefCount) + m.Load(0, &x.ShmRefs) m.Load(1, &x.mfp) m.Load(2, &x.registry) m.Load(3, &x.ID) @@ -112,7 +112,31 @@ func (x *Shm) StateLoad(m state.Source) { m.Load(16, &x.pendingDestruction) } +func (x *ShmRefs) StateTypeName() string { + return "pkg/sentry/kernel/shm.ShmRefs" +} + +func (x *ShmRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *ShmRefs) beforeSave() {} + +func (x *ShmRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *ShmRefs) afterLoad() {} + +func (x *ShmRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func init() { state.Register((*Registry)(nil)) state.Register((*Shm)(nil)) + state.Register((*ShmRefs)(nil)) } |