diff options
Diffstat (limited to 'pkg/sentry/fsimpl/proc')
-rw-r--r-- | pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go | 118 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go | 118 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/proc/proc_state_autogen.go | 306 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/proc/subtasks.go | 7 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/proc/subtasks_inode_refs.go | 118 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/proc/task.go | 8 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/proc/task_fds.go | 16 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/proc/task_inode_refs.go | 118 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/proc/task_net.go | 6 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/proc/tasks.go | 7 | ||||
-rw-r--r-- | pkg/sentry/fsimpl/proc/tasks_inode_refs.go | 118 |
11 files changed, 848 insertions, 92 deletions
diff --git a/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go b/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go new file mode 100644 index 000000000..9431c1506 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/fd_dir_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var fdDirInodeownerType *fdDirInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type fdDirInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *fdDirInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, fdDirInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *fdDirInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*fdDirInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *fdDirInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *fdDirInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, fdDirInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *fdDirInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *fdDirInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, fdDirInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go b/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go new file mode 100644 index 000000000..872b20eb0 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/fd_info_dir_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var fdInfoDirInodeownerType *fdInfoDirInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type fdInfoDirInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *fdInfoDirInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, fdInfoDirInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *fdInfoDirInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*fdInfoDirInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *fdInfoDirInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *fdInfoDirInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, fdInfoDirInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *fdInfoDirInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *fdInfoDirInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, fdInfoDirInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/proc/proc_state_autogen.go b/pkg/sentry/fsimpl/proc/proc_state_autogen.go index a7e43b7a7..907ef38e0 100644 --- a/pkg/sentry/fsimpl/proc/proc_state_autogen.go +++ b/pkg/sentry/fsimpl/proc/proc_state_autogen.go @@ -6,6 +6,52 @@ import ( "gvisor.dev/gvisor/pkg/state" ) +func (x *fdDirInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.fdDirInodeRefs" +} + +func (x *fdDirInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *fdDirInodeRefs) beforeSave() {} + +func (x *fdDirInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *fdDirInodeRefs) afterLoad() {} + +func (x *fdDirInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + +func (x *fdInfoDirInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.fdInfoDirInodeRefs" +} + +func (x *fdInfoDirInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *fdInfoDirInodeRefs) beforeSave() {} + +func (x *fdInfoDirInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *fdInfoDirInodeRefs) afterLoad() {} + +func (x *fdInfoDirInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *FilesystemType) StateTypeName() string { return "pkg/sentry/fsimpl/proc.FilesystemType" } @@ -31,6 +77,7 @@ func (x *subtasksInode) StateTypeName() string { func (x *subtasksInode) StateFields() []string { return []string{ + "subtasksInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeAttrs", @@ -48,31 +95,56 @@ func (x *subtasksInode) beforeSave() {} func (x *subtasksInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeAttrs) - m.Save(3, &x.OrderedChildren) - m.Save(4, &x.AlwaysValid) - m.Save(5, &x.locks) - m.Save(6, &x.fs) - m.Save(7, &x.task) - m.Save(8, &x.pidns) - m.Save(9, &x.cgroupControllers) + m.Save(0, &x.subtasksInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeAttrs) + m.Save(4, &x.OrderedChildren) + m.Save(5, &x.AlwaysValid) + m.Save(6, &x.locks) + m.Save(7, &x.fs) + m.Save(8, &x.task) + m.Save(9, &x.pidns) + m.Save(10, &x.cgroupControllers) } func (x *subtasksInode) afterLoad() {} func (x *subtasksInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeAttrs) - m.Load(3, &x.OrderedChildren) - m.Load(4, &x.AlwaysValid) - m.Load(5, &x.locks) - m.Load(6, &x.fs) - m.Load(7, &x.task) - m.Load(8, &x.pidns) - m.Load(9, &x.cgroupControllers) + m.Load(0, &x.subtasksInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeAttrs) + m.Load(4, &x.OrderedChildren) + m.Load(5, &x.AlwaysValid) + m.Load(6, &x.locks) + m.Load(7, &x.fs) + m.Load(8, &x.task) + m.Load(9, &x.pidns) + m.Load(10, &x.cgroupControllers) +} + +func (x *subtasksInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.subtasksInodeRefs" +} + +func (x *subtasksInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *subtasksInodeRefs) beforeSave() {} + +func (x *subtasksInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *subtasksInodeRefs) afterLoad() {} + +func (x *subtasksInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) } func (x *taskInode) StateTypeName() string { @@ -81,6 +153,7 @@ func (x *taskInode) StateTypeName() string { func (x *taskInode) StateFields() []string { return []string{ + "taskInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeNoDynamicLookup", @@ -95,25 +168,27 @@ func (x *taskInode) beforeSave() {} func (x *taskInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeNoDynamicLookup) - m.Save(3, &x.InodeAttrs) - m.Save(4, &x.OrderedChildren) - m.Save(5, &x.locks) - m.Save(6, &x.task) + m.Save(0, &x.taskInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeNoDynamicLookup) + m.Save(4, &x.InodeAttrs) + m.Save(5, &x.OrderedChildren) + m.Save(6, &x.locks) + m.Save(7, &x.task) } func (x *taskInode) afterLoad() {} func (x *taskInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeNoDynamicLookup) - m.Load(3, &x.InodeAttrs) - m.Load(4, &x.OrderedChildren) - m.Load(5, &x.locks) - m.Load(6, &x.task) + m.Load(0, &x.taskInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeNoDynamicLookup) + m.Load(4, &x.InodeAttrs) + m.Load(5, &x.OrderedChildren) + m.Load(6, &x.locks) + m.Load(7, &x.task) } func (x *fdDirInode) StateTypeName() string { @@ -122,6 +197,7 @@ func (x *fdDirInode) StateTypeName() string { func (x *fdDirInode) StateFields() []string { return []string{ + "fdDirInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeAttrs", @@ -135,23 +211,25 @@ func (x *fdDirInode) beforeSave() {} func (x *fdDirInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeAttrs) - m.Save(3, &x.OrderedChildren) - m.Save(4, &x.AlwaysValid) - m.Save(5, &x.fdDir) + m.Save(0, &x.fdDirInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeAttrs) + m.Save(4, &x.OrderedChildren) + m.Save(5, &x.AlwaysValid) + m.Save(6, &x.fdDir) } func (x *fdDirInode) afterLoad() {} func (x *fdDirInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeAttrs) - m.Load(3, &x.OrderedChildren) - m.Load(4, &x.AlwaysValid) - m.Load(5, &x.fdDir) + m.Load(0, &x.fdDirInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeAttrs) + m.Load(4, &x.OrderedChildren) + m.Load(5, &x.AlwaysValid) + m.Load(6, &x.fdDir) } func (x *fdSymlink) StateTypeName() string { @@ -195,6 +273,7 @@ func (x *fdInfoDirInode) StateTypeName() string { func (x *fdInfoDirInode) StateFields() []string { return []string{ + "fdInfoDirInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeAttrs", @@ -208,23 +287,25 @@ func (x *fdInfoDirInode) beforeSave() {} func (x *fdInfoDirInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeAttrs) - m.Save(3, &x.OrderedChildren) - m.Save(4, &x.AlwaysValid) - m.Save(5, &x.fdDir) + m.Save(0, &x.fdInfoDirInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeAttrs) + m.Save(4, &x.OrderedChildren) + m.Save(5, &x.AlwaysValid) + m.Save(6, &x.fdDir) } func (x *fdInfoDirInode) afterLoad() {} func (x *fdInfoDirInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeAttrs) - m.Load(3, &x.OrderedChildren) - m.Load(4, &x.AlwaysValid) - m.Load(5, &x.fdDir) + m.Load(0, &x.fdInfoDirInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeAttrs) + m.Load(4, &x.OrderedChildren) + m.Load(5, &x.AlwaysValid) + m.Load(6, &x.fdDir) } func (x *fdInfoData) StateTypeName() string { @@ -234,7 +315,6 @@ func (x *fdInfoData) StateTypeName() string { func (x *fdInfoData) StateFields() []string { return []string{ "DynamicBytesFile", - "AtomicRefCount", "task", "fd", } @@ -245,18 +325,16 @@ func (x *fdInfoData) beforeSave() {} func (x *fdInfoData) StateSave(m state.Sink) { x.beforeSave() m.Save(0, &x.DynamicBytesFile) - m.Save(1, &x.AtomicRefCount) - m.Save(2, &x.task) - m.Save(3, &x.fd) + m.Save(1, &x.task) + m.Save(2, &x.fd) } func (x *fdInfoData) afterLoad() {} func (x *fdInfoData) StateLoad(m state.Source) { m.Load(0, &x.DynamicBytesFile) - m.Load(1, &x.AtomicRefCount) - m.Load(2, &x.task) - m.Load(3, &x.fd) + m.Load(1, &x.task) + m.Load(2, &x.fd) } func (x *auxvData) StateTypeName() string { @@ -670,6 +748,29 @@ func (x *mountsData) StateLoad(m state.Source) { m.Load(1, &x.task) } +func (x *taskInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.taskInodeRefs" +} + +func (x *taskInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *taskInodeRefs) beforeSave() {} + +func (x *taskInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *taskInodeRefs) afterLoad() {} + +func (x *taskInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *ifinet6) StateTypeName() string { return "pkg/sentry/fsimpl/proc.ifinet6" } @@ -910,6 +1011,7 @@ func (x *tasksInode) StateTypeName() string { func (x *tasksInode) StateFields() []string { return []string{ + "tasksInodeRefs", "InodeNotSymlink", "InodeDirectoryNoNewChildren", "InodeAttrs", @@ -928,33 +1030,35 @@ func (x *tasksInode) beforeSave() {} func (x *tasksInode) StateSave(m state.Sink) { x.beforeSave() - m.Save(0, &x.InodeNotSymlink) - m.Save(1, &x.InodeDirectoryNoNewChildren) - m.Save(2, &x.InodeAttrs) - m.Save(3, &x.OrderedChildren) - m.Save(4, &x.AlwaysValid) - m.Save(5, &x.locks) - m.Save(6, &x.fs) - m.Save(7, &x.pidns) - m.Save(8, &x.selfSymlink) - m.Save(9, &x.threadSelfSymlink) - m.Save(10, &x.cgroupControllers) + m.Save(0, &x.tasksInodeRefs) + m.Save(1, &x.InodeNotSymlink) + m.Save(2, &x.InodeDirectoryNoNewChildren) + m.Save(3, &x.InodeAttrs) + m.Save(4, &x.OrderedChildren) + m.Save(5, &x.AlwaysValid) + m.Save(6, &x.locks) + m.Save(7, &x.fs) + m.Save(8, &x.pidns) + m.Save(9, &x.selfSymlink) + m.Save(10, &x.threadSelfSymlink) + m.Save(11, &x.cgroupControllers) } func (x *tasksInode) afterLoad() {} func (x *tasksInode) StateLoad(m state.Source) { - m.Load(0, &x.InodeNotSymlink) - m.Load(1, &x.InodeDirectoryNoNewChildren) - m.Load(2, &x.InodeAttrs) - m.Load(3, &x.OrderedChildren) - m.Load(4, &x.AlwaysValid) - m.Load(5, &x.locks) - m.Load(6, &x.fs) - m.Load(7, &x.pidns) - m.Load(8, &x.selfSymlink) - m.Load(9, &x.threadSelfSymlink) - m.Load(10, &x.cgroupControllers) + m.Load(0, &x.tasksInodeRefs) + m.Load(1, &x.InodeNotSymlink) + m.Load(2, &x.InodeDirectoryNoNewChildren) + m.Load(3, &x.InodeAttrs) + m.Load(4, &x.OrderedChildren) + m.Load(5, &x.AlwaysValid) + m.Load(6, &x.locks) + m.Load(7, &x.fs) + m.Load(8, &x.pidns) + m.Load(9, &x.selfSymlink) + m.Load(10, &x.threadSelfSymlink) + m.Load(11, &x.cgroupControllers) } func (x *statData) StateTypeName() string { @@ -1095,6 +1199,29 @@ func (x *filesystemsData) StateLoad(m state.Source) { m.Load(0, &x.DynamicBytesFile) } +func (x *tasksInodeRefs) StateTypeName() string { + return "pkg/sentry/fsimpl/proc.tasksInodeRefs" +} + +func (x *tasksInodeRefs) StateFields() []string { + return []string{ + "refCount", + } +} + +func (x *tasksInodeRefs) beforeSave() {} + +func (x *tasksInodeRefs) StateSave(m state.Sink) { + x.beforeSave() + m.Save(0, &x.refCount) +} + +func (x *tasksInodeRefs) afterLoad() {} + +func (x *tasksInodeRefs) StateLoad(m state.Source) { + m.Load(0, &x.refCount) +} + func (x *mmapMinAddrData) StateTypeName() string { return "pkg/sentry/fsimpl/proc.mmapMinAddrData" } @@ -1229,8 +1356,11 @@ func (x *tcpMemData) StateLoad(m state.Source) { } func init() { + state.Register((*fdDirInodeRefs)(nil)) + state.Register((*fdInfoDirInodeRefs)(nil)) state.Register((*FilesystemType)(nil)) state.Register((*subtasksInode)(nil)) + state.Register((*subtasksInodeRefs)(nil)) state.Register((*taskInode)(nil)) state.Register((*fdDirInode)(nil)) state.Register((*fdSymlink)(nil)) @@ -1251,6 +1381,7 @@ func init() { state.Register((*exeSymlink)(nil)) state.Register((*mountInfoData)(nil)) state.Register((*mountsData)(nil)) + state.Register((*taskInodeRefs)(nil)) state.Register((*ifinet6)(nil)) state.Register((*netDevData)(nil)) state.Register((*netUnixData)(nil)) @@ -1267,6 +1398,7 @@ func init() { state.Register((*uptimeData)(nil)) state.Register((*versionData)(nil)) state.Register((*filesystemsData)(nil)) + state.Register((*tasksInodeRefs)(nil)) state.Register((*mmapMinAddrData)(nil)) state.Register((*hostnameData)(nil)) state.Register((*tcpSackData)(nil)) diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index f25747da3..01c0efb3a 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -31,6 +31,7 @@ import ( // // +stateify savable type subtasksInode struct { + subtasksInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -57,6 +58,7 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, // Note: credentials are overridden by taskOwnedInode. subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + subInode.EnableLeakCheck() inode := &taskOwnedInode{Inode: subInode, owner: task} dentry := &kernfs.Dentry{} @@ -182,3 +184,8 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } + +// DecRef implements kernfs.Inode. +func (i *subtasksInode) DecRef(context.Context) { + i.subtasksInodeRefs.DecRef(i.Destroy) +} diff --git a/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go b/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go new file mode 100644 index 000000000..c6d9b3522 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/subtasks_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var subtasksInodeownerType *subtasksInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type subtasksInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *subtasksInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, subtasksInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *subtasksInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*subtasksInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *subtasksInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *subtasksInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, subtasksInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *subtasksInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *subtasksInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, subtasksInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 109b31b4c..66b557abd 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -32,6 +32,7 @@ import ( // // +stateify savable type taskInode struct { + taskInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeNoDynamicLookup @@ -84,6 +85,7 @@ func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace taskInode := &taskInode{task: task} // Note: credentials are overridden by taskOwnedInode. taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + taskInode.EnableLeakCheck() inode := &taskOwnedInode{Inode: taskInode, owner: task} dentry := &kernfs.Dentry{} @@ -119,6 +121,11 @@ func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, v return syserror.EPERM } +// DecRef implements kernfs.Inode. +func (i *taskInode) DecRef(context.Context) { + i.taskInodeRefs.DecRef(i.Destroy) +} + // taskOwnedInode implements kernfs.Inode and overrides inode owner with task // effective user and group. type taskOwnedInode struct { @@ -147,6 +154,7 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux. dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) + dir.EnableLeakCheck() inode := &taskOwnedInode{Inode: dir, owner: task} d := &kernfs.Dentry{} diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index e8fcb9aa1..0527b2de8 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -101,6 +100,7 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off // // +stateify savable type fdDirInode struct { + fdDirInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -120,6 +120,7 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { }, } inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -175,6 +176,11 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia return err } +// DecRef implements kernfs.Inode. +func (i *fdDirInode) DecRef(context.Context) { + i.fdDirInodeRefs.DecRef(i.Destroy) +} + // fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file. // // +stateify savable @@ -227,6 +233,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen // // +stateify savable type fdInfoDirInode struct { + fdInfoDirInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -245,6 +252,7 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry { }, } inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -282,12 +290,16 @@ func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd * return fd.VFSFileDescription(), nil } +// DecRef implements kernfs.Inode. +func (i *fdInfoDirInode) DecRef(context.Context) { + i.fdInfoDirInodeRefs.DecRef(i.Destroy) +} + // fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd]. // // +stateify savable type fdInfoData struct { kernfs.DynamicBytesFile - refs.AtomicRefCount task *kernel.Task fd int32 diff --git a/pkg/sentry/fsimpl/proc/task_inode_refs.go b/pkg/sentry/fsimpl/proc/task_inode_refs.go new file mode 100644 index 000000000..714488450 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var taskInodeownerType *taskInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type taskInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *taskInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, taskInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *taskInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*taskInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *taskInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *taskInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, taskInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *taskInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *taskInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, taskInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index a4c884bf9..4e69782c7 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -262,7 +262,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { // For now, we always redact this pointer. fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d", (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. - s.Refs()-1, // RefCount, don't count our own ref. + s.ReadRefs()-1, // RefCount, don't count our own ref. 0, // Protocol, always 0 for UDS. sockFlags, // Flags. sops.Endpoint().Type(), // Type. @@ -430,7 +430,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, // Field: refcount. Don't count the ref we obtain while deferencing // the weakref to this socket. - fmt.Fprintf(buf, "%d ", s.Refs()-1) + fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. @@ -589,7 +589,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Field: ref; reference count on the socket inode. Don't count the ref // we obtain while deferencing the weakref to this socket. - fmt.Fprintf(buf, "%d ", s.Refs()-1) + fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 1391992b7..863c4467e 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -37,6 +37,7 @@ const ( // // +stateify savable type tasksInode struct { + tasksInodeRefs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs @@ -84,6 +85,7 @@ func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace cgroupControllers: cgroupControllers, } inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) + inode.EnableLeakCheck() dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -226,6 +228,11 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St return stat, nil } +// DecRef implements kernfs.Inode. +func (i *tasksInode) DecRef(context.Context) { + i.tasksInodeRefs.DecRef(i.Destroy) +} + // staticFileSetStat implements a special static file that allows inode // attributes to be set. This is to support /proc files that are readonly, but // allow attributes to be set. diff --git a/pkg/sentry/fsimpl/proc/tasks_inode_refs.go b/pkg/sentry/fsimpl/proc/tasks_inode_refs.go new file mode 100644 index 000000000..22d9cc488 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_inode_refs.go @@ -0,0 +1,118 @@ +package proc + +import ( + "fmt" + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var tasksInodeownerType *tasksInode + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// TODO(gvisor.dev/issue/1486): Store stack traces when leak check is enabled in +// a map with 16-bit hashes, and store the hash in the top 16 bits of refCount. +// This will allow us to add stack trace information to the leak messages +// without growing the size of Refs. +// +// +stateify savable +type tasksInodeRefs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *tasksInodeRefs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, tasksInodeownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *tasksInodeRefs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*tasksInodeRefs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *tasksInodeRefs) ReadRefs() int64 { + + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *tasksInodeRefs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic(fmt.Sprintf("Incrementing non-positive ref count %p owned by %T", r, tasksInodeownerType)) + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *tasksInodeRefs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *tasksInodeRefs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic(fmt.Sprintf("Decrementing non-positive ref count %p, owned by %T", r, tasksInodeownerType)) + + case v == -1: + + if destroy != nil { + destroy() + } + } +} |