summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/kernel/thread_group.go
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/kernel/thread_group.go')
-rw-r--r--pkg/sentry/kernel/thread_group.go330
1 files changed, 330 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
new file mode 100644
index 000000000..8bd53928e
--- /dev/null
+++ b/pkg/sentry/kernel/thread_group.go
@@ -0,0 +1,330 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+ "sync"
+ "sync/atomic"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+)
+
+// A ThreadGroup is a logical grouping of tasks that has widespread
+// significance to other kernel features (e.g. signal handling). ("Thread
+// groups" are usually called "processes" in userspace documentation.)
+//
+// ThreadGroup is a superset of Linux's struct signal_struct.
+//
+// +stateify savable
+type ThreadGroup struct {
+ threadGroupNode
+
+ // signalHandlers is the set of signal handlers used by every task in this
+ // thread group. (signalHandlers may also be shared with other thread
+ // groups.)
+ //
+ // signalHandlers.mu (hereafter "the signal mutex") protects state related
+ // to signal handling, as well as state that usually needs to be atomic
+ // with signal handling, for all ThreadGroups and Tasks using
+ // signalHandlers. (This is analogous to Linux's use of struct
+ // sighand_struct::siglock.)
+ //
+ // The signalHandlers pointer can only be mutated during an execve
+ // (Task.finishExec). Consequently, when it's possible for a task in the
+ // thread group to be completing an execve, signalHandlers is protected by
+ // the owning TaskSet.mu. Otherwise, it is possible to read the
+ // signalHandlers pointer without synchronization. In particular,
+ // completing an execve requires that all other tasks in the thread group
+ // have exited, so task goroutines do not need the owning TaskSet.mu to
+ // read the signalHandlers pointer of their thread groups.
+ signalHandlers *SignalHandlers
+
+ // pendingSignals is the set of pending signals that may be handled by any
+ // task in this thread group.
+ //
+ // pendingSignals is protected by the signal mutex.
+ pendingSignals pendingSignals
+
+ // If groupStopDequeued is true, a task in the thread group has dequeued a
+ // stop signal, but has not yet initiated the group stop.
+ //
+ // groupStopDequeued is analogous to Linux's JOBCTL_STOP_DEQUEUED.
+ //
+ // groupStopDequeued is protected by the signal mutex.
+ groupStopDequeued bool
+
+ // groupStopSignal is the signal that caused a group stop to be initiated.
+ //
+ // groupStopSignal is protected by the signal mutex.
+ groupStopSignal linux.Signal
+
+ // groupStopPendingCount is the number of active tasks in the thread group
+ // for which Task.groupStopPending is set.
+ //
+ // groupStopPendingCount is analogous to Linux's
+ // signal_struct::group_stop_count.
+ //
+ // groupStopPendingCount is protected by the signal mutex.
+ groupStopPendingCount int
+
+ // If groupStopComplete is true, groupStopPendingCount transitioned from
+ // non-zero to zero without an intervening SIGCONT.
+ //
+ // groupStopComplete is analogous to Linux's SIGNAL_STOP_STOPPED.
+ //
+ // groupStopComplete is protected by the signal mutex.
+ groupStopComplete bool
+
+ // If groupStopWaitable is true, the thread group is indicating a waitable
+ // group stop event (as defined by EventChildGroupStop).
+ //
+ // Linux represents the analogous state as SIGNAL_STOP_STOPPED being set
+ // and group_exit_code being non-zero.
+ //
+ // groupStopWaitable is protected by the signal mutex.
+ groupStopWaitable bool
+
+ // If groupContNotify is true, then a SIGCONT has recently ended a group
+ // stop on this thread group, and the first task to observe it should
+ // notify its parent. groupContInterrupted is true iff SIGCONT ended an
+ // incomplete group stop. If groupContNotify is false, groupContInterrupted is
+ // meaningless.
+ //
+ // Analogues in Linux:
+ //
+ // - groupContNotify && groupContInterrupted is represented by
+ // SIGNAL_CLD_STOPPED.
+ //
+ // - groupContNotify && !groupContInterrupted is represented by
+ // SIGNAL_CLD_CONTINUED.
+ //
+ // - !groupContNotify is represented by neither flag being set.
+ //
+ // groupContNotify and groupContInterrupted are protected by the signal
+ // mutex.
+ groupContNotify bool
+ groupContInterrupted bool
+
+ // If groupContWaitable is true, the thread group is indicating a waitable
+ // continue event (as defined by EventGroupContinue).
+ //
+ // groupContWaitable is analogous to Linux's SIGNAL_STOP_CONTINUED.
+ //
+ // groupContWaitable is protected by the signal mutex.
+ groupContWaitable bool
+
+ // exiting is true if all tasks in the ThreadGroup should exit. exiting is
+ // analogous to Linux's SIGNAL_GROUP_EXIT.
+ //
+ // exiting is protected by the signal mutex. exiting can only transition
+ // from false to true.
+ exiting bool
+
+ // exitStatus is the thread group's exit status.
+ //
+ // While exiting is false, exitStatus is protected by the signal mutex.
+ // When exiting becomes true, exitStatus becomes immutable.
+ exitStatus ExitStatus
+
+ // terminationSignal is the signal that this thread group's leader will
+ // send to its parent when it exits.
+ //
+ // terminationSignal is protected by the TaskSet mutex.
+ terminationSignal linux.Signal
+
+ // liveGoroutines is the number of non-exited task goroutines in the thread
+ // group.
+ //
+ // liveGoroutines is not saved; it is reset as task goroutines are
+ // restarted by Task.Start.
+ liveGoroutines sync.WaitGroup `state:"nosave"`
+
+ timerMu sync.Mutex `state:"nosave"`
+
+ // itimerRealTimer implements ITIMER_REAL for the thread group.
+ itimerRealTimer *ktime.Timer
+
+ // itimerVirtSetting is the ITIMER_VIRTUAL setting for the thread group.
+ //
+ // itimerVirtSetting is protected by the signal mutex.
+ itimerVirtSetting ktime.Setting
+
+ // itimerProfSetting is the ITIMER_PROF setting for the thread group.
+ //
+ // itimerProfSetting is protected by the signal mutex.
+ itimerProfSetting ktime.Setting
+
+ // rlimitCPUSoftSetting is the setting for RLIMIT_CPU soft limit
+ // notifications for the thread group.
+ //
+ // rlimitCPUSoftSetting is protected by the signal mutex.
+ rlimitCPUSoftSetting ktime.Setting
+
+ // cpuTimersEnabled is non-zero if itimerVirtSetting.Enabled is true,
+ // itimerProfSetting.Enabled is true, rlimitCPUSoftSetting.Enabled is true,
+ // or limits.Get(CPU) is finite.
+ //
+ // cpuTimersEnabled is protected by the signal mutex. cpuTimersEnabled is
+ // accessed using atomic memory operations.
+ cpuTimersEnabled uint32
+
+ // timers is the thread group's POSIX interval timers. nextTimerID is the
+ // TimerID at which allocation should begin searching for an unused ID.
+ //
+ // timers and nextTimerID are protected by timerMu.
+ timers map[linux.TimerID]*IntervalTimer
+ nextTimerID linux.TimerID
+
+ // exitedCPUStats is the CPU usage for all exited tasks in the thread
+ // group. exitedCPUStats is protected by the TaskSet mutex.
+ exitedCPUStats usage.CPUStats
+
+ // childCPUStats is the CPU usage of all joined descendants of this thread
+ // group. childCPUStats is protected by the TaskSet mutex.
+ childCPUStats usage.CPUStats
+
+ // ioUsage is the I/O usage for all exited tasks in the thread group.
+ // The ioUsage pointer is immutable.
+ ioUsage *usage.IO
+
+ // maxRSS is the historical maximum resident set size of the thread group, updated when:
+ //
+ // - A task in the thread group exits, since after all tasks have
+ // exited the MemoryManager is no longer reachable.
+ //
+ // - The thread group completes an execve, since this changes
+ // MemoryManagers.
+ //
+ // maxRSS is protected by the TaskSet mutex.
+ maxRSS uint64
+
+ // childMaxRSS is the maximum resident set size in bytes of all joined
+ // descendants of this thread group.
+ //
+ // childMaxRSS is protected by the TaskSet mutex.
+ childMaxRSS uint64
+
+ // Resource limits for this ThreadGroup. The limits pointer is immutable.
+ limits *limits.LimitSet
+
+ // processGroup is the processGroup for this thread group.
+ //
+ // processGroup is protected by the TaskSet mutex.
+ processGroup *ProcessGroup
+
+ // execed indicates an exec has occurred since creation. This will be
+ // set by finishExec, and new TheadGroups will have this field cleared.
+ // When execed is set, the processGroup may no longer be changed.
+ //
+ // execed is protected by the TaskSet mutex.
+ execed bool
+
+ // rscr is the thread group's RSEQ critical region.
+ rscr atomic.Value `state:".(*RSEQCriticalRegion)"`
+}
+
+// newThreadGroup returns a new, empty thread group in PID namespace ns. The
+// thread group leader will send its parent terminationSignal when it exits.
+// The new thread group isn't visible to the system until a task has been
+// created inside of it by a successful call to TaskSet.NewTask.
+func (k *Kernel) newThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
+ tg := &ThreadGroup{
+ threadGroupNode: threadGroupNode{
+ pidns: ns,
+ },
+ signalHandlers: sh,
+ terminationSignal: terminationSignal,
+ ioUsage: &usage.IO{},
+ limits: limits,
+ }
+ tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg})
+ tg.timers = make(map[linux.TimerID]*IntervalTimer)
+ tg.rscr.Store(&RSEQCriticalRegion{})
+ return tg
+}
+
+// saveRscr is invopked by stateify.
+func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion {
+ return tg.rscr.Load().(*RSEQCriticalRegion)
+}
+
+// loadRscr is invoked by stateify.
+func (tg *ThreadGroup) loadRscr(rscr *RSEQCriticalRegion) {
+ tg.rscr.Store(rscr)
+}
+
+// SignalHandlers returns the signal handlers used by tg.
+//
+// Preconditions: The caller must provide the synchronization required to read
+// tg.signalHandlers, as described in the field's comment.
+func (tg *ThreadGroup) SignalHandlers() *SignalHandlers {
+ return tg.signalHandlers
+}
+
+// Limits returns tg's limits.
+func (tg *ThreadGroup) Limits() *limits.LimitSet {
+ return tg.limits
+}
+
+// release releases the thread group's resources.
+func (tg *ThreadGroup) release() {
+ // Timers must be destroyed without holding the TaskSet or signal mutexes
+ // since timers send signals with Timer.mu locked.
+ tg.itimerRealTimer.Destroy()
+ var its []*IntervalTimer
+ tg.pidns.owner.mu.Lock()
+ tg.signalHandlers.mu.Lock()
+ for _, it := range tg.timers {
+ its = append(its, it)
+ }
+ tg.timers = make(map[linux.TimerID]*IntervalTimer) // nil maps can't be saved
+ tg.signalHandlers.mu.Unlock()
+ tg.pidns.owner.mu.Unlock()
+ for _, it := range its {
+ it.DestroyTimer()
+ }
+}
+
+// forEachChildThreadGroupLocked indicates over all child ThreadGroups.
+//
+// Precondition: TaskSet.mu must be held.
+func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) {
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ for child := range t.children {
+ if child == child.tg.leader {
+ fn(child.tg)
+ }
+ }
+ }
+}
+
+// itimerRealListener implements ktime.Listener for ITIMER_REAL expirations.
+//
+// +stateify savable
+type itimerRealListener struct {
+ tg *ThreadGroup
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (l *itimerRealListener) Notify(exp uint64) {
+ l.tg.SendSignal(SignalInfoPriv(linux.SIGALRM))
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (l *itimerRealListener) Destroy() {
+}