// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"sync"
	"sync/atomic"

	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
)

// A ThreadGroup is a logical grouping of tasks that has widespread
// significance to other kernel features (e.g. signal handling). ("Thread
// groups" are usually called "processes" in userspace documentation.)
//
// ThreadGroup is a superset of Linux's struct signal_struct.
//
// +stateify savable
type ThreadGroup struct {
	threadGroupNode

	// signalHandlers is the set of signal handlers used by every task in this
	// thread group. (signalHandlers may also be shared with other thread
	// groups.)
	//
	// signalHandlers.mu (hereafter "the signal mutex") protects state related
	// to signal handling, as well as state that usually needs to be atomic
	// with signal handling, for all ThreadGroups and Tasks using
	// signalHandlers. (This is analogous to Linux's use of struct
	// sighand_struct::siglock.)
	//
	// The signalHandlers pointer can only be mutated during an execve
	// (Task.finishExec). Consequently, when it's possible for a task in the
	// thread group to be completing an execve, signalHandlers is protected by
	// the owning TaskSet.mu. Otherwise, it is possible to read the
	// signalHandlers pointer without synchronization. In particular,
	// completing an execve requires that all other tasks in the thread group
	// have exited, so task goroutines do not need the owning TaskSet.mu to
	// read the signalHandlers pointer of their thread groups.
	signalHandlers *SignalHandlers

	// pendingSignals is the set of pending signals that may be handled by any
	// task in this thread group.
	//
	// pendingSignals is protected by the signal mutex.
	pendingSignals pendingSignals

	// lastTimerSignalTask records the last task to which a process timer
	// signal was delivered. Please see SendTimerSignal for more details.
	//
	// lastTimerSignalTask is protected by the signal mutex.
	lastTimerSignalTask *Task

	// groupStopPhase indicates the state of a group stop in progress on the
	// thread group, if any.
	//
	// groupStopPhase is protected by the signal mutex.
	groupStopPhase groupStopPhase

	// groupStopSignal is the signal that caused a group stop to be initiated.
	// groupStopSignal is only meaningful if groupStopPhase is
	// groupStopInitiated or groupStopComplete.
	//
	// groupStopSignal is protected by the signal mutex.
	groupStopSignal linux.Signal

	// groupStopCount is the number of non-exited tasks in the thread group
	// that have acknowledged an initiated group stop. groupStopCount is only
	// meaningful if groupStopPhase is groupStopInitiated.
	//
	// groupStopCount is protected by the signal mutex.
	groupStopCount int

	// If groupStopWaitable is true, the thread group is indicating a waitable
	// group stop event (as defined by EventChildGroupStop).
	//
	// Linux represents the analogous state as SIGNAL_STOP_STOPPED being set
	// and group_exit_code being non-zero.
	//
	// groupStopWaitable is protected by the signal mutex.
	groupStopWaitable bool

	// If groupContNotify is true, then a SIGCONT has recently ended a group
	// stop on this thread group, and the first task to observe it should
	// notify its parent.
	//
	// groupContNotify is protected by the signal mutex.
	groupContNotify bool

	// If groupContNotify is true, groupContInterrupted is true iff SIGCONT
	// ended a group stop in phase groupStopInitiated. If groupContNotify is
	// false, groupContInterrupted is meaningless.
	//
	// Analogues in Linux:
	//
	// - groupContNotify && groupContInterrupted is represented by
	// SIGNAL_CLD_STOPPED.
	//
	// - groupContNotify && !groupContInterrupted is represented by
	// SIGNAL_CLD_CONTINUED.
	//
	// - !groupContNotify is represented by neither flag being set.
	//
	// groupContInterrupted is protected by the signal mutex.
	groupContInterrupted bool

	// If groupContWaitable is true, the thread group is indicating a waitable
	// continue event (as defined by EventGroupContinue).
	//
	// groupContWaitable is analogous to Linux's SIGNAL_STOP_CONTINUED.
	//
	// groupContWaitable is protected by the signal mutex.
	groupContWaitable bool

	// exiting is true if all tasks in the ThreadGroup should exit. exiting is
	// analogous to Linux's SIGNAL_GROUP_EXIT.
	//
	// exiting is protected by the signal mutex. exiting can only transition
	// from false to true.
	exiting bool

	// exitStatus is the thread group's exit status.
	//
	// While exiting is false, exitStatus is protected by the signal mutex.
	// When exiting becomes true, exitStatus becomes immutable.
	exitStatus ExitStatus

	// terminationSignal is the signal that this thread group's leader will
	// send to its parent when it exits.
	//
	// terminationSignal is protected by the TaskSet mutex.
	terminationSignal linux.Signal

	// liveGoroutines is the number of non-exited task goroutines in the thread
	// group.
	//
	// liveGoroutines is not saved; it is reset as task goroutines are
	// restarted by Task.Start.
	liveGoroutines sync.WaitGroup `state:"nosave"`

	// tm contains process timers. TimerManager fields are immutable.
	tm TimerManager

	// timers is the thread group's POSIX interval timers. nextTimerID is the
	// TimerID at which allocation should begin searching for an unused ID.
	//
	// timers and nextTimerID are protected by timerMu.
	timerMu     sync.Mutex `state:"nosave"`
	timers      map[linux.TimerID]*IntervalTimer
	nextTimerID linux.TimerID

	// exitedCPUStats is the CPU usage for all exited tasks in the thread
	// group. exitedCPUStats is protected by the TaskSet mutex.
	exitedCPUStats usage.CPUStats

	// childCPUStats is the CPU usage of all joined descendants of this thread
	// group. childCPUStats is protected by the TaskSet mutex.
	childCPUStats usage.CPUStats

	// ioUsage is the I/O usage for all exited tasks in the thread group.
	// The ioUsage pointer is immutable.
	ioUsage *usage.IO

	// maxRSS is the historical maximum resident set size of the thread group,
	// updated when:
	//
	// - A task in the thread group exits, since after all tasks have
	// exited the MemoryManager is no longer reachable.
	//
	// - The thread group completes an execve, since this changes
	// MemoryManagers.
	//
	// maxRSS is protected by the TaskSet mutex.
	maxRSS uint64

	// childMaxRSS is the maximum resident set size in bytes of all joined
	// descendants of this thread group.
	//
	// childMaxRSS is protected by the TaskSet mutex.
	childMaxRSS uint64

	// Resource limits for this ThreadGroup. The limits pointer is immutable.
	limits *limits.LimitSet

	// processGroup is the processGroup for this thread group.
	//
	// processGroup is protected by the TaskSet mutex.
	processGroup *ProcessGroup

	// execed indicates an exec has occurred since creation. This will be
	// set by finishExec, and new ThreadGroups will have this field cleared.
	// When execed is set, the processGroup may no longer be changed.
	//
	// execed is protected by the TaskSet mutex.
	execed bool

	// rscr is the thread group's RSEQ critical region. It is held in an
	// atomic.Value and saved/restored as a *RSEQCriticalRegion, per the state
	// tag below.
	rscr atomic.Value `state:".(*RSEQCriticalRegion)"`
}

// NewThreadGroup returns a new, empty thread group in PID namespace ns. The
// thread group leader will send its parent terminationSignal when it exits.
// The new thread group isn't visible to the system until a task has been // created inside of it by a successful call to TaskSet.NewTask. func NewThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup { tg := &ThreadGroup{ threadGroupNode: threadGroupNode{ pidns: ns, }, signalHandlers: sh, terminationSignal: terminationSignal, ioUsage: &usage.IO{}, limits: limits, } tg.tm = newTimerManager(tg, monotonicClock) tg.timers = make(map[linux.TimerID]*IntervalTimer) tg.rscr.Store(&RSEQCriticalRegion{}) return tg } // saveRscr is invopked by stateify. func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion { return tg.rscr.Load().(*RSEQCriticalRegion) } // loadRscr is invoked by stateify. func (tg *ThreadGroup) loadRscr(rscr *RSEQCriticalRegion) { tg.rscr.Store(rscr) } // SignalHandlers returns the signal handlers used by tg. // // Preconditions: The caller must provide the synchronization required to read // tg.signalHandlers, as described in the field's comment. func (tg *ThreadGroup) SignalHandlers() *SignalHandlers { return tg.signalHandlers } // Timer returns tg's timers. func (tg *ThreadGroup) Timer() *TimerManager { return &tg.tm } // Limits returns tg's limits. func (tg *ThreadGroup) Limits() *limits.LimitSet { return tg.limits } // release releases the thread group's resources. func (tg *ThreadGroup) release() { // These must be done without holding the TaskSet or signal mutexes since // timers send signals with Timer.mu locked. tg.tm.destroy() var its []*IntervalTimer tg.pidns.owner.mu.Lock() tg.signalHandlers.mu.Lock() for _, it := range tg.timers { its = append(its, it) } tg.timers = make(map[linux.TimerID]*IntervalTimer) // nil maps can't be saved tg.signalHandlers.mu.Unlock() tg.pidns.owner.mu.Unlock() for _, it := range its { it.DestroyTimer() } } // forEachChildThreadGroupLocked indicates over all child ThreadGroups. // // Precondition: TaskSet.mu must be held. 
func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) {
	// Walk every task in this thread group and inspect each of its children.
	for t := tg.tasks.Front(); t != nil; t = t.Next() {
		for childTask := range t.children {
			// Only a child that is the leader of its own thread group
			// triggers fn; all other children are skipped.
			if childTask.tg.leader != childTask {
				continue
			}
			fn(childTask.tg)
		}
	}
}