path: root/pkg/sentry/kernel
author     Jamie Liu <jamieliu@google.com>     2018-10-17 15:48:55 -0700
committer  Shentubot <shentubot@google.com>    2018-10-17 15:50:02 -0700
commit     b2a88ff4713325fca736f6a3bf200be02d2d72a7 (patch)
tree       489e54828c2bfe0bf326976920e5d5e612f877a0 /pkg/sentry/kernel
parent     6922eee6499212a009fdc254224f916bd1c46f29 (diff)
Check thread group CPU timers in the CPU clock ticker.
This reduces the number of goroutines and runtime timers when ITIMER_VIRTUAL
or ITIMER_PROF are enabled, or when RLIMIT_CPU is set. This also ensures that
thread group CPU timers only advance if running tasks are observed at the time
the CPU clock advances, mostly eliminating the possibility that a CPU timer
expiration observes no running tasks and falls back to the group leader.

PiperOrigin-RevId: 217603396
Change-Id: Ia24ce934d5574334857d9afb5ad8ca0b6a6e65f4
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r--  pkg/sentry/kernel/BUILD              |   1
-rw-r--r--  pkg/sentry/kernel/kernel.go          |  46
-rw-r--r--  pkg/sentry/kernel/task_acct.go       |  97
-rw-r--r--  pkg/sentry/kernel/task_clone.go      |   2
-rw-r--r--  pkg/sentry/kernel/task_exit.go       |   3
-rw-r--r--  pkg/sentry/kernel/task_run.go        |   4
-rw-r--r--  pkg/sentry/kernel/task_sched.go      | 344
-rw-r--r--  pkg/sentry/kernel/task_signals.go    |  66
-rw-r--r--  pkg/sentry/kernel/thread_group.go    |  73
-rw-r--r--  pkg/sentry/kernel/threads.go         |   6
-rw-r--r--  pkg/sentry/kernel/time/time.go       |  35
-rw-r--r--  pkg/sentry/kernel/timekeeper.go      |  26
-rw-r--r--  pkg/sentry/kernel/timer.go           | 290
13 files changed, 538 insertions, 455 deletions
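
For orientation, the overall shape of the change is that a single kernel-wide
ticker advances a coarse CPU clock and, on each tick, evaluates per-thread-group
CPU timer state, instead of each thread group owning its own timer goroutines.
A minimal standalone sketch of that pattern follows; the group type and its
nextExpiry field are hypothetical stand-ins for illustration, not gVisor code.

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// group is a stand-in for a thread group; nextExpiry is the CPU-clock tick at
// which its timer fires (0 means no timer armed). Hypothetical field, for
// illustration only.
type group struct {
	nextExpiry uint64
}

func main() {
	var cpuClock uint64
	groups := []*group{{nextExpiry: 3}, {nextExpiry: 0}}

	tick := time.NewTicker(10 * time.Millisecond)
	defer tick.Stop()
	for i := 0; i < 5; i++ {
		<-tick.C
		// Advance the CPU clock by exactly 1 per tick, regardless of how many
		// wall-clock ticks may have been missed; this mirrors the commit's
		// rationale for avoiding large discontinuous jumps in CPU clocks.
		now := atomic.AddUint64(&cpuClock, 1)
		for _, g := range groups {
			if g.nextExpiry != 0 && now >= g.nextExpiry {
				fmt.Printf("tick %d: group timer expired\n", now)
				g.nextExpiry = 0 // one-shot in this sketch
			}
		}
	}
}
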
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index acc61cb09..e2fb61ba6 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -111,7 +111,6 @@ go_library(
"threads.go",
"timekeeper.go",
"timekeeper_state.go",
- "timer.go",
"uts_namespace.go",
"vdso.go",
"version.go",
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 84afdb530..5d6856f3c 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -20,7 +20,7 @@
//
// Kernel.extMu
// ThreadGroup.timerMu
-// ktime.Timer.mu (for IntervalTimer)
+// ktime.Timer.mu (for kernelCPUClockTicker and IntervalTimer)
// TaskSet.mu
// SignalHandlers.mu
// Task.mu
@@ -617,7 +617,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
return nil, 0, fmt.Errorf("no kernel MountNamespace")
}
- tg := NewThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
+ tg := k.newThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
ctx := args.NewContext(k)
// Grab the root directory.
@@ -705,7 +705,7 @@ func (k *Kernel) Start() error {
}
k.started = true
- k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, kernelCPUClockListener{k})
+ k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, newKernelCPUClockTicker(k))
k.cpuClockTicker.Swap(ktime.Setting{
Enabled: true,
Period: linux.ClockTick,
@@ -741,14 +741,13 @@ func (k *Kernel) pauseTimeLocked() {
// mutex, while holding the Timer mutex.)
for t := range k.tasks.Root.tids {
if t == t.tg.leader {
- t.tg.tm.pause()
- }
- // This means we'll iterate ThreadGroups and FDMaps shared by multiple
- // tasks repeatedly, but ktime.Timer.Pause is idempotent so this is
- // harmless.
- for _, it := range t.tg.timers {
- it.PauseTimer()
+ t.tg.itimerRealTimer.Pause()
+ for _, it := range t.tg.timers {
+ it.PauseTimer()
+ }
}
+ // This means we'll iterate FDMaps shared by multiple tasks repeatedly,
+ // but ktime.Timer.Pause is idempotent so this is harmless.
if fdm := t.fds; fdm != nil {
for _, desc := range fdm.files {
if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok {
@@ -774,10 +773,10 @@ func (k *Kernel) resumeTimeLocked() {
k.timekeeper.ResumeUpdates()
for t := range k.tasks.Root.tids {
if t == t.tg.leader {
- t.tg.tm.resume()
- }
- for _, it := range t.tg.timers {
- it.ResumeTimer()
+ t.tg.itimerRealTimer.Resume()
+ for _, it := range t.tg.timers {
+ it.ResumeTimer()
+ }
}
if fdm := t.fds; fdm != nil {
for _, desc := range fdm.files {
@@ -1078,22 +1077,3 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
return nil
}
}
-
-type kernelCPUClockListener struct {
- k *Kernel
-}
-
-// Notify implements ktime.TimerListener.Notify.
-func (l kernelCPUClockListener) Notify(exp uint64) {
- // Only increment cpuClock by 1 regardless of the number of expirations.
- // This approximately compensates for cases where thread throttling or bad
- // Go runtime scheduling prevents the cpuClockTicker goroutine, and
- // presumably task goroutines as well, from executing for a long period of
- // time. It's also necessary to prevent CPU clocks from seeing large
- // discontinuous jumps.
- atomic.AddUint64(&l.k.cpuClock, 1)
-}
-
-// Destroy implements ktime.TimerListener.Destroy.
-func (l kernelCPUClockListener) Destroy() {
-}
diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go
index ce12cdb64..d2052921e 100644
--- a/pkg/sentry/kernel/task_acct.go
+++ b/pkg/sentry/kernel/task_acct.go
@@ -21,8 +21,99 @@ import (
ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
)
+// Getitimer implements getitimer(2).
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) Getitimer(id int32) (linux.ItimerVal, error) {
+ var tm ktime.Time
+ var s ktime.Setting
+ switch id {
+ case linux.ITIMER_REAL:
+ tm, s = t.tg.itimerRealTimer.Get()
+ case linux.ITIMER_VIRTUAL:
+ tm = t.tg.UserCPUClock().Now()
+ t.tg.signalHandlers.mu.Lock()
+ s, _ = t.tg.itimerVirtSetting.At(tm)
+ t.tg.signalHandlers.mu.Unlock()
+ case linux.ITIMER_PROF:
+ tm = t.tg.CPUClock().Now()
+ t.tg.signalHandlers.mu.Lock()
+ s, _ = t.tg.itimerProfSetting.At(tm)
+ t.tg.signalHandlers.mu.Unlock()
+ default:
+ return linux.ItimerVal{}, syserror.EINVAL
+ }
+ val, iv := ktime.SpecFromSetting(tm, s)
+ return linux.ItimerVal{
+ Value: linux.DurationToTimeval(val),
+ Interval: linux.DurationToTimeval(iv),
+ }, nil
+}
+
+// Setitimer implements setitimer(2).
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) Setitimer(id int32, newitv linux.ItimerVal) (linux.ItimerVal, error) {
+ var tm ktime.Time
+ var olds ktime.Setting
+ switch id {
+ case linux.ITIMER_REAL:
+ news, err := ktime.SettingFromSpec(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), t.tg.itimerRealTimer.Clock())
+ if err != nil {
+ return linux.ItimerVal{}, err
+ }
+ tm, olds = t.tg.itimerRealTimer.Swap(news)
+ case linux.ITIMER_VIRTUAL:
+ c := t.tg.UserCPUClock()
+ var err error
+ t.k.cpuClockTicker.Atomically(func() {
+ tm = c.Now()
+ var news ktime.Setting
+ news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm)
+ if err != nil {
+ return
+ }
+ t.tg.signalHandlers.mu.Lock()
+ olds = t.tg.itimerVirtSetting
+ t.tg.itimerVirtSetting = news
+ t.tg.updateCPUTimersEnabledLocked()
+ t.tg.signalHandlers.mu.Unlock()
+ })
+ if err != nil {
+ return linux.ItimerVal{}, err
+ }
+ case linux.ITIMER_PROF:
+ c := t.tg.CPUClock()
+ var err error
+ t.k.cpuClockTicker.Atomically(func() {
+ tm = c.Now()
+ var news ktime.Setting
+ news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm)
+ if err != nil {
+ return
+ }
+ t.tg.signalHandlers.mu.Lock()
+ olds = t.tg.itimerProfSetting
+ t.tg.itimerProfSetting = news
+ t.tg.updateCPUTimersEnabledLocked()
+ t.tg.signalHandlers.mu.Unlock()
+ })
+ if err != nil {
+ return linux.ItimerVal{}, err
+ }
+ default:
+ return linux.ItimerVal{}, syserror.EINVAL
+ }
+ oldval, oldiv := ktime.SpecFromSetting(tm, olds)
+ return linux.ItimerVal{
+ Value: linux.DurationToTimeval(oldval),
+ Interval: linux.DurationToTimeval(oldiv),
+ }, nil
+}
+
// IOUsage returns the io usage of the thread.
func (t *Task) IOUsage() *usage.IO {
return t.ioUsage
@@ -56,12 +147,6 @@ func (t *Task) SetName(name string) {
t.Debugf("Set thread name to %q", name)
}
-// SetCPUTimer is used by setrlimit(RLIMIT_CPU) to enforce the hard and soft
-// limits on CPU time used by this process.
-func (tg *ThreadGroup) SetCPUTimer(l *limits.Limit) {
- tg.Timer().applyCPULimits(*l)
-}
-
// Limits implements context.Context.Limits.
func (t *Task) Limits() *limits.LimitSet {
return t.ThreadGroup().Limits()
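
The Getitimer/Setitimer paths added above convert between the relative
(value, interval) pair that the itimer syscalls speak and an absolute
(next, period) setting evaluated against a clock reading, via
ktime.SettingFromSpecAt and ktime.SpecFromSetting. Below is a simplified,
self-contained sketch of that conversion; the setting type and both helper
functions are stand-ins written for this example, not the ktime package.

package main

import (
	"errors"
	"fmt"
	"time"
)

// setting is a simplified stand-in for ktime.Setting: an absolute next-expiry
// time plus a period, both measured against some clock's epoch.
type setting struct {
	enabled bool
	next    time.Duration // absolute time of next expiration
	period  time.Duration
}

// settingFromSpecAt mirrors the shape of ktime.SettingFromSpecAt: value is
// interpreted relative to now, and a zero value leaves the timer disabled.
func settingFromSpecAt(value, interval, now time.Duration) (setting, error) {
	if value < 0 || interval < 0 {
		return setting{}, errors.New("EINVAL")
	}
	if value == 0 {
		return setting{period: interval}, nil // disabled
	}
	return setting{enabled: true, next: now + value, period: interval}, nil
}

// specFromSetting converts back to the relative form reported by getitimer(2).
func specFromSetting(now time.Duration, s setting) (value, interval time.Duration) {
	if !s.enabled {
		return 0, s.period
	}
	return s.next - now, s.period
}

func main() {
	now := 5 * time.Second
	s, _ := settingFromSpecAt(2*time.Second, time.Second, now)
	v, iv := specFromSetting(now+500*time.Millisecond, s)
	fmt.Println(v, iv) // 1.5s 1s
}
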
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 7c469ec46..de3aef40d 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -241,7 +241,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
if opts.NewSignalHandlers {
sh = sh.Fork()
}
- tg = NewThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock)
+ tg = t.k.newThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock)
}
cfg := &TaskConfig{
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index f5b45fb17..65969ca9b 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -675,9 +675,6 @@ func (t *Task) exitNotifyLocked(fromPtraceDetach bool) {
t.tg.ioUsage.Accumulate(t.ioUsage)
t.tg.signalHandlers.mu.Lock()
t.tg.tasks.Remove(t)
- if t.tg.lastTimerSignalTask == t {
- t.tg.lastTimerSignalTask = nil
- }
t.tg.tasksCount--
tc := t.tg.tasksCount
t.tg.signalHandlers.mu.Unlock()
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index 8dd0ef6ea..49ac933b7 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -70,10 +70,6 @@ func (t *Task) run(threadID uintptr) {
// Platform.CooperativelySharesAddressSpace() == true, we give up the
// AddressSpace before the task goroutine finishes executing.
- // Ensure that thread group timers for execution time reflect that this
- // task now exists.
- t.tg.tm.kick()
-
// If this is a newly-started task, it should check for participation in
// group stops. If this is a task resuming after restore, it was
// interrupted by saving. In either case, the task is initially
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 49141ab74..19dcc963a 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -18,12 +18,15 @@ package kernel
import (
"fmt"
+ "math/rand"
"sync/atomic"
"time"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
@@ -84,6 +87,33 @@ type TaskGoroutineSchedInfo struct {
SysTicks uint64
}
+// userTicksAt returns the extrapolated value of ts.UserTicks after
+// Kernel.CPUClockNow() indicates a time of now.
+//
+// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is
+// monotonic, this is satisfied if now is the result of a previous call to
+// Kernel.CPUClockNow().) This requirement exists because otherwise a racing
+// change to t.gosched can cause userTicksAt to adjust stats by too much,
+// making the observed stats non-monotonic.
+func (ts *TaskGoroutineSchedInfo) userTicksAt(now uint64) uint64 {
+ if ts.Timestamp < now && ts.State == TaskGoroutineRunningApp {
+ // Update stats to reflect execution since the last update.
+ return ts.UserTicks + (now - ts.Timestamp)
+ }
+ return ts.UserTicks
+}
+
+// sysTicksAt returns the extrapolated value of ts.SysTicks after
+// Kernel.CPUClockNow() indicates a time of now.
+//
+// Preconditions: As for userTicksAt.
+func (ts *TaskGoroutineSchedInfo) sysTicksAt(now uint64) uint64 {
+ if ts.Timestamp < now && ts.State == TaskGoroutineRunningSys {
+ return ts.SysTicks + (now - ts.Timestamp)
+ }
+ return ts.SysTicks
+}
+
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
now := t.k.CPUClockNow()
@@ -127,26 +157,12 @@ func (t *Task) CPUStats() usage.CPUStats {
return t.cpuStatsAt(t.k.CPUClockNow())
}
-// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is
-// monotonic, this is satisfied if now is the result of a previous call to
-// Kernel.CPUClockNow().) This requirement exists because otherwise a racing
-// change to t.gosched can cause cpuStatsAt to adjust stats by too much, making
-// the returned stats non-monotonic.
+// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt.
func (t *Task) cpuStatsAt(now uint64) usage.CPUStats {
tsched := t.TaskGoroutineSchedInfo()
- if tsched.Timestamp < now {
- // Update stats to reflect execution since the last update to
- // t.gosched.
- switch tsched.State {
- case TaskGoroutineRunningSys:
- tsched.SysTicks += now - tsched.Timestamp
- case TaskGoroutineRunningApp:
- tsched.UserTicks += now - tsched.Timestamp
- }
- }
return usage.CPUStats{
- UserTime: time.Duration(tsched.UserTicks * uint64(linux.ClockTick)),
- SysTime: time.Duration(tsched.SysTicks * uint64(linux.ClockTick)),
+ UserTime: time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)),
+ SysTime: time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)),
VoluntarySwitches: atomic.LoadUint64(&t.yieldCount),
}
}
@@ -162,9 +178,14 @@ func (tg *ThreadGroup) CPUStats() usage.CPUStats {
// ThreadGroup has ever executed anyway.
return usage.CPUStats{}
}
- now := tg.leader.k.CPUClockNow()
+ return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow())
+}
+
+// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex
+// must be locked.
+func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats {
stats := tg.exitedCPUStats
- // Account for active tasks.
+ // Account for live tasks.
for t := tg.tasks.Front(); t != nil; t = t.Next() {
stats.Accumulate(t.cpuStatsAt(now))
}
@@ -182,6 +203,291 @@ func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats {
return tg.childCPUStats
}
+// taskClock is a ktime.Clock that measures the time that a task has spent
+// executing. taskClock is primarily used to implement CLOCK_THREAD_CPUTIME_ID.
+//
+// +stateify savable
+type taskClock struct {
+ t *Task
+
+ // If includeSys is true, the taskClock includes both time spent executing
+ // application code as well as time spent in the sentry. Otherwise, the
+ // taskClock includes only time spent executing application code.
+ includeSys bool
+
+ // Implements waiter.Waitable. TimeUntil wouldn't change its estimation
+ // based on either of the clock events, so there's no event to be
+ // notified for.
+ ktime.NoClockEvents `state:"nosave"`
+
+ // Implements ktime.Clock.WallTimeUntil.
+ //
+ // As an upper bound, a task's clock cannot advance faster than CPU
+ // time. It would have to execute at a rate of more than 1 task-second
+ // per 1 CPU-second, which isn't possible.
+ ktime.WallRateClock `state:"nosave"`
+}
+
+// UserCPUClock returns a clock measuring the CPU time the task has spent
+// executing application code.
+func (t *Task) UserCPUClock() ktime.Clock {
+ return &taskClock{t: t, includeSys: false}
+}
+
+// CPUClock returns a clock measuring the CPU time the task has spent executing
+// application and "kernel" code.
+func (t *Task) CPUClock() ktime.Clock {
+ return &taskClock{t: t, includeSys: true}
+}
+
+// Now implements ktime.Clock.Now.
+func (tc *taskClock) Now() ktime.Time {
+ stats := tc.t.CPUStats()
+ if tc.includeSys {
+ return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
+ }
+ return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
+}
+
+// tgClock is a ktime.Clock that measures the time a thread group has spent
+// executing. tgClock is primarily used to implement CLOCK_PROCESS_CPUTIME_ID.
+//
+// +stateify savable
+type tgClock struct {
+ tg *ThreadGroup
+
+ // If includeSys is true, the tgClock includes both time spent executing
+ // application code as well as time spent in the sentry. Otherwise, the
+ // tgClock includes only time spent executing application code.
+ includeSys bool
+
+ // Implements waiter.Waitable.
+ ktime.ClockEventsQueue `state:"nosave"`
+}
+
+// Now implements ktime.Clock.Now.
+func (tgc *tgClock) Now() ktime.Time {
+ stats := tgc.tg.CPUStats()
+ if tgc.includeSys {
+ return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
+ }
+ return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
+}
+
+// WallTimeUntil implements ktime.Clock.WallTimeUntil.
+func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration {
+ // Thread group CPU time should not exceed wall time * live tasks, since
+ // task goroutines exit after the transition to TaskExitZombie in
+ // runExitNotify.
+ tgc.tg.pidns.owner.mu.RLock()
+ n := tgc.tg.liveTasks
+ tgc.tg.pidns.owner.mu.RUnlock()
+ if n == 0 {
+ if t.Before(now) {
+ return 0
+ }
+ // The timer tick raced with thread group exit, after which no more
+ // tasks can enter the thread group. So tgc.Now() will never advance
+ // again. Return a large delay; the timer should be stopped long before
+ // it comes again anyway.
+ return time.Hour
+ }
+ // This is a lower bound on the amount of time that can elapse before an
+ // associated timer expires, so returning this value tends to result in a
+ // sequence of closely-spaced ticks just before timer expiry. To avoid
+ // this, round up to the nearest ClockTick; CPU usage measurements are
+ // limited to this resolution anyway.
+ remaining := time.Duration(t.Sub(now).Nanoseconds()/int64(n)) * time.Nanosecond
+ return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick
+}
+
+// UserCPUClock returns a ktime.Clock that measures the time that a thread
+// group has spent executing.
+func (tg *ThreadGroup) UserCPUClock() ktime.Clock {
+ return &tgClock{tg: tg, includeSys: false}
+}
+
+// CPUClock returns a ktime.Clock that measures the time that a thread group
+// has spent executing, including sentry time.
+func (tg *ThreadGroup) CPUClock() ktime.Clock {
+ return &tgClock{tg: tg, includeSys: true}
+}
+
+type kernelCPUClockTicker struct {
+ k *Kernel
+
+ // These are essentially kernelCPUClockTicker.Notify local variables that
+ // are cached between calls to reduce allocations.
+ rng *rand.Rand
+ tgs []*ThreadGroup
+}
+
+func newKernelCPUClockTicker(k *Kernel) *kernelCPUClockTicker {
+ return &kernelCPUClockTicker{
+ k: k,
+ rng: rand.New(rand.NewSource(rand.Int63())),
+ }
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (ticker *kernelCPUClockTicker) Notify(exp uint64) {
+ // Only increment cpuClock by 1 regardless of the number of expirations.
+ // This approximately compensates for cases where thread throttling or bad
+ // Go runtime scheduling prevents the kernelCPUClockTicker goroutine, and
+ // presumably task goroutines as well, from executing for a long period of
+ // time. It's also necessary to prevent CPU clocks from seeing large
+ // discontinuous jumps.
+ now := atomic.AddUint64(&ticker.k.cpuClock, 1)
+
+ // Check thread group CPU timers.
+ tgs := ticker.k.tasks.Root.ThreadGroupsAppend(ticker.tgs)
+ for _, tg := range tgs {
+ if atomic.LoadUint32(&tg.cpuTimersEnabled) == 0 {
+ continue
+ }
+
+ ticker.k.tasks.mu.RLock()
+ if tg.leader == nil {
+ // No tasks have ever run in this thread group.
+ ticker.k.tasks.mu.RUnlock()
+ continue
+ }
+ // Accumulate thread group CPU stats, and randomly select running tasks
+ // using reservoir sampling to receive CPU timer signals.
+ var virtReceiver *Task
+ nrVirtCandidates := 0
+ var profReceiver *Task
+ nrProfCandidates := 0
+ tgUserTime := tg.exitedCPUStats.UserTime
+ tgSysTime := tg.exitedCPUStats.SysTime
+ for t := tg.tasks.Front(); t != nil; t = t.Next() {
+ tsched := t.TaskGoroutineSchedInfo()
+ tgUserTime += time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick))
+ tgSysTime += time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick))
+ switch tsched.State {
+ case TaskGoroutineRunningApp:
+ // Considered by ITIMER_VIRT, ITIMER_PROF, and RLIMIT_CPU
+ // timers.
+ nrVirtCandidates++
+ if int(randInt31n(ticker.rng, int32(nrVirtCandidates))) == 0 {
+ virtReceiver = t
+ }
+ fallthrough
+ case TaskGoroutineRunningSys:
+ // Considered by ITIMER_PROF and RLIMIT_CPU timers.
+ nrProfCandidates++
+ if int(randInt31n(ticker.rng, int32(nrProfCandidates))) == 0 {
+ profReceiver = t
+ }
+ }
+ }
+ tgVirtNow := ktime.FromNanoseconds(tgUserTime.Nanoseconds())
+ tgProfNow := ktime.FromNanoseconds((tgUserTime + tgSysTime).Nanoseconds())
+
+ // All of the following are standard (not real-time) signals, which are
+ // automatically deduplicated, so we ignore the number of expirations.
+ tg.signalHandlers.mu.Lock()
+ // It should only be possible for these timers to advance if we found
+ // at least one running task.
+ if virtReceiver != nil {
+ // ITIMER_VIRTUAL
+ newItimerVirtSetting, exp := tg.itimerVirtSetting.At(tgVirtNow)
+ tg.itimerVirtSetting = newItimerVirtSetting
+ if exp != 0 {
+ virtReceiver.sendSignalLocked(sigPriv(linux.SIGVTALRM), true)
+ }
+ }
+ if profReceiver != nil {
+ // ITIMER_PROF
+ newItimerProfSetting, exp := tg.itimerProfSetting.At(tgProfNow)
+ tg.itimerProfSetting = newItimerProfSetting
+ if exp != 0 {
+ profReceiver.sendSignalLocked(sigPriv(linux.SIGPROF), true)
+ }
+ // RLIMIT_CPU soft limit
+ newRlimitCPUSoftSetting, exp := tg.rlimitCPUSoftSetting.At(tgProfNow)
+ tg.rlimitCPUSoftSetting = newRlimitCPUSoftSetting
+ if exp != 0 {
+ profReceiver.sendSignalLocked(sigPriv(linux.SIGXCPU), true)
+ }
+ // RLIMIT_CPU hard limit
+ rlimitCPUMax := tg.limits.Get(limits.CPU).Max
+ if rlimitCPUMax != limits.Infinity && !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPUMax))) {
+ profReceiver.sendSignalLocked(sigPriv(linux.SIGKILL), true)
+ }
+ }
+ tg.signalHandlers.mu.Unlock()
+
+ ticker.k.tasks.mu.RUnlock()
+ }
+
+ // Retain tgs between calls to Notify to reduce allocations.
+ for i := range tgs {
+ tgs[i] = nil
+ }
+ ticker.tgs = tgs[:0]
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (ticker *kernelCPUClockTicker) Destroy() {
+}
+
+// randInt31n returns a random integer in [0, n).
+//
+// randInt31n is equivalent to math/rand.Rand.int31n(), which is unexported.
+// See that function for details.
+func randInt31n(rng *rand.Rand, n int32) int32 {
+ v := rng.Uint32()
+ prod := uint64(v) * uint64(n)
+ low := uint32(prod)
+ if low < uint32(n) {
+ thresh := uint32(-n) % uint32(n)
+ for low < thresh {
+ v = rng.Uint32()
+ prod = uint64(v) * uint64(n)
+ low = uint32(prod)
+ }
+ }
+ return int32(prod >> 32)
+}
+
+// NotifyRlimitCPUUpdated is called by setrlimit.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) NotifyRlimitCPUUpdated() {
+ t.k.cpuClockTicker.Atomically(func() {
+ t.tg.pidns.owner.mu.RLock()
+ defer t.tg.pidns.owner.mu.RUnlock()
+ t.tg.signalHandlers.mu.Lock()
+ defer t.tg.signalHandlers.mu.Unlock()
+ rlimitCPU := t.tg.limits.Get(limits.CPU)
+ t.tg.rlimitCPUSoftSetting = ktime.Setting{
+ Enabled: rlimitCPU.Cur != limits.Infinity,
+ Next: ktime.FromNanoseconds((time.Duration(rlimitCPU.Cur) * time.Second).Nanoseconds()),
+ Period: time.Second,
+ }
+ if rlimitCPU.Max != limits.Infinity {
+ // Check if tg is already over the hard limit.
+ tgcpu := t.tg.cpuStatsAtLocked(t.k.CPUClockNow())
+ tgProfNow := ktime.FromNanoseconds((tgcpu.UserTime + tgcpu.SysTime).Nanoseconds())
+ if !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPU.Max))) {
+ t.sendSignalLocked(sigPriv(linux.SIGKILL), true)
+ }
+ }
+ t.tg.updateCPUTimersEnabledLocked()
+ })
+}
+
+// Preconditions: The signal mutex must be locked.
+func (tg *ThreadGroup) updateCPUTimersEnabledLocked() {
+ rlimitCPU := tg.limits.Get(limits.CPU)
+ if tg.itimerVirtSetting.Enabled || tg.itimerProfSetting.Enabled || tg.rlimitCPUSoftSetting.Enabled || rlimitCPU.Max != limits.Infinity {
+ atomic.StoreUint32(&tg.cpuTimersEnabled, 1)
+ } else {
+ atomic.StoreUint32(&tg.cpuTimersEnabled, 0)
+ }
+}
+
// StateStatus returns a string representation of the task's current state,
// appropriate for /proc/[pid]/status.
func (t *Task) StateStatus() string {
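
The Notify loop above chooses signal recipients by reservoir sampling: as it
walks the task list, the i-th eligible task replaces the current candidate with
probability 1/i, so every eligible task ends up equally likely to be chosen in
a single pass. Below is a small standalone illustration of that selection
pattern, using math/rand.Intn in place of the unexported int31n equivalent;
the pickOne helper and the task names are invented for this example.

package main

import (
	"fmt"
	"math/rand"
)

// pickOne returns a uniformly random element of candidates using single-pass
// reservoir sampling, the same selection pattern kernelCPUClockTicker.Notify
// uses to choose which running task receives a CPU timer signal.
func pickOne(rng *rand.Rand, candidates []string) string {
	var chosen string
	n := 0
	for _, c := range candidates {
		n++
		// Replace the current choice with probability 1/n.
		if rng.Intn(n) == 0 {
			chosen = c
		}
	}
	return chosen
}

func main() {
	rng := rand.New(rand.NewSource(42))
	counts := map[string]int{}
	tasks := []string{"task1", "task2", "task3"}
	for i := 0; i < 30000; i++ {
		counts[pickOne(rng, tasks)]++
	}
	fmt.Println(counts) // each task chosen roughly 10000 times
}
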
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index afb010f60..e2925a708 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -359,72 +359,6 @@ func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error {
return tg.leader.sendSignalLocked(info, true /* group */)
}
-// Preconditions: The TaskSet mutex must be locked.
-func (t *Task) onCPULocked(includeSys bool) bool {
- // Task is exiting.
- if t.exitState != TaskExitNone {
- return false
- }
-
- switch t.TaskGoroutineSchedInfo().State {
- case TaskGoroutineRunningSys:
- return includeSys
- case TaskGoroutineRunningApp:
- return true
- default:
- return false
- }
-}
-
-// SendTimerSignal mimics the process timer signal delivery behavior in linux:
-// signals are delivered to the thread that triggers the timer expiration (see
-// kernel/time/posix-cpu-timers.c:check_process_timers(). This
-// means
-// 1) the thread is running on cpu at the time.
-// 2) a thread runs more frequently will get more of those signals.
-//
-// We approximate this behavior by selecting a running task in a round-robin
-// fashion. Statistically, a thread running more often should have a higher
-// probability to be selected.
-func (tg *ThreadGroup) SendTimerSignal(info *arch.SignalInfo, includeSys bool) error {
- tg.pidns.owner.mu.RLock()
- defer tg.pidns.owner.mu.RUnlock()
- tg.signalHandlers.mu.Lock()
- defer tg.signalHandlers.mu.Unlock()
-
- // Find the next running threads.
- var t *Task
- if tg.lastTimerSignalTask == nil {
- t = tg.tasks.Front()
- } else {
- t = tg.lastTimerSignalTask.Next()
- }
-
- // Iterate from lastTimerSignalTask.Next() to the last task in the task list.
- for t != nil {
- if t.onCPULocked(includeSys) {
- tg.lastTimerSignalTask = t
- return t.sendSignalLocked(info, true /* group */)
- }
- t = t.Next()
- }
-
- // t is nil when we reach here. If lastTimerSignalTask is not nil, iterate
- // from Front to lastTimerSignalTask.
- if tg.lastTimerSignalTask != nil {
- for t := tg.tasks.Front(); t != tg.lastTimerSignalTask.Next(); t = t.Next() {
- if t.onCPULocked(includeSys) {
- tg.lastTimerSignalTask = t
- return t.sendSignalLocked(info, true /* group */)
- }
- }
- }
-
- // No running threads? Just try the leader.
- tg.lastTimerSignalTask = tg.leader
- return tg.leader.sendSignalLocked(info, true /* group */)
-}
-
func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error {
return t.sendSignalTimerLocked(info, group, nil)
}
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 13dce08ce..dfff7b52d 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -19,6 +19,7 @@ import (
"sync/atomic"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
)
@@ -59,12 +60,6 @@ type ThreadGroup struct {
// pendingSignals is protected by the signal mutex.
pendingSignals pendingSignals
- // lastTimerSignalTask records the last task we deliver a process timer signal to.
- // Please see SendTimerSignal for more details.
- //
- // lastTimerSignalTask is protected by the signal mutex.
- lastTimerSignalTask *Task
-
// groupStopPhase indicates the state of a group stop in progress on the
// thread group, if any.
//
@@ -152,14 +147,39 @@ type ThreadGroup struct {
// restarted by Task.Start.
liveGoroutines sync.WaitGroup `state:"nosave"`
- // tm contains process timers. TimerManager fields are immutable.
- tm TimerManager
+ timerMu sync.Mutex `state:"nosave"`
+
+ // itimerRealTimer implements ITIMER_REAL for the thread group.
+ itimerRealTimer *ktime.Timer
+
+ // itimerVirtSetting is the ITIMER_VIRTUAL setting for the thread group.
+ //
+ // itimerVirtSetting is protected by the signal mutex.
+ itimerVirtSetting ktime.Setting
+
+ // itimerProfSetting is the ITIMER_PROF setting for the thread group.
+ //
+ // itimerProfSetting is protected by the signal mutex.
+ itimerProfSetting ktime.Setting
+
+ // rlimitCPUSoftSetting is the setting for RLIMIT_CPU soft limit
+ // notifications for the thread group.
+ //
+ // rlimitCPUSoftSetting is protected by the signal mutex.
+ rlimitCPUSoftSetting ktime.Setting
+
+ // cpuTimersEnabled is non-zero if itimerVirtSetting.Enabled is true,
+ // itimerProfSetting.Enabled is true, rlimitCPUSoftSetting.Enabled is true,
+ // or limits.Get(CPU) is finite.
+ //
+ // cpuTimersEnabled is protected by the signal mutex. cpuTimersEnabled is
+ // accessed using atomic memory operations.
+ cpuTimersEnabled uint32
// timers is the thread group's POSIX interval timers. nextTimerID is the
// TimerID at which allocation should begin searching for an unused ID.
//
// timers and nextTimerID are protected by timerMu.
- timerMu sync.Mutex `state:"nosave"`
timers map[linux.TimerID]*IntervalTimer
nextTimerID linux.TimerID
@@ -211,11 +231,11 @@ type ThreadGroup struct {
rscr atomic.Value `state:".(*RSEQCriticalRegion)"`
}
-// NewThreadGroup returns a new, empty thread group in PID namespace ns. The
+// newThreadGroup returns a new, empty thread group in PID namespace ns. The
// thread group leader will send its parent terminationSignal when it exits.
// The new thread group isn't visible to the system until a task has been
// created inside of it by a successful call to TaskSet.NewTask.
-func NewThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
+func (k *Kernel) newThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
tg := &ThreadGroup{
threadGroupNode: threadGroupNode{
pidns: ns,
@@ -225,7 +245,7 @@ func NewThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linu
ioUsage: &usage.IO{},
limits: limits,
}
- tg.tm = newTimerManager(tg, monotonicClock)
+ tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg})
tg.timers = make(map[linux.TimerID]*IntervalTimer)
tg.rscr.Store(&RSEQCriticalRegion{})
return tg
@@ -249,11 +269,6 @@ func (tg *ThreadGroup) SignalHandlers() *SignalHandlers {
return tg.signalHandlers
}
-// Timer returns tg's timers.
-func (tg *ThreadGroup) Timer() *TimerManager {
- return &tg.tm
-}
-
// Limits returns tg's limits.
func (tg *ThreadGroup) Limits() *limits.LimitSet {
return tg.limits
@@ -261,11 +276,9 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet {
// release releases the thread group's resources.
func (tg *ThreadGroup) release() {
- // These must be done without holding the TaskSet or signal mutexes since
- // timers send signals with Timer.mu locked.
-
- tg.tm.destroy()
-
+ // Timers must be destroyed without holding the TaskSet or signal mutexes
+ // since timers send signals with Timer.mu locked.
+ tg.itimerRealTimer.Destroy()
var its []*IntervalTimer
tg.pidns.owner.mu.Lock()
tg.signalHandlers.mu.Lock()
@@ -292,3 +305,19 @@ func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) {
}
}
}
+
+// itimerRealListener implements ktime.Listener for ITIMER_REAL expirations.
+//
+// +stateify savable
+type itimerRealListener struct {
+ tg *ThreadGroup
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (l *itimerRealListener) Notify(exp uint64) {
+ l.tg.SendSignal(sigPriv(linux.SIGALRM))
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (l *itimerRealListener) Destroy() {
+}
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index 3d5713106..4e3d19e97 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -243,9 +243,13 @@ func (ns *PIDNamespace) Tasks() []*Task {
// ThreadGroups returns a snapshot of the thread groups in ns.
func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup {
+ return ns.ThreadGroupsAppend(nil)
+}
+
+// ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs.
+func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup {
ns.owner.mu.RLock()
defer ns.owner.mu.RUnlock()
- var tgs []*ThreadGroup
for t := range ns.tids {
if t == t.tg.leader {
tgs = append(tgs, t.tg)
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index 1f6fed007..52e0dfba1 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -307,6 +307,12 @@ type Setting struct {
// SettingFromSpec converts a (value, interval) pair to a Setting based on a
// reading from c. value is interpreted as a time relative to c.Now().
func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Setting, error) {
+ return SettingFromSpecAt(value, interval, c.Now())
+}
+
+// SettingFromSpecAt converts a (value, interval) pair to a Setting. value is
+// interpreted as a time relative to now.
+func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (Setting, error) {
if value < 0 {
return Setting{}, syserror.EINVAL
}
@@ -315,7 +321,7 @@ func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Sett
}
return Setting{
Enabled: true,
- Next: c.Now().Add(value),
+ Next: now.Add(value),
Period: interval,
}, nil
}
@@ -365,14 +371,14 @@ func ItimerspecFromSetting(now Time, s Setting) linux.Itimerspec {
}
}
-// advancedTo returns an updated Setting and a number of expirations after
-// the associated Clock indicates a time of now.
+// At returns an updated Setting and a number of expirations after the
+// associated Clock indicates a time of now.
//
-// Settings may be created by successive calls to advancedTo with decreasing
+// Settings may be created by successive calls to At with decreasing
// values of now (i.e. time may appear to go backward). Supporting this is
// required to support non-monotonic clocks, as well as allowing
// Timer.clock.Now() to be called without holding Timer.mu.
-func (s Setting) advancedTo(now Time) (Setting, uint64) {
+func (s Setting) At(now Time) (Setting, uint64) {
if !s.Enabled {
return s, 0
}
@@ -519,7 +525,7 @@ func (t *Timer) Tick() {
if t.paused {
return
}
- s, exp := t.setting.advancedTo(now)
+ s, exp := t.setting.At(now)
t.setting = s
if exp > 0 {
t.listener.Notify(exp)
@@ -574,7 +580,7 @@ func (t *Timer) Get() (Time, Setting) {
if t.paused {
panic(fmt.Sprintf("Timer.Get called on paused Timer %p", t))
}
- s, exp := t.setting.advancedTo(now)
+ s, exp := t.setting.At(now)
t.setting = s
if exp > 0 {
t.listener.Notify(exp)
@@ -607,14 +613,14 @@ func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) {
if t.paused {
panic(fmt.Sprintf("Timer.SwapAnd called on paused Timer %p", t))
}
- oldS, oldExp := t.setting.advancedTo(now)
+ oldS, oldExp := t.setting.At(now)
if oldExp > 0 {
t.listener.Notify(oldExp)
}
if f != nil {
f()
}
- newS, newExp := s.advancedTo(now)
+ newS, newExp := s.At(now)
t.setting = newS
if newExp > 0 {
t.listener.Notify(newExp)
@@ -623,6 +629,17 @@ func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) {
return now, oldS
}
+// Atomically invokes f atomically with respect to expirations of t; that is, t
+// cannot generate expirations while f is being called.
+//
+// Preconditions: f cannot call any Timer methods since it is called with the
+// Timer mutex locked.
+func (t *Timer) Atomically(f func()) {
+ t.mu.Lock()
+ defer t.mu.Unlock()
+ f()
+}
+
// Preconditions: t.mu must be locked.
func (t *Timer) resetKickerLocked(now Time) {
if t.setting.Enabled {
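
The advancedTo-to-At rename above is purely a naming change: per its doc
comment, the method still takes a clock reading and returns the updated
Setting together with the number of expirations that have elapsed. The sketch
below illustrates only that contract for a periodic timer; the expirationsAt
function is a conceptual stand-in written for this example, not gVisor's
implementation (which also handles disabling and other cases).

package main

import (
	"fmt"
	"time"
)

// expirationsAt counts how many expirations of a timer with first expiry at
// next and period p have occurred by time now, and returns the new next-expiry
// time. Conceptual sketch only.
func expirationsAt(next, period, now time.Duration) (newNext time.Duration, exp uint64) {
	if now < next {
		// No expirations yet; this also tolerates time appearing to go
		// backward, as the At doc comment requires.
		return next, 0
	}
	if period == 0 {
		// One-shot timer: it has fired exactly once.
		return next, 1
	}
	exp = 1 + uint64((now-next)/period)
	newNext = next + time.Duration(exp)*period
	return newNext, exp
}

func main() {
	next, exp := expirationsAt(10*time.Millisecond, 10*time.Millisecond, 35*time.Millisecond)
	fmt.Println(next, exp) // 40ms 3
}
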
diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go
index df5dbe128..2167f3efe 100644
--- a/pkg/sentry/kernel/timekeeper.go
+++ b/pkg/sentry/kernel/timekeeper.go
@@ -15,6 +15,7 @@
package kernel
import (
+ "fmt"
"sync"
"time"
@@ -277,3 +278,28 @@ func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) {
func (t *Timekeeper) BootTime() ktime.Time {
return t.bootTime
}
+
+// timekeeperClock is a ktime.Clock that reads time from a
+// kernel.Timekeeper-managed clock.
+//
+// +stateify savable
+type timekeeperClock struct {
+ tk *Timekeeper
+ c sentrytime.ClockID
+
+ // Implements ktime.Clock.WallTimeUntil.
+ ktime.WallRateClock `state:"nosave"`
+
+ // Implements waiter.Waitable. (We have no ability to detect
+ // discontinuities from external changes to CLOCK_REALTIME).
+ ktime.NoClockEvents `state:"nosave"`
+}
+
+// Now implements ktime.Clock.Now.
+func (tc *timekeeperClock) Now() ktime.Time {
+ now, err := tc.tk.GetTime(tc.c)
+ if err != nil {
+ panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err))
+ }
+ return ktime.FromNanoseconds(now)
+}
diff --git a/pkg/sentry/kernel/timer.go b/pkg/sentry/kernel/timer.go
deleted file mode 100644
index 534d03d0f..000000000
--- a/pkg/sentry/kernel/timer.go
+++ /dev/null
@@ -1,290 +0,0 @@
-// Copyright 2018 Google Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package kernel
-
-import (
- "fmt"
- "time"
-
- "gvisor.googlesource.com/gvisor/pkg/abi/linux"
- ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
- "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
- sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time"
-)
-
-// timekeeperClock is a ktime.Clock that reads time from a
-// kernel.Timekeeper-managed clock.
-//
-// +stateify savable
-type timekeeperClock struct {
- tk *Timekeeper
- c sentrytime.ClockID
-
- // Implements ktime.Clock.WallTimeUntil.
- ktime.WallRateClock `state:"nosave"`
-
- // Implements waiter.Waitable. (We have no ability to detect
- // discontinuities from external changes to CLOCK_REALTIME).
- ktime.NoClockEvents `state:"nosave"`
-}
-
-// Now implements ktime.Clock.Now.
-func (tc *timekeeperClock) Now() ktime.Time {
- now, err := tc.tk.GetTime(tc.c)
- if err != nil {
- panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err))
- }
- return ktime.FromNanoseconds(now)
-}
-
-// tgClock is a ktime.Clock that measures the time a thread group has spent
-// executing.
-//
-// +stateify savable
-type tgClock struct {
- tg *ThreadGroup
-
- // If includeSys is true, the tgClock includes both time spent executing
- // application code as well as time spent in the sentry. Otherwise, the
- // tgClock includes only time spent executing application code.
- includeSys bool
-
- // Implements waiter.Waitable.
- ktime.ClockEventsQueue `state:"nosave"`
-}
-
-// UserCPUClock returns a ktime.Clock that measures the time that a thread
-// group has spent executing.
-func (tg *ThreadGroup) UserCPUClock() ktime.Clock {
- return tg.tm.virtClock
-}
-
-// CPUClock returns a ktime.Clock that measures the time that a thread group
-// has spent executing, including sentry time.
-func (tg *ThreadGroup) CPUClock() ktime.Clock {
- return tg.tm.profClock
-}
-
-// Now implements ktime.Clock.Now.
-func (tgc *tgClock) Now() ktime.Time {
- stats := tgc.tg.CPUStats()
- if tgc.includeSys {
- return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
- }
- return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
-}
-
-// WallTimeUntil implements ktime.Clock.WallTimeUntil.
-func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration {
- // The assumption here is that the time spent in this process (not matter
- // virtual or prof) should not exceed wall time * active tasks, since
- // Task.exitThreadGroup stops accounting as it transitions to
- // TaskExitInitiated.
- tgc.tg.pidns.owner.mu.RLock()
- n := tgc.tg.activeTasks
- tgc.tg.pidns.owner.mu.RUnlock()
- if n == 0 {
- if t.Before(now) {
- return 0
- }
- // The timer tick raced with thread group exit, after which no more
- // tasks can enter the thread group. So tgc.Now() will never advance
- // again. Return a large delay; the timer should be stopped long before
- // it comes again anyway.
- return time.Hour
- }
- // This is a lower bound on the amount of time that can elapse before an
- // associated timer expires, so returning this value tends to result in a
- // sequence of closely-spaced ticks just before timer expiry. To avoid
- // this, round up to the nearest ClockTick; CPU usage measurements are
- // limited to this resolution anyway.
- remaining := time.Duration(int64(t.Sub(now))/int64(n)) * time.Nanosecond
- return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick
-}
-
-// taskClock is a ktime.Clock that measures the time that a task has spent
-// executing.
-type taskClock struct {
- t *Task
-
- // If includeSys is true, the taskClock includes both time spent executing
- // application code as well as time spent in the sentry. Otherwise, the
- // taskClock includes only time spent executing application code.
- includeSys bool
-
- // Implements waiter.Waitable. TimeUntil wouldn't change its estimation
- // based on either of the clock events, so there's no event to be
- // notified for.
- ktime.NoClockEvents `state:"nosave"`
-
- // Implements ktime.Clock.WallTimeUntil.
- //
- // As an upper bound, a task's clock cannot advance faster than CPU
- // time. It would have to execute at a rate of more than 1 task-second
- // per 1 CPU-second, which isn't possible.
- ktime.WallRateClock `state:"nosave"`
-}
-
-// UserCPUClock returns a clock measuring the CPU time the task has spent
-// executing application code.
-func (t *Task) UserCPUClock() ktime.Clock {
- return &taskClock{t: t, includeSys: false}
-}
-
-// CPUClock returns a clock measuring the CPU time the task has spent executing
-// application and "kernel" code.
-func (t *Task) CPUClock() ktime.Clock {
- return &taskClock{t: t, includeSys: true}
-}
-
-// Now implements ktime.Clock.Now.
-func (tc *taskClock) Now() ktime.Time {
- stats := tc.t.CPUStats()
- if tc.includeSys {
- return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds())
- }
- return ktime.FromNanoseconds(stats.UserTime.Nanoseconds())
-}
-
-// signalNotifier is a ktime.Listener that sends signals to a ThreadGroup.
-//
-// +stateify savable
-type signalNotifier struct {
- tg *ThreadGroup
- signal linux.Signal
- realTimer bool
- includeSys bool
-}
-
-// Notify implements ktime.TimerListener.Notify.
-func (s *signalNotifier) Notify(exp uint64) {
- // Since all signals sent using a signalNotifier are standard (not
- // real-time) signals, we can ignore the number of expirations and send
- // only a single signal.
- if s.realTimer {
- // real timer signal sent to leader. See kernel/time/itimer.c:it_real_fn
- s.tg.SendSignal(sigPriv(s.signal))
- } else {
- s.tg.SendTimerSignal(sigPriv(s.signal), s.includeSys)
- }
-}
-
-// Destroy implements ktime.TimerListener.Destroy.
-func (s *signalNotifier) Destroy() {}
-
-// TimerManager is a collection of supported process cpu timers.
-//
-// +stateify savable
-type TimerManager struct {
- // Clocks used to drive thread group execution time timers.
- virtClock *tgClock
- profClock *tgClock
-
- RealTimer *ktime.Timer
- VirtualTimer *ktime.Timer
- ProfTimer *ktime.Timer
- SoftLimitTimer *ktime.Timer
- HardLimitTimer *ktime.Timer
-}
-
-// newTimerManager returns a new instance of TimerManager.
-func newTimerManager(tg *ThreadGroup, monotonicClock ktime.Clock) TimerManager {
- virtClock := &tgClock{tg: tg, includeSys: false}
- profClock := &tgClock{tg: tg, includeSys: true}
- tm := TimerManager{
- virtClock: virtClock,
- profClock: profClock,
- RealTimer: ktime.NewTimer(monotonicClock, &signalNotifier{
- tg: tg,
- signal: linux.SIGALRM,
- realTimer: true,
- includeSys: false,
- }),
- VirtualTimer: ktime.NewTimer(virtClock, &signalNotifier{
- tg: tg,
- signal: linux.SIGVTALRM,
- realTimer: false,
- includeSys: false,
- }),
- ProfTimer: ktime.NewTimer(profClock, &signalNotifier{
- tg: tg,
- signal: linux.SIGPROF,
- realTimer: false,
- includeSys: true,
- }),
- SoftLimitTimer: ktime.NewTimer(profClock, &signalNotifier{
- tg: tg,
- signal: linux.SIGXCPU,
- realTimer: false,
- includeSys: true,
- }),
- HardLimitTimer: ktime.NewTimer(profClock, &signalNotifier{
- tg: tg,
- signal: linux.SIGKILL,
- realTimer: false,
- includeSys: true,
- }),
- }
- tm.applyCPULimits(tg.Limits().Get(limits.CPU))
- return tm
-}
-
-// Save saves this TimerManger.
-
-// destroy destroys all timers.
-func (tm *TimerManager) destroy() {
- tm.RealTimer.Destroy()
- tm.VirtualTimer.Destroy()
- tm.ProfTimer.Destroy()
- tm.SoftLimitTimer.Destroy()
- tm.HardLimitTimer.Destroy()
-}
-
-func (tm *TimerManager) applyCPULimits(l limits.Limit) {
- tm.SoftLimitTimer.Swap(ktime.Setting{
- Enabled: l.Cur != limits.Infinity,
- Next: ktime.FromNanoseconds((time.Duration(l.Cur) * time.Second).Nanoseconds()),
- Period: time.Second,
- })
- tm.HardLimitTimer.Swap(ktime.Setting{
- Enabled: l.Max != limits.Infinity,
- Next: ktime.FromNanoseconds((time.Duration(l.Max) * time.Second).Nanoseconds()),
- })
-}
-
-// kick is called when the number of threads in the thread group associated
-// with tm increases.
-func (tm *TimerManager) kick() {
- tm.virtClock.Notify(ktime.ClockEventRateIncrease)
- tm.profClock.Notify(ktime.ClockEventRateIncrease)
-}
-
-// pause is to pause the timers and stop timer signal delivery.
-func (tm *TimerManager) pause() {
- tm.RealTimer.Pause()
- tm.VirtualTimer.Pause()
- tm.ProfTimer.Pause()
- tm.SoftLimitTimer.Pause()
- tm.HardLimitTimer.Pause()
-}
-
-// resume is to resume the timers and continue timer signal delivery.
-func (tm *TimerManager) resume() {
- tm.RealTimer.Resume()
- tm.VirtualTimer.Resume()
- tm.ProfTimer.Resume()
- tm.SoftLimitTimer.Resume()
- tm.HardLimitTimer.Resume()
-}