author | Jamie Liu <jamieliu@google.com> | 2018-10-17 15:48:55 -0700
---|---|---
committer | Shentubot <shentubot@google.com> | 2018-10-17 15:50:02 -0700
commit | b2a88ff4713325fca736f6a3bf200be02d2d72a7 (patch) |
tree | 489e54828c2bfe0bf326976920e5d5e612f877a0 /pkg/sentry/kernel |
parent | 6922eee6499212a009fdc254224f916bd1c46f29 (diff) |
Check thread group CPU timers in the CPU clock ticker.
This reduces the number of goroutines and runtime timers when
ITIMER_VIRTUAL or ITIMER_PROF are enabled, or when RLIMIT_CPU is set.
This also ensures that thread group CPU timers only advance if running
tasks are observed at the time the CPU clock advances, mostly
eliminating the possibility that a CPU timer expiration observes no
running tasks and falls back to the group leader.
PiperOrigin-RevId: 217603396
Change-Id: Ia24ce934d5574334857d9afb5ad8ca0b6a6e65f4
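For readers skimming the diff below, the core of the change lands in task_sched.go: the kernel's CPU clock ticker now walks every thread group once per tick, accumulates the CPU time of its live tasks, and uses reservoir sampling to pick one currently running task to receive ITIMER_VIRTUAL, ITIMER_PROF, and RLIMIT_CPU signals. The following is a simplified, standalone Go sketch of that sampling step; the `task` struct and `pickReceivers` helper are illustrative only (the real code iterates gVisor's task list and uses a hand-rolled `randInt31n`).

```go
package main

import (
	"fmt"
	"math/rand"
)

type taskState int

const (
	stateRunningApp taskState = iota // executing application code
	stateRunningSys                  // executing in the sentry ("kernel" time)
	stateBlocked                     // not running
)

// task is a stand-in for kernel.Task, reduced to what the sampling needs.
type task struct {
	name  string
	state taskState
}

// pickReceivers mirrors the selection in kernelCPUClockTicker.Notify: a single
// pass over the task list chooses, uniformly at random, one task running app
// code (the ITIMER_VIRTUAL receiver) and one task running app or sentry code
// (the ITIMER_PROF / RLIMIT_CPU receiver), without building candidate slices.
func pickReceivers(rng *rand.Rand, tasks []task) (virt, prof *task) {
	nrVirt, nrProf := 0, 0
	for i := range tasks {
		t := &tasks[i]
		switch t.state {
		case stateRunningApp:
			nrVirt++
			if rng.Intn(nrVirt) == 0 { // keep with probability 1/nrVirt
				virt = t
			}
			fallthrough
		case stateRunningSys:
			nrProf++
			if rng.Intn(nrProf) == 0 { // keep with probability 1/nrProf
				prof = t
			}
		}
	}
	return virt, prof
}

func main() {
	rng := rand.New(rand.NewSource(1))
	tasks := []task{
		{"t1", stateRunningApp},
		{"t2", stateRunningSys},
		{"t3", stateBlocked},
	}
	virt, prof := pickReceivers(rng, tasks)
	if virt != nil {
		fmt.Println("SIGVTALRM candidate:", virt.name)
	}
	if prof != nil {
		fmt.Println("SIGPROF/SIGXCPU candidate:", prof.name)
	}
}
```

If no task is running at a given tick, both receivers stay nil and the virtual/prof settings are simply not advanced for that tick, which is how the change avoids the old fallback of delivering timer signals to the group leader.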
Diffstat (limited to 'pkg/sentry/kernel')
-rw-r--r-- | pkg/sentry/kernel/BUILD | 1
-rw-r--r-- | pkg/sentry/kernel/kernel.go | 46
-rw-r--r-- | pkg/sentry/kernel/task_acct.go | 97
-rw-r--r-- | pkg/sentry/kernel/task_clone.go | 2
-rw-r--r-- | pkg/sentry/kernel/task_exit.go | 3
-rw-r--r-- | pkg/sentry/kernel/task_run.go | 4
-rw-r--r-- | pkg/sentry/kernel/task_sched.go | 344
-rw-r--r-- | pkg/sentry/kernel/task_signals.go | 66
-rw-r--r-- | pkg/sentry/kernel/thread_group.go | 73
-rw-r--r-- | pkg/sentry/kernel/threads.go | 6
-rw-r--r-- | pkg/sentry/kernel/time/time.go | 35
-rw-r--r-- | pkg/sentry/kernel/timekeeper.go | 26
-rw-r--r-- | pkg/sentry/kernel/timer.go | 290
13 files changed, 538 insertions, 455 deletions
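The deleted timer.go previously kept five ktime.Timers (and their goroutines) per thread group; after this change only ITIMER_REAL remains a real ktime.Timer (itimerRealTimer), while ITIMER_VIRTUAL, ITIMER_PROF, and the RLIMIT_CPU soft limit are stored as plain ktime.Setting values on the ThreadGroup and advanced lazily by the ticker via Setting.At (renamed from advancedTo in pkg/sentry/kernel/time/time.go). The standalone sketch below models that Setting/At computation against a bare int64 nanosecond clock; the `setting` type here is a deliberate simplification, not the real ktime.Setting.

```go
package main

import "fmt"

// setting models ktime.Setting: a one-shot or periodic expiration schedule
// stored as data rather than as a running timer goroutine.
type setting struct {
	enabled bool
	next    int64 // next expiration, in nanoseconds of some clock
	period  int64 // 0 means one-shot
}

// at advances s to clock time now and reports how many expirations elapsed,
// mirroring the semantics of ktime.Setting.At (formerly advancedTo).
func (s setting) at(now int64) (setting, uint64) {
	if !s.enabled || now < s.next {
		return s, 0
	}
	if s.period == 0 {
		s.enabled = false
		return s, 1
	}
	exp := 1 + uint64((now-s.next)/s.period)
	s.next += int64(exp) * s.period
	return s, exp
}

func main() {
	// A periodic setting firing every 10ms, checked lazily at t=35ms: three
	// expirations have elapsed and the next one is due at t=40ms.
	s := setting{enabled: true, next: 10e6, period: 10e6}
	s, exp := s.at(35e6)
	fmt.Printf("expirations=%d next=%dms\n", exp, s.next/1e6)
}
```

Because expirations are counted in bulk like this, a lazily-checked setting never needs its own goroutine, which is where the reduction in goroutines and runtime timers described in the commit message comes from.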
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index acc61cb09..e2fb61ba6 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -111,7 +111,6 @@ go_library( "threads.go", "timekeeper.go", "timekeeper_state.go", - "timer.go", "uts_namespace.go", "vdso.go", "version.go", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 84afdb530..5d6856f3c 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -20,7 +20,7 @@ // // Kernel.extMu // ThreadGroup.timerMu -// ktime.Timer.mu (for IntervalTimer) +// ktime.Timer.mu (for kernelCPUClockTicker and IntervalTimer) // TaskSet.mu // SignalHandlers.mu // Task.mu @@ -617,7 +617,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, return nil, 0, fmt.Errorf("no kernel MountNamespace") } - tg := NewThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) + tg := k.newThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) ctx := args.NewContext(k) // Grab the root directory. @@ -705,7 +705,7 @@ func (k *Kernel) Start() error { } k.started = true - k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, kernelCPUClockListener{k}) + k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, newKernelCPUClockTicker(k)) k.cpuClockTicker.Swap(ktime.Setting{ Enabled: true, Period: linux.ClockTick, @@ -741,14 +741,13 @@ func (k *Kernel) pauseTimeLocked() { // mutex, while holding the Timer mutex.) for t := range k.tasks.Root.tids { if t == t.tg.leader { - t.tg.tm.pause() - } - // This means we'll iterate ThreadGroups and FDMaps shared by multiple - // tasks repeatedly, but ktime.Timer.Pause is idempotent so this is - // harmless. - for _, it := range t.tg.timers { - it.PauseTimer() + t.tg.itimerRealTimer.Pause() + for _, it := range t.tg.timers { + it.PauseTimer() + } } + // This means we'll iterate FDMaps shared by multiple tasks repeatedly, + // but ktime.Timer.Pause is idempotent so this is harmless. if fdm := t.fds; fdm != nil { for _, desc := range fdm.files { if tfd, ok := desc.file.FileOperations.(*timerfd.TimerOperations); ok { @@ -774,10 +773,10 @@ func (k *Kernel) resumeTimeLocked() { k.timekeeper.ResumeUpdates() for t := range k.tasks.Root.tids { if t == t.tg.leader { - t.tg.tm.resume() - } - for _, it := range t.tg.timers { - it.ResumeTimer() + t.tg.itimerRealTimer.Resume() + for _, it := range t.tg.timers { + it.ResumeTimer() + } } if fdm := t.fds; fdm != nil { for _, desc := range fdm.files { @@ -1078,22 +1077,3 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return nil } } - -type kernelCPUClockListener struct { - k *Kernel -} - -// Notify implements ktime.TimerListener.Notify. -func (l kernelCPUClockListener) Notify(exp uint64) { - // Only increment cpuClock by 1 regardless of the number of expirations. - // This approximately compensates for cases where thread throttling or bad - // Go runtime scheduling prevents the cpuClockTicker goroutine, and - // presumably task goroutines as well, from executing for a long period of - // time. It's also necessary to prevent CPU clocks from seeing large - // discontinuous jumps. - atomic.AddUint64(&l.k.cpuClock, 1) -} - -// Destroy implements ktime.TimerListener.Destroy. 
-func (l kernelCPUClockListener) Destroy() { -} diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go index ce12cdb64..d2052921e 100644 --- a/pkg/sentry/kernel/task_acct.go +++ b/pkg/sentry/kernel/task_acct.go @@ -21,8 +21,99 @@ import ( ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/syserror" ) +// Getitimer implements getitimer(2). +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Getitimer(id int32) (linux.ItimerVal, error) { + var tm ktime.Time + var s ktime.Setting + switch id { + case linux.ITIMER_REAL: + tm, s = t.tg.itimerRealTimer.Get() + case linux.ITIMER_VIRTUAL: + tm = t.tg.UserCPUClock().Now() + t.tg.signalHandlers.mu.Lock() + s, _ = t.tg.itimerVirtSetting.At(tm) + t.tg.signalHandlers.mu.Unlock() + case linux.ITIMER_PROF: + tm = t.tg.CPUClock().Now() + t.tg.signalHandlers.mu.Lock() + s, _ = t.tg.itimerProfSetting.At(tm) + t.tg.signalHandlers.mu.Unlock() + default: + return linux.ItimerVal{}, syserror.EINVAL + } + val, iv := ktime.SpecFromSetting(tm, s) + return linux.ItimerVal{ + Value: linux.DurationToTimeval(val), + Interval: linux.DurationToTimeval(iv), + }, nil +} + +// Setitimer implements setitimer(2). +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) Setitimer(id int32, newitv linux.ItimerVal) (linux.ItimerVal, error) { + var tm ktime.Time + var olds ktime.Setting + switch id { + case linux.ITIMER_REAL: + news, err := ktime.SettingFromSpec(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), t.tg.itimerRealTimer.Clock()) + if err != nil { + return linux.ItimerVal{}, err + } + tm, olds = t.tg.itimerRealTimer.Swap(news) + case linux.ITIMER_VIRTUAL: + c := t.tg.UserCPUClock() + var err error + t.k.cpuClockTicker.Atomically(func() { + tm = c.Now() + var news ktime.Setting + news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm) + if err != nil { + return + } + t.tg.signalHandlers.mu.Lock() + olds = t.tg.itimerVirtSetting + t.tg.itimerVirtSetting = news + t.tg.updateCPUTimersEnabledLocked() + t.tg.signalHandlers.mu.Unlock() + }) + if err != nil { + return linux.ItimerVal{}, err + } + case linux.ITIMER_PROF: + c := t.tg.CPUClock() + var err error + t.k.cpuClockTicker.Atomically(func() { + tm = c.Now() + var news ktime.Setting + news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm) + if err != nil { + return + } + t.tg.signalHandlers.mu.Lock() + olds = t.tg.itimerProfSetting + t.tg.itimerProfSetting = news + t.tg.updateCPUTimersEnabledLocked() + t.tg.signalHandlers.mu.Unlock() + }) + if err != nil { + return linux.ItimerVal{}, err + } + default: + return linux.ItimerVal{}, syserror.EINVAL + } + oldval, oldiv := ktime.SpecFromSetting(tm, olds) + return linux.ItimerVal{ + Value: linux.DurationToTimeval(oldval), + Interval: linux.DurationToTimeval(oldiv), + }, nil +} + // IOUsage returns the io usage of the thread. func (t *Task) IOUsage() *usage.IO { return t.ioUsage @@ -56,12 +147,6 @@ func (t *Task) SetName(name string) { t.Debugf("Set thread name to %q", name) } -// SetCPUTimer is used by setrlimit(RLIMIT_CPU) to enforce the hard and soft -// limits on CPU time used by this process. -func (tg *ThreadGroup) SetCPUTimer(l *limits.Limit) { - tg.Timer().applyCPULimits(*l) -} - // Limits implements context.Context.Limits. 
func (t *Task) Limits() *limits.LimitSet { return t.ThreadGroup().Limits() diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 7c469ec46..de3aef40d 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -241,7 +241,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { if opts.NewSignalHandlers { sh = sh.Fork() } - tg = NewThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) + tg = t.k.newThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) } cfg := &TaskConfig{ diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index f5b45fb17..65969ca9b 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -675,9 +675,6 @@ func (t *Task) exitNotifyLocked(fromPtraceDetach bool) { t.tg.ioUsage.Accumulate(t.ioUsage) t.tg.signalHandlers.mu.Lock() t.tg.tasks.Remove(t) - if t.tg.lastTimerSignalTask == t { - t.tg.lastTimerSignalTask = nil - } t.tg.tasksCount-- tc := t.tg.tasksCount t.tg.signalHandlers.mu.Unlock() diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 8dd0ef6ea..49ac933b7 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -70,10 +70,6 @@ func (t *Task) run(threadID uintptr) { // Platform.CooperativelySharesAddressSpace() == true, we give up the // AddressSpace before the task goroutine finishes executing. - // Ensure that thread group timers for execution time reflect that this - // task now exists. - t.tg.tm.kick() - // If this is a newly-started task, it should check for participation in // group stops. If this is a task resuming after restore, it was // interrupted by saving. In either case, the task is initially diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go index 49141ab74..19dcc963a 100644 --- a/pkg/sentry/kernel/task_sched.go +++ b/pkg/sentry/kernel/task_sched.go @@ -18,12 +18,15 @@ package kernel import ( "fmt" + "math/rand" "sync/atomic" "time" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" "gvisor.googlesource.com/gvisor/pkg/syserror" ) @@ -84,6 +87,33 @@ type TaskGoroutineSchedInfo struct { SysTicks uint64 } +// userTicksAt returns the extrapolated value of ts.UserTicks after +// Kernel.CPUClockNow() indicates a time of now. +// +// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is +// monotonic, this is satisfied if now is the result of a previous call to +// Kernel.CPUClockNow().) This requirement exists because otherwise a racing +// change to t.gosched can cause userTicksAt to adjust stats by too much, +// making the observed stats non-monotonic. +func (ts *TaskGoroutineSchedInfo) userTicksAt(now uint64) uint64 { + if ts.Timestamp < now && ts.State == TaskGoroutineRunningApp { + // Update stats to reflect execution since the last update. + return ts.UserTicks + (now - ts.Timestamp) + } + return ts.UserTicks +} + +// sysTicksAt returns the extrapolated value of ts.SysTicks after +// Kernel.CPUClockNow() indicates a time of now. +// +// Preconditions: As for userTicksAt. 
+func (ts *TaskGoroutineSchedInfo) sysTicksAt(now uint64) uint64 { + if ts.Timestamp < now && ts.State == TaskGoroutineRunningSys { + return ts.SysTicks + (now - ts.Timestamp) + } + return ts.SysTicks +} + // Preconditions: The caller must be running on the task goroutine. func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) { now := t.k.CPUClockNow() @@ -127,26 +157,12 @@ func (t *Task) CPUStats() usage.CPUStats { return t.cpuStatsAt(t.k.CPUClockNow()) } -// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is -// monotonic, this is satisfied if now is the result of a previous call to -// Kernel.CPUClockNow().) This requirement exists because otherwise a racing -// change to t.gosched can cause cpuStatsAt to adjust stats by too much, making -// the returned stats non-monotonic. +// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. func (t *Task) cpuStatsAt(now uint64) usage.CPUStats { tsched := t.TaskGoroutineSchedInfo() - if tsched.Timestamp < now { - // Update stats to reflect execution since the last update to - // t.gosched. - switch tsched.State { - case TaskGoroutineRunningSys: - tsched.SysTicks += now - tsched.Timestamp - case TaskGoroutineRunningApp: - tsched.UserTicks += now - tsched.Timestamp - } - } return usage.CPUStats{ - UserTime: time.Duration(tsched.UserTicks * uint64(linux.ClockTick)), - SysTime: time.Duration(tsched.SysTicks * uint64(linux.ClockTick)), + UserTime: time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)), + SysTime: time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)), VoluntarySwitches: atomic.LoadUint64(&t.yieldCount), } } @@ -162,9 +178,14 @@ func (tg *ThreadGroup) CPUStats() usage.CPUStats { // ThreadGroup has ever executed anyway. return usage.CPUStats{} } - now := tg.leader.k.CPUClockNow() + return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow()) +} + +// Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. The TaskSet mutex +// must be locked. +func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats { stats := tg.exitedCPUStats - // Account for active tasks. + // Account for live tasks. for t := tg.tasks.Front(); t != nil; t = t.Next() { stats.Accumulate(t.cpuStatsAt(now)) } @@ -182,6 +203,291 @@ func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats { return tg.childCPUStats } +// taskClock is a ktime.Clock that measures the time that a task has spent +// executing. taskClock is primarily used to implement CLOCK_THREAD_CPUTIME_ID. +// +// +stateify savable +type taskClock struct { + t *Task + + // If includeSys is true, the taskClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // taskClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. TimeUntil wouldn't change its estimation + // based on either of the clock events, so there's no event to be + // notified for. + ktime.NoClockEvents `state:"nosave"` + + // Implements ktime.Clock.WallTimeUntil. + // + // As an upper bound, a task's clock cannot advance faster than CPU + // time. It would have to execute at a rate of more than 1 task-second + // per 1 CPU-second, which isn't possible. + ktime.WallRateClock `state:"nosave"` +} + +// UserCPUClock returns a clock measuring the CPU time the task has spent +// executing application code. 
+func (t *Task) UserCPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: false} +} + +// CPUClock returns a clock measuring the CPU time the task has spent executing +// application and "kernel" code. +func (t *Task) CPUClock() ktime.Clock { + return &taskClock{t: t, includeSys: true} +} + +// Now implements ktime.Clock.Now. +func (tc *taskClock) Now() ktime.Time { + stats := tc.t.CPUStats() + if tc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// tgClock is a ktime.Clock that measures the time a thread group has spent +// executing. tgClock is primarily used to implement CLOCK_PROCESS_CPUTIME_ID. +// +// +stateify savable +type tgClock struct { + tg *ThreadGroup + + // If includeSys is true, the tgClock includes both time spent executing + // application code as well as time spent in the sentry. Otherwise, the + // tgClock includes only time spent executing application code. + includeSys bool + + // Implements waiter.Waitable. + ktime.ClockEventsQueue `state:"nosave"` +} + +// Now implements ktime.Clock.Now. +func (tgc *tgClock) Now() ktime.Time { + stats := tgc.tg.CPUStats() + if tgc.includeSys { + return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) + } + return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) +} + +// WallTimeUntil implements ktime.Clock.WallTimeUntil. +func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration { + // Thread group CPU time should not exceed wall time * live tasks, since + // task goroutines exit after the transition to TaskExitZombie in + // runExitNotify. + tgc.tg.pidns.owner.mu.RLock() + n := tgc.tg.liveTasks + tgc.tg.pidns.owner.mu.RUnlock() + if n == 0 { + if t.Before(now) { + return 0 + } + // The timer tick raced with thread group exit, after which no more + // tasks can enter the thread group. So tgc.Now() will never advance + // again. Return a large delay; the timer should be stopped long before + // it comes again anyway. + return time.Hour + } + // This is a lower bound on the amount of time that can elapse before an + // associated timer expires, so returning this value tends to result in a + // sequence of closely-spaced ticks just before timer expiry. To avoid + // this, round up to the nearest ClockTick; CPU usage measurements are + // limited to this resolution anyway. + remaining := time.Duration(t.Sub(now).Nanoseconds()/int64(n)) * time.Nanosecond + return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick +} + +// UserCPUClock returns a ktime.Clock that measures the time that a thread +// group has spent executing. +func (tg *ThreadGroup) UserCPUClock() ktime.Clock { + return &tgClock{tg: tg, includeSys: false} +} + +// CPUClock returns a ktime.Clock that measures the time that a thread group +// has spent executing, including sentry time. +func (tg *ThreadGroup) CPUClock() ktime.Clock { + return &tgClock{tg: tg, includeSys: true} +} + +type kernelCPUClockTicker struct { + k *Kernel + + // These are essentially kernelCPUClockTicker.Notify local variables that + // are cached between calls to reduce allocations. + rng *rand.Rand + tgs []*ThreadGroup +} + +func newKernelCPUClockTicker(k *Kernel) *kernelCPUClockTicker { + return &kernelCPUClockTicker{ + k: k, + rng: rand.New(rand.NewSource(rand.Int63())), + } +} + +// Notify implements ktime.TimerListener.Notify. 
+func (ticker *kernelCPUClockTicker) Notify(exp uint64) { + // Only increment cpuClock by 1 regardless of the number of expirations. + // This approximately compensates for cases where thread throttling or bad + // Go runtime scheduling prevents the kernelCPUClockTicker goroutine, and + // presumably task goroutines as well, from executing for a long period of + // time. It's also necessary to prevent CPU clocks from seeing large + // discontinuous jumps. + now := atomic.AddUint64(&ticker.k.cpuClock, 1) + + // Check thread group CPU timers. + tgs := ticker.k.tasks.Root.ThreadGroupsAppend(ticker.tgs) + for _, tg := range tgs { + if atomic.LoadUint32(&tg.cpuTimersEnabled) == 0 { + continue + } + + ticker.k.tasks.mu.RLock() + if tg.leader == nil { + // No tasks have ever run in this thread group. + ticker.k.tasks.mu.RUnlock() + continue + } + // Accumulate thread group CPU stats, and randomly select running tasks + // using reservoir sampling to receive CPU timer signals. + var virtReceiver *Task + nrVirtCandidates := 0 + var profReceiver *Task + nrProfCandidates := 0 + tgUserTime := tg.exitedCPUStats.UserTime + tgSysTime := tg.exitedCPUStats.SysTime + for t := tg.tasks.Front(); t != nil; t = t.Next() { + tsched := t.TaskGoroutineSchedInfo() + tgUserTime += time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)) + tgSysTime += time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)) + switch tsched.State { + case TaskGoroutineRunningApp: + // Considered by ITIMER_VIRT, ITIMER_PROF, and RLIMIT_CPU + // timers. + nrVirtCandidates++ + if int(randInt31n(ticker.rng, int32(nrVirtCandidates))) == 0 { + virtReceiver = t + } + fallthrough + case TaskGoroutineRunningSys: + // Considered by ITIMER_PROF and RLIMIT_CPU timers. + nrProfCandidates++ + if int(randInt31n(ticker.rng, int32(nrProfCandidates))) == 0 { + profReceiver = t + } + } + } + tgVirtNow := ktime.FromNanoseconds(tgUserTime.Nanoseconds()) + tgProfNow := ktime.FromNanoseconds((tgUserTime + tgSysTime).Nanoseconds()) + + // All of the following are standard (not real-time) signals, which are + // automatically deduplicated, so we ignore the number of expirations. + tg.signalHandlers.mu.Lock() + // It should only be possible for these timers to advance if we found + // at least one running task. + if virtReceiver != nil { + // ITIMER_VIRTUAL + newItimerVirtSetting, exp := tg.itimerVirtSetting.At(tgVirtNow) + tg.itimerVirtSetting = newItimerVirtSetting + if exp != 0 { + virtReceiver.sendSignalLocked(sigPriv(linux.SIGVTALRM), true) + } + } + if profReceiver != nil { + // ITIMER_PROF + newItimerProfSetting, exp := tg.itimerProfSetting.At(tgProfNow) + tg.itimerProfSetting = newItimerProfSetting + if exp != 0 { + profReceiver.sendSignalLocked(sigPriv(linux.SIGPROF), true) + } + // RLIMIT_CPU soft limit + newRlimitCPUSoftSetting, exp := tg.rlimitCPUSoftSetting.At(tgProfNow) + tg.rlimitCPUSoftSetting = newRlimitCPUSoftSetting + if exp != 0 { + profReceiver.sendSignalLocked(sigPriv(linux.SIGXCPU), true) + } + // RLIMIT_CPU hard limit + rlimitCPUMax := tg.limits.Get(limits.CPU).Max + if rlimitCPUMax != limits.Infinity && !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPUMax))) { + profReceiver.sendSignalLocked(sigPriv(linux.SIGKILL), true) + } + } + tg.signalHandlers.mu.Unlock() + + ticker.k.tasks.mu.RUnlock() + } + + // Retain tgs between calls to Notify to reduce allocations. + for i := range tgs { + tgs[i] = nil + } + ticker.tgs = tgs[:0] +} + +// Destroy implements ktime.TimerListener.Destroy. 
+func (ticker *kernelCPUClockTicker) Destroy() { +} + +// randInt31n returns a random integer in [0, n). +// +// randInt31n is equivalent to math/rand.Rand.int31n(), which is unexported. +// See that function for details. +func randInt31n(rng *rand.Rand, n int32) int32 { + v := rng.Uint32() + prod := uint64(v) * uint64(n) + low := uint32(prod) + if low < uint32(n) { + thresh := uint32(-n) % uint32(n) + for low < thresh { + v = rng.Uint32() + prod = uint64(v) * uint64(n) + low = uint32(prod) + } + } + return int32(prod >> 32) +} + +// NotifyRlimitCPUUpdated is called by setrlimit. +// +// Preconditions: The caller must be running on the task goroutine. +func (t *Task) NotifyRlimitCPUUpdated() { + t.k.cpuClockTicker.Atomically(func() { + t.tg.pidns.owner.mu.RLock() + defer t.tg.pidns.owner.mu.RUnlock() + t.tg.signalHandlers.mu.Lock() + defer t.tg.signalHandlers.mu.Unlock() + rlimitCPU := t.tg.limits.Get(limits.CPU) + t.tg.rlimitCPUSoftSetting = ktime.Setting{ + Enabled: rlimitCPU.Cur != limits.Infinity, + Next: ktime.FromNanoseconds((time.Duration(rlimitCPU.Cur) * time.Second).Nanoseconds()), + Period: time.Second, + } + if rlimitCPU.Max != limits.Infinity { + // Check if tg is already over the hard limit. + tgcpu := t.tg.cpuStatsAtLocked(t.k.CPUClockNow()) + tgProfNow := ktime.FromNanoseconds((tgcpu.UserTime + tgcpu.SysTime).Nanoseconds()) + if !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPU.Max))) { + t.sendSignalLocked(sigPriv(linux.SIGKILL), true) + } + } + t.tg.updateCPUTimersEnabledLocked() + }) +} + +// Preconditions: The signal mutex must be locked. +func (tg *ThreadGroup) updateCPUTimersEnabledLocked() { + rlimitCPU := tg.limits.Get(limits.CPU) + if tg.itimerVirtSetting.Enabled || tg.itimerProfSetting.Enabled || tg.rlimitCPUSoftSetting.Enabled || rlimitCPU.Max != limits.Infinity { + atomic.StoreUint32(&tg.cpuTimersEnabled, 1) + } else { + atomic.StoreUint32(&tg.cpuTimersEnabled, 0) + } +} + // StateStatus returns a string representation of the task's current state, // appropriate for /proc/[pid]/status. func (t *Task) StateStatus() string { diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index afb010f60..e2925a708 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -359,72 +359,6 @@ func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error { return tg.leader.sendSignalLocked(info, true /* group */) } -// Preconditions: The TaskSet mutex must be locked. -func (t *Task) onCPULocked(includeSys bool) bool { - // Task is exiting. - if t.exitState != TaskExitNone { - return false - } - - switch t.TaskGoroutineSchedInfo().State { - case TaskGoroutineRunningSys: - return includeSys - case TaskGoroutineRunningApp: - return true - default: - return false - } -} - -// SendTimerSignal mimics the process timer signal delivery behavior in linux: -// signals are delivered to the thread that triggers the timer expiration (see -// kernel/time/posix-cpu-timers.c:check_process_timers(). This -// means -// 1) the thread is running on cpu at the time. -// 2) a thread runs more frequently will get more of those signals. -// -// We approximate this behavior by selecting a running task in a round-robin -// fashion. Statistically, a thread running more often should have a higher -// probability to be selected. 
-func (tg *ThreadGroup) SendTimerSignal(info *arch.SignalInfo, includeSys bool) error { - tg.pidns.owner.mu.RLock() - defer tg.pidns.owner.mu.RUnlock() - tg.signalHandlers.mu.Lock() - defer tg.signalHandlers.mu.Unlock() - - // Find the next running threads. - var t *Task - if tg.lastTimerSignalTask == nil { - t = tg.tasks.Front() - } else { - t = tg.lastTimerSignalTask.Next() - } - - // Iterate from lastTimerSignalTask.Next() to the last task in the task list. - for t != nil { - if t.onCPULocked(includeSys) { - tg.lastTimerSignalTask = t - return t.sendSignalLocked(info, true /* group */) - } - t = t.Next() - } - - // t is nil when we reach here. If lastTimerSignalTask is not nil, iterate - // from Front to lastTimerSignalTask. - if tg.lastTimerSignalTask != nil { - for t := tg.tasks.Front(); t != tg.lastTimerSignalTask.Next(); t = t.Next() { - if t.onCPULocked(includeSys) { - tg.lastTimerSignalTask = t - return t.sendSignalLocked(info, true /* group */) - } - } - } - - // No running threads? Just try the leader. - tg.lastTimerSignalTask = tg.leader - return tg.leader.sendSignalLocked(info, true /* group */) -} - func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error { return t.sendSignalTimerLocked(info, group, nil) } diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 13dce08ce..dfff7b52d 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -19,6 +19,7 @@ import ( "sync/atomic" "gvisor.googlesource.com/gvisor/pkg/abi/linux" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/usage" ) @@ -59,12 +60,6 @@ type ThreadGroup struct { // pendingSignals is protected by the signal mutex. pendingSignals pendingSignals - // lastTimerSignalTask records the last task we deliver a process timer signal to. - // Please see SendTimerSignal for more details. - // - // lastTimerSignalTask is protected by the signal mutex. - lastTimerSignalTask *Task - // groupStopPhase indicates the state of a group stop in progress on the // thread group, if any. // @@ -152,14 +147,39 @@ type ThreadGroup struct { // restarted by Task.Start. liveGoroutines sync.WaitGroup `state:"nosave"` - // tm contains process timers. TimerManager fields are immutable. - tm TimerManager + timerMu sync.Mutex `state:"nosave"` + + // itimerRealTimer implements ITIMER_REAL for the thread group. + itimerRealTimer *ktime.Timer + + // itimerVirtSetting is the ITIMER_VIRTUAL setting for the thread group. + // + // itimerVirtSetting is protected by the signal mutex. + itimerVirtSetting ktime.Setting + + // itimerProfSetting is the ITIMER_PROF setting for the thread group. + // + // itimerProfSetting is protected by the signal mutex. + itimerProfSetting ktime.Setting + + // rlimitCPUSoftSetting is the setting for RLIMIT_CPU soft limit + // notifications for the thread group. + // + // rlimitCPUSoftSetting is protected by the signal mutex. + rlimitCPUSoftSetting ktime.Setting + + // cpuTimersEnabled is non-zero if itimerVirtSetting.Enabled is true, + // itimerProfSetting.Enabled is true, rlimitCPUSoftSetting.Enabled is true, + // or limits.Get(CPU) is finite. + // + // cpuTimersEnabled is protected by the signal mutex. cpuTimersEnabled is + // accessed using atomic memory operations. + cpuTimersEnabled uint32 // timers is the thread group's POSIX interval timers. nextTimerID is the // TimerID at which allocation should begin searching for an unused ID. 
// // timers and nextTimerID are protected by timerMu. - timerMu sync.Mutex `state:"nosave"` timers map[linux.TimerID]*IntervalTimer nextTimerID linux.TimerID @@ -211,11 +231,11 @@ type ThreadGroup struct { rscr atomic.Value `state:".(*RSEQCriticalRegion)"` } -// NewThreadGroup returns a new, empty thread group in PID namespace ns. The +// newThreadGroup returns a new, empty thread group in PID namespace ns. The // thread group leader will send its parent terminationSignal when it exits. // The new thread group isn't visible to the system until a task has been // created inside of it by a successful call to TaskSet.NewTask. -func NewThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup { +func (k *Kernel) newThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup { tg := &ThreadGroup{ threadGroupNode: threadGroupNode{ pidns: ns, @@ -225,7 +245,7 @@ func NewThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linu ioUsage: &usage.IO{}, limits: limits, } - tg.tm = newTimerManager(tg, monotonicClock) + tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg}) tg.timers = make(map[linux.TimerID]*IntervalTimer) tg.rscr.Store(&RSEQCriticalRegion{}) return tg @@ -249,11 +269,6 @@ func (tg *ThreadGroup) SignalHandlers() *SignalHandlers { return tg.signalHandlers } -// Timer returns tg's timers. -func (tg *ThreadGroup) Timer() *TimerManager { - return &tg.tm -} - // Limits returns tg's limits. func (tg *ThreadGroup) Limits() *limits.LimitSet { return tg.limits @@ -261,11 +276,9 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet { // release releases the thread group's resources. func (tg *ThreadGroup) release() { - // These must be done without holding the TaskSet or signal mutexes since - // timers send signals with Timer.mu locked. - - tg.tm.destroy() - + // Timers must be destroyed without holding the TaskSet or signal mutexes + // since timers send signals with Timer.mu locked. + tg.itimerRealTimer.Destroy() var its []*IntervalTimer tg.pidns.owner.mu.Lock() tg.signalHandlers.mu.Lock() @@ -292,3 +305,19 @@ func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) { } } } + +// itimerRealListener implements ktime.Listener for ITIMER_REAL expirations. +// +// +stateify savable +type itimerRealListener struct { + tg *ThreadGroup +} + +// Notify implements ktime.TimerListener.Notify. +func (l *itimerRealListener) Notify(exp uint64) { + l.tg.SendSignal(sigPriv(linux.SIGALRM)) +} + +// Destroy implements ktime.TimerListener.Destroy. +func (l *itimerRealListener) Destroy() { +} diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 3d5713106..4e3d19e97 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -243,9 +243,13 @@ func (ns *PIDNamespace) Tasks() []*Task { // ThreadGroups returns a snapshot of the thread groups in ns. func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup { + return ns.ThreadGroupsAppend(nil) +} + +// ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs. 
+func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup { ns.owner.mu.RLock() defer ns.owner.mu.RUnlock() - var tgs []*ThreadGroup for t := range ns.tids { if t == t.tg.leader { tgs = append(tgs, t.tg) diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index 1f6fed007..52e0dfba1 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -307,6 +307,12 @@ type Setting struct { // SettingFromSpec converts a (value, interval) pair to a Setting based on a // reading from c. value is interpreted as a time relative to c.Now(). func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Setting, error) { + return SettingFromSpecAt(value, interval, c.Now()) +} + +// SettingFromSpecAt converts a (value, interval) pair to a Setting. value is +// interpreted as a time relative to now. +func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (Setting, error) { if value < 0 { return Setting{}, syserror.EINVAL } @@ -315,7 +321,7 @@ func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Sett } return Setting{ Enabled: true, - Next: c.Now().Add(value), + Next: now.Add(value), Period: interval, }, nil } @@ -365,14 +371,14 @@ func ItimerspecFromSetting(now Time, s Setting) linux.Itimerspec { } } -// advancedTo returns an updated Setting and a number of expirations after -// the associated Clock indicates a time of now. +// At returns an updated Setting and a number of expirations after the +// associated Clock indicates a time of now. // -// Settings may be created by successive calls to advancedTo with decreasing +// Settings may be created by successive calls to At with decreasing // values of now (i.e. time may appear to go backward). Supporting this is // required to support non-monotonic clocks, as well as allowing // Timer.clock.Now() to be called without holding Timer.mu. -func (s Setting) advancedTo(now Time) (Setting, uint64) { +func (s Setting) At(now Time) (Setting, uint64) { if !s.Enabled { return s, 0 } @@ -519,7 +525,7 @@ func (t *Timer) Tick() { if t.paused { return } - s, exp := t.setting.advancedTo(now) + s, exp := t.setting.At(now) t.setting = s if exp > 0 { t.listener.Notify(exp) @@ -574,7 +580,7 @@ func (t *Timer) Get() (Time, Setting) { if t.paused { panic(fmt.Sprintf("Timer.Get called on paused Timer %p", t)) } - s, exp := t.setting.advancedTo(now) + s, exp := t.setting.At(now) t.setting = s if exp > 0 { t.listener.Notify(exp) @@ -607,14 +613,14 @@ func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { if t.paused { panic(fmt.Sprintf("Timer.SwapAnd called on paused Timer %p", t)) } - oldS, oldExp := t.setting.advancedTo(now) + oldS, oldExp := t.setting.At(now) if oldExp > 0 { t.listener.Notify(oldExp) } if f != nil { f() } - newS, newExp := s.advancedTo(now) + newS, newExp := s.At(now) t.setting = newS if newExp > 0 { t.listener.Notify(newExp) @@ -623,6 +629,17 @@ func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { return now, oldS } +// Atomically invokes f atomically with respect to expirations of t; that is, t +// cannot generate expirations while f is being called. +// +// Preconditions: f cannot call any Timer methods since it is called with the +// Timer mutex locked. +func (t *Timer) Atomically(f func()) { + t.mu.Lock() + defer t.mu.Unlock() + f() +} + // Preconditions: t.mu must be locked. 
func (t *Timer) resetKickerLocked(now Time) { if t.setting.Enabled { diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index df5dbe128..2167f3efe 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -15,6 +15,7 @@ package kernel import ( + "fmt" "sync" "time" @@ -277,3 +278,28 @@ func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) { func (t *Timekeeper) BootTime() ktime.Time { return t.bootTime } + +// timekeeperClock is a ktime.Clock that reads time from a +// kernel.Timekeeper-managed clock. +// +// +stateify savable +type timekeeperClock struct { + tk *Timekeeper + c sentrytime.ClockID + + // Implements ktime.Clock.WallTimeUntil. + ktime.WallRateClock `state:"nosave"` + + // Implements waiter.Waitable. (We have no ability to detect + // discontinuities from external changes to CLOCK_REALTIME). + ktime.NoClockEvents `state:"nosave"` +} + +// Now implements ktime.Clock.Now. +func (tc *timekeeperClock) Now() ktime.Time { + now, err := tc.tk.GetTime(tc.c) + if err != nil { + panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err)) + } + return ktime.FromNanoseconds(now) +} diff --git a/pkg/sentry/kernel/timer.go b/pkg/sentry/kernel/timer.go deleted file mode 100644 index 534d03d0f..000000000 --- a/pkg/sentry/kernel/timer.go +++ /dev/null @@ -1,290 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package kernel - -import ( - "fmt" - "time" - - "gvisor.googlesource.com/gvisor/pkg/abi/linux" - ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" - "gvisor.googlesource.com/gvisor/pkg/sentry/limits" - sentrytime "gvisor.googlesource.com/gvisor/pkg/sentry/time" -) - -// timekeeperClock is a ktime.Clock that reads time from a -// kernel.Timekeeper-managed clock. -// -// +stateify savable -type timekeeperClock struct { - tk *Timekeeper - c sentrytime.ClockID - - // Implements ktime.Clock.WallTimeUntil. - ktime.WallRateClock `state:"nosave"` - - // Implements waiter.Waitable. (We have no ability to detect - // discontinuities from external changes to CLOCK_REALTIME). - ktime.NoClockEvents `state:"nosave"` -} - -// Now implements ktime.Clock.Now. -func (tc *timekeeperClock) Now() ktime.Time { - now, err := tc.tk.GetTime(tc.c) - if err != nil { - panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err)) - } - return ktime.FromNanoseconds(now) -} - -// tgClock is a ktime.Clock that measures the time a thread group has spent -// executing. -// -// +stateify savable -type tgClock struct { - tg *ThreadGroup - - // If includeSys is true, the tgClock includes both time spent executing - // application code as well as time spent in the sentry. Otherwise, the - // tgClock includes only time spent executing application code. - includeSys bool - - // Implements waiter.Waitable. - ktime.ClockEventsQueue `state:"nosave"` -} - -// UserCPUClock returns a ktime.Clock that measures the time that a thread -// group has spent executing. 
-func (tg *ThreadGroup) UserCPUClock() ktime.Clock { - return tg.tm.virtClock -} - -// CPUClock returns a ktime.Clock that measures the time that a thread group -// has spent executing, including sentry time. -func (tg *ThreadGroup) CPUClock() ktime.Clock { - return tg.tm.profClock -} - -// Now implements ktime.Clock.Now. -func (tgc *tgClock) Now() ktime.Time { - stats := tgc.tg.CPUStats() - if tgc.includeSys { - return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) - } - return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) -} - -// WallTimeUntil implements ktime.Clock.WallTimeUntil. -func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration { - // The assumption here is that the time spent in this process (not matter - // virtual or prof) should not exceed wall time * active tasks, since - // Task.exitThreadGroup stops accounting as it transitions to - // TaskExitInitiated. - tgc.tg.pidns.owner.mu.RLock() - n := tgc.tg.activeTasks - tgc.tg.pidns.owner.mu.RUnlock() - if n == 0 { - if t.Before(now) { - return 0 - } - // The timer tick raced with thread group exit, after which no more - // tasks can enter the thread group. So tgc.Now() will never advance - // again. Return a large delay; the timer should be stopped long before - // it comes again anyway. - return time.Hour - } - // This is a lower bound on the amount of time that can elapse before an - // associated timer expires, so returning this value tends to result in a - // sequence of closely-spaced ticks just before timer expiry. To avoid - // this, round up to the nearest ClockTick; CPU usage measurements are - // limited to this resolution anyway. - remaining := time.Duration(int64(t.Sub(now))/int64(n)) * time.Nanosecond - return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick -} - -// taskClock is a ktime.Clock that measures the time that a task has spent -// executing. -type taskClock struct { - t *Task - - // If includeSys is true, the taskClock includes both time spent executing - // application code as well as time spent in the sentry. Otherwise, the - // taskClock includes only time spent executing application code. - includeSys bool - - // Implements waiter.Waitable. TimeUntil wouldn't change its estimation - // based on either of the clock events, so there's no event to be - // notified for. - ktime.NoClockEvents `state:"nosave"` - - // Implements ktime.Clock.WallTimeUntil. - // - // As an upper bound, a task's clock cannot advance faster than CPU - // time. It would have to execute at a rate of more than 1 task-second - // per 1 CPU-second, which isn't possible. - ktime.WallRateClock `state:"nosave"` -} - -// UserCPUClock returns a clock measuring the CPU time the task has spent -// executing application code. -func (t *Task) UserCPUClock() ktime.Clock { - return &taskClock{t: t, includeSys: false} -} - -// CPUClock returns a clock measuring the CPU time the task has spent executing -// application and "kernel" code. -func (t *Task) CPUClock() ktime.Clock { - return &taskClock{t: t, includeSys: true} -} - -// Now implements ktime.Clock.Now. -func (tc *taskClock) Now() ktime.Time { - stats := tc.t.CPUStats() - if tc.includeSys { - return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) - } - return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) -} - -// signalNotifier is a ktime.Listener that sends signals to a ThreadGroup. 
-// -// +stateify savable -type signalNotifier struct { - tg *ThreadGroup - signal linux.Signal - realTimer bool - includeSys bool -} - -// Notify implements ktime.TimerListener.Notify. -func (s *signalNotifier) Notify(exp uint64) { - // Since all signals sent using a signalNotifier are standard (not - // real-time) signals, we can ignore the number of expirations and send - // only a single signal. - if s.realTimer { - // real timer signal sent to leader. See kernel/time/itimer.c:it_real_fn - s.tg.SendSignal(sigPriv(s.signal)) - } else { - s.tg.SendTimerSignal(sigPriv(s.signal), s.includeSys) - } -} - -// Destroy implements ktime.TimerListener.Destroy. -func (s *signalNotifier) Destroy() {} - -// TimerManager is a collection of supported process cpu timers. -// -// +stateify savable -type TimerManager struct { - // Clocks used to drive thread group execution time timers. - virtClock *tgClock - profClock *tgClock - - RealTimer *ktime.Timer - VirtualTimer *ktime.Timer - ProfTimer *ktime.Timer - SoftLimitTimer *ktime.Timer - HardLimitTimer *ktime.Timer -} - -// newTimerManager returns a new instance of TimerManager. -func newTimerManager(tg *ThreadGroup, monotonicClock ktime.Clock) TimerManager { - virtClock := &tgClock{tg: tg, includeSys: false} - profClock := &tgClock{tg: tg, includeSys: true} - tm := TimerManager{ - virtClock: virtClock, - profClock: profClock, - RealTimer: ktime.NewTimer(monotonicClock, &signalNotifier{ - tg: tg, - signal: linux.SIGALRM, - realTimer: true, - includeSys: false, - }), - VirtualTimer: ktime.NewTimer(virtClock, &signalNotifier{ - tg: tg, - signal: linux.SIGVTALRM, - realTimer: false, - includeSys: false, - }), - ProfTimer: ktime.NewTimer(profClock, &signalNotifier{ - tg: tg, - signal: linux.SIGPROF, - realTimer: false, - includeSys: true, - }), - SoftLimitTimer: ktime.NewTimer(profClock, &signalNotifier{ - tg: tg, - signal: linux.SIGXCPU, - realTimer: false, - includeSys: true, - }), - HardLimitTimer: ktime.NewTimer(profClock, &signalNotifier{ - tg: tg, - signal: linux.SIGKILL, - realTimer: false, - includeSys: true, - }), - } - tm.applyCPULimits(tg.Limits().Get(limits.CPU)) - return tm -} - -// Save saves this TimerManger. - -// destroy destroys all timers. -func (tm *TimerManager) destroy() { - tm.RealTimer.Destroy() - tm.VirtualTimer.Destroy() - tm.ProfTimer.Destroy() - tm.SoftLimitTimer.Destroy() - tm.HardLimitTimer.Destroy() -} - -func (tm *TimerManager) applyCPULimits(l limits.Limit) { - tm.SoftLimitTimer.Swap(ktime.Setting{ - Enabled: l.Cur != limits.Infinity, - Next: ktime.FromNanoseconds((time.Duration(l.Cur) * time.Second).Nanoseconds()), - Period: time.Second, - }) - tm.HardLimitTimer.Swap(ktime.Setting{ - Enabled: l.Max != limits.Infinity, - Next: ktime.FromNanoseconds((time.Duration(l.Max) * time.Second).Nanoseconds()), - }) -} - -// kick is called when the number of threads in the thread group associated -// with tm increases. -func (tm *TimerManager) kick() { - tm.virtClock.Notify(ktime.ClockEventRateIncrease) - tm.profClock.Notify(ktime.ClockEventRateIncrease) -} - -// pause is to pause the timers and stop timer signal delivery. -func (tm *TimerManager) pause() { - tm.RealTimer.Pause() - tm.VirtualTimer.Pause() - tm.ProfTimer.Pause() - tm.SoftLimitTimer.Pause() - tm.HardLimitTimer.Pause() -} - -// resume is to resume the timers and continue timer signal delivery. -func (tm *TimerManager) resume() { - tm.RealTimer.Resume() - tm.VirtualTimer.Resume() - tm.ProfTimer.Resume() - tm.SoftLimitTimer.Resume() - tm.HardLimitTimer.Resume() -} |