Diffstat (limited to 'pkg/sentry/watchdog/watchdog.go')
-rw-r--r--   pkg/sentry/watchdog/watchdog.go   305
1 file changed, 305 insertions, 0 deletions
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
new file mode 100644
index 000000000..2fc4472dd
--- /dev/null
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -0,0 +1,305 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package watchdog is responsible for monitoring the sentry for tasks that may
+// potentially be stuck or looping indefinitely, causing hard-to-debug hangs in
+// the untrusted app.
+//
+// It works by periodically querying all tasks to check whether they are in user
+// mode (RunUser), kernel mode (RunSys), or blocked in the kernel (OffCPU). Tasks
+// that have been running in kernel mode for a long time in the same syscall
+// without blocking are considered stuck and are reported.
+//
+// When a stuck task is detected, the watchdog can take one of the following actions:
+//	1. LogWarning: Logs a warning message followed by a stack dump of all goroutines.
+//	   If a task continues to be stuck, the message will repeat every minute, unless
+//	   a new stuck task is detected.
+//	2. Panic: same as above, followed by panic().
+//
+package watchdog
+
+import (
+	"bytes"
+	"fmt"
+	"sync"
+	"time"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/metric"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+)
+
+// DefaultTimeout is a reasonable timeout value for most applications.
+const DefaultTimeout = 3 * time.Minute
+
+// descheduleThreshold is the amount of time scheduling needs to be off before the entire wait period
+// is discounted from the task's last update time. It's set high enough that small scheduling delays won't
+// trigger it.
+const descheduleThreshold = 1 * time.Second
+
+var stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected")
+
+// stackDumpSameTaskPeriod is the amount of time to wait before dumping the stack to the log again
+// when the same task(s) remains stuck.
+var stackDumpSameTaskPeriod = time.Minute
+
+// Action defines what action to take when a stuck task is detected.
+type Action int
+
+const (
+	// LogWarning logs a warning message followed by a stack trace.
+	LogWarning Action = iota
+	// Panic will do the same logging as LogWarning and panic().
+	Panic
+)
+
+// String returns Action's string representation.
+func (a Action) String() string {
+	switch a {
+	case LogWarning:
+		return "LogWarning"
+	case Panic:
+		return "Panic"
+	default:
+		panic(fmt.Sprintf("Invalid action: %d", a))
+	}
+}
+
+// Watchdog is the main watchdog class. It controls a goroutine that periodically
+// analyses all tasks and reports if any of them appear to be stuck.
+type Watchdog struct {
+	// period indicates how often to check all tasks. It's calculated based on
+	// 'taskTimeout'.
+	period time.Duration
+
+	// taskTimeout is the amount of time to allow a task to execute the same syscall
+	// without blocking before it's declared stuck.
+	taskTimeout time.Duration
+
+	// timeoutAction indicates what action to take when a stuck task is detected.
+	timeoutAction Action
+
+	// k is where the tasks come from.
+	k *kernel.Kernel
+
+	// stop is used to notify the watchdog that it should stop.
+	stop chan struct{}
+
+	// done is used to notify when the watchdog has stopped.
+	done chan struct{}
+
+	// offenders map contains all tasks that are currently stuck.
+	offenders map[*kernel.Task]*offender
+
+	// lastStackDump tracks the last time a stack dump was generated to prevent
+	// spamming the log.
+	lastStackDump time.Time
+
+	// lastRun is set to the last time the watchdog executed a monitoring loop.
+	lastRun ktime.Time
+
+	// mu protects the fields below.
+	mu sync.Mutex
+
+	// started is true if the watchdog has been started before.
+	started bool
+}
+
+type offender struct {
+	lastUpdateTime ktime.Time
+}
+
+// New creates a new watchdog.
+func New(k *kernel.Kernel, taskTimeout time.Duration, a Action) *Watchdog {
+	// 4 is arbitrary, just don't want to prolong 'taskTimeout' too much.
+	period := taskTimeout / 4
+	return &Watchdog{
+		k:             k,
+		period:        period,
+		taskTimeout:   taskTimeout,
+		timeoutAction: a,
+		offenders:     make(map[*kernel.Task]*offender),
+		stop:          make(chan struct{}),
+		done:          make(chan struct{}),
+	}
+}
+
+// Start starts the watchdog.
+func (w *Watchdog) Start() {
+	if w.taskTimeout == 0 {
+		log.Infof("Watchdog disabled")
+		return
+	}
+
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	if w.started {
+		return
+	}
+
+	w.lastRun = w.k.MonotonicClock().Now()
+
+	log.Infof("Starting watchdog, period: %v, timeout: %v, action: %v", w.period, w.taskTimeout, w.timeoutAction)
+	go w.loop() // S/R-SAFE: watchdog is stopped during save and restarted after restore.
+	w.started = true
+}
+
+// Stop requests the watchdog to stop and waits for it.
+func (w *Watchdog) Stop() {
+	if w.taskTimeout == 0 {
+		return
+	}
+
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	if !w.started {
+		return
+	}
+	log.Infof("Stopping watchdog")
+	w.stop <- struct{}{}
+	<-w.done
+	w.started = false
+	log.Infof("Watchdog stopped")
+}
+
+// loop is the main watchdog routine. It only returns when 'Stop()' is called.
+func (w *Watchdog) loop() {
+	// Loop until someone stops it.
+	for {
+		select {
+		case <-w.stop:
+			w.done <- struct{}{}
+			return
+		case <-time.After(w.period):
+			w.runTurn()
+		}
+	}
+}
+
+// runTurn runs a single pass over all tasks and reports anything it finds.
+func (w *Watchdog) runTurn() {
+	// Someone needs to watch the watchdog. The call below can get stuck if there
+	// is a deadlock affecting root's PID namespace mutex. Run it in a goroutine
+	// and report if it takes too long to return.
+	var tasks []*kernel.Task
+	done := make(chan struct{})
+	go func() { // S/R-SAFE: watchdog is stopped and restarted during S/R.
+		tasks = w.k.TaskSet().Root.Tasks()
+		close(done)
+	}()
+
+	select {
+	case <-done:
+	case <-time.After(w.taskTimeout):
+		// Report if the watchdog is not making progress.
+		// No one is watching the watchdog watcher though.
+		w.reportStuckWatchdog()
+		<-done
+	}
+
+	newOffenders := make(map[*kernel.Task]*offender)
+	newTaskFound := false
+	now := ktime.FromNanoseconds(int64(w.k.CPUClockNow() * uint64(linux.ClockTick)))
+
+	// The process may be running with a low CPU limit, making tasks appear stuck
+	// because they are starved of CPU cycles.
+	// As an estimate, tasks could have been starved since the last time the
+	// watchdog ran. If the watchdog detects that scheduling is off, it will
+	// discount the entire duration since the last run from 'lastUpdateTime'.
+	discount := time.Duration(0)
+	if now.Sub(w.lastRun.Add(w.period)) > descheduleThreshold {
+		discount = now.Sub(w.lastRun)
+	}
+	w.lastRun = now
+
+	log.Infof("Watchdog starting loop, tasks: %d, discount: %v", len(tasks), discount)
+	for _, t := range tasks {
+		tsched := t.TaskGoroutineSchedInfo()
+
+		// An offender is a task running inside the kernel for longer than the specified timeout.
+		if tsched.State == kernel.TaskGoroutineRunningSys {
+			lastUpdateTime := ktime.FromNanoseconds(int64(tsched.Timestamp * uint64(linux.ClockTick)))
+			elapsed := now.Sub(lastUpdateTime) - discount
+			if elapsed > w.taskTimeout {
+				tc, ok := w.offenders[t]
+				if !ok {
+					// New stuck task detected.
+					//
+					// TODO(b/65849403): Tasks blocked doing IO may be considered stuck in kernel.
+					tc = &offender{lastUpdateTime: lastUpdateTime}
+					stuckTasks.Increment()
+					newTaskFound = true
+				}
+				newOffenders[t] = tc
+			}
+		}
+	}
+	if len(newOffenders) > 0 {
+		w.report(newOffenders, newTaskFound, now)
+	}
+
+	// Remember which tasks have been reported.
+	w.offenders = newOffenders
+}
+
+// report takes appropriate action when a stuck task is detected.
+func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound bool, now ktime.Time) {
+	var buf bytes.Buffer
+	buf.WriteString(fmt.Sprintf("Sentry detected %d stuck task(s):\n", len(offenders)))
+	for t, o := range offenders {
+		tid := w.k.TaskSet().Root.IDOfTask(t)
+		buf.WriteString(fmt.Sprintf("\tTask tid: %v (%#x), entered RunSys state %v ago.\n", tid, uint64(tid), now.Sub(o.lastUpdateTime)))
+	}
+	buf.WriteString("Search for '(*Task).run(0x..., 0x<tid>)' in the stack dump to find the offending goroutine")
+	w.onStuckTask(newTaskFound, &buf)
+}
+
+func (w *Watchdog) reportStuckWatchdog() {
+	var buf bytes.Buffer
+	buf.WriteString("Watchdog goroutine is stuck:\n")
+	w.onStuckTask(true, &buf)
+}
+
+func (w *Watchdog) onStuckTask(newTaskFound bool, buf *bytes.Buffer) {
+	switch w.timeoutAction {
+	case LogWarning:
+		// Dump the stack only if a new task is detected or if some time has passed since
+		// the last time a stack dump was generated.
+		if !newTaskFound && time.Since(w.lastStackDump) < stackDumpSameTaskPeriod {
+			buf.WriteString("\n...[stack dump skipped]...")
+			log.Warningf(buf.String())
+		} else {
+			log.TracebackAll(buf.String())
+			w.lastStackDump = time.Now()
+		}
+
+	case Panic:
+		// Panic will skip over running tasks, which is likely the culprit here. So manually
+		// dump all stacks before panic'ing.
+		log.TracebackAll(buf.String())
+
+		// Attempt to flush metrics; time out and move on in case metrics are stuck as well.
+		metricsEmitted := make(chan struct{}, 1)
+		go func() { // S/R-SAFE: watchdog is stopped during save and restarted after restore.
+			// Flush metrics before killing the process.
+			metric.EmitMetricUpdate()
+			metricsEmitted <- struct{}{}
+		}()
+		select {
+		case <-metricsEmitted:
+		case <-time.After(1 * time.Second):
+		}
+		panic("Sentry detected stuck task(s). See stack trace and message above for more details")
+	}
+}
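
For context, a minimal usage sketch of the API added above (not part of the change): it assumes an already-initialized *kernel.Kernel, and the package name, helper name, and workload callback are illustrative only. It simply shows the New/Start/Stop call sequence.

package example

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
)

// runWithWatchdog is a hypothetical helper: it creates the watchdog with the
// default timeout and the LogWarning action, starts it alongside the workload,
// and stops it on the way out (e.g. before save or shutdown).
func runWithWatchdog(k *kernel.Kernel, workload func()) {
	w := watchdog.New(k, watchdog.DefaultTimeout, watchdog.LogWarning)
	w.Start()
	defer w.Stop()

	// While the workload runs, the watchdog checks all tasks every
	// taskTimeout/4 and reports any task stuck in RunSys for too long.
	workload()
}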
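
The detection rule in runTurn is a plain time comparison; the sketch below restates it with bare durations so it can be read without the sentry types. All names and the standalone package are illustrative, not part of this change.

package example

import "time"

// descheduleDiscount mirrors the discount computed in runTurn: if the watchdog
// woke up more than a threshold later than expected (one period after its last
// run), the whole gap since the last run is discounted, so CPU starvation of
// the sandbox process itself is not misreported as stuck tasks.
func descheduleDiscount(sinceLastRun, period, threshold time.Duration) time.Duration {
	if sinceLastRun-period > threshold {
		return sinceLastRun
	}
	return 0
}

// isStuck restates the per-task check: a task is reported when the time since
// its last scheduling update, minus the discount, exceeds the task timeout.
func isStuck(sinceLastUpdate, discount, taskTimeout time.Duration) bool {
	return sinceLastUpdate-discount > taskTimeout
}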