Diffstat (limited to 'pkg/sentry/watchdog/watchdog.go')
-rw-r--r-- | pkg/sentry/watchdog/watchdog.go | 374 |
1 files changed, 374 insertions, 0 deletions
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
new file mode 100644
index 000000000..748273366
--- /dev/null
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -0,0 +1,374 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package watchdog is responsible for monitoring the sentry for tasks that may
+// potentially be stuck or looping indefinitely, causing hard-to-debug hangs in
+// the untrusted app.
+//
+// It works by periodically querying all tasks to check whether they are in user
+// mode (RunUser), kernel mode (RunSys), or blocked in the kernel (OffCPU). Tasks
+// that have been running in kernel mode for a long time in the same syscall
+// without blocking are considered stuck and are reported.
+//
+// When a stuck task is detected, the watchdog can take one of the following actions:
+//	1. LogWarning: Logs a warning message followed by a stack dump of all goroutines.
+//	   If a task continues to be stuck, the message will repeat every minute, unless
+//	   a new stuck task is detected.
+//	2. Panic: same as above, followed by panic()
+//
+package watchdog
+
+import (
+	"bytes"
+	"fmt"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/metric"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// Opts configures the watchdog.
+type Opts struct {
+	// TaskTimeout is the amount of time to allow a task to execute the
+	// same syscall without blocking before it's declared stuck.
+	TaskTimeout time.Duration
+
+	// TaskTimeoutAction indicates what action to take when a stuck task
+	// is detected.
+	TaskTimeoutAction Action
+
+	// StartupTimeout is the amount of time to allow between watchdog
+	// creation and calling watchdog.Start.
+	StartupTimeout time.Duration
+
+	// StartupTimeoutAction indicates what action to take when
+	// watchdog.Start is not called within the timeout.
+	StartupTimeoutAction Action
+}
+
+// DefaultOpts is a default set of options for the watchdog.
+var DefaultOpts = Opts{
+	// Task timeout.
+	TaskTimeout:       3 * time.Minute,
+	TaskTimeoutAction: LogWarning,
+
+	// Startup timeout.
+	StartupTimeout:       30 * time.Second,
+	StartupTimeoutAction: LogWarning,
+}
+
+// descheduleThreshold is the amount of time scheduling needs to be off before the entire wait period
+// is discounted from the task's last update time. It's set high enough that small scheduling delays won't
+// trigger it.
+const descheduleThreshold = 1 * time.Second
+
+var (
+	stuckStartup = metric.MustCreateNewUint64Metric("/watchdog/stuck_startup_detected", true /* sync */, "Incremented once on startup watchdog timeout")
+	stuckTasks   = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected")
+)
+
+// Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck.
+var stackDumpSameTaskPeriod = time.Minute
+
+// Action defines what action to take when a stuck task is detected.
+type Action int
+
+const (
+	// LogWarning logs a warning message followed by a stack trace.
+	LogWarning Action = iota
+
+	// Panic will do the same logging as LogWarning and panic().
+	Panic
+)
+
+// String returns Action's string representation.
+func (a Action) String() string {
+	switch a {
+	case LogWarning:
+		return "LogWarning"
+	case Panic:
+		return "Panic"
+	default:
+		panic(fmt.Sprintf("Invalid action: %d", a))
+	}
+}
+
+// Watchdog is the main watchdog class. It controls a goroutine that periodically
+// analyses all tasks and reports if any of them appear to be stuck.
+type Watchdog struct {
+	// Configuration options are embedded.
+	Opts
+
+	// period indicates how often to check all tasks. It's calculated based on
+	// opts.TaskTimeout.
+	period time.Duration
+
+	// k is where the tasks come from.
+	k *kernel.Kernel
+
+	// stop is used to notify that the watchdog should stop.
+	stop chan struct{}
+
+	// done is used to notify when the watchdog has stopped.
+	done chan struct{}
+
+	// offenders map contains all tasks that are currently stuck.
+	offenders map[*kernel.Task]*offender
+
+	// lastStackDump tracks the last time a stack dump was generated to prevent
+	// spamming the log.
+	lastStackDump time.Time
+
+	// lastRun is set to the last time the watchdog executed a monitoring loop.
+	lastRun ktime.Time
+
+	// mu protects the fields below.
+	mu sync.Mutex
+
+	// running is true if the watchdog is running.
+	running bool
+
+	// startCalled is true if Start has ever been called. It remains true
+	// even if Stop is called.
+	startCalled bool
+}
+
+type offender struct {
+	lastUpdateTime ktime.Time
+}
+
+// New creates a new watchdog.
+func New(k *kernel.Kernel, opts Opts) *Watchdog {
+	// 4 is arbitrary, just don't want to prolong 'TaskTimeout' too much.
+	period := opts.TaskTimeout / 4
+	w := &Watchdog{
+		Opts:      opts,
+		k:         k,
+		period:    period,
+		offenders: make(map[*kernel.Task]*offender),
+		stop:      make(chan struct{}),
+		done:      make(chan struct{}),
+	}
+
+	// Handle StartupTimeout if it exists.
+	if w.StartupTimeout > 0 {
+		log.Infof("Watchdog waiting %v for startup", w.StartupTimeout)
+		go w.waitForStart() // S/R-SAFE: watchdog is stopped during save and restarted after restore.
+	}
+
+	return w
+}
+
+// Start starts the watchdog.
+func (w *Watchdog) Start() {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	w.startCalled = true
+
+	if w.running {
+		return
+	}
+
+	if w.TaskTimeout == 0 {
+		log.Infof("Watchdog task timeout disabled")
+		return
+	}
+	w.lastRun = w.k.MonotonicClock().Now()
+
+	log.Infof("Starting watchdog, period: %v, timeout: %v, action: %v", w.period, w.TaskTimeout, w.TaskTimeoutAction)
+	go w.loop() // S/R-SAFE: watchdog is stopped during save and restarted after restore.
+	w.running = true
+}
+
+// Stop requests the watchdog to stop and waits for it.
+func (w *Watchdog) Stop() {
+	if w.TaskTimeout == 0 {
+		return
+	}
+
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	if !w.running {
+		return
+	}
+	log.Infof("Stopping watchdog")
+	w.stop <- struct{}{}
+	<-w.done
+	w.running = false
+	log.Infof("Watchdog stopped")
+}
+
+// waitForStart waits for Start to be called and takes action if it does not
+// happen within the startup timeout.
+func (w *Watchdog) waitForStart() {
+	<-time.After(w.StartupTimeout)
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	if w.startCalled {
+		// We are fine.
+		return
+	}
+
+	stuckStartup.Increment()
+
+	var buf bytes.Buffer
+	buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
+	w.doAction(w.StartupTimeoutAction, false, &buf)
+}
+
+// loop is the main watchdog routine. It only returns when 'Stop()' is called.
+func (w *Watchdog) loop() {
+	// Loop until someone stops it.
+	for {
+		select {
+		case <-w.stop:
+			w.done <- struct{}{}
+			return
+		case <-time.After(w.period):
+			w.runTurn()
+		}
+	}
+}
+
+// runTurn runs a single pass over all tasks and reports anything it finds.
+func (w *Watchdog) runTurn() {
+	// Someone needs to watch the watchdog. The call below can get stuck if there
+	// is a deadlock affecting root's PID namespace mutex. Run it in a goroutine
+	// and report if it takes too long to return.
+	var tasks []*kernel.Task
+	done := make(chan struct{})
+	go func() { // S/R-SAFE: watchdog is stopped and restarted during S/R.
+		tasks = w.k.TaskSet().Root.Tasks()
+		close(done)
+	}()
+
+	select {
+	case <-done:
+	case <-time.After(w.TaskTimeout):
+		// Report if the watchdog is not making progress.
+		// No one is watching the watchdog watcher though.
+		w.reportStuckWatchdog()
+		<-done
+	}
+
+	newOffenders := make(map[*kernel.Task]*offender)
+	newTaskFound := false
+	now := ktime.FromNanoseconds(int64(w.k.CPUClockNow() * uint64(linux.ClockTick)))
+
+	// The process may be running with a low CPU limit, making tasks appear stuck because
+	// they are starved of CPU cycles. The estimate is that tasks could have been starved
+	// since the last time the watchdog ran. If the watchdog detects that scheduling
+	// is off, it will discount the entire duration since last run from 'lastUpdateTime'.
+	discount := time.Duration(0)
+	if now.Sub(w.lastRun.Add(w.period)) > descheduleThreshold {
+		discount = now.Sub(w.lastRun)
+	}
+	w.lastRun = now
+
+	log.Infof("Watchdog starting loop, tasks: %d, discount: %v", len(tasks), discount)
+	for _, t := range tasks {
+		tsched := t.TaskGoroutineSchedInfo()
+
+		// An offender is a task running inside the kernel for longer than the specified timeout.
+		if tsched.State == kernel.TaskGoroutineRunningSys {
+			lastUpdateTime := ktime.FromNanoseconds(int64(tsched.Timestamp * uint64(linux.ClockTick)))
+			elapsed := now.Sub(lastUpdateTime) - discount
+			if elapsed > w.TaskTimeout {
+				tc, ok := w.offenders[t]
+				if !ok {
+					// New stuck task detected.
+					//
+					// Note that tasks blocked doing IO may be considered stuck in kernel,
+					// unless they are surrounded by
+					// Task.UninterruptibleSleepStart/Finish.
+					tc = &offender{lastUpdateTime: lastUpdateTime}
+					stuckTasks.Increment()
+					newTaskFound = true
+				}
+				newOffenders[t] = tc
+			}
+		}
+	}
+	if len(newOffenders) > 0 {
+		w.report(newOffenders, newTaskFound, now)
+	}
+
+	// Remember which tasks have been reported.
+	w.offenders = newOffenders
+}
+
+// report takes appropriate action when a stuck task is detected.
+func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound bool, now ktime.Time) {
+	var buf bytes.Buffer
+	buf.WriteString(fmt.Sprintf("Sentry detected %d stuck task(s):\n", len(offenders)))
+	for t, o := range offenders {
+		tid := w.k.TaskSet().Root.IDOfTask(t)
+		buf.WriteString(fmt.Sprintf("\tTask tid: %v (%#x), entered RunSys state %v ago.\n", tid, uint64(tid), now.Sub(o.lastUpdateTime)))
+	}
+
+	buf.WriteString("Search for '(*Task).run(0x..., 0x<tid>)' in the stack dump to find the offending goroutine")
+
+	// Force stack dump only if a new task is detected.
+	w.doAction(w.TaskTimeoutAction, newTaskFound, &buf)
+}
+
+func (w *Watchdog) reportStuckWatchdog() {
+	var buf bytes.Buffer
+	buf.WriteString("Watchdog goroutine is stuck")
+	w.doAction(w.TaskTimeoutAction, false, &buf)
+}
+
+// doAction will take the given action. If the action is LogWarning, the stack
+// is not always dumped to the log to prevent log flooding. "forceStack"
+// guarantees that the stack will be dumped regardless.
+func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {
+	switch action {
+	case LogWarning:
+		// Dump stack only if forced or some time has passed since the last time a
+		// stack dump was generated.
+		if !forceStack && time.Since(w.lastStackDump) < stackDumpSameTaskPeriod {
+			msg.WriteString("\n...[stack dump skipped]...")
+			log.Warningf(msg.String())
+			return
+		}
+		log.TracebackAll(msg.String())
+		w.lastStackDump = time.Now()
+
+	case Panic:
+		// Panic will skip over running tasks, which is likely the culprit here. So manually
+		// dump all stacks before panic'ing.
+		log.TracebackAll(msg.String())
+
+		// Attempt to flush metrics, timeout and move on in case metrics are stuck as well.
+		metricsEmitted := make(chan struct{}, 1)
+		go func() { // S/R-SAFE: watchdog is stopped during save and restarted after restore.
+			// Flush metrics before killing process.
+			metric.EmitMetricUpdate()
+			metricsEmitted <- struct{}{}
+		}()
+		select {
+		case <-metricsEmitted:
+		case <-time.After(1 * time.Second):
+		}
+		panic(fmt.Sprintf("%s\nStacks for running G's are skipped while panicking.", msg.String()))
+
+	default:
+		panic(fmt.Sprintf("Unknown watchdog action %v", action))
+
+	}
+}
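
Usage note: the sketch below shows how a caller might wire this API together, using only the exported surface introduced in the diff above (New, DefaultOpts, Start, Stop, and the Action values). The enclosing package, function name, and shutdown wiring are hypothetical placeholders for illustration, not gVisor's actual boot path.

	package boot // hypothetical package, for illustration only

	import (
		"gvisor.dev/gvisor/pkg/sentry/kernel"
		"gvisor.dev/gvisor/pkg/sentry/watchdog"
	)

	// startWatchdog is a hypothetical helper showing the intended lifecycle:
	// create the watchdog early (which arms StartupTimeout), call Start once
	// the kernel is ready to run tasks, and call Stop on shutdown or save.
	func startWatchdog(k *kernel.Kernel) (stop func()) {
		opts := watchdog.DefaultOpts
		opts.TaskTimeoutAction = watchdog.Panic // dump all stacks and crash instead of only logging

		w := watchdog.New(k, opts)

		// ... remaining setup runs here; if Start is not called within
		// opts.StartupTimeout, StartupTimeoutAction is taken ...

		w.Start()
		return w.Stop
	}

Note that Stop blocks until the monitoring goroutine exits (it signals w.stop and waits on w.done), which is consistent with the S/R-SAFE comments: the watchdog is stopped during save and restarted after restore.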