// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package watchdog is responsible for monitoring the sentry for tasks that may // potentially be stuck or looping inderterminally causing hard to debug hungs in // the untrusted app. // // It works by periodically querying all tasks to check whether they are in user // mode (RunUser), kernel mode (RunSys), or blocked in the kernel (OffCPU). Tasks // that have been running in kernel mode for a long time in the same syscall // without blocking are considered stuck and are reported. // // When a stuck task is detected, the watchdog can take one of the following actions: // 1. LogWarning: Logs a warning message followed by a stack dump of all goroutines. // If a tasks continues to be stuck, the message will repeat every minute, unless // a new stuck task is detected // 2. Panic: same as above, followed by panic() // package watchdog import ( "bytes" "fmt" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sync" ) // Opts configures the watchdog. type Opts struct { // TaskTimeout is the amount of time to allow a task to execute the // same syscall without blocking before it's declared stuck. TaskTimeout time.Duration // TaskTimeoutAction indicates what action to take when a stuck tasks // is detected. TaskTimeoutAction Action // StartupTimeout is the amount of time to allow between watchdog // creation and calling watchdog.Start. StartupTimeout time.Duration // StartupTimeoutAction indicates what action to take when // watchdog.Start is not called within the timeout. StartupTimeoutAction Action } // DefaultOpts is a default set of options for the watchdog. var DefaultOpts = Opts{ // Task timeout. TaskTimeout: 3 * time.Minute, TaskTimeoutAction: LogWarning, // Startup timeout. StartupTimeout: 30 * time.Second, StartupTimeoutAction: LogWarning, } // descheduleThreshold is the amount of time scheduling needs to be off before the entire wait period // is discounted from task's last update time. It's set high enough that small scheduling delays won't // trigger it. const descheduleThreshold = 1 * time.Second var ( stuckStartup = metric.MustCreateNewUint64Metric("/watchdog/stuck_startup_detected", true /* sync */, "Incremented once on startup watchdog timeout") stuckTasks = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected") ) // Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck. var stackDumpSameTaskPeriod = time.Minute // Action defines what action to take when a stuck task is detected. type Action int const ( // LogWarning logs warning message followed by stack trace. LogWarning Action = iota // Panic will do the same logging as LogWarning and panic(). Panic ) // Set implements flag.Value. func (a *Action) Set(v string) error { switch v { case "log", "logwarning": *a = LogWarning case "panic": *a = Panic default: return fmt.Errorf("invalid watchdog action %q", v) } return nil } // Get implements flag.Value. func (a *Action) Get() interface{} { return *a } // String returns Action's string representation. func (a *Action) String() string { switch *a { case LogWarning: return "logWarning" case Panic: return "panic" default: panic(fmt.Sprintf("Invalid watchdog action: %d", *a)) } } // Watchdog is the main watchdog class. It controls a goroutine that periodically // analyses all tasks and reports if any of them appear to be stuck. type Watchdog struct { // Configuration options are embedded. Opts // period indicates how often to check all tasks. It's calculated based on // opts.TaskTimeout. period time.Duration // k is where the tasks come from. k *kernel.Kernel // stop is used to notify to watchdog should stop. stop chan struct{} // done is used to notify when the watchdog has stopped. done chan struct{} // offenders map contains all tasks that are currently stuck. offenders map[*kernel.Task]*offender // lastStackDump tracks the last time a stack dump was generated to prevent // spamming the log. lastStackDump time.Time // lastRun is set to the last time the watchdog executed a monitoring loop. lastRun ktime.Time // mu protects the fields below. mu sync.Mutex // running is true if the watchdog is running. running bool // startCalled is true if Start has ever been called. It remains true // even if Stop is called. startCalled bool } type offender struct { lastUpdateTime ktime.Time } // New creates a new watchdog. func New(k *kernel.Kernel, opts Opts) *Watchdog { // 4 is arbitrary, just don't want to prolong 'TaskTimeout' too much. period := opts.TaskTimeout / 4 w := &Watchdog{ Opts: opts, k: k, period: period, offenders: make(map[*kernel.Task]*offender), stop: make(chan struct{}), done: make(chan struct{}), } // Handle StartupTimeout if it exists. if w.StartupTimeout > 0 { log.Infof("Watchdog waiting %v for startup", w.StartupTimeout) go w.waitForStart() // S/R-SAFE: watchdog is stopped buring save and restarted after restore. } return w } // Start starts the watchdog. func (w *Watchdog) Start() { w.mu.Lock() defer w.mu.Unlock() w.startCalled = true if w.running { return } if w.TaskTimeout == 0 { log.Infof("Watchdog task timeout disabled") return } w.lastRun = w.k.MonotonicClock().Now() log.Infof("Starting watchdog, period: %v, timeout: %v, action: %v", w.period, w.TaskTimeout, w.TaskTimeoutAction) go w.loop() // S/R-SAFE: watchdog is stopped during save and restarted after restore. w.running = true } // Stop requests the watchdog to stop and wait for it. func (w *Watchdog) Stop() { if w.TaskTimeout == 0 { return } w.mu.Lock() defer w.mu.Unlock() if !w.running { return } log.Infof("Stopping watchdog") w.stop <- struct{}{} <-w.done w.running = false log.Infof("Watchdog stopped") } // waitForStart waits for Start to be called and takes action if it does not // happen within the startup timeout. func (w *Watchdog) waitForStart() { <-time.After(w.StartupTimeout) w.mu.Lock() defer w.mu.Unlock() if w.startCalled { // We are fine. return } stuckStartup.Increment() var buf bytes.Buffer buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout)) w.doAction(w.StartupTimeoutAction, false, &buf) } // loop is the main watchdog routine. It only returns when 'Stop()' is called. func (w *Watchdog) loop() { // Loop until someone stops it. for { select { case <-w.stop: w.done <- struct{}{} return case <-time.After(w.period): w.runTurn() } } } // runTurn runs a single pass over all tasks and reports anything it finds. func (w *Watchdog) runTurn() { // Someone needs to watch the watchdog. The call below can get stuck if there // is a deadlock affecting root's PID namespace mutex. Run it in a goroutine // and report if it takes too long to return. var tasks []*kernel.Task done := make(chan struct{}) go func() { // S/R-SAFE: watchdog is stopped and restarted during S/R. tasks = w.k.TaskSet().Root.Tasks() close(done) }() select { case <-done: case <-time.After(w.TaskTimeout): // Report if the watchdog is not making progress. // No one is watching the watchdog watcher though. w.reportStuckWatchdog() <-done } newOffenders := make(map[*kernel.Task]*offender) newTaskFound := false now := ktime.FromNanoseconds(int64(w.k.CPUClockNow() * uint64(linux.ClockTick))) // The process may be running with low CPU limit making tasks appear stuck because // are starved of CPU cycles. An estimate is that Tasks could have been starved // since the last time the watchdog run. If the watchdog detects that scheduling // is off, it will discount the entire duration since last run from 'lastUpdateTime'. discount := time.Duration(0) if now.Sub(w.lastRun.Add(w.period)) > descheduleThreshold { discount = now.Sub(w.lastRun) } w.lastRun = now log.Infof("Watchdog starting loop, tasks: %d, discount: %v", len(tasks), discount) for _, t := range tasks { tsched := t.TaskGoroutineSchedInfo() // An offender is a task running inside the kernel for longer than the specified timeout. if tsched.State == kernel.TaskGoroutineRunningSys { lastUpdateTime := ktime.FromNanoseconds(int64(tsched.Timestamp * uint64(linux.ClockTick))) elapsed := now.Sub(lastUpdateTime) - discount if elapsed > w.TaskTimeout { tc, ok := w.offenders[t] if !ok { // New stuck task detected. // // Note that tasks blocked doing IO may be considered stuck in kernel, // unless they are surrounded b // Task.UninterruptibleSleepStart/Finish. tc = &offender{lastUpdateTime: lastUpdateTime} stuckTasks.Increment() newTaskFound = true } newOffenders[t] = tc } } } if len(newOffenders) > 0 { w.report(newOffenders, newTaskFound, now) } // Remember which tasks have been reported. w.offenders = newOffenders } // report takes appropriate action when a stuck task is detected. func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound bool, now ktime.Time) { var buf bytes.Buffer buf.WriteString(fmt.Sprintf("Sentry detected %d stuck task(s):\n", len(offenders))) for t, o := range offenders { tid := w.k.TaskSet().Root.IDOfTask(t) buf.WriteString(fmt.Sprintf("\tTask tid: %v (%#x), entered RunSys state %v ago.\n", tid, uint64(tid), now.Sub(o.lastUpdateTime))) } buf.WriteString("Search for '(*Task).run(0x..., 0x<tid>)' in the stack dump to find the offending goroutine") // Force stack dump only if a new task is detected. w.doAction(w.TaskTimeoutAction, newTaskFound, &buf) } func (w *Watchdog) reportStuckWatchdog() { var buf bytes.Buffer buf.WriteString("Watchdog goroutine is stuck") w.doAction(w.TaskTimeoutAction, false, &buf) } // doAction will take the given action. If the action is LogWarning, the stack // is not always dumped to the log to prevent log flooding. "forceStack" // guarantees that the stack will be dumped regardless. func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) { switch action { case LogWarning: // Dump stack only if forced or sometime has passed since the last time a // stack dump was generated. if !forceStack && time.Since(w.lastStackDump) < stackDumpSameTaskPeriod { msg.WriteString("\n...[stack dump skipped]...") log.Warningf(msg.String()) return } log.TracebackAll(msg.String()) w.lastStackDump = time.Now() case Panic: // Panic will skip over running tasks, which is likely the culprit here. So manually // dump all stacks before panic'ing. log.TracebackAll(msg.String()) // Attempt to flush metrics, timeout and move on in case metrics are stuck as well. metricsEmitted := make(chan struct{}, 1) go func() { // S/R-SAFE: watchdog is stopped during save and restarted after restore. // Flush metrics before killing process. metric.EmitMetricUpdate() metricsEmitted <- struct{}{} }() select { case <-metricsEmitted: case <-time.After(1 * time.Second): } panic(fmt.Sprintf("%s\nStack for running G's are skipped while panicking.", msg.String())) default: panic(fmt.Sprintf("Unknown watchdog action %v", action)) } }