Diffstat (limited to 'pkg/sentry/watchdog/watchdog.go')
-rw-r--r-- | pkg/sentry/watchdog/watchdog.go | 374 |
1 files changed, 374 insertions, 0 deletions
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
new file mode 100644
index 000000000..748273366
--- /dev/null
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -0,0 +1,374 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package watchdog is responsible for monitoring the sentry for tasks that may
+// potentially be stuck or looping indefinitely, causing hard-to-debug hangs in
+// the untrusted app.
+//
+// It works by periodically querying all tasks to check whether they are in user
+// mode (RunUser), kernel mode (RunSys), or blocked in the kernel (OffCPU). Tasks
+// that have been running in kernel mode for a long time in the same syscall
+// without blocking are considered stuck and are reported.
+//
+// When a stuck task is detected, the watchdog can take one of the following actions:
+//	1. LogWarning: Logs a warning message followed by a stack dump of all goroutines.
+//	   If a task continues to be stuck, the message will repeat every minute, unless
+//	   a new stuck task is detected.
+//	2. Panic: same as above, followed by panic()
+//
+package watchdog
+
+import (
+	"bytes"
+	"fmt"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/metric"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// Opts configures the watchdog.
+type Opts struct {
+	// TaskTimeout is the amount of time to allow a task to execute the
+	// same syscall without blocking before it's declared stuck.
+	TaskTimeout time.Duration
+
+	// TaskTimeoutAction indicates what action to take when a stuck task
+	// is detected.
+	TaskTimeoutAction Action
+
+	// StartupTimeout is the amount of time to allow between watchdog
+	// creation and calling watchdog.Start.
+	StartupTimeout time.Duration
+
+	// StartupTimeoutAction indicates what action to take when
+	// watchdog.Start is not called within the timeout.
+	StartupTimeoutAction Action
+}
+
+// DefaultOpts is a default set of options for the watchdog.
+var DefaultOpts = Opts{
+	// Task timeout.
+	TaskTimeout:       3 * time.Minute,
+	TaskTimeoutAction: LogWarning,
+
+	// Startup timeout.
+	StartupTimeout:       30 * time.Second,
+	StartupTimeoutAction: LogWarning,
+}
+
+// descheduleThreshold is the amount of time scheduling needs to be off before the entire wait period
+// is discounted from the task's last update time. It's set high enough that small scheduling delays won't
+// trigger it.
+const descheduleThreshold = 1 * time.Second
+
+var (
+	stuckStartup = metric.MustCreateNewUint64Metric("/watchdog/stuck_startup_detected", true /* sync */, "Incremented once on startup watchdog timeout")
+	stuckTasks   = metric.MustCreateNewUint64Metric("/watchdog/stuck_tasks_detected", true /* sync */, "Cumulative count of stuck tasks detected")
+)
+
+// Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck.
+var stackDumpSameTaskPeriod = time.Minute
+
+// Action defines what action to take when a stuck task is detected.
+type Action int
+
+const (
+	// LogWarning logs a warning message followed by a stack trace.
+	LogWarning Action = iota
+
+	// Panic will do the same logging as LogWarning and panic().
+	Panic
+)
+
+// String returns Action's string representation.
+func (a Action) String() string {
+	switch a {
+	case LogWarning:
+		return "LogWarning"
+	case Panic:
+		return "Panic"
+	default:
+		panic(fmt.Sprintf("Invalid action: %d", a))
+	}
+}
+
+// Watchdog is the main watchdog class. It controls a goroutine that periodically
+// analyses all tasks and reports if any of them appear to be stuck.
+type Watchdog struct {
+	// Configuration options are embedded.
+	Opts
+
+	// period indicates how often to check all tasks. It's calculated based on
+	// opts.TaskTimeout.
+	period time.Duration
+
+	// k is where the tasks come from.
+	k *kernel.Kernel
+
+	// stop is used to notify that the watchdog should stop.
+	stop chan struct{}
+
+	// done is used to notify when the watchdog has stopped.
+	done chan struct{}
+
+	// offenders map contains all tasks that are currently stuck.
+	offenders map[*kernel.Task]*offender
+
+	// lastStackDump tracks the last time a stack dump was generated to prevent
+	// spamming the log.
+	lastStackDump time.Time
+
+	// lastRun is set to the last time the watchdog executed a monitoring loop.
+	lastRun ktime.Time
+
+	// mu protects the fields below.
+	mu sync.Mutex
+
+	// running is true if the watchdog is running.
+	running bool
+
+	// startCalled is true if Start has ever been called. It remains true
+	// even if Stop is called.
+	startCalled bool
+}
+
+type offender struct {
+	lastUpdateTime ktime.Time
+}
+
+// New creates a new watchdog.
+func New(k *kernel.Kernel, opts Opts) *Watchdog {
+	// 4 is arbitrary, just don't want to prolong 'TaskTimeout' too much.
+	period := opts.TaskTimeout / 4
+	w := &Watchdog{
+		Opts:      opts,
+		k:         k,
+		period:    period,
+		offenders: make(map[*kernel.Task]*offender),
+		stop:      make(chan struct{}),
+		done:      make(chan struct{}),
+	}
+
+	// Handle StartupTimeout if it exists.
+	if w.StartupTimeout > 0 {
+		log.Infof("Watchdog waiting %v for startup", w.StartupTimeout)
+		go w.waitForStart() // S/R-SAFE: watchdog is stopped during save and restarted after restore.
+	}
+
+	return w
+}
+
+// Start starts the watchdog.
+func (w *Watchdog) Start() {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	w.startCalled = true
+
+	if w.running {
+		return
+	}
+
+	if w.TaskTimeout == 0 {
+		log.Infof("Watchdog task timeout disabled")
+		return
+	}
+	w.lastRun = w.k.MonotonicClock().Now()
+
+	log.Infof("Starting watchdog, period: %v, timeout: %v, action: %v", w.period, w.TaskTimeout, w.TaskTimeoutAction)
+	go w.loop() // S/R-SAFE: watchdog is stopped during save and restarted after restore.
+	w.running = true
+}
+
+// Stop requests the watchdog to stop and waits for it.
+func (w *Watchdog) Stop() {
+	if w.TaskTimeout == 0 {
+		return
+	}
+
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	if !w.running {
+		return
+	}
+	log.Infof("Stopping watchdog")
+	w.stop <- struct{}{}
+	<-w.done
+	w.running = false
+	log.Infof("Watchdog stopped")
+}
+
+// waitForStart waits for Start to be called and takes action if it does not
+// happen within the startup timeout.
+func (w *Watchdog) waitForStart() {
+	<-time.After(w.StartupTimeout)
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	if w.startCalled {
+		// We are fine.
+		return
+	}
+
+	stuckStartup.Increment()
+
+	var buf bytes.Buffer
+	buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
+	w.doAction(w.StartupTimeoutAction, false, &buf)
+}
+
+// loop is the main watchdog routine. It only returns when 'Stop()' is called.
+func (w *Watchdog) loop() {
+	// Loop until someone stops it.
+	for {
+		select {
+		case <-w.stop:
+			w.done <- struct{}{}
+			return
+		case <-time.After(w.period):
+			w.runTurn()
+		}
+	}
+}
+
+// runTurn runs a single pass over all tasks and reports anything it finds.
+func (w *Watchdog) runTurn() {
+	// Someone needs to watch the watchdog. The call below can get stuck if there
+	// is a deadlock affecting root's PID namespace mutex. Run it in a goroutine
+	// and report if it takes too long to return.
+	var tasks []*kernel.Task
+	done := make(chan struct{})
+	go func() { // S/R-SAFE: watchdog is stopped and restarted during S/R.
+		tasks = w.k.TaskSet().Root.Tasks()
+		close(done)
+	}()
+
+	select {
+	case <-done:
+	case <-time.After(w.TaskTimeout):
+		// Report if the watchdog is not making progress.
+		// No one is watching the watchdog watcher though.
+		w.reportStuckWatchdog()
+		<-done
+	}
+
+	newOffenders := make(map[*kernel.Task]*offender)
+	newTaskFound := false
+	now := ktime.FromNanoseconds(int64(w.k.CPUClockNow() * uint64(linux.ClockTick)))
+
+	// The process may be running with a low CPU limit, making tasks appear stuck because
+	// they are starved of CPU cycles. The estimate is that tasks could have been starved
+	// since the last time the watchdog ran. If the watchdog detects that scheduling
+	// is off, it will discount the entire duration since last run from 'lastUpdateTime'.
+	discount := time.Duration(0)
+	if now.Sub(w.lastRun.Add(w.period)) > descheduleThreshold {
+		discount = now.Sub(w.lastRun)
+	}
+	w.lastRun = now
+
+	log.Infof("Watchdog starting loop, tasks: %d, discount: %v", len(tasks), discount)
+	for _, t := range tasks {
+		tsched := t.TaskGoroutineSchedInfo()
+
+		// An offender is a task running inside the kernel for longer than the specified timeout.
+		if tsched.State == kernel.TaskGoroutineRunningSys {
+			lastUpdateTime := ktime.FromNanoseconds(int64(tsched.Timestamp * uint64(linux.ClockTick)))
+			elapsed := now.Sub(lastUpdateTime) - discount
+			if elapsed > w.TaskTimeout {
+				tc, ok := w.offenders[t]
+				if !ok {
+					// New stuck task detected.
+					//
+					// Note that tasks blocked doing IO may be considered stuck in kernel,
+					// unless they are surrounded by
+					// Task.UninterruptibleSleepStart/Finish.
+					tc = &offender{lastUpdateTime: lastUpdateTime}
+					stuckTasks.Increment()
+					newTaskFound = true
+				}
+				newOffenders[t] = tc
+			}
+		}
+	}
+	if len(newOffenders) > 0 {
+		w.report(newOffenders, newTaskFound, now)
+	}
+
+	// Remember which tasks have been reported.
+	w.offenders = newOffenders
+}
+
+// report takes appropriate action when a stuck task is detected.
+func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound bool, now ktime.Time) {
+	var buf bytes.Buffer
+	buf.WriteString(fmt.Sprintf("Sentry detected %d stuck task(s):\n", len(offenders)))
+	for t, o := range offenders {
+		tid := w.k.TaskSet().Root.IDOfTask(t)
+		buf.WriteString(fmt.Sprintf("\tTask tid: %v (%#x), entered RunSys state %v ago.\n", tid, uint64(tid), now.Sub(o.lastUpdateTime)))
+	}
+
+	buf.WriteString("Search for '(*Task).run(0x..., 0x<tid>)' in the stack dump to find the offending goroutine")
+
+	// Force stack dump only if a new task is detected.
+	w.doAction(w.TaskTimeoutAction, newTaskFound, &buf)
+}
+
+func (w *Watchdog) reportStuckWatchdog() {
+	var buf bytes.Buffer
+	buf.WriteString("Watchdog goroutine is stuck")
+	w.doAction(w.TaskTimeoutAction, false, &buf)
+}
+
+// doAction will take the given action. If the action is LogWarning, the stack
+// is not always dumped to the log to prevent log flooding. "forceStack"
+// guarantees that the stack will be dumped regardless.
+func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {
+	switch action {
+	case LogWarning:
+		// Dump stack only if forced or some time has passed since the last time a
+		// stack dump was generated.
+		if !forceStack && time.Since(w.lastStackDump) < stackDumpSameTaskPeriod {
+			msg.WriteString("\n...[stack dump skipped]...")
+			log.Warningf(msg.String())
+			return
+		}
+		log.TracebackAll(msg.String())
+		w.lastStackDump = time.Now()
+
+	case Panic:
+		// Panic will skip over running tasks, which is likely the culprit here. So manually
+		// dump all stacks before panic'ing.
+		log.TracebackAll(msg.String())
+
+		// Attempt to flush metrics, timeout and move on in case metrics are stuck as well.
+		metricsEmitted := make(chan struct{}, 1)
+		go func() { // S/R-SAFE: watchdog is stopped during save and restarted after restore.
+			// Flush metrics before killing process.
+			metric.EmitMetricUpdate()
+			metricsEmitted <- struct{}{}
+		}()
+		select {
+		case <-metricsEmitted:
+		case <-time.After(1 * time.Second):
+		}
+		panic(fmt.Sprintf("%s\nStacks for running G's are skipped while panicking.", msg.String()))
+
+	default:
+		panic(fmt.Sprintf("Unknown watchdog action %v", action))
+
+	}
+}
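
Usage note: the sketch below shows how a caller might wire this API together, using only the exported surface introduced in the diff above (New, DefaultOpts, Start, Stop, and the Action values). The enclosing package, function name, and shutdown wiring are hypothetical placeholders for illustration, not gVisor's actual boot path.

	package boot // hypothetical package, for illustration only

	import (
		"gvisor.dev/gvisor/pkg/sentry/kernel"
		"gvisor.dev/gvisor/pkg/sentry/watchdog"
	)

	// startWatchdog is a hypothetical helper showing the intended lifecycle:
	// create the watchdog early (which arms StartupTimeout), call Start once
	// the kernel is ready to run tasks, and call Stop on shutdown or save.
	func startWatchdog(k *kernel.Kernel) (stop func()) {
		opts := watchdog.DefaultOpts
		opts.TaskTimeoutAction = watchdog.Panic // dump all stacks and crash instead of only logging

		w := watchdog.New(k, opts)

		// ... remaining setup runs here; if Start is not called within
		// opts.StartupTimeout, StartupTimeoutAction is taken ...

		w.Start()
		return w.Stop
	}

Note that Stop blocks until the monitoring goroutine exits (it signals w.stop and waits on w.done), which is consistent with the S/R-SAFE comments: the watchdog is stopped during save and restarted after restore.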