Check in gVisor.

PiperOrigin-RevId: 194583126 Change-Id: Ica1d8821a90f74e7e745962d71801c598c652463
author: Googler <noreply@google.com> 2018-04-27 10:37:02 -0700
committer: Adin Scannell <ascannell@google.com> 2018-04-28 01:44:26 -0400
commit: d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree: 54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/kernel/task_sched.go
parent: f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)
1 files changed, 329 insertions, 0 deletions
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
new file mode 100644
index 000000000..b50139077
--- /dev/null
+++ b/pkg/sentry/kernel/task_sched.go
@@ -0,0 +1,329 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+// CPU scheduling, real and fake.
+
+import (
+	"fmt"
+	"sync/atomic"
+	"time"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// TaskGoroutineState is a coarse representation of the current execution
+// status of a kernel.Task goroutine.
+type TaskGoroutineState int
+
+const (
+	// TaskGoroutineNonexistent indicates that the task goroutine has either
+	// not yet been created by Task.Start() or has returned from Task.run().
+	// This must be the zero value for TaskGoroutineState.
+	TaskGoroutineNonexistent TaskGoroutineState = iota
+
+	// TaskGoroutineRunningSys indicates that the task goroutine is executing
+	// sentry code.
+	TaskGoroutineRunningSys
+
+	// TaskGoroutineRunningApp indicates that the task goroutine is executing
+	// application code.
+	TaskGoroutineRunningApp
+
+	// TaskGoroutineBlockedInterruptible indicates that the task goroutine is
+	// blocked in Task.block(), and hence may be woken by Task.interrupt()
+	// (e.g. due to signal delivery).
+	TaskGoroutineBlockedInterruptible
+
+	// TaskGoroutineBlockedUninterruptible indicates that the task goroutine is
+	// stopped outside of Task.block() and Task.doStop(), and hence cannot be
+	// woken by Task.interrupt().
+	TaskGoroutineBlockedUninterruptible
+
+	// TaskGoroutineStopped indicates that the task goroutine is blocked in
+	// Task.doStop(). TaskGoroutineStopped is similar to
+	// TaskGoroutineBlockedUninterruptible, but is a separate state to make it
+	// possible to determine when Task.stop is meaningful.
+	TaskGoroutineStopped
+)
+
+// TaskGoroutineSchedInfo contains task goroutine scheduling state which must
+// be read and updated atomically.
+type TaskGoroutineSchedInfo struct {
+	// Timestamp was the value of Kernel.cpuClock when this
+	// TaskGoroutineSchedInfo was last updated.
+	Timestamp uint64
+
+	// State is the current state of the task goroutine.
+	State TaskGoroutineState
+
+	// UserTicks is the amount of time the task goroutine has spent executing
+	// its associated Task's application code, in units of linux.ClockTick.
+	UserTicks uint64
+
+	// SysTicks is the amount of time the task goroutine has spent executing in
+	// the sentry, in units of linux.ClockTick.
+	SysTicks uint64
+}
+
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
+	now := t.k.CPUClockNow()
+	if t.gosched.State != TaskGoroutineRunningSys {
+		panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state))
+	}
+	t.goschedSeq.BeginWrite()
+	// This function is very hot; avoid defer.
+	t.gosched.SysTicks += now - t.gosched.Timestamp
+	t.gosched.Timestamp = now
+	t.gosched.State = state
+	t.goschedSeq.EndWrite()
+}
+
+// Preconditions: The caller must be running on the task goroutine, and leaving
+// a state indicated by a previous call to
+// t.accountTaskGoroutineEnter(state).
+func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
+	now := t.k.CPUClockNow()
+	if t.gosched.State != state {
+		panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys))
+	}
+	t.goschedSeq.BeginWrite()
+	// This function is very hot; avoid defer.
+	if state == TaskGoroutineRunningApp {
+		t.gosched.UserTicks += now - t.gosched.Timestamp
+	}
+	t.gosched.Timestamp = now
+	t.gosched.State = TaskGoroutineRunningSys
+	t.goschedSeq.EndWrite()
+}
+
+// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info.
+// Most clients should use t.CPUStats() instead.
+func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo {
+	return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched)
+}
+
+// CPUStats returns the CPU usage statistics of t.
+func (t *Task) CPUStats() usage.CPUStats {
+	return t.cpuStatsAt(t.k.CPUClockNow())
+}
+
+// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is
+// monotonic, this is satisfied if now is the result of a previous call to
+// Kernel.CPUClockNow().) This requirement exists because otherwise a racing
+// change to t.gosched can cause cpuStatsAt to adjust stats by too much, making
+// the returned stats non-monotonic.
+func (t *Task) cpuStatsAt(now uint64) usage.CPUStats {
+	tsched := t.TaskGoroutineSchedInfo()
+	if tsched.Timestamp < now {
+		// Update stats to reflect execution since the last update to
+		// t.gosched.
+		switch tsched.State {
+		case TaskGoroutineRunningSys:
+			tsched.SysTicks += now - tsched.Timestamp
+		case TaskGoroutineRunningApp:
+			tsched.UserTicks += now - tsched.Timestamp
+		}
+	}
+	return usage.CPUStats{
+		UserTime:          time.Duration(tsched.UserTicks * uint64(linux.ClockTick)),
+		SysTime:           time.Duration(tsched.SysTicks * uint64(linux.ClockTick)),
+		VoluntarySwitches: atomic.LoadUint64(&t.yieldCount),
+	}
+}
+
+// CPUStats returns the combined CPU usage statistics of all past and present
+// threads in tg.
+func (tg *ThreadGroup) CPUStats() usage.CPUStats {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+	// Hack to get a pointer to the Kernel.
+	if tg.leader == nil {
+		// Per comment on tg.leader, this is only possible if nothing in the
+		// ThreadGroup has ever executed anyway.
+		return usage.CPUStats{}
+	}
+	now := tg.leader.k.CPUClockNow()
+	stats := tg.exitedCPUStats
+	// Account for active tasks.
+	for t := tg.tasks.Front(); t != nil; t = t.Next() {
+		stats.Accumulate(t.cpuStatsAt(now))
+	}
+	return stats
+}
+
+// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return
+// resource usage statistics for all children of [tg] that have terminated and
+// been waited for. These statistics will include the resources used by
+// grandchildren, and further removed descendants, if all of the intervening
+// descendants waited on their terminated children."
+func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats {
+	tg.pidns.owner.mu.RLock()
+	defer tg.pidns.owner.mu.RUnlock()
+	return tg.childCPUStats
+}
+
+// StateStatus returns a string representation of the task's current state,
+// appropriate for /proc/[pid]/status.
+func (t *Task) StateStatus() string {
+	switch s := t.TaskGoroutineSchedInfo().State; s {
+	case TaskGoroutineNonexistent:
+		t.tg.pidns.owner.mu.RLock()
+		defer t.tg.pidns.owner.mu.RUnlock()
+		switch t.exitState {
+		case TaskExitZombie:
+			return "Z (zombie)"
+		case TaskExitDead:
+			return "X (dead)"
+		default:
+			// The task goroutine can't exit before passing through
+			// runExitNotify, so this indicates that the task has been created,
+			// but the task goroutine hasn't yet started. The Linux equivalent
+			// is struct task_struct::state == TASK_NEW
+			// (kernel/fork.c:copy_process() =>
+			// kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is
+			// masked out by TASK_REPORT for /proc/[pid]/status, leaving only
+			// TASK_RUNNING.
+			return "R (running)"
+		}
+	case TaskGoroutineRunningSys, TaskGoroutineRunningApp:
+		return "R (running)"
+	case TaskGoroutineBlockedInterruptible:
+		return "S (sleeping)"
+	case TaskGoroutineStopped:
+		t.tg.signalHandlers.mu.Lock()
+		defer t.tg.signalHandlers.mu.Unlock()
+		switch t.stop.(type) {
+		case *groupStop:
+			return "T (stopped)"
+		case *ptraceStop:
+			return "t (tracing stop)"
+		}
+		fallthrough
+	case TaskGoroutineBlockedUninterruptible:
+		// This is the name Linux uses for TASK_UNINTERRUPTIBLE and
+		// TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL):
+		// fs/proc/array.c:task_state_array.
+		return "D (disk sleep)"
+	default:
+		panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s))
+	}
+}
+
+// CPUMask returns a copy of t's allowed CPU mask.
+func (t *Task) CPUMask() sched.CPUSet {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.allowedCPUMask.Copy()
+}
+
+// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of
+// mask.
+//
+// Preconditions: mask.Size() ==
+// sched.CPUSetSize(t.Kernel().ApplicationCores()).
+func (t *Task) SetCPUMask(mask sched.CPUSet) error {
+	if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want {
+		panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want))
+	}
+
+	// Remove CPUs in mask above Kernel.applicationCores.
+	mask.ClearAbove(t.k.applicationCores)
+
+	// Ensure that at least 1 CPU is still allowed.
+	if mask.NumCPUs() == 0 {
+		return syserror.EINVAL
+	}
+
+	if t.k.useHostCores {
+		// No-op; pretend the mask was immediately changed back.
+		return nil
+	}
+
+	t.tg.pidns.owner.mu.RLock()
+	rootTID := t.tg.pidns.owner.Root.tids[t]
+	t.tg.pidns.owner.mu.RUnlock()
+
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.allowedCPUMask = mask
+	atomic.StoreInt32(&t.cpu, assignCPU(mask, rootTID))
+	return nil
+}
+
+// CPU returns the cpu id for a given task.
+func (t *Task) CPU() int32 {
+	if t.k.useHostCores {
+		return int32(hostcpu.GetCPU())
+	}
+
+	return atomic.LoadInt32(&t.cpu)
+}
+
+// assignCPU returns the virtualized CPU number for the task with global TID
+// tid and allowedCPUMask allowed.
+func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) {
+	// To pretend that threads are evenly distributed to allowed CPUs, choose n
+	// to be less than the number of CPUs in allowed ...
+	n := int(tid) % int(allowed.NumCPUs())
+	// ... then pick the nth CPU in allowed.
+	allowed.ForEachCPU(func(c uint) {
+		if n--; n == 0 {
+			cpu = int32(c)
+		}
+	})
+	return cpu
+}
+
+// Niceness returns t's niceness.
+func (t *Task) Niceness() int {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.niceness
+}
+
+// Priority returns t's priority.
+func (t *Task) Priority() int {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.niceness + 20
+}
+
+// SetNiceness sets t's niceness to n.
+func (t *Task) SetNiceness(n int) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.niceness = n
+}
+
+// NumaPolicy returns t's current numa policy.
+func (t *Task) NumaPolicy() (policy int32, nodeMask uint32) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.numaPolicy, t.numaNodeMask
+}
+
+// SetNumaPolicy sets t's numa policy.
+func (t *Task) SetNumaPolicy(policy int32, nodeMask uint32) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	t.numaPolicy = policy
+	t.numaNodeMask = nodeMask
+}
author	Googler <noreply@google.com>	2018-04-27 10:37:02 -0700
committer	Adin Scannell <ascannell@google.com>	2018-04-28 01:44:26 -0400
commit	d02b74a5dcfed4bfc8f2f8e545bca4d2afabb296 (patch)
tree	54f95eef73aee6bacbfc736fffc631be2605ed53 /pkg/sentry/kernel/task_sched.go
parent	f70210e742919f40aa2f0934a22f1c9ba6dada62 (diff)