Stub out basic `runsc events --stat` CPU functionality

Because we lack gVisor-internal cgroups, we take the CPU usage of the entire pod and divide it proportionally according to sentry-internal usage stats. This fixes `kubectl top pods`, which gets a pod's CPU usage by summing the usage of its containers. Addresses #172. PiperOrigin-RevId: 355229833
author: Kevin Krakauer <krakauer@google.com> 2021-02-02 12:45:25 -0800
committer: gVisor bot <gvisor-bot@google.com> 2021-02-02 12:47:23 -0800
commit: 5f7bf3152652d36903f9659688321ae7c42995d0 (patch)
tree: 374c56830303dc412894baa4edfd04bcda4cda74
parent: f884ea13b713143ff9978092ddb352c159346167 (diff)
9 files changed, 205 insertions, 48 deletions
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 1d88db12f..de7a0f3ab 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -404,3 +404,16 @@ func ttyName(tty *kernel.TTY) string {
 	}
 	return fmt.Sprintf("pts/%d", tty.Index)
 }
+
+// ContainerUsage returns the CPU time, in nanoseconds, used by each container.
+func ContainerUsage(kr *kernel.Kernel) map[string]uint64 {
+	usageByCID := make(map[string]uint64)
+	for _, group := range kr.TaskSet().Root.ThreadGroups() {
+		// Count reaped children's CPU time towards the group's total.
+		id := group.Leader().ContainerID()
+		total := group.CPUStats()
+		total.Accumulate(group.JoinedChildCPUStats())
+		usageByCID[id] += uint64(total.UserTime.Nanoseconds()) + uint64(total.SysTime.Nanoseconds())
+	}
+	return usageByCID
+}
diff --git a/runsc/boot/events.go b/runsc/boot/events.go
index 422f4da00..0814b2a69 100644
--- a/runsc/boot/events.go
+++ b/runsc/boot/events.go
@@ -15,21 +15,30 @@
 package boot
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 )
 
+// EventOut is the result of the Event command: an Event plus per-container CPU usage.
+type EventOut struct {
+	Event Event `json:"event"`
+
+	// ContainerUsage maps each container ID to its total CPU usage, in nanoseconds.
+	ContainerUsage map[string]uint64 `json:"containerUsage"`
+}
+
 // Event struct for encoding the event data to JSON. Corresponds to runc's
 // main.event struct.
 type Event struct {
-	Type string      `json:"type"`
-	ID   string      `json:"id"`
-	Data interface{} `json:"data,omitempty"`
+	Type string `json:"type"`
+	ID   string `json:"id"`
+	Data Stats  `json:"data"`
 }
 
 // Stats is the runc specific stats structure for stability when encoding and
 // decoding stats.
 type Stats struct {
+	CPU    CPU    `json:"cpu"`
 	Memory Memory `json:"memory"`
 	Pids   Pids   `json:"pids"`
 }
@@ -58,24 +67,42 @@ type Memory struct {
 	Raw       map[string]uint64 `json:"raw,omitempty"`
 }
 
-// Event gets the events from the container.
-func (cm *containerManager) Event(_ *struct{}, out *Event) error {
-	stats := &Stats{}
-	stats.populateMemory(cm.l.k)
-	stats.populatePIDs(cm.l.k)
-	*out = Event{Type: "stats", Data: stats}
-	return nil
+// CPU contains stats on the CPU; it is the "cpu" section of Stats.
+type CPU struct {
+	Usage CPUUsage `json:"usage"`
+}
+
+// CPUUsage contains stats on CPU usage. Total is in nanoseconds; presumably the rest are too (TODO confirm).
+type CPUUsage struct {
+	Kernel uint64   `json:"kernel,omitempty"`
+	User   uint64   `json:"user,omitempty"`
+	Total  uint64   `json:"total,omitempty"`
+	PerCPU []uint64 `json:"percpu,omitempty"`
 }
 
-func (s *Stats) populateMemory(k *kernel.Kernel) {
-	mem := k.MemoryFile()
+// Event gets the events from the container.
+func (cm *containerManager) Event(_ *struct{}, out *EventOut) error {
+	*out = EventOut{
+		Event: Event{
+			Type: "stats",
+		},
+	}
+
+	// Memory usage.
+	// TODO(gvisor.dev/issue/172): Per-container accounting.
+	mem := cm.l.k.MemoryFile()
 	mem.UpdateUsage()
 	_, totalUsage := usage.MemoryAccounting.Copy()
-	s.Memory.Usage = MemoryEntry{
+	out.Event.Data.Memory.Usage = MemoryEntry{
 		Usage: totalUsage,
 	}
-}
 
-func (s *Stats) populatePIDs(k *kernel.Kernel) {
-	s.Pids.Current = uint64(len(k.TaskSet().Root.ThreadGroups()))
+	// PIDs.
+	// TODO(gvisor.dev/issue/172): Per-container accounting.
+	out.Event.Data.Pids.Current = uint64(len(cm.l.k.TaskSet().Root.ThreadGroups()))
+
+	// CPU usage by container.
+	out.ContainerUsage = control.ContainerUsage(cm.l.k)
+
+	return nil
 }
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index d37528ee7..77a7c530b 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -102,7 +102,7 @@ type containerInfo struct {
 	goferFDs []*fd.FD
 }
 
-// Loader keeps state needed to start the kernel and run the container..
+// Loader keeps state needed to start the kernel and run the container.
 type Loader struct {
 	// k is the kernel.
 	k *kernel.Kernel
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
index e9ae59a92..797c1c2bc 100644
--- a/runsc/cgroup/cgroup.go
+++ b/runsc/cgroup/cgroup.go
@@ -281,8 +281,13 @@ func New(spec *specs.Spec) (*Cgroup, error) {
 	if spec.Linux == nil || spec.Linux.CgroupsPath == "" {
 		return nil, nil
 	}
+	return NewFromPath(spec.Linux.CgroupsPath)
+}
+
+// NewFromPath creates a new Cgroup instance for the given cgroups path.
+func NewFromPath(cgroupsPath string) (*Cgroup, error) {
 	var parents map[string]string
-	if !filepath.IsAbs(spec.Linux.CgroupsPath) {
+	if !filepath.IsAbs(cgroupsPath) {
 		var err error
 		parents, err = LoadPaths("self")
 		if err != nil {
@@ -291,7 +296,7 @@ func New(spec *specs.Spec) (*Cgroup, error) {
 	}
 	own := make(map[string]bool)
 	return &Cgroup{
-		Name:    spec.Linux.CgroupsPath,
+		Name:    cgroupsPath,
 		Parents: parents,
 		Own:     own,
 	}, nil
@@ -389,6 +394,9 @@ func (c *Cgroup) Join() (func(), error) {
 	undo = func() {
 		for _, path := range undoPaths {
 			log.Debugf("Restoring cgroup %q", path)
+			// Writing the value 0 to a cgroup.procs file causes
+			// the writing process to be moved to the corresponding
+			// cgroup. - cgroups(7).
 			if err := setValue(path, "cgroup.procs", "0"); err != nil {
 				log.Warningf("Error restoring cgroup %q: %v", path, err)
 			}
@@ -399,6 +407,9 @@ func (c *Cgroup) Join() (func(), error) {
 	for key, cfg := range controllers {
 		path := c.makePath(key)
 		log.Debugf("Joining cgroup %q", path)
+		// Writing the value 0 to a cgroup.procs file causes the
+		// writing process to be moved to the corresponding cgroup.
+		// - cgroups(7).
 		if err := setValue(path, "cgroup.procs", "0"); err != nil {
 			if cfg.optional && os.IsNotExist(err) {
 				continue
@@ -426,6 +437,16 @@ func (c *Cgroup) CPUQuota() (float64, error) {
 	return float64(quota) / float64(period), nil
 }
 
+// CPUUsage reads the cgroup's total CPU usage from 'cpuacct/cpuacct.usage'.
+func (c *Cgroup) CPUUsage() (uint64, error) {
+	raw, err := getValue(c.makePath("cpuacct"), "cpuacct.usage")
+	if err != nil {
+		return 0, err
+	}
+	trimmed := strings.TrimSpace(raw)
+	return strconv.ParseUint(trimmed, 10, 64)
+}
+
 // NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'.
 func (c *Cgroup) NumCPU() (int, error) {
 	path := c.makePath("cpuset")
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
index 75b0aac8d..06f00e8e7 100644
--- a/runsc/cmd/events.go
+++ b/runsc/cmd/events.go
@@ -93,9 +93,9 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa
 
 		// err must be preserved because it is used below when breaking
 		// out of the loop.
-		b, err := json.Marshal(ev)
+		b, err := json.Marshal(ev.Event)
 		if err != nil {
-			log.Warningf("Error while marshalling event %v: %v", ev, err)
+			log.Warningf("Error while marshalling event %v: %v", ev.Event, err)
 		} else {
 			os.Stdout.Write(b)
 		}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 5a0f8d5dc..aae64ae1c 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -486,12 +486,20 @@ func (c *Container) Execute(args *control.ExecArgs) (int32, error) {
 }
 
 // Event returns events for the container.
-func (c *Container) Event() (*boot.Event, error) {
+func (c *Container) Event() (*boot.EventOut, error) {
 	log.Debugf("Getting events for container, cid: %s", c.ID)
 	if err := c.requireStatus("get events for", Created, Running, Paused); err != nil {
 		return nil, err
 	}
-	return c.Sandbox.Event(c.ID)
+	event, err := c.Sandbox.Event(c.ID)
+	if err != nil {
+		return nil, err
+	}
+
+	// Some stats can utilize host cgroups for accuracy.
+	c.populateStats(event)
+
+	return event, nil
 }
 
 // SandboxPid returns the Pid of the sandbox the container is running in, or -1 if the
@@ -1110,3 +1118,54 @@ func setOOMScoreAdj(pid int, scoreAdj int) error {
 	}
 	return nil
 }
+
+// populateStats populates event with stats estimates based on cgroups and the
+// sentry's accounting.
+// TODO(gvisor.dev/issue/172): This is an estimation; we should do more
+// detailed accounting.
+func (c *Container) populateStats(event *boot.EventOut) {
+	// The events command, when run for all running containers, should
+	// account for the full cgroup CPU usage. We split cgroup usage
+	// proportionally according to the sentry-internal usage measurements,
+	// only counting Running containers.
+	log.Debugf("event.ContainerUsage: %v", event.ContainerUsage)
+	var containerUsage, allContainersUsage uint64
+	for ID, usage := range event.ContainerUsage {
+		allContainersUsage += usage
+		if ID == c.ID {
+			containerUsage = usage
+		}
+	}
+
+	cgroup, err := c.Sandbox.FindCgroup()
+	if err != nil {
+		// No cgroup, so rely purely on the sentry's accounting.
+		log.Warningf("events: no cgroups")
+		event.Event.Data.CPU.Usage.Total = containerUsage
+		return
+	}
+
+	// Get the host cgroup CPU usage.
+	cgroupsUsage, err := cgroup.CPUUsage()
+	if err != nil {
+		// No cgroup usage, so rely purely on the sentry's accounting.
+		log.Warningf("events: failed when getting cgroup CPU usage for container: %v", err)
+		event.Event.Data.CPU.Usage.Total = containerUsage
+		return
+	}
+
+	// If the sentry reports no CPU usage, fall back on cgroups and split
+	// usage equally across containers. With no containers, Total is left as is.
+	if allContainersUsage == 0 {
+		log.Warningf("events: no sentry CPU usage reported")
+		if n := uint64(len(event.ContainerUsage)); n > 0 {
+			event.Event.Data.CPU.Usage.Total = cgroupsUsage / n
+		}
+		return
+	}
+
+	log.Debugf("events: container usage: %d, cgroups usage: %d, all containers usage: %d", containerUsage, cgroupsUsage, allContainersUsage)
+	// Scaling can easily overflow a uint64 (e.g. a containerUsage and
+	// cgroupsUsage of 16 seconds each will overflow), so use floats.
+	event.Event.Data.CPU.Usage.Total = uint64(float64(containerUsage) * (float64(cgroupsUsage) / float64(allContainersUsage)))
+}
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 75fdcf4cc..173332cc2 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -15,7 +15,6 @@
 package container
 
 import (
-	"encoding/json"
 	"fmt"
 	"io/ioutil"
 	"math"
@@ -322,8 +321,8 @@ func TestMultiContainerWait(t *testing.T) {
 	}
 }
 
-// TestExecWait ensures what we can wait containers and individual processes in the
-// sandbox that have already exited.
+// TestExecWait ensures that we can wait on containers and individual processes
+// in the sandbox that have already exited.
 func TestExecWait(t *testing.T) {
 	rootDir, cleanup, err := testutil.SetupRootDir()
 	if err != nil {
@@ -1743,8 +1742,9 @@ func TestMultiContainerEvent(t *testing.T) {
 
 	// Setup the containers.
 	sleep := []string{"/bin/sleep", "100"}
+	busy := []string{"/bin/bash", "-c", "i=0 ; while true ; do (( i += 1 )) ; done"}
 	quick := []string{"/bin/true"}
-	podSpec, ids := createSpecs(sleep, sleep, quick)
+	podSpec, ids := createSpecs(sleep, busy, quick)
 	containers, cleanup, err := startContainers(conf, podSpec, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -1755,37 +1755,58 @@ func TestMultiContainerEvent(t *testing.T) {
 		t.Logf("Running containerd %s", cont.ID)
 	}
 
-	// Wait for last container to stabilize the process count that is checked
-	// further below.
+	// Wait for last container to stabilize the process count that is
+	// checked further below.
 	if ws, err := containers[2].Wait(); err != nil || ws != 0 {
 		t.Fatalf("Container.Wait, status: %v, err: %v", ws, err)
 	}
+	expectedPL := []*control.Process{
+		newProcessBuilder().Cmd("sleep").Process(),
+	}
+	if err := waitForProcessList(containers[0], expectedPL); err != nil {
+		t.Errorf("failed to wait for sleep to start: %v", err)
+	}
+	expectedPL = []*control.Process{
+		newProcessBuilder().Cmd("bash").Process(),
+	}
+	if err := waitForProcessList(containers[1], expectedPL); err != nil {
+		t.Errorf("failed to wait for bash to start: %v", err)
+	}
 
 	// Check events for running containers.
+	var prevUsage uint64
 	for _, cont := range containers[:2] {
-		evt, err := cont.Event()
+		ret, err := cont.Event()
 		if err != nil {
 			t.Errorf("Container.Events(): %v", err)
 		}
+		evt := ret.Event
 		if want := "stats"; evt.Type != want {
-			t.Errorf("Wrong event type, want: %s, got :%s", want, evt.Type)
+			t.Errorf("Wrong event type, want: %s, got: %s", want, evt.Type)
 		}
 		if cont.ID != evt.ID {
-			t.Errorf("Wrong container ID, want: %s, got :%s", cont.ID, evt.ID)
+			t.Errorf("Wrong container ID, want: %s, got: %s", cont.ID, evt.ID)
 		}
-		// Event.Data is an interface, so it comes from the wire was
-		// map[string]string. Marshal and unmarshall again to the correc type.
-		data, err := json.Marshal(evt.Data)
-		if err != nil {
-			t.Fatalf("invalid event data: %v", err)
+		// One process per remaining container.
+		if got, want := evt.Data.Pids.Current, uint64(2); got != want {
+			t.Errorf("Wrong number of PIDs, want: %d, got: %d", want, got)
 		}
-		var stats boot.Stats
-		if err := json.Unmarshal(data, &stats); err != nil {
-			t.Fatalf("invalid event data: %v", err)
+
+		// Both remaining containers should have nonzero usage, and
+		// 'busy' should have higher usage than 'sleep'.
+		usage := evt.Data.CPU.Usage.Total
+		if usage == 0 {
+			t.Errorf("Running container should report nonzero CPU usage, but got %d", usage)
 		}
-		// One process per remaining container.
-		if want := uint64(2); stats.Pids.Current != want {
-			t.Errorf("Wrong number of PIDs, want: %d, got :%d", want, stats.Pids.Current)
+		if usage <= prevUsage {
+			t.Errorf("Expected container %s to use more than %d ns of CPU, but used %d", cont.ID, prevUsage, usage)
+		}
+		t.Logf("Container %s usage: %d", cont.ID, usage)
+		prevUsage = usage
+
+		// The exited container should have a usage of zero.
+		if exited := ret.ContainerUsage[containers[2].ID]; exited != 0 {
+			t.Errorf("Exited container should report 0 CPU usage, but got %d", exited)
 		}
 	}
 
diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go
index dfbf1f2d3..c46322ba4 100644
--- a/runsc/container/state_file.go
+++ b/runsc/container/state_file.go
@@ -49,7 +49,7 @@ type LoadOpts struct {
 // Returns ErrNotExist if no container is found. Returns error in case more than
 // one containers matching the ID prefix is found.
 func Load(rootDir string, id FullID, opts LoadOpts) (*Container, error) {
-	//log.Debugf("Load container, rootDir: %q, partial cid: %s", rootDir, partialID)
+	log.Debugf("Load container, rootDir: %q, id: %+v, opts: %+v", rootDir, id, opts)
 	if !opts.Exact {
 		var err error
 		id, err = findContainerID(rootDir, id.ContainerID)
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 266bc0bdc..7fe65c7ba 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -308,6 +308,22 @@ func (s *Sandbox) Processes(cid string) ([]*control.Process, error) {
 	return pl, nil
 }
 
+// FindCgroup looks up the Cgroup of the sandbox process, or fails if there is none.
+func (s *Sandbox) FindCgroup() (*cgroup.Cgroup, error) {
+	// runsc places sandboxes in the same cgroup for each controller, so
+	// the path of any one controller (here cpuacct) gives the cgroup path.
+	const controller = "cpuacct"
+
+	paths, err := cgroup.LoadPaths(strconv.Itoa(s.Pid))
+	if err != nil {
+		return nil, err
+	}
+	if path, found := paths[controller]; found {
+		return cgroup.NewFromPath(path)
+	}
+	return nil, fmt.Errorf("no %q controller found", controller)
+}
+
 // Execute runs the specified command in the container. It returns the PID of
 // the newly created process.
 func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) {
@@ -327,7 +343,7 @@ func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) {
 }
 
 // Event retrieves stats about the sandbox such as memory and CPU utilization.
-func (s *Sandbox) Event(cid string) (*boot.Event, error) {
+func (s *Sandbox) Event(cid string) (*boot.EventOut, error) {
 	log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID)
 	conn, err := s.sandboxConnect()
 	if err != nil {
@@ -335,13 +351,13 @@ func (s *Sandbox) Event(cid string) (*boot.Event, error) {
 	}
 	defer conn.Close()
 
-	var e boot.Event
+	var e boot.EventOut
 	// TODO(b/129292330): Pass in the container id (cid) here. The sandbox
 	// should return events only for that container.
 	if err := conn.Call(boot.ContainerEvent, nil, &e); err != nil {
 		return nil, fmt.Errorf("retrieving event data from sandbox: %v", err)
 	}
-	e.ID = cid
+	e.Event.ID = cid
 	return &e, nil
 }
author	Kevin Krakauer <krakauer@google.com>	2021-02-02 12:45:25 -0800
committer	gVisor bot <gvisor-bot@google.com>	2021-02-02 12:47:23 -0800
commit	5f7bf3152652d36903f9659688321ae7c42995d0 (patch)
tree	374c56830303dc412894baa4edfd04bcda4cda74
parent	f884ea13b713143ff9978092ddb352c159346167 (diff)