Diffstat (limited to 'runsc/container')
-rw-r--r--   runsc/container/BUILD                    |   1
-rw-r--r--   runsc/container/container.go             | 111
-rw-r--r--   runsc/container/container_test.go        |  18
-rw-r--r--   runsc/container/multi_container_test.go  | 213
4 files changed, 329 insertions, 14 deletions
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index e246c38ae..de8202bb1 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -49,6 +49,7 @@ go_test(
         "//pkg/abi/linux",
         "//pkg/log",
         "//pkg/sentry/control",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/unet",
         "//pkg/urpc",
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 8320bb2ca..bbb364214 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -138,6 +138,34 @@ type Container struct {
     RootContainerDir string
 }

+// loadSandbox loads all containers that belong to the sandbox with the given
+// ID.
+func loadSandbox(rootDir, id string) ([]*Container, error) {
+    cids, err := List(rootDir)
+    if err != nil {
+        return nil, err
+    }
+
+    // Load the container metadata.
+    var containers []*Container
+    for _, cid := range cids {
+        container, err := Load(rootDir, cid)
+        if err != nil {
+            // Container file may not exist if it raced with creation/deletion or
+            // directory was left behind. Load provides a snapshot in time, so it's
+            // fine to skip it.
+            if os.IsNotExist(err) {
+                continue
+            }
+            return nil, fmt.Errorf("loading container %q: %v", id, err)
+        }
+        if container.Sandbox.ID == id {
+            containers = append(containers, container)
+        }
+    }
+    return containers, nil
+}
+
 // Load loads a container with the given id from a metadata file. id may be an
 // abbreviation of the full container id, in which case Load loads the
 // container to which id unambiguously refers to.
@@ -180,7 +208,7 @@ func Load(rootDir, id string) (*Container, error) {
     // If the status is "Running" or "Created", check that the sandbox
     // process still exists, and set it to Stopped if it does not.
     //
-    // This is inherently racey.
+    // This is inherently racy.
     if c.Status == Running || c.Status == Created {
         // Check if the sandbox process is still running.
         if !c.isSandboxRunning() {
@@ -237,7 +265,13 @@ func List(rootDir string) ([]string, error) {
     }
     var out []string
     for _, f := range fs {
-        out = append(out, f.Name())
+        // Filter out directories that do not belong to a container.
+        cid := f.Name()
+        if validateID(cid) == nil {
+            if _, err := os.Stat(filepath.Join(rootDir, cid, metadataFilename)); err == nil {
+                out = append(out, f.Name())
+            }
+        }
     }
     return out, nil
 }
@@ -475,7 +509,13 @@ func (c *Container) Start(conf *boot.Config) error {
     }

     c.changeStatus(Running)
-    return c.save()
+    if err := c.save(); err != nil {
+        return err
+    }
+
+    // Adjust the oom_score_adj for sandbox and gofers. This must be done after
+    // save().
+    return c.adjustOOMScoreAdj(conf)
 }

 // Restore takes a container and replaces its kernel and file system
@@ -1098,3 +1138,68 @@ func runInCgroup(cg *cgroup.Cgroup, fn func() error) error {
     }
     return fn()
 }
+
+// adjustOOMScoreAdj sets the oom_score_adj for the sandbox and all gofers.
+// oom_score_adj is set to the lowest oom_score_adj among the containers
+// running in the sandbox.
+//
+// TODO(gvisor.dev/issue/512): This call could race with other containers being
+// created at the same time and end up setting the wrong oom_score_adj to the
+// sandbox.
+func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error {
+    // If this container's OOMScoreAdj is nil then we can exit early as no
+    // change should be made to oom_score_adj for the sandbox.
+    if c.Spec.Process.OOMScoreAdj == nil {
+        return nil
+    }
+
+    containers, err := loadSandbox(conf.RootDir, c.Sandbox.ID)
+    if err != nil {
+        return fmt.Errorf("loading sandbox containers: %v", err)
+    }
+
+    // Get the lowest score for all containers.
+    var lowScore int
+    scoreFound := false
+    for _, container := range containers {
+        if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) {
+            scoreFound = true
+            lowScore = *container.Spec.Process.OOMScoreAdj
+        }
+    }
+
+    // Only set oom_score_adj if one of the containers has oom_score_adj set
+    // in the OCI bundle. If not, we need to inherit the parent process's
+    // oom_score_adj.
+    // See: https://github.com/opencontainers/runtime-spec/blob/master/config.md#linux-process
+    if !scoreFound {
+        return nil
+    }
+
+    // Set the lowest of all containers' oom_score_adj to the sandbox.
+    if err := setOOMScoreAdj(c.Sandbox.Pid, lowScore); err != nil {
+        return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", c.Sandbox.ID, err)
+    }
+
+    // Set container's oom_score_adj to the gofer since it is dedicated to the
+    // container, in case the gofer uses up too much memory.
+    if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil {
+        return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err)
+    }
+    return nil
+}
+
+// setOOMScoreAdj sets oom_score_adj to the given value for the given PID.
+// /proc must be available and mounted read-write. scoreAdj should be between
+// -1000 and 1000.
+func setOOMScoreAdj(pid int, scoreAdj int) error {
+    f, err := os.OpenFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid), os.O_WRONLY, 0644)
+    if err != nil {
+        return err
+    }
+    defer f.Close()
+    if _, err := f.WriteString(strconv.Itoa(scoreAdj)); err != nil {
+        return err
+    }
+    return nil
+}
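For reference, a minimal standalone sketch (not part of this change) of how the value written by setOOMScoreAdj can be read back from /proc/<pid>/oom_score_adj for verification. The readOOMScoreAdj helper below is hypothetical and only illustrates the /proc interface the change relies on.

// Sketch: read back oom_score_adj for a PID. Assumes /proc is mounted.
package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"strconv"
	"strings"
)

// readOOMScoreAdj returns the current oom_score_adj of the given PID by
// reading /proc/<pid>/oom_score_adj (illustrative helper, not in the diff).
func readOOMScoreAdj(pid int) (int, error) {
	data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid))
	if err != nil {
		return 0, err
	}
	return strconv.Atoi(strings.TrimSpace(string(data)))
}

func main() {
	// Read this process's own oom_score_adj as a quick sanity check.
	score, err := readOOMScoreAdj(os.Getpid())
	if err != nil {
		fmt.Fprintln(os.Stderr, "reading oom_score_adj:", err)
		os.Exit(1)
	}
	fmt.Printf("oom_score_adj for pid %d: %d\n", os.Getpid(), score)
}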
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index ff68c586e..3d4f304f3 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -76,7 +76,7 @@ func waitForProcessCount(cont *Container, want int) error {
 }

 func blockUntilWaitable(pid int) error {
-    _, _, err := testutil.RetryEintr(func() (uintptr, uintptr, error) {
+    _, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
         var err error
         _, _, err1 := syscall.Syscall6(syscall.SYS_WAITID, 1, uintptr(pid), 0, syscall.WEXITED|syscall.WNOWAIT, 0, 0)
         if err1 != 0 {
@@ -1310,10 +1310,13 @@ func TestRunNonRoot(t *testing.T) {
         t.Logf("Running test with conf: %+v", conf)

         spec := testutil.NewSpecWithArgs("/bin/true")
+
+        // Set a random user/group with no access to "blocked" dir.
         spec.Process.User.UID = 343
         spec.Process.User.GID = 2401
+        spec.Process.Capabilities = nil

-        // User that container runs as can't list '$TMP/blocked' and would fail to
+        // User running inside container can't list '$TMP/blocked' and would fail to
         // mount it.
         dir, err := ioutil.TempDir(testutil.TmpDir(), "blocked")
         if err != nil {
@@ -1327,6 +1330,17 @@ func TestRunNonRoot(t *testing.T) {
             t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
         }

+        src, err := ioutil.TempDir(testutil.TmpDir(), "src")
+        if err != nil {
+            t.Fatalf("ioutil.TempDir() failed: %v", err)
+        }
+
+        spec.Mounts = append(spec.Mounts, specs.Mount{
+            Destination: dir,
+            Source:      src,
+            Type:        "bind",
+        })
+
         if err := run(spec, conf); err != nil {
             t.Fatalf("error running sandbox: %v", err)
         }
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 978a422f5..ae03d24b4 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -29,6 +29,7 @@ import (

     specs "github.com/opencontainers/runtime-spec/specs-go"
     "gvisor.dev/gvisor/pkg/sentry/control"
+    "gvisor.dev/gvisor/pkg/sentry/kernel"
     "gvisor.dev/gvisor/runsc/boot"
     "gvisor.dev/gvisor/runsc/specutils"
     "gvisor.dev/gvisor/runsc/test/testutil"
@@ -59,11 +60,14 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 }

 func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
-    rootDir, err := testutil.SetupRootDir()
-    if err != nil {
-        return nil, nil, fmt.Errorf("error creating root dir: %v", err)
+    // Setup root dir if one hasn't been provided.
+    if len(conf.RootDir) == 0 {
+        rootDir, err := testutil.SetupRootDir()
+        if err != nil {
+            return nil, nil, fmt.Errorf("error creating root dir: %v", err)
+        }
+        conf.RootDir = rootDir
     }
-    conf.RootDir = rootDir

     var containers []*Container
     var bundles []string
@@ -74,7 +78,7 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
         for _, b := range bundles {
             os.RemoveAll(b)
         }
-        os.RemoveAll(rootDir)
+        os.RemoveAll(conf.RootDir)
     }
     for i, spec := range specs {
         bundleDir, err := testutil.SetupBundleDir(spec)
@@ -488,7 +492,7 @@ func TestMultiContainerSignal(t *testing.T) {
     if err := containers[1].Destroy(); err != nil {
         t.Errorf("failed to destroy container: %v", err)
     }
-    _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) {
+    _, _, err = specutils.RetryEintr(func() (uintptr, uintptr, error) {
         cpid, err := syscall.Wait4(goferPid, nil, 0, nil)
         return uintptr(cpid), 0, err
     })
@@ -905,9 +909,9 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) {
     }
 }

-// TestMultiContainerGoferStop tests that IO operations continue to work after
-// containers have been stopped and gofers killed.
-func TestMultiContainerGoferStop(t *testing.T) {
+// TestMultiContainerContainerDestroyStress tests that IO operations continue
+// to work after containers have been stopped and gofers killed.
+func TestMultiContainerContainerDestroyStress(t *testing.T) {
     app, err := testutil.FindFile("runsc/container/test_app/test_app")
     if err != nil {
         t.Fatal("error finding test_app:", err)
     }
@@ -1345,3 +1349,194 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
         }
     }
 }
+
+// Test that container is destroyed when Gofer is killed.
+func TestMultiContainerGoferKilled(t *testing.T) {
+    sleep := []string{"sleep", "100"}
+    specs, ids := createSpecs(sleep, sleep, sleep)
+    conf := testutil.TestConfig()
+    containers, cleanup, err := startContainers(conf, specs, ids)
+    if err != nil {
+        t.Fatalf("error starting containers: %v", err)
+    }
+    defer cleanup()
+
+    // Ensure container is running
+    c := containers[2]
+    expectedPL := []*control.Process{
+        {PID: 3, Cmd: "sleep"},
+    }
+    if err := waitForProcessList(c, expectedPL); err != nil {
+        t.Errorf("failed to wait for sleep to start: %v", err)
+    }
+
+    // Kill container's gofer.
+    if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil {
+        t.Fatalf("syscall.Kill(%d, SIGKILL)=%v", c.GoferPid, err)
+    }
+
+    // Wait until container stops.
+    if err := waitForProcessList(c, nil); err != nil {
+        t.Errorf("Container %q was not stopped after gofer death: %v", c.ID, err)
+    }
+
+    // Check that container isn't running anymore.
+    args := &control.ExecArgs{Argv: []string{"/bin/true"}}
+    if _, err := c.executeSync(args); err == nil {
+        t.Fatalf("Container %q was not stopped after gofer death", c.ID)
+    }
+
+    // Check that other containers are unaffected.
+    for i, c := range containers {
+        if i == 2 {
+            continue // container[2] has been killed.
+        }
+        pl := []*control.Process{
+            {PID: kernel.ThreadID(i + 1), Cmd: "sleep"},
+        }
+        if err := waitForProcessList(c, pl); err != nil {
+            t.Errorf("Container %q was affected by another container: %v", c.ID, err)
+        }
+        args := &control.ExecArgs{Argv: []string{"/bin/true"}}
+        if _, err := c.executeSync(args); err != nil {
+            t.Fatalf("Container %q was affected by another container: %v", c.ID, err)
+        }
+    }
+
+    // Kill root container's gofer to bring entire sandbox down.
+    c = containers[0]
+    if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil {
+        t.Fatalf("syscall.Kill(%d, SIGKILL)=%v", c.GoferPid, err)
+    }
+
+    // Wait until sandbox stops. waitForProcessList will loop until sandbox exits
+    // and RPC errors out.
+    impossiblePL := []*control.Process{
+        {PID: 100, Cmd: "non-existent-process"},
+    }
+    if err := waitForProcessList(c, impossiblePL); err == nil {
+        t.Fatalf("Sandbox was not killed after gofer death")
+    }
+
+    // Check that entire sandbox isn't running anymore.
+    for _, c := range containers {
+        args := &control.ExecArgs{Argv: []string{"/bin/true"}}
+        if _, err := c.executeSync(args); err == nil {
+            t.Fatalf("Container %q was not stopped after gofer death", c.ID)
+        }
+    }
+}
+
+func TestMultiContainerLoadSandbox(t *testing.T) {
+    sleep := []string{"sleep", "100"}
+    specs, ids := createSpecs(sleep, sleep, sleep)
+    conf := testutil.TestConfig()
+
+    // Create containers for the sandbox.
+    wants, cleanup, err := startContainers(conf, specs, ids)
+    if err != nil {
+        t.Fatalf("error starting containers: %v", err)
+    }
+    defer cleanup()
+
+    // Then create unrelated containers.
+    for i := 0; i < 3; i++ {
+        specs, ids = createSpecs(sleep, sleep, sleep)
+        _, cleanup, err = startContainers(conf, specs, ids)
+        if err != nil {
+            t.Fatalf("error starting containers: %v", err)
+        }
+        defer cleanup()
+    }
+
+    // Create an unrelated directory under root.
+    dir := filepath.Join(conf.RootDir, "not-a-container")
+    if err := os.MkdirAll(dir, 0755); err != nil {
+        t.Fatalf("os.MkdirAll(%q)=%v", dir, err)
+    }
+
+    // Create a valid but empty container directory.
+    randomCID := testutil.UniqueContainerID()
+    dir = filepath.Join(conf.RootDir, randomCID)
+    if err := os.MkdirAll(dir, 0755); err != nil {
+        t.Fatalf("os.MkdirAll(%q)=%v", dir, err)
+    }
+
+    // Load the sandbox and check that the correct containers were returned.
+    id := wants[0].Sandbox.ID
+    gots, err := loadSandbox(conf.RootDir, id)
+    if err != nil {
+        t.Fatalf("loadSandbox()=%v", err)
+    }
+    wantIDs := make(map[string]struct{})
+    for _, want := range wants {
+        wantIDs[want.ID] = struct{}{}
+    }
+    for _, got := range gots {
+        if got.Sandbox.ID != id {
+            t.Errorf("wrong sandbox ID, got: %v, want: %v", got.Sandbox.ID, id)
+        }
+        if _, ok := wantIDs[got.ID]; !ok {
+            t.Errorf("wrong container ID, got: %v, wants: %v", got.ID, wantIDs)
+        }
+        delete(wantIDs, got.ID)
+    }
+    if len(wantIDs) != 0 {
+        t.Errorf("containers not found: %v", wantIDs)
+    }
+}
+
+// TestMultiContainerRunNonRoot checks that child container can be configured
+// when running as non-privileged user.
+func TestMultiContainerRunNonRoot(t *testing.T) {
+    cmdRoot := []string{"/bin/sleep", "100"}
+    cmdSub := []string{"/bin/true"}
+    podSpecs, ids := createSpecs(cmdRoot, cmdSub)
+
+    // User running inside container can't list '$TMP/blocked' and would fail to
+    // mount it.
+    blocked, err := ioutil.TempDir(testutil.TmpDir(), "blocked")
+    if err != nil {
+        t.Fatalf("ioutil.TempDir() failed: %v", err)
+    }
+    if err := os.Chmod(blocked, 0700); err != nil {
+        t.Fatalf("os.MkDir(%q) failed: %v", blocked, err)
+    }
+    dir := path.Join(blocked, "test")
+    if err := os.Mkdir(dir, 0755); err != nil {
+        t.Fatalf("os.MkDir(%q) failed: %v", dir, err)
+    }
+
+    src, err := ioutil.TempDir(testutil.TmpDir(), "src")
+    if err != nil {
+        t.Fatalf("ioutil.TempDir() failed: %v", err)
+    }
+
+    // Set a random user/group with no access to "blocked" dir.
+    podSpecs[1].Process.User.UID = 343
+    podSpecs[1].Process.User.GID = 2401
+    podSpecs[1].Process.Capabilities = nil
+
+    podSpecs[1].Mounts = append(podSpecs[1].Mounts, specs.Mount{
+        Destination: dir,
+        Source:      src,
+        Type:        "bind",
+    })
+
+    conf := testutil.TestConfig()
+    pod, cleanup, err := startContainers(conf, podSpecs, ids)
+    if err != nil {
+        t.Fatalf("error starting containers: %v", err)
+    }
+    defer cleanup()
+
+    // Once all containers are started, wait for the child container to exit.
+    // This means that the volume was mounted properly.
+    ws, err := pod[1].Wait()
+    if err != nil {
+        t.Fatalf("running child container: %v", err)
+    }
+    if !ws.Exited() || ws.ExitStatus() != 0 {
+        t.Fatalf("child container failed, waitStatus: %v", ws)
+    }
+}
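For readers following adjustOOMScoreAdj above, here is a self-contained sketch of the same selection rule: take the lowest oom_score_adj across the sandbox's containers, and skip the update when none of them sets one so the value is inherited from the parent. The lowestOOMScoreAdj helper and the example values are illustrative only, not part of this change.

// Sketch: pick the lowest optional oom_score_adj, mirroring adjustOOMScoreAdj.
package main

import "fmt"

// lowestOOMScoreAdj returns the smallest of the provided optional scores, and
// false if none is set (in which case the caller should leave oom_score_adj
// untouched so it is inherited from the parent process).
func lowestOOMScoreAdj(scores []*int) (int, bool) {
	var low int
	found := false
	for _, s := range scores {
		if s != nil && (!found || *s < low) {
			found = true
			low = *s
		}
	}
	return low, found
}

func main() {
	a, b := -500, 100
	// Two containers set a score, one does not; the sandbox gets -500.
	if low, ok := lowestOOMScoreAdj([]*int{&a, nil, &b}); ok {
		fmt.Println("sandbox oom_score_adj:", low)
	}
	// No container sets a score; oom_score_adj is inherited unchanged.
	if _, ok := lowestOOMScoreAdj([]*int{nil, nil}); !ok {
		fmt.Println("no oom_score_adj set; inheriting from parent")
	}
}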