Make Wait() return the sandbox exit status if the sandbox has exited.

It's possible for Start() and Wait() calls to race, if the sandboxed application is short-lived. If the application finishes before (or during) the Wait RPC, then Wait will fail. In practice this looks like "connection refused" or "EOF" errors when waiting for an RPC response. This race is especially bad in tests, where we often run "true" inside a sandbox. This CL does a best-effort fix, by returning the sandbox exit status as the container exit status. In most cases, these are the same. This fixes the remaining flakes in runsc/container:container_test. PiperOrigin-RevId: 216777793 Change-Id: I9dfc6e6ec885b106a736055bc7a75b2008dfff7a
author: Nicolas Lacasse <nlacasse@google.com> 2018-10-11 16:05:44 -0700
committer: Shentubot <shentubot@google.com> 2018-10-11 16:07:05 -0700
commit: ea5f6ed6ecab7f8b2648836117f62629b3c2cbb8 (patch)
tree: edd6bb8e7c06c38947736b694b08448c936c2312
parent: 86680fa00240e3e439d1275f4f8bf89678cf3355 (diff)
2 files changed, 68 insertions, 3 deletions
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 7ea99d06b..94572667e 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -1725,6 +1725,53 @@ func TestUserLog(t *testing.T) {
 	}
 }
 
+func TestWaitOnExitedSandbox(t *testing.T) {
+	for _, conf := range configs(all...) {
+		t.Logf("Running test with conf: %+v", conf)
+
+		// Run a shell that exits immediately with a non-zero code.
+		const wantExit = 17
+		cmd := fmt.Sprintf("exit %d", wantExit)
+		spec := testutil.NewSpecWithArgs("/bin/sh", "-c", cmd)
+		rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+		if err != nil {
+			t.Fatalf("error setting up container: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		defer os.RemoveAll(bundleDir)
+
+		// Create and Start the container.
+		c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "")
+		if err != nil {
+			t.Fatalf("error creating container: %v", err)
+		}
+		defer c.Destroy()
+		if err := c.Start(conf); err != nil {
+			t.Fatalf("error starting container: %v", err)
+		}
+
+		// Wait for the sandbox to stop running.
+		if err := testutil.Poll(func() error {
+			if c.Sandbox.IsRunning() {
+				return nil
+			}
+			return fmt.Errorf("sandbox still running")
+		}, 10*time.Second); err != nil {
+			t.Fatalf("error waiting for sandbox to exit: %v", err)
+		}
+
+		// Now call Wait.
+		ws, err := c.Wait()
+		if err != nil {
+			t.Fatalf("error waiting on container: %v", err)
+		}
+
+		if got := ws.ExitStatus(); got != wantExit {
+			t.Errorf("got exit status %d, want %d", got, wantExit)
+		}
+	}
+}
+
 // executeSync synchronously executes a new process.
 func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
 	pid, err := cont.Execute(args)
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index a0de4a175..39c855db9 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -596,10 +596,28 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
 	}
 	defer conn.Close()
 
-	if err := conn.Call(boot.ContainerWait, &cid, &ws); err != nil {
-		return ws, fmt.Errorf("error waiting on container %q: %v", cid, err)
+	// First try the Wait RPC to the sandbox.
+	if err := conn.Call(boot.ContainerWait, &cid, &ws); err == nil {
+		return ws, nil
 	}
-	return ws, nil
+	log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
+
+	// The sandbox may have already exited, or exited while handling the
+	// Wait RPC. The best we can do is ask Linux what the sandbox exit
+	// status was, since in most cases that will be the same as the
+	// container exit status.
+	p, err := os.FindProcess(s.Pid)
+	if err != nil {
+		// "On Unix systems, FindProcess always succeeds and returns a
+		// Process for the given pid, regardless of whether the process
+		// exists."
+		return ws, fmt.Errorf("FindProcess(%d) failed: %v", s.Pid, err)
+	}
+	ps, err := p.Wait()
+	if err != nil {
+		return ws, fmt.Errorf("sandbox no longer running, tried to get exit status, but Wait failed: %v", err)
+	}
+	return ps.Sys().(syscall.WaitStatus), nil
 }
 
 // WaitPID waits for process 'pid' in the container's sandbox and returns its
author	Nicolas Lacasse <nlacasse@google.com>	2018-10-11 16:05:44 -0700
committer	Shentubot <shentubot@google.com>	2018-10-11 16:07:05 -0700
commit	ea5f6ed6ecab7f8b2648836117f62629b3c2cbb8 (patch)
tree	edd6bb8e7c06c38947736b694b08448c936c2312
parent	86680fa00240e3e439d1275f4f8bf89678cf3355 (diff)