summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorNicolas Lacasse <nlacasse@google.com>2018-10-11 16:05:44 -0700
committerShentubot <shentubot@google.com>2018-10-11 16:07:05 -0700
commitea5f6ed6ecab7f8b2648836117f62629b3c2cbb8 (patch)
treeedd6bb8e7c06c38947736b694b08448c936c2312
parent86680fa00240e3e439d1275f4f8bf89678cf3355 (diff)
Make Wait() return the sandbox exit status if the sandbox has exited.
It's possible for Start() and Wait() calls to race, if the sandboxed application is short-lived. If the application finishes before (or during) the Wait RPC, then Wait will fail. In practice this looks like "connection refused" or "EOF" errors when waiting for an RPC response. This race is especially bad in tests, where we often run "true" inside a sandbox. This CL does a best-effort fix, by returning the sandbox exit status as the container exit status. In most cases, these are the same. This fixes the remaining flakes in runsc/container:container_test. PiperOrigin-RevId: 216777793 Change-Id: I9dfc6e6ec885b106a736055bc7a75b2008dfff7a
-rw-r--r--runsc/container/container_test.go47
-rw-r--r--runsc/sandbox/sandbox.go24
2 files changed, 68 insertions, 3 deletions
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 7ea99d06b..94572667e 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -1725,6 +1725,53 @@ func TestUserLog(t *testing.T) {
}
}
+func TestWaitOnExitedSandbox(t *testing.T) {
+ for _, conf := range configs(all...) {
+ t.Logf("Running test with conf: %+v", conf)
+
+ // Run a shell that exits immediately with a non-zero code.
+ const wantExit = 17
+ cmd := fmt.Sprintf("exit %d", wantExit)
+ spec := testutil.NewSpecWithArgs("/bin/sh", "-c", cmd)
+ rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+ defer os.RemoveAll(bundleDir)
+
+ // Create and Start the container.
+ c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "")
+ if err != nil {
+ t.Fatalf("error creating container: %v", err)
+ }
+ defer c.Destroy()
+ if err := c.Start(conf); err != nil {
+ t.Fatalf("error starting container: %v", err)
+ }
+
+ // Wait for the sandbox to stop running.
+ if err := testutil.Poll(func() error {
+ if c.Sandbox.IsRunning() {
+ return nil
+ }
+ return fmt.Errorf("sandbox still running")
+ }, 10*time.Second); err != nil {
+ t.Fatalf("error waiting for sandbox to exit: %v", err)
+ }
+
+ // Now call Wait.
+ ws, err := c.Wait()
+ if err != nil {
+ t.Fatalf("error waiting on container: %v", err)
+ }
+
+ if got := ws.ExitStatus(); got != wantExit {
+ t.Errorf("got exit status %d, want %d", got, wantExit)
+ }
+ }
+}
+
// executeSync synchronously executes a new process.
func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
pid, err := cont.Execute(args)
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index a0de4a175..39c855db9 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -596,10 +596,28 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
}
defer conn.Close()
- if err := conn.Call(boot.ContainerWait, &cid, &ws); err != nil {
- return ws, fmt.Errorf("error waiting on container %q: %v", cid, err)
+ // First try the Wait RPC to the sandbox.
+ if err := conn.Call(boot.ContainerWait, &cid, &ws); err == nil {
+ return ws, nil
}
- return ws, nil
+ log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
+
+ // The sandbox may have already exited, or exited while handling the
+ // Wait RPC. The best we can do is ask Linux what the sandbox exit
+ // status was, since in most cases that will be the same as the
+ // container exit status.
+ p, err := os.FindProcess(s.Pid)
+ if err != nil {
+ // "On Unix systems, FindProcess always succeeds and returns a
+ // Process for the given pid, regardless of whether the process
+ // exists."
+ return ws, fmt.Errorf("FindProcess(%d) failed: %v", s.Pid, err)
+ }
+ ps, err := p.Wait()
+ if err != nil {
+ return ws, fmt.Errorf("sandbox no longer running, tried to get exit status, but Wait failed: %v", err)
+ }
+ return ps.Sys().(syscall.WaitStatus), nil
}
// WaitPID waits for process 'pid' in the container's sandbox and returns its