diff options
author | Nicolas Lacasse <nlacasse@google.com> | 2018-10-11 16:05:44 -0700 |
---|---|---|
committer | Shentubot <shentubot@google.com> | 2018-10-11 16:07:05 -0700 |
commit | ea5f6ed6ecab7f8b2648836117f62629b3c2cbb8 (patch) | |
tree | edd6bb8e7c06c38947736b694b08448c936c2312 | |
parent | 86680fa00240e3e439d1275f4f8bf89678cf3355 (diff) |
Make Wait() return the sandbox exit status if the sandbox has exited.
It's possible for Start() and Wait() calls to race, if the sandboxed
application is short-lived. If the application finishes before (or during) the
Wait RPC, then Wait will fail. In practice this looks like "connection
refused" or "EOF" errors when waiting for an RPC response.
This race is especially bad in tests, where we often run "true" inside a
sandbox.
This CL does a best-effort fix, by returning the sandbox exit status as the
container exit status. In most cases, these are the same.
This fixes the remaining flakes in runsc/container:container_test.
PiperOrigin-RevId: 216777793
Change-Id: I9dfc6e6ec885b106a736055bc7a75b2008dfff7a
-rw-r--r-- | runsc/container/container_test.go | 47 | ||||
-rw-r--r-- | runsc/sandbox/sandbox.go | 24 |
2 files changed, 68 insertions, 3 deletions
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 7ea99d06b..94572667e 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -1725,6 +1725,53 @@ func TestUserLog(t *testing.T) { } } +func TestWaitOnExitedSandbox(t *testing.T) { + for _, conf := range configs(all...) { + t.Logf("Running test with conf: %+v", conf) + + // Run a shell that exits immediately with a non-zero code. + const wantExit = 17 + cmd := fmt.Sprintf("exit %d", wantExit) + spec := testutil.NewSpecWithArgs("/bin/sh", "-c", cmd) + rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer os.RemoveAll(rootDir) + defer os.RemoveAll(bundleDir) + + // Create and Start the container. + c, err := Create(testutil.UniqueContainerID(), spec, conf, bundleDir, "", "", "") + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } + + // Wait for the sandbox to stop running. + if err := testutil.Poll(func() error { + if c.Sandbox.IsRunning() { + return nil + } + return fmt.Errorf("sandbox still running") + }, 10*time.Second); err != nil { + t.Fatalf("error waiting for sandbox to exit: %v", err) + } + + // Now call Wait. + ws, err := c.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + + if got := ws.ExitStatus(); got != wantExit { + t.Errorf("got exit status %d, want %d", got, wantExit) + } + } +} + // executeSync synchronously executes a new process. func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) { pid, err := cont.Execute(args) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index a0de4a175..39c855db9 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -596,10 +596,28 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) { } defer conn.Close() - if err := conn.Call(boot.ContainerWait, &cid, &ws); err != nil { - return ws, fmt.Errorf("error waiting on container %q: %v", cid, err) + // First try the Wait RPC to the sandbox. + if err := conn.Call(boot.ContainerWait, &cid, &ws); err == nil { + return ws, nil } - return ws, nil + log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) + + // The sandbox may have already exited, or exited while handling the + // Wait RPC. The best we can do is ask Linux what the sandbox exit + // status was, since in most cases that will be the same as the + // container exit status. + p, err := os.FindProcess(s.Pid) + if err != nil { + // "On Unix systems, FindProcess always succeeds and returns a + // Process for the given pid, regardless of whether the process + // exists." + return ws, fmt.Errorf("FindProcess(%d) failed: %v", s.Pid, err) + } + ps, err := p.Wait() + if err != nil { + return ws, fmt.Errorf("sandbox no longer running, tried to get exit status, but Wait failed: %v", err) + } + return ps.Sys().(syscall.WaitStatus), nil } // WaitPID waits for process 'pid' in the container's sandbox and returns its |