diff options
author | Fabricio Voznika <fvoznika@google.com> | 2018-09-18 15:20:19 -0700 |
---|---|---|
committer | Shentubot <shentubot@google.com> | 2018-09-18 15:21:28 -0700 |
commit | 7967d8ecd57383f406d202f7e2f65e275bb36fc8 (patch) | |
tree | c1902ae3bb809f14acd59cf9566d063c31407096 | |
parent | dd05c96d99b6dc7a8503c82e10ee5caeb6930cf6 (diff) |
Handle children processes better in tests
Reap children more systematically in container tests. Previously,
container_test was taking ~5 mins to run because constainer.Destroy()
would timeout waiting for the sandbox process to exit. Now the test
running in less than a minute.
Also made the contract around Container and Sandbox destroy clearer.
PiperOrigin-RevId: 213527471
Change-Id: Icca84ee1212bbdcb62bdfc9cc7b71b12c6d1688d
-rw-r--r-- | runsc/cmd/checkpoint.go | 1 | ||||
-rw-r--r-- | runsc/container/container.go | 14 | ||||
-rw-r--r-- | runsc/container/container_test.go | 79 | ||||
-rw-r--r-- | runsc/container/multi_container_test.go | 59 | ||||
-rw-r--r-- | runsc/sandbox/sandbox.go | 3 | ||||
-rw-r--r-- | runsc/test/testutil/testutil.go | 45 |
6 files changed, 99 insertions, 102 deletions
diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 7c2c3f59e..d074b8617 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -137,6 +137,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa if err != nil { Fatalf("error restoring container: %v", err) } + defer cont.Destroy() if err := cont.Restore(spec, conf, fullImagePath); err != nil { Fatalf("error starting container: %v", err) diff --git a/runsc/container/container.go b/runsc/container/container.go index a24c6cc31..9bf2f4625 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -198,7 +198,8 @@ func List(rootDir string) ([]string, error) { } // Create creates the container in a new Sandbox process, unless the metadata -// indicates that an existing Sandbox should be used. +// indicates that an existing Sandbox should be used. The caller must call +// Destroy() on the container. func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string) (*Container, error) { log.Debugf("Create container %q in root dir: %s", id, conf.RootDir) if err := validateID(id); err != nil { @@ -295,14 +296,12 @@ func (c *Container) Start(conf *boot.Config) error { // stop and destroy the container" -OCI spec. if c.Spec.Hooks != nil { if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil { - c.Destroy() return err } } if specutils.ShouldCreateSandbox(c.Spec) || !conf.MultiContainer { if err := c.Sandbox.StartRoot(c.Spec, conf); err != nil { - c.Destroy() return err } } else { @@ -312,7 +311,6 @@ func (c *Container) Start(conf *boot.Config) error { return err } if err := c.Sandbox.Start(c.Spec, conf, c.ID, ioFiles); err != nil { - c.Destroy() return err } } @@ -351,6 +349,8 @@ func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocke if err != nil { return 0, fmt.Errorf("error creating container: %v", err) } + defer c.Destroy() + if err := c.Start(conf); err != nil { return 0, fmt.Errorf("error starting container: %v", err) } @@ -420,7 +420,7 @@ func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, er // Signal returns an error if the container is already stopped. // TODO: Distinguish different error types. func (c *Container) Signal(sig syscall.Signal) error { - log.Debugf("Signal container %q", c.ID) + log.Debugf("Signal container %q: %v", c.ID, sig) if c.Status == Stopped { return fmt.Errorf("container sandbox is stopped") } @@ -490,8 +490,8 @@ func (c *Container) Processes() ([]*control.Process, error) { return c.Sandbox.Processes(c.ID) } -// Destroy frees all resources associated with the container. -// Destroy returns error if any step fails, and the function can be safely retried. +// Destroy frees all resources associated with the container. It fails fast and +// is idempotent. func (c *Container) Destroy() error { log.Debugf("Destroy container %q", c.ID) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 24beb2b75..996d80a89 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -252,35 +252,6 @@ func configs(opts ...configOption) []*boot.Config { return cs } -// In normal runsc usage, sandbox processes will be parented to -// init and init will reap the them. However, in the test environment -// the test runner is the parent and will not reap the sandbox -// processes, so we must do it ourselves, or else they will left -// as zombies. -// The function returns a wait group, and the caller can reap -// children synchronously by waiting on the wait group. -func reapChildren(c *Container) (*sync.WaitGroup, error) { - var wg sync.WaitGroup - p, err := os.FindProcess(c.Sandbox.Pid) - if err != nil { - return nil, fmt.Errorf("error finding sandbox process: %v", err) - } - g, err := os.FindProcess(c.GoferPid) - if err != nil { - return nil, fmt.Errorf("error finding gofer process: %v", err) - } - wg.Add(2) - go func() { - p.Wait() - wg.Done() - }() - go func() { - g.Wait() - wg.Done() - }() - return &wg, nil -} - // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. // It verifies after each step that the container can be loaded from disk, and // has the correct status. @@ -310,12 +281,14 @@ func TestLifecycle(t *testing.T) { } // Create the container. id := testutil.UniqueContainerID() - if _, err := Create(id, spec, conf, bundleDir, "", ""); err != nil { + c, err := Create(id, spec, conf, bundleDir, "", "") + if err != nil { t.Fatalf("error creating container: %v", err) } + defer c.Destroy() // Load the container from disk and check the status. - c, err := Load(rootDir, id) + c, err = Load(rootDir, id) if err != nil { t.Fatalf("error loading container: %v", err) } @@ -378,17 +351,6 @@ func TestLifecycle(t *testing.T) { // Wait for it to die. wg.Wait() - // The sandbox process should have exited by now, but it is a - // zombie. In normal runsc usage, it will be parented to init, - // and init will reap the sandbox. However, in this case the - // test runner is the parent and will not reap the sandbox - // process, so we must do it ourselves. - reapWg, err := reapChildren(c) - if err != nil { - t.Fatalf("error reaping children: %v", err) - } - reapWg.Wait() - // Load the container from disk and check the status. c, err = Load(rootDir, id) if err != nil { @@ -1164,6 +1126,7 @@ func TestConsoleSocket(t *testing.T) { if err != nil { t.Fatalf("error creating container: %v", err) } + c.Destroy() // Open the othe end of the socket. sock, err := srv.Accept() @@ -1196,11 +1159,6 @@ func TestConsoleSocket(t *testing.T) { t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) } - // Reap the sandbox process. - if _, err := reapChildren(c); err != nil { - t.Fatalf("error reaping children: %v", err) - } - // Shut it down. if err := c.Destroy(); err != nil { t.Fatalf("error destroying container: %v", err) @@ -1566,29 +1524,21 @@ func TestGoferExits(t *testing.T) { t.Fatalf("error starting container: %v", err) } + // Kill sandbox and expect gofer to exit on its own. sandboxProc, err := os.FindProcess(c.Sandbox.Pid) if err != nil { t.Fatalf("error finding sandbox process: %v", err) } - gofer, err := os.FindProcess(c.GoferPid) - if err != nil { - t.Fatalf("error finding sandbox process: %v", err) - } - - // Kill sandbox and expect gofer to exit on its own. if err := sandboxProc.Kill(); err != nil { t.Fatalf("error killing sandbox process: %v", err) } - if _, err := sandboxProc.Wait(); err != nil { - t.Fatalf("error waiting for sandbox process: %v", err) - } - - if _, err := gofer.Wait(); err != nil { - t.Fatalf("error waiting for gofer process: %v", err) - } - if err := c.waitForStopped(); err != nil { - t.Errorf("container is not stopped: %v", err) + _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) { + cpid, err := syscall.Wait4(c.GoferPid, nil, 0, nil) + return uintptr(cpid), 0, err + }) + if err != nil && err != syscall.ECHILD { + t.Errorf("error waiting for gofer to exit: %v", err) } } @@ -1606,5 +1556,8 @@ func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, } func TestMain(m *testing.M) { - testutil.RunAsRoot(m) + testutil.RunAsRoot() + stop := testutil.StartReaper() + defer stop() + os.Exit(m.Run()) } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 09888cb86..349ea755a 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -15,7 +15,6 @@ package container import ( - "fmt" "io/ioutil" "os" "path/filepath" @@ -379,6 +378,22 @@ func TestMultiContainerSignal(t *testing.T) { t.Errorf("failed to wait for sleep to start: %v", err) } + // Destroy container and ensure container's gofer process has exited. + if err := containers[1].Destroy(); err != nil { + t.Errorf("failed to destroy container: %v", err) + } + _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) { + cpid, err := syscall.Wait4(containers[1].GoferPid, nil, 0, nil) + return uintptr(cpid), 0, err + }) + if err != nil && err != syscall.ECHILD { + t.Errorf("error waiting for gofer to exit: %v", err) + } + // Make sure process 1 is still running. + if err := waitForProcessList(containers[0], expectedPL[:1]); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + // Now that process 2 is gone, ensure we get an error trying to // signal it again. if err := containers[1].Signal(syscall.SIGKILL); err == nil { @@ -390,36 +405,26 @@ func TestMultiContainerSignal(t *testing.T) { t.Errorf("failed to kill process 1: %v", err) } - if err := waitForSandboxExit(containers[0]); err != nil { - t.Errorf("failed to exit sandbox: %v", err) + // Ensure that container's gofer and sandbox process are no more. + _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) { + cpid, err := syscall.Wait4(containers[0].GoferPid, nil, 0, nil) + return uintptr(cpid), 0, err + }) + if err != nil && err != syscall.ECHILD { + t.Errorf("error waiting for gofer to exit: %v", err) + } + + _, _, err = testutil.RetryEintr(func() (uintptr, uintptr, error) { + cpid, err := syscall.Wait4(containers[0].Sandbox.Pid, nil, 0, nil) + return uintptr(cpid), 0, err + }) + if err != nil && err != syscall.ECHILD { + t.Errorf("error waiting for sandbox to exit: %v", err) } - // The sentry should be gone, so signaling should yield an - // error. + // The sentry should be gone, so signaling should yield an error. if err := containers[0].Signal(syscall.SIGKILL); err == nil { t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID) } } } - -// waitForSandboxExit waits until both the sandbox and gofer processes of the -// container have exited. -func waitForSandboxExit(container *Container) error { - goferProc, _ := os.FindProcess(container.GoferPid) - state, err := goferProc.Wait() - if err != nil { - return err - } - if !state.Exited() { - return fmt.Errorf("gofer with PID %d failed to exit", container.GoferPid) - } - sandboxProc, _ := os.FindProcess(container.Sandbox.Pid) - state, err = sandboxProc.Wait() - if err != nil { - return err - } - if !state.Exited() { - return fmt.Errorf("sandbox with PID %d failed to exit", container.Sandbox.Pid) - } - return nil -} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 21625a7c6..f58d111bf 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -59,7 +59,8 @@ type Sandbox struct { Chroot string `json:"chroot"` } -// Create creates the sandbox process. +// Create creates the sandbox process. The caller must call Destroy() on the +// sandbox. func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket string, ioFiles []*os.File) (*Sandbox, error) { s := &Sandbox{ID: id} diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go index 4d354de31..2e7f95912 100644 --- a/runsc/test/testutil/testutil.go +++ b/runsc/test/testutil/testutil.go @@ -24,10 +24,10 @@ import ( "net/http" "os" "os/exec" + "os/signal" "path/filepath" "runtime" "syscall" - "testing" "time" "github.com/cenkalti/backoff" @@ -236,10 +236,11 @@ func WaitForHTTP(port int, timeout time.Duration) error { // RunAsRoot ensures the test runs with CAP_SYS_ADMIN. If need it will create // a new user namespace and reexecute the test as root inside of the namespace. -func RunAsRoot(m *testing.M) { +// This functionr returns when it's running as root. If it needs to create +// another process, it will exit from there and not return. +func RunAsRoot() { if specutils.HasCapSysAdmin() { - // Capability: check! Good to run. - os.Exit(m.Run()) + return } // Current process doesn't have CAP_SYS_ADMIN, create user namespace and run @@ -278,3 +279,39 @@ func RunAsRoot(m *testing.M) { } os.Exit(0) } + +// StartReaper starts a gorouting that will reap all children processes created +// by the tests. Caller must call the returned function to stop it. +func StartReaper() func() { + ch := make(chan os.Signal, 1) + signal.Notify(ch, syscall.SIGCHLD) + stop := make(chan struct{}) + + go func() { + for { + select { + case <-ch: + case <-stop: + return + } + for { + cpid, _ := syscall.Wait4(-1, nil, syscall.WNOHANG, nil) + if cpid < 1 { + break + } + } + } + }() + return func() { stop <- struct{}{} } +} + +// RetryEintr retries the function until an error different than EINTR is +// returned. +func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) { + for { + r1, r2, err := f() + if err != syscall.EINTR { + return r1, r2, err + } + } +} |