From bde2a91433cfbac426577a691bf13817115b53be Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Thu, 13 Sep 2018 16:36:53 -0700 Subject: runsc: Support container signal/wait. This CL: 1) Fix `runsc wait`, it now also works after the container exits; 2) Generate correct container state in Load; 3) Make sure `Destroy` cleans up everything before successfully returning. PiperOrigin-RevId: 212900107 Change-Id: Ie129cbb9d74f8151a18364f1fc0b2603eac4109a --- runsc/container/container.go | 178 ++++++++++++++++---------------------- runsc/container/container_test.go | 52 +++++++++-- 2 files changed, 122 insertions(+), 108 deletions(-) (limited to 'runsc/container') diff --git a/runsc/container/container.go b/runsc/container/container.go index 38848d02f..792b7967b 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -136,13 +136,17 @@ func Load(rootDir, id string) (*Container, error) { // This is inherently racey. if c.Status == Running || c.Status == Created { // Check if the sandbox process is still running. - if c.IsRunning() { - // TODO: Send a message into the sandbox to - // see if this particular container is still running. - } else { + if !c.Sandbox.IsRunning() { // Sandbox no longer exists, so this container definitely does not exist. c.Status = Stopped c.Sandbox = nil + } else if c.Status == Running { + // Container state should reflect the actual state of + // the application, so we don't consider gofer process + // here. + if err := c.Signal(syscall.Signal(0)); err != nil { + c.Status = Stopped + } } } @@ -382,10 +386,12 @@ func (c *Container) Pid() int { } // Wait waits for the container to exit, and returns its WaitStatus. +// Call to wait on a stopped container is needed to retrieve the exit status +// and wait returns immediately. 
func (c *Container) Wait() (syscall.WaitStatus, error) { log.Debugf("Wait on container %q", c.ID) - if c.Status == Stopped { - return 0, fmt.Errorf("container is stopped") + if c.Sandbox == nil || !c.Sandbox.IsRunning() { + return 0, fmt.Errorf("container sandbox is not running") } return c.Sandbox.Wait(c.ID) } @@ -394,8 +400,8 @@ func (c *Container) Wait() (syscall.WaitStatus, error) { // returns its WaitStatus. func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { log.Debugf("Wait on pid %d in sandbox %q", pid, c.Sandbox.ID) - if c.Status == Stopped { - return 0, fmt.Errorf("container is stopped") + if c.Sandbox == nil || !c.Sandbox.IsRunning() { + return 0, fmt.Errorf("container sandbox is not running") } return c.Sandbox.WaitPID(pid, c.Sandbox.ID) } @@ -404,29 +410,19 @@ func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { // its WaitStatus. func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) { log.Debugf("Wait on pid %d in container %q", pid, c.ID) - if c.Status == Stopped { - return 0, fmt.Errorf("container is stopped") - } - ws, err := c.Sandbox.WaitPID(pid, c.ID) - if err != nil { - return 0, err - } - if c.Sandbox.IsRootContainer(c.ID) { - // If waiting for the root, give some time for the sandbox process to exit - // to prevent races with resources that might still be in use. - if err := c.waitForStopped(); err != nil { - return 0, err - } + if c.Sandbox == nil || !c.Sandbox.IsRunning() { + return 0, fmt.Errorf("container sandbox is not running") } - return ws, nil + return c.Sandbox.WaitPID(pid, c.ID) } // Signal sends the signal to the container. +// Signal returns an error if the container is already stopped. +// TODO: Distinguish different error types. 
func (c *Container) Signal(sig syscall.Signal) error { log.Debugf("Signal container %q", c.ID) if c.Status == Stopped { - log.Warningf("container %q not running, not sending signal %v", c.ID, sig) - return nil + return fmt.Errorf("container sandbox is stopped") } // TODO: Query the container for its state, then save it. return c.Sandbox.Signal(c.ID, sig) @@ -437,8 +433,7 @@ func (c *Container) Signal(sig syscall.Signal) error { func (c *Container) Checkpoint(f *os.File) error { log.Debugf("Checkpoint container %q", c.ID) if c.Status == Stopped { - log.Warningf("container %q not running, not checkpointing", c.ID) - return nil + return fmt.Errorf("container sandbox is stopped") } return c.Sandbox.Checkpoint(c.ID, f) } @@ -496,93 +491,36 @@ func (c *Container) Processes() ([]*control.Process, error) { } // Destroy frees all resources associated with the container. +// Destroy returns error if any step fails, and the function can be safely retried. func (c *Container) Destroy() error { log.Debugf("Destroy container %q", c.ID) - // First stop the container. - if c.Sandbox != nil { - if err := c.Sandbox.Stop(c.ID); err != nil { - return err - } + if err := c.stop(); err != nil { + return fmt.Errorf("error stopping container: %v", err) } - // "If any poststop hook fails, the runtime MUST log a warning, but the - // remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec. - if c.Spec.Hooks != nil && (c.Status == Created || c.Status == Running) { - executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) - } - - // If we are the first container in the sandbox, take the sandbox down - // as well. 
- if c.Sandbox != nil && c.Sandbox.IsRootContainer(c.ID) { - if err := c.Sandbox.Destroy(); err != nil { - log.Warningf("Failed to destroy sandbox %q: %v", c.Sandbox.ID, err) - } - } - c.Status = Stopped - c.Sandbox = nil - - if err := c.destroyGofer(); err != nil { - return fmt.Errorf("error destroying gofer: %v", err) + if err := destroyFS(c.Spec); err != nil { + return fmt.Errorf("error destroying container fs: %v", err) } if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) { return fmt.Errorf("error deleting container root directory %q: %v", c.Root, err) } - return nil -} - -func (c *Container) destroyGofer() error { - if c.GoferPid != 0 { - log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid) - if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { - log.Warningf("error sending signal %d to pid %d: %v", syscall.SIGKILL, c.GoferPid, err) - } - } - - // Gofer process may take some time to teardown. Retry in case of failure. - ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) - defer cancel() - b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) - err := backoff.Retry(func() error { return destroyFS(c.Spec) }, b) - if err == nil { - // Success! - c.GoferPid = 0 - } - return err -} - -// IsRunning returns true if the sandbox or gofer process is running. -func (c *Container) IsRunning() bool { - if c.Sandbox != nil && c.Sandbox.IsRunning() { - return true - } - if c.GoferPid != 0 { - // Send a signal 0 to the gofer process. - if err := syscall.Kill(c.GoferPid, 0); err == nil { - log.Warningf("Found orphan gofer process, pid: %d", c.GoferPid) - if err := c.destroyGofer(); err != nil { - log.Warningf("Error destroying gofer: %v", err) - } - - // Don't wait for gofer to die. Return 'running' and hope gofer is dead - // next time around. 
- return true - } + // "If any poststop hook fails, the runtime MUST log a warning, but the + // remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec. + // Based on the OCI, "The post-stop hooks MUST be called after the container is + // deleted but before the delete operation returns" + // Run it here to: + // 1) Conform to the OCI. + // 2) Make sure it only runs once, because the root has been deleted, the container + // can't be loaded again. + if c.Spec.Hooks != nil { + executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) } - return false -} -// DestroyAndWait frees all resources associated with the container -// and waits for destroy to finish before returning. -// -// TODO: This only works for single container. -func (c *Container) DestroyAndWait() error { - if err := c.Destroy(); err != nil { - return fmt.Errorf("error destroying container %v: %v", c, err) - } - return c.waitForStopped() + c.Status = Stopped + return nil } // save saves the container metadata to a file. @@ -602,13 +540,49 @@ func (c *Container) save() error { return nil } +// stop stops the container (for regular containers) or the sandbox (for +// root containers), and waits for the container or sandbox and the gofer +// to stop. If any of them doesn't stop before timeout, an error is returned. +func (c *Container) stop() error { + if c.Sandbox != nil && c.Sandbox.IsRunning() { + log.Debugf("Killing container %q", c.ID) + if c.Sandbox.IsRootContainer(c.ID) { + if err := c.Sandbox.Destroy(); err != nil { + return fmt.Errorf("error destroying sandbox %q: %v", c.Sandbox.ID, err) + } + } else { + if err := c.Signal(syscall.SIGKILL); err != nil { + // The container may already be stopped, log the error. + log.Warningf("Error sending signal %d to container %q: %v", syscall.SIGKILL, c.ID, err) + } + } + } + + // Try killing gofer if it does not exit with container. 
+ if c.GoferPid != 0 { + log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid) + if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { + // The gofer may already be stopped, log the error. + log.Warningf("Error sending signal %d to gofer %d: %v", syscall.SIGKILL, c.GoferPid, err) + } + } + return c.waitForStopped() +} + func (c *Container) waitForStopped() error { ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { - if !c.IsRunning() { - return fmt.Errorf("container is still running") + if c.Sandbox != nil && c.Sandbox.IsRunning() { + if err := c.Signal(syscall.Signal(0)); err == nil { + return fmt.Errorf("container is still running") + } + } + if c.GoferPid != 0 { + if err := syscall.Kill(c.GoferPid, 0); err == nil { + return fmt.Errorf("gofer is still running") + } } return nil } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 790334249..ab1823f1c 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -200,6 +200,7 @@ func run(spec *specs.Spec, conf *boot.Config) error { if err := s.Start(conf); err != nil { return fmt.Errorf("error starting container: %v", err) } + ws, err := s.Wait() if err != nil { return fmt.Errorf("error waiting on container: %v", err) @@ -251,6 +252,35 @@ func configs(opts ...configOption) []*boot.Config { return cs } +// In normal runsc usage, sandbox processes will be parented to +// init and init will reap them. However, in the test environment +// the test runner is the parent and will not reap the sandbox +// processes, so we must do it ourselves, or else they will be left +// as zombies. +// The function returns a wait group, and the caller can reap +// children synchronously by waiting on the wait group. 
+func reapChildren(c *Container) (*sync.WaitGroup, error) { + var wg sync.WaitGroup + p, err := os.FindProcess(c.Sandbox.Pid) + if err != nil { + return nil, fmt.Errorf("error finding sandbox process: %v", err) + } + g, err := os.FindProcess(c.GoferPid) + if err != nil { + return nil, fmt.Errorf("error finding gofer process: %v", err) + } + wg.Add(2) + go func() { + p.Wait() + wg.Done() + }() + go func() { + g.Wait() + wg.Done() + }() + return &wg, nil +} + // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. // It verifies after each step that the container can be loaded from disk, and // has the correct status. @@ -306,6 +336,7 @@ func TestLifecycle(t *testing.T) { if err := s.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } + // Load the container from disk and check the status. s, err = Load(rootDir, id) if err != nil { @@ -352,10 +383,11 @@ func TestLifecycle(t *testing.T) { // and init will reap the sandbox. However, in this case the // test runner is the parent and will not reap the sandbox // process, so we must do it ourselves. - p, _ := os.FindProcess(s.Sandbox.Pid) - p.Wait() - g, _ := os.FindProcess(s.GoferPid) - g.Wait() + reapWg, err := reapChildren(s) + if err != nil { + t.Fatalf("error reaping children: %v", err) + } + reapWg.Wait() // Load the container from disk and check the status. s, err = Load(rootDir, id) @@ -1164,6 +1196,11 @@ func TestConsoleSocket(t *testing.T) { t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) } + // Reap the sandbox process. + if _, err := reapChildren(s); err != nil { + t.Fatalf("error reaping children: %v", err) + } + // Shut it down. 
if err := s.Destroy(); err != nil { t.Fatalf("error destroying container: %v", err) @@ -1259,6 +1296,7 @@ func TestReadonlyRoot(t *testing.T) { if err := s.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } + ws, err := s.Wait() if err != nil { t.Fatalf("error waiting on container: %v", err) @@ -1302,6 +1340,7 @@ func TestReadonlyMount(t *testing.T) { if err := s.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } + ws, err := s.Wait() if err != nil { t.Fatalf("error waiting on container: %v", err) @@ -1547,8 +1586,9 @@ func TestGoferExits(t *testing.T) { if _, err := gofer.Wait(); err != nil { t.Fatalf("error waiting for gofer process: %v", err) } - if c.IsRunning() { - t.Errorf("container shouldn't be running, container: %+v", c) + + if err := c.waitForStopped(); err != nil { + t.Errorf("container is not stopped: %v", err) } } -- cgit v1.2.3