From bde2a91433cfbac426577a691bf13817115b53be Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Thu, 13 Sep 2018 16:36:53 -0700 Subject: runsc: Support container signal/wait. This CL: 1) Fix `runsc wait`, it now also works after the container exits; 2) Generate correct container state in Load; 2) Make sure `Destory` cleanup everything before successfully return. PiperOrigin-RevId: 212900107 Change-Id: Ie129cbb9d74f8151a18364f1fc0b2603eac4109a --- runsc/boot/controller.go | 25 +++--- runsc/boot/loader.go | 75 ++++++++-------- runsc/cmd/checkpoint.go | 2 +- runsc/cmd/debug.go | 6 +- runsc/container/container.go | 178 ++++++++++++++++---------------------- runsc/container/container_test.go | 52 +++++++++-- runsc/sandbox/BUILD | 2 +- runsc/sandbox/sandbox.go | 38 ++++---- 8 files changed, 195 insertions(+), 183 deletions(-) diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index aaac852e0..69154ff23 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -161,8 +161,11 @@ func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { log.Debugf("containerManager.StartRoot") // Tell the root container to start and wait for the result. cm.startChan <- struct{}{} + if err := <-cm.startResultChan; err != nil { + return fmt.Errorf("failed to start sandbox: %v", err) + } cm.l.setRootContainerID(*cid) - return <-cm.startResultChan + return nil } // Processes retrieves information about processes running in the sandbox. @@ -216,11 +219,11 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { return fmt.Errorf("start arguments must contain at least one file for the container root") } - tgid, err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) + err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) if err != nil { return err } - log.Debugf("Container %q started with root PID of %d", args.CID, tgid) + log.Debugf("Container %q started", args.CID) return nil } @@ -241,16 +244,12 @@ func (cm *containerManager) ExecuteAsync(args *ExecArgs, pid *int32) error { // Get the container Root Dirent from the Task, since we must run this // process with the same Root. cm.l.mu.Lock() - tgid, ok := cm.l.containerRootTGIDs[args.CID] + tg, ok := cm.l.containerRootTGs[args.CID] cm.l.mu.Unlock() if !ok { return fmt.Errorf("cannot exec in container %q: no such container", args.CID) } - t := cm.l.k.TaskSet().Root.TaskWithID(kernel.ThreadID(tgid)) - if t == nil { - return fmt.Errorf("cannot exec in container %q: no thread group with ID %d", args.CID, tgid) - } - t.WithMuLocked(func(t *kernel.Task) { + tg.Leader().WithMuLocked(func(t *kernel.Task) { args.Root = t.FSContext().RootDirectory() }) if args.Root != nil { @@ -378,12 +377,15 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { cm.l.k = k cm.l.watchdog = watchdog cm.l.rootProcArgs = kernel.CreateProcessArgs{} - cm.l.setRootContainerID(o.SandboxID) cm.l.restore = true // Tell the root container to start and wait for the result. cm.startChan <- struct{}{} - return <-cm.startResultChan + if err := <-cm.startResultChan; err != nil { + return fmt.Errorf("failed to start sandbox: %v", err) + } + cm.l.setRootContainerID(o.SandboxID) + return nil } // Resume unpauses a container. @@ -423,6 +425,7 @@ type SignalArgs struct { } // Signal sends a signal to the init process of the container. +// TODO: Send signal to exec process. func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { log.Debugf("containerManager.Signal") return cm.l.signal(args.CID, args.Signo) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 30d22b9c6..2ddb358bd 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -16,7 +16,6 @@ package boot import ( - "errors" "fmt" "math/rand" "os" @@ -101,15 +100,15 @@ type Loader struct { // sandboxID is the ID for the whole sandbox. sandboxID string - // mu guards containerRootTGIDs. + // mu guards containerRootTGs. mu sync.Mutex - // containerRootTGIDs maps container IDs to their root processes. It + // containerRootTGs maps container IDs to their root processes. It // can be used to determine which process to manipulate when clients // call methods on particular containers. // - // containerRootTGIDs is guarded by mu. - containerRootTGIDs map[string]kernel.ThreadID + // containerRootTGs is guarded by mu. + containerRootTGs map[string]*kernel.ThreadGroup } func init() { @@ -399,11 +398,11 @@ func (l *Loader) run() error { // startContainer starts a child container. It returns the thread group ID of // the newly created process. -func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) (kernel.ThreadID, error) { +func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error { // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { - return 0, fmt.Errorf("error creating capabilities: %v", err) + return fmt.Errorf("error creating capabilities: %v", err) } // Convert the spec's additional GIDs to KGIDs. @@ -429,7 +428,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config procArgs, err := newProcess(spec, creds, l.k) if err != nil { - return 0, fmt.Errorf("failed to create new process: %v", err) + return fmt.Errorf("failed to create new process: %v", err) } // Can't take ownership away from os.File. dup them to get a new FDs. @@ -437,7 +436,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config for _, f := range files { fd, err := syscall.Dup(int(f.Fd())) if err != nil { - return 0, fmt.Errorf("failed to dup file: %v", err) + return fmt.Errorf("failed to dup file: %v", err) } f.Close() ioFDs = append(ioFDs, fd) @@ -453,24 +452,18 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config procArgs.Limits, k, cid); err != nil { - return 0, fmt.Errorf("failed to create new process: %v", err) + return fmt.Errorf("failed to create new process: %v", err) } ctx := procArgs.NewContext(l.k) mns := k.RootMountNamespace() if err := setExecutablePath(ctx, mns, &procArgs); err != nil { - return 0, fmt.Errorf("error setting executable path for %+v: %v", procArgs, err) + return fmt.Errorf("error setting executable path for %+v: %v", procArgs, err) } tg, err := l.k.CreateProcess(procArgs) if err != nil { - return 0, fmt.Errorf("failed to create process in sentry: %v", err) - } - - ts := l.k.TaskSet() - tgid := ts.Root.IDOfThreadGroup(tg) - if tgid == 0 { - return 0, errors.New("failed to get thread group ID of new process") + return fmt.Errorf("failed to create process in sentry: %v", err) } // CreateProcess takes a reference on FDMap if successful. @@ -478,9 +471,9 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config l.mu.Lock() defer l.mu.Unlock() - l.containerRootTGIDs[cid] = tgid + l.containerRootTGs[cid] = tg - return tgid, nil + return nil } // TODO: Per-container namespaces must be supported for -pid. @@ -490,53 +483,56 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // Don't defer unlock, as doing so would make it impossible for // multiple clients to wait on the same container. l.mu.Lock() - tgid, ok := l.containerRootTGIDs[cid] + tg, ok := l.containerRootTGs[cid] if !ok { defer l.mu.Unlock() - return fmt.Errorf("can't find process for container %q in %v", cid, l.containerRootTGIDs) + return fmt.Errorf("can't find process for container %q in %v", cid, l.containerRootTGs) } l.mu.Unlock() // If the thread either has already exited or exits during waiting, // consider the container exited. + // TODO: Multiple calls to waitContainer() should return + // the same exit status. defer func() { l.mu.Lock() defer l.mu.Unlock() // TODO: Containers don't map 1:1 with their root // processes. Container exits should be managed explicitly // rather than via PID. - delete(l.containerRootTGIDs, cid) + delete(l.containerRootTGs, cid) }() - return l.wait(tgid, cid, waitStatus) + l.wait(tg, waitStatus) + return nil } func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { // TODO: Containers all currently share a PID namespace. // When per-container PID namespaces are supported, wait should use cid // to find the appropriate PID namespace. - if cid != l.sandboxID { + /*if cid != l.sandboxID { return errors.New("non-sandbox PID namespaces are not yet implemented") + }*/ + // TODO: This won't work if the exec process already exited. + tg := l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid)) + if tg == nil { + return fmt.Errorf("no thread group with ID %d", tgid) } - return l.wait(tgid, cid, waitStatus) + l.wait(tg, waitStatus) + return nil } // wait waits for the process with TGID 'tgid' in a container's PID namespace // to exit. -func (l *Loader) wait(tgid kernel.ThreadID, cid string, waitStatus *uint32) error { - tg := l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid)) - if tg == nil { - return fmt.Errorf("no thread group with ID %d", tgid) - } +func (l *Loader) wait(tg *kernel.ThreadGroup, waitStatus *uint32) { tg.WaitExited() *waitStatus = tg.ExitStatus().Status() - return nil } func (l *Loader) setRootContainerID(cid string) { l.mu.Lock() defer l.mu.Unlock() - // The root container has PID 1. - l.containerRootTGIDs = map[string]kernel.ThreadID{cid: 1} + l.containerRootTGs = map[string]*kernel.ThreadGroup{cid: l.k.GlobalInit()} l.sandboxID = cid } @@ -579,18 +575,15 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) { } } +// TODO: Support sending signal to all. func (l *Loader) signal(cid string, signo int32) error { l.mu.Lock() - tgid, ok := l.containerRootTGIDs[cid] + tg, ok := l.containerRootTGs[cid] l.mu.Unlock() if !ok { return fmt.Errorf("failed to signal container %q: no such container", cid) } - // The thread group ID of a process is the leading task's thread ID. - t := l.k.TaskSet().Root.TaskWithID(tgid) - if t == nil { - return fmt.Errorf("cannot signal: no task with ID %d", tgid) - } - return t.SendSignal(&arch.SignalInfo{Signo: signo}) + si := arch.SignalInfo{Signo: signo} + return tg.Leader().SendSignal(&si) } diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go index 05014ba3d..7c2c3f59e 100644 --- a/runsc/cmd/checkpoint.go +++ b/runsc/cmd/checkpoint.go @@ -129,7 +129,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa log.Warningf("ignoring console socket since it cannot be restored") } - if err := cont.DestroyAndWait(); err != nil { + if err := cont.Destroy(); err != nil { Fatalf("error destroying container: %v", err) } diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go index b20987b2c..caa44168b 100644 --- a/runsc/cmd/debug.go +++ b/runsc/cmd/debug.go @@ -95,10 +95,10 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) } } - log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid) - if !c.IsRunning() { - Fatalf("sandbox %q is not running", c.Sandbox.ID) + if c.Sandbox == nil || !c.Sandbox.IsRunning() { + Fatalf("container sandbox is not running") } + log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid) if d.signal > 0 { log.Infof("Sending signal %d to process: %d", d.signal, c.Sandbox.Pid) diff --git a/runsc/container/container.go b/runsc/container/container.go index 38848d02f..792b7967b 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -136,13 +136,17 @@ func Load(rootDir, id string) (*Container, error) { // This is inherently racey. if c.Status == Running || c.Status == Created { // Check if the sandbox process is still running. - if c.IsRunning() { - // TODO: Send a message into the sandbox to - // see if this particular container is still running. - } else { + if !c.Sandbox.IsRunning() { // Sandbox no longer exists, so this container definitely does not exist. c.Status = Stopped c.Sandbox = nil + } else if c.Status == Running { + // Container state should reflect the actual state of + // the application, so we don't consider gofer process + // here. + if err := c.Signal(syscall.Signal(0)); err != nil { + c.Status = Stopped + } } } @@ -382,10 +386,12 @@ func (c *Container) Pid() int { } // Wait waits for the container to exit, and returns its WaitStatus. +// Call to wait on a stopped container is needed to retrieve the exit status +// and wait returns immediately. func (c *Container) Wait() (syscall.WaitStatus, error) { log.Debugf("Wait on container %q", c.ID) - if c.Status == Stopped { - return 0, fmt.Errorf("container is stopped") + if c.Sandbox == nil || !c.Sandbox.IsRunning() { + return 0, fmt.Errorf("container sandbox is not running") } return c.Sandbox.Wait(c.ID) } @@ -394,8 +400,8 @@ func (c *Container) Wait() (syscall.WaitStatus, error) { // returns its WaitStatus. func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { log.Debugf("Wait on pid %d in sandbox %q", pid, c.Sandbox.ID) - if c.Status == Stopped { - return 0, fmt.Errorf("container is stopped") + if c.Sandbox == nil || !c.Sandbox.IsRunning() { + return 0, fmt.Errorf("container sandbox is not running") } return c.Sandbox.WaitPID(pid, c.Sandbox.ID) } @@ -404,29 +410,19 @@ func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) { // its WaitStatus. func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) { log.Debugf("Wait on pid %d in container %q", pid, c.ID) - if c.Status == Stopped { - return 0, fmt.Errorf("container is stopped") - } - ws, err := c.Sandbox.WaitPID(pid, c.ID) - if err != nil { - return 0, err - } - if c.Sandbox.IsRootContainer(c.ID) { - // If waiting for the root, give some time for the sandbox process to exit - // to prevent races with resources that might still be in use. - if err := c.waitForStopped(); err != nil { - return 0, err - } + if c.Sandbox == nil || !c.Sandbox.IsRunning() { + return 0, fmt.Errorf("container sandbox is not running") } - return ws, nil + return c.Sandbox.WaitPID(pid, c.ID) } // Signal sends the signal to the container. +// Signal returns an error if the container is already stopped. +// TODO: Distinguish different error types. func (c *Container) Signal(sig syscall.Signal) error { log.Debugf("Signal container %q", c.ID) if c.Status == Stopped { - log.Warningf("container %q not running, not sending signal %v", c.ID, sig) - return nil + return fmt.Errorf("container sandbox is stopped") } // TODO: Query the container for its state, then save it. return c.Sandbox.Signal(c.ID, sig) @@ -437,8 +433,7 @@ func (c *Container) Signal(sig syscall.Signal) error { func (c *Container) Checkpoint(f *os.File) error { log.Debugf("Checkpoint container %q", c.ID) if c.Status == Stopped { - log.Warningf("container %q not running, not checkpointing", c.ID) - return nil + return fmt.Errorf("container sandbox is stopped") } return c.Sandbox.Checkpoint(c.ID, f) } @@ -496,93 +491,36 @@ func (c *Container) Processes() ([]*control.Process, error) { } // Destroy frees all resources associated with the container. +// Destroy returns error if any step fails, and the function can be safely retried. func (c *Container) Destroy() error { log.Debugf("Destroy container %q", c.ID) - // First stop the container. - if c.Sandbox != nil { - if err := c.Sandbox.Stop(c.ID); err != nil { - return err - } + if err := c.stop(); err != nil { + return fmt.Errorf("error stopping container: %v", err) } - // "If any poststop hook fails, the runtime MUST log a warning, but the - // remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec. - if c.Spec.Hooks != nil && (c.Status == Created || c.Status == Running) { - executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) - } - - // If we are the first container in the sandbox, take the sandbox down - // as well. - if c.Sandbox != nil && c.Sandbox.IsRootContainer(c.ID) { - if err := c.Sandbox.Destroy(); err != nil { - log.Warningf("Failed to destroy sandbox %q: %v", c.Sandbox.ID, err) - } - } - c.Status = Stopped - c.Sandbox = nil - - if err := c.destroyGofer(); err != nil { - return fmt.Errorf("error destroying gofer: %v", err) + if err := destroyFS(c.Spec); err != nil { + return fmt.Errorf("error destroying container fs: %v", err) } if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) { return fmt.Errorf("error deleting container root directory %q: %v", c.Root, err) } - return nil -} - -func (c *Container) destroyGofer() error { - if c.GoferPid != 0 { - log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid) - if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { - log.Warningf("error sending signal %d to pid %d: %v", syscall.SIGKILL, c.GoferPid, err) - } - } - - // Gofer process may take some time to teardown. Retry in case of failure. - ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) - defer cancel() - b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) - err := backoff.Retry(func() error { return destroyFS(c.Spec) }, b) - if err == nil { - // Success! - c.GoferPid = 0 - } - return err -} - -// IsRunning returns true if the sandbox or gofer process is running. -func (c *Container) IsRunning() bool { - if c.Sandbox != nil && c.Sandbox.IsRunning() { - return true - } - if c.GoferPid != 0 { - // Send a signal 0 to the gofer process. - if err := syscall.Kill(c.GoferPid, 0); err == nil { - log.Warningf("Found orphan gofer process, pid: %d", c.GoferPid) - if err := c.destroyGofer(); err != nil { - log.Warningf("Error destroying gofer: %v", err) - } - - // Don't wait for gofer to die. Return 'running' and hope gofer is dead - // next time around. - return true - } + // "If any poststop hook fails, the runtime MUST log a warning, but the + // remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec. + // Based on the OCI, "The post-stop hooks MUST be called after the container is + // deleted but before the delete operation returns" + // Run it here to: + // 1) Conform to the OCI. + // 2) Make sure it only runs once, because the root has been deleted, the container + // can't be loaded again. + if c.Spec.Hooks != nil { + executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State()) } - return false -} -// DestroyAndWait frees all resources associated with the container -// and waits for destroy to finish before returning. -// -// TODO: This only works for single container. -func (c *Container) DestroyAndWait() error { - if err := c.Destroy(); err != nil { - return fmt.Errorf("error destroying container %v: %v", c, err) - } - return c.waitForStopped() + c.Status = Stopped + return nil } // save saves the container metadata to a file. @@ -602,13 +540,49 @@ func (c *Container) save() error { return nil } +// stop stops the container (for regular containers) or the sandbox (for +// root containers), and waits for the container or sandbox and the gofer +// to stop. If any of them doesn't stop before timeout, an error is returned. +func (c *Container) stop() error { + if c.Sandbox != nil && c.Sandbox.IsRunning() { + log.Debugf("Killing container %q", c.ID) + if c.Sandbox.IsRootContainer(c.ID) { + if err := c.Sandbox.Destroy(); err != nil { + return fmt.Errorf("error destroying sandbox %q: %v", c.Sandbox.ID, err) + } + } else { + if err := c.Signal(syscall.SIGKILL); err != nil { + // The container may already be stopped, log the error. + log.Warningf("Error sending signal %d to container %q: %v", syscall.SIGKILL, c.ID, err) + } + } + } + + // Try killing gofer if it does not exit with container. + if c.GoferPid != 0 { + log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid) + if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil { + // The gofer may already be stopped, log the error. + log.Warningf("Error sending signal %d to gofer %d: %v", syscall.SIGKILL, c.GoferPid, err) + } + } + return c.waitForStopped() +} + func (c *Container) waitForStopped() error { ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) defer cancel() b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) op := func() error { - if !c.IsRunning() { - return fmt.Errorf("container is still running") + if c.Sandbox != nil && c.Sandbox.IsRunning() { + if err := c.Signal(syscall.Signal(0)); err == nil { + return fmt.Errorf("container is still running") + } + } + if c.GoferPid != 0 { + if err := syscall.Kill(c.GoferPid, 0); err == nil { + return fmt.Errorf("gofer is still running") + } } return nil } diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 790334249..ab1823f1c 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -200,6 +200,7 @@ func run(spec *specs.Spec, conf *boot.Config) error { if err := s.Start(conf); err != nil { return fmt.Errorf("error starting container: %v", err) } + ws, err := s.Wait() if err != nil { return fmt.Errorf("error waiting on container: %v", err) @@ -251,6 +252,35 @@ func configs(opts ...configOption) []*boot.Config { return cs } +// In normal runsc usage, sandbox processes will be parented to +// init and init will reap the them. However, in the test environment +// the test runner is the parent and will not reap the sandbox +// processes, so we must do it ourselves, or else they will left +// as zombies. +// The function returns a wait group, and the caller can reap +// children synchronously by waiting on the wait group. +func reapChildren(c *Container) (*sync.WaitGroup, error) { + var wg sync.WaitGroup + p, err := os.FindProcess(c.Sandbox.Pid) + if err != nil { + return nil, fmt.Errorf("error finding sandbox process: %v", err) + } + g, err := os.FindProcess(c.GoferPid) + if err != nil { + return nil, fmt.Errorf("error finding gofer process: %v", err) + } + wg.Add(2) + go func() { + p.Wait() + wg.Done() + }() + go func() { + g.Wait() + wg.Done() + }() + return &wg, nil +} + // TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle. // It verifies after each step that the container can be loaded from disk, and // has the correct status. @@ -306,6 +336,7 @@ func TestLifecycle(t *testing.T) { if err := s.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } + // Load the container from disk and check the status. s, err = Load(rootDir, id) if err != nil { @@ -352,10 +383,11 @@ func TestLifecycle(t *testing.T) { // and init will reap the sandbox. However, in this case the // test runner is the parent and will not reap the sandbox // process, so we must do it ourselves. - p, _ := os.FindProcess(s.Sandbox.Pid) - p.Wait() - g, _ := os.FindProcess(s.GoferPid) - g.Wait() + reapWg, err := reapChildren(s) + if err != nil { + t.Fatalf("error reaping children: %v", err) + } + reapWg.Wait() // Load the container from disk and check the status. s, err = Load(rootDir, id) @@ -1164,6 +1196,11 @@ func TestConsoleSocket(t *testing.T) { t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err) } + // Reap the sandbox process. + if _, err := reapChildren(s); err != nil { + t.Fatalf("error reaping children: %v", err) + } + // Shut it down. if err := s.Destroy(); err != nil { t.Fatalf("error destroying container: %v", err) @@ -1259,6 +1296,7 @@ func TestReadonlyRoot(t *testing.T) { if err := s.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } + ws, err := s.Wait() if err != nil { t.Fatalf("error waiting on container: %v", err) @@ -1302,6 +1340,7 @@ func TestReadonlyMount(t *testing.T) { if err := s.Start(conf); err != nil { t.Fatalf("error starting container: %v", err) } + ws, err := s.Wait() if err != nil { t.Fatalf("error waiting on container: %v", err) @@ -1547,8 +1586,9 @@ func TestGoferExits(t *testing.T) { if _, err := gofer.Wait(); err != nil { t.Fatalf("error waiting for gofer process: %v", err) } - if c.IsRunning() { - t.Errorf("container shouldn't be running, container: %+v", c) + + if err := c.waitForStopped(); err != nil { + t.Errorf("container is not stopped: %v", err) } } diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 5cf8f0cda..7ae19ff35 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -23,8 +23,8 @@ go_library( "//runsc/boot", "//runsc/console", "//runsc/specutils", + "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@com_github_vishvananda_netlink//:go_default_library", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index 8e90dcc70..156b2f769 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -16,6 +16,7 @@ package sandbox import ( + "context" "fmt" "os" "os/exec" @@ -23,8 +24,8 @@ import ( "syscall" "time" + "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" - "golang.org/x/sys/unix" "gvisor.googlesource.com/gvisor/pkg/control/client" "gvisor.googlesource.com/gvisor/pkg/control/server" "gvisor.googlesource.com/gvisor/pkg/log" @@ -543,20 +544,18 @@ func (s *Sandbox) IsRootContainer(cid string) bool { return s.ID == cid } -// Stop stops the container in the sandbox. -func (s *Sandbox) Stop(cid string) error { - // TODO: This should stop the container with the given ID - // in the sandbox. - return nil -} - // Destroy frees all resources associated with the sandbox. +// Destroy returns error if any step fails, and the function can be safely retried. func (s *Sandbox) Destroy() error { log.Debugf("Destroy sandbox %q", s.ID) if s.Pid != 0 { - // TODO: Too harsh? log.Debugf("Killing sandbox %q", s.ID) - signalProcess(s.Pid, unix.SIGKILL) + if err := syscall.Kill(s.Pid, syscall.SIGKILL); err != nil && err != syscall.ESRCH { + return fmt.Errorf("error killing sandbox %q PID %q: %v", s.ID, s.Pid, err) + } + if err := s.waitForStopped(); err != nil { + return fmt.Errorf("error waiting sandbox %q stop: %v", s.ID, err) + } } if s.Chroot != "" { @@ -641,7 +640,7 @@ func (s *Sandbox) Resume(cid string) error { func (s *Sandbox) IsRunning() bool { if s.Pid != 0 { // Send a signal 0 to the sandbox process. - if err := signalProcess(s.Pid, 0); err == nil { + if err := syscall.Kill(s.Pid, 0); err == nil { // Succeeded, process is running. return true } @@ -665,14 +664,17 @@ func (s *Sandbox) Stacks() (string, error) { return stacks, nil } -// signalProcess sends a signal to the host process (i.e. a sandbox or gofer -// process). Sandbox.Signal should be used to send a signal to a process -// running inside the sandbox. -func signalProcess(pid int, sig syscall.Signal) error { - if err := syscall.Kill(pid, sig); err != nil { - return fmt.Errorf("error sending signal %d to pid %d: %v", sig, pid, err) +func (s *Sandbox) waitForStopped() error { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) + op := func() error { + if s.IsRunning() { + return fmt.Errorf("sandbox is still running") + } + return nil } - return nil + return backoff.Retry(op, b) } // deviceFileForPlatform opens the device file for the given platform. If the -- cgit v1.2.3