summaryrefslogtreecommitdiffhomepage
path: root/runsc
diff options
context:
space:
mode:
authorLantao Liu <lantaol@google.com>2018-09-13 16:36:53 -0700
committerShentubot <shentubot@google.com>2018-09-13 16:38:03 -0700
commitbde2a91433cfbac426577a691bf13817115b53be (patch)
tree1403a6e5ffca3345da142bf68535763b6f34e5a9 /runsc
parentadf8f339703922211886d3e5588160f65bc131b3 (diff)
runsc: Support container signal/wait.
This CL: 1) Fix `runsc wait`, it now also works after the container exits; 2) Generate correct container state in Load; 2) Make sure `Destory` cleanup everything before successfully return. PiperOrigin-RevId: 212900107 Change-Id: Ie129cbb9d74f8151a18364f1fc0b2603eac4109a
Diffstat (limited to 'runsc')
-rw-r--r--runsc/boot/controller.go25
-rw-r--r--runsc/boot/loader.go75
-rw-r--r--runsc/cmd/checkpoint.go2
-rw-r--r--runsc/cmd/debug.go6
-rw-r--r--runsc/container/container.go178
-rw-r--r--runsc/container/container_test.go52
-rw-r--r--runsc/sandbox/BUILD2
-rw-r--r--runsc/sandbox/sandbox.go38
8 files changed, 195 insertions, 183 deletions
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index aaac852e0..69154ff23 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -161,8 +161,11 @@ func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error {
log.Debugf("containerManager.StartRoot")
// Tell the root container to start and wait for the result.
cm.startChan <- struct{}{}
+ if err := <-cm.startResultChan; err != nil {
+ return fmt.Errorf("failed to start sandbox: %v", err)
+ }
cm.l.setRootContainerID(*cid)
- return <-cm.startResultChan
+ return nil
}
// Processes retrieves information about processes running in the sandbox.
@@ -216,11 +219,11 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
return fmt.Errorf("start arguments must contain at least one file for the container root")
}
- tgid, err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+ err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files)
if err != nil {
return err
}
- log.Debugf("Container %q started with root PID of %d", args.CID, tgid)
+ log.Debugf("Container %q started", args.CID)
return nil
}
@@ -241,16 +244,12 @@ func (cm *containerManager) ExecuteAsync(args *ExecArgs, pid *int32) error {
// Get the container Root Dirent from the Task, since we must run this
// process with the same Root.
cm.l.mu.Lock()
- tgid, ok := cm.l.containerRootTGIDs[args.CID]
+ tg, ok := cm.l.containerRootTGs[args.CID]
cm.l.mu.Unlock()
if !ok {
return fmt.Errorf("cannot exec in container %q: no such container", args.CID)
}
- t := cm.l.k.TaskSet().Root.TaskWithID(kernel.ThreadID(tgid))
- if t == nil {
- return fmt.Errorf("cannot exec in container %q: no thread group with ID %d", args.CID, tgid)
- }
- t.WithMuLocked(func(t *kernel.Task) {
+ tg.Leader().WithMuLocked(func(t *kernel.Task) {
args.Root = t.FSContext().RootDirectory()
})
if args.Root != nil {
@@ -378,12 +377,15 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
cm.l.k = k
cm.l.watchdog = watchdog
cm.l.rootProcArgs = kernel.CreateProcessArgs{}
- cm.l.setRootContainerID(o.SandboxID)
cm.l.restore = true
// Tell the root container to start and wait for the result.
cm.startChan <- struct{}{}
- return <-cm.startResultChan
+ if err := <-cm.startResultChan; err != nil {
+ return fmt.Errorf("failed to start sandbox: %v", err)
+ }
+ cm.l.setRootContainerID(o.SandboxID)
+ return nil
}
// Resume unpauses a container.
@@ -423,6 +425,7 @@ type SignalArgs struct {
}
// Signal sends a signal to the init process of the container.
+// TODO: Send signal to exec process.
func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error {
log.Debugf("containerManager.Signal")
return cm.l.signal(args.CID, args.Signo)
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 30d22b9c6..2ddb358bd 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -16,7 +16,6 @@
package boot
import (
- "errors"
"fmt"
"math/rand"
"os"
@@ -101,15 +100,15 @@ type Loader struct {
// sandboxID is the ID for the whole sandbox.
sandboxID string
- // mu guards containerRootTGIDs.
+ // mu guards containerRootTGs.
mu sync.Mutex
- // containerRootTGIDs maps container IDs to their root processes. It
+ // containerRootTGs maps container IDs to their root processes. It
// can be used to determine which process to manipulate when clients
// call methods on particular containers.
//
- // containerRootTGIDs is guarded by mu.
- containerRootTGIDs map[string]kernel.ThreadID
+ // containerRootTGs is guarded by mu.
+ containerRootTGs map[string]*kernel.ThreadGroup
}
func init() {
@@ -399,11 +398,11 @@ func (l *Loader) run() error {
// startContainer starts a child container. It returns the thread group ID of
// the newly created process.
-func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) (kernel.ThreadID, error) {
+func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
// Create capabilities.
caps, err := specutils.Capabilities(spec.Process.Capabilities)
if err != nil {
- return 0, fmt.Errorf("error creating capabilities: %v", err)
+ return fmt.Errorf("error creating capabilities: %v", err)
}
// Convert the spec's additional GIDs to KGIDs.
@@ -429,7 +428,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
procArgs, err := newProcess(spec, creds, l.k)
if err != nil {
- return 0, fmt.Errorf("failed to create new process: %v", err)
+ return fmt.Errorf("failed to create new process: %v", err)
}
// Can't take ownership away from os.File. dup them to get a new FDs.
@@ -437,7 +436,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
for _, f := range files {
fd, err := syscall.Dup(int(f.Fd()))
if err != nil {
- return 0, fmt.Errorf("failed to dup file: %v", err)
+ return fmt.Errorf("failed to dup file: %v", err)
}
f.Close()
ioFDs = append(ioFDs, fd)
@@ -453,24 +452,18 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
procArgs.Limits,
k,
cid); err != nil {
- return 0, fmt.Errorf("failed to create new process: %v", err)
+ return fmt.Errorf("failed to create new process: %v", err)
}
ctx := procArgs.NewContext(l.k)
mns := k.RootMountNamespace()
if err := setExecutablePath(ctx, mns, &procArgs); err != nil {
- return 0, fmt.Errorf("error setting executable path for %+v: %v", procArgs, err)
+ return fmt.Errorf("error setting executable path for %+v: %v", procArgs, err)
}
tg, err := l.k.CreateProcess(procArgs)
if err != nil {
- return 0, fmt.Errorf("failed to create process in sentry: %v", err)
- }
-
- ts := l.k.TaskSet()
- tgid := ts.Root.IDOfThreadGroup(tg)
- if tgid == 0 {
- return 0, errors.New("failed to get thread group ID of new process")
+ return fmt.Errorf("failed to create process in sentry: %v", err)
}
// CreateProcess takes a reference on FDMap if successful.
@@ -478,9 +471,9 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
l.mu.Lock()
defer l.mu.Unlock()
- l.containerRootTGIDs[cid] = tgid
+ l.containerRootTGs[cid] = tg
- return tgid, nil
+ return nil
}
// TODO: Per-container namespaces must be supported for -pid.
@@ -490,53 +483,56 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
// Don't defer unlock, as doing so would make it impossible for
// multiple clients to wait on the same container.
l.mu.Lock()
- tgid, ok := l.containerRootTGIDs[cid]
+ tg, ok := l.containerRootTGs[cid]
if !ok {
defer l.mu.Unlock()
- return fmt.Errorf("can't find process for container %q in %v", cid, l.containerRootTGIDs)
+ return fmt.Errorf("can't find process for container %q in %v", cid, l.containerRootTGs)
}
l.mu.Unlock()
// If the thread either has already exited or exits during waiting,
// consider the container exited.
+ // TODO: Multiple calls to waitContainer() should return
+ // the same exit status.
defer func() {
l.mu.Lock()
defer l.mu.Unlock()
// TODO: Containers don't map 1:1 with their root
// processes. Container exits should be managed explicitly
// rather than via PID.
- delete(l.containerRootTGIDs, cid)
+ delete(l.containerRootTGs, cid)
}()
- return l.wait(tgid, cid, waitStatus)
+ l.wait(tg, waitStatus)
+ return nil
}
func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
// TODO: Containers all currently share a PID namespace.
// When per-container PID namespaces are supported, wait should use cid
// to find the appropriate PID namespace.
- if cid != l.sandboxID {
+ /*if cid != l.sandboxID {
return errors.New("non-sandbox PID namespaces are not yet implemented")
+ }*/
+ // TODO: This won't work if the exec process already exited.
+ tg := l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid))
+ if tg == nil {
+ return fmt.Errorf("no thread group with ID %d", tgid)
}
- return l.wait(tgid, cid, waitStatus)
+ l.wait(tg, waitStatus)
+ return nil
}
// wait waits for the process with TGID 'tgid' in a container's PID namespace
// to exit.
-func (l *Loader) wait(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
- tg := l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid))
- if tg == nil {
- return fmt.Errorf("no thread group with ID %d", tgid)
- }
+func (l *Loader) wait(tg *kernel.ThreadGroup, waitStatus *uint32) {
tg.WaitExited()
*waitStatus = tg.ExitStatus().Status()
- return nil
}
func (l *Loader) setRootContainerID(cid string) {
l.mu.Lock()
defer l.mu.Unlock()
- // The root container has PID 1.
- l.containerRootTGIDs = map[string]kernel.ThreadID{cid: 1}
+ l.containerRootTGs = map[string]*kernel.ThreadGroup{cid: l.k.GlobalInit()}
l.sandboxID = cid
}
@@ -579,18 +575,15 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
}
}
+// TODO: Support sending signal to all.
func (l *Loader) signal(cid string, signo int32) error {
l.mu.Lock()
- tgid, ok := l.containerRootTGIDs[cid]
+ tg, ok := l.containerRootTGs[cid]
l.mu.Unlock()
if !ok {
return fmt.Errorf("failed to signal container %q: no such container", cid)
}
- // The thread group ID of a process is the leading task's thread ID.
- t := l.k.TaskSet().Root.TaskWithID(tgid)
- if t == nil {
- return fmt.Errorf("cannot signal: no task with ID %d", tgid)
- }
- return t.SendSignal(&arch.SignalInfo{Signo: signo})
+ si := arch.SignalInfo{Signo: signo}
+ return tg.Leader().SendSignal(&si)
}
diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go
index 05014ba3d..7c2c3f59e 100644
--- a/runsc/cmd/checkpoint.go
+++ b/runsc/cmd/checkpoint.go
@@ -129,7 +129,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa
log.Warningf("ignoring console socket since it cannot be restored")
}
- if err := cont.DestroyAndWait(); err != nil {
+ if err := cont.Destroy(); err != nil {
Fatalf("error destroying container: %v", err)
}
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index b20987b2c..caa44168b 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -95,10 +95,10 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
}
- log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid)
- if !c.IsRunning() {
- Fatalf("sandbox %q is not running", c.Sandbox.ID)
+ if c.Sandbox == nil || !c.Sandbox.IsRunning() {
+ Fatalf("container sandbox is not running")
}
+ log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid)
if d.signal > 0 {
log.Infof("Sending signal %d to process: %d", d.signal, c.Sandbox.Pid)
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 38848d02f..792b7967b 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -136,13 +136,17 @@ func Load(rootDir, id string) (*Container, error) {
// This is inherently racey.
if c.Status == Running || c.Status == Created {
// Check if the sandbox process is still running.
- if c.IsRunning() {
- // TODO: Send a message into the sandbox to
- // see if this particular container is still running.
- } else {
+ if !c.Sandbox.IsRunning() {
// Sandbox no longer exists, so this container definitely does not exist.
c.Status = Stopped
c.Sandbox = nil
+ } else if c.Status == Running {
+ // Container state should reflect the actual state of
+ // the application, so we don't consider gofer process
+ // here.
+ if err := c.Signal(syscall.Signal(0)); err != nil {
+ c.Status = Stopped
+ }
}
}
@@ -382,10 +386,12 @@ func (c *Container) Pid() int {
}
// Wait waits for the container to exit, and returns its WaitStatus.
+// Call to wait on a stopped container is needed to retrieve the exit status
+// and wait returns immediately.
func (c *Container) Wait() (syscall.WaitStatus, error) {
log.Debugf("Wait on container %q", c.ID)
- if c.Status == Stopped {
- return 0, fmt.Errorf("container is stopped")
+ if c.Sandbox == nil || !c.Sandbox.IsRunning() {
+ return 0, fmt.Errorf("container sandbox is not running")
}
return c.Sandbox.Wait(c.ID)
}
@@ -394,8 +400,8 @@ func (c *Container) Wait() (syscall.WaitStatus, error) {
// returns its WaitStatus.
func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) {
log.Debugf("Wait on pid %d in sandbox %q", pid, c.Sandbox.ID)
- if c.Status == Stopped {
- return 0, fmt.Errorf("container is stopped")
+ if c.Sandbox == nil || !c.Sandbox.IsRunning() {
+ return 0, fmt.Errorf("container sandbox is not running")
}
return c.Sandbox.WaitPID(pid, c.Sandbox.ID)
}
@@ -404,29 +410,19 @@ func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) {
// its WaitStatus.
func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) {
log.Debugf("Wait on pid %d in container %q", pid, c.ID)
- if c.Status == Stopped {
- return 0, fmt.Errorf("container is stopped")
- }
- ws, err := c.Sandbox.WaitPID(pid, c.ID)
- if err != nil {
- return 0, err
- }
- if c.Sandbox.IsRootContainer(c.ID) {
- // If waiting for the root, give some time for the sandbox process to exit
- // to prevent races with resources that might still be in use.
- if err := c.waitForStopped(); err != nil {
- return 0, err
- }
+ if c.Sandbox == nil || !c.Sandbox.IsRunning() {
+ return 0, fmt.Errorf("container sandbox is not running")
}
- return ws, nil
+ return c.Sandbox.WaitPID(pid, c.ID)
}
// Signal sends the signal to the container.
+// Signal returns an error if the container is already stopped.
+// TODO: Distinguish different error types.
func (c *Container) Signal(sig syscall.Signal) error {
log.Debugf("Signal container %q", c.ID)
if c.Status == Stopped {
- log.Warningf("container %q not running, not sending signal %v", c.ID, sig)
- return nil
+ return fmt.Errorf("container sandbox is stopped")
}
// TODO: Query the container for its state, then save it.
return c.Sandbox.Signal(c.ID, sig)
@@ -437,8 +433,7 @@ func (c *Container) Signal(sig syscall.Signal) error {
func (c *Container) Checkpoint(f *os.File) error {
log.Debugf("Checkpoint container %q", c.ID)
if c.Status == Stopped {
- log.Warningf("container %q not running, not checkpointing", c.ID)
- return nil
+ return fmt.Errorf("container sandbox is stopped")
}
return c.Sandbox.Checkpoint(c.ID, f)
}
@@ -496,93 +491,36 @@ func (c *Container) Processes() ([]*control.Process, error) {
}
// Destroy frees all resources associated with the container.
+// Destroy returns error if any step fails, and the function can be safely retried.
func (c *Container) Destroy() error {
log.Debugf("Destroy container %q", c.ID)
- // First stop the container.
- if c.Sandbox != nil {
- if err := c.Sandbox.Stop(c.ID); err != nil {
- return err
- }
+ if err := c.stop(); err != nil {
+ return fmt.Errorf("error stopping container: %v", err)
}
- // "If any poststop hook fails, the runtime MUST log a warning, but the
- // remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec.
- if c.Spec.Hooks != nil && (c.Status == Created || c.Status == Running) {
- executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State())
- }
-
- // If we are the first container in the sandbox, take the sandbox down
- // as well.
- if c.Sandbox != nil && c.Sandbox.IsRootContainer(c.ID) {
- if err := c.Sandbox.Destroy(); err != nil {
- log.Warningf("Failed to destroy sandbox %q: %v", c.Sandbox.ID, err)
- }
- }
- c.Status = Stopped
- c.Sandbox = nil
-
- if err := c.destroyGofer(); err != nil {
- return fmt.Errorf("error destroying gofer: %v", err)
+ if err := destroyFS(c.Spec); err != nil {
+ return fmt.Errorf("error destroying container fs: %v", err)
}
if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) {
return fmt.Errorf("error deleting container root directory %q: %v", c.Root, err)
}
- return nil
-}
-
-func (c *Container) destroyGofer() error {
- if c.GoferPid != 0 {
- log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid)
- if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil {
- log.Warningf("error sending signal %d to pid %d: %v", syscall.SIGKILL, c.GoferPid, err)
- }
- }
-
- // Gofer process may take some time to teardown. Retry in case of failure.
- ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
- defer cancel()
- b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
- err := backoff.Retry(func() error { return destroyFS(c.Spec) }, b)
- if err == nil {
- // Success!
- c.GoferPid = 0
- }
- return err
-}
-
-// IsRunning returns true if the sandbox or gofer process is running.
-func (c *Container) IsRunning() bool {
- if c.Sandbox != nil && c.Sandbox.IsRunning() {
- return true
- }
- if c.GoferPid != 0 {
- // Send a signal 0 to the gofer process.
- if err := syscall.Kill(c.GoferPid, 0); err == nil {
- log.Warningf("Found orphan gofer process, pid: %d", c.GoferPid)
- if err := c.destroyGofer(); err != nil {
- log.Warningf("Error destroying gofer: %v", err)
- }
-
- // Don't wait for gofer to die. Return 'running' and hope gofer is dead
- // next time around.
- return true
- }
+ // "If any poststop hook fails, the runtime MUST log a warning, but the
+ // remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec.
+ // Based on the OCI, "The post-stop hooks MUST be called after the container is
+ // deleted but before the delete operation returns"
+ // Run it here to:
+ // 1) Conform to the OCI.
+ // 2) Make sure it only runs once, because the root has been deleted, the container
+ // can't be loaded again.
+ if c.Spec.Hooks != nil {
+ executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State())
}
- return false
-}
-// DestroyAndWait frees all resources associated with the container
-// and waits for destroy to finish before returning.
-//
-// TODO: This only works for single container.
-func (c *Container) DestroyAndWait() error {
- if err := c.Destroy(); err != nil {
- return fmt.Errorf("error destroying container %v: %v", c, err)
- }
- return c.waitForStopped()
+ c.Status = Stopped
+ return nil
}
// save saves the container metadata to a file.
@@ -602,13 +540,49 @@ func (c *Container) save() error {
return nil
}
+// stop stops the container (for regular containers) or the sandbox (for
+// root containers), and waits for the container or sandbox and the gofer
+// to stop. If any of them doesn't stop before timeout, an error is returned.
+func (c *Container) stop() error {
+ if c.Sandbox != nil && c.Sandbox.IsRunning() {
+ log.Debugf("Killing container %q", c.ID)
+ if c.Sandbox.IsRootContainer(c.ID) {
+ if err := c.Sandbox.Destroy(); err != nil {
+ return fmt.Errorf("error destroying sandbox %q: %v", c.Sandbox.ID, err)
+ }
+ } else {
+ if err := c.Signal(syscall.SIGKILL); err != nil {
+ // The container may already be stopped, log the error.
+ log.Warningf("Error sending signal %d to container %q: %v", syscall.SIGKILL, c.ID, err)
+ }
+ }
+ }
+
+ // Try killing gofer if it does not exit with container.
+ if c.GoferPid != 0 {
+ log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid)
+ if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil {
+ // The gofer may already be stopped, log the error.
+ log.Warningf("Error sending signal %d to gofer %d: %v", syscall.SIGKILL, c.GoferPid, err)
+ }
+ }
+ return c.waitForStopped()
+}
+
func (c *Container) waitForStopped() error {
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
defer cancel()
b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
op := func() error {
- if !c.IsRunning() {
- return fmt.Errorf("container is still running")
+ if c.Sandbox != nil && c.Sandbox.IsRunning() {
+ if err := c.Signal(syscall.Signal(0)); err == nil {
+ return fmt.Errorf("container is still running")
+ }
+ }
+ if c.GoferPid != 0 {
+ if err := syscall.Kill(c.GoferPid, 0); err == nil {
+ return fmt.Errorf("gofer is still running")
+ }
}
return nil
}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 790334249..ab1823f1c 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -200,6 +200,7 @@ func run(spec *specs.Spec, conf *boot.Config) error {
if err := s.Start(conf); err != nil {
return fmt.Errorf("error starting container: %v", err)
}
+
ws, err := s.Wait()
if err != nil {
return fmt.Errorf("error waiting on container: %v", err)
@@ -251,6 +252,35 @@ func configs(opts ...configOption) []*boot.Config {
return cs
}
+// In normal runsc usage, sandbox processes will be parented to
+// init and init will reap the them. However, in the test environment
+// the test runner is the parent and will not reap the sandbox
+// processes, so we must do it ourselves, or else they will left
+// as zombies.
+// The function returns a wait group, and the caller can reap
+// children synchronously by waiting on the wait group.
+func reapChildren(c *Container) (*sync.WaitGroup, error) {
+ var wg sync.WaitGroup
+ p, err := os.FindProcess(c.Sandbox.Pid)
+ if err != nil {
+ return nil, fmt.Errorf("error finding sandbox process: %v", err)
+ }
+ g, err := os.FindProcess(c.GoferPid)
+ if err != nil {
+ return nil, fmt.Errorf("error finding gofer process: %v", err)
+ }
+ wg.Add(2)
+ go func() {
+ p.Wait()
+ wg.Done()
+ }()
+ go func() {
+ g.Wait()
+ wg.Done()
+ }()
+ return &wg, nil
+}
+
// TestLifecycle tests the basic Create/Start/Signal/Destroy container lifecycle.
// It verifies after each step that the container can be loaded from disk, and
// has the correct status.
@@ -306,6 +336,7 @@ func TestLifecycle(t *testing.T) {
if err := s.Start(conf); err != nil {
t.Fatalf("error starting container: %v", err)
}
+
// Load the container from disk and check the status.
s, err = Load(rootDir, id)
if err != nil {
@@ -352,10 +383,11 @@ func TestLifecycle(t *testing.T) {
// and init will reap the sandbox. However, in this case the
// test runner is the parent and will not reap the sandbox
// process, so we must do it ourselves.
- p, _ := os.FindProcess(s.Sandbox.Pid)
- p.Wait()
- g, _ := os.FindProcess(s.GoferPid)
- g.Wait()
+ reapWg, err := reapChildren(s)
+ if err != nil {
+ t.Fatalf("error reaping children: %v", err)
+ }
+ reapWg.Wait()
// Load the container from disk and check the status.
s, err = Load(rootDir, id)
@@ -1164,6 +1196,11 @@ func TestConsoleSocket(t *testing.T) {
t.Errorf("fd is not a terminal (ioctl TGGETS got %v)", err)
}
+ // Reap the sandbox process.
+ if _, err := reapChildren(s); err != nil {
+ t.Fatalf("error reaping children: %v", err)
+ }
+
// Shut it down.
if err := s.Destroy(); err != nil {
t.Fatalf("error destroying container: %v", err)
@@ -1259,6 +1296,7 @@ func TestReadonlyRoot(t *testing.T) {
if err := s.Start(conf); err != nil {
t.Fatalf("error starting container: %v", err)
}
+
ws, err := s.Wait()
if err != nil {
t.Fatalf("error waiting on container: %v", err)
@@ -1302,6 +1340,7 @@ func TestReadonlyMount(t *testing.T) {
if err := s.Start(conf); err != nil {
t.Fatalf("error starting container: %v", err)
}
+
ws, err := s.Wait()
if err != nil {
t.Fatalf("error waiting on container: %v", err)
@@ -1547,8 +1586,9 @@ func TestGoferExits(t *testing.T) {
if _, err := gofer.Wait(); err != nil {
t.Fatalf("error waiting for gofer process: %v", err)
}
- if c.IsRunning() {
- t.Errorf("container shouldn't be running, container: %+v", c)
+
+ if err := c.waitForStopped(); err != nil {
+ t.Errorf("container is not stopped: %v", err)
}
}
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index 5cf8f0cda..7ae19ff35 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -23,8 +23,8 @@ go_library(
"//runsc/boot",
"//runsc/console",
"//runsc/specutils",
+ "@com_github_cenkalti_backoff//:go_default_library",
"@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
"@com_github_vishvananda_netlink//:go_default_library",
- "@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 8e90dcc70..156b2f769 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -16,6 +16,7 @@
package sandbox
import (
+ "context"
"fmt"
"os"
"os/exec"
@@ -23,8 +24,8 @@ import (
"syscall"
"time"
+ "github.com/cenkalti/backoff"
specs "github.com/opencontainers/runtime-spec/specs-go"
- "golang.org/x/sys/unix"
"gvisor.googlesource.com/gvisor/pkg/control/client"
"gvisor.googlesource.com/gvisor/pkg/control/server"
"gvisor.googlesource.com/gvisor/pkg/log"
@@ -543,20 +544,18 @@ func (s *Sandbox) IsRootContainer(cid string) bool {
return s.ID == cid
}
-// Stop stops the container in the sandbox.
-func (s *Sandbox) Stop(cid string) error {
- // TODO: This should stop the container with the given ID
- // in the sandbox.
- return nil
-}
-
// Destroy frees all resources associated with the sandbox.
+// Destroy returns error if any step fails, and the function can be safely retried.
func (s *Sandbox) Destroy() error {
log.Debugf("Destroy sandbox %q", s.ID)
if s.Pid != 0 {
- // TODO: Too harsh?
log.Debugf("Killing sandbox %q", s.ID)
- signalProcess(s.Pid, unix.SIGKILL)
+ if err := syscall.Kill(s.Pid, syscall.SIGKILL); err != nil && err != syscall.ESRCH {
+ return fmt.Errorf("error killing sandbox %q PID %q: %v", s.ID, s.Pid, err)
+ }
+ if err := s.waitForStopped(); err != nil {
+ return fmt.Errorf("error waiting sandbox %q stop: %v", s.ID, err)
+ }
}
if s.Chroot != "" {
@@ -641,7 +640,7 @@ func (s *Sandbox) Resume(cid string) error {
func (s *Sandbox) IsRunning() bool {
if s.Pid != 0 {
// Send a signal 0 to the sandbox process.
- if err := signalProcess(s.Pid, 0); err == nil {
+ if err := syscall.Kill(s.Pid, 0); err == nil {
// Succeeded, process is running.
return true
}
@@ -665,14 +664,17 @@ func (s *Sandbox) Stacks() (string, error) {
return stacks, nil
}
-// signalProcess sends a signal to the host process (i.e. a sandbox or gofer
-// process). Sandbox.Signal should be used to send a signal to a process
-// running inside the sandbox.
-func signalProcess(pid int, sig syscall.Signal) error {
- if err := syscall.Kill(pid, sig); err != nil {
- return fmt.Errorf("error sending signal %d to pid %d: %v", sig, pid, err)
+func (s *Sandbox) waitForStopped() error {
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
+ op := func() error {
+ if s.IsRunning() {
+ return fmt.Errorf("sandbox is still running")
+ }
+ return nil
}
- return nil
+ return backoff.Retry(op, b)
}
// deviceFileForPlatform opens the device file for the given platform. If the