summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorFabricio Voznika <fvoznika@google.com>2018-09-28 12:20:56 -0700
committerShentubot <shentubot@google.com>2018-09-28 12:22:21 -0700
commit2496d9b4b6343154525f73e9583a4a60bebcfa30 (patch)
tree3ac4c3c1ea5813a2c3a32ea8b4d05e01db0d99d1
parentfb65b0b471621b430969fe1c3009bee68209bf67 (diff)
Make runsc kill and delete more conformant to the "spec"
PiperOrigin-RevId: 214976251 Change-Id: I631348c3886f41f63d0e77e7c4f21b3ede2ab521
-rw-r--r--runsc/boot/controller.go89
-rw-r--r--runsc/boot/fs.go67
-rw-r--r--runsc/boot/loader.go67
-rw-r--r--runsc/cmd/boot.go2
-rw-r--r--runsc/container/container.go22
-rw-r--r--runsc/container/multi_container_test.go49
-rw-r--r--runsc/container/test_app.go70
-rw-r--r--runsc/sandbox/sandbox.go3
8 files changed, 248 insertions, 121 deletions
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 362e74df5..98356e8b7 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -21,10 +21,8 @@ import (
"path"
specs "github.com/opencontainers/runtime-spec/specs-go"
- "gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/control/server"
"gvisor.googlesource.com/gvisor/pkg/log"
- "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/control"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
@@ -32,7 +30,6 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/state"
"gvisor.googlesource.com/gvisor/pkg/sentry/time"
"gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
- "gvisor.googlesource.com/gvisor/pkg/syserror"
"gvisor.googlesource.com/gvisor/pkg/urpc"
)
@@ -247,91 +244,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
// filesystem.
func (cm *containerManager) Destroy(cid *string, _ *struct{}) error {
log.Debugf("containerManager.destroy %q", *cid)
- cm.l.mu.Lock()
- defer cm.l.mu.Unlock()
-
- key := execID{cid: *cid}
- if tg, ok := cm.l.processes[key]; ok {
- // Send SIGKILL to threadgroup.
- if err := tg.SendSignal(&arch.SignalInfo{
- Signo: int32(linux.SIGKILL),
- Code: arch.SignalInfoUser,
- }); err == nil {
- // SIGKILL sent. Now wait for it to exit.
- log.Debugf("Waiting for container process to exit.")
- tg.WaitExited()
- log.Debugf("Container process exited.")
- } else if err != syserror.ESRCH {
- return fmt.Errorf("error sending SIGKILL to container %q: %v", *cid, err)
- }
-
- // Remove the container thread group from the map.
- delete(cm.l.processes, key)
- }
-
- // Clean up the filesystem by unmounting all mounts for this container
- // and deleting the container root directory.
-
- // First get a reference to the container root directory.
- mns := cm.l.k.RootMountNamespace()
- mnsRoot := mns.Root()
- defer mnsRoot.DecRef()
- ctx := cm.l.rootProcArgs.NewContext(cm.l.k)
- containerRoot := path.Join(ChildContainersDir, *cid)
- containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, linux.MaxSymlinkTraversals)
- if err == syserror.ENOENT {
- // Container must have been destroyed already. That's fine.
- return nil
- }
- if err != nil {
- return fmt.Errorf("error finding container root directory %q: %v", containerRoot, err)
- }
- defer containerRootDirent.DecRef()
-
- // Iterate through all submounts and unmount them. We unmount lazily by
- // setting detach=true, so we can unmount in any order.
- for _, m := range containerRootDirent.Inode.MountSource.Submounts() {
- root := m.Root()
- defer root.DecRef()
-
- // Do a best-effort unmount by flushing the refs and unmount
- // with "detach only = true".
- log.Debugf("Unmounting container submount %q", root.BaseName())
- m.FlushDirentRefs()
- if err := mns.Unmount(ctx, root, true /* detach only */); err != nil {
- return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err)
- }
- }
-
- // Unmount the container root itself.
- log.Debugf("Unmounting container root %q", containerRoot)
- containerRootDirent.Inode.MountSource.FlushDirentRefs()
- if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil {
- return fmt.Errorf("error unmounting container root mount %q: %v", containerRootDirent.BaseName(), err)
- }
-
- // Get a reference to the parent directory and remove the root
- // container directory.
- containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, linux.MaxSymlinkTraversals)
- if err != nil {
- return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err)
- }
- defer containersDirDirent.DecRef()
- log.Debugf("Deleting container root %q", containerRoot)
- if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, *cid); err != nil {
- return fmt.Errorf("error removing directory %q: %v", containerRoot, err)
- }
-
- // Flushing dirent references triggers many async close operations. We
- // must wait for those to complete before returning, otherwise the
- // caller may kill the gofer before they complete, causing a cascade of
- // failing RPCs.
- log.Infof("Waiting for async filesystem operations to complete")
- fs.AsyncBarrier()
-
- // We made it!
- log.Debugf("Destroyed container %q", *cid)
- return nil
+ return cm.l.destroyContainer(*cid)
}
// ExecuteAsync starts running a command on a created or running sandbox. It
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 22d5f621c..9e8fea7e1 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -16,6 +16,7 @@ package boot
import (
"fmt"
+ "path"
"path/filepath"
"strconv"
"strings"
@@ -576,9 +577,9 @@ func subtargets(root string, mnts []specs.Mount) []string {
return targets
}
-// setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly.
+// setupContainerFS is used to set up the file system and amend the procArgs accordingly.
// procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs.
-func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error {
+func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error {
ctx := procArgs.NewContext(k)
// Create the FD map, which will set stdin, stdout, and stderr. If
@@ -676,3 +677,65 @@ func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *ke
procArgs.Filename = f
return nil
}
+
+// destroyContainerFS cleans up the filesystem by unmounting all mounts for the
+// given container and deleting the container root directory.
+func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error {
+ // First get a reference to the container root directory.
+ mns := k.RootMountNamespace()
+ mnsRoot := mns.Root()
+ defer mnsRoot.DecRef()
+ containerRoot := path.Join(ChildContainersDir, cid)
+ containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, linux.MaxSymlinkTraversals)
+ if err == syserror.ENOENT {
+ // Container must have been destroyed already. That's fine.
+ return nil
+ }
+ if err != nil {
+ return fmt.Errorf("error finding container root directory %q: %v", containerRoot, err)
+ }
+ defer containerRootDirent.DecRef()
+
+ // Iterate through all submounts and unmount them. We unmount lazily by
+ // setting detach=true, so we can unmount in any order.
+ for _, m := range containerRootDirent.Inode.MountSource.Submounts() {
+ root := m.Root()
+ defer root.DecRef()
+
+ // Do a best-effort unmount by flushing the refs and unmount
+ // with "detach only = true".
+ log.Debugf("Unmounting container submount %q", root.BaseName())
+ m.FlushDirentRefs()
+ if err := mns.Unmount(ctx, root, true /* detach only */); err != nil {
+ return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err)
+ }
+ }
+
+ // Unmount the container root itself.
+ log.Debugf("Unmounting container root %q", containerRoot)
+ containerRootDirent.Inode.MountSource.FlushDirentRefs()
+ if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil {
+ return fmt.Errorf("error unmounting container root mount %q: %v", containerRootDirent.BaseName(), err)
+ }
+
+ // Get a reference to the parent directory and remove the root
+ // container directory.
+ containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, linux.MaxSymlinkTraversals)
+ if err != nil {
+ return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err)
+ }
+ defer containersDirDirent.DecRef()
+ log.Debugf("Deleting container root %q", containerRoot)
+ if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil {
+ return fmt.Errorf("error removing directory %q: %v", containerRoot, err)
+ }
+
+ // Flushing dirent references triggers many async close operations. We
+ // must wait for those to complete before returning, otherwise the
+ // caller may kill the gofer before they complete, causing a cascade of
+ // failing RPCs.
+ log.Infof("Waiting for async filesystem operations to complete")
+ fs.AsyncBarrier()
+
+ return nil
+}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 52c251812..1e2a12280 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -112,9 +112,6 @@ type Loader struct {
// have the corresponding pid set.
//
// processes is guardded by mu.
- //
- // TODO: When containers are removed via `runsc delete`,
- // processes should be cleaned up.
processes map[execID]*kernel.ThreadGroup
}
@@ -385,7 +382,7 @@ func (l *Loader) run() error {
// If we are restoring, we do not want to create a process.
// l.restore is set by the container manager when a restore call is made.
if !l.restore {
- if err := setFileSystemForProcess(
+ if err := setupContainerFS(
&l.rootProcArgs,
l.spec,
l.conf,
@@ -476,7 +473,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
stdioFDs := ioFDs[:3]
goferFDs := ioFDs[3:]
- if err := setFileSystemForProcess(
+ if err := setupContainerFS(
&procArgs,
spec,
conf,
@@ -519,6 +516,34 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
return nil
}
+// destroyContainer stops a container if it is still running and cleans up its
+// filesystem.
+func (l *Loader) destroyContainer(cid string) error {
+ // First kill and wait for all processes in the container.
+ if err := l.signal(cid, int32(linux.SIGKILL), true /*all*/); err != nil {
+ return fmt.Errorf("failed to SIGKILL all container processes: %v", err)
+ }
+
+ l.mu.Lock()
+ defer l.mu.Unlock()
+
+ // Remove all container thread groups from the map.
+ for key := range l.processes {
+ if key.cid == cid {
+ delete(l.processes, key)
+ }
+ }
+
+ ctx := l.rootProcArgs.NewContext(l.k)
+ if err := destroyContainerFS(ctx, cid, l.k); err != nil {
+ return fmt.Errorf("failed to destroy filesystem for container %q: %v", cid, err)
+ }
+
+ // We made it!
+ log.Debugf("Container destroyed %q", cid)
+ return nil
+}
+
func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
// Get the container Root Dirent from the Task, since we must run this
// process with the same Root.
@@ -669,13 +694,27 @@ func (l *Loader) signal(cid string, signo int32, all bool) error {
}
si := arch.SignalInfo{Signo: signo}
- if all {
- // Pause the kernel to prevent new processes from being created while
- // the signal is delivered. This prevents process leaks when SIGKILL is
- // sent to the entire container.
- l.k.Pause()
- defer l.k.Unpause()
- return l.k.SendContainerSignal(cid, &si)
- }
- return tg.Leader().SendSignal(&si)
+ if !all {
+ return tg.Leader().SendSignal(&si)
+ }
+
+ // Pause the kernel to prevent new processes from being created while
+ // the signal is delivered. This prevents process leaks when SIGKILL is
+ // sent to the entire container.
+ l.k.Pause()
+ if err := l.k.SendContainerSignal(cid, &si); err != nil {
+ l.k.Unpause()
+ return err
+ }
+ l.k.Unpause()
+
+ // If killing all processes, wait for them to exit.
+ if all && linux.Signal(signo) == linux.SIGKILL {
+ for _, t := range l.k.TaskSet().Root.Tasks() {
+ if t.ContainerID() == cid {
+ t.ThreadGroup().WaitExited()
+ }
+ }
+ }
+ return nil
}
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index 933ba2d9e..82e534479 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -142,6 +142,8 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
if err != nil {
Fatalf("error creating loader: %v", err)
}
+ // Fatalf exits the process and doesn't run defers. 'l' must be destroyed
+ // explicitly!
// Notify other processes the loader has been created.
l.NotifyLoaderCreated()
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 44b7dad8a..e65800b8d 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -72,6 +72,21 @@ func validateID(id string) error {
// Containers must write their metadata files after any change to their internal
// states. The entire container directory is deleted when the container is
// destroyed.
+//
+// When the container is stopped, all processes that belong to the container
+// must be stopped before Destroy() returns. containerd makes roughly the
+// following calls to stop a container:
+// - First it attempts to kill the container process with
+// 'runsc kill SIGTERM'. After some time, it escalates to SIGKILL. In a
+// separate thread, it's waiting on the container. As soon as the wait
+// returns, it moves on to the next step:
+// - It calls 'runsc kill --all SIGKILL' to stop every process that belongs to
+// the container. 'kill --all SIGKILL' waits for all processes before
+// returning.
+// - Containerd waits for stdin, stdout and stderr to drain and be closed.
+// - It calls 'runsc delete'. runc implementation kills --all SIGKILL once
+// again just to be sure, waits, and then proceeds with remaining teardown.
+//
type Container struct {
// ID is the container ID.
ID string `json:"id"`
@@ -451,7 +466,8 @@ func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, er
return c.Sandbox.WaitPID(c.ID, pid, clearStatus)
}
-// Signal sends the signal to the container.
+// Signal sends the signal to the container. If all is true and signal is
+// SIGKILL, then waits for all processes to exit before returning.
// Signal returns an error if the container is already stopped.
// TODO: Distinguish different error types.
func (c *Container) Signal(sig syscall.Signal, all bool) error {
@@ -534,8 +550,8 @@ func (c *Container) Processes() ([]*control.Process, error) {
return c.Sandbox.Processes(c.ID)
}
-// Destroy frees all resources associated with the container. It fails fast and
-// is idempotent.
+// Destroy stops all processes and frees all resources associated with the
+// container. It fails fast and is idempotent.
func (c *Container) Destroy() error {
log.Debugf("Destroy container %q", c.ID)
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 8c98bed22..3d7385a82 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -25,6 +25,7 @@ import (
"sync"
"syscall"
"testing"
+ "time"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.googlesource.com/gvisor/pkg/sentry/control"
@@ -403,12 +404,18 @@ func TestMultiContainerSignal(t *testing.T) {
// TestMultiContainerDestroy checks that container are properly cleaned-up when
// they are destroyed.
func TestMultiContainerDestroy(t *testing.T) {
+ app, err := testutil.FindFile("runsc/container/test_app")
+ if err != nil {
+ t.Fatal("error finding test_app:", err)
+ }
+
for _, conf := range configs(all...) {
t.Logf("Running test with conf: %+v", conf)
- // Two containers that will run for a long time. We will
- // destroy the second one.
- specs, ids := createSpecs([]string{"sleep", "100"}, []string{"sleep", "100"})
+ // First container will remain intact while the second container is killed.
+ specs, ids := createSpecs(
+ []string{app, "reaper"},
+ []string{app, "fork-bomb"})
containers, cleanup, err := startContainers(conf, specs, ids)
if err != nil {
t.Fatalf("error starting containers: %v", err)
@@ -416,26 +423,48 @@ func TestMultiContainerDestroy(t *testing.T) {
defer cleanup()
// Exec in the root container to check for the existence of the
- // second containers root filesystem directory.
+ // second container's root filesystem directory.
contDir := path.Join(boot.ChildContainersDir, containers[1].ID)
- args := &control.ExecArgs{
+ dirArgs := &control.ExecArgs{
Filename: "/usr/bin/test",
Argv: []string{"test", "-d", contDir},
}
- if ws, err := containers[0].executeSync(args); err != nil {
- t.Fatalf("error executing %+v: %v", args, err)
+ if ws, err := containers[0].executeSync(dirArgs); err != nil {
+ t.Fatalf("error executing %+v: %v", dirArgs, err)
} else if ws.ExitStatus() != 0 {
t.Errorf("exec 'test -f %q' got exit status %d, wanted 0", contDir, ws.ExitStatus())
}
- // Destory the second container.
+ // Exec more processes to ensure signal all works for exec'd processes too.
+ args := &control.ExecArgs{
+ Filename: app,
+ Argv: []string{app, "fork-bomb"},
+ }
+ if _, err := containers[1].Execute(args); err != nil {
+ t.Fatalf("error exec'ing: %v", err)
+ }
+
+ // Let it brew...
+ time.Sleep(500 * time.Millisecond)
+
if err := containers[1].Destroy(); err != nil {
t.Fatalf("error destroying container: %v", err)
}
+ // Check that destroy killed all processes belonging to the container and
+ // waited for them to exit before returning.
+ pss, err := containers[0].Sandbox.Processes("")
+ if err != nil {
+ t.Fatalf("error getting process data from sandbox: %v", err)
+ }
+ expectedPL := []*control.Process{{PID: 1, Cmd: "test_app"}}
+ if !procListsEqual(pss, expectedPL) {
+ t.Errorf("container got process list: %s, want: %s", procListToString(pss), procListToString(expectedPL))
+ }
+
// Now the container dir should be gone.
- if ws, err := containers[0].executeSync(args); err != nil {
- t.Fatalf("error executing %+v: %v", args, err)
+ if ws, err := containers[0].executeSync(dirArgs); err != nil {
+ t.Fatalf("error executing %+v: %v", dirArgs, err)
} else if ws.ExitStatus() == 0 {
t.Errorf("exec 'test -f %q' got exit status 0, wanted non-zero", contDir)
}
diff --git a/runsc/container/test_app.go b/runsc/container/test_app.go
index a99eb97c4..f69cfdf83 100644
--- a/runsc/container/test_app.go
+++ b/runsc/container/test_app.go
@@ -36,6 +36,8 @@ func main() {
subcommands.Register(subcommands.FlagsCommand(), "")
subcommands.Register(new(uds), "")
subcommands.Register(new(taskTree), "")
+ subcommands.Register(new(forkBomb), "")
+ subcommands.Register(new(reaper), "")
flag.Parse()
@@ -151,9 +153,7 @@ func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa
if c.depth == 0 {
log.Printf("Child sleeping, PID: %d\n", os.Getpid())
- for {
- time.Sleep(24 * time.Hour)
- }
+ select {}
}
log.Printf("Parent %d sleeping, PID: %d\n", c.depth, os.Getpid())
@@ -177,3 +177,67 @@ func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa
}
return subcommands.ExitSuccess
}
+
+type forkBomb struct {
+ delay time.Duration
+}
+
+// Name implements subcommands.Command.
+func (*forkBomb) Name() string {
+ return "fork-bomb"
+}
+
+// Synopsis implements subcommands.Command.
+func (*forkBomb) Synopsis() string {
+ return "creates child process until the end of times"
+}
+
+// Usage implements subcommands.Command.
+func (*forkBomb) Usage() string {
+ return "fork-bomb <flags>"
+}
+
+// SetFlags implements subcommands.Command.
+func (c *forkBomb) SetFlags(f *flag.FlagSet) {
+ f.DurationVar(&c.delay, "delay", 100*time.Millisecond, "amount of time to delay creation of child")
+}
+
+// Execute implements subcommands.Command.
+func (c *forkBomb) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ time.Sleep(c.delay)
+
+ cmd := exec.Command("/proc/self/exe", c.Name())
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ if err := cmd.Run(); err != nil {
+ log.Fatal("failed to call self:", err)
+ }
+ return subcommands.ExitSuccess
+}
+
+type reaper struct{}
+
+// Name implements subcommands.Command.
+func (*reaper) Name() string {
+ return "reaper"
+}
+
+// Synopsis implements subcommands.Command.
+func (*reaper) Synopsis() string {
+ return "reaps all children in a loop"
+}
+
+// Usage implements subcommands.Command.
+func (*reaper) Usage() string {
+ return "reaper <flags>"
+}
+
+// SetFlags implements subcommands.Command.
+func (*reaper) SetFlags(*flag.FlagSet) {}
+
+// Execute implements subcommands.Command.
+func (c *reaper) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ stop := testutil.StartReaper()
+ defer stop()
+ select {}
+}
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index d288be1d2..4111b1a60 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -572,7 +572,8 @@ func (s *Sandbox) destroy() error {
return nil
}
-// Signal sends the signal to a container in the sandbox.
+// Signal sends the signal to a container in the sandbox. If all is true and
+// signal is SIGKILL, then waits for all processes to exit before returning.
func (s *Sandbox) Signal(cid string, sig syscall.Signal, all bool) error {
log.Debugf("Signal sandbox %q", s.ID)
conn, err := s.sandboxConnect()