diff options
author | Fabricio Voznika <fvoznika@google.com> | 2018-09-28 12:20:56 -0700 |
---|---|---|
committer | Shentubot <shentubot@google.com> | 2018-09-28 12:22:21 -0700 |
commit | 2496d9b4b6343154525f73e9583a4a60bebcfa30 (patch) | |
tree | 3ac4c3c1ea5813a2c3a32ea8b4d05e01db0d99d1 | |
parent | fb65b0b471621b430969fe1c3009bee68209bf67 (diff) |
Make runsc kill and delete more conformant to the "spec"
PiperOrigin-RevId: 214976251
Change-Id: I631348c3886f41f63d0e77e7c4f21b3ede2ab521
-rw-r--r-- | runsc/boot/controller.go | 89 | ||||
-rw-r--r-- | runsc/boot/fs.go | 67 | ||||
-rw-r--r-- | runsc/boot/loader.go | 67 | ||||
-rw-r--r-- | runsc/cmd/boot.go | 2 | ||||
-rw-r--r-- | runsc/container/container.go | 22 | ||||
-rw-r--r-- | runsc/container/multi_container_test.go | 49 | ||||
-rw-r--r-- | runsc/container/test_app.go | 70 | ||||
-rw-r--r-- | runsc/sandbox/sandbox.go | 3 |
8 files changed, 248 insertions, 121 deletions
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 362e74df5..98356e8b7 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -21,10 +21,8 @@ import ( "path" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/control/server" "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" @@ -32,7 +30,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/state" "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" - "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/urpc" ) @@ -247,91 +244,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { // filesystem. func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { log.Debugf("containerManager.destroy %q", *cid) - cm.l.mu.Lock() - defer cm.l.mu.Unlock() - - key := execID{cid: *cid} - if tg, ok := cm.l.processes[key]; ok { - // Send SIGKILL to threadgroup. - if err := tg.SendSignal(&arch.SignalInfo{ - Signo: int32(linux.SIGKILL), - Code: arch.SignalInfoUser, - }); err == nil { - // SIGKILL sent. Now wait for it to exit. - log.Debugf("Waiting for container process to exit.") - tg.WaitExited() - log.Debugf("Container process exited.") - } else if err != syserror.ESRCH { - return fmt.Errorf("error sending SIGKILL to container %q: %v", *cid, err) - } - - // Remove the container thread group from the map. - delete(cm.l.processes, key) - } - - // Clean up the filesystem by unmounting all mounts for this container - // and deleting the container root directory. - - // First get a reference to the container root directory. - mns := cm.l.k.RootMountNamespace() - mnsRoot := mns.Root() - defer mnsRoot.DecRef() - ctx := cm.l.rootProcArgs.NewContext(cm.l.k) - containerRoot := path.Join(ChildContainersDir, *cid) - containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, linux.MaxSymlinkTraversals) - if err == syserror.ENOENT { - // Container must have been destroyed already. That's fine. - return nil - } - if err != nil { - return fmt.Errorf("error finding container root directory %q: %v", containerRoot, err) - } - defer containerRootDirent.DecRef() - - // Iterate through all submounts and unmount them. We unmount lazily by - // setting detach=true, so we can unmount in any order. - for _, m := range containerRootDirent.Inode.MountSource.Submounts() { - root := m.Root() - defer root.DecRef() - - // Do a best-effort unmount by flushing the refs and unmount - // with "detach only = true". - log.Debugf("Unmounting container submount %q", root.BaseName()) - m.FlushDirentRefs() - if err := mns.Unmount(ctx, root, true /* detach only */); err != nil { - return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err) - } - } - - // Unmount the container root itself. - log.Debugf("Unmounting container root %q", containerRoot) - containerRootDirent.Inode.MountSource.FlushDirentRefs() - if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil { - return fmt.Errorf("error unmounting container root mount %q: %v", containerRootDirent.BaseName(), err) - } - - // Get a reference to the parent directory and remove the root - // container directory. - containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, linux.MaxSymlinkTraversals) - if err != nil { - return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err) - } - defer containersDirDirent.DecRef() - log.Debugf("Deleting container root %q", containerRoot) - if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, *cid); err != nil { - return fmt.Errorf("error removing directory %q: %v", containerRoot, err) - } - - // Flushing dirent references triggers many async close operations. We - // must wait for those to complete before returning, otherwise the - // caller may kill the gofer before they complete, causing a cascade of - // failing RPCs. - log.Infof("Waiting for async filesystem operations to complete") - fs.AsyncBarrier() - - // We made it! - log.Debugf("Destroyed container %q", *cid) - return nil + return cm.l.destroyContainer(*cid) } // ExecuteAsync starts running a command on a created or running sandbox. It diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 22d5f621c..9e8fea7e1 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -16,6 +16,7 @@ package boot import ( "fmt" + "path" "path/filepath" "strconv" "strings" @@ -576,9 +577,9 @@ func subtargets(root string, mnts []specs.Mount) []string { return targets } -// setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly. +// setupContainerFS is used to set up the file system and amend the procArgs accordingly. // procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs. -func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { +func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { ctx := procArgs.NewContext(k) // Create the FD map, which will set stdin, stdout, and stderr. If @@ -676,3 +677,65 @@ func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *ke procArgs.Filename = f return nil } + +// destroyContainerFS cleans up the filesystem by unmounting all mounts for the +// given container and deleting the container root directory. +func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error { + // First get a reference to the container root directory. + mns := k.RootMountNamespace() + mnsRoot := mns.Root() + defer mnsRoot.DecRef() + containerRoot := path.Join(ChildContainersDir, cid) + containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, linux.MaxSymlinkTraversals) + if err == syserror.ENOENT { + // Container must have been destroyed already. That's fine. + return nil + } + if err != nil { + return fmt.Errorf("error finding container root directory %q: %v", containerRoot, err) + } + defer containerRootDirent.DecRef() + + // Iterate through all submounts and unmount them. We unmount lazily by + // setting detach=true, so we can unmount in any order. + for _, m := range containerRootDirent.Inode.MountSource.Submounts() { + root := m.Root() + defer root.DecRef() + + // Do a best-effort unmount by flushing the refs and unmount + // with "detach only = true". + log.Debugf("Unmounting container submount %q", root.BaseName()) + m.FlushDirentRefs() + if err := mns.Unmount(ctx, root, true /* detach only */); err != nil { + return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err) + } + } + + // Unmount the container root itself. + log.Debugf("Unmounting container root %q", containerRoot) + containerRootDirent.Inode.MountSource.FlushDirentRefs() + if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil { + return fmt.Errorf("error unmounting container root mount %q: %v", containerRootDirent.BaseName(), err) + } + + // Get a reference to the parent directory and remove the root + // container directory. + containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err) + } + defer containersDirDirent.DecRef() + log.Debugf("Deleting container root %q", containerRoot) + if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil { + return fmt.Errorf("error removing directory %q: %v", containerRoot, err) + } + + // Flushing dirent references triggers many async close operations. We + // must wait for those to complete before returning, otherwise the + // caller may kill the gofer before they complete, causing a cascade of + // failing RPCs. + log.Infof("Waiting for async filesystem operations to complete") + fs.AsyncBarrier() + + return nil +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 52c251812..1e2a12280 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -112,9 +112,6 @@ type Loader struct { // have the corresponding pid set. // // processes is guardded by mu. - // - // TODO: When containers are removed via `runsc delete`, - // processes should be cleaned up. processes map[execID]*kernel.ThreadGroup } @@ -385,7 +382,7 @@ func (l *Loader) run() error { // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. if !l.restore { - if err := setFileSystemForProcess( + if err := setupContainerFS( &l.rootProcArgs, l.spec, l.conf, @@ -476,7 +473,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config stdioFDs := ioFDs[:3] goferFDs := ioFDs[3:] - if err := setFileSystemForProcess( + if err := setupContainerFS( &procArgs, spec, conf, @@ -519,6 +516,34 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return nil } +// destroyContainer stops a container if it is still running and cleans up its +// filesystem. +func (l *Loader) destroyContainer(cid string) error { + // First kill and wait for all processes in the container. + if err := l.signal(cid, int32(linux.SIGKILL), true /*all*/); err != nil { + return fmt.Errorf("failed to SIGKILL all container processes: %v", err) + } + + l.mu.Lock() + defer l.mu.Unlock() + + // Remove all container thread groups from the map. + for key := range l.processes { + if key.cid == cid { + delete(l.processes, key) + } + } + + ctx := l.rootProcArgs.NewContext(l.k) + if err := destroyContainerFS(ctx, cid, l.k); err != nil { + return fmt.Errorf("failed to destroy filesystem for container %q: %v", cid, err) + } + + // We made it! + log.Debugf("Container destroyed %q", cid) + return nil +} + func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // Get the container Root Dirent from the Task, since we must run this // process with the same Root. @@ -669,13 +694,27 @@ func (l *Loader) signal(cid string, signo int32, all bool) error { } si := arch.SignalInfo{Signo: signo} - if all { - // Pause the kernel to prevent new processes from being created while - // the signal is delivered. This prevents process leaks when SIGKILL is - // sent to the entire container. - l.k.Pause() - defer l.k.Unpause() - return l.k.SendContainerSignal(cid, &si) - } - return tg.Leader().SendSignal(&si) + if !all { + return tg.Leader().SendSignal(&si) + } + + // Pause the kernel to prevent new processes from being created while + // the signal is delivered. This prevents process leaks when SIGKILL is + // sent to the entire container. + l.k.Pause() + if err := l.k.SendContainerSignal(cid, &si); err != nil { + l.k.Unpause() + return err + } + l.k.Unpause() + + // If killing all processes, wait for them to exit. + if all && linux.Signal(signo) == linux.SIGKILL { + for _, t := range l.k.TaskSet().Root.Tasks() { + if t.ContainerID() == cid { + t.ThreadGroup().WaitExited() + } + } + } + return nil } diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go index 933ba2d9e..82e534479 100644 --- a/runsc/cmd/boot.go +++ b/runsc/cmd/boot.go @@ -142,6 +142,8 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) if err != nil { Fatalf("error creating loader: %v", err) } + // Fatalf exits the process and doesn't run defers. 'l' must be destroyed + // explicitly! // Notify other processes the loader has been created. l.NotifyLoaderCreated() diff --git a/runsc/container/container.go b/runsc/container/container.go index 44b7dad8a..e65800b8d 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -72,6 +72,21 @@ func validateID(id string) error { // Containers must write their metadata files after any change to their internal // states. The entire container directory is deleted when the container is // destroyed. +// +// When the container is stopped, all processes that belong to the container +// must be stopped before Destroy() returns. containerd makes roughly the +// following calls to stop a container: +// - First it attempts to kill the container process with +// 'runsc kill SIGTERM'. After some time, it escalates to SIGKILL. In a +// separate thread, it's waiting on the container. As soon as the wait +// returns, it moves on to the next step: +// - It calls 'runsc kill --all SIGKILL' to stop every process that belongs to +// the container. 'kill --all SIGKILL' waits for all processes before +// returning. +// - Containerd waits for stdin, stdout and stderr to drain and be closed. +// - It calls 'runsc delete'. runc implementation kills --all SIGKILL once +// again just to be sure, waits, and then proceeds with remaining teardown. +// type Container struct { // ID is the container ID. ID string `json:"id"` @@ -451,7 +466,8 @@ func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, er return c.Sandbox.WaitPID(c.ID, pid, clearStatus) } -// Signal sends the signal to the container. +// Signal sends the signal to the container. If all is true and signal is +// SIGKILL, then waits for all processes to exit before returning. // Signal returns an error if the container is already stopped. // TODO: Distinguish different error types. func (c *Container) Signal(sig syscall.Signal, all bool) error { @@ -534,8 +550,8 @@ func (c *Container) Processes() ([]*control.Process, error) { return c.Sandbox.Processes(c.ID) } -// Destroy frees all resources associated with the container. It fails fast and -// is idempotent. +// Destroy stops all processes and frees all resources associated with the +// container. It fails fast and is idempotent. func (c *Container) Destroy() error { log.Debugf("Destroy container %q", c.ID) diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 8c98bed22..3d7385a82 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -25,6 +25,7 @@ import ( "sync" "syscall" "testing" + "time" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/sentry/control" @@ -403,12 +404,18 @@ func TestMultiContainerSignal(t *testing.T) { // TestMultiContainerDestroy checks that container are properly cleaned-up when // they are destroyed. func TestMultiContainerDestroy(t *testing.T) { + app, err := testutil.FindFile("runsc/container/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } + for _, conf := range configs(all...) { t.Logf("Running test with conf: %+v", conf) - // Two containers that will run for a long time. We will - // destroy the second one. - specs, ids := createSpecs([]string{"sleep", "100"}, []string{"sleep", "100"}) + // First container will remain intact while the second container is killed. + specs, ids := createSpecs( + []string{app, "reaper"}, + []string{app, "fork-bomb"}) containers, cleanup, err := startContainers(conf, specs, ids) if err != nil { t.Fatalf("error starting containers: %v", err) @@ -416,26 +423,48 @@ func TestMultiContainerDestroy(t *testing.T) { defer cleanup() // Exec in the root container to check for the existence of the - // second containers root filesystem directory. + // second container's root filesystem directory. contDir := path.Join(boot.ChildContainersDir, containers[1].ID) - args := &control.ExecArgs{ + dirArgs := &control.ExecArgs{ Filename: "/usr/bin/test", Argv: []string{"test", "-d", contDir}, } - if ws, err := containers[0].executeSync(args); err != nil { - t.Fatalf("error executing %+v: %v", args, err) + if ws, err := containers[0].executeSync(dirArgs); err != nil { + t.Fatalf("error executing %+v: %v", dirArgs, err) } else if ws.ExitStatus() != 0 { t.Errorf("exec 'test -f %q' got exit status %d, wanted 0", contDir, ws.ExitStatus()) } - // Destory the second container. + // Exec more processes to ensure signal all works for exec'd processes too. + args := &control.ExecArgs{ + Filename: app, + Argv: []string{app, "fork-bomb"}, + } + if _, err := containers[1].Execute(args); err != nil { + t.Fatalf("error exec'ing: %v", err) + } + + // Let it brew... + time.Sleep(500 * time.Millisecond) + if err := containers[1].Destroy(); err != nil { t.Fatalf("error destroying container: %v", err) } + // Check that destroy killed all processes belonging to the container and + // waited for them to exit before returning. + pss, err := containers[0].Sandbox.Processes("") + if err != nil { + t.Fatalf("error getting process data from sandbox: %v", err) + } + expectedPL := []*control.Process{{PID: 1, Cmd: "test_app"}} + if !procListsEqual(pss, expectedPL) { + t.Errorf("container got process list: %s, want: %s", procListToString(pss), procListToString(expectedPL)) + } + // Now the container dir should be gone. - if ws, err := containers[0].executeSync(args); err != nil { - t.Fatalf("error executing %+v: %v", args, err) + if ws, err := containers[0].executeSync(dirArgs); err != nil { + t.Fatalf("error executing %+v: %v", dirArgs, err) } else if ws.ExitStatus() == 0 { t.Errorf("exec 'test -f %q' got exit status 0, wanted non-zero", contDir) } diff --git a/runsc/container/test_app.go b/runsc/container/test_app.go index a99eb97c4..f69cfdf83 100644 --- a/runsc/container/test_app.go +++ b/runsc/container/test_app.go @@ -36,6 +36,8 @@ func main() { subcommands.Register(subcommands.FlagsCommand(), "") subcommands.Register(new(uds), "") subcommands.Register(new(taskTree), "") + subcommands.Register(new(forkBomb), "") + subcommands.Register(new(reaper), "") flag.Parse() @@ -151,9 +153,7 @@ func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa if c.depth == 0 { log.Printf("Child sleeping, PID: %d\n", os.Getpid()) - for { - time.Sleep(24 * time.Hour) - } + select {} } log.Printf("Parent %d sleeping, PID: %d\n", c.depth, os.Getpid()) @@ -177,3 +177,67 @@ func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa } return subcommands.ExitSuccess } + +type forkBomb struct { + delay time.Duration +} + +// Name implements subcommands.Command. +func (*forkBomb) Name() string { + return "fork-bomb" +} + +// Synopsis implements subcommands.Command. +func (*forkBomb) Synopsis() string { + return "creates child process until the end of times" +} + +// Usage implements subcommands.Command. +func (*forkBomb) Usage() string { + return "fork-bomb <flags>" +} + +// SetFlags implements subcommands.Command. +func (c *forkBomb) SetFlags(f *flag.FlagSet) { + f.DurationVar(&c.delay, "delay", 100*time.Millisecond, "amount of time to delay creation of child") +} + +// Execute implements subcommands.Command. +func (c *forkBomb) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + time.Sleep(c.delay) + + cmd := exec.Command("/proc/self/exe", c.Name()) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + log.Fatal("failed to call self:", err) + } + return subcommands.ExitSuccess +} + +type reaper struct{} + +// Name implements subcommands.Command. +func (*reaper) Name() string { + return "reaper" +} + +// Synopsis implements subcommands.Command. +func (*reaper) Synopsis() string { + return "reaps all children in a loop" +} + +// Usage implements subcommands.Command. +func (*reaper) Usage() string { + return "reaper <flags>" +} + +// SetFlags implements subcommands.Command. +func (*reaper) SetFlags(*flag.FlagSet) {} + +// Execute implements subcommands.Command. +func (c *reaper) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + stop := testutil.StartReaper() + defer stop() + select {} +} diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index d288be1d2..4111b1a60 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -572,7 +572,8 @@ func (s *Sandbox) destroy() error { return nil } -// Signal sends the signal to a container in the sandbox. +// Signal sends the signal to a container in the sandbox. If all is true and +// signal is SIGKILL, then waits for all processes to exit before returning. func (s *Sandbox) Signal(cid string, sig syscall.Signal, all bool) error { log.Debugf("Signal sandbox %q", s.ID) conn, err := s.sandboxConnect() |