From 2496d9b4b6343154525f73e9583a4a60bebcfa30 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 28 Sep 2018 12:20:56 -0700 Subject: Make runsc kill and delete more conformant to the "spec" PiperOrigin-RevId: 214976251 Change-Id: I631348c3886f41f63d0e77e7c4f21b3ede2ab521 --- runsc/boot/controller.go | 89 +----------------------------------------------- runsc/boot/fs.go | 67 ++++++++++++++++++++++++++++++++++-- runsc/boot/loader.go | 67 ++++++++++++++++++++++++++++-------- 3 files changed, 119 insertions(+), 104 deletions(-) (limited to 'runsc/boot') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 362e74df5..98356e8b7 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -21,10 +21,8 @@ import ( "path" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/control/server" "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" @@ -32,7 +30,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/state" "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" - "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/urpc" ) @@ -247,91 +244,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { // filesystem. func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { log.Debugf("containerManager.destroy %q", *cid) - cm.l.mu.Lock() - defer cm.l.mu.Unlock() - - key := execID{cid: *cid} - if tg, ok := cm.l.processes[key]; ok { - // Send SIGKILL to threadgroup. - if err := tg.SendSignal(&arch.SignalInfo{ - Signo: int32(linux.SIGKILL), - Code: arch.SignalInfoUser, - }); err == nil { - // SIGKILL sent. Now wait for it to exit. - log.Debugf("Waiting for container process to exit.") - tg.WaitExited() - log.Debugf("Container process exited.") - } else if err != syserror.ESRCH { - return fmt.Errorf("error sending SIGKILL to container %q: %v", *cid, err) - } - - // Remove the container thread group from the map. - delete(cm.l.processes, key) - } - - // Clean up the filesystem by unmounting all mounts for this container - // and deleting the container root directory. - - // First get a reference to the container root directory. - mns := cm.l.k.RootMountNamespace() - mnsRoot := mns.Root() - defer mnsRoot.DecRef() - ctx := cm.l.rootProcArgs.NewContext(cm.l.k) - containerRoot := path.Join(ChildContainersDir, *cid) - containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, linux.MaxSymlinkTraversals) - if err == syserror.ENOENT { - // Container must have been destroyed already. That's fine. - return nil - } - if err != nil { - return fmt.Errorf("error finding container root directory %q: %v", containerRoot, err) - } - defer containerRootDirent.DecRef() - - // Iterate through all submounts and unmount them. We unmount lazily by - // setting detach=true, so we can unmount in any order. - for _, m := range containerRootDirent.Inode.MountSource.Submounts() { - root := m.Root() - defer root.DecRef() - - // Do a best-effort unmount by flushing the refs and unmount - // with "detach only = true". - log.Debugf("Unmounting container submount %q", root.BaseName()) - m.FlushDirentRefs() - if err := mns.Unmount(ctx, root, true /* detach only */); err != nil { - return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err) - } - } - - // Unmount the container root itself. - log.Debugf("Unmounting container root %q", containerRoot) - containerRootDirent.Inode.MountSource.FlushDirentRefs() - if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil { - return fmt.Errorf("error unmounting container root mount %q: %v", containerRootDirent.BaseName(), err) - } - - // Get a reference to the parent directory and remove the root - // container directory. - containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, linux.MaxSymlinkTraversals) - if err != nil { - return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err) - } - defer containersDirDirent.DecRef() - log.Debugf("Deleting container root %q", containerRoot) - if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, *cid); err != nil { - return fmt.Errorf("error removing directory %q: %v", containerRoot, err) - } - - // Flushing dirent references triggers many async close operations. We - // must wait for those to complete before returning, otherwise the - // caller may kill the gofer before they complete, causing a cascade of - // failing RPCs. - log.Infof("Waiting for async filesystem operations to complete") - fs.AsyncBarrier() - - // We made it! - log.Debugf("Destroyed container %q", *cid) - return nil + return cm.l.destroyContainer(*cid) } // ExecuteAsync starts running a command on a created or running sandbox. It diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 22d5f621c..9e8fea7e1 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -16,6 +16,7 @@ package boot import ( "fmt" + "path" "path/filepath" "strconv" "strings" @@ -576,9 +577,9 @@ func subtargets(root string, mnts []specs.Mount) []string { return targets } -// setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly. +// setupContainerFS is used to set up the file system and amend the procArgs accordingly. // procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs. -func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { +func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { ctx := procArgs.NewContext(k) // Create the FD map, which will set stdin, stdout, and stderr. If @@ -676,3 +677,65 @@ func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *ke procArgs.Filename = f return nil } + +// destroyContainerFS cleans up the filesystem by unmounting all mounts for the +// given container and deleting the container root directory. +func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error { + // First get a reference to the container root directory. + mns := k.RootMountNamespace() + mnsRoot := mns.Root() + defer mnsRoot.DecRef() + containerRoot := path.Join(ChildContainersDir, cid) + containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, linux.MaxSymlinkTraversals) + if err == syserror.ENOENT { + // Container must have been destroyed already. That's fine. + return nil + } + if err != nil { + return fmt.Errorf("error finding container root directory %q: %v", containerRoot, err) + } + defer containerRootDirent.DecRef() + + // Iterate through all submounts and unmount them. We unmount lazily by + // setting detach=true, so we can unmount in any order. + for _, m := range containerRootDirent.Inode.MountSource.Submounts() { + root := m.Root() + defer root.DecRef() + + // Do a best-effort unmount by flushing the refs and unmount + // with "detach only = true". + log.Debugf("Unmounting container submount %q", root.BaseName()) + m.FlushDirentRefs() + if err := mns.Unmount(ctx, root, true /* detach only */); err != nil { + return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err) + } + } + + // Unmount the container root itself. + log.Debugf("Unmounting container root %q", containerRoot) + containerRootDirent.Inode.MountSource.FlushDirentRefs() + if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil { + return fmt.Errorf("error unmounting container root mount %q: %v", containerRootDirent.BaseName(), err) + } + + // Get a reference to the parent directory and remove the root + // container directory. + containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err) + } + defer containersDirDirent.DecRef() + log.Debugf("Deleting container root %q", containerRoot) + if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil { + return fmt.Errorf("error removing directory %q: %v", containerRoot, err) + } + + // Flushing dirent references triggers many async close operations. We + // must wait for those to complete before returning, otherwise the + // caller may kill the gofer before they complete, causing a cascade of + // failing RPCs. + log.Infof("Waiting for async filesystem operations to complete") + fs.AsyncBarrier() + + return nil +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 52c251812..1e2a12280 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -112,9 +112,6 @@ type Loader struct { // have the corresponding pid set. // // processes is guardded by mu. - // - // TODO: When containers are removed via `runsc delete`, - // processes should be cleaned up. processes map[execID]*kernel.ThreadGroup } @@ -385,7 +382,7 @@ func (l *Loader) run() error { // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. if !l.restore { - if err := setFileSystemForProcess( + if err := setupContainerFS( &l.rootProcArgs, l.spec, l.conf, @@ -476,7 +473,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config stdioFDs := ioFDs[:3] goferFDs := ioFDs[3:] - if err := setFileSystemForProcess( + if err := setupContainerFS( &procArgs, spec, conf, @@ -519,6 +516,34 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config return nil } +// destroyContainer stops a container if it is still running and cleans up its +// filesystem. +func (l *Loader) destroyContainer(cid string) error { + // First kill and wait for all processes in the container. + if err := l.signal(cid, int32(linux.SIGKILL), true /*all*/); err != nil { + return fmt.Errorf("failed to SIGKILL all container processes: %v", err) + } + + l.mu.Lock() + defer l.mu.Unlock() + + // Remove all container thread groups from the map. + for key := range l.processes { + if key.cid == cid { + delete(l.processes, key) + } + } + + ctx := l.rootProcArgs.NewContext(l.k) + if err := destroyContainerFS(ctx, cid, l.k); err != nil { + return fmt.Errorf("failed to destroy filesystem for container %q: %v", cid, err) + } + + // We made it! + log.Debugf("Container destroyed %q", cid) + return nil +} + func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // Get the container Root Dirent from the Task, since we must run this // process with the same Root. @@ -669,13 +694,27 @@ func (l *Loader) signal(cid string, signo int32, all bool) error { } si := arch.SignalInfo{Signo: signo} - if all { - // Pause the kernel to prevent new processes from being created while - // the signal is delivered. This prevents process leaks when SIGKILL is - // sent to the entire container. - l.k.Pause() - defer l.k.Unpause() - return l.k.SendContainerSignal(cid, &si) - } - return tg.Leader().SendSignal(&si) + if !all { + return tg.Leader().SendSignal(&si) + } + + // Pause the kernel to prevent new processes from being created while + // the signal is delivered. This prevents process leaks when SIGKILL is + // sent to the entire container. + l.k.Pause() + if err := l.k.SendContainerSignal(cid, &si); err != nil { + l.k.Unpause() + return err + } + l.k.Unpause() + + // If killing all processes, wait for them to exit. + if all && linux.Signal(signo) == linux.SIGKILL { + for _, t := range l.k.TaskSet().Root.Tasks() { + if t.ContainerID() == cid { + t.ThreadGroup().WaitExited() + } + } + } + return nil } -- cgit v1.2.3