From 2496d9b4b6343154525f73e9583a4a60bebcfa30 Mon Sep 17 00:00:00 2001
From: Fabricio Voznika <fvoznika@google.com>
Date: Fri, 28 Sep 2018 12:20:56 -0700
Subject: Make runsc kill and delete more conformant to the "spec"

PiperOrigin-RevId: 214976251
Change-Id: I631348c3886f41f63d0e77e7c4f21b3ede2ab521
---
 runsc/boot/controller.go | 89 +-----------------------------------------------
 runsc/boot/fs.go         | 67 ++++++++++++++++++++++++++++++++++--
 runsc/boot/loader.go     | 67 ++++++++++++++++++++++++++++--------
 3 files changed, 119 insertions(+), 104 deletions(-)

(limited to 'runsc/boot')

diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 362e74df5..98356e8b7 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -21,10 +21,8 @@ import (
 	"path"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
 	"gvisor.googlesource.com/gvisor/pkg/control/server"
 	"gvisor.googlesource.com/gvisor/pkg/log"
-	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/control"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
@@ -32,7 +30,6 @@ import (
 	"gvisor.googlesource.com/gvisor/pkg/sentry/state"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/time"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
-	"gvisor.googlesource.com/gvisor/pkg/syserror"
 	"gvisor.googlesource.com/gvisor/pkg/urpc"
 )
 
@@ -247,91 +244,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
 // filesystem.
 func (cm *containerManager) Destroy(cid *string, _ *struct{}) error {
 	log.Debugf("containerManager.destroy %q", *cid)
-	cm.l.mu.Lock()
-	defer cm.l.mu.Unlock()
-
-	key := execID{cid: *cid}
-	if tg, ok := cm.l.processes[key]; ok {
-		// Send SIGKILL to threadgroup.
-		if err := tg.SendSignal(&arch.SignalInfo{
-			Signo: int32(linux.SIGKILL),
-			Code:  arch.SignalInfoUser,
-		}); err == nil {
-			// SIGKILL sent. Now wait for it to exit.
-			log.Debugf("Waiting for container process to exit.")
-			tg.WaitExited()
-			log.Debugf("Container process exited.")
-		} else if err != syserror.ESRCH {
-			return fmt.Errorf("error sending SIGKILL to container %q: %v", *cid, err)
-		}
-
-		// Remove the container thread group from the map.
-		delete(cm.l.processes, key)
-	}
-
-	// Clean up the filesystem by unmounting all mounts for this container
-	// and deleting the container root directory.
-
-	// First get a reference to the container root directory.
-	mns := cm.l.k.RootMountNamespace()
-	mnsRoot := mns.Root()
-	defer mnsRoot.DecRef()
-	ctx := cm.l.rootProcArgs.NewContext(cm.l.k)
-	containerRoot := path.Join(ChildContainersDir, *cid)
-	containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, linux.MaxSymlinkTraversals)
-	if err == syserror.ENOENT {
-		// Container must have been destroyed already. That's fine.
-		return nil
-	}
-	if err != nil {
-		return fmt.Errorf("error finding container root directory %q: %v", containerRoot, err)
-	}
-	defer containerRootDirent.DecRef()
-
-	// Iterate through all submounts and unmount them. We unmount lazily by
-	// setting detach=true, so we can unmount in any order.
-	for _, m := range containerRootDirent.Inode.MountSource.Submounts() {
-		root := m.Root()
-		defer root.DecRef()
-
-		// Do a best-effort unmount by flushing the refs and unmount
-		// with "detach only = true".
-		log.Debugf("Unmounting container submount %q", root.BaseName())
-		m.FlushDirentRefs()
-		if err := mns.Unmount(ctx, root, true /* detach only */); err != nil {
-			return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err)
-		}
-	}
-
-	// Unmount the container root itself.
-	log.Debugf("Unmounting container root %q", containerRoot)
-	containerRootDirent.Inode.MountSource.FlushDirentRefs()
-	if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil {
-		return fmt.Errorf("error unmounting container root mount %q: %v", containerRootDirent.BaseName(), err)
-	}
-
-	// Get a reference to the parent directory and remove the root
-	// container directory.
-	containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, linux.MaxSymlinkTraversals)
-	if err != nil {
-		return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err)
-	}
-	defer containersDirDirent.DecRef()
-	log.Debugf("Deleting container root %q", containerRoot)
-	if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, *cid); err != nil {
-		return fmt.Errorf("error removing directory %q: %v", containerRoot, err)
-	}
-
-	// Flushing dirent references triggers many async close operations. We
-	// must wait for those to complete before returning, otherwise the
-	// caller may kill the gofer before they complete, causing a cascade of
-	// failing RPCs.
-	log.Infof("Waiting for async filesystem operations to complete")
-	fs.AsyncBarrier()
-
-	// We made it!
-	log.Debugf("Destroyed container %q", *cid)
-	return nil
+	return cm.l.destroyContainer(*cid)
 }
 
 // ExecuteAsync starts running a command on a created or running sandbox. It
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 22d5f621c..9e8fea7e1 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -16,6 +16,7 @@ package boot
 
 import (
 	"fmt"
+	"path"
 	"path/filepath"
 	"strconv"
 	"strings"
@@ -576,9 +577,9 @@ func subtargets(root string, mnts []specs.Mount) []string {
 	return targets
 }
 
-// setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly.
+// setupContainerFS is used to set up the file system and amend the procArgs accordingly.
 // procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs.
-func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error {
+func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error {
 	ctx := procArgs.NewContext(k)
 
 	// Create the FD map, which will set stdin, stdout, and stderr.  If
@@ -676,3 +677,65 @@ func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *ke
 	procArgs.Filename = f
 	return nil
 }
+
+// destroyContainerFS cleans up the filesystem by unmounting all mounts for the
+// given container and deleting the container root directory.
+func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error {
+	// First get a reference to the container root directory.
+	mns := k.RootMountNamespace()
+	mnsRoot := mns.Root()
+	defer mnsRoot.DecRef()
+	containerRoot := path.Join(ChildContainersDir, cid)
+	containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, linux.MaxSymlinkTraversals)
+	if err == syserror.ENOENT {
+		// Container must have been destroyed already. That's fine.
+		return nil
+	}
+	if err != nil {
+		return fmt.Errorf("error finding container root directory %q: %v", containerRoot, err)
+	}
+	defer containerRootDirent.DecRef()
+
+	// Iterate through all submounts and unmount them. We unmount lazily by
+	// setting detach=true, so we can unmount in any order.
+	for _, m := range containerRootDirent.Inode.MountSource.Submounts() {
+		root := m.Root()
+		defer root.DecRef()
+
+		// Do a best-effort unmount by flushing the refs and unmount
+		// with "detach only = true".
+		log.Debugf("Unmounting container submount %q", root.BaseName())
+		m.FlushDirentRefs()
+		if err := mns.Unmount(ctx, root, true /* detach only */); err != nil {
+			return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err)
+		}
+	}
+
+	// Unmount the container root itself.
+	log.Debugf("Unmounting container root %q", containerRoot)
+	containerRootDirent.Inode.MountSource.FlushDirentRefs()
+	if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil {
+		return fmt.Errorf("error unmounting container root mount %q: %v", containerRootDirent.BaseName(), err)
+	}
+
+	// Get a reference to the parent directory and remove the root
+	// container directory.
+	containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, linux.MaxSymlinkTraversals)
+	if err != nil {
+		return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err)
+	}
+	defer containersDirDirent.DecRef()
+	log.Debugf("Deleting container root %q", containerRoot)
+	if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil {
+		return fmt.Errorf("error removing directory %q: %v", containerRoot, err)
+	}
+
+	// Flushing dirent references triggers many async close operations. We
+	// must wait for those to complete before returning, otherwise the
+	// caller may kill the gofer before they complete, causing a cascade of
+	// failing RPCs.
+	log.Infof("Waiting for async filesystem operations to complete")
+	fs.AsyncBarrier()
+
+	return nil
+}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 52c251812..1e2a12280 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -112,9 +112,6 @@ type Loader struct {
 	// have the corresponding pid set.
 	//
 	// processes is guardded by mu.
-	//
-	// TODO: When containers are removed via `runsc delete`,
-	// processes should be cleaned up.
 	processes map[execID]*kernel.ThreadGroup
 }
 
@@ -385,7 +382,7 @@ func (l *Loader) run() error {
 	// If we are restoring, we do not want to create a process.
 	// l.restore is set by the container manager when a restore call is made.
 	if !l.restore {
-		if err := setFileSystemForProcess(
+		if err := setupContainerFS(
 			&l.rootProcArgs,
 			l.spec,
 			l.conf,
@@ -476,7 +473,7 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
 
 	stdioFDs := ioFDs[:3]
 	goferFDs := ioFDs[3:]
-	if err := setFileSystemForProcess(
+	if err := setupContainerFS(
 		&procArgs,
 		spec,
 		conf,
@@ -519,6 +516,34 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
 	return nil
 }
 
+// destroyContainer stops a container if it is still running and cleans up its
+// filesystem.
+func (l *Loader) destroyContainer(cid string) error {
+	// First kill and wait for all processes in the container.
+	if err := l.signal(cid, int32(linux.SIGKILL), true /*all*/); err != nil {
+		return fmt.Errorf("failed to SIGKILL all container processes: %v", err)
+	}
+
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	// Remove all container thread groups from the map.
+	for key := range l.processes {
+		if key.cid == cid {
+			delete(l.processes, key)
+		}
+	}
+
+	ctx := l.rootProcArgs.NewContext(l.k)
+	if err := destroyContainerFS(ctx, cid, l.k); err != nil {
+		return fmt.Errorf("failed to destroy filesystem for container %q: %v", cid, err)
+	}
+
+	// We made it!
+	log.Debugf("Container destroyed %q", cid)
+	return nil
+}
+
 func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 	// Get the container Root Dirent from the Task, since we must run this
 	// process with the same Root.
@@ -669,13 +694,27 @@ func (l *Loader) signal(cid string, signo int32, all bool) error {
 	}
 
 	si := arch.SignalInfo{Signo: signo}
-	if all {
-		// Pause the kernel to prevent new processes from being created while
-		// the signal is delivered. This prevents process leaks when SIGKILL is
-		// sent to the entire container.
-		l.k.Pause()
-		defer l.k.Unpause()
-		return l.k.SendContainerSignal(cid, &si)
-	}
-	return tg.Leader().SendSignal(&si)
+	if !all {
+		return tg.Leader().SendSignal(&si)
+	}
+
+	// Pause the kernel to prevent new processes from being created while
+	// the signal is delivered. This prevents process leaks when SIGKILL is
+	// sent to the entire container.
+	l.k.Pause()
+	if err := l.k.SendContainerSignal(cid, &si); err != nil {
+		l.k.Unpause()
+		return err
+	}
+	l.k.Unpause()
+
+	// If killing all processes, wait for them to exit.
+	if all && linux.Signal(signo) == linux.SIGKILL {
+		for _, t := range l.k.TaskSet().Root.Tasks() {
+			if t.ContainerID() == cid {
+				t.ThreadGroup().WaitExited()
+			}
+		}
+	}
+	return nil
 }
-- 
cgit v1.2.3