Resolve flakes with TestMultiContainerDestroy

Some processes are reparented to the root container depending on the kill order and the root container would not reap in time. So some zombie processes were still present when the test checked. Fix it by running the second container inside a PID namespace. PiperOrigin-RevId: 267278591
author: Fabricio Voznika <fvoznika@google.com> 2019-09-04 18:55:39 -0700
committer: gVisor bot <gvisor-bot@google.com> 2019-09-04 18:56:49 -0700
commit: 0f5cdc1e00488823f1f7b9884c15b899677362b6 (patch)
tree: 16da94bb0a1f3505998e10c5261b8616e478bf1d /runsc/boot
parent: bcddd0a4778916f6a5347246a81fbe236050d2c4 (diff)
1 files changed, 22 insertions, 27 deletions
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 19b738705..823a34619 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -760,26 +760,34 @@ func (l *Loader) destroyContainer(cid string) error {
 		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
 			return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
 		}
+		// Wait for all processes that belong to the container to exit (including
+		// exec'd processes).
+		for _, t := range l.k.TaskSet().Root.Tasks() {
+			if t.ContainerID() == cid {
+				t.ThreadGroup().WaitExited()
+			}
+		}
+
+		// At this point, all processes inside of the container have exited,
+		// releasing all references to the container's MountNamespace and
+		// causing all submounts and overlays to be unmounted.
+		//
+		// Since the container's MountNamespace has been released,
+		// MountNamespace.destroy() will have executed, but that function may
+		// trigger async close operations. We must wait for those to complete
+		// before returning, otherwise the caller may kill the gofer before
+		// they complete, causing a cascade of failing RPCs.
+		fs.AsyncBarrier()
 	}
 
-	// Remove all container thread groups from the map.
+	// No more failure from this point on. Remove all container thread groups
+	// from the map.
 	for key := range l.processes {
 		if key.cid == cid {
 			delete(l.processes, key)
 		}
 	}
 
-	// At this point, all processes inside of the container have exited,
-	// releasing all references to the container's MountNamespace and
-	// causing all submounts and overlays to be unmounted.
-	//
-	// Since the container's MountNamespace has been released,
-	// MountNamespace.destroy() will have executed, but that function may
-	// trigger async close operations. We must wait for those to complete
-	// before returning, otherwise the caller may kill the gofer before
-	// they complete, causing a cascade of failing RPCs.
-	fs.AsyncBarrier()
-
 	log.Debugf("Container destroyed %q", cid)
 	return nil
 }
@@ -1037,21 +1045,8 @@ func (l *Loader) signalAllProcesses(cid string, signo int32) error {
 	// the signal is delivered. This prevents process leaks when SIGKILL is
 	// sent to the entire container.
 	l.k.Pause()
-	if err := l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}); err != nil {
-		l.k.Unpause()
-		return err
-	}
-	l.k.Unpause()
-
-	// If SIGKILLing all processes, wait for them to exit.
-	if linux.Signal(signo) == linux.SIGKILL {
-		for _, t := range l.k.TaskSet().Root.Tasks() {
-			if t.ContainerID() == cid {
-				t.ThreadGroup().WaitExited()
-			}
-		}
-	}
-	return nil
+	defer l.k.Unpause()
+	return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo})
 }
 
 // threadGroupFromID same as threadGroupFromIDLocked except that it acquires
author	Fabricio Voznika <fvoznika@google.com>	2019-09-04 18:55:39 -0700
committer	gVisor bot <gvisor-bot@google.com>	2019-09-04 18:56:49 -0700
commit	0f5cdc1e00488823f1f7b9884c15b899677362b6 (patch)
tree	16da94bb0a1f3505998e10c5261b8616e478bf1d /runsc/boot
parent	bcddd0a4778916f6a5347246a81fbe236050d2c4 (diff)