From 0f5cdc1e00488823f1f7b9884c15b899677362b6 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 4 Sep 2019 18:55:39 -0700 Subject: Resolve flakes with TestMultiContainerDestroy Some processes are reparented to the root container depending on the kill order and the root container would not reap in time. So some zombie processes were still present when the test checked. Fix it by running the second container inside a PID namespace. PiperOrigin-RevId: 267278591 --- runsc/boot/loader.go | 49 ++++++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 27 deletions(-) (limited to 'runsc/boot') diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 19b738705..823a34619 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -760,26 +760,34 @@ func (l *Loader) destroyContainer(cid string) error { if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { return fmt.Errorf("sending SIGKILL to all container processes: %v", err) } + // Wait for all processes that belong to the container to exit (including + // exec'd processes). + for _, t := range l.k.TaskSet().Root.Tasks() { + if t.ContainerID() == cid { + t.ThreadGroup().WaitExited() + } + } + + // At this point, all processes inside of the container have exited, + // releasing all references to the container's MountNamespace and + // causing all submounts and overlays to be unmounted. + // + // Since the container's MountNamespace has been released, + // MountNamespace.destroy() will have executed, but that function may + // trigger async close operations. We must wait for those to complete + // before returning, otherwise the caller may kill the gofer before + // they complete, causing a cascade of failing RPCs. + fs.AsyncBarrier() } - // Remove all container thread groups from the map. + // No more failure from this point on. Remove all container thread groups + // from the map. for key := range l.processes { if key.cid == cid { delete(l.processes, key) } } - // At this point, all processes inside of the container have exited, - // releasing all references to the container's MountNamespace and - // causing all submounts and overlays to be unmounted. - // - // Since the container's MountNamespace has been released, - // MountNamespace.destroy() will have executed, but that function may - // trigger async close operations. We must wait for those to complete - // before returning, otherwise the caller may kill the gofer before - // they complete, causing a cascade of failing RPCs. - fs.AsyncBarrier() - log.Debugf("Container destroyed %q", cid) return nil } @@ -1037,21 +1045,8 @@ func (l *Loader) signalAllProcesses(cid string, signo int32) error { // the signal is delivered. This prevents process leaks when SIGKILL is // sent to the entire container. l.k.Pause() - if err := l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}); err != nil { - l.k.Unpause() - return err - } - l.k.Unpause() - - // If SIGKILLing all processes, wait for them to exit. - if linux.Signal(signo) == linux.SIGKILL { - for _, t := range l.k.TaskSet().Root.Tasks() { - if t.ContainerID() == cid { - t.ThreadGroup().WaitExited() - } - } - } - return nil + defer l.k.Unpause() + return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}) } // threadGroupFromID same as threadGroupFromIDLocked except that it acquires -- cgit v1.2.3