summaryrefslogtreecommitdiffhomepage
path: root/runsc/boot
diff options
context:
space:
mode:
authorFabricio Voznika <fvoznika@google.com>2019-09-04 18:55:39 -0700
committergVisor bot <gvisor-bot@google.com>2019-09-04 18:56:49 -0700
commit0f5cdc1e00488823f1f7b9884c15b899677362b6 (patch)
tree16da94bb0a1f3505998e10c5261b8616e478bf1d /runsc/boot
parentbcddd0a4778916f6a5347246a81fbe236050d2c4 (diff)
Resolve flakes with TestMultiContainerDestroy
Some processes are reparented to the root container depending on the kill order and the root container would not reap in time. So some zombie processes were still present when the test checked. Fix it by running the second container inside a PID namespace. PiperOrigin-RevId: 267278591
Diffstat (limited to 'runsc/boot')
-rw-r--r--runsc/boot/loader.go49
1 files changed, 22 insertions, 27 deletions
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 19b738705..823a34619 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -760,26 +760,34 @@ func (l *Loader) destroyContainer(cid string) error {
if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
}
+ // Wait for all processes that belong to the container to exit (including
+ // exec'd processes).
+ for _, t := range l.k.TaskSet().Root.Tasks() {
+ if t.ContainerID() == cid {
+ t.ThreadGroup().WaitExited()
+ }
+ }
+
+ // At this point, all processes inside of the container have exited,
+ // releasing all references to the container's MountNamespace and
+ // causing all submounts and overlays to be unmounted.
+ //
+ // Since the container's MountNamespace has been released,
+ // MountNamespace.destroy() will have executed, but that function may
+ // trigger async close operations. We must wait for those to complete
+ // before returning, otherwise the caller may kill the gofer before
+ // they complete, causing a cascade of failing RPCs.
+ fs.AsyncBarrier()
}
- // Remove all container thread groups from the map.
+ // No more failure from this point on. Remove all container thread groups
+ // from the map.
for key := range l.processes {
if key.cid == cid {
delete(l.processes, key)
}
}
- // At this point, all processes inside of the container have exited,
- // releasing all references to the container's MountNamespace and
- // causing all submounts and overlays to be unmounted.
- //
- // Since the container's MountNamespace has been released,
- // MountNamespace.destroy() will have executed, but that function may
- // trigger async close operations. We must wait for those to complete
- // before returning, otherwise the caller may kill the gofer before
- // they complete, causing a cascade of failing RPCs.
- fs.AsyncBarrier()
-
log.Debugf("Container destroyed %q", cid)
return nil
}
@@ -1037,21 +1045,8 @@ func (l *Loader) signalAllProcesses(cid string, signo int32) error {
// the signal is delivered. This prevents process leaks when SIGKILL is
// sent to the entire container.
l.k.Pause()
- if err := l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}); err != nil {
- l.k.Unpause()
- return err
- }
- l.k.Unpause()
-
- // If SIGKILLing all processes, wait for them to exit.
- if linux.Signal(signo) == linux.SIGKILL {
- for _, t := range l.k.TaskSet().Root.Tasks() {
- if t.ContainerID() == cid {
- t.ThreadGroup().WaitExited()
- }
- }
- }
- return nil
+ defer l.k.Unpause()
+ return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo})
}
// threadGroupFromID same as threadGroupFromIDLocked except that it acquires