Diffstat (limited to 'runsc')
-rw-r--r--  runsc/boot/filter/config.go              |  6
-rw-r--r--  runsc/boot/loader.go                     | 44
-rw-r--r--  runsc/boot/loader_test.go                | 25
-rw-r--r--  runsc/boot/vfs.go                        |  5
-rw-r--r--  runsc/container/container_test.go        | 16
-rw-r--r--  runsc/container/multi_container_test.go  | 13
-rw-r--r--  runsc/flag/flag.go                       |  1
-rw-r--r--  runsc/sandbox/network.go                 | 13
-rw-r--r--  runsc/sandbox/sandbox.go                 | 39
9 files changed, 102 insertions, 60 deletions
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 6ac19668f..a7c4ebb0c 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -162,6 +162,12 @@ var allowedSyscalls = seccomp.SyscallRules{
 	},
 	syscall.SYS_LSEEK:   {},
 	syscall.SYS_MADVISE: {},
+	unix.SYS_MEMBARRIER: []seccomp.Rule{
+		{
+			seccomp.EqualTo(linux.MEMBARRIER_CMD_GLOBAL),
+			seccomp.EqualTo(0),
+		},
+	},
 	syscall.SYS_MINCORE: {},
 	// Used by the Go runtime as a temporarily workaround for a Linux
 	// 5.2-5.4 bug.
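The added rule admits membarrier(2) only for MEMBARRIER_CMD_GLOBAL with a zero second argument. A minimal sketch of the call shape that passes this filter, written against golang.org/x/sys/unix rather than the Sentry's own wrappers (everything outside the diff is illustrative):

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// Mirror the seccomp rule above: command MEMBARRIER_CMD_GLOBAL, flags 0.
	_, _, errno := unix.Syscall(unix.SYS_MEMBARRIER, uintptr(unix.MEMBARRIER_CMD_GLOBAL), 0, 0)
	if errno != 0 {
		fmt.Println("membarrier failed:", errno)
		return
	}
	fmt.Println("membarrier(MEMBARRIER_CMD_GLOBAL, 0) succeeded")
}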
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 2e652ddad..8ad000497 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -282,6 +282,7 @@ func New(args Args) (*Loader, error) {
 		args.NumCPU = runtime.NumCPU()
 	}
 	log.Infof("CPUs: %d", args.NumCPU)
+	runtime.GOMAXPROCS(args.NumCPU)
 
 	if args.TotalMem > 0 {
 		// Adjust the total memory returned by the Sentry so that applications that
@@ -471,9 +472,13 @@ func (l *Loader) Destroy() {
 	}
 	l.watchdog.Stop()
 
+	// Release all kernel resources. This is only safe after we can no longer
+	// save/restore.
+	l.k.Release()
+
 	// In the success case, stdioFDs and goferFDs will only contain
 	// released/closed FDs that ownership has been passed over to host FDs and
-	// gofer sessions. Close them here in case on failure.
+	// gofer sessions. Close them here in case of failure.
 	for _, fd := range l.root.stdioFDs {
 		_ = fd.Close()
 	}
@@ -797,7 +802,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
 }
 
 // startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
-// the gofer FDs looking for disconnects, and destroys the container if a
+// the gofer FDs looking for disconnects, and kills the container processes if a
 // disconnect occurs in any of the gofer FDs.
 func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
 	go func() {
@@ -818,18 +823,15 @@ func (l *Loader) startGoferMonitor(cid string, goferFDs []*fd.FD) {
 			panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
 		}
 
-		// Check if the gofer has stopped as part of normal container destruction.
-		// This is done just to avoid sending an annoying error message to the log.
-		// Note that there is a small race window in between mu.Unlock() and the
-		// lock being reacquired in destroyContainer(), but it's harmless to call
-		// destroyContainer() multiple times.
 		l.mu.Lock()
-		_, ok := l.processes[execID{cid: cid}]
-		l.mu.Unlock()
-		if ok {
-			log.Infof("Gofer socket disconnected, destroying container %q", cid)
-			if err := l.destroyContainer(cid); err != nil {
-				log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
+		defer l.mu.Unlock()
+
+		// The gofer could have been stopped due to a normal container shutdown.
+		// Check if the container has not stopped yet.
+		if tg, _ := l.tryThreadGroupFromIDLocked(execID{cid: cid}); tg != nil {
+			log.Infof("Gofer socket disconnected, killing container %q", cid)
+			if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
+				log.Warningf("Error killing container %q after gofer stopped: %v", cid, err)
 			}
 		}
 	}()
@@ -898,17 +900,24 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 		return 0, fmt.Errorf("container %q not started", args.ContainerID)
 	}
 
-	// Get the container MountNamespace from the Task.
+	// Get the container MountNamespace from the Task. Try to acquire ref may fail
+	// in case it raced with task exit.
 	if kernel.VFS2Enabled {
-		// task.MountNamespace() does not take a ref, so we must do so ourselves.
+		// task.MountNamespaceVFS2() does not take a ref, so we must do so ourselves.
		args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
-		args.MountNamespaceVFS2.IncRef()
+		if !args.MountNamespaceVFS2.TryIncRef() {
+			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+		}
 	} else {
+		var reffed bool
 		tg.Leader().WithMuLocked(func(t *kernel.Task) {
 			// task.MountNamespace() does not take a ref, so we must do so ourselves.
 			args.MountNamespace = t.MountNamespace()
-			args.MountNamespace.IncRef()
+			reffed = args.MountNamespace.TryIncRef()
 		})
+		if !reffed {
+			return 0, fmt.Errorf("container %q has stopped", args.ContainerID)
+		}
 	}
 
 	// Add the HOME environment variable if it is not already set.
@@ -916,7 +925,6 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 		root := args.MountNamespaceVFS2.Root()
 		ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
 		defer args.MountNamespaceVFS2.DecRef(ctx)
-		defer root.DecRef(ctx)
 		envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
 		if err != nil {
 			return 0, err
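The executeAsync() hunk replaces IncRef with TryIncRef so an exec that races with container exit fails with an error instead of grabbing a mount namespace whose last reference is being dropped. A rough sketch of that pattern with a made-up refcount type (not gVisor's refs package):

package main

import (
	"errors"
	"fmt"
	"sync/atomic"
)

// refCounted is a stand-in for a reference-counted object such as a mount
// namespace. refs starts at 1 for the owner; 0 means already destroyed.
type refCounted struct {
	refs int64
}

// IncRef assumes the caller already holds a reference and so cannot race with
// the final release.
func (r *refCounted) IncRef() { atomic.AddInt64(&r.refs, 1) }

// TryIncRef only succeeds while the count is still positive, so a racing
// teardown is detected instead of resurrecting a dead object.
func (r *refCounted) TryIncRef() bool {
	for {
		n := atomic.LoadInt64(&r.refs)
		if n <= 0 {
			return false
		}
		if atomic.CompareAndSwapInt64(&r.refs, n, n+1) {
			return true
		}
	}
}

func (r *refCounted) DecRef() { atomic.AddInt64(&r.refs, -1) }

// borrowMountNamespace mirrors the new exec path: report "container has
// stopped" rather than IncRef'ing an object that is being torn down.
func borrowMountNamespace(r *refCounted) error {
	if !r.TryIncRef() {
		return errors.New("container has stopped")
	}
	defer r.DecRef()
	// ... use the borrowed reference here ...
	return nil
}

func main() {
	mns := &refCounted{refs: 1}
	fmt.Println(borrowMountNamespace(mns)) // <nil>
	mns.DecRef()                           // the owner drops the last reference
	fmt.Println(borrowMountNamespace(mns)) // container has stopped
}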
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index bf9ec5d38..e376f944b 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -264,7 +264,7 @@ type CreateMountTestcase struct {
 	expectedPaths []string
 }
 
-func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
+func createMountTestcases() []*CreateMountTestcase {
 	testCases := []*CreateMountTestcase{
 		&CreateMountTestcase{
 			// Only proc.
@@ -409,32 +409,26 @@ func createMountTestcases(vfs2 bool) []*CreateMountTestcase {
 					Destination: "/proc",
 					Type:        "tmpfs",
 				},
-				// TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports
-				// MkDirAt in VFS2 (and remove the reduntant append).
-				// {
-				// 	Destination: "/sys/bar",
-				// 	Type:        "tmpfs",
-				// },
-				//
+				{
+					Destination: "/sys/bar",
+					Type:        "tmpfs",
+				},
+				{
 					Destination: "/tmp/baz",
 					Type:        "tmpfs",
 				},
 			},
 		},
-		expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"},
+		expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"},
 	}
-
-	if !vfs2 {
-		vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"})
-		vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar")
-	}
 	return append(testCases, vfsCase)
 }
 
 // Test that MountNamespace can be created with various specs.
 func TestCreateMountNamespace(t *testing.T) {
-	for _, tc := range createMountTestcases(false /* vfs2 */) {
+	for _, tc := range createMountTestcases() {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
 			ctx := contexttest.Context(t)
@@ -471,7 +465,7 @@ func TestCreateMountNamespace(t *testing.T) {
 
 // Test that MountNamespace can be created with various specs.
 func TestCreateMountNamespaceVFS2(t *testing.T) {
-	for _, tc := range createMountTestcases(true /* vfs2 */) {
+	for _, tc := range createMountTestcases() {
 		t.Run(tc.name, func(t *testing.T) {
 			spec := testSpec()
 			spec.Mounts = tc.spec.Mounts
@@ -497,6 +491,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) {
 			}
 
 			root := mns.Root()
+			root.IncRef()
 			defer root.DecRef(ctx)
 			for _, p := range tc.expectedPaths {
 				target := &vfs.PathOperation{
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index e36664938..82e459f46 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -170,6 +170,7 @@ func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.Create
 	rootProcArgs.MountNamespaceVFS2 = mns
 
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(rootCtx)
 	if root.Mount().ReadOnly() {
 		// Switch to ReadWrite while we setup submounts.
@@ -377,6 +378,7 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.C
 	}
 
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
 	target := &vfs.PathOperation{
 		Root: root,
@@ -474,6 +476,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config
 	}
 
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
 	pop := vfs.PathOperation{
 		Root: root,
@@ -597,6 +600,7 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co
 	defer newMnt.DecRef(ctx)
 
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
 	target := &vfs.PathOperation{
 		Root: root,
@@ -617,6 +621,7 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *co
 func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
 	root := mns.Root()
+	root.IncRef()
 	defer root.DecRef(ctx)
 	target := &vfs.PathOperation{
 		Root: root,
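The repeated root.IncRef() additions follow from mns.Root() handing back a borrowed reference in this change set: a call site that defers root.DecRef() must first take a reference it actually owns. A toy illustration of that convention (hypothetical types, not the runsc VFS2 API):

package main

import "fmt"

type dentry struct{ refs int }

func (d *dentry) IncRef() { d.refs++ }
func (d *dentry) DecRef() { d.refs-- }

type mountNamespace struct{ root dentry }

// Root returns the root dentry without taking a reference on the caller's
// behalf, which is the convention the vfs.go hunks above assume.
func (m *mountNamespace) Root() *dentry { return &m.root }

// useRoot mirrors the shape of the updated call sites.
func useRoot(mns *mountNamespace) {
	root := mns.Root()
	root.IncRef()       // the added line: own a reference before deferring its release
	defer root.DecRef() // releases our reference, not the namespace's own
	fmt.Println("refs while borrowed:", root.refs)
}

func main() {
	mns := &mountNamespace{root: dentry{refs: 1}}
	useRoot(mns)                              // refs while borrowed: 2
	fmt.Println("refs after:", mns.root.refs) // refs after: 1
}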
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 548c68087..1f8e277cc 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -316,6 +316,7 @@ func configs(t *testing.T, opts ...configOption) map[string]*config.Config {
 	return cs
 }
 
+// TODO(gvisor.dev/issue/1624): Merge with configs when VFS2 is the default.
 func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*config.Config {
 	all := configs(t, opts...)
 	for key, value := range configs(t, opts...) {
@@ -894,13 +895,15 @@ func TestKillPid(t *testing.T) {
 	}
 }
 
-// TestCheckpointRestore creates a container that continuously writes successive integers
-// to a file. To test checkpoint and restore functionality, the container is
-// checkpointed and the last number printed to the file is recorded. Then, it is restored in two
-// new containers and the first number printed from these containers is checked. Both should
-// be the next consecutive number after the last number from the checkpointed container.
+// TestCheckpointRestore creates a container that continuously writes successive
+// integers to a file. To test checkpoint and restore functionality, the
+// container is checkpointed and the last number printed to the file is
+// recorded. Then, it is restored in two new containers and the first number
+// printed from these containers is checked. Both should be the next consecutive
+// number after the last number from the checkpointed container.
 func TestCheckpointRestore(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
+	// TODO(gvisor.dev/issue/1663): Add VFS when S/R support is added.
 	for name, conf := range configs(t, noOverlay...) {
 		t.Run(name, func(t *testing.T) {
 			dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test")
@@ -1062,6 +1065,7 @@ func TestCheckpointRestore(t *testing.T) {
 // with filesystem Unix Domain Socket use.
 func TestUnixDomainSockets(t *testing.T) {
 	// Skip overlay because test requires writing to host file.
+	// TODO(gvisor.dev/issue/1663): Add VFS when S/R support is added.
 	for name, conf := range configs(t, noOverlay...) {
 		t.Run(name, func(t *testing.T) {
 			// UDS path is limited to 108 chars for compatibility with older systems.
@@ -1199,7 +1203,7 @@ func TestUnixDomainSockets(t *testing.T) {
 // recreated. Then it resumes the container, verify that the file gets created
 // again.
 func TestPauseResume(t *testing.T) {
-	for name, conf := range configs(t, noOverlay...) {
+	for name, conf := range configsWithVFS2(t, noOverlay...) {
 		t.Run(name, func(t *testing.T) {
 			tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "lock")
 			if err != nil {
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 952215ec1..850e80290 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -480,7 +480,7 @@ func TestMultiContainerMount(t *testing.T) {
 // TestMultiContainerSignal checks that it is possible to signal individual
 // containers without killing the entire sandbox.
 func TestMultiContainerSignal(t *testing.T) {
-	for name, conf := range configs(t, all...) {
+	for name, conf := range configsWithVFS2(t, all...) {
 		t.Run(name, func(t *testing.T) {
 			rootDir, cleanup, err := testutil.SetupRootDir()
 			if err != nil {
@@ -1691,12 +1691,11 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
 }
 
 // TestMultiContainerHomeEnvDir tests that the HOME environment variable is set
-// for root containers, sub-containers, and execed processes.
+// for root containers, sub-containers, and exec'ed processes.
 func TestMultiContainerHomeEnvDir(t *testing.T) {
-	// TODO(gvisor.dev/issue/1487): VFSv2 configs failing.
 	// NOTE: Don't use overlay since we need changes to persist to the temp dir
 	// outside the sandbox.
-	for testName, conf := range configs(t, noOverlay...) {
+	for testName, conf := range configsWithVFS2(t, noOverlay...) {
 		t.Run(testName, func(t *testing.T) {
 
 			rootDir, cleanup, err := testutil.SetupRootDir()
@@ -1718,9 +1717,9 @@ func TestMultiContainerHomeEnvDir(t *testing.T) {
 
 			// We will sleep in the root container in order to ensure that the root
 			//container doesn't terminate before sub containers can be created.
-			rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s; sleep 1000", homeDirs["root"].Name())}
-			subCmd := []string{"/bin/sh", "-c", fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["sub"].Name())}
-			execCmd := fmt.Sprintf("printf \"$HOME\" > %s", homeDirs["exec"].Name())
+			rootCmd := []string{"/bin/sh", "-c", fmt.Sprintf(`printf "$HOME" > %s; sleep 1000`, homeDirs["root"].Name())}
+			subCmd := []string{"/bin/sh", "-c", fmt.Sprintf(`printf "$HOME" > %s`, homeDirs["sub"].Name())}
+			execCmd := fmt.Sprintf(`printf "$HOME" > %s`, homeDirs["exec"].Name())
 
 			// Setup the containers, a root container and sub container.
 			specConfig, ids := createSpecs(rootCmd, subCmd)
diff --git a/runsc/flag/flag.go b/runsc/flag/flag.go
index ba1ff833f..775325c06 100644
--- a/runsc/flag/flag.go
+++ b/runsc/flag/flag.go
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// Package flag wraps flag primitives.
 package flag
 
 import (
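The rootCmd/subCmd/execCmd change in multi_container_test.go above only swaps escaped double quotes for raw string literals; the resulting command strings are identical. A quick check, assuming nothing beyond the standard library:

package main

import "fmt"

func main() {
	// Escaped quotes and a raw string literal produce the same shell command.
	escaped := fmt.Sprintf("printf \"$HOME\" > %s", "/tmp/out")
	raw := fmt.Sprintf(`printf "$HOME" > %s`, "/tmp/out")
	fmt.Println(escaped == raw) // true
}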
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 0b9f39466..8f66dd1f8 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -309,11 +309,20 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (
 	const bufSize = 4 << 20 // 4MB.
 
 	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, bufSize); err != nil {
-		return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", bufSize, err)
+		syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, bufSize)
+		sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF)
+
+		if sz < bufSize {
+			log.Warningf("Failed to increase rcv buffer to %d on SOCK_RAW on %s. Current buffer %d: %v", bufSize, iface.Name, sz, err)
+		}
 	}
 
 	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUFFORCE, bufSize); err != nil {
-		return nil, fmt.Errorf("failed to increase socket snd buffer to %d: %v", bufSize, err)
+		syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, bufSize)
+		sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF)
+		if sz < bufSize {
+			log.Warningf("Failed to increase snd buffer to %d on SOCK_RAW on %s. Curent buffer %d: %v", bufSize, iface.Name, sz, err)
+		}
 	}
 
 	return &socketEntry{deviceFile, gsoMaxSize}, nil
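The createSocket() change stops treating a failed SO_RCVBUFFORCE/SO_SNDBUFFORCE as fatal: the *FORCE variants need CAP_NET_ADMIN, so the code falls back to the capped SO_RCVBUF/SO_SNDBUF and only warns if the kernel would not grant the requested size. A standalone sketch of that fallback (helper name and logging are illustrative, not runsc code):

package main

import (
	"log"
	"syscall"
)

// setRcvBuf tries the privileged SO_RCVBUFFORCE first and falls back to the
// capped SO_RCVBUF, warning instead of failing when the kernel grants less.
func setRcvBuf(fd int, want int) {
	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, want); err != nil {
		// Unprivileged path: the kernel silently clamps SO_RCVBUF to rmem_max.
		_ = syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, want)
		sz, _ := syscall.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF)
		if sz < want {
			log.Printf("could not raise rcv buffer to %d (force failed: %v); kernel granted %d", want, err, sz)
		}
	}
}

func main() {
	fd, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_DGRAM, 0)
	if err != nil {
		log.Fatal(err)
	}
	defer syscall.Close(fd)
	setRcvBuf(fd, 4<<20) // same 4MB target as the diff
}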
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index a8f4f64a5..c4309feb3 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -72,11 +72,14 @@ type Sandbox struct {
 	// will have it as a child process.
 	child bool
 
-	// status is an exit status of a sandbox process.
-	status syscall.WaitStatus
-
 	// statusMu protects status.
 	statusMu sync.Mutex
+
+	// status is the exit status of a sandbox process. It's only set if the
+	// child==true and the sandbox was waited on. This field allows for multiple
+	// threads to wait on sandbox and get the exit code, since Linux will return
+	// WaitStatus to one of the waiters only.
+	status syscall.WaitStatus
 }
 
 // Args is used to configure a new sandbox.
@@ -746,35 +749,47 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn
 // Wait waits for the containerized process to exit, and returns its WaitStatus.
 func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
 	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)
-	var ws syscall.WaitStatus
 
 	if conn, err := s.sandboxConnect(); err != nil {
-		// The sandbox may have exited while before we had a chance to
-		// wait on it.
+		// The sandbox may have exited while before we had a chance to wait on it.
+		// There is nothing we can do for subcontainers. For the init container, we
+		// can try to get the sandbox exit code.
+		if !s.IsRootContainer(cid) {
+			return syscall.WaitStatus(0), err
+		}
 		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
 	} else {
 		defer conn.Close()
+
+		// Try the Wait RPC to the sandbox.
+		var ws syscall.WaitStatus
 		err = conn.Call(boot.ContainerWait, &cid, &ws)
 		if err == nil {
 			// It worked!
 			return ws, nil
 		}
+
+		// See comment above.
+		if !s.IsRootContainer(cid) {
+			return syscall.WaitStatus(0), err
+		}
+
 		// The sandbox may have exited after we connected, but before
 		// or during the Wait RPC.
 		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
 	}
 
-	// The sandbox may have already exited, or exited while handling the
-	// Wait RPC. The best we can do is ask Linux what the sandbox exit
-	// status was, since in most cases that will be the same as the
-	// container exit status.
+	// The sandbox may have already exited, or exited while handling the Wait RPC.
+	// The best we can do is ask Linux what the sandbox exit status was, since in
+	// most cases that will be the same as the container exit status.
 	if err := s.waitForStopped(); err != nil {
-		return ws, err
+		return syscall.WaitStatus(0), err
 	}
 	if !s.child {
-		return ws, fmt.Errorf("sandbox no longer running and its exit status is unavailable")
+		return syscall.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable")
 	}
+
+	s.statusMu.Lock()
+	defer s.statusMu.Unlock()
 	return s.status, nil
 }
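The reordered status/statusMu fields document why the exit status is cached: wait4(2) reports the status to exactly one waiter, so the first successful wait stores it and later callers read the cached copy under the mutex. A simplified sketch of that idea (types and names are hypothetical, not the Sandbox implementation):

package main

import (
	"fmt"
	"sync"
	"syscall"
)

type proc struct {
	pid      int
	statusMu sync.Mutex
	waited   bool
	status   syscall.WaitStatus
}

// wait reaps the child at most once and serves the cached status to every
// later caller, since the kernel reports it to a single waiter only.
func (p *proc) wait() (syscall.WaitStatus, error) {
	p.statusMu.Lock()
	defer p.statusMu.Unlock()
	if !p.waited {
		if _, err := syscall.Wait4(p.pid, &p.status, 0, nil); err != nil {
			return 0, err
		}
		p.waited = true
	}
	return p.status, nil
}

func main() {
	// With a real sandbox child PID every caller would observe the same cached
	// status; with no child here, Wait4 simply reports ECHILD.
	p := &proc{pid: -1}
	ws, err := p.wait()
	fmt.Println(ws, err)
}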