From 04cbb13ce9b151cf906f42e3f18ce3a875f01f63 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Tue, 23 Jul 2019 14:35:50 -0700
Subject: Give each container a distinct MountNamespace.

This keeps all container filesystems completely separate from each other
(including from the root container filesystem), and allows us to get rid of the
"__runsc_containers__" directory.

It also simplifies container startup/teardown as we don't have to muck around
in the root container's filesystem.

PiperOrigin-RevId: 259613346
---
 pkg/sentry/control/proc.go              |  22 +++-
 runsc/boot/fs.go                        | 140 ++++--------------------
 runsc/boot/loader.go                    |  48 ++++++---
 runsc/container/multi_container_test.go | 120 +++++++++++++++++----
 runsc/container/test_app/BUILD          |   6 +-
 runsc/container/test_app/fds.go         | 185 ++++++++++++++++++++++++++++++++
 runsc/container/test_app/test_app.go    |   8 +-
 7 files changed, 366 insertions(+), 163 deletions(-)
 create mode 100644 runsc/container/test_app/fds.go

diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 6ae60c5cb..60e6c9285 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -54,6 +54,12 @@ type ExecArgs struct {
 	// Envv is a list of environment variables.
 	Envv []string `json:"envv"`
 
+	// MountNamespace is the mount namespace to execute the new process in.
+	// A reference on MountNamespace must be held for the lifetime of the
+	// ExecArgs. If MountNamespace is nil, it will default to the kernel's
+	// root MountNamespace.
+	MountNamespace *fs.MountNamespace
+
 	// Root defines the root directory for the new process. A reference on
 	// Root must be held for the lifetime of the ExecArgs. If Root is nil,
 	// it will default to the VFS root.
@@ -145,6 +151,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 		Argv:                    args.Argv,
 		Envv:                    args.Envv,
 		WorkingDirectory:        args.WorkingDirectory,
+		MountNamespace:          args.MountNamespace,
 		Root:                    args.Root,
 		Credentials:             creds,
 		FDTable:                 fdTable,
@@ -157,16 +164,25 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
 		ContainerID:             args.ContainerID,
 	}
 	if initArgs.Root != nil {
-		// initArgs must hold a reference on Root. This ref is dropped
-		// in CreateProcess.
+		// initArgs must hold a reference on Root, which will be
+		// donated to the new process in CreateProcess.
 		initArgs.Root.IncRef()
 	}
+	if initArgs.MountNamespace != nil {
+		// initArgs must hold a reference on MountNamespace, which will
+		// be donated to the new process in CreateProcess.
+		initArgs.MountNamespace.IncRef()
+	}
 	ctx := initArgs.NewContext(proc.Kernel)
 
 	if initArgs.Filename == "" {
 		// Get the full path to the filename from the PATH env variable.
 		paths := fs.GetPath(initArgs.Envv)
-		f, err := proc.Kernel.RootMountNamespace().ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
+		mns := initArgs.MountNamespace
+		if mns == nil {
+			mns = proc.Kernel.RootMountNamespace()
+		}
+		f, err := mns.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
 		if err != nil {
 			return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
 		}
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index d3e3196fd..55bfc27ff 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -54,10 +54,6 @@ const (
 	// MountPrefix is the annotation prefix for mount hints.
 	MountPrefix = "gvisor.dev/spec/mount"
 
-	// ChildContainersDir is the directory where child container root
-	// filesystems are mounted.
-	ChildContainersDir = "/__runsc_containers__"
-
 	// Filesystems that runsc supports.
 	bind     = "bind"
 	devpts   = "devpts"
@@ -256,10 +252,10 @@ func subtargets(root string, mnts []specs.Mount) []string {
 
 // setExecutablePath sets the procArgs.Filename by searching the PATH for an
 // executable matching the procArgs.Argv[0].
-func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
+func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
 	paths := fs.GetPath(procArgs.Envv)
 	exe := procArgs.Argv[0]
-	f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
+	f, err := procArgs.MountNamespace.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
 	if err != nil {
 		return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
 	}
@@ -514,11 +510,16 @@ func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *
 
 	// If this is the root container, we also need to setup the root mount
 	// namespace.
-	mns := c.k.RootMountNamespace()
-	if mns == nil {
+	rootMNS := c.k.RootMountNamespace()
+	if rootMNS == nil {
 		// Setup the root container.
-		if err := c.setupRootContainer(ctx, rootCtx, conf, func(mns *fs.MountNamespace) {
-			c.k.SetRootMountNamespace(mns)
+		if err := c.setupRootContainer(ctx, rootCtx, conf, func(rootMNS *fs.MountNamespace) {
+			// The callback to setupRootContainer inherits a
+			// reference on the rootMNS, so we don't need to take
+			// an additional reference here.
+			procArgs.MountNamespace = rootMNS
+			procArgs.Root = rootMNS.Root()
+			c.k.SetRootMountNamespace(rootMNS)
 		}); err != nil {
 			return err
 		}
@@ -527,54 +528,26 @@ func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *
 
 	// Setup a child container.
 	log.Infof("Creating new process in child container.")
-	globalRoot := mns.Root()
-	defer globalRoot.DecRef()
-
-	// Create mount point for the container's rootfs.
-	maxTraversals := uint(0)
-	contDir, err := mns.FindInode(ctx, globalRoot, nil, ChildContainersDir, &maxTraversals)
-	if err != nil {
-		return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err)
-	}
-	if err := contDir.CreateDirectory(ctx, globalRoot, c.cid, fs.FilePermsFromMode(0755)); err != nil {
-		return fmt.Errorf("create directory %q: %v", c.cid, err)
-	}
-	containerRoot, err := contDir.Walk(ctx, globalRoot, c.cid)
-	if err != nil {
-		return fmt.Errorf("walk to %q failed: %v", c.cid, err)
-	}
-	defer containerRoot.DecRef()
 
-	// Create the container's root filesystem mount.
+	// Create a new root inode and mount namespace for the container.
 	rootInode, err := c.createRootMount(rootCtx, conf)
 	if err != nil {
 		return fmt.Errorf("creating filesystem for container: %v", err)
 	}
-
-	// Mount the container's root filesystem to the newly created mount point.
-	if err := mns.Mount(ctx, containerRoot, rootInode); err != nil {
-		return fmt.Errorf("mount container root: %v", err)
-	}
-
-	// We have to re-walk to the dirent to find the mounted directory. The old
-	// dirent is invalid at this point.
-	containerRoot, err = contDir.Walk(ctx, globalRoot, c.cid)
+	mns, err := fs.NewMountNamespace(rootCtx, rootInode)
 	if err != nil {
-		return fmt.Errorf("find container mount point %q: %v", c.cid, err)
+		return fmt.Errorf("creating new mount namespace for container: %v", err)
 	}
-	cu := specutils.MakeCleanup(func() { containerRoot.DecRef() })
-	defer cu.Clean()
-
-	log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, c.cid))
 
 	// Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
-	procArgs.Root = containerRoot
+	// This will also donate a reference to procArgs, as required.
+	procArgs.MountNamespace = mns
+	procArgs.Root = mns.Root()
 
 	// Mount all submounts.
-	if err := c.mountSubmounts(rootCtx, conf, mns, containerRoot); err != nil {
+	if err := c.mountSubmounts(rootCtx, conf, mns, procArgs.Root); err != nil {
 		return err
 	}
-	cu.Release()
 	return c.checkDispenser()
 }
 
@@ -585,75 +558,11 @@ func (c *containerMounter) checkDispenser() error {
 	return nil
 }
 
-// destroyContainerFS cleans up the filesystem by unmounting all mounts for the
-// given container and deleting the container root directory.
-func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error {
-	defer func() {
-		// Flushing dirent references triggers many async close
-		// operations. We must wait for those to complete before
-		// returning, otherwise the caller may kill the gofer before
-		// they complete, causing a cascade of failing RPCs.
-		//
-		// This must take place in the first deferred function, so that
-		// it runs after all the other deferred DecRef() calls in this
-		// function.
-		log.Infof("Waiting for async filesystem operations to complete")
-		fs.AsyncBarrier()
-	}()
-
-	// First get a reference to the container root directory.
-	mns := k.RootMountNamespace()
-	mnsRoot := mns.Root()
-	defer mnsRoot.DecRef()
-	containerRoot := path.Join(ChildContainersDir, cid)
-	maxTraversals := uint(0)
-	containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, &maxTraversals)
-	if err == syserror.ENOENT {
-		// Container must have been destroyed already. That's fine.
-		return nil
-	}
-	if err != nil {
-		return fmt.Errorf("finding container root directory %q: %v", containerRoot, err)
-	}
-	defer containerRootDirent.DecRef()
-
-	// Iterate through all submounts and unmount them. We unmount lazily by
-	// setting detach=true, so we can unmount in any order.
-	mnt := mns.FindMount(containerRootDirent)
-	for _, m := range mns.AllMountsUnder(mnt) {
-		root := m.Root()
-		defer root.DecRef()
-
-		// Do a best-effort unmount by flushing the refs and unmount
-		// with "detach only = true". Unmount returns EINVAL when the mount point
-		// doesn't exist, i.e. it has already been unmounted.
-		log.Debugf("Unmounting container mount %q", root.BaseName())
-		root.Inode.MountSource.FlushDirentRefs()
-		if err := mns.Unmount(ctx, root, true /* detach only */); err != nil && err != syserror.EINVAL {
-			return fmt.Errorf("unmounting container mount %q: %v", root.BaseName(), err)
-		}
-	}
-
-	// Get a reference to the parent directory and remove the root
-	// container directory.
-	maxTraversals = 0
-	containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, &maxTraversals)
-	if err != nil {
-		return fmt.Errorf("finding containers directory %q: %v", ChildContainersDir, err)
-	}
-	defer containersDirDirent.DecRef()
-	log.Debugf("Deleting container root %q", containerRoot)
-	if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil {
-		return fmt.Errorf("removing directory %q: %v", containerRoot, err)
-	}
-
-	return nil
-}
-
 // setupRootContainer creates a mount namespace containing the root filesystem
 // and all mounts. 'rootCtx' is used to walk directories to find mount points.
-// 'setMountNS' is called after namespace is created. It must set the mount NS
-// to 'rootCtx'.
+// The 'setMountNS' callback is called after the mount namespace is created and
+// will get a reference on that namespace. The callback must ensure that the
+// rootCtx has the provided mount namespace.
 func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error {
 	for _, hint := range c.hints.mounts {
 		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
@@ -664,13 +573,6 @@ func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx c
 		hint.root = inode
 	}
 
-	// Create a tmpfs mount where we create and mount a root filesystem for
-	// each child container.
-	c.mounts = append(c.mounts, specs.Mount{
-		Type:        tmpfs,
-		Destination: ChildContainersDir,
-	})
-
 	rootInode, err := c.createRootMount(rootCtx, conf)
 	if err != nil {
 		return fmt.Errorf("creating root mount: %v", err)
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 8e8c6105b..a8adaf292 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -35,6 +35,7 @@ import (
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fs/host"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -525,8 +526,7 @@ func (l *Loader) run() error {
 		}
 
 		rootCtx := l.rootProcArgs.NewContext(l.k)
-		rootMns := l.k.RootMountNamespace()
-		if err := setExecutablePath(rootCtx, rootMns, &l.rootProcArgs); err != nil {
+		if err := setExecutablePath(rootCtx, &l.rootProcArgs); err != nil {
 			return err
 		}
 
@@ -540,7 +540,7 @@ func (l *Loader) run() error {
 			}
 		}
 		if !hasHomeEnvv {
-			homeDir, err := getExecUserHome(rootCtx, rootMns, uint32(l.rootProcArgs.Credentials.RealKUID))
+			homeDir, err := getExecUserHome(rootCtx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
 			if err != nil {
 				return fmt.Errorf("error reading exec user: %v", err)
 			}
@@ -663,8 +663,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 		return fmt.Errorf("configuring container FS: %v", err)
 	}
 
-	mns := l.k.RootMountNamespace()
-	if err := setExecutablePath(ctx, mns, &procArgs); err != nil {
+	if err := setExecutablePath(ctx, &procArgs); err != nil {
 		return fmt.Errorf("setting executable path for %+v: %v", procArgs, err)
 	}
 
@@ -689,8 +688,10 @@ func (l *Loader) destroyContainer(cid string) error {
 	defer l.mu.Unlock()
 
 	// Has the container started?
-	if _, _, err := l.threadGroupFromIDLocked(execID{cid: cid}); err == nil {
-		// If the container has started, kill and wait for all processes.
+	_, _, err := l.threadGroupFromIDLocked(execID{cid: cid})
+
+	// If the container has started, kill and wait for all processes.
+	if err == nil {
 		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
 			return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
 		}
@@ -703,12 +704,17 @@ func (l *Loader) destroyContainer(cid string) error {
 		}
 	}
 
-	ctx := l.rootProcArgs.NewContext(l.k)
-	if err := destroyContainerFS(ctx, cid, l.k); err != nil {
-		return fmt.Errorf("destroying filesystem for container %q: %v", cid, err)
-	}
+	// At this point, all processes inside of the container have exited,
+	// releasing all references to the container's MountNamespace and
+	// causing all submounts and overlays to be unmounted.
+	//
+	// Since the container's MountNamespace has been released,
+	// MountNamespace.destroy() will have executed, but that function may
+	// trigger async close operations. We must wait for those to complete
+	// before returning, otherwise the caller may kill the gofer before
+	// they complete, causing a cascade of failing RPCs.
+	fs.AsyncBarrier()
 
-	// We made it!
 	log.Debugf("Container destroyed %q", cid)
 	return nil
 }
@@ -724,14 +730,22 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 		return 0, fmt.Errorf("no such container: %q", args.ContainerID)
 	}
 
-	// Get the container Root Dirent from the Task, since we must run this
-	// process with the same Root.
+	// Get the container Root Dirent and MountNamespace from the Task.
 	tg.Leader().WithMuLocked(func(t *kernel.Task) {
+		// FSContext.RootDirectory() will take an extra ref for us.
 		args.Root = t.FSContext().RootDirectory()
+
+		// task.MountNamespace() does not take a ref, so we must do so
+		// ourselves.
+		args.MountNamespace = t.MountNamespace()
+		args.MountNamespace.IncRef()
 	})
-	if args.Root != nil {
-		defer args.Root.DecRef()
-	}
+	defer func() {
+		if args.Root != nil {
+			args.Root.DecRef()
+		}
+		args.MountNamespace.DecRef()
+	}()
 
 	// Start the process.
 	proc := control.Proc{Kernel: l.k}
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index c0f9b372c..e299a0e88 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -456,19 +456,6 @@ func TestMultiContainerDestroy(t *testing.T) {
 		}
 		defer cleanup()
 
-		// Exec in the root container to check for the existence of the
-		// second container's root filesystem directory.
-		contDir := path.Join(boot.ChildContainersDir, containers[1].ID)
-		dirArgs := &control.ExecArgs{
-			Filename: "/usr/bin/test",
-			Argv:     []string{"test", "-d", contDir},
-		}
-		if ws, err := containers[0].executeSync(dirArgs); err != nil {
-			t.Fatalf("error executing %+v: %v", dirArgs, err)
-		} else if ws.ExitStatus() != 0 {
-			t.Errorf("exec 'test -f %q' got exit status %d, wanted 0", contDir, ws.ExitStatus())
-		}
-
 		// Exec more processes to ensure signal all works for exec'd processes too.
 		args := &control.ExecArgs{
 			Filename: app,
@@ -496,13 +483,6 @@ func TestMultiContainerDestroy(t *testing.T) {
 			t.Errorf("container got process list: %s, want: %s", procListToString(pss), procListToString(expectedPL))
 		}
 
-		// Now the container dir should be gone.
-		if ws, err := containers[0].executeSync(dirArgs); err != nil {
-			t.Fatalf("error executing %+v: %v", dirArgs, err)
-		} else if ws.ExitStatus() == 0 {
-			t.Errorf("exec 'test -f %q' got exit status 0, wanted non-zero", contDir)
-		}
-
 		// Check that cont.Destroy is safe to call multiple times.
 		if err := containers[1].Destroy(); err != nil {
 			t.Errorf("error destroying container: %v", err)
@@ -786,6 +766,47 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
 	wg.Wait()
 }
 
+// TestMultiContainerDifferentFilesystems tests that different containers have
+// different root filesystems.
+func TestMultiContainerDifferentFilesystems(t *testing.T) {
+	filename := "/foo"
+	// Root container will create file and then sleep.
+	cmdRoot := []string{"sh", "-c", fmt.Sprintf("touch %q && sleep 100", filename)}
+
+	// Child containers will assert that the file does not exist, and will
+	// then create it.
+	script := fmt.Sprintf("if [ -f %q ]; then exit 1; else touch %q; fi", filename, filename)
+	cmd := []string{"sh", "-c", script}
+
+	// Make sure overlay is enabled, and none of the root filesystems are
+	// read-only, otherwise we won't be able to create the file.
+	conf := testutil.TestConfig()
+	conf.Overlay = true
+	specs, ids := createSpecs(cmdRoot, cmd, cmd)
+	for _, s := range specs {
+		s.Root.Readonly = false
+	}
+
+	containers, cleanup, err := startContainers(conf, specs, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanup()
+
+	// Both child containers should exit successfully.
+	for i, c := range containers {
+		if i == 0 {
+			// Don't wait on the root.
+			continue
+		}
+		if ws, err := c.Wait(); err != nil {
+			t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err)
+		} else if es := ws.ExitStatus(); es != 0 {
+			t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es)
+		}
+	}
+}
+
 // TestMultiContainerGoferStop tests that IO operations continue to work after
 // containers have been stopped and gofers killed.
 func TestMultiContainerGoferStop(t *testing.T) {
@@ -1167,3 +1188,62 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
 		}
 	}
 }
+
+// Test that one container can send an FD to another container, even though
+// they have distinct MountNamespaces.
+func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
+	app, err := testutil.FindFile("runsc/container/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	// We set up two containers with one shared mount that is used for a
+	// shared socket. The first container will send an FD over the socket
+	// to the second container. The FD corresponds to a file in the first
+	// container's mount namespace that is not part of the second
+	// container's mount namespace. However, the second container still
+	// should be able to read the FD.
+
+	// Create a shared mount where we will put the socket.
+	sharedMnt := specs.Mount{
+		Destination: "/mydir/test",
+		Type:        "tmpfs",
+		// Shared mounts need a Source, even for tmpfs. It is only used
+		// to match up different shared mounts inside the pod.
+		Source: "/some/dir",
+	}
+	socketPath := filepath.Join(sharedMnt.Destination, "socket")
+
+	// Create a writeable tmpfs mount where the FD sender app will create
+	// files to send. This will only be mounted in the FD sender.
+	writeableMnt := specs.Mount{
+		Destination: "/tmp",
+		Type:        "tmpfs",
+	}
+
+	// Create the specs.
+	specs, ids := createSpecs(
+		[]string{"sleep", "1000"},
+		[]string{app, "fd_sender", "--socket", socketPath},
+		[]string{app, "fd_receiver", "--socket", socketPath},
+	)
+	createSharedMount(sharedMnt, "shared-mount", specs...)
+	specs[1].Mounts = append(specs[1].Mounts, sharedMnt, writeableMnt)
+	specs[2].Mounts = append(specs[2].Mounts, sharedMnt)
+
+	conf := testutil.TestConfig()
+	containers, cleanup, err := startContainers(conf, specs, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanup()
+
+	// Both containers should exit successfully.
+	for _, c := range containers[1:] {
+		if ws, err := c.Wait(); err != nil {
+			t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err)
+		} else if es := ws.ExitStatus(); es != 0 {
+			t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es)
+		}
+	}
+}
diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD
index 054705ed7..82dbd54d2 100644
--- a/runsc/container/test_app/BUILD
+++ b/runsc/container/test_app/BUILD
@@ -5,10 +5,14 @@ package(licenses = ["notice"])
 go_binary(
     name = "test_app",
     testonly = 1,
-    srcs = ["test_app.go"],
+    srcs = [
+        "fds.go",
+        "test_app.go",
+    ],
     pure = "on",
     visibility = ["//runsc/container:__pkg__"],
     deps = [
+        "//pkg/unet",
         "//runsc/test/testutil",
         "@com_github_google_subcommands//:go_default_library",
     ],
diff --git a/runsc/container/test_app/fds.go b/runsc/container/test_app/fds.go
new file mode 100644
index 000000000..c12809cab
--- /dev/null
+++ b/runsc/container/test_app/fds.go
@@ -0,0 +1,185 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"context"
+	"io/ioutil"
+	"log"
+	"os"
+	"time"
+
+	"flag"
+	"github.com/google/subcommands"
+	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/runsc/test/testutil"
+)
+
+const fileContents = "foobarbaz"
+
+// fdSender will open a file and send the FD over a unix domain socket.
+type fdSender struct {
+	socketPath string
+}
+
+// Name implements subcommands.Command.Name.
+func (*fdSender) Name() string {
+	return "fd_sender"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*fdSender) Synopsis() string {
+	return "creates a file and sends the FD over the socket"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*fdSender) Usage() string {
+	return "fd_sender <flags>"
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (fds *fdSender) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&fds.socketPath, "socket", "", "path to socket")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (fds *fdSender) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if fds.socketPath == "" {
+		log.Fatalf("socket flag must be set")
+	}
+
+	dir, err := ioutil.TempDir(testutil.TmpDir(), "")
+	if err != nil {
+		log.Fatalf("TempDir failed: %v", err)
+	}
+
+	fileToSend, err := ioutil.TempFile(dir, "")
+	if err != nil {
+		log.Fatalf("TempFile failed: %v", err)
+	}
+	defer fileToSend.Close()
+
+	if _, err := fileToSend.WriteString(fileContents); err != nil {
+		log.Fatalf("Write(%q) failed: %v", fileContents, err)
+	}
+
+	// Receiver may not be started yet, so try connecting in a poll loop.
+	var s *unet.Socket
+	if err := testutil.Poll(func() error {
+		var err error
+		s, err = unet.Connect(fds.socketPath, true /* SEQPACKET, so we can send empty message with FD */)
+		return err
+	}, 10*time.Second); err != nil {
+		log.Fatalf("Error connecting to socket %q: %v", fds.socketPath, err)
+	}
+	defer s.Close()
+
+	w := s.Writer(true)
+	w.ControlMessage.PackFDs(int(fileToSend.Fd()))
+	if _, err := w.WriteVec([][]byte{{'a'}}); err != nil {
+		log.Fatalf("Error sending FD %d over socket %q: %v", fileToSend.Fd(), fds.socketPath, err)
+	}
+
+	log.Print("FD SENDER exiting successfully")
+	return subcommands.ExitSuccess
+}
+
+// fdReceiver receives an FD from a unix domain socket and does things to it.
+type fdReceiver struct {
+	socketPath string
+}
+
+// Name implements subcommands.Command.Name.
+func (*fdReceiver) Name() string {
+	return "fd_receiver"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*fdReceiver) Synopsis() string {
+	return "reads an FD from a unix socket, and then does things to it"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*fdReceiver) Usage() string {
+	return "fd_receiver <flags>"
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (fdr *fdReceiver) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&fdr.socketPath, "socket", "", "path to socket")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (fdr *fdReceiver) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if fdr.socketPath == "" {
+		log.Fatalf("Flags cannot be empty, given: socket: %q", fdr.socketPath)
+	}
+
+	ss, err := unet.BindAndListen(fdr.socketPath, true /* packet */)
+	if err != nil {
+		log.Fatalf("BindAndListen(%q) failed: %v", fdr.socketPath, err)
+	}
+	defer ss.Close()
+
+	var s *unet.Socket
+	c := make(chan error, 1)
+	go func() {
+		var err error
+		s, err = ss.Accept()
+		c <- err
+	}()
+
+	select {
+	case err := <-c:
+		if err != nil {
+			log.Fatalf("Accept() failed: %v", err)
+		}
+	case <-time.After(10 * time.Second):
+		log.Fatalf("Timeout waiting for accept")
+	}
+
+	r := s.Reader(true)
+	r.EnableFDs(1)
+	b := [][]byte{{'a'}}
+	if n, err := r.ReadVec(b); n != 1 || err != nil {
+		log.Fatalf("ReadVec got n=%d err %v (wanted 1, nil)", n, err)
+	}
+
+	fds, err := r.ExtractFDs()
+	if err != nil {
+		log.Fatalf("ExtractFD() got err %v", err)
+	}
+	if len(fds) != 1 {
+		log.Fatalf("ExtractFD() got %d FDs, wanted 1", len(fds))
+	}
+	fd := fds[0]
+
+	file := os.NewFile(uintptr(fd), "received file")
+	defer file.Close()
+	if _, err := file.Seek(0, os.SEEK_SET); err != nil {
+		log.Fatalf("Seek(0, 0) failed: %v", err)
+	}
+
+	got, err := ioutil.ReadAll(file)
+	if err != nil {
+		log.Fatalf("ReadAll failed: %v", err)
+	}
+	if string(got) != fileContents {
+		log.Fatalf("ReadAll got %q want %q", string(got), fileContents)
+	}
+
+	log.Print("FD RECEIVER exiting successfully")
+	return subcommands.ExitSuccess
+}
diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go
index b7fc6498f..6578c7b41 100644
--- a/runsc/container/test_app/test_app.go
+++ b/runsc/container/test_app/test_app.go
@@ -35,11 +35,13 @@ import (
 func main() {
 	subcommands.Register(subcommands.HelpCommand(), "")
 	subcommands.Register(subcommands.FlagsCommand(), "")
-	subcommands.Register(new(uds), "")
-	subcommands.Register(new(taskTree), "")
+	subcommands.Register(new(fdReceiver), "")
+	subcommands.Register(new(fdSender), "")
 	subcommands.Register(new(forkBomb), "")
 	subcommands.Register(new(reaper), "")
 	subcommands.Register(new(syscall), "")
+	subcommands.Register(new(taskTree), "")
+	subcommands.Register(new(uds), "")
 
 	flag.Parse()
 
@@ -76,7 +78,7 @@ func (c *uds) SetFlags(f *flag.FlagSet) {
 // Execute implements subcommands.Command.Execute.
 func (c *uds) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
 	if c.fileName == "" || c.socketPath == "" {
-		log.Fatal("Flags cannot be empty, given: fileName: %q, socketPath: %q", c.fileName, c.socketPath)
+		log.Fatalf("Flags cannot be empty, given: fileName: %q, socketPath: %q", c.fileName, c.socketPath)
 		return subcommands.ExitFailure
 	}
 	outputFile, err := os.OpenFile(c.fileName, os.O_WRONLY|os.O_CREATE, 0666)
-- 
cgit v1.2.3