41 files changed, 2306 insertions, 763 deletions
diff --git a/runsc/BUILD b/runsc/BUILD
index af8e928c5..8a57c597b 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -1,6 +1,4 @@
-package(
-    licenses = ["notice"],  # Apache 2.0
-)
+package(licenses = ["notice"])  # Apache 2.0
 
 load("@io_bazel_rules_go//go:def.bzl", "go_binary")
 load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_deb", "pkg_tar")
@@ -84,8 +82,9 @@ pkg_tar(
 genrule(
     name = "deb-version",
     outs = ["version.txt"],
-    cmd = "cat bazel-out/volatile-status.txt | grep VERSION | sed 's/^[^0-9]*//' >$@",
+    cmd = "$(location :runsc) -version | grep 'runsc version' | sed 's/^[^0-9]*//' > $@",
     stamp = 1,
+    tools = [":runsc"],
 )
 
 pkg_deb(
@@ -98,4 +97,7 @@ pkg_deb(
     package = "runsc",
     postinst = "debian/postinst.sh",
     version_file = ":version.txt",
+    visibility = [
+        "//visibility:public",
+    ],
 )
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index df9907e52..744f852a1 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -16,6 +16,7 @@ go_library(
         "limits.go",
         "loader.go",
         "network.go",
+        "pprof.go",
         "strace.go",
     ],
     importpath = "gvisor.googlesource.com/gvisor/runsc/boot",
@@ -30,6 +31,7 @@ go_library(
         "//pkg/cpuid",
         "//pkg/eventchannel",
         "//pkg/log",
+        "//pkg/memutil",
         "//pkg/rand",
         "//pkg/sentry/arch",
         "//pkg/sentry/arch:registers_go_proto",
@@ -51,7 +53,6 @@ go_library(
         "//pkg/sentry/kernel/kdefs",
         "//pkg/sentry/limits",
         "//pkg/sentry/loader",
-        "//pkg/sentry/memutil",
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/kvm",
@@ -94,6 +95,7 @@ go_test(
     size = "small",
     srcs = [
         "compat_test.go",
+        "fs_test.go",
         "loader_test.go",
     ],
     embed = [":boot"],
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 15f624f9b..6112b6c0a 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -221,6 +221,17 @@ type Config struct {
 	// user, and without chrooting the sandbox process. This can be
 	// necessary in test environments that have limited capabilities.
 	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+
+	// NumNetworkChannels controls the number of AF_PACKET sockets that map
+	// to the same underlying network device. This allows netstack to better
+	// scale for high throughput use cases.
+	NumNetworkChannels int
+
+	// Rootless allows the sandbox to be started with a user that is not root.
+	// Defense is depth measures are weaker with rootless. Specifically, the
+	// sandbox and Gofer process run as root inside a user namespace with root
+	// mapped to the caller's user.
+	Rootless bool
 }
 
 // ToFlags returns a slice of flags that correspond to the given Config.
@@ -244,6 +255,8 @@ func (c *Config) ToFlags() []string {
 		"--panic-signal=" + strconv.Itoa(c.PanicSignal),
 		"--profile=" + strconv.FormatBool(c.ProfileEnable),
 		"--net-raw=" + strconv.FormatBool(c.EnableRaw),
+		"--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
+		"--rootless=" + strconv.FormatBool(c.Rootless),
 	}
 	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
 		// Only include if set since it is never to be used by users.
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 72ab9ef86..26765cc46 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -237,7 +237,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
 		return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer")
 	}
 
-	err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+	err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
 	if err != nil {
 		log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
 		return err
@@ -340,8 +340,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	cm.l.k = k
 
 	// Set up the restore environment.
-	fds := &fdDispenser{fds: cm.l.goferFDs}
-	renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds)
+	mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k, cm.l.mountHints)
+	renv, err := mntr.createRestoreEnvironment(cm.l.conf)
 	if err != nil {
 		return fmt.Errorf("creating RestoreEnvironment: %v", err)
 	}
@@ -359,6 +359,17 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 		return fmt.Errorf("file cannot be empty")
 	}
 
+	if cm.l.conf.ProfileEnable {
+		// initializePProf opens /proc/self/maps, so has to be
+		// called before installing seccomp filters.
+		initializePProf()
+	}
+
+	// Seccomp filters have to be applied before parsing the state file.
+	if err := cm.l.installSeccompFilters(); err != nil {
+		return err
+	}
+
 	// Load the state.
 	loadOpts := state.LoadOpts{Source: specFile}
 	if err := loadOpts.Load(k, networkStack); err != nil {
@@ -369,11 +380,11 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	k.Timekeeper().SetClocks(time.NewCalibratedClocks())
 
 	// Since we have a new kernel we also must make a new watchdog.
-	watchdog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
+	dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
 
 	// Change the loader fields to reflect the changes made when restoring.
 	cm.l.k = k
-	cm.l.watchdog = watchdog
+	cm.l.watchdog = dog
 	cm.l.rootProcArgs = kernel.CreateProcessArgs{}
 	cm.l.restore = true
 
@@ -420,16 +431,12 @@ type WaitPIDArgs struct {
 
 	// CID is the container ID.
 	CID string
-
-	// ClearStatus determines whether the exit status of the process should
-	// be cleared when WaitPID returns.
-	ClearStatus bool
 }
 
 // WaitPID waits for the process with PID 'pid' in the sandbox.
 func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error {
 	log.Debugf("containerManager.Wait")
-	return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, args.ClearStatus, waitStatus)
+	return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus)
 }
 
 // SignalDeliveryMode enumerates different signal delivery modes.
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
index 4e428b49c..0811e10f4 100644
--- a/runsc/boot/fds.go
+++ b/runsc/boot/fds.go
@@ -28,11 +28,12 @@ import (
 // createFDMap creates an FD map that contains stdin, stdout, and stderr. If
 // console is true, then ioctl calls will be passed through to the host FD.
 // Upon success, createFDMap dups then closes stdioFDs.
-func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) {
+func createFDMap(ctx context.Context, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) {
 	if len(stdioFDs) != 3 {
 		return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
 	}
 
+	k := kernel.KernelFromContext(ctx)
 	fdm := k.NewFDMap()
 	defer fdm.DecRef()
 	mounter := fs.FileOwnerFromContext(ctx)
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 652da1cef..ef2dbfad2 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -246,6 +246,10 @@ var allowedSyscalls = seccomp.SyscallRules{
 	},
 	syscall.SYS_SETITIMER: {},
 	syscall.SYS_SHUTDOWN: []seccomp.Rule{
+		// Used by fs/host to shutdown host sockets.
+		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)},
+		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)},
+		// Used by unet to shutdown connections.
 		{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
 	},
 	syscall.SYS_SIGALTSTACK:     {},
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 4b1557b9a..2fa0725d1 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -18,6 +18,7 @@ import (
 	"fmt"
 	"path"
 	"path/filepath"
+	"sort"
 	"strconv"
 	"strings"
 	"syscall"
@@ -29,9 +30,6 @@ import (
 	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys"
 	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
 	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty"
-	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
-	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
@@ -40,6 +38,8 @@ import (
 	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.googlesource.com/gvisor/pkg/syserror"
 	"gvisor.googlesource.com/gvisor/runsc/specutils"
 )
@@ -51,6 +51,9 @@ const (
 	// Device name for root mount.
 	rootDevice = "9pfs-/"
 
+	// MountPrefix is the annotation prefix for mount hints.
+	MountPrefix = "gvisor.dev/spec/mount"
+
 	// ChildContainersDir is the directory where child container root
 	// filesystems are mounted.
 	ChildContainersDir = "/__runsc_containers__"
@@ -65,67 +68,24 @@ const (
 	nonefs   = "none"
 )
 
-type fdDispenser struct {
-	fds []int
-}
-
-func (f *fdDispenser) remove() int {
-	if f.empty() {
-		panic("fdDispenser out of fds")
-	}
-	rv := f.fds[0]
-	f.fds = f.fds[1:]
-	return rv
-}
-
-func (f *fdDispenser) empty() bool {
-	return len(f.fds) == 0
-}
+func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+	// Upper layer uses the same flags as lower, but it must be read-write.
+	upperFlags := lowerFlags
+	upperFlags.ReadOnly = false
 
-func adjustDirentCache(k *kernel.Kernel) error {
-	var hl syscall.Rlimit
-	if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
-		return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
-	}
-	if int64(hl.Cur) != syscall.RLIM_INFINITY {
-		newSize := hl.Cur / 2
-		if newSize < gofer.DefaultDirentCacheSize {
-			log.Infof("Setting gofer dirent cache size to %d", newSize)
-			gofer.DefaultDirentCacheSize = newSize
-			k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
-		}
+	tmpFS := mustFindFilesystem("tmpfs")
+	if !fs.IsDir(lower.StableAttr) {
+		// Create overlay on top of mount file, e.g. /etc/hostname.
+		msrc := fs.NewCachingMountSource(tmpFS, upperFlags)
+		return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags)
 	}
-	return nil
-}
 
-// setupRootContainerFS creates a mount namespace containing the root filesystem
-// and all mounts. 'rootCtx' is used to walk directories to find mount points.
-// 'setMountNS' is called after namespace is created. It must set the mount NS
-// to 'rootCtx'.
-func setupRootContainerFS(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int, setMountNS func(*fs.MountNamespace)) error {
-	mounts := compileMounts(spec)
-
-	// Create a tmpfs mount where we create and mount a root filesystem for
-	// each child container.
-	mounts = append(mounts, specs.Mount{
-		Type:        tmpfs,
-		Destination: ChildContainersDir,
-	})
-
-	fds := &fdDispenser{fds: goferFDs}
-	rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts)
-	if err != nil {
-		return fmt.Errorf("creating root mount: %v", err)
-	}
-	mns, err := fs.NewMountNamespace(userCtx, rootInode)
+	// Create overlay on top of mount dir.
+	upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil)
 	if err != nil {
-		return fmt.Errorf("creating root mount namespace: %v", err)
+		return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
 	}
-	setMountNS(mns)
-
-	root := mns.Root()
-	defer root.DecRef()
-	return mountSubmounts(rootCtx, conf, mns, root, mounts, fds)
+	return fs.NewOverlayRoot(ctx, upper, lower, upperFlags)
 }
 
 // compileMounts returns the supported mounts from the mount spec, adding any
@@ -184,186 +144,6 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
 	return mounts
 }
 
-// createRootMount creates the root filesystem.
-func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) {
-	// First construct the filesystem from the spec.Root.
-	mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly || conf.Overlay}
-
-	var (
-		rootInode *fs.Inode
-		err       error
-	)
-
-	fd := fds.remove()
-	log.Infof("Mounting root over 9P, ioFD: %d", fd)
-	p9FS := mustFindFilesystem("9p")
-	opts := p9MountOptions(fd, conf.FileAccess)
-	rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
-	if err != nil {
-		return nil, fmt.Errorf("creating root mount point: %v", err)
-	}
-
-	// We need to overlay the root on top of a ramfs with stub directories
-	// for submount paths.  "/dev" "/sys" "/proc" and "/tmp" are always
-	// mounted even if they are not in the spec.
-	submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp")
-	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
-	if err != nil {
-		return nil, fmt.Errorf("adding submount overlay: %v", err)
-	}
-
-	if conf.Overlay && !spec.Root.Readonly {
-		log.Debugf("Adding overlay on top of root mount")
-		// Overlay a tmpfs filesystem on top of the root.
-		rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
-		if err != nil {
-			return nil, err
-		}
-	}
-
-	log.Infof("Mounted %q to %q type root", spec.Root.Path, "/")
-	return rootInode, nil
-}
-
-func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
-	// Upper layer uses the same flags as lower, but it must be read-write.
-	lowerFlags.ReadOnly = false
-
-	tmpFS := mustFindFilesystem("tmpfs")
-	if !fs.IsDir(lower.StableAttr) {
-		// Create overlay on top of mount file, e.g. /etc/hostname.
-		msrc := fs.NewCachingMountSource(tmpFS, lowerFlags)
-		return fs.NewOverlayRootFile(ctx, msrc, lower, lowerFlags)
-	}
-
-	// Create overlay on top of mount dir.
-	upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "", nil)
-	if err != nil {
-		return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
-	}
-	return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags)
-}
-
-// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
-// used for mounts.
-func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) {
-	var (
-		fsName     string
-		opts       []string
-		useOverlay bool
-		err        error
-	)
-
-	switch m.Type {
-	case devpts, devtmpfs, proc, sysfs:
-		fsName = m.Type
-	case nonefs:
-		fsName = sysfs
-	case tmpfs:
-		fsName = m.Type
-
-		// tmpfs has some extra supported options that we must pass through.
-		opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
-
-	case bind:
-		fd := fds.remove()
-		fsName = "9p"
-		// Non-root bind mounts are always shared.
-		opts = p9MountOptions(fd, FileAccessShared)
-		// If configured, add overlay to all writable mounts.
-		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
-
-	default:
-		// TODO(nlacasse): Support all the mount types and make this a
-		// fatal error.  Most applications will "just work" without
-		// them, so this is a warning for now.
-		// we do not support.
-		log.Warningf("ignoring unknown filesystem type %q", m.Type)
-	}
-	return fsName, opts, useOverlay, err
-}
-
-func mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount, fds *fdDispenser) error {
-	for _, m := range mounts {
-		if err := mountSubmount(ctx, conf, mns, root, fds, m, mounts); err != nil {
-			return fmt.Errorf("mount submount %q: %v", m.Destination, err)
-		}
-	}
-
-	if err := mountTmp(ctx, conf, mns, root, mounts); err != nil {
-		return fmt.Errorf("mount submount %q: %v", "tmp", err)
-	}
-
-	if !fds.empty() {
-		return fmt.Errorf("not all mount points were consumed, remaining: %v", fds)
-	}
-	return nil
-}
-
-// mountSubmount mounts volumes inside the container's root. Because mounts may
-// be readonly, a lower ramfs overlay is added to create the mount point dir.
-// Another overlay is added with tmpfs on top if Config.Overlay is true.
-// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
-func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error {
-	// Map mount type to filesystem name, and parse out the options that we are
-	// capable of dealing with.
-	fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
-
-	// Return the error or nil that corresponds to the default case in getMountNameAndOptions.
-	if err != nil {
-		return err
-	}
-	if fsName == "" {
-		return nil
-	}
-
-	// All filesystem names should have been mapped to something we know.
-	filesystem := mustFindFilesystem(fsName)
-
-	mf := mountFlags(m.Options)
-	if useOverlay {
-		// All writes go to upper, be paranoid and make lower readonly.
-		mf.ReadOnly = true
-	}
-
-	inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
-	if err != nil {
-		return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
-	}
-
-	// If there are submounts, we need to overlay the mount on top of a
-	// ramfs with stub directories for submount paths.
-	submounts := subtargets(m.Destination, mounts)
-	if len(submounts) > 0 {
-		log.Infof("Adding submount overlay over %q", m.Destination)
-		inode, err = addSubmountOverlay(ctx, inode, submounts)
-		if err != nil {
-			return fmt.Errorf("adding submount overlay: %v", err)
-		}
-	}
-
-	if useOverlay {
-		log.Debugf("Adding overlay on top of mount %q", m.Destination)
-		inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
-		if err != nil {
-			return err
-		}
-	}
-
-	maxTraversals := uint(0)
-	dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
-	if err != nil {
-		return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
-	}
-	defer dirent.DecRef()
-	if err := mns.Mount(ctx, dirent, inode); err != nil {
-		return fmt.Errorf("mount %q error: %v", m.Destination, err)
-	}
-
-	log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
-	return nil
-}
-
 // p9MountOptions creates a slice of options for a p9 mount.
 func p9MountOptions(fd int, fa FileAccessType) []string {
 	opts := []string{
@@ -416,82 +196,6 @@ func mountDevice(m specs.Mount) string {
 	return "none"
 }
 
-// addRestoreMount adds a mount to the MountSources map used for restoring a
-// checkpointed container.
-func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error {
-	fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
-
-	// Return the error or nil that corresponds to the default case in getMountNameAndOptions.
-	if err != nil {
-		return err
-	}
-	// TODO(nlacasse): Fix this when we support all the mount types and
-	// make this a fatal error.
-	if fsName == "" {
-		return nil
-	}
-
-	newMount := fs.MountArgs{
-		Dev:        mountDevice(m),
-		Flags:      mountFlags(m.Options),
-		DataString: strings.Join(opts, ","),
-	}
-	if useOverlay {
-		newMount.Flags.ReadOnly = true
-	}
-	renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
-	log.Infof("Added mount at %q: %+v", fsName, newMount)
-	return nil
-}
-
-// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts
-// to the environment.
-func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) {
-	renv := &fs.RestoreEnvironment{
-		MountSources: make(map[string][]fs.MountArgs),
-	}
-
-	// Add root mount.
-	fd := fds.remove()
-	opts := p9MountOptions(fd, conf.FileAccess)
-
-	mf := fs.MountSourceFlags{}
-	if spec.Root.Readonly || conf.Overlay {
-		mf.ReadOnly = true
-	}
-
-	rootMount := fs.MountArgs{
-		Dev:        rootDevice,
-		Flags:      mf,
-		DataString: strings.Join(opts, ","),
-	}
-	renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
-
-	// Add submounts.
-	var tmpMounted bool
-	for _, m := range compileMounts(spec) {
-		if err := addRestoreMount(conf, renv, m, fds); err != nil {
-			return nil, err
-		}
-		if filepath.Clean(m.Destination) == "/tmp" {
-			tmpMounted = true
-		}
-	}
-
-	// TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
-	if !tmpMounted {
-		tmpMount := specs.Mount{
-			Type:        tmpfs,
-			Destination: "/tmp",
-		}
-		if err := addRestoreMount(conf, renv, tmpMount, fds); err != nil {
-			return nil, err
-		}
-	}
-
-	return renv, nil
-}
-
 func mountFlags(opts []string) fs.MountSourceFlags {
 	mf := fs.MountSourceFlags{}
 	for _, o := range opts {
@@ -546,22 +250,254 @@ func subtargets(root string, mnts []specs.Mount) []string {
 	return targets
 }
 
-// setupContainerFS is used to set up the file system and amend the procArgs accordingly.
-// procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs.
-func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error {
-	ctx := procArgs.NewContext(k)
-
-	// Create the FD map, which will set stdin, stdout, and stderr.  If console
-	// is true, then ioctl calls will be passed through to the host fd.
-	fdm, err := createFDMap(ctx, k, ls, console, stdioFDs)
+// setExecutablePath sets the procArgs.Filename by searching the PATH for an
+// executable matching the procArgs.Argv[0].
+func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
+	paths := fs.GetPath(procArgs.Envv)
+	exe := procArgs.Argv[0]
+	f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
 	if err != nil {
-		return fmt.Errorf("importing fds: %v", err)
+		return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
+	}
+	procArgs.Filename = f
+	return nil
+}
+
+func adjustDirentCache(k *kernel.Kernel) error {
+	var hl syscall.Rlimit
+	if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
+		return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
 	}
+	if int64(hl.Cur) != syscall.RLIM_INFINITY {
+		newSize := hl.Cur / 2
+		if newSize < gofer.DefaultDirentCacheSize {
+			log.Infof("Setting gofer dirent cache size to %d", newSize)
+			gofer.DefaultDirentCacheSize = newSize
+			k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
+		}
+	}
+	return nil
+}
 
-	// CreateProcess takes a reference on FDMap if successful. We
-	// won't need ours either way.
-	procArgs.FDMap = fdm
+type fdDispenser struct {
+	fds []int
+}
 
+func (f *fdDispenser) remove() int {
+	if f.empty() {
+		panic("fdDispenser out of fds")
+	}
+	rv := f.fds[0]
+	f.fds = f.fds[1:]
+	return rv
+}
+
+func (f *fdDispenser) empty() bool {
+	return len(f.fds) == 0
+}
+
+type shareType int
+
+const (
+	invalid shareType = iota
+
+	// container shareType indicates that the mount is used by a single container.
+	container
+
+	// pod shareType indicates that the mount is used by more than one container
+	// inside the pod.
+	pod
+
+	// shared shareType indicates that the mount can also be shared with a process
+	// outside the pod, e.g. NFS.
+	shared
+)
+
+func parseShare(val string) (shareType, error) {
+	switch val {
+	case "container":
+		return container, nil
+	case "pod":
+		return pod, nil
+	case "shared":
+		return shared, nil
+	default:
+		return 0, fmt.Errorf("invalid share value %q", val)
+	}
+}
+
+func (s shareType) String() string {
+	switch s {
+	case invalid:
+		return "invalid"
+	case container:
+		return "container"
+	case pod:
+		return "pod"
+	case shared:
+		return "shared"
+	default:
+		return fmt.Sprintf("invalid share value %d", s)
+	}
+}
+
+// mountHint represents extra information about mounts that are provided via
+// annotations. They can override mount type, and provide sharing information
+// so that mounts can be correctly shared inside the pod.
+type mountHint struct {
+	name  string
+	share shareType
+	mount specs.Mount
+
+	// root is the inode where the volume is mounted. For mounts with 'pod' share
+	// the volume is mounted once and then bind mounted inside the containers.
+	root *fs.Inode
+}
+
+func (m *mountHint) setField(key, val string) error {
+	switch key {
+	case "source":
+		if len(val) == 0 {
+			return fmt.Errorf("source cannot be empty")
+		}
+		m.mount.Source = val
+	case "type":
+		return m.setType(val)
+	case "share":
+		share, err := parseShare(val)
+		if err != nil {
+			return err
+		}
+		m.share = share
+	case "options":
+		return m.setOptions(val)
+	default:
+		return fmt.Errorf("invalid mount annotation: %s=%s", key, val)
+	}
+	return nil
+}
+
+func (m *mountHint) setType(val string) error {
+	switch val {
+	case "tmpfs", "bind":
+		m.mount.Type = val
+	default:
+		return fmt.Errorf("invalid type %q", val)
+	}
+	return nil
+}
+
+func (m *mountHint) setOptions(val string) error {
+	opts := strings.Split(val, ",")
+	if err := specutils.ValidateMountOptions(opts); err != nil {
+		return err
+	}
+	// Sort options so it can be compared with container mount options later on.
+	sort.Strings(opts)
+	m.mount.Options = opts
+	return nil
+}
+
+func (m *mountHint) isSupported() bool {
+	return m.mount.Type == tmpfs && m.share == pod
+}
+
+// podMountHints contains a collection of mountHints for the pod.
+type podMountHints struct {
+	mounts map[string]*mountHint
+}
+
+func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
+	mnts := make(map[string]*mountHint)
+	for k, v := range spec.Annotations {
+		// Look for 'gvisor.dev/spec/mount' annotations and parse them.
+		if strings.HasPrefix(k, MountPrefix) {
+			parts := strings.Split(k, "/")
+			if len(parts) != 5 {
+				return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
+			}
+			name := parts[3]
+			if len(name) == 0 || path.Clean(name) != name {
+				return nil, fmt.Errorf("invalid mount name: %s", name)
+			}
+			mnt := mnts[name]
+			if mnt == nil {
+				mnt = &mountHint{name: name}
+				mnts[name] = mnt
+			}
+			if err := mnt.setField(parts[4], v); err != nil {
+				return nil, err
+			}
+		}
+	}
+
+	// Validate all hints after done parsing.
+	for name, m := range mnts {
+		log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share)
+		if m.share == invalid {
+			return nil, fmt.Errorf("share field for %q has not been set", m.name)
+		}
+		if len(m.mount.Source) == 0 {
+			return nil, fmt.Errorf("source field for %q has not been set", m.name)
+		}
+		if len(m.mount.Type) == 0 {
+			return nil, fmt.Errorf("type field for %q has not been set", m.name)
+		}
+
+		// Check for duplicate mount sources.
+		for name2, m2 := range mnts {
+			if name != name2 && m.mount.Source == m2.mount.Source {
+				return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source)
+			}
+		}
+	}
+
+	return &podMountHints{mounts: mnts}, nil
+}
+
+func (p *podMountHints) findMount(mount specs.Mount) *mountHint {
+	for _, m := range p.mounts {
+		if m.mount.Source == mount.Source {
+			return m
+		}
+	}
+	return nil
+}
+
+type containerMounter struct {
+	// cid is the container ID. May be set to empty for the root container.
+	cid string
+
+	root *specs.Root
+
+	// mounts is the set of submounts for the container. It's a copy from the spec
+	// that may be freely modified without affecting the original spec.
+	mounts []specs.Mount
+
+	// fds is the list of FDs to be dispensed for mounts that require it.
+	fds fdDispenser
+
+	k *kernel.Kernel
+
+	hints *podMountHints
+}
+
+func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+	return &containerMounter{
+		cid:    cid,
+		root:   spec.Root,
+		mounts: compileMounts(spec),
+		fds:    fdDispenser{fds: goferFDs},
+		k:      k,
+		hints:  hints,
+	}
+}
+
+// setupFS is used to set up the file system for containers and amend
+// the procArgs accordingly. This is the main entry point for this rest of
+// functions in this file. procArgs are passed by reference and the FDMap field
+// is modified. It dups stdioFDs.
+func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs, creds *auth.Credentials) error {
 	// Use root user to configure mounts. The current user might not have
 	// permission to do so.
 	rootProcArgs := kernel.CreateProcessArgs{
@@ -570,16 +506,19 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf
 		Umask:                0022,
 		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
 	}
-	rootCtx := rootProcArgs.NewContext(k)
+	rootCtx := rootProcArgs.NewContext(c.k)
 
 	// If this is the root container, we also need to setup the root mount
 	// namespace.
-	mns := k.RootMountNamespace()
+	mns := c.k.RootMountNamespace()
 	if mns == nil {
 		// Setup the root container.
-		return setupRootContainerFS(ctx, rootCtx, spec, conf, goferFDs, func(mns *fs.MountNamespace) {
-			k.SetRootMountNamespace(mns)
-		})
+		if err := c.setupRootContainer(ctx, rootCtx, conf, func(mns *fs.MountNamespace) {
+			c.k.SetRootMountNamespace(mns)
+		}); err != nil {
+			return err
+		}
+		return c.checkDispenser()
 	}
 
 	// Setup a child container.
@@ -593,18 +532,17 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf
 	if err != nil {
 		return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err)
 	}
-	if err := contDir.CreateDirectory(ctx, globalRoot, cid, fs.FilePermsFromMode(0755)); err != nil {
-		return fmt.Errorf("create directory %q: %v", cid, err)
+	if err := contDir.CreateDirectory(ctx, globalRoot, c.cid, fs.FilePermsFromMode(0755)); err != nil {
+		return fmt.Errorf("create directory %q: %v", c.cid, err)
 	}
-	containerRoot, err := contDir.Walk(ctx, globalRoot, cid)
+	containerRoot, err := contDir.Walk(ctx, globalRoot, c.cid)
 	if err != nil {
-		return fmt.Errorf("walk to %q failed: %v", cid, err)
+		return fmt.Errorf("walk to %q failed: %v", c.cid, err)
 	}
 	defer containerRoot.DecRef()
 
 	// Create the container's root filesystem mount.
-	fds := &fdDispenser{fds: goferFDs}
-	rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil)
+	rootInode, err := c.createRootMount(rootCtx, conf)
 	if err != nil {
 		return fmt.Errorf("creating filesystem for container: %v", err)
 	}
@@ -614,39 +552,32 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf
 		return fmt.Errorf("mount container root: %v", err)
 	}
 
-	// We have to re-walk to the dirent to find the mounted
-	// directory. The old dirent is invalid at this point.
-	containerRoot, err = contDir.Walk(ctx, globalRoot, cid)
+	// We have to re-walk to the dirent to find the mounted directory. The old
+	// dirent is invalid at this point.
+	containerRoot, err = contDir.Walk(ctx, globalRoot, c.cid)
 	if err != nil {
-		return fmt.Errorf("find container mount point %q: %v", cid, err)
+		return fmt.Errorf("find container mount point %q: %v", c.cid, err)
 	}
 	cu := specutils.MakeCleanup(func() { containerRoot.DecRef() })
 	defer cu.Clean()
 
-	log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, cid))
+	log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, c.cid))
 
 	// Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
 	procArgs.Root = containerRoot
 
 	// Mount all submounts.
-	mounts := compileMounts(spec)
-	if err := mountSubmounts(rootCtx, conf, mns, containerRoot, mounts, fds); err != nil {
+	if err := c.mountSubmounts(rootCtx, conf, mns, containerRoot); err != nil {
 		return err
 	}
 	cu.Release()
-	return nil
+	return c.checkDispenser()
 }
 
-// setExecutablePath sets the procArgs.Filename by searching the PATH for an
-// executable matching the procArgs.Argv[0].
-func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
-	paths := fs.GetPath(procArgs.Envv)
-	exe := procArgs.Argv[0]
-	f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
-	if err != nil {
-		return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
+func (c *containerMounter) checkDispenser() error {
+	if !c.fds.empty() {
+		return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
 	}
-	procArgs.Filename = f
 	return nil
 }
 
@@ -715,17 +646,354 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error
 	return nil
 }
 
+// setupRootContainer creates a mount namespace containing the root filesystem
+// and all mounts. 'rootCtx' is used to walk directories to find mount points.
+// 'setMountNS' is called after namespace is created. It must set the mount NS
+// to 'rootCtx'.
+func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error {
+	for _, hint := range c.hints.mounts {
+		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+		inode, err := c.mountSharedMaster(rootCtx, conf, hint)
+		if err != nil {
+			return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+		}
+		hint.root = inode
+	}
+
+	// Create a tmpfs mount where we create and mount a root filesystem for
+	// each child container.
+	c.mounts = append(c.mounts, specs.Mount{
+		Type:        tmpfs,
+		Destination: ChildContainersDir,
+	})
+
+	rootInode, err := c.createRootMount(rootCtx, conf)
+	if err != nil {
+		return fmt.Errorf("creating root mount: %v", err)
+	}
+	mns, err := fs.NewMountNamespace(userCtx, rootInode)
+	if err != nil {
+		return fmt.Errorf("creating root mount namespace: %v", err)
+	}
+	setMountNS(mns)
+
+	root := mns.Root()
+	defer root.DecRef()
+	return c.mountSubmounts(rootCtx, conf, mns, root)
+}
+
+// mountSharedMaster mounts the master of a volume that is shared among
+// containers in a pod. It returns the root mount's inode.
+func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
+	// Map mount type to filesystem name, and parse out the options that we are
+	// capable of dealing with.
+	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
+	if err != nil {
+		return nil, err
+	}
+	if len(fsName) == 0 {
+		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+	}
+
+	// Mount with revalidate because it's shared among containers.
+	opts = append(opts, "cache=revalidate")
+
+	// All filesystem names should have been mapped to something we know.
+	filesystem := mustFindFilesystem(fsName)
+
+	mf := mountFlags(hint.mount.Options)
+	if useOverlay {
+		// All writes go to upper, be paranoid and make lower readonly.
+		mf.ReadOnly = true
+	}
+
+	inode, err := filesystem.Mount(ctx, mountDevice(hint.mount), mf, strings.Join(opts, ","), nil)
+	if err != nil {
+		return nil, fmt.Errorf("creating mount %q: %v", hint.name, err)
+	}
+
+	if useOverlay {
+		log.Debugf("Adding overlay on top of shared mount %q", hint.name)
+		inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	return inode, nil
+}
+
+// createRootMount creates the root filesystem.
+func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
+	// First construct the filesystem from the spec.Root.
+	mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
+
+	fd := c.fds.remove()
+	log.Infof("Mounting root over 9P, ioFD: %d", fd)
+	p9FS := mustFindFilesystem("9p")
+	opts := p9MountOptions(fd, conf.FileAccess)
+	rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
+	if err != nil {
+		return nil, fmt.Errorf("creating root mount point: %v", err)
+	}
+
+	// We need to overlay the root on top of a ramfs with stub directories
+	// for submount paths.  "/dev" "/sys" "/proc" and "/tmp" are always
+	// mounted even if they are not in the spec.
+	submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
+	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+	if err != nil {
+		return nil, fmt.Errorf("adding submount overlay: %v", err)
+	}
+
+	if conf.Overlay && !c.root.Readonly {
+		log.Debugf("Adding overlay on top of root mount")
+		// Overlay a tmpfs filesystem on top of the root.
+		rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	log.Infof("Mounted %q to %q type root", c.root.Path, "/")
+	return rootInode, nil
+}
+
+// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
+// used for mounts.
+func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
+	var (
+		fsName     string
+		opts       []string
+		useOverlay bool
+		err        error
+	)
+
+	switch m.Type {
+	case devpts, devtmpfs, proc, sysfs:
+		fsName = m.Type
+	case nonefs:
+		fsName = sysfs
+	case tmpfs:
+		fsName = m.Type
+
+		// tmpfs has some extra supported options that we must pass through.
+		opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
+
+	case bind:
+		fd := c.fds.remove()
+		fsName = "9p"
+		// Non-root bind mounts are always shared.
+		opts = p9MountOptions(fd, FileAccessShared)
+		// If configured, add overlay to all writable mounts.
+		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
+	default:
+		// TODO(nlacasse): Support all the mount types and make this a fatal error.
+		// Most applications will "just work" without them, so this is a warning
+		// for now.
+		log.Warningf("ignoring unknown filesystem type %q", m.Type)
+	}
+	return fsName, opts, useOverlay, err
+}
+
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+	for _, m := range c.mounts {
+		if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
+			if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
+				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
+			}
+		} else {
+			if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
+				return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+			}
+		}
+	}
+
+	if err := c.mountTmp(ctx, conf, mns, root); err != nil {
+		return fmt.Errorf("mount submount %q: %v", "tmp", err)
+	}
+	return nil
+}
+
+// mountSubmount mounts volumes inside the container's root. Because mounts may
+// be readonly, a lower ramfs overlay is added to create the mount point dir.
+// Another overlay is added with tmpfs on top if Config.Overlay is true.
+// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
+func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
+	// Map mount type to filesystem name, and parse out the options that we are
+	// capable of dealing with.
+	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
+	if err != nil {
+		return err
+	}
+	if fsName == "" {
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil
+	}
+
+	// All filesystem names should have been mapped to something we know.
+	filesystem := mustFindFilesystem(fsName)
+
+	mf := mountFlags(m.Options)
+	if useOverlay {
+		// All writes go to upper, be paranoid and make lower readonly.
+		mf.ReadOnly = true
+	}
+
+	inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
+	if err != nil {
+		return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+	}
+
+	// If there are submounts, we need to overlay the mount on top of a ramfs
+	// with stub directories for submount paths.
+	submounts := subtargets(m.Destination, c.mounts)
+	if len(submounts) > 0 {
+		log.Infof("Adding submount overlay over %q", m.Destination)
+		inode, err = addSubmountOverlay(ctx, inode, submounts)
+		if err != nil {
+			return fmt.Errorf("adding submount overlay: %v", err)
+		}
+	}
+
+	if useOverlay {
+		log.Debugf("Adding overlay on top of mount %q", m.Destination)
+		inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
+		if err != nil {
+			return err
+		}
+	}
+
+	maxTraversals := uint(0)
+	dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
+	if err != nil {
+		return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
+	}
+	defer dirent.DecRef()
+	if err := mns.Mount(ctx, dirent, inode); err != nil {
+		return fmt.Errorf("mount %q error: %v", m.Destination, err)
+	}
+
+	log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+	return nil
+}
+
+// mountSharedSubmount binds mount to a previously mounted volume that is shared
+// among containers in the same pod.
+func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error {
+	// For now enforce that all options are the same. Once bind mount is properly
+	// supported, then we should ensure the master is less restrictive than the
+	// container, e.g. master can be 'rw' while container mounts as 'ro'.
+	if len(mount.Options) != len(source.mount.Options) {
+		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
+	}
+	sort.Strings(mount.Options)
+	for i, opt := range mount.Options {
+		if opt != source.mount.Options[i] {
+			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
+		}
+	}
+
+	maxTraversals := uint(0)
+	target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals)
+	if err != nil {
+		return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
+	}
+	defer target.DecRef()
+
+	if err := mns.Mount(ctx, target, source.root); err != nil {
+		return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
+	}
+
+	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+	return nil
+}
+
+// addRestoreMount adds a mount to the MountSources map used for restoring a
+// checkpointed container.
+func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
+	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
+	if err != nil {
+		return err
+	}
+	if fsName == "" {
+		// Filesystem is not supported (e.g. cgroup), just skip it.
+		return nil
+	}
+
+	newMount := fs.MountArgs{
+		Dev:        mountDevice(m),
+		Flags:      mountFlags(m.Options),
+		DataString: strings.Join(opts, ","),
+	}
+	if useOverlay {
+		newMount.Flags.ReadOnly = true
+	}
+	renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
+	log.Infof("Added mount at %q: %+v", fsName, newMount)
+	return nil
+}
+
+// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
+// the mounts to the environment.
+func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
+	renv := &fs.RestoreEnvironment{
+		MountSources: make(map[string][]fs.MountArgs),
+	}
+
+	// Add root mount.
+	fd := c.fds.remove()
+	opts := p9MountOptions(fd, conf.FileAccess)
+
+	mf := fs.MountSourceFlags{}
+	if c.root.Readonly || conf.Overlay {
+		mf.ReadOnly = true
+	}
+
+	rootMount := fs.MountArgs{
+		Dev:        rootDevice,
+		Flags:      mf,
+		DataString: strings.Join(opts, ","),
+	}
+	renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
+
+	// Add submounts.
+	var tmpMounted bool
+	for _, m := range c.mounts {
+		if err := c.addRestoreMount(conf, renv, m); err != nil {
+			return nil, err
+		}
+		if filepath.Clean(m.Destination) == "/tmp" {
+			tmpMounted = true
+		}
+	}
+
+	// TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
+	if !tmpMounted {
+		tmpMount := specs.Mount{
+			Type:        tmpfs,
+			Destination: "/tmp",
+		}
+		if err := c.addRestoreMount(conf, renv, tmpMount); err != nil {
+			return nil, err
+		}
+	}
+
+	return renv, nil
+}
+
 // mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
 // Technically we don't have to mount tmpfs at /tmp, as we could just rely on
 // the host /tmp, but this is a nice optimization, and fixes some apps that call
 // mknod in /tmp. It's unsafe to mount tmpfs if:
-//   1. /tmp is mounted explictly: we should not override user's wish
+//   1. /tmp is mounted explicitly: we should not override user's wish
 //   2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
 //
 // Note that when there are submounts inside of '/tmp', directories for the
 // mount points must be present, making '/tmp' not empty anymore.
-func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount) error {
-	for _, m := range mounts {
+func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+	for _, m := range c.mounts {
 		if filepath.Clean(m.Destination) == "/tmp" {
 			log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
 			return nil
@@ -766,7 +1034,7 @@ func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *f
 			// another user. This is normally done for /tmp.
 			Options: []string{"mode=1777"},
 		}
-		return mountSubmount(ctx, conf, mns, root, nil, tmpMount, mounts)
+		return c.mountSubmount(ctx, conf, mns, root, tmpMount)
 
 	default:
 		return err
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
new file mode 100644
index 000000000..49ab34b33
--- /dev/null
+++ b/runsc/boot/fs_test.go
@@ -0,0 +1,193 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+	"path"
+	"reflect"
+	"strings"
+	"testing"
+
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+func TestPodMountHintsHappy(t *testing.T) {
+	spec := &specs.Spec{
+		Annotations: map[string]string{
+			path.Join(MountPrefix, "mount1", "source"): "foo",
+			path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
+			path.Join(MountPrefix, "mount1", "share"):  "pod",
+
+			path.Join(MountPrefix, "mount2", "source"):  "bar",
+			path.Join(MountPrefix, "mount2", "type"):    "bind",
+			path.Join(MountPrefix, "mount2", "share"):   "container",
+			path.Join(MountPrefix, "mount2", "options"): "rw,private",
+		},
+	}
+	podHints, err := newPodMountHints(spec)
+	if err != nil {
+		t.Errorf("newPodMountHints failed: %v", err)
+	}
+
+	// Check that fields were set correctly.
+	mount1 := podHints.mounts["mount1"]
+	if want := "mount1"; want != mount1.name {
+		t.Errorf("mount1 name, want: %q, got: %q", want, mount1.name)
+	}
+	if want := "foo"; want != mount1.mount.Source {
+		t.Errorf("mount1 source, want: %q, got: %q", want, mount1.mount.Source)
+	}
+	if want := "tmpfs"; want != mount1.mount.Type {
+		t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Type)
+	}
+	if want := pod; want != mount1.share {
+		t.Errorf("mount1 type, want: %q, got: %q", want, mount1.share)
+	}
+	if want := []string(nil); !reflect.DeepEqual(want, mount1.mount.Options) {
+		t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Options)
+	}
+
+	mount2 := podHints.mounts["mount2"]
+	if want := "mount2"; want != mount2.name {
+		t.Errorf("mount2 name, want: %q, got: %q", want, mount2.name)
+	}
+	if want := "bar"; want != mount2.mount.Source {
+		t.Errorf("mount2 source, want: %q, got: %q", want, mount2.mount.Source)
+	}
+	if want := "bind"; want != mount2.mount.Type {
+		t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Type)
+	}
+	if want := container; want != mount2.share {
+		t.Errorf("mount2 type, want: %q, got: %q", want, mount2.share)
+	}
+	if want := []string{"private", "rw"}; !reflect.DeepEqual(want, mount2.mount.Options) {
+		t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Options)
+	}
+}
+
+func TestPodMountHintsErrors(t *testing.T) {
+	for _, tst := range []struct {
+		name        string
+		annotations map[string]string
+		error       string
+	}{
+		{
+			name: "too short",
+			annotations: map[string]string{
+				path.Join(MountPrefix, "mount1"): "foo",
+			},
+			error: "invalid mount annotation",
+		},
+		{
+			name: "no name",
+			annotations: map[string]string{
+				MountPrefix + "//source": "foo",
+			},
+			error: "invalid mount name",
+		},
+		{
+			name: "missing source",
+			annotations: map[string]string{
+				path.Join(MountPrefix, "mount1", "type"):  "tmpfs",
+				path.Join(MountPrefix, "mount1", "share"): "pod",
+			},
+			error: "source field",
+		},
+		{
+			name: "missing type",
+			annotations: map[string]string{
+				path.Join(MountPrefix, "mount1", "source"): "foo",
+				path.Join(MountPrefix, "mount1", "share"):  "pod",
+			},
+			error: "type field",
+		},
+		{
+			name: "missing share",
+			annotations: map[string]string{
+				path.Join(MountPrefix, "mount1", "source"): "foo",
+				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
+			},
+			error: "share field",
+		},
+		{
+			name: "invalid field name",
+			annotations: map[string]string{
+				path.Join(MountPrefix, "mount1", "invalid"): "foo",
+			},
+			error: "invalid mount annotation",
+		},
+		{
+			name: "invalid source",
+			annotations: map[string]string{
+				path.Join(MountPrefix, "mount1", "source"): "",
+				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
+				path.Join(MountPrefix, "mount1", "share"):  "pod",
+			},
+			error: "source cannot be empty",
+		},
+		{
+			name: "invalid type",
+			annotations: map[string]string{
+				path.Join(MountPrefix, "mount1", "source"): "foo",
+				path.Join(MountPrefix, "mount1", "type"):   "invalid-type",
+				path.Join(MountPrefix, "mount1", "share"):  "pod",
+			},
+			error: "invalid type",
+		},
+		{
+			name: "invalid share",
+			annotations: map[string]string{
+				path.Join(MountPrefix, "mount1", "source"): "foo",
+				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
+				path.Join(MountPrefix, "mount1", "share"):  "invalid-share",
+			},
+			error: "invalid share",
+		},
+		{
+			name: "invalid options",
+			annotations: map[string]string{
+				path.Join(MountPrefix, "mount1", "source"):  "foo",
+				path.Join(MountPrefix, "mount1", "type"):    "tmpfs",
+				path.Join(MountPrefix, "mount1", "share"):   "pod",
+				path.Join(MountPrefix, "mount1", "options"): "invalid-option",
+			},
+			error: "unknown mount option",
+		},
+		{
+			name: "duplicate source",
+			annotations: map[string]string{
+				path.Join(MountPrefix, "mount1", "source"): "foo",
+				path.Join(MountPrefix, "mount1", "type"):   "tmpfs",
+				path.Join(MountPrefix, "mount1", "share"):  "pod",
+
+				path.Join(MountPrefix, "mount2", "source"): "foo",
+				path.Join(MountPrefix, "mount2", "type"):   "bind",
+				path.Join(MountPrefix, "mount2", "share"):  "container",
+			},
+			error: "have the same mount source",
+		},
+	} {
+		t.Run(tst.name, func(t *testing.T) {
+			spec := &specs.Spec{Annotations: tst.annotations}
+			podHints, err := newPodMountHints(spec)
+			if err == nil || !strings.Contains(err.Error(), tst.error) {
+				t.Errorf("newPodMountHints invalid error, want: .*%s.*, got: %v", tst.error, err)
+			}
+			if podHints != nil {
+				t.Errorf("newPodMountHints must return nil on failure: %+v", podHints)
+			}
+		})
+	}
+}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 6ac6b94dd..c1dea736f 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -29,6 +29,7 @@ import (
 	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
 	"gvisor.googlesource.com/gvisor/pkg/cpuid"
 	"gvisor.googlesource.com/gvisor/pkg/log"
+	"gvisor.googlesource.com/gvisor/pkg/memutil"
 	"gvisor.googlesource.com/gvisor/pkg/rand"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/control"
@@ -37,7 +38,6 @@ import (
 	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/loader"
-	"gvisor.googlesource.com/gvisor/pkg/sentry/memutil"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
@@ -117,6 +117,10 @@ type Loader struct {
 	//
 	// processes is guardded by mu.
 	processes map[execID]*execProcess
+
+	// mountHints provides extra information about mounts for containers that
+	// apply to the entire pod.
+	mountHints *podMountHints
 }
 
 // execID uniquely identifies a sentry process that is executed in a container.
@@ -288,7 +292,7 @@ func New(args Args) (*Loader, error) {
 	}
 
 	// Create a watchdog.
-	watchdog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction)
+	dog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction)
 
 	procArgs, err := newProcess(args.ID, args.Spec, creds, k)
 	if err != nil {
@@ -299,18 +303,24 @@ func New(args Args) (*Loader, error) {
 		return nil, fmt.Errorf("initializing compat logs: %v", err)
 	}
 
+	mountHints, err := newPodMountHints(args.Spec)
+	if err != nil {
+		return nil, fmt.Errorf("creating pod mount hints: %v", err)
+	}
+
 	eid := execID{cid: args.ID}
 	l := &Loader{
 		k:            k,
 		conf:         args.Conf,
 		console:      args.Console,
-		watchdog:     watchdog,
+		watchdog:     dog,
 		spec:         args.Spec,
 		goferFDs:     args.GoferFDs,
 		stdioFDs:     args.StdioFDs,
 		rootProcArgs: procArgs,
 		sandboxID:    args.ID,
 		processes:    map[execID]*execProcess{eid: {}},
+		mountHints:   mountHints,
 	}
 
 	// We don't care about child signals; some platforms can generate a
@@ -424,6 +434,9 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
 		return nil, fmt.Errorf("error creating memfd: %v", err)
 	}
 	memfile := os.NewFile(uintptr(memfd), memfileName)
+	// We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if
+	// there are memory cgroups specified, because at this point we're already
+	// in a mount namespace in which the relevant cgroupfs is not visible.
 	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
 	if err != nil {
 		memfile.Close()
@@ -432,7 +445,24 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
 	return mf, nil
 }
 
-// Run runs the root container..
+func (l *Loader) installSeccompFilters() error {
+	if l.conf.DisableSeccomp {
+		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
+	} else {
+		opts := filter.Options{
+			Platform:      l.k.Platform,
+			HostNetwork:   l.conf.Network == NetworkHost,
+			ProfileEnable: l.conf.ProfileEnable,
+			ControllerFD:  l.ctrl.srv.FD(),
+		}
+		if err := filter.Install(opts); err != nil {
+			return fmt.Errorf("installing seccomp filters: %v", err)
+		}
+	}
+	return nil
+}
+
+// Run runs the root container.
 func (l *Loader) Run() error {
 	err := l.run()
 	l.ctrl.manager.startResultChan <- err
@@ -467,36 +497,34 @@ func (l *Loader) run() error {
 		return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
 	}
 
-	// Finally done with all configuration. Setup filters before user code
-	// is loaded.
-	if l.conf.DisableSeccomp {
-		filter.Report("syscall filter is DISABLED. Running in less secure mode.")
-	} else {
-		opts := filter.Options{
-			Platform:      l.k.Platform,
-			HostNetwork:   l.conf.Network == NetworkHost,
-			ProfileEnable: l.conf.ProfileEnable,
-			ControllerFD:  l.ctrl.srv.FD(),
-		}
-		if err := filter.Install(opts); err != nil {
-			return fmt.Errorf("installing seccomp filters: %v", err)
-		}
-	}
-
 	// If we are restoring, we do not want to create a process.
 	// l.restore is set by the container manager when a restore call is made.
 	if !l.restore {
-		if err := setupContainerFS(
-			&l.rootProcArgs,
-			l.spec,
-			l.conf,
-			l.stdioFDs,
-			l.goferFDs,
-			l.console,
-			l.rootProcArgs.Credentials,
-			l.rootProcArgs.Limits,
-			l.k,
-			"" /* CID, which isn't needed for the root container */); err != nil {
+		if l.conf.ProfileEnable {
+			initializePProf()
+		}
+
+		// Finally done with all configuration. Setup filters before user code
+		// is loaded.
+		if err := l.installSeccompFilters(); err != nil {
+			return err
+		}
+
+		// Create the FD map, which will set stdin, stdout, and stderr.  If console
+		// is true, then ioctl calls will be passed through to the host fd.
+		ctx := l.rootProcArgs.NewContext(l.k)
+		fdm, err := createFDMap(ctx, l.rootProcArgs.Limits, l.console, l.stdioFDs)
+		if err != nil {
+			return fmt.Errorf("importing fds: %v", err)
+		}
+		// CreateProcess takes a reference on FDMap if successful. We won't need
+		// ours either way.
+		l.rootProcArgs.FDMap = fdm
+
+		// cid for root container can be empty. Only subcontainers need it to set
+		// the mount location.
+		mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k, l.mountHints)
+		if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil {
 			return err
 		}
 
@@ -552,7 +580,7 @@ func (l *Loader) createContainer(cid string) error {
 // startContainer starts a child container. It returns the thread group ID of
 // the newly created process. Caller owns 'files' and may close them after
 // this method returns.
-func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
 	// Create capabilities.
 	caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
 	if err != nil {
@@ -596,6 +624,16 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
 		stdioFDs = append(stdioFDs, int(f.Fd()))
 	}
 
+	// Create the FD map, which will set stdin, stdout, and stderr.
+	ctx := procArgs.NewContext(l.k)
+	fdm, err := createFDMap(ctx, procArgs.Limits, false, stdioFDs)
+	if err != nil {
+		return fmt.Errorf("importing fds: %v", err)
+	}
+	// CreateProcess takes a reference on FDMap if successful. We won't need ours
+	// either way.
+	procArgs.FDMap = fdm
+
 	// Can't take ownership away from os.File. dup them to get a new FDs.
 	var goferFDs []int
 	for _, f := range files[3:] {
@@ -606,22 +644,12 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
 		goferFDs = append(goferFDs, fd)
 	}
 
-	if err := setupContainerFS(
-		&procArgs,
-		spec,
-		conf,
-		stdioFDs,
-		goferFDs,
-		false,
-		creds,
-		procArgs.Limits,
-		k,
-		cid); err != nil {
+	mntr := newContainerMounter(spec, cid, goferFDs, l.k, l.mountHints)
+	if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil {
 		return fmt.Errorf("configuring container FS: %v", err)
 	}
 
-	ctx := procArgs.NewContext(l.k)
-	mns := k.RootMountNamespace()
+	mns := l.k.RootMountNamespace()
 	if err := setExecutablePath(ctx, mns, &procArgs); err != nil {
 		return fmt.Errorf("setting executable path for %+v: %v", procArgs, err)
 	}
@@ -724,7 +752,7 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
 	return nil
 }
 
-func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, waitStatus *uint32) error {
+func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
 	if tgid <= 0 {
 		return fmt.Errorf("PID (%d) must be positive", tgid)
 	}
@@ -736,13 +764,10 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, wai
 		ws := l.wait(execTG)
 		*waitStatus = ws
 
-		// Remove tg from the cache if caller requested it.
-		if clearStatus {
-			l.mu.Lock()
-			delete(l.processes, eid)
-			log.Debugf("updated processes (removal): %v", l.processes)
-			l.mu.Unlock()
-		}
+		l.mu.Lock()
+		delete(l.processes, eid)
+		log.Debugf("updated processes (removal): %v", l.processes)
+		l.mu.Unlock()
 		return nil
 	}
 
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 4603f751d..2f2499811 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -397,14 +397,15 @@ func TestCreateMountNamespace(t *testing.T) {
 			}
 			defer cleanup()
 
-			// setupRootContainerFS needs to find root from the context after the
+			// setupRootContainer needs to find root from the context after the
 			// namespace is created.
 			var mns *fs.MountNamespace
 			setMountNS := func(m *fs.MountNamespace) {
 				mns = m
 				ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root())
 			}
-			if err := setupRootContainerFS(ctx, ctx, &tc.spec, conf, []int{sandEnd}, setMountNS); err != nil {
+			mntr := newContainerMounter(&tc.spec, "", []int{sandEnd}, nil, &podMountHints{})
+			if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil {
 				t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err)
 			}
 			root := mns.Root()
@@ -609,8 +610,8 @@ func TestRestoreEnvironment(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
-			fds := &fdDispenser{fds: tc.ioFDs}
-			actualRenv, err := createRestoreEnvironment(tc.spec, conf, fds)
+			mntr := newContainerMounter(tc.spec, "", tc.ioFDs, nil, &podMountHints{})
+			actualRenv, err := mntr.createRestoreEnvironment(conf)
 			if !tc.errorExpected && err != nil {
 				t.Fatalf("could not create restore environment for test:%s", tc.name)
 			} else if tc.errorExpected {
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 0a154d90b..d86803252 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -56,7 +56,11 @@ type FDBasedLink struct {
 	Addresses   []net.IP
 	Routes      []Route
 	GSOMaxSize  uint32
-	LinkAddress []byte
+	LinkAddress net.HardwareAddr
+
+	// NumChannels controls how many underlying FD's are to be used to
+	// create this endpoint.
+	NumChannels int
 }
 
 // LoopbackLink configures a loopback li nk.
@@ -68,8 +72,9 @@ type LoopbackLink struct {
 
 // CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes.
 type CreateLinksAndRoutesArgs struct {
-	// FilePayload contains the fds associated with the FDBasedLinks.  The
-	// two slices must have the same length.
+	// FilePayload contains the fds associated with the FDBasedLinks. The
+	// number of fd's should match the sum of the NumChannels field of the
+	// FDBasedLink entries below.
 	urpc.FilePayload
 
 	LoopbackLinks []LoopbackLink
@@ -95,8 +100,12 @@ func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route {
 // CreateLinksAndRoutes creates links and routes in a network stack.  It should
 // only be called once.
 func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error {
-	if len(args.FilePayload.Files) != len(args.FDBasedLinks) {
-		return fmt.Errorf("FilePayload must be same length at FDBasedLinks")
+	wantFDs := 0
+	for _, l := range args.FDBasedLinks {
+		wantFDs += l.NumChannels
+	}
+	if got := len(args.FilePayload.Files); got != wantFDs {
+		return fmt.Errorf("args.FilePayload.Files has %d FD's but we need %d entries based on FDBasedLinks", got, wantFDs)
 	}
 
 	var nicID tcpip.NICID
@@ -123,20 +132,26 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 	}
 
-	for i, link := range args.FDBasedLinks {
+	fdOffset := 0
+	for _, link := range args.FDBasedLinks {
 		nicID++
 		nicids[link.Name] = nicID
 
-		// Copy the underlying FD.
-		oldFD := args.FilePayload.Files[i].Fd()
-		newFD, err := syscall.Dup(int(oldFD))
-		if err != nil {
-			return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+		FDs := []int{}
+		for j := 0; j < link.NumChannels; j++ {
+			// Copy the underlying FD.
+			oldFD := args.FilePayload.Files[fdOffset].Fd()
+			newFD, err := syscall.Dup(int(oldFD))
+			if err != nil {
+				return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+			}
+			FDs = append(FDs, newFD)
+			fdOffset++
 		}
 
 		mac := tcpip.LinkAddress(link.LinkAddress)
 		linkEP, err := fdbased.New(&fdbased.Options{
-			FD:                 newFD,
+			FDs:                FDs,
 			MTU:                uint32(link.MTU),
 			EthernetHeader:     true,
 			Address:            mac,
@@ -148,7 +163,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 			return err
 		}
 
-		log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac)
+		log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
 		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil {
 			return err
 		}
diff --git a/runsc/boot/pprof.go b/runsc/boot/pprof.go
new file mode 100644
index 000000000..463362f02
--- /dev/null
+++ b/runsc/boot/pprof.go
@@ -0,0 +1,18 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+func initializePProf() {
+}
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index b7551a5ab..df6af0ced 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -14,9 +14,11 @@ go_library(
         "debug.go",
         "delete.go",
         "do.go",
+        "error.go",
         "events.go",
         "exec.go",
         "gofer.go",
+        "help.go",
         "kill.go",
         "list.go",
         "path.go",
@@ -28,6 +30,7 @@ go_library(
         "spec.go",
         "start.go",
         "state.go",
+        "syscalls.go",
         "wait.go",
     ],
     importpath = "gvisor.googlesource.com/gvisor/runsc/cmd",
@@ -38,6 +41,7 @@ go_library(
         "//pkg/log",
         "//pkg/p9",
         "//pkg/sentry/control",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/unet",
         "//pkg/urpc",
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index 3a547d4aa..e0a950e9c 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -130,6 +130,8 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	// Ensure that if there is a panic, all goroutine stacks are printed.
 	debug.SetTraceback("all")
 
+	conf := args[0].(*boot.Config)
+
 	if b.setUpRoot {
 		if err := setUpChroot(b.pidns); err != nil {
 			Fatalf("error setting up chroot: %v", err)
@@ -143,14 +145,16 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 					args = append(args, arg)
 				}
 			}
-			// Note that we've already read the spec from the spec FD, and
-			// we will read it again after the exec call. This works
-			// because the ReadSpecFromFile function seeks to the beginning
-			// of the file before reading.
-			if err := callSelfAsNobody(args); err != nil {
-				Fatalf("%v", err)
+			if !conf.Rootless {
+				// Note that we've already read the spec from the spec FD, and
+				// we will read it again after the exec call. This works
+				// because the ReadSpecFromFile function seeks to the beginning
+				// of the file before reading.
+				if err := callSelfAsNobody(args); err != nil {
+					Fatalf("%v", err)
+				}
+				panic("callSelfAsNobody must never return success")
 			}
-			panic("callSelfAsNobody must never return success")
 		}
 	}
 
@@ -163,9 +167,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	}
 	specutils.LogSpec(spec)
 
-	conf := args[0].(*boot.Config)
-	waitStatus := args[1].(*syscall.WaitStatus)
-
 	if b.applyCaps {
 		caps := spec.Process.Capabilities
 		if caps == nil {
@@ -251,6 +252,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 
 	ws := l.WaitExit()
 	log.Infof("application exiting with %+v", ws)
+	waitStatus := args[1].(*syscall.WaitStatus)
 	*waitStatus = syscall.WaitStatus(ws.Status())
 	l.Destroy()
 	return subcommands.ExitSuccess
diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go
index ee74d33d8..2825dfaa5 100644
--- a/runsc/cmd/capability_test.go
+++ b/runsc/cmd/capability_test.go
@@ -116,6 +116,6 @@ func TestCapabilities(t *testing.T) {
 }
 
 func TestMain(m *testing.M) {
-	testutil.RunAsRoot()
+	specutils.MaybeRunAsRoot()
 	os.Exit(m.Run())
 }
diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go
index a2fc377d1..5b4cc4a39 100644
--- a/runsc/cmd/cmd.go
+++ b/runsc/cmd/cmd.go
@@ -17,34 +17,15 @@ package cmd
 
 import (
 	"fmt"
-	"os"
 	"runtime"
 	"strconv"
 	"syscall"
 
-	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.googlesource.com/gvisor/pkg/log"
 	"gvisor.googlesource.com/gvisor/runsc/specutils"
 )
 
-// Errorf logs to stderr and returns subcommands.ExitFailure.
-func Errorf(s string, args ...interface{}) subcommands.ExitStatus {
-	// If runsc is being invoked by docker or cri-o, then we might not have
-	// access to stderr, so we log a serious-looking warning in addition to
-	// writing to stderr.
-	log.Warningf("FATAL ERROR: "+s, args...)
-	fmt.Fprintf(os.Stderr, s+"\n", args...)
-	// Return an error that is unlikely to be used by the application.
-	return subcommands.ExitFailure
-}
-
-// Fatalf logs to stderr and exits with a failure status code.
-func Fatalf(s string, args ...interface{}) {
-	Errorf(s, args...)
-	os.Exit(128)
-}
-
 // intFlags can be used with int flags that appear multiple times.
 type intFlags []int
 
diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go
index 629c198fd..e82e8c667 100644
--- a/runsc/cmd/create.go
+++ b/runsc/cmd/create.go
@@ -16,7 +16,6 @@ package cmd
 
 import (
 	"context"
-
 	"flag"
 	"github.com/google/subcommands"
 	"gvisor.googlesource.com/gvisor/runsc/boot"
@@ -83,13 +82,17 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 	id := f.Arg(0)
 	conf := args[0].(*boot.Config)
 
+	if conf.Rootless {
+		return Errorf("Rootless mode not supported with %q", c.Name())
+	}
+
 	bundleDir := c.bundleDir
 	if bundleDir == "" {
 		bundleDir = getwdOrDie()
 	}
 	spec, err := specutils.ReadSpec(bundleDir)
 	if err != nil {
-		Fatalf("reading spec: %v", err)
+		return Errorf("reading spec: %v", err)
 	}
 	specutils.LogSpec(spec)
 
@@ -97,7 +100,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 	// container unless the metadata specifies that it should be run in an
 	// existing container.
 	if _, err := container.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile, c.userLog); err != nil {
-		Fatalf("creating container: %v", err)
+		return Errorf("creating container: %v", err)
 	}
 	return subcommands.ExitSuccess
 }
diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go
index 8ea59046c..3f6e46fce 100644
--- a/runsc/cmd/do.go
+++ b/runsc/cmd/do.go
@@ -39,10 +39,9 @@ import (
 // Do implements subcommands.Command for the "do" command. It sets up a simple
 // sandbox and executes the command inside it. See Usage() for more details.
 type Do struct {
-	root             string
-	cwd              string
-	ip               string
-	networkNamespace bool
+	root string
+	cwd  string
+	ip   string
 }
 
 // Name implements subcommands.Command.Name.
@@ -72,7 +71,6 @@ func (c *Do) SetFlags(f *flag.FlagSet) {
 	f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`)
 	f.StringVar(&c.cwd, "cwd", ".", "path to the current directory, defaults to the current directory")
 	f.StringVar(&c.ip, "ip", "192.168.10.2", "IPv4 address for the sandbox")
-	f.BoolVar(&c.networkNamespace, "netns", true, "run in a new network namespace")
 }
 
 // Execute implements subcommands.Command.Execute.
@@ -85,15 +83,21 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
 	conf := args[0].(*boot.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
-	// Map the entire host file system, but make it readonly with a writable
-	// overlay on top (ignore --overlay option).
-	conf.Overlay = true
+	if conf.Rootless {
+		if err := specutils.MaybeRunAsRoot(); err != nil {
+			return Errorf("Error executing inside namespace: %v", err)
+		}
+		// Execution will continue here if no more capabilities are needed...
+	}
 
 	hostname, err := os.Hostname()
 	if err != nil {
 		return Errorf("Error to retrieve hostname: %v", err)
 	}
 
+	// Map the entire host file system, but make it readonly with a writable
+	// overlay on top (ignore --overlay option).
+	conf.Overlay = true
 	absRoot, err := resolvePath(c.root)
 	if err != nil {
 		return Errorf("Error resolving root: %v", err)
@@ -119,11 +123,22 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
 	specutils.LogSpec(spec)
 
 	cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000))
-	if !c.networkNamespace {
-		if conf.Network != boot.NetworkHost {
-			Fatalf("The current network namespace can be used only if --network=host is set", nil)
+	if conf.Network == boot.NetworkNone {
+		netns := specs.LinuxNamespace{
+			Type: specs.NetworkNamespace,
+		}
+		if spec.Linux != nil {
+			panic("spec.Linux is not nil")
 		}
-	} else if conf.Network != boot.NetworkNone {
+		spec.Linux = &specs.Linux{Namespaces: []specs.LinuxNamespace{netns}}
+
+	} else if conf.Rootless {
+		if conf.Network == boot.NetworkSandbox {
+			fmt.Println("*** Rootless requires changing network type to host ***")
+			conf.Network = boot.NetworkHost
+		}
+
+	} else {
 		clean, err := c.setupNet(cid, spec)
 		if err != nil {
 			return Errorf("Error setting up network: %v", err)
diff --git a/runsc/cmd/error.go b/runsc/cmd/error.go
new file mode 100644
index 000000000..700b19f14
--- /dev/null
+++ b/runsc/cmd/error.go
@@ -0,0 +1,72 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"encoding/json"
+	"fmt"
+	"io"
+	"os"
+	"time"
+
+	"github.com/google/subcommands"
+	"gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// ErrorLogger is where error messages should be written to. These messages are
+// consumed by containerd and show up to users of command line tools,
+// like docker/kubectl.
+var ErrorLogger io.Writer
+
+type jsonError struct {
+	Msg   string    `json:"msg"`
+	Level string    `json:"level"`
+	Time  time.Time `json:"time"`
+}
+
+// Errorf logs error to containerd log (--log), to stderr, and debug logs. It
+// returns subcommands.ExitFailure for convenience with subcommand.Execute()
+// methods:
+//    return Errorf("Danger! Danger!")
+//
+func Errorf(format string, args ...interface{}) subcommands.ExitStatus {
+	// If runsc is being invoked by docker or cri-o, then we might not have
+	// access to stderr, so we log a serious-looking warning in addition to
+	// writing to stderr.
+	log.Warningf("FATAL ERROR: "+format, args...)
+	fmt.Fprintf(os.Stderr, format+"\n", args...)
+
+	j := jsonError{
+		Msg:   fmt.Sprintf(format, args...),
+		Level: "error",
+		Time:  time.Now(),
+	}
+	b, err := json.Marshal(j)
+	if err != nil {
+		panic(err)
+	}
+	if ErrorLogger != nil {
+		ErrorLogger.Write(b)
+	}
+
+	return subcommands.ExitFailure
+}
+
+// Fatalf logs the same way as Errorf() does, plus *exits* the process.
+func Fatalf(format string, args ...interface{}) {
+	Errorf(format, args...)
+	// Return an error that is unlikely to be used by the application.
+	os.Exit(128)
+}
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index 52fd7ac4b..0eeaaadba 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -40,8 +40,6 @@ import (
 	"gvisor.googlesource.com/gvisor/runsc/specutils"
 )
 
-const privateClearStatusFlag = "private-clear-status"
-
 // Exec implements subcommands.Command for the "exec" command.
 type Exec struct {
 	cwd string
@@ -51,7 +49,6 @@ type Exec struct {
 	extraKGIDs      stringSlice
 	caps            stringSlice
 	detach          bool
-	clearStatus     bool
 	processPath     string
 	pidFile         string
 	internalPidFile string
@@ -103,10 +100,6 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) {
 	f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to")
 	f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to")
 	f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal")
-
-	// This flag clears the status of the exec'd process upon completion. It is
-	// only used when we fork due to --detach being set on the parent.
-	f.BoolVar(&ex.clearStatus, privateClearStatusFlag, true, "private flag, do not use")
 }
 
 // Execute implements subcommands.Command.Execute. It starts a process in an
@@ -150,13 +143,16 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	// write the child's PID to the pid file. So when the container returns, the
 	// child process will also return and signal containerd.
 	if ex.detach {
-		return ex.execAndWait(waitStatus)
+		return ex.execChildAndWait(waitStatus)
 	}
+	return ex.exec(c, e, waitStatus)
+}
 
+func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
 	// Start the new process and get it pid.
 	pid, err := c.Execute(e)
 	if err != nil {
-		Fatalf("getting processes for container: %v", err)
+		return Errorf("executing processes for container: %v", err)
 	}
 
 	if e.StdioIsPty {
@@ -170,33 +166,37 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	if ex.internalPidFile != "" {
 		pidStr := []byte(strconv.Itoa(int(pid)))
 		if err := ioutil.WriteFile(ex.internalPidFile, pidStr, 0644); err != nil {
-			Fatalf("writing internal pid file %q: %v", ex.internalPidFile, err)
+			return Errorf("writing internal pid file %q: %v", ex.internalPidFile, err)
 		}
 	}
 
-	// Generate the pid file after the internal pid file is generated, so that users
-	// can safely assume that the internal pid file is ready after `runsc exec -d`
-	// returns.
+	// Generate the pid file after the internal pid file is generated, so that
+	// users can safely assume that the internal pid file is ready after
+	// `runsc exec -d` returns.
 	if ex.pidFile != "" {
 		if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil {
-			Fatalf("writing pid file: %v", err)
+			return Errorf("writing pid file: %v", err)
 		}
 	}
 
 	// Wait for the process to exit.
-	ws, err := c.WaitPID(pid, ex.clearStatus)
+	ws, err := c.WaitPID(pid)
 	if err != nil {
-		Fatalf("waiting on pid %d: %v", pid, err)
+		return Errorf("waiting on pid %d: %v", pid, err)
 	}
 	*waitStatus = ws
 	return subcommands.ExitSuccess
 }
 
-func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
-	binPath := specutils.ExePath
+func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
 	var args []string
+	for _, a := range os.Args[1:] {
+		if !strings.Contains(a, "detach") {
+			args = append(args, a)
+		}
+	}
 
-	// The command needs to write a pid file so that execAndWait can tell
+	// The command needs to write a pid file so that execChildAndWait can tell
 	// when it has started. If no pid-file was provided, we should use a
 	// filename in a temp directory.
 	pidFile := ex.pidFile
@@ -210,19 +210,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
 		args = append(args, "--pid-file="+pidFile)
 	}
 
-	// Add the rest of the args, excluding the "detach" flag.
-	for _, a := range os.Args[1:] {
-		if strings.Contains(a, "detach") {
-			// Replace with the "private-clear-status" flag, which tells
-			// the new process it's a detached child and shouldn't
-			// clear the exit status of the sentry process.
-			args = append(args, fmt.Sprintf("--%s=false", privateClearStatusFlag))
-		} else {
-			args = append(args, a)
-		}
-	}
-
-	cmd := exec.Command(binPath, args...)
+	cmd := exec.Command(specutils.ExePath, args...)
 	cmd.Args[0] = "runsc-exec"
 
 	// Exec stdio defaults to current process stdio.
@@ -233,8 +221,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
 	// If the console control socket file is provided, then create a new
 	// pty master/slave pair and set the TTY on the sandbox process.
 	if ex.consoleSocket != "" {
-		// Create a new TTY pair and send the master on the provided
-		// socket.
+		// Create a new TTY pair and send the master on the provided socket.
 		tty, err := console.NewWithSocket(ex.consoleSocket)
 		if err != nil {
 			Fatalf("setting up console with socket %q: %v", ex.consoleSocket, err)
@@ -256,7 +243,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
 		Fatalf("failure to start child exec process, err: %v", err)
 	}
 
-	log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, binPath, args)
+	log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, specutils.ExePath, args)
 
 	// Wait for PID file to ensure that child process has started. Otherwise,
 	// '--process' file is deleted as soon as this process returns and the child
@@ -278,7 +265,10 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
 		return false, nil
 	}
 	if err := specutils.WaitForReady(cmd.Process.Pid, 10*time.Second, ready); err != nil {
-		Fatalf("unexpected error waiting for PID file, err: %v", err)
+		// Don't log fatal error here, otherwise it will override the error logged
+		// by the child process that has failed to start.
+		log.Warningf("Unexpected error waiting for PID file, err: %v", err)
+		return subcommands.ExitFailure
 	}
 
 	*waitStatus = 0
diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go
new file mode 100644
index 000000000..ff4f901cb
--- /dev/null
+++ b/runsc/cmd/help.go
@@ -0,0 +1,126 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"fmt"
+
+	"flag"
+	"github.com/google/subcommands"
+)
+
+// NewHelp returns a help command for the given commander.
+func NewHelp(cdr *subcommands.Commander) *Help {
+	return &Help{
+		cdr: cdr,
+	}
+}
+
+// Help implements subcommands.Command for the "help" command. The 'help'
+// command prints help for commands registered to a Commander but also allows for
+// registering additional help commands that print other documentation.
+type Help struct {
+	cdr      *subcommands.Commander
+	commands []subcommands.Command
+	help     bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Help) Name() string {
+	return "help"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Help) Synopsis() string {
+	return "Print help documentation."
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Help) Usage() string {
+	return `help [<subcommand>]:
+	With an argument, prints detailed information on the use of
+	the specified topic or subcommand. With no argument, print a list of
+	all commands and a brief description of each.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (h *Help) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command.Execute.
+func (h *Help) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	switch f.NArg() {
+	case 0:
+		fmt.Fprintf(h.cdr.Output, "Usage: %s <flags> <subcommand> <subcommand args>\n\n", h.cdr.Name())
+		fmt.Fprintf(h.cdr.Output, `runsc is a command line client for running applications packaged in the Open
+Container Initiative (OCI) format. Applications run by runsc are run in an
+isolated gVisor sandbox that emulates a Linux environment.
+
+gVisor is a user-space kernel, written in Go, that implements a substantial
+portion of the Linux system call interface. It provides an additional layer
+of isolation between running applications and the host operating system.
+
+Functionality is provided by subcommands. For additonal help on individual
+subcommands use "%s %s <subcommand>".
+
+`, h.cdr.Name(), h.Name())
+		h.cdr.VisitGroups(func(g *subcommands.CommandGroup) {
+			h.cdr.ExplainGroup(h.cdr.Output, g)
+		})
+
+		fmt.Fprintf(h.cdr.Output, "Additional help topics (Use \"%s %s <topic>\" to see help on the topic):\n", h.cdr.Name(), h.Name())
+		for _, cmd := range h.commands {
+			fmt.Fprintf(h.cdr.Output, "\t%-15s  %s\n", cmd.Name(), cmd.Synopsis())
+		}
+		fmt.Fprintf(h.cdr.Output, "\nUse \"%s flags\" for a list of top-level flags\n", h.cdr.Name())
+		return subcommands.ExitSuccess
+	default:
+		// Look for commands registered to the commander and print help explanation if found.
+		found := false
+		h.cdr.VisitCommands(func(g *subcommands.CommandGroup, cmd subcommands.Command) {
+			if f.Arg(0) == cmd.Name() {
+				h.cdr.ExplainCommand(h.cdr.Output, cmd)
+				found = true
+			}
+		})
+		if found {
+			return subcommands.ExitSuccess
+		}
+
+		// Next check commands registered to the help command.
+		for _, cmd := range h.commands {
+			if f.Arg(0) == cmd.Name() {
+				fs := flag.NewFlagSet(f.Arg(0), flag.ContinueOnError)
+				fs.Usage = func() { h.cdr.ExplainCommand(h.cdr.Error, cmd) }
+				cmd.SetFlags(fs)
+				if fs.Parse(f.Args()[1:]) != nil {
+					return subcommands.ExitUsageError
+				}
+				return cmd.Execute(ctx, f, args...)
+			}
+		}
+
+		fmt.Fprintf(h.cdr.Error, "Subcommand %s not understood\n", f.Arg(0))
+	}
+
+	f.Usage()
+	return subcommands.ExitUsageError
+}
+
+// Register registers a new help command.
+func (h *Help) Register(cmd subcommands.Command) {
+	h.commands = append(h.commands, cmd)
+}
diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go
index 3ab2f5676..a78a0dce6 100644
--- a/runsc/cmd/restore.go
+++ b/runsc/cmd/restore.go
@@ -80,25 +80,29 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{
 	conf := args[0].(*boot.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
+	if conf.Rootless {
+		return Errorf("Rootless mode not supported with %q", r.Name())
+	}
+
 	bundleDir := r.bundleDir
 	if bundleDir == "" {
 		bundleDir = getwdOrDie()
 	}
 	spec, err := specutils.ReadSpec(bundleDir)
 	if err != nil {
-		Fatalf("reading spec: %v", err)
+		return Errorf("reading spec: %v", err)
 	}
 	specutils.LogSpec(spec)
 
 	if r.imagePath == "" {
-		Fatalf("image-path flag must be provided")
+		return Errorf("image-path flag must be provided")
 	}
 
 	conf.RestoreFile = filepath.Join(r.imagePath, checkpointFileName)
 
 	ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog, r.detach)
 	if err != nil {
-		Fatalf("running container: %v", err)
+		return Errorf("running container: %v", err)
 	}
 	*waitStatus = ws
 
diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go
index c228b4f93..abf602239 100644
--- a/runsc/cmd/run.go
+++ b/runsc/cmd/run.go
@@ -67,19 +67,23 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	conf := args[0].(*boot.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
+	if conf.Rootless {
+		return Errorf("Rootless mode not supported with %q", r.Name())
+	}
+
 	bundleDir := r.bundleDir
 	if bundleDir == "" {
 		bundleDir = getwdOrDie()
 	}
 	spec, err := specutils.ReadSpec(bundleDir)
 	if err != nil {
-		Fatalf("reading spec: %v", err)
+		return Errorf("reading spec: %v", err)
 	}
 	specutils.LogSpec(spec)
 
 	ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog, r.detach)
 	if err != nil {
-		Fatalf("running container: %v", err)
+		return Errorf("running container: %v", err)
 	}
 
 	*waitStatus = ws
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
index 657726251..31e8f42bb 100644
--- a/runsc/cmd/start.go
+++ b/runsc/cmd/start.go
@@ -16,7 +16,6 @@ package cmd
 
 import (
 	"context"
-
 	"flag"
 	"github.com/google/subcommands"
 	"gvisor.googlesource.com/gvisor/runsc/boot"
diff --git a/runsc/cmd/syscalls.go b/runsc/cmd/syscalls.go
new file mode 100644
index 000000000..9c8a66490
--- /dev/null
+++ b/runsc/cmd/syscalls.go
@@ -0,0 +1,347 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+	"encoding/csv"
+	"encoding/json"
+	"fmt"
+	"io"
+	"os"
+	"sort"
+	"strconv"
+	"text/tabwriter"
+
+	"flag"
+	"github.com/google/subcommands"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+)
+
+// Syscalls implements subcommands.Command for the "syscalls" command.
+type Syscalls struct {
+	output string
+	os     string
+	arch   string
+}
+
+// CompatibilityInfo is a map of system and architecture to compatibility doc.
+// Maps operating system to architecture to ArchInfo.
+type CompatibilityInfo map[string]map[string]ArchInfo
+
+// ArchInfo is compatbility doc for an architecture.
+type ArchInfo struct {
+	// Syscalls maps syscall number for the architecture to the doc.
+	Syscalls map[uintptr]SyscallDoc `json:"syscalls"`
+}
+
+// SyscallDoc represents a single item of syscall documentation.
+type SyscallDoc struct {
+	Name string `json:"name"`
+	num  uintptr
+
+	Support string   `json:"support"`
+	Note    string   `json:"note,omitempty"`
+	URLs    []string `json:"urls,omitempty"`
+}
+
+type outputFunc func(io.Writer, CompatibilityInfo) error
+
+var (
+	// The string name to use for printing compatibility for all OSes.
+	osAll = "all"
+
+	// The string name to use for printing compatibility for all architectures.
+	archAll = "all"
+
+	// A map of OS name to map of architecture name to syscall table.
+	syscallTableMap = make(map[string]map[string]*kernel.SyscallTable)
+
+	// A map of output type names to output functions.
+	outputMap = map[string]outputFunc{
+		"table": outputTable,
+		"json":  outputJSON,
+		"csv":   outputCSV,
+	}
+)
+
+// Name implements subcommands.Command.Name.
+func (*Syscalls) Name() string {
+	return "syscalls"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Syscalls) Synopsis() string {
+	return "Print compatibility information for syscalls."
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Syscalls) Usage() string {
+	return `syscalls [options] - Print compatibility information for syscalls.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (s *Syscalls) SetFlags(f *flag.FlagSet) {
+	f.StringVar(&s.output, "o", "table", "Output format (table, csv, json).")
+	f.StringVar(&s.os, "os", osAll, "The OS (e.g. linux)")
+	f.StringVar(&s.arch, "arch", archAll, "The CPU architecture (e.g. amd64).")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (s *Syscalls) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	out, ok := outputMap[s.output]
+	if !ok {
+		Fatalf("Unsupported output format %q", s.output)
+	}
+
+	// Build map of all supported architectures.
+	tables := kernel.SyscallTables()
+	for _, t := range tables {
+		osMap, ok := syscallTableMap[t.OS.String()]
+		if !ok {
+			osMap = make(map[string]*kernel.SyscallTable)
+			syscallTableMap[t.OS.String()] = osMap
+		}
+		osMap[t.Arch.String()] = t
+	}
+
+	// Build a map of the architectures we want to output.
+	info, err := getCompatibilityInfo(s.os, s.arch)
+	if err != nil {
+		Fatalf("%v", err)
+	}
+
+	if err := out(os.Stdout, info); err != nil {
+		Fatalf("Error writing output: %v", err)
+	}
+
+	return subcommands.ExitSuccess
+}
+
+// getCompatibilityInfo returns compatibility info for the given OS name and
+// architecture name. Supports the special name 'all' for OS and architecture that
+// specifies that all supported OSes or architectures should be included.
+func getCompatibilityInfo(osName string, archName string) (CompatibilityInfo, error) {
+	info := CompatibilityInfo(make(map[string]map[string]ArchInfo))
+	if osName == osAll {
+		// Special processing for the 'all' OS name.
+		for osName, _ := range syscallTableMap {
+			info[osName] = make(map[string]ArchInfo)
+			// osName is a specific OS name.
+			if err := addToCompatibilityInfo(info, osName, archName); err != nil {
+				return info, err
+			}
+		}
+	} else {
+		// osName is a specific OS name.
+		info[osName] = make(map[string]ArchInfo)
+		if err := addToCompatibilityInfo(info, osName, archName); err != nil {
+			return info, err
+		}
+	}
+
+	return info, nil
+}
+
+// addToCompatibilityInfo adds ArchInfo for the given specific OS name and
+// architecture name. Supports the special architecture name 'all' to specify
+// that all supported architectures for the OS should be included.
+func addToCompatibilityInfo(info CompatibilityInfo, osName string, archName string) error {
+	if archName == archAll {
+		// Special processing for the 'all' architecture name.
+		for archName, _ := range syscallTableMap[osName] {
+			archInfo, err := getArchInfo(osName, archName)
+			if err != nil {
+				return err
+			}
+			info[osName][archName] = archInfo
+		}
+	} else {
+		// archName is a specific architecture name.
+		archInfo, err := getArchInfo(osName, archName)
+		if err != nil {
+			return err
+		}
+		info[osName][archName] = archInfo
+	}
+
+	return nil
+}
+
+// getArchInfo returns compatibility info for a specific OS and architecture.
+func getArchInfo(osName string, archName string) (ArchInfo, error) {
+	info := ArchInfo{}
+	info.Syscalls = make(map[uintptr]SyscallDoc)
+
+	t, ok := syscallTableMap[osName][archName]
+	if !ok {
+		return info, fmt.Errorf("syscall table for %s/%s not found", osName, archName)
+	}
+
+	for num, sc := range t.Table {
+		info.Syscalls[num] = SyscallDoc{
+			Name:    sc.Name,
+			num:     num,
+			Support: sc.SupportLevel.String(),
+			Note:    sc.Note,
+			URLs:    sc.URLs,
+		}
+	}
+
+	return info, nil
+}
+
+// outputTable outputs the syscall info in tabular format.
+func outputTable(w io.Writer, info CompatibilityInfo) error {
+	tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0)
+
+	// Linux
+	for osName, osInfo := range info {
+		for archName, archInfo := range osInfo {
+			// Print the OS/arch
+			fmt.Fprintf(w, "%s/%s:\n\n", osName, archName)
+
+			// Sort the syscalls for output in the table.
+			sortedCalls := []SyscallDoc{}
+			for _, sc := range archInfo.Syscalls {
+				sortedCalls = append(sortedCalls, sc)
+			}
+			sort.Slice(sortedCalls, func(i, j int) bool {
+				return sortedCalls[i].num < sortedCalls[j].num
+			})
+
+			// Write the header
+			_, err := fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n",
+				"NUM",
+				"NAME",
+				"SUPPORT",
+				"NOTE",
+			)
+			if err != nil {
+				return err
+			}
+
+			// Write each syscall entry
+			for _, sc := range sortedCalls {
+				_, err = fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n",
+					strconv.FormatInt(int64(sc.num), 10),
+					sc.Name,
+					sc.Support,
+					sc.Note,
+				)
+				if err != nil {
+					return err
+				}
+				// Add issue urls to note.
+				for _, url := range sc.URLs {
+					_, err = fmt.Fprintf(tw, "%s\t%s\t%s\tSee: %s\t\n",
+						"",
+						"",
+						"",
+						url,
+					)
+					if err != nil {
+						return err
+					}
+				}
+			}
+
+			err = tw.Flush()
+			if err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+// outputJSON outputs the syscall info in JSON format.
+func outputJSON(w io.Writer, info CompatibilityInfo) error {
+	e := json.NewEncoder(w)
+	e.SetIndent("", "  ")
+	return e.Encode(info)
+}
+
+// numberedRow is aCSV row annotated by syscall number (used for sorting)
+type numberedRow struct {
+	num uintptr
+	row []string
+}
+
+// outputCSV outputs the syscall info in tabular format.
+func outputCSV(w io.Writer, info CompatibilityInfo) error {
+	csvWriter := csv.NewWriter(w)
+
+	// Linux
+	for osName, osInfo := range info {
+		for archName, archInfo := range osInfo {
+			// Sort the syscalls for output in the table.
+			sortedCalls := []numberedRow{}
+			for _, sc := range archInfo.Syscalls {
+				// Add issue urls to note.
+				note := sc.Note
+				for _, url := range sc.URLs {
+					note = fmt.Sprintf("%s\nSee: %s", note, url)
+				}
+
+				sortedCalls = append(sortedCalls, numberedRow{
+					num: sc.num,
+					row: []string{
+						osName,
+						archName,
+						strconv.FormatInt(int64(sc.num), 10),
+						sc.Name,
+						sc.Support,
+						note,
+					},
+				})
+			}
+			sort.Slice(sortedCalls, func(i, j int) bool {
+				return sortedCalls[i].num < sortedCalls[j].num
+			})
+
+			// Write the header
+			err := csvWriter.Write([]string{
+				"OS",
+				"Arch",
+				"Num",
+				"Name",
+				"Support",
+				"Note",
+			})
+			if err != nil {
+				return err
+			}
+
+			// Write each syscall entry
+			for _, sc := range sortedCalls {
+				err = csvWriter.Write(sc.row)
+				if err != nil {
+					return err
+				}
+			}
+
+			csvWriter.Flush()
+			err = csvWriter.Error()
+			if err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go
index a55a682f3..58fd01974 100644
--- a/runsc/cmd/wait.go
+++ b/runsc/cmd/wait.go
@@ -88,14 +88,14 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		waitStatus = ws
 	// Wait on a PID in the root PID namespace.
 	case wt.rootPID != unsetPID:
-		ws, err := c.WaitRootPID(int32(wt.rootPID), true /* clearStatus */)
+		ws, err := c.WaitRootPID(int32(wt.rootPID))
 		if err != nil {
 			Fatalf("waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err)
 		}
 		waitStatus = ws
 	// Wait on a PID in the container's PID namespace.
 	case wt.pid != unsetPID:
-		ws, err := c.WaitPID(int32(wt.pid), true /* clearStatus */)
+		ws, err := c.WaitPID(int32(wt.pid))
 		if err != nil {
 			Fatalf("waiting on PID %d in container %q: %v", wt.pid, c.ID, err)
 		}
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index b8af27c15..d016533e6 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -258,7 +258,7 @@ func TestJobControlSignalExec(t *testing.T) {
 	}
 
 	// Make sure the process indicates it was killed by a SIGKILL.
-	ws, err := c.WaitPID(pid, true)
+	ws, err := c.WaitPID(pid)
 	if err != nil {
 		t.Errorf("waiting on container failed: %v", err)
 	}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 513085836..04b611b56 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -530,22 +530,22 @@ func (c *Container) Wait() (syscall.WaitStatus, error) {
 
 // WaitRootPID waits for process 'pid' in the sandbox's PID namespace and
 // returns its WaitStatus.
-func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) {
 	log.Debugf("Wait on PID %d in sandbox %q", pid, c.Sandbox.ID)
 	if !c.isSandboxRunning() {
 		return 0, fmt.Errorf("sandbox is not running")
 	}
-	return c.Sandbox.WaitPID(c.Sandbox.ID, pid, clearStatus)
+	return c.Sandbox.WaitPID(c.Sandbox.ID, pid)
 }
 
 // WaitPID waits for process 'pid' in the container's PID namespace and returns
 // its WaitStatus.
-func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) {
 	log.Debugf("Wait on PID %d in container %q", pid, c.ID)
 	if !c.isSandboxRunning() {
 		return 0, fmt.Errorf("sandbox is not running")
 	}
-	return c.Sandbox.WaitPID(c.ID, pid, clearStatus)
+	return c.Sandbox.WaitPID(c.ID, pid)
 }
 
 // SignalContainer sends the signal to the container. If all is true and signal
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index dcd9910a0..867bf8187 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -36,6 +36,7 @@ import (
 	"gvisor.googlesource.com/gvisor/pkg/sentry/control"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.googlesource.com/gvisor/runsc/boot"
+	"gvisor.googlesource.com/gvisor/runsc/specutils"
 	"gvisor.googlesource.com/gvisor/runsc/test/testutil"
 )
 
@@ -1841,7 +1842,7 @@ func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus,
 	if err != nil {
 		return 0, fmt.Errorf("error executing: %v", err)
 	}
-	ws, err := cont.WaitPID(pid, true /* clearStatus */)
+	ws, err := cont.WaitPID(pid)
 	if err != nil {
 		return 0, fmt.Errorf("error waiting: %v", err)
 	}
@@ -1853,7 +1854,7 @@ func TestMain(m *testing.M) {
 	if err := testutil.ConfigureExePath(); err != nil {
 		panic(err.Error())
 	}
-	testutil.RunAsRoot()
+	specutils.MaybeRunAsRoot()
 
 	os.Exit(m.Run())
 }
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 39c4dc03d..d57a73d46 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -99,6 +99,36 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
 	return containers, cleanup, nil
 }
 
+type execDesc struct {
+	c    *Container
+	cmd  []string
+	want int
+	desc string
+}
+
+func execMany(execs []execDesc) error {
+	for _, exec := range execs {
+		args := &control.ExecArgs{Argv: exec.cmd}
+		if ws, err := exec.c.executeSync(args); err != nil {
+			return fmt.Errorf("error executing %+v: %v", args, err)
+		} else if ws.ExitStatus() != exec.want {
+			return fmt.Errorf("%q: exec %q got exit status: %d, want: %d", exec.desc, exec.cmd, ws.ExitStatus(), exec.want)
+		}
+	}
+	return nil
+}
+
+func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
+	for _, spec := range pod {
+		spec.Annotations[path.Join(boot.MountPrefix, name, "source")] = mount.Source
+		spec.Annotations[path.Join(boot.MountPrefix, name, "type")] = mount.Type
+		spec.Annotations[path.Join(boot.MountPrefix, name, "share")] = "pod"
+		if len(mount.Options) > 0 {
+			spec.Annotations[path.Join(boot.MountPrefix, name, "options")] = strings.Join(mount.Options, ",")
+		}
+	}
+}
+
 // TestMultiContainerSanity checks that it is possible to run 2 dead-simple
 // containers in the same sandbox.
 func TestMultiContainerSanity(t *testing.T) {
@@ -175,12 +205,12 @@ func TestMultiContainerWait(t *testing.T) {
 		go func(c *Container) {
 			defer wg.Done()
 			const pid = 2
-			if ws, err := c.WaitPID(pid, true /* clearStatus */); err != nil {
+			if ws, err := c.WaitPID(pid); err != nil {
 				t.Errorf("failed to wait for PID %d: %v", pid, err)
 			} else if es := ws.ExitStatus(); es != 0 {
 				t.Errorf("PID %d exited with non-zero status %d", pid, es)
 			}
-			if _, err := c.WaitPID(pid, true /* clearStatus */); err == nil {
+			if _, err := c.WaitPID(pid); err == nil {
 				t.Errorf("wait for stopped PID %d should fail", pid)
 			}
 		}(containers[1])
@@ -263,12 +293,12 @@ func TestExecWait(t *testing.T) {
 	}
 
 	// Get the exit status from the exec'd process.
-	if ws, err := containers[0].WaitPID(pid, true /* clearStatus */); err != nil {
+	if ws, err := containers[0].WaitPID(pid); err != nil {
 		t.Fatalf("failed to wait for process %+v with pid %d: %v", args, pid, err)
 	} else if es := ws.ExitStatus(); es != 0 {
 		t.Fatalf("process %+v exited with non-zero status %d", args, es)
 	}
-	if _, err := containers[0].WaitPID(pid, true /* clearStatus */); err == nil {
+	if _, err := containers[0].WaitPID(pid); err == nil {
 		t.Fatalf("wait for stopped process %+v should fail", args)
 	}
 }
@@ -828,3 +858,272 @@ func TestMultiContainerGoferStop(t *testing.T) {
 		}
 	}
 }
+
+// Test that pod shared mounts are properly mounted in 2 containers and that
+// changes from one container is reflected in the other.
+func TestMultiContainerSharedMount(t *testing.T) {
+	for _, conf := range configs(all...) {
+		t.Logf("Running test with conf: %+v", conf)
+
+		// Setup the containers.
+		sleep := []string{"sleep", "100"}
+		podSpec, ids := createSpecs(sleep, sleep)
+		mnt0 := specs.Mount{
+			Destination: "/mydir/test",
+			Source:      "/some/dir",
+			Type:        "tmpfs",
+			Options:     nil,
+		}
+		podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+
+		mnt1 := mnt0
+		mnt1.Destination = "/mydir2/test2"
+		podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+
+		createSharedMount(mnt0, "test-mount", podSpec...)
+
+		containers, cleanup, err := startContainers(conf, podSpec, ids)
+		if err != nil {
+			t.Fatalf("error starting containers: %v", err)
+		}
+		defer cleanup()
+
+		file0 := path.Join(mnt0.Destination, "abc")
+		file1 := path.Join(mnt1.Destination, "abc")
+		execs := []execDesc{
+			{
+				c:    containers[0],
+				cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
+				desc: "directory is mounted in container0",
+			},
+			{
+				c:    containers[1],
+				cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
+				desc: "directory is mounted in container1",
+			},
+			{
+				c:    containers[0],
+				cmd:  []string{"/usr/bin/touch", file0},
+				desc: "create file in container0",
+			},
+			{
+				c:    containers[0],
+				cmd:  []string{"/usr/bin/test", "-f", file0},
+				desc: "file appears in container0",
+			},
+			{
+				c:    containers[1],
+				cmd:  []string{"/usr/bin/test", "-f", file1},
+				desc: "file appears in container1",
+			},
+			{
+				c:    containers[1],
+				cmd:  []string{"/bin/rm", file1},
+				desc: "file removed from container1",
+			},
+			{
+				c:    containers[0],
+				cmd:  []string{"/usr/bin/test", "!", "-f", file0},
+				desc: "file removed from container0",
+			},
+			{
+				c:    containers[1],
+				cmd:  []string{"/usr/bin/test", "!", "-f", file1},
+				desc: "file removed from container1",
+			},
+			{
+				c:    containers[1],
+				cmd:  []string{"/bin/mkdir", file1},
+				desc: "create directory in container1",
+			},
+			{
+				c:    containers[0],
+				cmd:  []string{"/usr/bin/test", "-d", file0},
+				desc: "dir appears in container0",
+			},
+			{
+				c:    containers[1],
+				cmd:  []string{"/usr/bin/test", "-d", file1},
+				desc: "dir appears in container1",
+			},
+			{
+				c:    containers[0],
+				cmd:  []string{"/bin/rmdir", file0},
+				desc: "create directory in container0",
+			},
+			{
+				c:    containers[0],
+				cmd:  []string{"/usr/bin/test", "!", "-d", file0},
+				desc: "dir removed from container0",
+			},
+			{
+				c:    containers[1],
+				cmd:  []string{"/usr/bin/test", "!", "-d", file1},
+				desc: "dir removed from container1",
+			},
+		}
+		if err := execMany(execs); err != nil {
+			t.Fatal(err.Error())
+		}
+	}
+}
+
+// Test that pod mounts are mounted as readonly when requested.
+func TestMultiContainerSharedMountReadonly(t *testing.T) {
+	for _, conf := range configs(all...) {
+		t.Logf("Running test with conf: %+v", conf)
+
+		// Setup the containers.
+		sleep := []string{"sleep", "100"}
+		podSpec, ids := createSpecs(sleep, sleep)
+		mnt0 := specs.Mount{
+			Destination: "/mydir/test",
+			Source:      "/some/dir",
+			Type:        "tmpfs",
+			Options:     []string{"ro"},
+		}
+		podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+
+		mnt1 := mnt0
+		mnt1.Destination = "/mydir2/test2"
+		podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+
+		createSharedMount(mnt0, "test-mount", podSpec...)
+
+		containers, cleanup, err := startContainers(conf, podSpec, ids)
+		if err != nil {
+			t.Fatalf("error starting containers: %v", err)
+		}
+		defer cleanup()
+
+		file0 := path.Join(mnt0.Destination, "abc")
+		file1 := path.Join(mnt1.Destination, "abc")
+		execs := []execDesc{
+			{
+				c:    containers[0],
+				cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
+				desc: "directory is mounted in container0",
+			},
+			{
+				c:    containers[1],
+				cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
+				desc: "directory is mounted in container1",
+			},
+			{
+				c:    containers[0],
+				cmd:  []string{"/usr/bin/touch", file0},
+				want: 1,
+				desc: "fails to write to container0",
+			},
+			{
+				c:    containers[1],
+				cmd:  []string{"/usr/bin/touch", file1},
+				want: 1,
+				desc: "fails to write to container1",
+			},
+		}
+		if err := execMany(execs); err != nil {
+			t.Fatal(err.Error())
+		}
+	}
+}
+
+// Test that shared pod mounts continue to work after container is restarted.
+func TestMultiContainerSharedMountRestart(t *testing.T) {
+	for _, conf := range configs(all...) {
+		t.Logf("Running test with conf: %+v", conf)
+
+		// Setup the containers.
+		sleep := []string{"sleep", "100"}
+		podSpec, ids := createSpecs(sleep, sleep)
+		mnt0 := specs.Mount{
+			Destination: "/mydir/test",
+			Source:      "/some/dir",
+			Type:        "tmpfs",
+			Options:     nil,
+		}
+		podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+
+		mnt1 := mnt0
+		mnt1.Destination = "/mydir2/test2"
+		podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+
+		createSharedMount(mnt0, "test-mount", podSpec...)
+
+		containers, cleanup, err := startContainers(conf, podSpec, ids)
+		if err != nil {
+			t.Fatalf("error starting containers: %v", err)
+		}
+		defer cleanup()
+
+		file0 := path.Join(mnt0.Destination, "abc")
+		file1 := path.Join(mnt1.Destination, "abc")
+		execs := []execDesc{
+			{
+				c:    containers[0],
+				cmd:  []string{"/usr/bin/touch", file0},
+				desc: "create file in container0",
+			},
+			{
+				c:    containers[0],
+				cmd:  []string{"/usr/bin/test", "-f", file0},
+				desc: "file appears in container0",
+			},
+			{
+				c:    containers[1],
+				cmd:  []string{"/usr/bin/test", "-f", file1},
+				desc: "file appears in container1",
+			},
+		}
+		if err := execMany(execs); err != nil {
+			t.Fatal(err.Error())
+		}
+
+		containers[1].Destroy()
+
+		bundleDir, err := testutil.SetupBundleDir(podSpec[1])
+		if err != nil {
+			t.Fatalf("error restarting container: %v", err)
+		}
+		defer os.RemoveAll(bundleDir)
+
+		containers[1], err = Create(ids[1], podSpec[1], conf, bundleDir, "", "", "")
+		if err != nil {
+			t.Fatalf("error creating container: %v", err)
+		}
+		if err := containers[1].Start(conf); err != nil {
+			t.Fatalf("error starting container: %v", err)
+		}
+
+		execs = []execDesc{
+			{
+				c:    containers[0],
+				cmd:  []string{"/usr/bin/test", "-f", file0},
+				desc: "file is still in container0",
+			},
+			{
+				c:    containers[1],
+				cmd:  []string{"/usr/bin/test", "-f", file1},
+				desc: "file is still in container1",
+			},
+			{
+				c:    containers[1],
+				cmd:  []string{"/bin/rm", file1},
+				desc: "file removed from container1",
+			},
+			{
+				c:    containers[0],
+				cmd:  []string{"/usr/bin/test", "!", "-f", file0},
+				desc: "file removed from container0",
+			},
+			{
+				c:    containers[1],
+				cmd:  []string{"/usr/bin/test", "!", "-f", file1},
+				desc: "file removed from container1",
+			},
+		}
+		if err := execMany(execs); err != nil {
+			t.Fatal(err.Error())
+		}
+	}
+}
diff --git a/runsc/main.go b/runsc/main.go
index 11bc73f75..cfe3a78d0 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -48,11 +48,12 @@ var (
 	// system that are not covered by the runtime spec.
 
 	// Debugging flags.
-	debugLog       = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
-	logPackets     = flag.Bool("log-packets", false, "enable network packet logging")
-	logFD          = flag.Int("log-fd", -1, "file descriptor to log to.  If set, the 'log' flag is ignored.")
-	debugLogFD     = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to.  If set, the 'debug-log-dir' flag is ignored.")
-	debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s")
+	debugLog        = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
+	logPackets      = flag.Bool("log-packets", false, "enable network packet logging")
+	logFD           = flag.Int("log-fd", -1, "file descriptor to log to.  If set, the 'log' flag is ignored.")
+	debugLogFD      = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to.  If set, the 'debug-log-dir' flag is ignored.")
+	debugLogFormat  = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s")
+	alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr")
 
 	// Debugging flags: strace related
 	strace         = flag.Bool("strace", false, "enable strace")
@@ -60,22 +61,27 @@ var (
 	straceLogSize  = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs")
 
 	// Flags that control sandbox runtime behavior.
-	platform       = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
-	network        = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
-	gso            = flag.Bool("gso", true, "enable generic segmenation offload")
-	fileAccess     = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
-	overlay        = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
-	watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
-	panicSignal    = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
-	profile        = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
-	netRaw         = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
-
+	platform           = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
+	network            = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
+	gso                = flag.Bool("gso", true, "enable generic segmenation offload")
+	fileAccess         = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
+	overlay            = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+	watchdogAction     = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
+	panicSignal        = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
+	profile            = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
+	netRaw             = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
+	numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
+	rootless           = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
+
+	// Test flags, not to be used outside tests, ever.
 	testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
 )
 
 func main() {
 	// Help and flags commands are generated automatically.
-	subcommands.Register(subcommands.HelpCommand(), "")
+	help := cmd.NewHelp(subcommands.DefaultCommander)
+	help.Register(new(cmd.Syscalls))
+	subcommands.Register(help, "")
 	subcommands.Register(subcommands.FlagsCommand(), "")
 
 	// Register user-facing runsc commands.
@@ -117,6 +123,22 @@ func main() {
 		os.Exit(0)
 	}
 
+	var errorLogger io.Writer
+	if *logFD > -1 {
+		errorLogger = os.NewFile(uintptr(*logFD), "error log file")
+
+	} else if *logFilename != "" {
+		// We must set O_APPEND and not O_TRUNC because Docker passes
+		// the same log file for all commands (and also parses these
+		// log files), so we can't destroy them on each command.
+		var err error
+		errorLogger, err = os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
+		if err != nil {
+			cmd.Fatalf("error opening log file %q: %v", *logFilename, err)
+		}
+	}
+	cmd.ErrorLogger = errorLogger
+
 	platformType, err := boot.MakePlatformType(*platform)
 	if err != nil {
 		cmd.Fatalf("%v", err)
@@ -141,26 +163,33 @@ func main() {
 		cmd.Fatalf("%v", err)
 	}
 
+	if *numNetworkChannels <= 0 {
+		cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels)
+	}
+
 	// Create a new Config from the flags.
 	conf := &boot.Config{
-		RootDir:        *rootDir,
-		Debug:          *debug,
-		LogFilename:    *logFilename,
-		LogFormat:      *logFormat,
-		DebugLog:       *debugLog,
-		DebugLogFormat: *debugLogFormat,
-		FileAccess:     fsAccess,
-		Overlay:        *overlay,
-		Network:        netType,
-		GSO:            *gso,
-		LogPackets:     *logPackets,
-		Platform:       platformType,
-		Strace:         *strace,
-		StraceLogSize:  *straceLogSize,
-		WatchdogAction: wa,
-		PanicSignal:    *panicSignal,
-		ProfileEnable:  *profile,
-		EnableRaw:      *netRaw,
+		RootDir:            *rootDir,
+		Debug:              *debug,
+		LogFilename:        *logFilename,
+		LogFormat:          *logFormat,
+		DebugLog:           *debugLog,
+		DebugLogFormat:     *debugLogFormat,
+		FileAccess:         fsAccess,
+		Overlay:            *overlay,
+		Network:            netType,
+		GSO:                *gso,
+		LogPackets:         *logPackets,
+		Platform:           platformType,
+		Strace:             *strace,
+		StraceLogSize:      *straceLogSize,
+		WatchdogAction:     wa,
+		PanicSignal:        *panicSignal,
+		ProfileEnable:      *profile,
+		EnableRaw:          *netRaw,
+		NumNetworkChannels: *numNetworkChannels,
+		Rootless:           *rootless,
+
 		TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
 	}
 	if len(*straceSyscalls) != 0 {
@@ -174,24 +203,7 @@ func main() {
 
 	subcommand := flag.CommandLine.Arg(0)
 
-	var logFile io.Writer = os.Stderr
-	if *logFD > -1 {
-		logFile = os.NewFile(uintptr(*logFD), "log file")
-	} else if *logFilename != "" {
-		// We must set O_APPEND and not O_TRUNC because Docker passes
-		// the same log file for all commands (and also parses these
-		// log files), so we can't destroy them on each command.
-		f, err := os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
-		if err != nil {
-			cmd.Fatalf("error opening log file %q: %v", *logFilename, err)
-		}
-		logFile = f
-	} else if subcommand == "do" {
-		logFile = ioutil.Discard
-	}
-
-	e := newEmitter(*logFormat, logFile)
-
+	var e log.Emitter
 	if *debugLogFD > -1 {
 		f := os.NewFile(uintptr(*debugLogFD), "debug log file")
 
@@ -201,28 +213,31 @@ func main() {
 			cmd.Fatalf("flag --debug-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand)
 		}
 
-		// If we are the boot process, then we own our stdio FDs and
-		// can do what we want with them. Since Docker and Containerd
-		// both eat boot's stderr, we dup our stderr to the provided
-		// log FD so that panics will appear in the logs, rather than
-		// just disappear.
+		// If we are the boot process, then we own our stdio FDs and can do what we
+		// want with them. Since Docker and Containerd both eat boot's stderr, we
+		// dup our stderr to the provided log FD so that panics will appear in the
+		// logs, rather than just disappear.
 		if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil {
 			cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err)
 		}
 
-		if logFile == os.Stderr {
-			// Suppress logging to stderr when debug log is enabled. Otherwise all
-			// messages will be duplicated in the debug log (see Dup2() call above).
-			e = newEmitter(*debugLogFormat, f)
-		} else {
-			e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)}
-		}
+		e = newEmitter(*debugLogFormat, f)
+
 	} else if *debugLog != "" {
 		f, err := specutils.DebugLogFile(*debugLog, subcommand)
 		if err != nil {
 			cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err)
 		}
-		e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)}
+		e = newEmitter(*debugLogFormat, f)
+
+	} else {
+		// Stderr is reserved for the application, just discard the logs if no debug
+		// log is specified.
+		e = newEmitter("text", ioutil.Discard)
+	}
+
+	if *alsoLogToStderr {
+		e = log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}
 	}
 
 	log.SetTarget(e)
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 0460d5f1a..e9e24fc58 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -68,7 +68,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
 		// Build the path to the net namespace of the sandbox process.
 		// This is what we will copy.
 		nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
-		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO); err != nil {
+		if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO, conf.NumNetworkChannels); err != nil {
 			return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
 		}
 	case boot.NetworkHost:
@@ -138,7 +138,7 @@ func isRootNS() (bool, error) {
 // createInterfacesAndRoutesFromNS scrapes the interface and routes from the
 // net namespace with the given path, creates them in the sandbox, and removes
 // them from the host.
-func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool) error {
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool, numNetworkChannels int) error {
 	// Join the network namespace that we will be copying.
 	restore, err := joinNetNS(nsPath)
 	if err != nil {
@@ -202,25 +202,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
 			continue
 		}
 
-		// Create the socket.
-		const protocol = 0x0300 // htons(ETH_P_ALL)
-		fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
-		if err != nil {
-			return fmt.Errorf("unable to create raw socket: %v", err)
-		}
-		deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
-
-		// Bind to the appropriate device.
-		ll := syscall.SockaddrLinklayer{
-			Protocol: protocol,
-			Ifindex:  iface.Index,
-			Hatype:   0, // No ARP type.
-			Pkttype:  syscall.PACKET_OTHERHOST,
-		}
-		if err := syscall.Bind(fd, &ll); err != nil {
-			return fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
-		}
-
 		// Scrape the routes before removing the address, since that
 		// will remove the routes as well.
 		routes, def, err := routesForIface(iface)
@@ -236,9 +217,10 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
 		}
 
 		link := boot.FDBasedLink{
-			Name:   iface.Name,
-			MTU:    iface.MTU,
-			Routes: routes,
+			Name:        iface.Name,
+			MTU:         iface.MTU,
+			Routes:      routes,
+			NumChannels: numNetworkChannels,
 		}
 
 		// Get the link for the interface.
@@ -246,32 +228,25 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
 		if err != nil {
 			return fmt.Errorf("getting link for interface %q: %v", iface.Name, err)
 		}
-		link.LinkAddress = []byte(ifaceLink.Attrs().HardwareAddr)
+		link.LinkAddress = ifaceLink.Attrs().HardwareAddr
 
-		if enableGSO {
-			gso, err := isGSOEnabled(fd, iface.Name)
+		log.Debugf("Setting up network channels")
+		// Create the socket for the device.
+		for i := 0; i < link.NumChannels; i++ {
+			log.Debugf("Creating Channel %d", i)
+			socketEntry, err := createSocket(iface, ifaceLink, enableGSO)
 			if err != nil {
-				return fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
+				return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err)
 			}
-			if gso {
-				if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
-					return fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
-				}
-				link.GSOMaxSize = ifaceLink.Attrs().GSOMaxSize
+			if i == 0 {
+				link.GSOMaxSize = socketEntry.gsoMaxSize
 			} else {
-				log.Infof("GSO not available in host.")
+				if link.GSOMaxSize != socketEntry.gsoMaxSize {
+					return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s",
+						link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name)
+				}
 			}
-		}
-
-		// Use SO_RCVBUFFORCE because on linux the receive buffer for an
-		// AF_PACKET socket is capped by "net.core.rmem_max". rmem_max
-		// defaults to a unusually low value of 208KB. This is too low
-		// for gVisor to be able to receive packets at high throughputs
-		// without incurring packet drops.
-		const rcvBufSize = 4 << 20 // 4MB.
-
-		if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil {
-			return fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err)
+			args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
 		}
 
 		// Collect the addresses for the interface, enable forwarding,
@@ -285,7 +260,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
 			}
 		}
 
-		args.FilePayload.Files = append(args.FilePayload.Files, deviceFile)
 		args.FDBasedLinks = append(args.FDBasedLinks, link)
 	}
 
@@ -296,6 +270,61 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
 	return nil
 }
 
+type socketEntry struct {
+	deviceFile *os.File
+	gsoMaxSize uint32
+}
+
+// createSocket creates an underlying AF_PACKET socket and configures it for use by
+// the sentry and returns an *os.File that wraps the underlying socket fd.
+func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) {
+	// Create the socket.
+	const protocol = 0x0300 // htons(ETH_P_ALL)
+	fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
+	if err != nil {
+		return nil, fmt.Errorf("unable to create raw socket: %v", err)
+	}
+	deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
+	// Bind to the appropriate device.
+	ll := syscall.SockaddrLinklayer{
+		Protocol: protocol,
+		Ifindex:  iface.Index,
+		Hatype:   0, // No ARP type.
+		Pkttype:  syscall.PACKET_OTHERHOST,
+	}
+	if err := syscall.Bind(fd, &ll); err != nil {
+		return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
+	}
+
+	gsoMaxSize := uint32(0)
+	if enableGSO {
+		gso, err := isGSOEnabled(fd, iface.Name)
+		if err != nil {
+			return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
+		}
+		if gso {
+			if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
+				return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
+			}
+			gsoMaxSize = ifaceLink.Attrs().GSOMaxSize
+		} else {
+			log.Infof("GSO not available in host.")
+		}
+	}
+
+	// Use SO_RCVBUFFORCE because on linux the receive buffer for an
+	// AF_PACKET socket is capped by "net.core.rmem_max". rmem_max
+	// defaults to a unusually low value of 208KB. This is too low
+	// for gVisor to be able to receive packets at high throughputs
+	// without incurring packet drops.
+	const rcvBufSize = 4 << 20 // 4MB.
+
+	if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil {
+		return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err)
+	}
+	return &socketEntry{deviceFile, gsoMaxSize}, nil
+}
+
 // loopbackLinks collects the links for a loopback interface.
 func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) {
 	var links []boot.LoopbackLink
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 47a66afb2..5ff6f879c 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -515,46 +515,64 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund
 		} else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) {
 			log.Infof("Sandbox will be started in new user namespace")
 			nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
+			cmd.Args = append(cmd.Args, "--setup-root")
 
-			// Map nobody in the new namespace to nobody in the parent namespace.
-			//
-			// A sandbox process will construct an empty
-			// root for itself, so it has to have the CAP_SYS_ADMIN
-			// capability.
-			//
-			// FIXME(b/122554829): The current implementations of
-			// os/exec doesn't allow to set ambient capabilities if
-			// a process is started in a new user namespace. As a
-			// workaround, we start the sandbox process with the 0
-			// UID and then it constructs a chroot and sets UID to
-			// nobody.  https://github.com/golang/go/issues/2315
-			const nobody = 65534
-			cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
-				{
-					ContainerID: int(0),
-					HostID:      int(nobody - 1),
-					Size:        int(1),
-				},
-				{
-					ContainerID: int(nobody),
-					HostID:      int(nobody),
-					Size:        int(1),
-				},
-			}
-			cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
-				{
-					ContainerID: int(nobody),
-					HostID:      int(nobody),
-					Size:        int(1),
-				},
+			if conf.Rootless {
+				log.Infof("Rootless mode: sandbox will run as root inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
+				cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
+					{
+						ContainerID: 0,
+						HostID:      os.Getuid(),
+						Size:        1,
+					},
+				}
+				cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
+					{
+						ContainerID: 0,
+						HostID:      os.Getgid(),
+						Size:        1,
+					},
+				}
+				cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
+
+			} else {
+				// Map nobody in the new namespace to nobody in the parent namespace.
+				//
+				// A sandbox process will construct an empty
+				// root for itself, so it has to have the CAP_SYS_ADMIN
+				// capability.
+				//
+				// FIXME(b/122554829): The current implementations of
+				// os/exec doesn't allow to set ambient capabilities if
+				// a process is started in a new user namespace. As a
+				// workaround, we start the sandbox process with the 0
+				// UID and then it constructs a chroot and sets UID to
+				// nobody.  https://github.com/golang/go/issues/2315
+				const nobody = 65534
+				cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
+					{
+						ContainerID: 0,
+						HostID:      nobody - 1,
+						Size:        1,
+					},
+					{
+						ContainerID: nobody,
+						HostID:      nobody,
+						Size:        1,
+					},
+				}
+				cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
+					{
+						ContainerID: nobody,
+						HostID:      nobody,
+						Size:        1,
+					},
+				}
+
+				// Set credentials to run as user and group nobody.
+				cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: nobody}
 			}
 
-			// Set credentials to run as user and group nobody.
-			cmd.SysProcAttr.Credential = &syscall.Credential{
-				Uid: 0,
-				Gid: nobody,
-			}
-			cmd.Args = append(cmd.Args, "--setup-root")
 		} else {
 			return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
 		}
@@ -649,7 +667,7 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
 
 // WaitPID waits for process 'pid' in the container's sandbox and returns its
 // WaitStatus.
-func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+func (s *Sandbox) WaitPID(cid string, pid int32) (syscall.WaitStatus, error) {
 	log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID)
 	var ws syscall.WaitStatus
 	conn, err := s.sandboxConnect()
@@ -659,9 +677,8 @@ func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.Wait
 	defer conn.Close()
 
 	args := &boot.WaitPIDArgs{
-		PID:         pid,
-		CID:         cid,
-		ClearStatus: clearStatus,
+		PID: pid,
+		CID: cid,
 	}
 	if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil {
 		return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %v", pid, s.ID, err)
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
index 15476de6f..0456e4c4f 100644
--- a/runsc/specutils/BUILD
+++ b/runsc/specutils/BUILD
@@ -10,10 +10,7 @@ go_library(
         "specutils.go",
     ],
     importpath = "gvisor.googlesource.com/gvisor/runsc/specutils",
-    visibility = [
-        "//runsc:__subpackages__",
-        "//test:__subpackages__",
-    ],
+    visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/log",
diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go
index 1f3afb4e4..6e6902e9f 100644
--- a/runsc/specutils/fs.go
+++ b/runsc/specutils/fs.go
@@ -16,6 +16,7 @@ package specutils
 
 import (
 	"fmt"
+	"math/bits"
 	"path"
 	"syscall"
 
@@ -105,22 +106,30 @@ func optionsToFlags(opts []string, source map[string]mapping) uint32 {
 	return rv
 }
 
-// ValidateMount validates that spec mounts are correct.
+// validateMount validates that spec mounts are correct.
 func validateMount(mnt *specs.Mount) error {
 	if !path.IsAbs(mnt.Destination) {
 		return fmt.Errorf("Mount.Destination must be an absolute path: %v", mnt)
 	}
-
 	if mnt.Type == "bind" {
-		for _, o := range mnt.Options {
-			if ContainsStr(invalidOptions, o) {
-				return fmt.Errorf("mount option %q is not supported: %v", o, mnt)
-			}
-			_, ok1 := optionsMap[o]
-			_, ok2 := propOptionsMap[o]
-			if !ok1 && !ok2 {
-				return fmt.Errorf("unknown mount option %q", o)
-			}
+		return ValidateMountOptions(mnt.Options)
+	}
+	return nil
+}
+
+// ValidateMountOptions validates that mount options are correct.
+func ValidateMountOptions(opts []string) error {
+	for _, o := range opts {
+		if ContainsStr(invalidOptions, o) {
+			return fmt.Errorf("mount option %q is not supported", o)
+		}
+		_, ok1 := optionsMap[o]
+		_, ok2 := propOptionsMap[o]
+		if !ok1 && !ok2 {
+			return fmt.Errorf("unknown mount option %q", o)
+		}
+		if err := validatePropagation(o); err != nil {
+			return err
 		}
 	}
 	return nil
@@ -133,5 +142,14 @@ func validateRootfsPropagation(opt string) error {
 	if flags&(syscall.MS_SLAVE|syscall.MS_PRIVATE) == 0 {
 		return fmt.Errorf("root mount propagation option must specify private or slave: %q", opt)
 	}
+	return validatePropagation(opt)
+}
+
+func validatePropagation(opt string) error {
+	flags := PropOptionsToFlags([]string{opt})
+	exclusive := flags & (syscall.MS_SLAVE | syscall.MS_PRIVATE | syscall.MS_SHARED | syscall.MS_UNBINDABLE)
+	if bits.OnesCount32(exclusive) > 1 {
+		return fmt.Errorf("mount propagation options are mutually exclusive: %q", opt)
+	}
 	return nil
 }
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
index 7d194335c..06c13d1ab 100644
--- a/runsc/specutils/namespace.go
+++ b/runsc/specutils/namespace.go
@@ -220,3 +220,55 @@ func HasCapabilities(cs ...capability.Cap) bool {
 	}
 	return true
 }
+
+// MaybeRunAsRoot ensures the process runs with capabilities needed to create a
+// sandbox, e.g. CAP_SYS_ADMIN, CAP_SYS_CHROOT, etc. If capabilities are needed,
+// it will create a new user namespace and re-execute the process as root
+// inside the namespace with the same arguments and environment.
+//
+// This function returns immediately when no new capability is needed. If
+// another process is executed, it returns straight from here with the same exit
+// code as the child.
+func MaybeRunAsRoot() error {
+	if HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT, capability.CAP_SETUID, capability.CAP_SETGID) {
+		return nil
+	}
+
+	// Current process doesn't have required capabilities, create user namespace
+	// and run as root inside the namespace to acquire capabilities.
+	log.Infof("*** Re-running as root in new user namespace ***")
+
+	cmd := exec.Command("/proc/self/exe", os.Args[1:]...)
+
+	cmd.SysProcAttr = &syscall.SysProcAttr{
+		Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
+		// Set current user/group as root inside the namespace. Since we may not
+		// have CAP_SETUID/CAP_SETGID, just map root to the current user/group.
+		UidMappings: []syscall.SysProcIDMap{
+			{ContainerID: 0, HostID: os.Getuid(), Size: 1},
+		},
+		GidMappings: []syscall.SysProcIDMap{
+			{ContainerID: 0, HostID: os.Getgid(), Size: 1},
+		},
+		Credential:                 &syscall.Credential{Uid: 0, Gid: 0},
+		GidMappingsEnableSetgroups: false,
+	}
+
+	cmd.Env = os.Environ()
+	cmd.Stdin = os.Stdin
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	if err := cmd.Run(); err != nil {
+		if exit, ok := err.(*exec.ExitError); ok {
+			if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
+				os.Exit(ws.ExitStatus())
+			}
+			log.Warningf("No wait status provided, exiting with -1: %v", err)
+			os.Exit(-1)
+		}
+		return fmt.Errorf("re-executing self: %v", err)
+	}
+	// Child completed with success.
+	os.Exit(0)
+	panic("unreachable")
+}
diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD
index 0c4e4fa80..04ed885c6 100644
--- a/runsc/test/integration/BUILD
+++ b/runsc/test/integration/BUILD
@@ -8,6 +8,7 @@ go_test(
     srcs = [
         "exec_test.go",
         "integration_test.go",
+        "regression_test.go",
     ],
     embed = [":integration"],
     tags = [
diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go
index 7af064d79..7c0e61ac3 100644
--- a/runsc/test/integration/exec_test.go
+++ b/runsc/test/integration/exec_test.go
@@ -29,6 +29,7 @@ package integration
 import (
 	"fmt"
 	"strconv"
+	"strings"
 	"syscall"
 	"testing"
 	"time"
@@ -136,3 +137,25 @@ func TestExecJobControl(t *testing.T) {
 		t.Errorf("ws.ExitedStatus got %d, want %d", got, want)
 	}
 }
+
+// Test that failure to exec returns proper error message.
+func TestExecError(t *testing.T) {
+	if err := testutil.Pull("alpine"); err != nil {
+		t.Fatalf("docker pull failed: %v", err)
+	}
+	d := testutil.MakeDocker("exec-error-test")
+
+	// Start the container.
+	if err := d.Run("alpine", "sleep", "1000"); err != nil {
+		t.Fatalf("docker run failed: %v", err)
+	}
+	defer d.CleanUp()
+
+	_, err := d.Exec("no_can_find")
+	if err == nil {
+		t.Fatalf("docker exec didn't fail")
+	}
+	if want := `error finding executable "no_can_find" in PATH`; !strings.Contains(err.Error(), want) {
+		t.Fatalf("docker exec wrong error, got: %s, want: .*%s.*", err.Error(), want)
+	}
+}
diff --git a/runsc/test/integration/regression_test.go b/runsc/test/integration/regression_test.go
new file mode 100644
index 000000000..80bae9970
--- /dev/null
+++ b/runsc/test/integration/regression_test.go
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package integration
+
+import (
+	"strings"
+	"testing"
+
+	"gvisor.googlesource.com/gvisor/runsc/test/testutil"
+)
+
+// Test that UDS can be created using overlay when parent directory is in lower
+// layer only (b/134090485).
+//
+// Prerequisite: the directory where the socket file is created must not have
+// been open for write before bind(2) is called.
+func TestBindOverlay(t *testing.T) {
+	if err := testutil.Pull("ubuntu:trusty"); err != nil {
+		t.Fatal("docker pull failed:", err)
+	}
+	d := testutil.MakeDocker("bind-overlay-test")
+
+	cmd := "nc -l -U /var/run/sock& sleep 1 && echo foobar-asdf | nc -U /var/run/sock"
+	got, err := d.RunFg("ubuntu:trusty", "bash", "-c", cmd)
+	if err != nil {
+		t.Fatal("docker run failed:", err)
+	}
+
+	if want := "foobar-asdf"; !strings.Contains(got, want) {
+		t.Fatalf("docker run output is missing %q: %s", want, got)
+	}
+	defer d.CleanUp()
+}
diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD
index ddec81444..eedf962a4 100644
--- a/runsc/test/testutil/BUILD
+++ b/runsc/test/testutil/BUILD
@@ -18,6 +18,5 @@ go_library(
         "@com_github_cenkalti_backoff//:go_default_library",
         "@com_github_kr_pty//:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
-        "@com_github_syndtr_gocapability//capability:go_default_library",
     ],
 )
diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go
index 9efb1ba8e..1bd5adc54 100644
--- a/runsc/test/testutil/testutil.go
+++ b/runsc/test/testutil/testutil.go
@@ -30,7 +30,6 @@ import (
 	"os/exec"
 	"os/signal"
 	"path/filepath"
-	"runtime"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -39,7 +38,6 @@ import (
 
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
-	"github.com/syndtr/gocapability/capability"
 	"gvisor.googlesource.com/gvisor/runsc/boot"
 	"gvisor.googlesource.com/gvisor/runsc/specutils"
 )
@@ -136,6 +134,7 @@ func TestConfig() *boot.Config {
 		Strace:     true,
 		FileAccess: boot.FileAccessExclusive,
 		TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
+		NumNetworkChannels:                         1,
 	}
 }
 
@@ -283,54 +282,6 @@ func WaitForHTTP(port int, timeout time.Duration) error {
 	return Poll(cb, timeout)
 }
 
-// RunAsRoot ensures the test runs with CAP_SYS_ADMIN and CAP_SYS_CHROOT. If
-// needed it will create a new user namespace and re-execute the test as root
-// inside of the namespace. This function returns when it's running as root. If
-// it needs to create another process, it will exit from there and not return.
-func RunAsRoot() {
-	if specutils.HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT) {
-		return
-	}
-
-	fmt.Println("*** Re-running test as root in new user namespace ***")
-
-	// Current process doesn't have CAP_SYS_ADMIN, create user namespace and run
-	// as root inside that namespace to get it.
-	runtime.LockOSThread()
-	defer runtime.UnlockOSThread()
-
-	cmd := exec.Command("/proc/self/exe", os.Args[1:]...)
-	cmd.SysProcAttr = &syscall.SysProcAttr{
-		Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
-		// Set current user/group as root inside the namespace.
-		UidMappings: []syscall.SysProcIDMap{
-			{ContainerID: 0, HostID: os.Getuid(), Size: 1},
-		},
-		GidMappings: []syscall.SysProcIDMap{
-			{ContainerID: 0, HostID: os.Getgid(), Size: 1},
-		},
-		GidMappingsEnableSetgroups: false,
-		Credential: &syscall.Credential{
-			Uid: 0,
-			Gid: 0,
-		},
-	}
-	cmd.Env = os.Environ()
-	cmd.Stdin = os.Stdin
-	cmd.Stdout = os.Stdout
-	cmd.Stderr = os.Stderr
-	if err := cmd.Run(); err != nil {
-		if exit, ok := err.(*exec.ExitError); ok {
-			if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
-				os.Exit(ws.ExitStatus())
-			}
-			os.Exit(-1)
-		}
-		panic(fmt.Sprint("error running child process:", err.Error()))
-	}
-	os.Exit(0)
-}
-
 // Reaper reaps child processes.
 type Reaper struct {
 	// mu protects ch, which will be nil if the reaper is not running.