10 files changed, 337 insertions, 219 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 588bb8851..6fe2b57de 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -57,10 +57,10 @@ go_library(
         "//pkg/sentry/pgalloc",
         "//pkg/sentry/platform",
         "//pkg/sentry/sighandling",
-        "//pkg/sentry/socket/epsocket",
         "//pkg/sentry/socket/hostinet",
         "//pkg/sentry/socket/netlink",
         "//pkg/sentry/socket/netlink/route",
+        "//pkg/sentry/socket/netstack",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/state",
         "//pkg/sentry/strace",
@@ -80,6 +80,7 @@ go_library(
         "//pkg/tcpip/network/ipv6",
         "//pkg/tcpip/stack",
         "//pkg/tcpip/transport/icmp",
+        "//pkg/tcpip/transport/raw",
         "//pkg/tcpip/transport/tcp",
         "//pkg/tcpip/transport/udp",
         "//pkg/urpc",
@@ -109,6 +110,7 @@ go_test(
         "//pkg/sentry/arch:registers_go_proto",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fs",
+        "//pkg/sentry/kernel/auth",
         "//pkg/unet",
         "//runsc/fsgofer",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 7ae0dd05d..38278d0a2 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -19,6 +19,7 @@ import (
 	"strconv"
 	"strings"
 
+	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
 )
 
@@ -112,6 +113,34 @@ func MakeWatchdogAction(s string) (watchdog.Action, error) {
 	}
 }
 
+// MakeRefsLeakMode parses s, case-insensitively, into a refs.LeakMode.
+func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
+	switch strings.ToLower(s) {
+	case "disabled":
+		return refs.NoLeakChecking, nil
+	case "log-names":
+		return refs.LeaksLogWarning, nil
+	case "log-traces":
+		return refs.LeaksLogTraces, nil
+	default:
+		return 0, fmt.Errorf("invalid refs leakmode %q", s)
+	}
+}
+
+func refsLeakModeToString(mode refs.LeakMode) string {
+	switch mode {
+	// If not set, default it to disabled.
+	case refs.UninitializedLeakChecking, refs.NoLeakChecking:
+		return "disabled"
+	case refs.LeaksLogWarning:
+		return "log-names"
+	case refs.LeaksLogTraces:
+		return "log-traces"
+	default:
+		panic(fmt.Sprintf("Invalid leakmode: %d", mode))
+	}
+}
+
 // Config holds configuration that is not part of the runtime spec.
 type Config struct {
 	// RootDir is the runtime root directory.
@@ -138,6 +167,9 @@ type Config struct {
 	// Overlay is whether to wrap the root filesystem in an overlay.
 	Overlay bool
 
+	// FSGoferHostUDS enables the gofer to mount a host UDS.
+	FSGoferHostUDS bool
+
 	// Network indicates what type of network to use.
 	Network NetworkType
 
@@ -182,12 +214,6 @@ type Config struct {
 	// RestoreFile is the path to the saved container image
 	RestoreFile string
 
-	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
-	// tests. It allows runsc to start the sandbox process as the current
-	// user, and without chrooting the sandbox process. This can be
-	// necessary in test environments that have limited capabilities.
-	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
-
 	// NumNetworkChannels controls the number of AF_PACKET sockets that map
 	// to the same underlying network device. This allows netstack to better
 	// scale for high throughput use cases.
@@ -201,6 +227,22 @@ type Config struct {
 
 	// AlsoLogToStderr allows to send log messages to stderr.
 	AlsoLogToStderr bool
+
+	// ReferenceLeakMode sets the reference leak check mode.
+	ReferenceLeakMode refs.LeakMode
+
+	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
+	// tests. It allows runsc to start the sandbox process as the current
+	// user, and without chrooting the sandbox process. This can be
+	// necessary in test environments that have limited capabilities.
+	TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+
+	// TestOnlyTestNameEnv should only be used in tests. It looks up the
+	// test name in the container environment variables and adds it to the debug
+	// log file name. This is done to help identify the log with the test when
+	// multiple tests are run in parallel, since there is no way to pass
+	// parameters to the runtime from docker.
+	TestOnlyTestNameEnv string
 }
 
 // ToFlags returns a slice of flags that correspond to the given Config.
@@ -214,6 +256,7 @@ func (c *Config) ToFlags() []string {
 		"--debug-log-format=" + c.DebugLogFormat,
 		"--file-access=" + c.FileAccess.String(),
 		"--overlay=" + strconv.FormatBool(c.Overlay),
+		"--fsgofer-host-uds=" + strconv.FormatBool(c.FSGoferHostUDS),
 		"--network=" + c.Network.String(),
 		"--log-packets=" + strconv.FormatBool(c.LogPackets),
 		"--platform=" + c.Platform,
@@ -227,10 +270,14 @@ func (c *Config) ToFlags() []string {
 		"--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
 		"--rootless=" + strconv.FormatBool(c.Rootless),
 		"--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
+		"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
 	}
+	// Only include these if set since they are never to be used by users.
 	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
-		// Only include if set since it is never to be used by users.
-		f = append(f, "-TESTONLY-unsafe-nonroot=true")
+		f = append(f, "--TESTONLY-unsafe-nonroot=true")
+	}
+	if len(c.TestOnlyTestNameEnv) != 0 {
+		f = append(f, "--TESTONLY-test-name-env="+c.TestOnlyTestNameEnv)
 	}
 	return f
 }
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 72cbabd16..a73c593ea 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -18,7 +18,6 @@ import (
 	"errors"
 	"fmt"
 	"os"
-	"path"
 	"syscall"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
@@ -27,7 +26,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/socket/epsocket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	"gvisor.dev/gvisor/pkg/sentry/state"
 	"gvisor.dev/gvisor/pkg/sentry/time"
 	"gvisor.dev/gvisor/pkg/sentry/watchdog"
@@ -142,7 +141,7 @@ func newController(fd int, l *Loader) (*controller, error) {
 	}
 	srv.Register(manager)
 
-	if eps, ok := l.k.NetworkStack().(*epsocket.Stack); ok {
+	if eps, ok := l.k.NetworkStack().(*netstack.Stack); ok {
 		net := &Network{
 			Stack: eps.Stack,
 		}
@@ -234,13 +233,6 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
 	if args.CID == "" {
 		return errors.New("start argument missing container ID")
 	}
-	// Prevent CIDs containing ".." from confusing the sentry when creating
-	// /containers/<cid> directory.
-	// TODO(b/129293409): Once we have multiple independent roots, this
-	// check won't be necessary.
-	if path.Clean(args.CID) != args.CID {
-		return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID)
-	}
 	if len(args.FilePayload.Files) < 4 {
 		return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer")
 	}
@@ -355,7 +347,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 	fs.SetRestoreEnvironment(*renv)
 
 	// Prepare to load from the state file.
-	if eps, ok := networkStack.(*epsocket.Stack); ok {
+	if eps, ok := networkStack.(*netstack.Stack); ok {
 		stack.StackFromEnv = eps.Stack // FIXME(b/36201077)
 	}
 	info, err := specFile.Stat()
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 7ca776b3a..a2ecc6bcb 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -88,14 +88,24 @@ var allowedSyscalls = seccomp.SyscallRules{
 			seccomp.AllowValue(linux.FUTEX_WAIT | linux.FUTEX_PRIVATE_FLAG),
 			seccomp.AllowAny{},
 			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
 		},
 		{
 			seccomp.AllowAny{},
 			seccomp.AllowValue(linux.FUTEX_WAKE | linux.FUTEX_PRIVATE_FLAG),
 			seccomp.AllowAny{},
+		},
+		// Non-private variants are included for flipcall support. They are otherwise
+		// unnecessary, as the sentry will use only private futexes internally.
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAIT),
+			seccomp.AllowAny{},
+			seccomp.AllowAny{},
+		},
+		{
+			seccomp.AllowAny{},
+			seccomp.AllowValue(linux.FUTEX_WAKE),
 			seccomp.AllowAny{},
-			seccomp.AllowValue(0),
 		},
 	},
 	syscall.SYS_GETPID: {},
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index b6eeacf98..393c2a88b 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -25,19 +25,21 @@ import (
 
 	// Include filesystem types that OCI spec might mount.
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
-	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/host"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
-	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
+	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/runsc/specutils"
 )
@@ -62,6 +64,9 @@ const (
 	nonefs   = "none"
 )
 
+// tmpfs has some extra supported options that we must pass through.
+var tmpfsAllowedOptions = []string{"mode", "uid", "gid"}
+
 func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
 	// Upper layer uses the same flags as lower, but it must be read-write.
 	upperFlags := lowerFlags
@@ -170,27 +175,25 @@ func p9MountOptions(fd int, fa FileAccessType) []string {
 func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
 	var out []string
 	for _, o := range opts {
-		kv := strings.Split(o, "=")
-		switch len(kv) {
-		case 1:
-			if specutils.ContainsStr(allowedKeys, o) {
-				out = append(out, o)
-				continue
-			}
-			log.Warningf("ignoring unsupported key %q", kv)
-		case 2:
-			if specutils.ContainsStr(allowedKeys, kv[0]) {
-				out = append(out, o)
-				continue
-			}
-			log.Warningf("ignoring unsupported key %q", kv[0])
-		default:
-			return nil, fmt.Errorf("invalid option %q", o)
+		ok, err := parseMountOption(o, allowedKeys...)
+		if err != nil {
+			return nil, err
+		}
+		if ok {
+			out = append(out, o)
 		}
 	}
 	return out, nil
 }
 
+func parseMountOption(opt string, allowedKeys ...string) (bool, error) {
+	kv := strings.SplitN(opt, "=", 3)
+	if len(kv) > 2 {
+		return false, fmt.Errorf("invalid option %q", opt)
+	}
+	return specutils.ContainsStr(allowedKeys, kv[0]), nil
+}
+
 // mountDevice returns a device string based on the fs type and target
 // of the mount.
 func mountDevice(m specs.Mount) string {
@@ -205,6 +208,8 @@ func mountDevice(m specs.Mount) string {
 
 func mountFlags(opts []string) fs.MountSourceFlags {
 	mf := fs.MountSourceFlags{}
+	// Note: changes to supported options must be reflected in
+	// isSupportedMountFlag() as well.
 	for _, o := range opts {
 		switch o {
 		case "rw":
@@ -222,6 +227,18 @@ func mountFlags(opts []string) fs.MountSourceFlags {
 	return mf
 }
 
+func isSupportedMountFlag(fstype, opt string) bool {
+	switch opt {
+	case "rw", "ro", "noatime", "noexec":
+		return true
+	}
+	if fstype == tmpfs {
+		ok, err := parseMountOption(opt, tmpfsAllowedOptions...)
+		return ok && err == nil
+	}
+	return false
+}
+
 func mustFindFilesystem(name string) fs.Filesystem {
 	fs, ok := fs.FindFilesystem(name)
 	if !ok {
@@ -261,6 +278,18 @@ func subtargets(root string, mnts []specs.Mount) []string {
 	return targets
 }
 
+func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	mns, err := mntr.setupFS(conf, procArgs)
+	if err != nil {
+		return err
+	}
+
+	// Set namespace here so that it can be found in ctx.
+	procArgs.MountNamespace = mns
+
+	return setExecutablePath(ctx, procArgs)
+}
+
 // setExecutablePath sets the procArgs.Filename by searching the PATH for an
 // executable matching the procArgs.Argv[0].
 func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
@@ -413,6 +442,39 @@ func (m *mountHint) isSupported() bool {
 	return m.mount.Type == tmpfs && m.share == pod
 }
 
+// checkCompatible verifies that shared mount is compatible with master.
+// For now enforce that all options are the same. Once bind mount is properly
+// supported, then we should ensure the master is less restrictive than the
+// container, e.g. master can be 'rw' while container mounts as 'ro'.
+func (m *mountHint) checkCompatible(mount specs.Mount) error {
+	// Remove options that don't affect the mount's behavior.
+	masterOpts := filterUnsupportedOptions(m.mount)
+	slaveOpts := filterUnsupportedOptions(mount)
+
+	if len(masterOpts) != len(slaveOpts) {
+		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+	}
+
+	sort.Strings(masterOpts)
+	sort.Strings(slaveOpts)
+	for i, opt := range masterOpts {
+		if opt != slaveOpts[i] {
+			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
+		}
+	}
+	return nil
+}
+
+func filterUnsupportedOptions(mount specs.Mount) []string {
+	rv := make([]string, 0, len(mount.Options))
+	for _, o := range mount.Options {
+		if isSupportedMountFlag(mount.Type, o) {
+			rv = append(rv, o)
+		}
+	}
+	return rv
+}
+
 // podMountHints contains a collection of mountHints for the pod.
 type podMountHints struct {
 	mounts map[string]*mountHint
@@ -500,73 +562,95 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
 	}
 }
 
-// setupChildContainer is used to set up the file system for non-root containers
-// and amend the procArgs accordingly. This is the main entry point for this
-// rest of functions in this file. procArgs are passed by reference and the
-// FDMap field is modified. It dups stdioFDs.
-func (c *containerMounter) setupChildContainer(conf *Config, procArgs *kernel.CreateProcessArgs) error {
-	// Setup a child container.
-	log.Infof("Creating new process in child container.")
-
-	// Create a new root inode and mount namespace for the container.
-	rootCtx := c.k.SupervisorContext()
-	rootInode, err := c.createRootMount(rootCtx, conf)
-	if err != nil {
-		return fmt.Errorf("creating filesystem for container: %v", err)
+// processHints processes annotations that contain hints about how volumes
+// should be mounted (e.g. a volume shared between containers). It must be
+// called for the root container only.
+func (c *containerMounter) processHints(conf *Config) error {
+	ctx := c.k.SupervisorContext()
+	for _, hint := range c.hints.mounts {
+		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+		inode, err := c.mountSharedMaster(ctx, conf, hint)
+		if err != nil {
+			return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+		}
+		hint.root = inode
 	}
-	mns, err := fs.NewMountNamespace(rootCtx, rootInode)
+	return nil
+}
+
+// setupFS is used to set up the file system for all containers. This is the
+// main entry point method, with most of the others being internal only. It
+// returns the mount namespace that is created for the container.
+func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
+	log.Infof("Configuring container's file system")
+
+	// Create context with root credentials to mount the filesystem (the current
+	// user may not be privileged enough).
+	rootProcArgs := *procArgs
+	rootProcArgs.WorkingDirectory = "/"
+	rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
+	rootProcArgs.Umask = 0022
+	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
+	rootCtx := rootProcArgs.NewContext(c.k)
+
+	mns, err := c.createMountNamespace(rootCtx, conf)
 	if err != nil {
-		return fmt.Errorf("creating new mount namespace for container: %v", err)
+		return nil, err
 	}
-	procArgs.MountNamespace = mns
-	root := mns.Root()
-	defer root.DecRef()
 
-	// Mount all submounts.
-	if err := c.mountSubmounts(rootCtx, conf, mns, root); err != nil {
-		return err
+	// Set namespace here so that it can be found in rootCtx.
+	rootProcArgs.MountNamespace = mns
+
+	if err := c.mountSubmounts(rootCtx, conf, mns); err != nil {
+		return nil, err
 	}
-	return c.checkDispenser()
+	return mns, nil
 }
 
-func (c *containerMounter) checkDispenser() error {
-	if !c.fds.empty() {
-		return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
+func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) {
+	rootInode, err := c.createRootMount(ctx, conf)
+	if err != nil {
+		return nil, fmt.Errorf("creating filesystem for container: %v", err)
 	}
-	return nil
+	mns, err := fs.NewMountNamespace(ctx, rootInode)
+	if err != nil {
+		return nil, fmt.Errorf("creating new mount namespace for container: %v", err)
+	}
+	return mns, nil
 }
 
-// setupRootContainer creates a mount namespace containing the root filesystem
-// and all mounts. 'rootCtx' is used to walk directories to find mount points.
-// The 'setMountNS' callback is called after the mount namespace is created and
-// will get a reference on that namespace. The callback must ensure that the
-// rootCtx has the provided mount namespace.
-func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error {
-	for _, hint := range c.hints.mounts {
-		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
-		inode, err := c.mountSharedMaster(rootCtx, conf, hint)
-		if err != nil {
-			return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
+	root := mns.Root()
+	defer root.DecRef()
+
+	for _, m := range c.mounts {
+		log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
+		if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
+			if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
+				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
+			}
+		} else {
+			if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
+				return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+			}
 		}
-		hint.root = inode
 	}
 
-	rootInode, err := c.createRootMount(rootCtx, conf)
-	if err != nil {
-		return fmt.Errorf("creating root mount: %v", err)
+	if err := c.mountTmp(ctx, conf, mns, root); err != nil {
+		return fmt.Errorf("mount submount %q: %v", "tmp", err)
 	}
-	mns, err := fs.NewMountNamespace(userCtx, rootInode)
-	if err != nil {
-		return fmt.Errorf("creating root mount namespace: %v", err)
+
+	if err := c.checkDispenser(); err != nil {
+		return err
 	}
-	setMountNS(mns)
+	return nil
+}
 
-	root := mns.Root()
-	defer root.DecRef()
-	if err := c.mountSubmounts(rootCtx, conf, mns, root); err != nil {
-		return fmt.Errorf("mounting submounts: %v", err)
+func (c *containerMounter) checkDispenser() error {
+	if !c.fds.empty() {
+		return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
 	}
-	return c.checkDispenser()
+	return nil
 }
 
 // mountSharedMaster mounts the master of a volume that is shared among
@@ -663,9 +747,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 		fsName = sysfs
 	case tmpfs:
 		fsName = m.Type
-
-		// tmpfs has some extra supported options that we must pass through.
-		opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
+		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
 
 	case bind:
 		fd := c.fds.remove()
@@ -684,25 +766,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 	return fsName, opts, useOverlay, err
 }
 
-func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
-	for _, m := range c.mounts {
-		if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
-			if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
-				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
-			}
-		} else {
-			if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
-				return fmt.Errorf("mount submount %q: %v", m.Destination, err)
-			}
-		}
-	}
-
-	if err := c.mountTmp(ctx, conf, mns, root); err != nil {
-		return fmt.Errorf("mount submount %q: %v", "tmp", err)
-	}
-	return nil
-}
-
 // mountSubmount mounts volumes inside the container's root. Because mounts may
 // be readonly, a lower ramfs overlay is added to create the mount point dir.
 // Another overlay is added with tmpfs on top if Config.Overlay is true.
@@ -769,17 +832,8 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
 // mountSharedSubmount binds mount to a previously mounted volume that is shared
 // among containers in the same pod.
 func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error {
-	// For now enforce that all options are the same. Once bind mount is properly
-	// supported, then we should ensure the master is less restrictive than the
-	// container, e.g. master can be 'rw' while container mounts as 'ro'.
-	if len(mount.Options) != len(source.mount.Options) {
-		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
-	}
-	sort.Strings(mount.Options)
-	for i, opt := range mount.Options {
-		if opt != source.mount.Options[i] {
-			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
-		}
+	if err := source.checkCompatible(mount); err != nil {
+		return err
 	}
 
 	maxTraversals := uint(0)
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index f91158027..c8e5e86ee 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -20,7 +20,6 @@ import (
 	mrand "math/rand"
 	"os"
 	"runtime"
-	"strings"
 	"sync"
 	"sync/atomic"
 	"syscall"
@@ -33,7 +32,6 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/memutil"
 	"gvisor.dev/gvisor/pkg/rand"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -56,6 +54,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/raw"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/runsc/boot/filter"
@@ -63,10 +62,10 @@ import (
 	"gvisor.dev/gvisor/runsc/specutils"
 
 	// Include supported socket providers.
-	"gvisor.dev/gvisor/pkg/sentry/socket/epsocket"
 	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route"
+	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
 	_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
 )
 
@@ -527,34 +526,21 @@ func (l *Loader) run() error {
 
 		// Setup the root container file system.
 		l.startGoferMonitor(l.sandboxID, l.goferFDs)
+
 		mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
-		if err := mntr.setupRootContainer(ctx, ctx, l.conf, func(mns *fs.MountNamespace) {
-			l.rootProcArgs.MountNamespace = mns
-		}); err != nil {
+		if err := mntr.processHints(l.conf); err != nil {
 			return err
 		}
-
-		if err := setExecutablePath(ctx, &l.rootProcArgs); err != nil {
+		if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
 			return err
 		}
 
-		// Read /etc/passwd for the user's HOME directory and set the HOME
-		// environment variable as required by POSIX if it is not overridden by
-		// the user.
-		hasHomeEnvv := false
-		for _, envv := range l.rootProcArgs.Envv {
-			if strings.HasPrefix(envv, "HOME=") {
-				hasHomeEnvv = true
-			}
-		}
-		if !hasHomeEnvv {
-			homeDir, err := getExecUserHome(ctx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
-			if err != nil {
-				return fmt.Errorf("error reading exec user: %v", err)
-			}
-
-			l.rootProcArgs.Envv = append(l.rootProcArgs.Envv, "HOME="+homeDir)
+		// Add the HOME environment variable if it is not already set.
+		envv, err := maybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace, l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
+		if err != nil {
+			return err
 		}
+		l.rootProcArgs.Envv = envv
 
 		// Create the root container init task. It will begin running
 		// when the kernel is started.
@@ -687,13 +673,10 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 
 	// Setup the child container file system.
 	l.startGoferMonitor(cid, goferFDs)
-	mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
-	if err := mntr.setupChildContainer(conf, &procArgs); err != nil {
-		return fmt.Errorf("configuring container FS: %v", err)
-	}
 
-	if err := setExecutablePath(ctx, &procArgs); err != nil {
-		return fmt.Errorf("setting executable path for %+v: %v", procArgs, err)
+	mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
+	if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
+		return err
 	}
 
 	// Create and start the new process.
@@ -766,26 +749,34 @@ func (l *Loader) destroyContainer(cid string) error {
 		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
 			return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
 		}
+		// Wait for all processes that belong to the container to exit (including
+		// exec'd processes).
+		for _, t := range l.k.TaskSet().Root.Tasks() {
+			if t.ContainerID() == cid {
+				t.ThreadGroup().WaitExited()
+			}
+		}
+
+		// At this point, all processes inside of the container have exited,
+		// releasing all references to the container's MountNamespace and
+		// causing all submounts and overlays to be unmounted.
+		//
+		// Since the container's MountNamespace has been released,
+		// MountNamespace.destroy() will have executed, but that function may
+		// trigger async close operations. We must wait for those to complete
+		// before returning, otherwise the caller may kill the gofer before
+		// they complete, causing a cascade of failing RPCs.
+		fs.AsyncBarrier()
 	}
 
-	// Remove all container thread groups from the map.
+	// No more failure from this point on. Remove all container thread groups
+	// from the map.
 	for key := range l.processes {
 		if key.cid == cid {
 			delete(l.processes, key)
 		}
 	}
 
-	// At this point, all processes inside of the container have exited,
-	// releasing all references to the container's MountNamespace and
-	// causing all submounts and overlays to be unmounted.
-	//
-	// Since the container's MountNamespace has been released,
-	// MountNamespace.destroy() will have executed, but that function may
-	// trigger async close operations. We must wait for those to complete
-	// before returning, otherwise the caller may kill the gofer before
-	// they complete, causing a cascade of failing RPCs.
-	fs.AsyncBarrier()
-
 	log.Debugf("Container destroyed %q", cid)
 	return nil
 }
@@ -813,6 +804,16 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 	})
 	defer args.MountNamespace.DecRef()
 
+	// Add the HOME environment variable if it is not already set.
+	root := args.MountNamespace.Root()
+	defer root.DecRef()
+	ctx := fs.WithRoot(l.k.SupervisorContext(), root)
+	envv, err := maybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
+	if err != nil {
+		return 0, err
+	}
+	args.Envv = envv
+
 	// Start the process.
 	proc := control.Proc{Kernel: l.k}
 	args.PIDNamespace = tg.PIDNamespace()
@@ -911,15 +912,17 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
 
 	case NetworkNone, NetworkSandbox:
 		// NetworkNone sets up loopback using netstack.
-		netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName}
-		protoNames := []string{tcp.ProtocolName, udp.ProtocolName, icmp.ProtocolName4}
-		s := epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{
-			Clock:       clock,
-			Stats:       epsocket.Metrics,
-			HandleLocal: true,
+		netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
+		transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
+		s := netstack.Stack{stack.New(stack.Options{
+			NetworkProtocols:   netProtos,
+			TransportProtocols: transProtos,
+			Clock:              clock,
+			Stats:              netstack.Metrics,
+			HandleLocal:        true,
 			// Enable raw sockets for users with sufficient
 			// privileges.
-			Raw: true,
+			UnassociatedFactory: raw.EndpointFactory{},
 		})}
 
 		// Enable SACK Recovery.
@@ -927,6 +930,10 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
 			return nil, fmt.Errorf("failed to enable SACK: %v", err)
 		}
 
+		// Set default TTLs as required by socket/netstack.
+		s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+		s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
+
 		// Enable Receive Buffer Auto-Tuning.
 		if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
 			return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
@@ -1043,21 +1050,8 @@ func (l *Loader) signalAllProcesses(cid string, signo int32) error {
 	// the signal is delivered. This prevents process leaks when SIGKILL is
 	// sent to the entire container.
 	l.k.Pause()
-	if err := l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}); err != nil {
-		l.k.Unpause()
-		return err
-	}
-	l.k.Unpause()
-
-	// If SIGKILLing all processes, wait for them to exit.
-	if linux.Signal(signo) == linux.SIGKILL {
-		for _, t := range l.k.TaskSet().Root.Tasks() {
-			if t.ContainerID() == cid {
-				t.ThreadGroup().WaitExited()
-			}
-		}
-	}
-	return nil
+	defer l.k.Unpause()
+	return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo})
 }
 
 // threadGroupFromID same as threadGroupFromIDLocked except that it acquires
@@ -1090,8 +1084,3 @@ func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host
 	}
 	return ep.tg, ep.tty, true, nil
 }
-
-func init() {
-	// TODO(gvisor.dev/issue/365): Make this configurable.
-	refs.SetLeakMode(refs.NoLeakChecking)
-}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index e0e32b9d5..147ff7703 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -401,17 +401,16 @@ func TestCreateMountNamespace(t *testing.T) {
 			}
 			defer cleanup()
 
-			// setupRootContainer needs to find root from the context after the
-			// namespace is created.
-			var mns *fs.MountNamespace
-			setMountNS := func(m *fs.MountNamespace) {
-				mns = m
-				ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root())
-			}
 			mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{})
-			if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil {
-				t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err)
+			mns, err := mntr.createMountNamespace(ctx, conf)
+			if err != nil {
+				t.Fatalf("failed to create mount namespace: %v", err)
 			}
+			ctx = fs.WithRoot(ctx, mns.Root())
+			if err := mntr.mountSubmounts(ctx, conf, mns); err != nil {
+				t.Fatalf("failed to create mount namespace: %v", err)
+			}
+
 			root := mns.Root()
 			defer root.DecRef()
 			for _, p := range tc.expectedPaths {
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index ea0d9f790..32cba5ac1 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -121,10 +121,10 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		nicID++
 		nicids[link.Name] = nicID
 
-		linkEP := loopback.New()
+		ep := loopback.New()
 
 		log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
-		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, true /* loopback */); err != nil {
+		if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, true /* loopback */); err != nil {
 			return err
 		}
 
@@ -156,7 +156,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 
 		mac := tcpip.LinkAddress(link.LinkAddress)
-		linkEP, err := fdbased.New(&fdbased.Options{
+		ep, err := fdbased.New(&fdbased.Options{
 			FDs:                FDs,
 			MTU:                uint32(link.MTU),
 			EthernetHeader:     true,
@@ -170,7 +170,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		}
 
 		log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
-		if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil {
+		if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, false /* loopback */); err != nil {
 			return err
 		}
 
@@ -203,14 +203,14 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 
 // createNICWithAddrs creates a NIC in the network stack and adds the given
 // addresses.
-func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, addrs []net.IP, loopback bool) error {
+func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP, loopback bool) error {
 	if loopback {
-		if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(linkEP)); err != nil {
-			return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v, %v) failed: %v", id, name, linkEP, err)
+		if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(ep)); err != nil {
+			return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v) failed: %v", id, name, err)
 		}
 	} else {
-		if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(linkEP)); err != nil {
-			return fmt.Errorf("CreateNamedNIC(%v, %v, %v) failed: %v", id, name, linkEP, err)
+		if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil {
+			return fmt.Errorf("CreateNamedNIC(%v, %v) failed: %v", id, name, err)
 		}
 	}
 
diff --git a/runsc/boot/user.go b/runsc/boot/user.go
index d1d423a5c..56cc12ee0 100644
--- a/runsc/boot/user.go
+++ b/runsc/boot/user.go
@@ -16,6 +16,7 @@ package boot
 
 import (
 	"bufio"
+	"fmt"
 	"io"
 	"strconv"
 	"strings"
@@ -23,6 +24,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 )
 
@@ -42,7 +44,7 @@ func (r *fileReader) Read(buf []byte) (int, error) {
 
 // getExecUserHome returns the home directory of the executing user read from
 // /etc/passwd as read from the container filesystem.
-func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32) (string, error) {
+func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.KUID) (string, error) {
 	// The default user home directory to return if no user matching the user
 	// if found in the /etc/passwd found in the image.
 	const defaultHome = "/"
@@ -82,7 +84,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32
 		File: f,
 	}
 
-	homeDir, err := findHomeInPasswd(uid, r, defaultHome)
+	homeDir, err := findHomeInPasswd(uint32(uid), r, defaultHome)
 	if err != nil {
 		return "", err
 	}
@@ -90,6 +92,28 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid uint32
 	return homeDir, nil
 }
 
+// maybeAddExecUserHome returns a new slice with the HOME environment variable
+// set if the slice does not already contain it, otherwise it returns the
+// original slice unmodified.
+func maybeAddExecUserHome(ctx context.Context, mns *fs.MountNamespace, uid auth.KUID, envv []string) ([]string, error) {
+	// Check if the envv already contains HOME.
+	for _, env := range envv {
+		if strings.HasPrefix(env, "HOME=") {
+			// We have it. Return the original slice unmodified.
+			return envv, nil
+		}
+	}
+
+	// Read /etc/passwd for the user's HOME directory and set the HOME
+	// environment variable as required by POSIX if it is not overridden by
+	// the user.
+	homeDir, err := getExecUserHome(ctx, mns, uid)
+	if err != nil {
+		return nil, fmt.Errorf("error reading exec user: %v", err)
+	}
+	return append(envv, "HOME="+homeDir), nil
+}
+
 // findHomeInPasswd parses a passwd file and returns the given user's home
 // directory. This function does it's best to replicate the runc's behavior.
 func findHomeInPasswd(uid uint32, passwd io.Reader, defaultHome string) (string, error) {
diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go
index 01f666507..9aee2ad07 100644
--- a/runsc/boot/user_test.go
+++ b/runsc/boot/user_test.go
@@ -25,6 +25,7 @@ import (
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
 func setupTempDir() (string, error) {
@@ -68,7 +69,7 @@ func setupPasswd(contents string, perms os.FileMode) func() (string, error) {
 // TestGetExecUserHome tests the getExecUserHome function.
 func TestGetExecUserHome(t *testing.T) {
 	tests := map[string]struct {
-		uid        uint32
+		uid        auth.KUID
 		createRoot func() (string, error)
 		expected   string
 	}{
@@ -164,13 +165,13 @@ func TestGetExecUserHome(t *testing.T) {
 				},
 			}
 
-			var mns *fs.MountNamespace
-			setMountNS := func(m *fs.MountNamespace) {
-				mns = m
-				ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root())
-			}
 			mntr := newContainerMounter(spec, []int{sandEnd}, nil, &podMountHints{})
-			if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil {
+			mns, err := mntr.createMountNamespace(ctx, conf)
+			if err != nil {
+				t.Fatalf("failed to create mount namespace: %v", err)
+			}
+			ctx = fs.WithRoot(ctx, mns.Root())
+			if err := mntr.mountSubmounts(ctx, conf, mns); err != nil {
 				t.Fatalf("failed to create mount namespace: %v", err)
 			}