9 files changed, 231 insertions, 182 deletions
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 5025401dd..588bb8851 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -88,6 +88,7 @@ go_library(
         "//runsc/specutils",
         "@com_github_golang_protobuf//proto:go_default_library",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
+        "@org_golang_x_sys//unix:go_default_library",
     ],
 )
 
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 6a742f349..05b8f8761 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -113,7 +113,7 @@ func MakeWatchdogAction(s string) (watchdog.Action, error) {
 	}
 }
 
-// MakeRefsLeakMode converts type from string
+// MakeRefsLeakMode converts type from string.
 func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
 	switch strings.ToLower(s) {
 	case "disabled":
@@ -127,6 +127,20 @@ func MakeRefsLeakMode(s string) (refs.LeakMode, error) {
 	}
 }
 
+func refsLeakModeToString(mode refs.LeakMode) string {
+	switch mode {
+	// If not set, default it to disabled.
+	case refs.UninitializedLeakChecking, refs.NoLeakChecking:
+		return "disabled"
+	case refs.LeaksLogWarning:
+		return "log-names"
+	case refs.LeaksLogTraces:
+		return "log-traces"
+	default:
+		panic(fmt.Sprintf("Invalid leakmode: %d", mode))
+	}
+}
+
 // Config holds configuration that is not part of the runtime spec.
 type Config struct {
 	// RootDir is the runtime root directory.
@@ -245,7 +259,7 @@ func (c *Config) ToFlags() []string {
 		"--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
 		"--rootless=" + strconv.FormatBool(c.Rootless),
 		"--alsologtostderr=" + strconv.FormatBool(c.AlsoLogToStderr),
-		"--ref-leak-mode=" + c.ReferenceLeakMode.String(),
+		"--ref-leak-mode=" + refsLeakModeToString(c.ReferenceLeakMode),
 	}
 	if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
 		// Only include if set since it is never to be used by users.
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index d79aaff60..72cbabd16 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -328,10 +328,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 		return fmt.Errorf("at most two files may be passed to Restore")
 	}
 
-	networkStack := cm.l.k.NetworkStack()
-	// Destroy the old kernel and create a new kernel.
+	// Pause the kernel while we build a new one.
 	cm.l.k.Pause()
-	cm.l.k.Destroy()
 
 	p, err := createPlatform(cm.l.conf, deviceFile)
 	if err != nil {
@@ -345,10 +343,11 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 		return fmt.Errorf("creating memory file: %v", err)
 	}
 	k.SetMemoryFile(mf)
+	networkStack := cm.l.k.NetworkStack()
 	cm.l.k = k
 
 	// Set up the restore environment.
-	mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k, cm.l.mountHints)
+	mntr := newContainerMounter(cm.l.spec, cm.l.goferFDs, cm.l.k, cm.l.mountHints)
 	renv, err := mntr.createRestoreEnvironment(cm.l.conf)
 	if err != nil {
 		return fmt.Errorf("creating RestoreEnvironment: %v", err)
@@ -380,13 +379,10 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
 
 	// Load the state.
 	loadOpts := state.LoadOpts{Source: specFile}
-	if err := loadOpts.Load(k, networkStack); err != nil {
+	if err := loadOpts.Load(k, networkStack, time.NewCalibratedClocks()); err != nil {
 		return err
 	}
 
-	// Set timekeeper.
-	k.Timekeeper().SetClocks(time.NewCalibratedClocks())
-
 	// Since we have a new kernel we also must make a new watchdog.
 	dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
 
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 0ee5b8bbd..7ca776b3a 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -207,7 +207,7 @@ var allowedSyscalls = seccomp.SyscallRules{
 	syscall.SYS_MPROTECT:  {},
 	syscall.SYS_MUNMAP:    {},
 	syscall.SYS_NANOSLEEP: {},
-	syscall.SYS_POLL:      {},
+	syscall.SYS_PPOLL:     {},
 	syscall.SYS_PREAD64:   {},
 	syscall.SYS_PWRITE64:  {},
 	syscall.SYS_READ:      {},
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 7e95e1f41..34c674840 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -25,10 +25,8 @@ import (
 
 	// Include filesystem types that OCI spec might mount.
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
-	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/host"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
-	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
@@ -38,6 +36,8 @@ import (
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
+	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -263,6 +263,18 @@ func subtargets(root string, mnts []specs.Mount) []string {
 	return targets
 }
 
+func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
+	mns, err := mntr.setupFS(conf, procArgs)
+	if err != nil {
+		return err
+	}
+
+	// Set namespace here so that it can be found in ctx.
+	procArgs.MountNamespace = mns
+
+	return setExecutablePath(ctx, procArgs)
+}
+
 // setExecutablePath sets the procArgs.Filename by searching the PATH for an
 // executable matching the procArgs.Argv[0].
 func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
@@ -478,9 +490,6 @@ func (p *podMountHints) findMount(mount specs.Mount) *mountHint {
 }
 
 type containerMounter struct {
-	// cid is the container ID. May be set to empty for the root container.
-	cid string
-
 	root *specs.Root
 
 	// mounts is the set of submounts for the container. It's a copy from the spec
@@ -495,9 +504,8 @@ type containerMounter struct {
 	hints *podMountHints
 }
 
-func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
 	return &containerMounter{
-		cid:    cid,
 		root:   spec.Root,
 		mounts: compileMounts(spec),
 		fds:    fdDispenser{fds: goferFDs},
@@ -506,63 +514,88 @@ func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel
 	}
 }
 
-// setupFS is used to set up the file system for containers and amend
-// the procArgs accordingly. This is the main entry point for this rest of
-// functions in this file. procArgs are passed by reference and the FDMap field
-// is modified. It dups stdioFDs.
-func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs, creds *auth.Credentials) error {
-	// Use root user to configure mounts. The current user might not have
-	// permission to do so.
-	rootProcArgs := kernel.CreateProcessArgs{
-		WorkingDirectory:     "/",
-		Credentials:          auth.NewRootCredentials(creds.UserNamespace),
-		Umask:                0022,
-		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
-		PIDNamespace:         procArgs.PIDNamespace,
+// processHints processes annotations that container hints about how volumes
+// should be mounted (e.g. a volume shared between containers). It must be
+// called for the root container only.
+func (c *containerMounter) processHints(conf *Config) error {
+	ctx := c.k.SupervisorContext()
+	for _, hint := range c.hints.mounts {
+		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+		inode, err := c.mountSharedMaster(ctx, conf, hint)
+		if err != nil {
+			return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+		}
+		hint.root = inode
 	}
+	return nil
+}
+
+// setupFS is used to set up the file system for all containers. This is the
+// main entry point method, with most of the other being internal only. It
+// returns the mount namespace that is created for the container.
+func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
+	log.Infof("Configuring container's file system")
+
+	// Create context with root credentials to mount the filesystem (the current
+	// user may not be privileged enough).
+	rootProcArgs := *procArgs
+	rootProcArgs.WorkingDirectory = "/"
+	rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
+	rootProcArgs.Umask = 0022
+	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
 	rootCtx := rootProcArgs.NewContext(c.k)
 
-	// If this is the root container, we also need to setup the root mount
-	// namespace.
-	rootMNS := c.k.RootMountNamespace()
-	if rootMNS == nil {
-		// Setup the root container.
-		if err := c.setupRootContainer(ctx, rootCtx, conf, func(rootMNS *fs.MountNamespace) {
-			// The callback to setupRootContainer inherits a
-			// reference on the rootMNS, so we don't need to take
-			// an additional reference here.
-			procArgs.MountNamespace = rootMNS
-			procArgs.Root = rootMNS.Root()
-			c.k.SetRootMountNamespace(rootMNS)
-		}); err != nil {
-			return err
-		}
-		return c.checkDispenser()
+	mns, err := c.createMountNamespace(rootCtx, conf)
+	if err != nil {
+		return nil, err
 	}
 
-	// Setup a child container.
-	log.Infof("Creating new process in child container.")
+	// Set namespace here so that it can be found in rootCtx.
+	rootProcArgs.MountNamespace = mns
 
-	// Create a new root inode and mount namespace for the container.
-	rootInode, err := c.createRootMount(rootCtx, conf)
+	if err := c.mountSubmounts(rootCtx, conf, mns); err != nil {
+		return nil, err
+	}
+	return mns, nil
+}
+
+func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) {
+	rootInode, err := c.createRootMount(ctx, conf)
 	if err != nil {
-		return fmt.Errorf("creating filesystem for container: %v", err)
+		return nil, fmt.Errorf("creating filesystem for container: %v", err)
 	}
-	mns, err := fs.NewMountNamespace(rootCtx, rootInode)
+	mns, err := fs.NewMountNamespace(ctx, rootInode)
 	if err != nil {
-		return fmt.Errorf("creating new mount namespace for container: %v", err)
+		return nil, fmt.Errorf("creating new mount namespace for container: %v", err)
 	}
+	return mns, nil
+}
 
-	// Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
-	// This will also donate a reference to procArgs, as required.
-	procArgs.MountNamespace = mns
-	procArgs.Root = mns.Root()
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
+	root := mns.Root()
+	defer root.DecRef()
 
-	// Mount all submounts.
-	if err := c.mountSubmounts(rootCtx, conf, mns, procArgs.Root); err != nil {
+	for _, m := range c.mounts {
+		log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
+		if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
+			if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
+				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
+			}
+		} else {
+			if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
+				return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+			}
+		}
+	}
+
+	if err := c.mountTmp(ctx, conf, mns, root); err != nil {
+		return fmt.Errorf("mount submount %q: %v", "tmp", err)
+	}
+
+	if err := c.checkDispenser(); err != nil {
 		return err
 	}
-	return c.checkDispenser()
+	return nil
 }
 
 func (c *containerMounter) checkDispenser() error {
@@ -572,36 +605,6 @@ func (c *containerMounter) checkDispenser() error {
 	return nil
 }
 
-// setupRootContainer creates a mount namespace containing the root filesystem
-// and all mounts. 'rootCtx' is used to walk directories to find mount points.
-// The 'setMountNS' callback is called after the mount namespace is created and
-// will get a reference on that namespace. The callback must ensure that the
-// rootCtx has the provided mount namespace.
-func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error {
-	for _, hint := range c.hints.mounts {
-		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
-		inode, err := c.mountSharedMaster(rootCtx, conf, hint)
-		if err != nil {
-			return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
-		}
-		hint.root = inode
-	}
-
-	rootInode, err := c.createRootMount(rootCtx, conf)
-	if err != nil {
-		return fmt.Errorf("creating root mount: %v", err)
-	}
-	mns, err := fs.NewMountNamespace(userCtx, rootInode)
-	if err != nil {
-		return fmt.Errorf("creating root mount namespace: %v", err)
-	}
-	setMountNS(mns)
-
-	root := mns.Root()
-	defer root.DecRef()
-	return c.mountSubmounts(rootCtx, conf, mns, root)
-}
-
 // mountSharedMaster mounts the master of a volume that is shared among
 // containers in a pod. It returns the root mount's inode.
 func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
@@ -717,25 +720,6 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
 	return fsName, opts, useOverlay, err
 }
 
-func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
-	for _, m := range c.mounts {
-		if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
-			if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
-				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
-			}
-		} else {
-			if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
-				return fmt.Errorf("mount submount %q: %v", m.Destination, err)
-			}
-		}
-	}
-
-	if err := c.mountTmp(ctx, conf, mns, root); err != nil {
-		return fmt.Errorf("mount submount %q: %v", "tmp", err)
-	}
-	return nil
-}
-
 // mountSubmount mounts volumes inside the container's root. Because mounts may
 // be readonly, a lower ramfs overlay is added to create the mount point dir.
 // Another overlay is added with tmpfs on top if Config.Overlay is true.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 65ac67dbf..19b738705 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -27,12 +27,12 @@ import (
 	gtime "time"
 
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/cpuid"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/memutil"
 	"gvisor.dev/gvisor/pkg/rand"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/control"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -181,9 +181,6 @@ type Args struct {
 // New initializes a new kernel loader configured by spec.
 // New also handles setting up a kernel for restoring a container.
 func New(args Args) (*Loader, error) {
-	// Sets the reference leak check mode
-	refs.SetLeakMode(args.Conf.ReferenceLeakMode)
-
 	// We initialize the rand package now to make sure /dev/urandom is pre-opened
 	// on kernels that do not support getrandom(2).
 	if err := rand.Init(); err != nil {
@@ -527,15 +524,14 @@ func (l *Loader) run() error {
 		// ours either way.
 		l.rootProcArgs.FDTable = fdTable
 
-		// cid for root container can be empty. Only subcontainers need it to set
-		// the mount location.
-		mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k, l.mountHints)
-		if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil {
+		// Setup the root container file system.
+		l.startGoferMonitor(l.sandboxID, l.goferFDs)
+
+		mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
+		if err := mntr.processHints(l.conf); err != nil {
 			return err
 		}
-
-		rootCtx := l.rootProcArgs.NewContext(l.k)
-		if err := setExecutablePath(rootCtx, &l.rootProcArgs); err != nil {
+		if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
 			return err
 		}
 
@@ -549,7 +545,7 @@ func (l *Loader) run() error {
 			}
 		}
 		if !hasHomeEnvv {
-			homeDir, err := getExecUserHome(rootCtx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
+			homeDir, err := getExecUserHome(ctx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
 			if err != nil {
 				return fmt.Errorf("error reading exec user: %v", err)
 			}
@@ -631,7 +627,6 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 	// sentry currently supports only 1 mount namespace, which is tied to a
 	// single user namespace. Thus we must run in the same user namespace
 	// to access mounts.
-	// TODO(b/63601033): Create a new mount namespace for the container.
 	creds := auth.NewUserCredentials(
 		auth.KUID(spec.Process.User.UID),
 		auth.KGID(spec.Process.User.GID),
@@ -687,13 +682,12 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 		goferFDs = append(goferFDs, fd)
 	}
 
-	mntr := newContainerMounter(spec, cid, goferFDs, l.k, l.mountHints)
-	if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil {
-		return fmt.Errorf("configuring container FS: %v", err)
-	}
+	// Setup the child container file system.
+	l.startGoferMonitor(cid, goferFDs)
 
-	if err := setExecutablePath(ctx, &procArgs); err != nil {
-		return fmt.Errorf("setting executable path for %+v: %v", procArgs, err)
+	mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
+	if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
+		return err
 	}
 
 	// Create and start the new process.
@@ -710,17 +704,59 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
 	return nil
 }
 
+// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
+// the gofer FDs looking for disconnects, and destroys the container if a
+// disconnect occurs in any of the gofer FDs.
+func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
+	go func() {
+		log.Debugf("Monitoring gofer health for container %q", cid)
+		var events []unix.PollFd
+		for _, fd := range goferFDs {
+			events = append(events, unix.PollFd{
+				Fd:     int32(fd),
+				Events: unix.POLLHUP | unix.POLLRDHUP,
+			})
+		}
+		_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
+			// Use ppoll instead of poll because it's already whilelisted in seccomp.
+			n, err := unix.Ppoll(events, nil, nil)
+			return uintptr(n), 0, err
+		})
+		if err != nil {
+			panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
+		}
+
+		// Check if the gofer has stopped as part of normal container destruction.
+		// This is done just to avoid sending an annoying error message to the log.
+		// Note that there is a small race window in between mu.Unlock() and the
+		// lock being reacquired in destroyContainer(), but it's harmless to call
+		// destroyContainer() multiple times.
+		l.mu.Lock()
+		_, ok := l.processes[execID{cid: cid}]
+		l.mu.Unlock()
+		if ok {
+			log.Infof("Gofer socket disconnected, destroying container %q", cid)
+			if err := l.destroyContainer(cid); err != nil {
+				log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
+			}
+		}
+	}()
+}
+
 // destroyContainer stops a container if it is still running and cleans up its
 // filesystem.
 func (l *Loader) destroyContainer(cid string) error {
 	l.mu.Lock()
 	defer l.mu.Unlock()
 
-	// Has the container started?
-	_, _, err := l.threadGroupFromIDLocked(execID{cid: cid})
+	_, _, started, err := l.threadGroupFromIDLocked(execID{cid: cid})
+	if err != nil {
+		// Container doesn't exist.
+		return err
+	}
 
-	// If the container has started, kill and wait for all processes.
-	if err == nil {
+	// The container exists, has it been started?
+	if started {
 		if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
 			return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
 		}
@@ -754,27 +790,22 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
 	l.mu.Lock()
 	defer l.mu.Unlock()
 
-	tg, _, err := l.threadGroupFromIDLocked(execID{cid: args.ContainerID})
+	tg, _, started, err := l.threadGroupFromIDLocked(execID{cid: args.ContainerID})
 	if err != nil {
-		return 0, fmt.Errorf("no such container: %q", args.ContainerID)
+		return 0, err
+	}
+	if !started {
+		return 0, fmt.Errorf("container %q not started", args.ContainerID)
 	}
 
-	// Get the container Root Dirent and MountNamespace from the Task.
+	// Get the container MountNamespace from the Task.
 	tg.Leader().WithMuLocked(func(t *kernel.Task) {
-		// FSContext.RootDirectory() will take an extra ref for us.
-		args.Root = t.FSContext().RootDirectory()
-
 		// task.MountNamespace() does not take a ref, so we must do so
 		// ourselves.
 		args.MountNamespace = t.MountNamespace()
 		args.MountNamespace.IncRef()
 	})
-	defer func() {
-		if args.Root != nil {
-			args.Root.DecRef()
-		}
-		args.MountNamespace.DecRef()
-	}()
+	defer args.MountNamespace.DecRef()
 
 	// Start the process.
 	proc := control.Proc{Kernel: l.k}
@@ -895,6 +926,8 @@ func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
 			return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err)
 		}
 
+		s.FillDefaultIPTables()
+
 		return &s, nil
 
 	default:
@@ -1026,20 +1059,28 @@ func (l *Loader) signalAllProcesses(cid string, signo int32) error {
 func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) {
 	l.mu.Lock()
 	defer l.mu.Unlock()
-	return l.threadGroupFromIDLocked(key)
+	tg, tty, ok, err := l.threadGroupFromIDLocked(key)
+	if err != nil {
+		return nil, nil, err
+	}
+	if !ok {
+		return nil, nil, fmt.Errorf("container %q not started", key.cid)
+	}
+	return tg, tty, nil
 }
 
 // threadGroupFromIDLocked returns the thread group and TTY for the given
 // execution ID. TTY may be nil if the process is not attached to a terminal.
-// Returns error if execution ID is invalid or if container/process has not
-// started yet. Caller must hold 'mu'.
-func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) {
+// Also returns a boolean indicating whether the container has already started.
+// Returns error if execution ID is invalid or if the container cannot be
+// found (maybe it has been deleted). Caller must hold 'mu'.
+func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, bool, error) {
 	ep := l.processes[key]
 	if ep == nil {
-		return nil, nil, fmt.Errorf("container not found")
+		return nil, nil, false, fmt.Errorf("container %q not found", key.cid)
 	}
 	if ep.tg == nil {
-		return nil, nil, fmt.Errorf("container not started")
+		return nil, nil, false, nil
 	}
-	return ep.tg, ep.tty, nil
+	return ep.tg, ep.tty, true, nil
 }
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index ff713660d..147ff7703 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -401,17 +401,16 @@ func TestCreateMountNamespace(t *testing.T) {
 			}
 			defer cleanup()
 
-			// setupRootContainer needs to find root from the context after the
-			// namespace is created.
-			var mns *fs.MountNamespace
-			setMountNS := func(m *fs.MountNamespace) {
-				mns = m
-				ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root())
+			mntr := newContainerMounter(&tc.spec, []int{sandEnd}, nil, &podMountHints{})
+			mns, err := mntr.createMountNamespace(ctx, conf)
+			if err != nil {
+				t.Fatalf("failed to create mount namespace: %v", err)
 			}
-			mntr := newContainerMounter(&tc.spec, "", []int{sandEnd}, nil, &podMountHints{})
-			if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil {
-				t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err)
+			ctx = fs.WithRoot(ctx, mns.Root())
+			if err := mntr.mountSubmounts(ctx, conf, mns); err != nil {
+				t.Fatalf("failed to create mount namespace: %v", err)
 			}
+
 			root := mns.Root()
 			defer root.DecRef()
 			for _, p := range tc.expectedPaths {
@@ -614,7 +613,7 @@ func TestRestoreEnvironment(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			conf := testConfig()
-			mntr := newContainerMounter(tc.spec, "", tc.ioFDs, nil, &podMountHints{})
+			mntr := newContainerMounter(tc.spec, tc.ioFDs, nil, &podMountHints{})
 			actualRenv, err := mntr.createRestoreEnvironment(conf)
 			if !tc.errorExpected && err != nil {
 				t.Fatalf("could not create restore environment for test:%s", tc.name)
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index d3d98243d..ea0d9f790 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -38,8 +38,7 @@ type Network struct {
 
 // Route represents a route in the network stack.
 type Route struct {
-	Destination net.IP
-	Mask        net.IPMask
+	Destination net.IPNet
 	Gateway     net.IP
 }
 
@@ -85,16 +84,19 @@ type CreateLinksAndRoutesArgs struct {
 
 // Empty returns true if route hasn't been set.
 func (r *Route) Empty() bool {
-	return r.Destination == nil && r.Mask == nil && r.Gateway == nil
+	return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil
 }
 
-func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route {
+func (r *Route) toTcpipRoute(id tcpip.NICID) (tcpip.Route, error) {
+	subnet, err := tcpip.NewSubnet(ipToAddress(r.Destination.IP), ipMaskToAddressMask(r.Destination.Mask))
+	if err != nil {
+		return tcpip.Route{}, err
+	}
 	return tcpip.Route{
-		Destination: ipToAddress(r.Destination),
+		Destination: subnet,
 		Gateway:     ipToAddress(r.Gateway),
-		Mask:        ipToAddressMask(net.IP(r.Mask)),
 		NIC:         id,
-	}
+	}, nil
 }
 
 // CreateLinksAndRoutes creates links and routes in a network stack.  It should
@@ -128,7 +130,11 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 
 		// Collect the routes from this link.
 		for _, r := range link.Routes {
-			routes = append(routes, r.toTcpipRoute(nicID))
+			route, err := r.toTcpipRoute(nicID)
+			if err != nil {
+				return err
+			}
+			routes = append(routes, route)
 		}
 	}
 
@@ -170,7 +176,11 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 
 		// Collect the routes from this link.
 		for _, r := range link.Routes {
-			routes = append(routes, r.toTcpipRoute(nicID))
+			route, err := r.toTcpipRoute(nicID)
+			if err != nil {
+				return err
+			}
+			routes = append(routes, route)
 		}
 	}
 
@@ -179,7 +189,11 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
 		if !ok {
 			return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name)
 		}
-		routes = append(routes, args.DefaultGateway.Route.toTcpipRoute(nicID))
+		route, err := args.DefaultGateway.Route.toTcpipRoute(nicID)
+		if err != nil {
+			return err
+		}
+		routes = append(routes, route)
 	}
 
 	log.Infof("Setting routes %+v", routes)
@@ -230,8 +244,8 @@ func ipToAddress(ip net.IP) tcpip.Address {
 	return addr
 }
 
-// ipToAddressMask converts IP to tcpip.AddressMask, ignoring the protocol.
-func ipToAddressMask(ip net.IP) tcpip.AddressMask {
-	_, addr := ipToAddressAndProto(ip)
-	return tcpip.AddressMask(addr)
+// ipMaskToAddressMask converts IPMask to tcpip.AddressMask, ignoring the
+// protocol.
+func ipMaskToAddressMask(ipMask net.IPMask) tcpip.AddressMask {
+	return tcpip.AddressMask(ipToAddress(net.IP(ipMask)))
 }
diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go
index 834003430..906baf3e5 100644
--- a/runsc/boot/user_test.go
+++ b/runsc/boot/user_test.go
@@ -164,13 +164,13 @@ func TestGetExecUserHome(t *testing.T) {
 				},
 			}
 
-			var mns *fs.MountNamespace
-			setMountNS := func(m *fs.MountNamespace) {
-				mns = m
-				ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root())
+			mntr := newContainerMounter(spec, []int{sandEnd}, nil, &podMountHints{})
+			mns, err := mntr.createMountNamespace(ctx, conf)
+			if err != nil {
+				t.Fatalf("failed to create mount namespace: %v", err)
 			}
-			mntr := newContainerMounter(spec, "", []int{sandEnd}, nil, &podMountHints{})
-			if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil {
+			ctx = fs.WithRoot(ctx, mns.Root())
+			if err := mntr.mountSubmounts(ctx, conf, mns); err != nil {
 				t.Fatalf("failed to create mount namespace: %v", err)
 			}