Diffstat (limited to 'runsc')
 runsc/boot/BUILD                         |   1
 runsc/boot/compat.go                     |   2
 runsc/boot/controller.go                 |  96
 runsc/boot/filter/config.go              |  94
 runsc/boot/fs.go                         |  18
 runsc/boot/loader.go                     | 110
 runsc/boot/network.go                    |  39
 runsc/boot/vfs.go                        |  70
 runsc/cgroup/cgroup.go                   |  59
 runsc/cgroup/cgroup_test.go              |   5
 runsc/cli/main.go                        |   4
 runsc/cmd/BUILD                          |   2
 runsc/cmd/checkpoint.go                  |   2
 runsc/cmd/debug.go                       | 247
 runsc/cmd/delete.go                      |   2
 runsc/cmd/do.go                          |   2
 runsc/cmd/events.go                      |   2
 runsc/cmd/exec.go                        |  33
 runsc/cmd/kill.go                        |   2
 runsc/cmd/list.go                        |   8
 runsc/cmd/pause.go                       |   2
 runsc/cmd/ps.go                          |   2
 runsc/cmd/resume.go                      |   2
 runsc/cmd/start.go                       |   2
 runsc/cmd/state.go                       |   2
 runsc/cmd/symbolize.go                   |  91
 runsc/cmd/syscalls.go                    |   4
 runsc/cmd/wait.go                        |   2
 runsc/config/config.go                   |   2
 runsc/config/flags.go                    |   2
 runsc/console/console.go                 |   4
 runsc/container/BUILD                    |   5
 runsc/container/console_test.go          | 205
 runsc/container/container.go             | 220
 runsc/container/container_test.go        |  56
 runsc/container/multi_container_test.go  | 147
 runsc/container/state_file.go            | 236
 runsc/flag/flag.go                       |   3
 runsc/fsgofer/BUILD                      |   3
 runsc/fsgofer/fsgofer.go                 |  97
 runsc/fsgofer/fsgofer_test.go            | 217
 runsc/sandbox/network.go                 |  27
 runsc/sandbox/sandbox.go                 | 140
 runsc/specutils/specutils.go             |  25
 44 files changed, 1623 insertions(+), 671 deletions(-)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 8c73dc5dc..67307ab3c 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -33,6 +33,7 @@ go_library(
"//pkg/cpuid",
"//pkg/eventchannel",
"//pkg/fd",
+ "//pkg/flipcall",
"//pkg/fspath",
"//pkg/log",
"//pkg/memutil",
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 7076ae2e2..a3a76b609 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -53,7 +53,7 @@ type compatEmitter struct {
func newCompatEmitter(logFD int) (*compatEmitter, error) {
nameMap, ok := getSyscallNameMap()
if !ok {
- return nil, fmt.Errorf("Linux syscall table not found")
+ return nil, fmt.Errorf("syscall table not found")
}
c := &compatEmitter{
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index fdf13c8e1..cb5d8ea31 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -104,13 +104,11 @@ const (
// Profiling related commands (see pprof.go for more details).
const (
- StartCPUProfile = "Profile.StartCPUProfile"
- StopCPUProfile = "Profile.StopCPUProfile"
- HeapProfile = "Profile.HeapProfile"
- BlockProfile = "Profile.BlockProfile"
- MutexProfile = "Profile.MutexProfile"
- StartTrace = "Profile.StartTrace"
- StopTrace = "Profile.StopTrace"
+ CPUProfile = "Profile.CPU"
+ HeapProfile = "Profile.Heap"
+ BlockProfile = "Profile.Block"
+ MutexProfile = "Profile.Mutex"
+ Trace = "Profile.Trace"
)
// Logging related commands (see logging.go for more details).
@@ -131,9 +129,6 @@ type controller struct {
// manager holds the containerManager methods.
manager *containerManager
-
- // pprop holds the profile instance if enabled. It may be nil.
- pprof *control.Profile
}
// newController creates a new controller. The caller must call
@@ -164,19 +159,14 @@ func newController(fd int, l *Loader) (*controller, error) {
ctrl.srv.Register(&control.Logging{})
if l.root.conf.ProfileEnable {
- ctrl.pprof = &control.Profile{Kernel: l.k}
- ctrl.srv.Register(ctrl.pprof)
+ ctrl.srv.Register(control.NewProfile(l.k))
}
return ctrl, nil
}
func (c *controller) stop() {
- if c.pprof != nil {
- // These are noop if there is nothing being profiled.
- _ = c.pprof.StopCPUProfile(nil, nil)
- _ = c.pprof.StopTrace(nil, nil)
- }
+ c.srv.Stop()
}
// containerManager manages sandbox containers.
@@ -211,10 +201,31 @@ func (cm *containerManager) Processes(cid *string, out *[]*control.Process) erro
return control.Processes(cm.l.k, *cid, out)
}
+// CreateArgs contains arguments to the Create method.
+type CreateArgs struct {
+ // CID is the ID of the container to start.
+ CID string
+
+ // FilePayload may contain a TTY file for the terminal, if enabled.
+ urpc.FilePayload
+}
+
// Create creates a container within a sandbox.
-func (cm *containerManager) Create(cid *string, _ *struct{}) error {
- log.Debugf("containerManager.Create, cid: %s", *cid)
- return cm.l.createContainer(*cid)
+func (cm *containerManager) Create(args *CreateArgs, _ *struct{}) error {
+ log.Debugf("containerManager.Create: %s", args.CID)
+
+ if len(args.Files) > 1 {
+ return fmt.Errorf("start arguments must have at most 1 files for TTY")
+ }
+ var tty *fd.FD
+ if len(args.Files) == 1 {
+ var err error
+ tty, err = fd.NewFromFile(args.Files[0])
+ if err != nil {
+ return fmt.Errorf("error dup'ing TTY file: %w", err)
+ }
+ }
+ return cm.l.createContainer(args.CID, tty)
}
// StartArgs contains arguments to the Start method.
@@ -229,9 +240,8 @@ type StartArgs struct {
CID string
// FilePayload contains, in order:
- // * stdin, stdout, and stderr.
- // * the file descriptor over which the sandbox will
- // request files from its root filesystem.
+ // * stdin, stdout, and stderr (optional: if terminal is disabled).
+ // * file descriptors to connect to gofer to serve the root filesystem.
urpc.FilePayload
}
@@ -251,23 +261,45 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
if args.CID == "" {
return errors.New("start argument missing container ID")
}
- if len(args.FilePayload.Files) < 4 {
- return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer")
+ if len(args.Files) < 1 {
+ return fmt.Errorf("start arguments must contain at least one file for the container root gofer")
}
// All validation passed, logs the spec for debugging.
specutils.LogSpec(args.Spec)
- fds, err := fd.NewFromFiles(args.FilePayload.Files)
+ goferFiles := args.Files
+ var stdios []*fd.FD
+ if !args.Spec.Process.Terminal {
+ // When not using a terminal, stdios come as the first 3 files in the
+ // payload.
+ if l := len(args.Files); l < 4 {
+ return fmt.Errorf("start arguments (len: %d) must contain stdios and files for the container root gofer", l)
+ }
+ var err error
+ stdios, err = fd.NewFromFiles(goferFiles[:3])
+ if err != nil {
+ return fmt.Errorf("error dup'ing stdio files: %w", err)
+ }
+ goferFiles = goferFiles[3:]
+ }
+ defer func() {
+ for _, fd := range stdios {
+ _ = fd.Close()
+ }
+ }()
+
+ goferFDs, err := fd.NewFromFiles(goferFiles)
if err != nil {
- return err
+ return fmt.Errorf("error dup'ing gofer files: %w", err)
}
defer func() {
- for _, fd := range fds {
+ for _, fd := range goferFDs {
_ = fd.Close()
}
}()
- if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, fds); err != nil {
+
+ if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, stdios, goferFDs); err != nil {
log.Debugf("containerManager.Start failed, cid: %s, args: %+v, err: %v", args.CID, args, err)
return err
}
@@ -330,18 +362,18 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
log.Debugf("containerManager.Restore")
var specFile, deviceFile *os.File
- switch numFiles := len(o.FilePayload.Files); numFiles {
+ switch numFiles := len(o.Files); numFiles {
case 2:
// The device file is donated to the platform.
// Can't take ownership away from os.File. dup them to get a new FD.
- fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd()))
+ fd, err := syscall.Dup(int(o.Files[1].Fd()))
if err != nil {
return fmt.Errorf("failed to dup file: %v", err)
}
deviceFile = os.NewFile(uintptr(fd), "platform device")
fallthrough
case 1:
- specFile = o.FilePayload.Files[0]
+ specFile = o.Files[0]
case 0:
return fmt.Errorf("at least one file must be passed to Restore")
default:
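The revised Create/Start contract above is easiest to see from the caller's side. Below is a minimal sketch (the buildStartFiles helper is hypothetical, not part of runsc) of assembling the urpc.FilePayload file list under the new rules: stdios are included only when the spec does not request a terminal, and the gofer FDs always come last.

package main

import (
	"fmt"
	"os"
)

// buildStartFiles sketches the payload ordering the revised Start RPC
// expects: stdin/stdout/stderr first (only when no terminal is used),
// followed by the gofer FDs that serve the container's root filesystem.
func buildStartFiles(terminal bool, stdin, stdout, stderr *os.File, gofer []*os.File) []*os.File {
	var files []*os.File
	if !terminal {
		// Without a TTY, the first three payload files must be the stdios.
		files = append(files, stdin, stdout, stderr)
	}
	// Gofer connection files always come last.
	return append(files, gofer...)
}

func main() {
	gofer := []*os.File{os.NewFile(3, "gofer")}
	files := buildStartFiles(false, os.Stdin, os.Stdout, os.Stderr, gofer)
	fmt.Println(len(files)) // 4: three stdios plus one gofer file
}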
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index a7c4ebb0c..eacd73531 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -343,6 +343,21 @@ func hostInetFilters() seccomp.SyscallRules {
},
{
seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_PKTINFO),
+ },
+ {
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_RECVORIGDSTADDR),
+ },
+ {
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_RECVERR),
+ },
+ {
+ seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_IPV6),
seccomp.EqualTo(syscall.IPV6_TCLASS),
},
@@ -354,10 +369,20 @@ func hostInetFilters() seccomp.SyscallRules {
{
seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_RECVERR),
+ },
+ {
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
seccomp.EqualTo(syscall.IPV6_V6ONLY),
},
{
seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(linux.IPV6_RECVORIGDSTADDR),
+ },
+ {
+ seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_SOCKET),
seccomp.EqualTo(syscall.SO_ERROR),
},
@@ -393,6 +418,11 @@ func hostInetFilters() seccomp.SyscallRules {
},
{
seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_TIMESTAMP),
+ },
+ {
+ seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_TCP),
seccomp.EqualTo(syscall.TCP_NODELAY),
},
@@ -401,6 +431,11 @@ func hostInetFilters() seccomp.SyscallRules {
seccomp.EqualTo(syscall.SOL_TCP),
seccomp.EqualTo(syscall.TCP_INFO),
},
+ {
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_TCP),
+ seccomp.EqualTo(linux.TCP_INQ),
+ },
},
syscall.SYS_IOCTL: []seccomp.Rule{
{
@@ -421,29 +456,29 @@ func hostInetFilters() seccomp.SyscallRules {
syscall.SYS_SETSOCKOPT: []seccomp.Rule{
{
seccomp.MatchAny{},
- seccomp.EqualTo(syscall.SOL_IPV6),
- seccomp.EqualTo(syscall.IPV6_V6ONLY),
+ seccomp.EqualTo(syscall.SOL_SOCKET),
+ seccomp.EqualTo(syscall.SO_SNDBUF),
seccomp.MatchAny{},
seccomp.EqualTo(4),
},
{
seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_SOCKET),
- seccomp.EqualTo(syscall.SO_SNDBUF),
+ seccomp.EqualTo(syscall.SO_RCVBUF),
seccomp.MatchAny{},
seccomp.EqualTo(4),
},
{
seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_SOCKET),
- seccomp.EqualTo(syscall.SO_RCVBUF),
+ seccomp.EqualTo(syscall.SO_REUSEADDR),
seccomp.MatchAny{},
seccomp.EqualTo(4),
},
{
seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_SOCKET),
- seccomp.EqualTo(syscall.SO_REUSEADDR),
+ seccomp.EqualTo(syscall.SO_TIMESTAMP),
seccomp.MatchAny{},
seccomp.EqualTo(4),
},
@@ -456,6 +491,13 @@ func hostInetFilters() seccomp.SyscallRules {
},
{
seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_TCP),
+ seccomp.EqualTo(linux.TCP_INQ),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
+ },
+ {
+ seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_IP),
seccomp.EqualTo(syscall.IP_TOS),
seccomp.MatchAny{},
@@ -470,6 +512,27 @@ func hostInetFilters() seccomp.SyscallRules {
},
{
seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_PKTINFO),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
+ },
+ {
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_RECVORIGDSTADDR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
+ },
+ {
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IP),
+ seccomp.EqualTo(syscall.IP_RECVERR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
+ },
+ {
+ seccomp.MatchAny{},
seccomp.EqualTo(syscall.SOL_IPV6),
seccomp.EqualTo(syscall.IPV6_TCLASS),
seccomp.MatchAny{},
@@ -482,6 +545,27 @@ func hostInetFilters() seccomp.SyscallRules {
seccomp.MatchAny{},
seccomp.EqualTo(4),
},
+ {
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(linux.IPV6_RECVORIGDSTADDR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
+ },
+ {
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_RECVERR),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
+ },
+ {
+ seccomp.MatchAny{},
+ seccomp.EqualTo(syscall.SOL_IPV6),
+ seccomp.EqualTo(syscall.IPV6_V6ONLY),
+ seccomp.MatchAny{},
+ seccomp.EqualTo(4),
+ },
},
syscall.SYS_SHUTDOWN: []seccomp.Rule{
{
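Each seccomp.Rule in this file is positional: element i constrains syscall argument i. Read that way, the new setsockopt entries allow a fixed (level, optname) pair on any socket fd, with an unconstrained optval pointer and a mandatory 4-byte optlen. An annotated copy of one of the rules added above, assuming the same seccomp and linux packages this file already imports:

package example

import (
	"syscall"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/seccomp"
)

// tcpINQRule mirrors the setsockopt rule for TCP_INQ added above.
var tcpINQRule = seccomp.Rule{
	seccomp.MatchAny{},               // arg0: any socket fd
	seccomp.EqualTo(syscall.SOL_TCP), // arg1: level must be SOL_TCP
	seccomp.EqualTo(linux.TCP_INQ),   // arg2: optname must be TCP_INQ
	seccomp.MatchAny{},               // arg3: optval pointer, unconstrained
	seccomp.EqualTo(4),               // arg4: optlen must be exactly 4
}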
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 6b6ae98d7..2b0d2cd51 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -22,15 +22,6 @@ import (
"strings"
"syscall"
- // Include filesystem types that OCI spec might mount.
- _ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/host"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
- _ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
-
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -48,9 +39,18 @@ import (
tmpfsvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/specutils"
+
+ // Include filesystem types that OCI spec might mount.
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/host"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
)
const (
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index ebdd518d0..d37528ee7 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -75,12 +75,14 @@ import (
"gvisor.dev/gvisor/runsc/specutils"
"gvisor.dev/gvisor/runsc/specutils/seccomp"
- // Include supported socket providers.
+ // Top-level inet providers.
"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
+ "gvisor.dev/gvisor/pkg/sentry/socket/netstack"
+
+ // Include other supported socket providers.
_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink"
_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route"
_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent"
- "gvisor.dev/gvisor/pkg/sentry/socket/netstack"
_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
)
@@ -157,6 +159,11 @@ type execProcess struct {
// pidnsPath is the pid namespace path in spec
pidnsPath string
+
+ // hostTTY is present when creating a sub-container with terminal enabled.
+ // TTY file is passed during container create and must be saved until
+ // container start.
+ hostTTY *fd.FD
}
func init() {
@@ -289,7 +296,7 @@ func New(args Args) (*Loader, error) {
if args.TotalMem > 0 {
// Adjust the total memory returned by the Sentry so that applications that
// use /proc/meminfo can make allocations based on this limit.
- usage.MinimumTotalMemoryBytes = args.TotalMem
+ usage.MaximumTotalMemoryBytes = args.TotalMem
log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
}
@@ -435,6 +442,10 @@ func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *
if err != nil {
return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err)
}
+ env, err := specutils.ResolveEnvs(spec.Process.Env)
+ if err != nil {
+ return kernel.CreateProcessArgs{}, fmt.Errorf("resolving env: %w", err)
+ }
wd := spec.Process.Cwd
if wd == "" {
@@ -444,7 +455,7 @@ func createProcessArgs(id string, spec *specs.Spec, creds *auth.Credentials, k *
// Create the process arguments.
procArgs := kernel.CreateProcessArgs{
Argv: spec.Process.Args,
- Envv: spec.Process.Env,
+ Envv: env,
WorkingDirectory: wd,
Credentials: creds,
Umask: 0022,
@@ -588,10 +599,11 @@ func (l *Loader) run() error {
// Create the root container init task. It will begin running
// when the kernel is started.
- if _, err := l.createContainerProcess(true, l.sandboxID, &l.root, ep); err != nil {
+ var err error
+ _, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(true, l.sandboxID, &l.root)
+ if err != nil {
return err
}
-
}
ep.tg = l.k.GlobalInit()
@@ -627,7 +639,7 @@ func (l *Loader) run() error {
}
// createContainer creates a new container inside the sandbox.
-func (l *Loader) createContainer(cid string) error {
+func (l *Loader) createContainer(cid string, tty *fd.FD) error {
l.mu.Lock()
defer l.mu.Unlock()
@@ -635,14 +647,14 @@ func (l *Loader) createContainer(cid string) error {
if _, ok := l.processes[eid]; ok {
return fmt.Errorf("container %q already exists", cid)
}
- l.processes[eid] = &execProcess{}
+ l.processes[eid] = &execProcess{hostTTY: tty}
return nil
}
// startContainer starts a child container. It returns the thread group ID of
// the newly created process. Used FDs are either closed or released. It's safe
// for the caller to close any remaining files upon return.
-func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, files []*fd.FD) error {
+func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid string, stdioFDs, goferFDs []*fd.FD) error {
// Create capabilities.
caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
if err != nil {
@@ -695,36 +707,41 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *config.Config, cid strin
info := &containerInfo{
conf: conf,
spec: spec,
- stdioFDs: files[:3],
- goferFDs: files[3:],
+ goferFDs: goferFDs,
}
info.procArgs, err = createProcessArgs(cid, spec, creds, l.k, pidns)
if err != nil {
return fmt.Errorf("creating new process: %v", err)
}
- tg, err := l.createContainerProcess(false, cid, info, ep)
+
+ // Use stdios or TTY depending on the spec configuration.
+ if spec.Process.Terminal {
+ if len(stdioFDs) > 0 {
+ return fmt.Errorf("using TTY, stdios not expected: %v", stdioFDs)
+ }
+ if ep.hostTTY == nil {
+ return fmt.Errorf("terminal enabled but no TTY provided. Did you set --console-socket on create?")
+ }
+ info.stdioFDs = []*fd.FD{ep.hostTTY, ep.hostTTY, ep.hostTTY}
+ ep.hostTTY = nil
+ } else {
+ info.stdioFDs = stdioFDs
+ }
+
+ ep.tg, ep.tty, ep.ttyVFS2, err = l.createContainerProcess(false, cid, info)
if err != nil {
return err
}
-
- // Success!
- l.k.StartProcess(tg)
- ep.tg = tg
+ l.k.StartProcess(ep.tg)
return nil
}
-func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo, ep *execProcess) (*kernel.ThreadGroup, error) {
- console := false
- if root {
- // Only root container supports terminal for now.
- console = info.spec.Process.Terminal
- }
-
+func (l *Loader) createContainerProcess(root bool, cid string, info *containerInfo) (*kernel.ThreadGroup, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
// Create the FD map, which will set stdin, stdout, and stderr.
ctx := info.procArgs.NewContext(l.k)
- fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, console, info.stdioFDs)
+ fdTable, ttyFile, ttyFileVFS2, err := createFDTable(ctx, info.spec.Process.Terminal, info.stdioFDs)
if err != nil {
- return nil, fmt.Errorf("importing fds: %v", err)
+ return nil, nil, nil, fmt.Errorf("importing fds: %v", err)
}
// CreateProcess takes a reference on fdTable if successful. We won't need
// ours either way.
@@ -736,11 +753,11 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints)
if root {
if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil {
- return nil, err
+ return nil, nil, nil, err
}
}
if err := setupContainerFS(ctx, info.conf, mntr, &info.procArgs); err != nil {
- return nil, err
+ return nil, nil, nil, err
}
// Add the HOME environment variable if it is not already set.
@@ -754,29 +771,25 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
info.procArgs.Credentials.RealKUID, info.procArgs.Envv)
}
if err != nil {
- return nil, err
+ return nil, nil, nil, err
}
info.procArgs.Envv = envv
// Create and start the new process.
tg, _, err := l.k.CreateProcess(info.procArgs)
if err != nil {
- return nil, fmt.Errorf("creating process: %v", err)
+ return nil, nil, nil, fmt.Errorf("creating process: %v", err)
}
// CreateProcess takes a reference on FDTable if successful.
info.procArgs.FDTable.DecRef(ctx)
// Set the foreground process group on the TTY to the global init process
// group, since that is what we are about to start running.
- if root {
- switch {
- case ttyFileVFS2 != nil:
- ep.ttyVFS2 = ttyFileVFS2
- ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
- case ttyFile != nil:
- ep.tty = ttyFile
- ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
- }
+ switch {
+ case ttyFileVFS2 != nil:
+ ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
+ case ttyFile != nil:
+ ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
}
// Install seccomp filters with the new task if there are any.
@@ -784,7 +797,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
if info.spec.Linux != nil && info.spec.Linux.Seccomp != nil {
program, err := seccomp.BuildProgram(info.spec.Linux.Seccomp)
if err != nil {
- return nil, fmt.Errorf("building seccomp program: %v", err)
+ return nil, nil, nil, fmt.Errorf("building seccomp program: %v", err)
}
if log.IsLogging(log.Debug) {
@@ -795,7 +808,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
task := tg.Leader()
// NOTE: It seems Flags are ignored by runc so we ignore them too.
if err := task.AppendSyscallFilter(program, true); err != nil {
- return nil, fmt.Errorf("appending seccomp filters: %v", err)
+ return nil, nil, nil, fmt.Errorf("appending seccomp filters: %v", err)
}
}
} else {
@@ -804,7 +817,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn
}
}
- return tg, nil
+ return tg, ttyFile, ttyFileVFS2, nil
}
// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
@@ -926,6 +939,11 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
}
}
+ args.Envv, err = specutils.ResolveEnvs(args.Envv)
+ if err != nil {
+ return 0, fmt.Errorf("resolving env: %w", err)
+ }
+
// Add the HOME environment variable if it is not already set.
if kernel.VFS2Enabled {
root := args.MountNamespaceVFS2.Root()
@@ -1037,9 +1055,10 @@ func (l *Loader) WaitExit() kernel.ExitStatus {
// Wait for container.
l.k.WaitExited()
- // Cleanup
+ // Stop the control server.
l.ctrl.stop()
+ // Check all references.
refs.OnExit()
return l.k.GlobalInit().ExitStatus()
@@ -1074,7 +1093,12 @@ func newRootNetworkNamespace(conf *config.Config, clock tcpip.Clock, uniqueID st
func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
netProtos := []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol, arp.NewProtocol}
- transProtos := []stack.TransportProtocolFactory{tcp.NewProtocol, udp.NewProtocol, icmp.NewProtocol4}
+ transProtos := []stack.TransportProtocolFactory{
+ tcp.NewProtocol,
+ udp.NewProtocol,
+ icmp.NewProtocol4,
+ icmp.NewProtocol6,
+ }
s := netstack.Stack{stack.New(stack.Options{
NetworkProtocols: netProtos,
TransportProtocols: transProtos,
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 988573640..3d3a813df 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -28,7 +28,6 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/link/packetsocket"
"gvisor.dev/gvisor/pkg/tcpip/link/qdisc/fifo"
"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
- "gvisor.dev/gvisor/pkg/tcpip/network/arp"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -41,9 +40,9 @@ var (
// "::1/8" on "lo" interface.
DefaultLoopbackLink = LoopbackLink{
Name: "lo",
- Addresses: []net.IP{
- net.IP("\x7f\x00\x00\x01"),
- net.IPv6loopback,
+ Addresses: []IPWithPrefix{
+ {Address: net.IP("\x7f\x00\x00\x01"), PrefixLen: 8},
+ {Address: net.IPv6loopback, PrefixLen: 128},
},
Routes: []Route{
{
@@ -83,7 +82,7 @@ type DefaultRoute struct {
type FDBasedLink struct {
Name string
MTU int
- Addresses []net.IP
+ Addresses []IPWithPrefix
Routes []Route
GSOMaxSize uint32
SoftwareGSOEnabled bool
@@ -100,7 +99,7 @@ type FDBasedLink struct {
// LoopbackLink configures a loopback link.
type LoopbackLink struct {
Name string
- Addresses []net.IP
+ Addresses []IPWithPrefix
Routes []Route
}
@@ -118,6 +117,19 @@ type CreateLinksAndRoutesArgs struct {
Defaultv6Gateway DefaultRoute
}
+// IPWithPrefix is an address with its subnet prefix length.
+type IPWithPrefix struct {
+ // Address is a network address.
+ Address net.IP
+
+ // PrefixLen is the subnet prefix length.
+ PrefixLen int
+}
+
+func (ip IPWithPrefix) String() string {
+ return fmt.Sprintf("%s/%d", ip.Address, ip.PrefixLen)
+}
+
// Empty returns true if route hasn't been set.
func (r *Route) Empty() bool {
return r.Destination.IP == nil && r.Destination.Mask == nil && r.Gateway == nil
@@ -265,20 +277,19 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
// createNICWithAddrs creates a NIC in the network stack and adds the given
// addresses.
-func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error {
+func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []IPWithPrefix) error {
opts := stack.NICOptions{Name: name}
if err := n.Stack.CreateNICWithOptions(id, sniffer.New(ep), opts); err != nil {
return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err)
}
- // Always start with an arp address for the NIC.
- if err := n.Stack.AddAddress(id, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
- return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, arp.ProtocolNumber, arp.ProtocolAddress, err)
- }
-
for _, addr := range addrs {
- proto, tcpipAddr := ipToAddressAndProto(addr)
- if err := n.Stack.AddAddress(id, proto, tcpipAddr); err != nil {
+ proto, tcpipAddr := ipToAddressAndProto(addr.Address)
+ ap := tcpip.AddressWithPrefix{
+ Address: tcpipAddr,
+ PrefixLen: addr.PrefixLen,
+ }
+ if err := n.Stack.AddAddressWithPrefix(id, proto, ap); err != nil {
return fmt.Errorf("AddAddress(%v, %v, %v) failed: %v", id, proto, tcpipAddr, err)
}
}
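Carrying the prefix length alongside each address is what lets the stack call AddAddressWithPrefix instead of assuming a default mask. A self-contained sketch (the IPWithPrefix copy here is local to the example) of deriving one from a CIDR string, as a network config front end might:

package main

import (
	"fmt"
	"net"
)

// IPWithPrefix mirrors the type added in boot/network.go.
type IPWithPrefix struct {
	Address   net.IP
	PrefixLen int
}

func main() {
	// Parse an address the way a runtime configuration might supply it.
	ip, ipnet, err := net.ParseCIDR("192.168.1.10/24")
	if err != nil {
		panic(err)
	}
	ones, _ := ipnet.Mask.Size()
	addr := IPWithPrefix{Address: ip, PrefixLen: ones}
	fmt.Printf("%s/%d\n", addr.Address, addr.PrefixLen) // 192.168.1.10/24
}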
diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go
index b157387ef..3fd28e516 100644
--- a/runsc/boot/vfs.go
+++ b/runsc/boot/vfs.go
@@ -250,36 +250,76 @@ func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Cre
overlayOpts := *lowerOpts
overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
- // Next mount upper and lower. Upper is a tmpfs mount to keep all
- // modifications inside the sandbox.
- upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
- if err != nil {
- return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
- }
- cu := cleanup.Make(func() { upper.DecRef(ctx) })
- defer cu.Clean()
-
// All writes go to the upper layer, be paranoid and make lower readonly.
lowerOpts.ReadOnly = true
lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
if err != nil {
return nil, nil, err
}
- cu.Add(func() { lower.DecRef(ctx) })
+ cu := cleanup.Make(func() { lower.DecRef(ctx) })
+ defer cu.Clean()
- // Propagate the lower layer's root's owner, group, and mode to the upper
- // layer's root for consistency with VFS1.
- upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root())
+ // Determine the lower layer's root's type.
lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root())
stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{
Root: lowerRootVD,
Start: lowerRootVD,
}, &vfs.StatOptions{
- Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE,
+ Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE | linux.STATX_TYPE,
})
if err != nil {
- return nil, nil, err
+ return nil, nil, fmt.Errorf("failed to stat lower layer's root: %v", err)
+ }
+ if stat.Mask&linux.STATX_TYPE == 0 {
+ return nil, nil, fmt.Errorf("failed to get file type of lower layer's root")
+ }
+ rootType := stat.Mode & linux.S_IFMT
+ if rootType != linux.S_IFDIR && rootType != linux.S_IFREG {
+ return nil, nil, fmt.Errorf("lower layer's root has unsupported file type %v", rootType)
+ }
+
+ // Upper is a tmpfs mount to keep all modifications inside the sandbox.
+ upperOpts.GetFilesystemOptions.InternalData = tmpfs.FilesystemOpts{
+ RootFileType: uint16(rootType),
+ }
+ upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
+ }
+ cu.Add(func() { upper.DecRef(ctx) })
+
+ // If the overlay mount consists of a regular file, copy up its contents
+ // from the lower layer, since in the overlay the otherwise-empty upper
+ // layer file will take precedence.
+ upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root())
+ if rootType == linux.S_IFREG {
+ lowerFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: lowerRootVD,
+ Start: lowerRootVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to open lower layer root for copying: %v", err)
+ }
+ defer lowerFD.DecRef(ctx)
+ upperFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: upperRootVD,
+ Start: upperRootVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_WRONLY,
+ })
+ if err != nil {
+ return nil, nil, fmt.Errorf("failed to open upper layer root for copying: %v", err)
+ }
+ defer upperFD.DecRef(ctx)
+ if _, err := vfs.CopyRegularFileData(ctx, upperFD, lowerFD); err != nil {
+ return nil, nil, fmt.Errorf("failed to copy up overlay file: %v", err)
+ }
}
+
+ // Propagate the lower layer's root's owner, group, and mode to the upper
+ // layer's root for consistency with VFS1.
err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{
Root: upperRootVD,
Start: upperRootVD,
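The copy-up above handles the corner case where the overlay's root is a regular file rather than a directory: the tmpfs upper layer starts out empty, and in an overlay the upper entry shadows the lower one, so the lower file's contents must be copied up before the mount is usable. Stripped of VFS internals, the same precedence fix is an ordinary pre-copy; a trivial standalone illustration using temp files:

package main

import (
	"fmt"
	"io"
	"os"
	"path/filepath"
)

func main() {
	dir := os.TempDir()
	lowerPath := filepath.Join(dir, "lower-root")
	upperPath := filepath.Join(dir, "upper-root")
	if err := os.WriteFile(lowerPath, []byte("rootfs image bytes"), 0644); err != nil {
		panic(err)
	}
	lower, err := os.Open(lowerPath)
	if err != nil {
		panic(err)
	}
	defer lower.Close()
	upper, err := os.Create(upperPath) // stands in for the tmpfs upper root
	if err != nil {
		panic(err)
	}
	defer upper.Close()
	// Seed the upper file from the lower one so the shadowing copy is
	// not empty, mirroring the overlay copy-up above.
	if _, err := io.Copy(upper, lower); err != nil {
		panic(err)
	}
	fmt.Println("upper seeded from lower; overlay reads now see the data")
}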
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
index 5bd0afc52..13c6a16a0 100644
--- a/runsc/cgroup/cgroup.go
+++ b/runsc/cgroup/cgroup.go
@@ -41,22 +41,22 @@ const (
)
var controllers = map[string]config{
- "blkio": config{ctrlr: &blockIO{}},
- "cpu": config{ctrlr: &cpu{}},
- "cpuset": config{ctrlr: &cpuSet{}},
- "hugetlb": config{ctrlr: &hugeTLB{}, optional: true},
- "memory": config{ctrlr: &memory{}},
- "net_cls": config{ctrlr: &networkClass{}},
- "net_prio": config{ctrlr: &networkPrio{}},
- "pids": config{ctrlr: &pids{}},
+ "blkio": {ctrlr: &blockIO{}},
+ "cpu": {ctrlr: &cpu{}},
+ "cpuset": {ctrlr: &cpuSet{}},
+ "hugetlb": {ctrlr: &hugeTLB{}, optional: true},
+ "memory": {ctrlr: &memory{}},
+ "net_cls": {ctrlr: &networkClass{}},
+ "net_prio": {ctrlr: &networkPrio{}},
+ "pids": {ctrlr: &pids{}},
// These controllers either don't have anything in the OCI spec or are
// irrelevant for a sandbox.
- "devices": config{ctrlr: &noop{}},
- "freezer": config{ctrlr: &noop{}},
- "perf_event": config{ctrlr: &noop{}},
- "rdma": config{ctrlr: &noop{}, optional: true},
- "systemd": config{ctrlr: &noop{}},
+ "devices": {ctrlr: &noop{}},
+ "freezer": {ctrlr: &noop{}},
+ "perf_event": {ctrlr: &noop{}},
+ "rdma": {ctrlr: &noop{}, optional: true},
+ "systemd": {ctrlr: &noop{}},
}
func setOptionalValueInt(path, name string, val *int64) error {
@@ -234,7 +234,7 @@ func loadPathsHelper(cgroup io.Reader) (map[string]string, error) {
type Cgroup struct {
Name string `json:"name"`
Parents map[string]string `json:"parents"`
- Own bool `json:"own"`
+ Own map[string]bool `json:"own"`
}
// New creates a new Cgroup instance if the spec includes a cgroup path.
@@ -251,9 +251,11 @@ func New(spec *specs.Spec) (*Cgroup, error) {
return nil, fmt.Errorf("finding current cgroups: %w", err)
}
}
+ own := make(map[string]bool)
return &Cgroup{
Name: spec.Linux.CgroupsPath,
Parents: parents,
+ Own: own,
}, nil
}
@@ -261,18 +263,8 @@ func New(spec *specs.Spec) (*Cgroup, error) {
// already exists, it means that the caller has already provided a
// pre-configured cgroups, and 'res' is ignored.
func (c *Cgroup) Install(res *specs.LinuxResources) error {
- if _, err := os.Stat(c.makePath("memory")); err == nil {
- // If cgroup has already been created; it has been setup by caller. Don't
- // make any changes to configuration, just join when sandbox/gofer starts.
- log.Debugf("Using pre-created cgroup %q", c.Name)
- return nil
- }
-
log.Debugf("Creating cgroup %q", c.Name)
- // Mark that cgroup resources are owned by me.
- c.Own = true
-
// The Cleanup object cleans up partially created cgroups when an error occurs.
// Errors occurring during cleanup itself are ignored.
clean := cleanup.Make(func() { _ = c.Uninstall() })
@@ -280,6 +272,16 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error {
for key, cfg := range controllers {
path := c.makePath(key)
+ if _, err := os.Stat(path); err == nil {
+ // If the cgroup has already been created, it was set up by the caller. Don't
+ // make any changes to configuration, just join when sandbox/gofer starts.
+ log.Debugf("Using pre-created cgroup %q", path)
+ continue
+ }
+
+ // Mark that cgroup resources are owned by me.
+ c.Own[key] = true
+
if err := os.MkdirAll(path, 0755); err != nil {
if cfg.optional && errors.Is(err, syscall.EROFS) {
log.Infof("Skipping cgroup %q", key)
@@ -298,12 +300,12 @@ func (c *Cgroup) Install(res *specs.LinuxResources) error {
// Uninstall removes the settings done in Install(). If cgroup path already
// existed when Install() was called, Uninstall is a noop.
func (c *Cgroup) Uninstall() error {
- if !c.Own {
- // cgroup is managed by caller, don't touch it.
- return nil
- }
log.Debugf("Deleting cgroup %q", c.Name)
for key := range controllers {
+ if !c.Own[key] {
+ // cgroup is managed by caller, don't touch it.
+ continue
+ }
path := c.makePath(key)
log.Debugf("Removing cgroup controller for key=%q path=%q", key, path)
@@ -369,6 +371,7 @@ func (c *Cgroup) Join() (func(), error) {
return undo, nil
}
+// CPUQuota returns the CFS CPU quota.
func (c *Cgroup) CPUQuota() (float64, error) {
path := c.makePath("cpu")
quota, err := getInt(path, "cpu.cfs_quota_us")
diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go
index 9794517a7..931144cf9 100644
--- a/runsc/cgroup/cgroup_test.go
+++ b/runsc/cgroup/cgroup_test.go
@@ -29,7 +29,10 @@ func TestUninstallEnoent(t *testing.T) {
c := Cgroup{
// set a non-existent name
Name: "runsc-test-uninstall-656e6f656e740a",
- Own: true,
+ }
+ c.Own = make(map[string]bool)
+ for key := range controllers {
+ c.Own[key] = true
}
if err := c.Uninstall(); err != nil {
t.Errorf("Uninstall() failed: %v", err)
diff --git a/runsc/cli/main.go b/runsc/cli/main.go
index bca015db5..6c3bf4d21 100644
--- a/runsc/cli/main.go
+++ b/runsc/cli/main.go
@@ -22,6 +22,7 @@ import (
"io/ioutil"
"os"
"os/signal"
+ "runtime"
"syscall"
"time"
@@ -82,6 +83,7 @@ func Main(version string) {
subcommands.Register(new(cmd.Spec), "")
subcommands.Register(new(cmd.State), "")
subcommands.Register(new(cmd.Start), "")
+ subcommands.Register(new(cmd.Symbolize), "")
subcommands.Register(new(cmd.Wait), "")
// Register internal commands with the internal group name. This causes
@@ -207,6 +209,8 @@ func Main(version string) {
log.Infof("***************************")
log.Infof("Args: %s", os.Args)
log.Infof("Version %s", version)
+ log.Infof("GOOS: %s", runtime.GOOS)
+ log.Infof("GOARCH: %s", runtime.GOARCH)
log.Infof("PID: %d", os.Getpid())
log.Infof("UID: %d, GID: %d", os.Getuid(), os.Getgid())
log.Infof("Configuration:")
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index 2556f6d9e..19520d7ab 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -32,6 +32,7 @@ go_library(
"start.go",
"state.go",
"statefile.go",
+ "symbolize.go",
"syscalls.go",
"wait.go",
],
@@ -39,6 +40,7 @@ go_library(
"//runsc:__subpackages__",
],
deps = [
+ "//pkg/coverage",
"//pkg/log",
"//pkg/p9",
"//pkg/sentry/control",
diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go
index c0bc8f064..124198239 100644
--- a/runsc/cmd/checkpoint.go
+++ b/runsc/cmd/checkpoint.go
@@ -75,7 +75,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa
conf := args[0].(*config.Config)
waitStatus := args[1].(*syscall.WaitStatus)
- cont, err := container.LoadAndCheck(conf.RootDir, id)
+ cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
if err != nil {
Fatalf("loading container: %v", err)
}
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index 609e8231c..b84142b0d 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -17,8 +17,10 @@ package cmd
import (
"context"
"os"
+ "os/signal"
"strconv"
"strings"
+ "sync"
"syscall"
"time"
@@ -43,6 +45,7 @@ type Debug struct {
strace string
logLevel string
logPackets string
+ delay time.Duration
duration time.Duration
ps bool
}
@@ -70,10 +73,11 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
f.StringVar(&d.profileBlock, "profile-block", "", "writes block profile to the given file.")
f.StringVar(&d.profileMutex, "profile-mutex", "", "writes mutex profile to the given file.")
- f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles")
+ f.DurationVar(&d.delay, "delay", time.Hour, "amount of time to delay for collecting heap and goroutine profiles.")
+ f.DurationVar(&d.duration, "duration", time.Hour, "amount of time to wait for CPU and trace profiles.")
f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.")
f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox")
- f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all`)
+ f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all.`)
f.StringVar(&d.logLevel, "log-level", "", "The log level to set: warning (0), info (1), or debug (2).")
f.StringVar(&d.logPackets, "log-packets", "", "A boolean value to enable or disable packet logging: true or false.")
f.BoolVar(&d.ps, "ps", false, "lists processes")
@@ -90,8 +94,10 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
f.Usage()
return subcommands.ExitUsageError
}
+ id := f.Arg(0)
+
var err error
- c, err = container.LoadAndCheck(conf.RootDir, f.Arg(0))
+ c, err = container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
if err != nil {
return Errorf("loading container %q: %v", f.Arg(0), err)
}
@@ -106,9 +112,10 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
return Errorf("listing containers: %v", err)
}
for _, id := range ids {
- candidate, err := container.LoadAndCheck(conf.RootDir, id)
+ candidate, err := container.Load(conf.RootDir, id, container.LoadOpts{Exact: true, SkipCheck: true})
if err != nil {
- return Errorf("loading container %q: %v", id, err)
+ log.Warningf("Skipping container %q: %v", id, err)
+ continue
}
if candidate.SandboxPid() == d.pid {
c = candidate
@@ -120,11 +127,12 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
}
- if c.Sandbox == nil || !c.Sandbox.IsRunning() {
+ if !c.IsSandboxRunning() {
return Errorf("container sandbox is not running")
}
log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid)
+ // Perform synchronous actions.
if d.signal > 0 {
log.Infof("Sending signal %d to process: %d", d.signal, c.Sandbox.Pid)
if err := syscall.Kill(c.Sandbox.Pid, syscall.Signal(d.signal)); err != nil {
@@ -139,81 +147,6 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
log.Infof(" *** Stack dump ***\n%s", stacks)
}
- if d.profileHeap != "" {
- f, err := os.Create(d.profileHeap)
- if err != nil {
- return Errorf(err.Error())
- }
- defer f.Close()
-
- if err := c.Sandbox.HeapProfile(f); err != nil {
- return Errorf(err.Error())
- }
- log.Infof("Heap profile written to %q", d.profileHeap)
- }
- if d.profileBlock != "" {
- f, err := os.Create(d.profileBlock)
- if err != nil {
- return Errorf(err.Error())
- }
- defer f.Close()
-
- if err := c.Sandbox.BlockProfile(f); err != nil {
- return Errorf(err.Error())
- }
- log.Infof("Block profile written to %q", d.profileBlock)
- }
- if d.profileMutex != "" {
- f, err := os.Create(d.profileMutex)
- if err != nil {
- return Errorf(err.Error())
- }
- defer f.Close()
-
- if err := c.Sandbox.MutexProfile(f); err != nil {
- return Errorf(err.Error())
- }
- log.Infof("Mutex profile written to %q", d.profileMutex)
- }
-
- delay := false
- if d.profileCPU != "" {
- delay = true
- f, err := os.Create(d.profileCPU)
- if err != nil {
- return Errorf(err.Error())
- }
- defer func() {
- f.Close()
- if err := c.Sandbox.StopCPUProfile(); err != nil {
- Fatalf(err.Error())
- }
- log.Infof("CPU profile written to %q", d.profileCPU)
- }()
- if err := c.Sandbox.StartCPUProfile(f); err != nil {
- return Errorf(err.Error())
- }
- log.Infof("CPU profile started for %v, writing to %q", d.duration, d.profileCPU)
- }
- if d.trace != "" {
- delay = true
- f, err := os.Create(d.trace)
- if err != nil {
- return Errorf(err.Error())
- }
- defer func() {
- f.Close()
- if err := c.Sandbox.StopTrace(); err != nil {
- Fatalf(err.Error())
- }
- log.Infof("Trace written to %q", d.trace)
- }()
- if err := c.Sandbox.StartTrace(f); err != nil {
- return Errorf(err.Error())
- }
- log.Infof("Tracing started for %v, writing to %q", d.duration, d.trace)
- }
-
if d.strace != "" || len(d.logLevel) != 0 || len(d.logPackets) != 0 {
args := control.LoggingArgs{}
switch strings.ToLower(d.strace) {
@@ -282,8 +215,156 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
log.Infof(o)
}
- if delay {
- time.Sleep(d.duration)
+ // Open profiling files.
+ var (
+ heapFile *os.File
+ cpuFile *os.File
+ traceFile *os.File
+ blockFile *os.File
+ mutexFile *os.File
+ )
+ if d.profileHeap != "" {
+ f, err := os.OpenFile(d.profileHeap, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
+ if err != nil {
+ return Errorf("error opening heap profile output: %v", err)
+ }
+ defer f.Close()
+ heapFile = f
+ }
+ if d.profileCPU != "" {
+ f, err := os.OpenFile(d.profileCPU, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
+ if err != nil {
+ return Errorf("error opening cpu profile output: %v", err)
+ }
+ defer f.Close()
+ cpuFile = f
+ }
+ if d.trace != "" {
+ f, err := os.OpenFile(d.trace, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
+ if err != nil {
+ return Errorf("error opening trace profile output: %v", err)
+ }
+ traceFile = f
+ }
+ if d.profileBlock != "" {
+ f, err := os.OpenFile(d.profileBlock, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
+ if err != nil {
+ return Errorf("error opening blocking profile output: %v", err)
+ }
+ defer f.Close()
+ blockFile = f
+ }
+ if d.profileMutex != "" {
+ f, err := os.OpenFile(d.profileMutex, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
+ if err != nil {
+ return Errorf("error opening mutex profile output: %v", err)
+ }
+ defer f.Close()
+ mutexFile = f
+ }
+
+ // Collect profiles.
+ var (
+ wg sync.WaitGroup
+ heapErr error
+ cpuErr error
+ traceErr error
+ blockErr error
+ mutexErr error
+ )
+ if heapFile != nil {
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ heapErr = c.Sandbox.HeapProfile(heapFile, d.delay)
+ }()
+ }
+ if cpuFile != nil {
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ cpuErr = c.Sandbox.CPUProfile(cpuFile, d.duration)
+ }()
+ }
+ if traceFile != nil {
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ traceErr = c.Sandbox.Trace(traceFile, d.duration)
+ }()
+ }
+ if blockFile != nil {
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ blockErr = c.Sandbox.BlockProfile(blockFile, d.duration)
+ }()
+ }
+ if mutexFile != nil {
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ mutexErr = c.Sandbox.MutexProfile(mutexFile, d.duration)
+ }()
+ }
+
+ // Before sleeping, allow us to catch signals and try to exit
+ // gracefully before just exiting. If we can't wait for wg, then
+ // we will not be able to read the errors below safely.
+ readyChan := make(chan struct{})
+ go func() {
+ defer close(readyChan)
+ wg.Wait()
+ }()
+ signals := make(chan os.Signal, 1)
+ signal.Notify(signals, syscall.SIGTERM, syscall.SIGINT)
+ select {
+ case <-readyChan:
+ break // Safe to proceed.
+ case <-signals:
+ log.Infof("caught signal, waiting at most one more second.")
+ select {
+ case <-signals:
+ log.Infof("caught second signal, exiting immediately.")
+ os.Exit(1) // Not finished.
+ case <-time.After(time.Second):
+ log.Infof("timeout, exiting.")
+ os.Exit(1) // Not finished.
+ case <-readyChan:
+ break // Safe to proceed.
+ }
+ }
+
+ // Collect all errors.
+ errorCount := 0
+ if heapErr != nil {
+ errorCount++
+ log.Infof("error collecting heap profile: %v", heapErr)
+ os.Remove(heapFile.Name())
+ }
+ if cpuErr != nil {
+ errorCount++
+ log.Infof("error collecting cpu profile: %v", cpuErr)
+ os.Remove(cpuFile.Name())
+ }
+ if traceErr != nil {
+ errorCount++
+ log.Infof("error collecting trace profile: %v", traceErr)
+ os.Remove(traceFile.Name())
+ }
+ if blockErr != nil {
+ errorCount++
+ log.Infof("error collecting block profile: %v", blockErr)
+ os.Remove(blockFile.Name())
+ }
+ if mutexErr != nil {
+ errorCount++
+ log.Infof("error collecting mutex profile: %v", mutexErr)
+ os.Remove(mutexFile.Name())
+ }
+
+ if errorCount > 0 {
+ return subcommands.ExitFailure
}
return subcommands.ExitSuccess
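The signal handling above is a reusable pattern: wrap wg.Wait in a goroutine that closes a channel, then select on that channel against incoming signals so a stalled collection cannot block an operator's Ctrl-C. A condensed, self-contained sketch of the same pattern:

package main

import (
	"fmt"
	"os"
	"os/signal"
	"sync"
	"syscall"
	"time"
)

func main() {
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		time.Sleep(2 * time.Second) // stand-in for profile collection
	}()

	// Turn wg.Wait into something selectable.
	done := make(chan struct{})
	go func() {
		defer close(done)
		wg.Wait()
	}()

	sigs := make(chan os.Signal, 1)
	signal.Notify(sigs, syscall.SIGTERM, syscall.SIGINT)
	select {
	case <-done:
		fmt.Println("collection finished cleanly")
	case <-sigs:
		// Interrupted: allow a short grace period, then give up.
		select {
		case <-done:
		case <-time.After(time.Second):
			os.Exit(1)
		}
	}
}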
diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go
index a25637265..a750be131 100644
--- a/runsc/cmd/delete.go
+++ b/runsc/cmd/delete.go
@@ -68,7 +68,7 @@ func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
func (d *Delete) execute(ids []string, conf *config.Config) error {
for _, id := range ids {
- c, err := container.LoadAndCheck(conf.RootDir, id)
+ c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
if err != nil {
if os.IsNotExist(err) && d.force {
log.Warningf("couldn't find container %q: %v", id, err)
diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go
index 640de4c47..8a8d9f752 100644
--- a/runsc/cmd/do.go
+++ b/runsc/cmd/do.go
@@ -81,7 +81,7 @@ func (c *Do) SetFlags(f *flag.FlagSet) {
// Execute implements subcommands.Command.Execute.
func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
if len(f.Args()) == 0 {
- c.Usage()
+ f.Usage()
return subcommands.ExitUsageError
}
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
index 3836b7b4e..75b0aac8d 100644
--- a/runsc/cmd/events.go
+++ b/runsc/cmd/events.go
@@ -74,7 +74,7 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa
id := f.Arg(0)
conf := args[0].(*config.Config)
- c, err := container.LoadAndCheck(conf.RootDir, id)
+ c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
if err != nil {
Fatalf("loading sandbox: %v", err)
}
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index 86c02a22a..e9726401a 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -112,20 +112,20 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
waitStatus := args[1].(*syscall.WaitStatus)
- c, err := container.LoadAndCheck(conf.RootDir, id)
+ c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
if err != nil {
Fatalf("loading sandbox: %v", err)
}
log.Debugf("Exec arguments: %+v", e)
- log.Debugf("Exec capablities: %+v", e.Capabilities)
+ log.Debugf("Exec capabilities: %+v", e.Capabilities)
// Replace empty settings with defaults from container.
if e.WorkingDirectory == "" {
e.WorkingDirectory = c.Spec.Process.Cwd
}
if e.Envv == nil {
- e.Envv, err = resolveEnvs(c.Spec.Process.Env, ex.env)
+ e.Envv, err = specutils.ResolveEnvs(c.Spec.Process.Env, ex.env)
if err != nil {
Fatalf("getting environment variables: %v", err)
}
@@ -150,7 +150,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
- // Start the new process and get it pid.
+ // Start the new process and get its pid.
pid, err := c.Execute(e)
if err != nil {
return Errorf("executing processes for container: %v", err)
@@ -382,31 +382,6 @@ func argsFromProcess(p *specs.Process, enableRaw bool) (*control.ExecArgs, error
}, nil
}
-// resolveEnvs transforms lists of environment variables into a single list of
-// environment variables. If a variable is defined multiple times, the last
-// value is used.
-func resolveEnvs(envs ...[]string) ([]string, error) {
- // First create a map of variable names to values. This removes any
- // duplicates.
- envMap := make(map[string]string)
- for _, env := range envs {
- for _, str := range env {
- parts := strings.SplitN(str, "=", 2)
- if len(parts) != 2 {
- return nil, fmt.Errorf("invalid variable: %s", str)
- }
- envMap[parts[0]] = parts[1]
- }
- }
- // Reassemble envMap into a list of environment variables of the form
- // NAME=VALUE.
- env := make([]string, 0, len(envMap))
- for k, v := range envMap {
- env = append(env, fmt.Sprintf("%s=%s", k, v))
- }
- return env, nil
-}
-
// capabilities takes a list of capabilities as strings and returns an
// auth.TaskCapabilities struct with those capabilities in every capability set.
// This mimics runc's behavior.
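The resolveEnvs helper deleted above now lives in specutils as ResolveEnvs so the loader can apply the same merging (see the loader.go and exec.go hunks). Its contract is last-writer-wins across the input lists; a standalone sketch of those semantics, returning the merged map instead of NAME=VALUE strings for brevity:

package main

import (
	"fmt"
	"strings"
)

// resolveEnvs merges environment lists; later definitions of a variable
// override earlier ones, matching the helper moved to specutils.
func resolveEnvs(envs ...[]string) (map[string]string, error) {
	merged := make(map[string]string)
	for _, env := range envs {
		for _, kv := range env {
			parts := strings.SplitN(kv, "=", 2)
			if len(parts) != 2 {
				return nil, fmt.Errorf("invalid variable: %s", kv)
			}
			merged[parts[0]] = parts[1]
		}
	}
	return merged, nil
}

func main() {
	env, _ := resolveEnvs(
		[]string{"PATH=/bin", "HOME=/root"}, // from the container spec
		[]string{"HOME=/home/user"},         // from the exec request
	)
	fmt.Println(env["HOME"]) // /home/user: the exec override wins
}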
diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go
index fe69e2a08..aecf0b7ab 100644
--- a/runsc/cmd/kill.go
+++ b/runsc/cmd/kill.go
@@ -69,7 +69,7 @@ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
Fatalf("it is invalid to specify both --all and --pid")
}
- c, err := container.LoadAndCheck(conf.RootDir, id)
+ c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
if err != nil {
Fatalf("loading container: %v", err)
}
diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go
index 6907eb16a..9f9a47bd8 100644
--- a/runsc/cmd/list.go
+++ b/runsc/cmd/list.go
@@ -24,6 +24,7 @@ import (
"github.com/google/subcommands"
specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/container"
"gvisor.dev/gvisor/runsc/flag"
@@ -71,7 +72,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
if l.quiet {
for _, id := range ids {
- fmt.Println(id)
+ fmt.Println(id.ContainerID)
}
return subcommands.ExitSuccess
}
@@ -79,9 +80,10 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
// Collect the containers.
var containers []*container.Container
for _, id := range ids {
- c, err := container.LoadAndCheck(conf.RootDir, id)
+ c, err := container.Load(conf.RootDir, id, container.LoadOpts{Exact: true})
if err != nil {
- Fatalf("loading container %q: %v", id, err)
+ log.Warningf("Skipping container %q: %v", id, err)
+ continue
}
containers = append(containers, c)
}
diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go
index fe7d4e257..15ef7b577 100644
--- a/runsc/cmd/pause.go
+++ b/runsc/cmd/pause.go
@@ -55,7 +55,7 @@ func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
id := f.Arg(0)
conf := args[0].(*config.Config)
- cont, err := container.LoadAndCheck(conf.RootDir, id)
+ cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
if err != nil {
Fatalf("loading container: %v", err)
}
diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go
index 18d7a1436..04e3e0bdd 100644
--- a/runsc/cmd/ps.go
+++ b/runsc/cmd/ps.go
@@ -60,7 +60,7 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{})
id := f.Arg(0)
conf := args[0].(*config.Config)
- c, err := container.LoadAndCheck(conf.RootDir, id)
+ c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
if err != nil {
Fatalf("loading sandbox: %v", err)
}
diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go
index a00928204..856469252 100644
--- a/runsc/cmd/resume.go
+++ b/runsc/cmd/resume.go
@@ -56,7 +56,7 @@ func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
id := f.Arg(0)
conf := args[0].(*config.Config)
- cont, err := container.LoadAndCheck(conf.RootDir, id)
+ cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
if err != nil {
Fatalf("loading container: %v", err)
}
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
index f6499cc44..964a65064 100644
--- a/runsc/cmd/start.go
+++ b/runsc/cmd/start.go
@@ -55,7 +55,7 @@ func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
id := f.Arg(0)
conf := args[0].(*config.Config)
- c, err := container.LoadAndCheck(conf.RootDir, id)
+ c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
if err != nil {
Fatalf("loading container: %v", err)
}
diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go
index d8a70dd7f..1f7913d5a 100644
--- a/runsc/cmd/state.go
+++ b/runsc/cmd/state.go
@@ -57,7 +57,7 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
id := f.Arg(0)
conf := args[0].(*config.Config)
- c, err := container.LoadAndCheck(conf.RootDir, id)
+ c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
if err != nil {
Fatalf("loading container: %v", err)
}
diff --git a/runsc/cmd/symbolize.go b/runsc/cmd/symbolize.go
new file mode 100644
index 000000000..fc0c69358
--- /dev/null
+++ b/runsc/cmd/symbolize.go
@@ -0,0 +1,91 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "bufio"
+ "context"
+ "os"
+ "strconv"
+ "strings"
+
+ "github.com/google/subcommands"
+ "gvisor.dev/gvisor/pkg/coverage"
+ "gvisor.dev/gvisor/runsc/flag"
+)
+
+// Symbolize implements subcommands.Command for the "symbolize" command.
+type Symbolize struct {
+ dumpAll bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Symbolize) Name() string {
+ return "symbolize"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Symbolize) Synopsis() string {
+ return "Convert synthetic instruction pointers from kcov into positions in the runsc source code. Only used when Go coverage is enabled."
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Symbolize) Usage() string {
+ return `symbolize - converts synthetic instruction pointers into positions in the runsc source code.
+
+This command takes instruction pointers from stdin and converts them into their
+corresponding file names and line/column numbers in the runsc source code. The
+inputs are not interpreted as actual addresses, but as synthetic values that are
+exposed through /sys/kernel/debug/kcov. One can extract coverage information
+from kcov and translate those values into locations in the source code by
+running symbolize on the same runsc binary.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (c *Symbolize) SetFlags(f *flag.FlagSet) {
+ f.BoolVar(&c.dumpAll, "all", false, "dump information on all coverage blocks along with their synthetic PCs")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (c *Symbolize) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ if f.NArg() != 0 {
+ f.Usage()
+ return subcommands.ExitUsageError
+ }
+ if !coverage.KcovAvailable() {
+ return Errorf("symbolize can only be used when coverage is available.")
+ }
+ coverage.InitCoverageData()
+
+ if c.dumpAll {
+ coverage.WriteAllBlocks(os.Stdout)
+ return subcommands.ExitSuccess
+ }
+
+ scanner := bufio.NewScanner(os.Stdin)
+ for scanner.Scan() {
+ // Input is always base 16, but may or may not have a leading "0x".
+ str := strings.TrimPrefix(scanner.Text(), "0x")
+ pc, err := strconv.ParseUint(str, 16 /* base */, 64 /* bitSize */)
+ if err != nil {
+ return Errorf("Failed to symbolize \"%s\": %v", scanner.Text(), err)
+ }
+ if err := coverage.Symbolize(os.Stdout, pc); err != nil {
+ return Errorf("Failed to symbolize \"%s\": %v", scanner.Text(), err)
+ }
+ }
+ return subcommands.ExitSuccess
+}
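In practice this reads one hex PC per line from stdin, so values captured from /sys/kernel/debug/kcov can be piped straight back through the same runsc binary, e.g. "runsc symbolize < kcov-pcs.txt" (the file name is illustrative); "runsc symbolize -all" instead dumps every known coverage block with its synthetic PC.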
diff --git a/runsc/cmd/syscalls.go b/runsc/cmd/syscalls.go
index a37d66139..a8c83d662 100644
--- a/runsc/cmd/syscalls.go
+++ b/runsc/cmd/syscalls.go
@@ -147,7 +147,7 @@ func getCompatibilityInfo(osName string, archName string) (CompatibilityInfo, er
info := CompatibilityInfo(make(map[string]map[string]ArchInfo))
if osName == osAll {
// Special processing for the 'all' OS name.
- for osName, _ := range syscallTableMap {
+ for osName := range syscallTableMap {
info[osName] = make(map[string]ArchInfo)
// osName is a specific OS name.
if err := addToCompatibilityInfo(info, osName, archName); err != nil {
@@ -171,7 +171,7 @@ func getCompatibilityInfo(osName string, archName string) (CompatibilityInfo, er
func addToCompatibilityInfo(info CompatibilityInfo, osName string, archName string) error {
if archName == archAll {
// Special processing for the 'all' architecture name.
- for archName, _ := range syscallTableMap[osName] {
+ for archName := range syscallTableMap[osName] {
archInfo, err := getArchInfo(osName, archName)
if err != nil {
return err
diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go
index c1d6aeae2..5d55422c7 100644
--- a/runsc/cmd/wait.go
+++ b/runsc/cmd/wait.go
@@ -72,7 +72,7 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
id := f.Arg(0)
conf := args[0].(*config.Config)
- c, err := container.LoadAndCheck(conf.RootDir, id)
+ c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
if err != nil {
Fatalf("loading container: %v", err)
}
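
This is the migration pattern for all LoadAndCheck callers: a raw (possibly abbreviated) ID becomes a FullID, and the running-state check becomes an option. A sketch of the two common call shapes, using only the API introduced in this change:

    import "gvisor.dev/gvisor/runsc/container"

    // loadByPrefix resolves a possibly-abbreviated container ID with the
    // default running-state check, as the CLI commands above now do.
    func loadByPrefix(rootDir, partialID string) (*container.Container, error) {
        return container.Load(rootDir, container.FullID{ContainerID: partialID}, container.LoadOpts{})
    }

    // loadExact loads a known sandbox/container pair and skips the check,
    // which is useful when scanning many state files at once.
    func loadExact(rootDir string, id container.FullID) (*container.Container, error) {
        return container.Load(rootDir, id, container.LoadOpts{Exact: true, SkipCheck: true})
    }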
diff --git a/runsc/config/config.go b/runsc/config/config.go
index b02d8e2e1..e9fd7708f 100644
--- a/runsc/config/config.go
+++ b/runsc/config/config.go
@@ -131,7 +131,7 @@ type Config struct {
NumNetworkChannels int `flag:"num-network-channels"`
// Rootless allows the sandbox to be started with a user that is not root.
- // Defense is depth measures are weaker with rootless. Specifically, the
+ // Defense in depth measures are weaker in rootless mode. Specifically, the
// sandbox and Gofer process run as root inside a user namespace with root
// mapped to the caller's user.
Rootless bool `flag:"rootless"`
diff --git a/runsc/config/flags.go b/runsc/config/flags.go
index 13d8f1b25..02ab9255a 100644
--- a/runsc/config/flags.go
+++ b/runsc/config/flags.go
@@ -71,7 +71,7 @@ func RegisterFlags() {
flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem")
flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.")
- flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.")
+ flag.Bool("vfs2", false, "enables VFSv2. This uses the new VFS layer that is faster than the previous one.")
flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.")
// Flags that control sandbox runtime behavior: network related.
diff --git a/runsc/console/console.go b/runsc/console/console.go
index dbb88e117..b36028792 100644
--- a/runsc/console/console.go
+++ b/runsc/console/console.go
@@ -24,8 +24,8 @@ import (
"golang.org/x/sys/unix"
)
-// NewWithSocket creates pty master/replica pair, sends the master FD over the given
-// socket, and returns the replica.
+// NewWithSocket creates pty master/replica pair, sends the master FD over the
+// given socket, and returns the replica.
func NewWithSocket(socketPath string) (*os.File, error) {
// Create a new pty master and replica.
ptyMaster, ptyReplica, err := pty.Open()
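
The receiving side of NewWithSocket is a plain SCM_RIGHTS transfer, so any caller that passes --console-socket can recover the master FD along these lines (a sketch with simplified error handling; the listener setup is assumed):

    import (
        "fmt"
        "net"
        "os"

        "golang.org/x/sys/unix"
    )

    // receiveMaster accepts one connection on the console socket and
    // extracts the pty master FD sent via SCM_RIGHTS.
    func receiveMaster(l *net.UnixListener) (*os.File, error) {
        conn, err := l.AcceptUnix()
        if err != nil {
            return nil, err
        }
        defer conn.Close()

        buf := make([]byte, 1)                 // at least one data byte
        oob := make([]byte, unix.CmsgSpace(4)) // room for exactly one FD
        _, oobn, _, _, err := conn.ReadMsgUnix(buf, oob)
        if err != nil {
            return nil, err
        }
        msgs, err := unix.ParseSocketControlMessage(oob[:oobn])
        if err != nil || len(msgs) != 1 {
            return nil, fmt.Errorf("expected 1 control message, got %d: %v", len(msgs), err)
        }
        fds, err := unix.ParseUnixRights(&msgs[0])
        if err != nil || len(fds) != 1 {
            return nil, fmt.Errorf("expected 1 FD, got %d: %v", len(fds), err)
        }
        return os.NewFile(uintptr(fds[0]), "console-master"), nil
    }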
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index c33755482..8793c8916 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test", "more_shards")
package(licenses = ["notice"])
@@ -24,6 +24,7 @@ go_library(
"//runsc/boot",
"//runsc/cgroup",
"//runsc/config",
+ "//runsc/console",
"//runsc/sandbox",
"//runsc/specutils",
"@com_github_cenkalti_backoff//:go_default_library",
@@ -48,7 +49,7 @@ go_test(
"//test/cmd/test_app",
],
library = ":container",
- shard_count = 10,
+ shard_count = more_shards,
tags = [
"requires-kvm",
],
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 4228399b8..1b0fdebd6 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -18,6 +18,7 @@ import (
"bytes"
"fmt"
"io"
+ "math/rand"
"os"
"path/filepath"
"syscall"
@@ -27,7 +28,6 @@ import (
"github.com/kr/pty"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/sentry/control"
- "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/test/testutil"
"gvisor.dev/gvisor/pkg/unet"
@@ -38,19 +38,22 @@ import (
// path is under 108 characters (the unix socket path length limit),
// relativizing the path if necessary.
func socketPath(bundleDir string) (string, error) {
- path := filepath.Join(bundleDir, "socket")
+ num := rand.Intn(10000)
+ path := filepath.Join(bundleDir, fmt.Sprintf("socket-%04d", num))
+ const maxPathLen = 108
+ if len(path) <= maxPathLen {
+ return path, nil
+ }
+
+ // Path is too large, try to make it smaller.
cwd, err := os.Getwd()
if err != nil {
return "", fmt.Errorf("error getting cwd: %v", err)
}
- relPath, err := filepath.Rel(cwd, path)
+ path, err = filepath.Rel(cwd, path)
if err != nil {
return "", fmt.Errorf("error getting relative path for %q from cwd %q: %v", path, cwd, err)
}
- if len(path) > len(relPath) {
- path = relPath
- }
- const maxPathLen = 108
if len(path) > maxPathLen {
return "", fmt.Errorf("could not get socket path under length limit %d: %s", maxPathLen, path)
}
@@ -159,6 +162,82 @@ func TestConsoleSocket(t *testing.T) {
}
}
+// Test that a pty FD is sent over the console socket if one is provided.
+func TestMultiContainerConsoleSocket(t *testing.T) {
+ for name, conf := range configsWithVFS2(t, all...) {
+ t.Run(name, func(t *testing.T) {
+ rootDir, cleanup, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer cleanup()
+ conf.RootDir = rootDir
+
+ // Setup the containers.
+ sleep := []string{"sleep", "100"}
+ tru := []string{"true"}
+ testSpecs, ids := createSpecs(sleep, tru)
+ testSpecs[1].Process.Terminal = true
+
+ bundleDir, cleanup, err := testutil.SetupBundleDir(testSpecs[0])
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer cleanup()
+
+ args := Args{
+ ID: ids[0],
+ Spec: testSpecs[0],
+ BundleDir: bundleDir,
+ }
+ rootCont, err := New(conf, args)
+ if err != nil {
+ t.Fatalf("error creating container: %v", err)
+ }
+ defer rootCont.Destroy()
+ if err := rootCont.Start(conf); err != nil {
+ t.Fatalf("error starting container: %v", err)
+ }
+
+ bundleDir, cleanup, err = testutil.SetupBundleDir(testSpecs[0])
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer cleanup()
+
+ sock, err := socketPath(bundleDir)
+ if err != nil {
+ t.Fatalf("error getting socket path: %v", err)
+ }
+ srv, cleanup := createConsoleSocket(t, sock)
+ defer cleanup()
+
+ // Create the container and pass the socket name.
+ args = Args{
+ ID: ids[1],
+ Spec: testSpecs[1],
+ BundleDir: bundleDir,
+ ConsoleSocket: sock,
+ }
+ cont, err := New(conf, args)
+ if err != nil {
+ t.Fatalf("error creating container: %v", err)
+ }
+ defer cont.Destroy()
+ if err := cont.Start(conf); err != nil {
+ t.Fatalf("error starting container: %v", err)
+ }
+
+ // Make sure we get a console PTY.
+ ptyMaster, err := receiveConsolePTY(srv)
+ if err != nil {
+ t.Fatalf("error receiving console FD: %v", err)
+ }
+ ptyMaster.Close()
+ })
+ }
+}
+
// Test that job control signals work on a console created with "exec -ti".
func TestJobControlSignalExec(t *testing.T) {
spec := testutil.NewSpecWithArgs("/bin/sleep", "10000")
@@ -221,9 +300,9 @@ func TestJobControlSignalExec(t *testing.T) {
// Make sure all the processes are running.
expectedPL := []*control.Process{
// Root container process.
- {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
+ newProcessBuilder().Cmd("sleep").Process(),
// Bash from exec process.
- {PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}},
+ newProcessBuilder().PID(2).Cmd("bash").Process(),
}
if err := waitForProcessList(c, expectedPL); err != nil {
t.Error(err)
@@ -233,7 +312,7 @@ func TestJobControlSignalExec(t *testing.T) {
ptyMaster.Write([]byte("sleep 100\n"))
// Wait for it to start. Sleep's PPID is bash's PID.
- expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}})
+ expectedPL = append(expectedPL, newProcessBuilder().PID(3).PPID(2).Cmd("sleep").Process())
if err := waitForProcessList(c, expectedPL); err != nil {
t.Error(err)
}
@@ -254,7 +333,7 @@ func TestJobControlSignalExec(t *testing.T) {
// Sleep is dead, but it may take more time for bash to notice and
// change the foreground process back to itself. We know it is done
// when bash writes "Terminated" to the pty.
- if err := testutil.WaitUntilRead(ptyMaster, "Terminated", nil, 5*time.Second); err != nil {
+ if err := testutil.WaitUntilRead(ptyMaster, "Terminated", 5*time.Second); err != nil {
t.Fatalf("bash did not take over pty: %v", err)
}
@@ -359,7 +438,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
// Wait for bash to start.
expectedPL := []*control.Process{
- {PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}},
+ newProcessBuilder().PID(1).Cmd("bash").Process(),
}
if err := waitForProcessList(c, expectedPL); err != nil {
t.Fatalf("error waiting for processes: %v", err)
@@ -369,7 +448,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
ptyMaster.Write([]byte("sleep 100\n"))
// Wait for sleep to start.
- expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}})
+ expectedPL = append(expectedPL, newProcessBuilder().PID(2).PPID(1).Cmd("sleep").Process())
if err := waitForProcessList(c, expectedPL); err != nil {
t.Fatalf("error waiting for processes: %v", err)
}
@@ -393,7 +472,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
// Sleep is dead, but it may take more time for bash to notice and
// change the foreground process back to itself. We know it is done
// when bash writes "Terminated" to the pty.
- if err := testutil.WaitUntilRead(ptyBuf, "Terminated", nil, 5*time.Second); err != nil {
+ if err := testutil.WaitUntilRead(ptyBuf, "Terminated", 5*time.Second); err != nil {
t.Fatalf("bash did not take over pty: %v", err)
}
@@ -414,6 +493,104 @@ func TestJobControlSignalRootContainer(t *testing.T) {
}
}
+// Test that terminal works with root and sub-containers.
+func TestMultiContainerTerminal(t *testing.T) {
+ for name, conf := range configsWithVFS2(t, all...) {
+ t.Run(name, func(t *testing.T) {
+ rootDir, cleanup, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer cleanup()
+ conf.RootDir = rootDir
+
+ // Don't let bash execute from profile or rc files, otherwise our PID
+ // counts get messed up.
+ bash := []string{"/bin/bash", "--noprofile", "--norc"}
+ testSpecs, ids := createSpecs(bash, bash)
+
+ type termContainer struct {
+ container *Container
+ master *os.File
+ }
+ var containers []termContainer
+ for i, spec := range testSpecs {
+ bundleDir, cleanup, err := testutil.SetupBundleDir(spec)
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer cleanup()
+
+ spec.Process.Terminal = true
+ sock, err := socketPath(bundleDir)
+ if err != nil {
+ t.Fatalf("error getting socket path: %v", err)
+ }
+ srv, cleanup := createConsoleSocket(t, sock)
+ defer cleanup()
+
+ // Create the container and pass the socket name.
+ args := Args{
+ ID: ids[i],
+ Spec: spec,
+ BundleDir: bundleDir,
+ ConsoleSocket: sock,
+ }
+ cont, err := New(conf, args)
+ if err != nil {
+ t.Fatalf("error creating container: %v", err)
+ }
+ defer cont.Destroy()
+
+ if err := cont.Start(conf); err != nil {
+ t.Fatalf("error starting container: %v", err)
+ }
+
+ // Make sure we get a console PTY.
+ ptyMaster, err := receiveConsolePTY(srv)
+ if err != nil {
+ t.Fatalf("error receiving console FD: %v", err)
+ }
+ defer ptyMaster.Close()
+
+ containers = append(containers, termContainer{
+ container: cont,
+ master: ptyMaster,
+ })
+ }
+
+ for _, tc := range containers {
+ // Bash output as well as sandbox output will be written to the PTY
+ // file. Writes after a certain point will block unless we drain the
+ // PTY, so we must continually copy from it.
+ //
+ // We log the output to stderr for debuggability, and also to a buffer,
+ // since we wait on particular output from bash below. We use a custom
+ // blockingBuffer which is thread-safe and also blocks on Read calls,
+ // which makes this a suitable Reader for WaitUntilRead.
+ ptyBuf := newBlockingBuffer()
+ tee := io.TeeReader(tc.master, ptyBuf)
+ go io.Copy(os.Stderr, tee)
+
+ // Wait for bash to start.
+ expectedPL := []*control.Process{
+ newProcessBuilder().Cmd("bash").Process(),
+ }
+ if err := waitForProcessList(tc.container, expectedPL); err != nil {
+ t.Fatalf("error waiting for processes: %v", err)
+ }
+
+ // Execute an echo command and check that it ran correctly. Use a
+ // variable so the match is against echo's output rather than the
+ // echoed command line itself.
+ tc.master.Write([]byte("echo foo-${PWD}-123\n"))
+ if err := testutil.WaitUntilRead(ptyBuf, "foo-/-123", 5*time.Second); err != nil {
+ t.Fatalf("echo didn't execute: %v", err)
+ }
+ }
+ })
+ }
+}
+
// blockingBuffer is a thread-safe buffer that blocks when reading if the
// buffer is empty. It implements io.ReadWriter.
type blockingBuffer struct {
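
For reference, the pattern behind the (truncated) blockingBuffer type is a mutex-guarded bytes.Buffer whose Read waits on a condition variable. A self-contained sketch, assuming "bytes" and "sync" imports; the real implementation lives in console_test.go and may differ in details such as close handling:

    type blockingBufferSketch struct {
        mu   sync.Mutex
        cond *sync.Cond
        buf  bytes.Buffer
    }

    func newBlockingBufferSketch() *blockingBufferSketch {
        b := &blockingBufferSketch{}
        b.cond = sync.NewCond(&b.mu)
        return b
    }

    func (b *blockingBufferSketch) Write(p []byte) (int, error) {
        b.mu.Lock()
        defer b.mu.Unlock()
        n, err := b.buf.Write(p)
        b.cond.Broadcast() // wake any reader blocked on an empty buffer
        return n, err
    }

    func (b *blockingBufferSketch) Read(p []byte) (int, error) {
        b.mu.Lock()
        defer b.mu.Unlock()
        for b.buf.Len() == 0 {
            b.cond.Wait() // block until a writer supplies data
        }
        return b.buf.Read(p)
    }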
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 4aa139c88..5a0f8d5dc 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -38,6 +38,7 @@ import (
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/cgroup"
"gvisor.dev/gvisor/runsc/config"
+ "gvisor.dev/gvisor/runsc/console"
"gvisor.dev/gvisor/runsc/sandbox"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -79,6 +80,7 @@ func validateID(id string) error {
// - It calls 'runsc delete'. runc implementation kills --all SIGKILL once
// again just to be sure, waits, and then proceeds with remaining teardown.
//
+// Container is not thread-safe.
type Container struct {
// ID is the container ID.
ID string `json:"id"`
@@ -126,125 +128,6 @@ type Container struct {
goferIsChild bool
}
-// loadSandbox loads all containers that belong to the sandbox with the given
-// ID.
-func loadSandbox(rootDir, id string) ([]*Container, error) {
- cids, err := List(rootDir)
- if err != nil {
- return nil, err
- }
-
- // Load the container metadata.
- var containers []*Container
- for _, cid := range cids {
- container, err := Load(rootDir, cid)
- if err != nil {
- // Container file may not exist if it raced with creation/deletion or
- // directory was left behind. Load provides a snapshot in time, so it's
- // fine to skip it.
- if os.IsNotExist(err) {
- continue
- }
- return nil, fmt.Errorf("loading container %q: %v", id, err)
- }
- if container.Sandbox.ID == id {
- containers = append(containers, container)
- }
- }
- return containers, nil
-}
-
-// Load loads a container with the given id from a metadata file. partialID may
-// be an abbreviation of the full container id, in which case Load loads the
-// container to which id unambiguously refers to. Returns ErrNotExist if
-// container doesn't exist.
-func Load(rootDir, partialID string) (*Container, error) {
- log.Debugf("Load container, rootDir: %q, partial cid: %s", rootDir, partialID)
- if err := validateID(partialID); err != nil {
- return nil, fmt.Errorf("invalid container id: %v", err)
- }
-
- id, err := findContainerID(rootDir, partialID)
- if err != nil {
- // Preserve error so that callers can distinguish 'not found' errors.
- return nil, err
- }
-
- state := StateFile{
- RootDir: rootDir,
- ID: id,
- }
- defer state.close()
-
- c := &Container{}
- if err := state.load(c); err != nil {
- if os.IsNotExist(err) {
- // Preserve error so that callers can distinguish 'not found' errors.
- return nil, err
- }
- return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err)
- }
- return c, nil
-}
-
-// LoadAndCheck is similar to Load(), but also checks if the container is still
-// running to get an error earlier to the caller.
-func LoadAndCheck(rootDir, partialID string) (*Container, error) {
- c, err := Load(rootDir, partialID)
- if err != nil {
- // Preserve error so that callers can distinguish 'not found' errors.
- return nil, err
- }
-
- // If the status is "Running" or "Created", check that the sandbox/container
- // is still running, setting it to Stopped if not.
- //
- // This is inherently racy.
- switch c.Status {
- case Created:
- if !c.isSandboxRunning() {
- // Sandbox no longer exists, so this container definitely does not exist.
- c.changeStatus(Stopped)
- }
- case Running:
- if err := c.SignalContainer(syscall.Signal(0), false); err != nil {
- c.changeStatus(Stopped)
- }
- }
-
- return c, nil
-}
-
-func findContainerID(rootDir, partialID string) (string, error) {
- // Check whether the id fully specifies an existing container.
- stateFile := buildStatePath(rootDir, partialID)
- if _, err := os.Stat(stateFile); err == nil {
- return partialID, nil
- }
-
- // Now see whether id could be an abbreviation of exactly 1 of the
- // container ids. If id is ambiguous (it could match more than 1
- // container), it is an error.
- ids, err := List(rootDir)
- if err != nil {
- return "", err
- }
- rv := ""
- for _, id := range ids {
- if strings.HasPrefix(id, partialID) {
- if rv != "" {
- return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, rv, id)
- }
- rv = id
- }
- }
- if rv == "" {
- return "", os.ErrNotExist
- }
- log.Debugf("abbreviated id %q resolves to full id %q", partialID, rv)
- return rv, nil
-}
-
// Args is used to configure a new container.
type Args struct {
// ID is the container unique identifier.
@@ -289,6 +172,15 @@ func New(conf *config.Config, args Args) (*Container, error) {
return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err)
}
+ sandboxID := args.ID
+ if !isRoot(args.Spec) {
+ var ok bool
+ sandboxID, ok = specutils.SandboxID(args.Spec)
+ if !ok {
+ return nil, fmt.Errorf("no sandbox ID found when creating container")
+ }
+ }
+
c := &Container{
ID: args.ID,
Spec: args.Spec,
@@ -299,7 +191,10 @@ func New(conf *config.Config, args Args) (*Container, error) {
Owner: os.Getenv("USER"),
Saver: StateFile{
RootDir: conf.RootDir,
- ID: args.ID,
+ ID: FullID{
+ SandboxID: sandboxID,
+ ContainerID: args.ID,
+ },
},
}
// The Cleanup object cleans up partially created containers when an error
@@ -314,10 +209,17 @@ func New(conf *config.Config, args Args) (*Container, error) {
}
defer c.Saver.unlock()
- // If the metadata annotations indicate that this container should be
- // started in an existing sandbox, we must do so. The metadata will
- // indicate the ID of the sandbox, which is the same as the ID of the
- // init container in the sandbox.
+ // If the metadata annotations indicate that this container should be started
+ // in an existing sandbox, we must do so. These are the possible metadata
+ // annotation states:
+ // 1. No annotations: it means that there is a single container and this
+ // container is obviously the root. Both container and sandbox share the
+ // ID.
+ // 2. Container type == sandbox: it means this is the root container
+ // starting the sandbox. Both container and sandbox share the same ID.
+ // 3. Container type == container: it means this is a subcontainer of an
+ // already started sandbox. In this case, container ID is different than
+ // the sandbox ID.
if isRoot(args.Spec) {
log.Debugf("Creating new sandbox for container, cid: %s", args.ID)
@@ -356,7 +258,7 @@ func New(conf *config.Config, args Args) (*Container, error) {
// Start a new sandbox for this container. Any errors after this point
// must destroy the container.
sandArgs := &sandbox.Args{
- ID: args.ID,
+ ID: sandboxID,
Spec: args.Spec,
BundleDir: args.BundleDir,
ConsoleSocket: args.ConsoleSocket,
@@ -377,27 +279,34 @@ func New(conf *config.Config, args Args) (*Container, error) {
return nil, err
}
} else {
- // This is sort of confusing. For a sandbox with a root
- // container and a child container in it, runsc sees:
- // * A container struct whose sandbox ID is equal to the
- // container ID. This is the root container that is tied to
- // the creation of the sandbox.
- // * A container struct whose sandbox ID is equal to the above
- // container/sandbox ID, but that has a different container
- // ID. This is the child container.
- sbid, ok := specutils.SandboxID(args.Spec)
- if !ok {
- return nil, fmt.Errorf("no sandbox ID found when creating container")
- }
- log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sbid)
+ log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sandboxID)
// Find the sandbox associated with this ID.
- sb, err := LoadAndCheck(conf.RootDir, sbid)
+ fullID := FullID{
+ SandboxID: sandboxID,
+ ContainerID: sandboxID,
+ }
+ sb, err := Load(conf.RootDir, fullID, LoadOpts{Exact: true})
if err != nil {
return nil, err
}
c.Sandbox = sb.Sandbox
- if err := c.Sandbox.CreateContainer(c.ID); err != nil {
+
+ // If the console control socket file is provided, then create a new
+ // pty master/replica pair and send the TTY to the sandbox process.
+ var tty *os.File
+ if c.ConsoleSocket != "" {
+ // Create a new TTY pair and send the master on the provided socket.
+ var err error
+ tty, err = console.NewWithSocket(c.ConsoleSocket)
+ if err != nil {
+ return nil, fmt.Errorf("setting up console with socket %q: %w", c.ConsoleSocket, err)
+ }
+ // The tty file has been transferred to the sandbox, so it can be closed here.
+ defer tty.Close()
+ }
+
+ if err := c.Sandbox.CreateContainer(c.ID, tty); err != nil {
return nil, err
}
}
@@ -451,11 +360,16 @@ func (c *Container) Start(conf *config.Config) error {
// the start (and all their children processes).
if err := runInCgroup(c.Sandbox.Cgroup, func() error {
// Create the gofer process.
- ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false)
+ goferFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false)
if err != nil {
return err
}
- defer mountsFile.Close()
+ defer func() {
+ _ = mountsFile.Close()
+ for _, f := range goferFiles {
+ _ = f.Close()
+ }
+ }()
cleanMounts, err := specutils.ReadMounts(mountsFile)
if err != nil {
@@ -463,7 +377,14 @@ func (c *Container) Start(conf *config.Config) error {
}
c.Spec.Mounts = cleanMounts
- return c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles)
+ // Set up stdios if the container is not using a terminal. Otherwise, the
+ // TTY was already set up during create.
+ var stdios []*os.File
+ if !c.Spec.Process.Terminal {
+ stdios = []*os.File{os.Stdin, os.Stdout, os.Stderr}
+ }
+
+ return c.Sandbox.StartContainer(c.Spec, conf, c.ID, stdios, goferFiles)
}); err != nil {
return err
}
@@ -599,7 +520,7 @@ func (c *Container) Wait() (syscall.WaitStatus, error) {
// returns its WaitStatus.
func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) {
log.Debugf("Wait on process %d in sandbox, cid: %s", pid, c.Sandbox.ID)
- if !c.isSandboxRunning() {
+ if !c.IsSandboxRunning() {
return 0, fmt.Errorf("sandbox is not running")
}
return c.Sandbox.WaitPID(c.Sandbox.ID, pid)
@@ -609,7 +530,7 @@ func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) {
// its WaitStatus.
func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) {
log.Debugf("Wait on process %d in container, cid: %s", pid, c.ID)
- if !c.isSandboxRunning() {
+ if !c.IsSandboxRunning() {
return 0, fmt.Errorf("sandbox is not running")
}
return c.Sandbox.WaitPID(c.ID, pid)
@@ -629,7 +550,7 @@ func (c *Container) SignalContainer(sig syscall.Signal, all bool) error {
if err := c.requireStatus("signal", Running, Stopped); err != nil {
return err
}
- if !c.isSandboxRunning() {
+ if !c.IsSandboxRunning() {
return fmt.Errorf("sandbox is not running")
}
return c.Sandbox.SignalContainer(c.ID, sig, all)
@@ -641,7 +562,7 @@ func (c *Container) SignalProcess(sig syscall.Signal, pid int32) error {
if err := c.requireStatus("signal a process inside", Running); err != nil {
return err
}
- if !c.isSandboxRunning() {
+ if !c.IsSandboxRunning() {
return fmt.Errorf("sandbox is not running")
}
return c.Sandbox.SignalProcess(c.ID, int32(pid), sig, false)
@@ -860,7 +781,7 @@ func (c *Container) waitForStopped() error {
defer cancel()
b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
op := func() error {
- if c.isSandboxRunning() {
+ if c.IsSandboxRunning() {
if err := c.SignalContainer(syscall.Signal(0), false); err == nil {
return fmt.Errorf("container is still running")
}
@@ -1062,7 +983,8 @@ func (c *Container) changeStatus(s Status) {
c.Status = s
}
-func (c *Container) isSandboxRunning() bool {
+// IsSandboxRunning returns true if the sandbox exists and is running.
+func (c *Container) IsSandboxRunning() bool {
return c.Sandbox != nil && c.Sandbox.IsRunning()
}
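
The three annotation states documented in New() reduce to a small dispatch. A hedged sketch of how the sandbox ID could be resolved; the annotation keys shown are the containerd CRI ones and are an assumption of this sketch, not something this diff defines (the real logic lives in specutils.SandboxID):

    import (
        "fmt"

        specs "github.com/opencontainers/runtime-spec/specs-go"
    )

    // Assumed containerd CRI annotation keys (hypothetical in this sketch).
    const (
        annContainerType = "io.kubernetes.cri.container-type"
        annSandboxID     = "io.kubernetes.cri.sandbox-id"
    )

    // resolveSandboxID mirrors the three states: no annotations (single
    // container), type "sandbox" (root container), and type "container"
    // (subcontainer of an existing sandbox).
    func resolveSandboxID(cid string, spec *specs.Spec) (string, error) {
        switch spec.Annotations[annContainerType] {
        case "", "sandbox":
            // States 1 and 2: container and sandbox share the same ID.
            return cid, nil
        case "container":
            // State 3: the sandbox ID comes from the annotations.
            sbid, ok := spec.Annotations[annSandboxID]
            if !ok {
                return "", fmt.Errorf("no sandbox ID found when creating container")
            }
            return sbid, nil
        default:
            return "", fmt.Errorf("unknown container type %q", spec.Annotations[annContainerType])
        }
    }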
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index fa99e403a..a92ae046d 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -364,7 +364,7 @@ func TestLifecycle(t *testing.T) {
defer c.Destroy()
// Load the container from disk and check the status.
- c, err = LoadAndCheck(rootDir, args.ID)
+ c, err = Load(rootDir, FullID{ContainerID: args.ID}, LoadOpts{})
if err != nil {
t.Fatalf("error loading container: %v", err)
}
@@ -377,7 +377,11 @@ func TestLifecycle(t *testing.T) {
if err != nil {
t.Fatalf("error listing containers: %v", err)
}
- if got, want := ids, []string{args.ID}; !reflect.DeepEqual(got, want) {
+ fullID := FullID{
+ SandboxID: args.ID,
+ ContainerID: args.ID,
+ }
+ if got, want := ids, []FullID{fullID}; !reflect.DeepEqual(got, want) {
t.Errorf("container list got %v, want %v", got, want)
}
@@ -387,7 +391,7 @@ func TestLifecycle(t *testing.T) {
}
// Load the container from disk and check the status.
- c, err = LoadAndCheck(rootDir, args.ID)
+ c, err = Load(rootDir, fullID, LoadOpts{Exact: true})
if err != nil {
t.Fatalf("error loading container: %v", err)
}
@@ -428,7 +432,7 @@ func TestLifecycle(t *testing.T) {
}
// Load the container from disk and check the status.
- c, err = LoadAndCheck(rootDir, args.ID)
+ c, err = Load(rootDir, fullID, LoadOpts{Exact: true})
if err != nil {
t.Fatalf("error loading container: %v", err)
}
@@ -451,7 +455,7 @@ func TestLifecycle(t *testing.T) {
}
// Loading the container by id should fail.
- if _, err = LoadAndCheck(rootDir, args.ID); err == nil {
+ if _, err = Load(rootDir, fullID, LoadOpts{Exact: true}); err == nil {
t.Errorf("expected loading destroyed container to fail, but it did not")
}
})
@@ -1738,7 +1742,7 @@ func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) {
cids[2]: cids[2],
}
for shortid, longid := range unambiguous {
- if _, err := LoadAndCheck(rootDir, shortid); err != nil {
+ if _, err := Load(rootDir, FullID{ContainerID: shortid}, LoadOpts{}); err != nil {
t.Errorf("%q should resolve to %q: %v", shortid, longid, err)
}
}
@@ -1749,7 +1753,7 @@ func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) {
"ba",
}
for _, shortid := range ambiguous {
- if s, err := LoadAndCheck(rootDir, shortid); err == nil {
+ if s, err := Load(rootDir, FullID{ContainerID: shortid}, LoadOpts{}); err == nil {
t.Errorf("%q should be ambiguous, but resolved to %q", shortid, s.ID)
}
}
@@ -2007,7 +2011,7 @@ func doDestroyStartingTest(t *testing.T, vfs2 bool) {
// Container is not thread safe, so load another instance to run in
// concurrently.
- startCont, err := LoadAndCheck(rootDir, args.ID)
+ startCont, err := Load(rootDir, FullID{ContainerID: args.ID}, LoadOpts{})
if err != nil {
t.Fatalf("error loading container: %v", err)
}
@@ -2332,6 +2336,42 @@ func TestTTYField(t *testing.T) {
}
}
+// Test that a container can run even when there are corrupt state files in
+// the root directory.
+func TestCreateWithCorruptedStateFile(t *testing.T) {
+ conf := testutil.TestConfig(t)
+ spec := testutil.NewSpecWithArgs("/bin/true")
+ _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer cleanup()
+
+ // Create corrupted state file.
+ corruptID := testutil.RandomContainerID()
+ corruptState := buildPath(conf.RootDir, FullID{SandboxID: corruptID, ContainerID: corruptID}, stateFileExtension)
+ if err := ioutil.WriteFile(corruptState, []byte("this{file(is;not[valid.json"), 0777); err != nil {
+ t.Fatalf("createCorruptStateFile(): %v", err)
+ }
+ defer os.Remove(corruptState)
+
+ if _, err := Load(conf.RootDir, FullID{ContainerID: corruptID}, LoadOpts{SkipCheck: true}); err == nil {
+ t.Fatalf("loading corrupted state file should have failed")
+ }
+
+ args := Args{
+ ID: testutil.RandomContainerID(),
+ Spec: spec,
+ BundleDir: bundleDir,
+ Attached: true,
+ }
+ if ws, err := Run(conf, args); err != nil {
+ t.Errorf("running container: %v", err)
+ } else if !ws.Exited() || ws.ExitStatus() != 0 {
+ t.Errorf("container failed, waitStatus: %v", ws)
+ }
+}
+
func execute(cont *Container, name string, arg ...string) (syscall.WaitStatus, error) {
args := &control.ExecArgs{
Filename: name,
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index cadc63bf3..044eec6fe 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -301,54 +301,21 @@ func TestMultiContainerWait(t *testing.T) {
}
defer cleanup()
- // Check via ps that multiple processes are running.
- expectedPL := []*control.Process{
- newProcessBuilder().PID(2).PPID(0).Cmd("sleep").Process(),
+ // Check that we can wait for the sub-container.
+ c := containers[1]
+ if ws, err := c.Wait(); err != nil {
+ t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err)
+ } else if es := ws.ExitStatus(); es != 0 {
+ t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es)
}
- if err := waitForProcessList(containers[1], expectedPL); err != nil {
- t.Errorf("failed to wait for sleep to start: %v", err)
+ if _, err := c.Wait(); err != nil {
+ t.Errorf("wait for stopped container %s shouldn't fail: %v", c.Spec.Process.Args, err)
}
- // Wait on the short lived container from multiple goroutines.
- wg := sync.WaitGroup{}
- for i := 0; i < 3; i++ {
- wg.Add(1)
- go func(c *Container) {
- defer wg.Done()
- if ws, err := c.Wait(); err != nil {
- t.Errorf("failed to wait for process %s: %v", c.Spec.Process.Args, err)
- } else if es := ws.ExitStatus(); es != 0 {
- t.Errorf("process %s exited with non-zero status %d", c.Spec.Process.Args, es)
- }
- if _, err := c.Wait(); err != nil {
- t.Errorf("wait for stopped container %s shouldn't fail: %v", c.Spec.Process.Args, err)
- }
- }(containers[1])
- }
-
- // Also wait via PID.
- for i := 0; i < 3; i++ {
- wg.Add(1)
- go func(c *Container) {
- defer wg.Done()
- const pid = 2
- if ws, err := c.WaitPID(pid); err != nil {
- t.Errorf("failed to wait for PID %d: %v", pid, err)
- } else if es := ws.ExitStatus(); es != 0 {
- t.Errorf("PID %d exited with non-zero status %d", pid, es)
- }
- if _, err := c.WaitPID(pid); err == nil {
- t.Errorf("wait for stopped PID %d should fail", pid)
- }
- }(containers[1])
- }
-
- wg.Wait()
-
// After Wait returns, ensure that the root container is running and
// the child has finished.
- expectedPL = []*control.Process{
- newProcessBuilder().Cmd("sleep").Process(),
+ expectedPL := []*control.Process{
+ newProcessBuilder().Cmd("sleep").PID(1).Process(),
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err)
@@ -763,7 +730,7 @@ func TestMultiContainerKillAll(t *testing.T) {
// processes still running inside.
containers[1].SignalContainer(syscall.SIGKILL, false)
op := func() error {
- c, err := LoadAndCheck(conf.RootDir, ids[1])
+ c, err := Load(conf.RootDir, FullID{ContainerID: ids[1]}, LoadOpts{})
if err != nil {
return err
}
@@ -777,7 +744,7 @@ func TestMultiContainerKillAll(t *testing.T) {
}
}
- c, err := LoadAndCheck(conf.RootDir, ids[1])
+ c, err := Load(conf.RootDir, FullID{ContainerID: ids[1]}, LoadOpts{})
if err != nil {
t.Fatalf("failed to load child container %q: %v", c.ID, err)
}
@@ -900,7 +867,7 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
// Container is not thread safe, so load another instance to run in
// concurrently.
- startCont, err := LoadAndCheck(rootDir, ids[i])
+ startCont, err := Load(rootDir, FullID{ContainerID: ids[i]}, LoadOpts{})
if err != nil {
t.Fatalf("error loading container: %v", err)
}
@@ -1836,3 +1803,91 @@ func TestMultiContainerEvent(t *testing.T) {
}
}
}
+
+// Tests that duplicate environment variables in the spec are merged into one.
+func TestDuplicateEnvVariable(t *testing.T) {
+ conf := testutil.TestConfig(t)
+
+ rootDir, cleanup, err := testutil.SetupRootDir()
+ if err != nil {
+ t.Fatalf("error creating root dir: %v", err)
+ }
+ defer cleanup()
+ conf.RootDir = rootDir
+
+ // Create files to dump `env` output.
+ files := [3]*os.File{}
+ for i := 0; i < len(files); i++ {
+ var err error
+ files[i], err = ioutil.TempFile(testutil.TmpDir(), "env-var-test")
+ if err != nil {
+ t.Fatalf("creating temp file: %v", err)
+ }
+ defer files[i].Close()
+ defer os.Remove(files[i].Name())
+ }
+
+ // Setup the containers. Use root container to test exec too.
+ cmd1 := fmt.Sprintf("env > %q; sleep 1000", files[0].Name())
+ cmd2 := fmt.Sprintf("env > %q", files[1].Name())
+ cmdExec := fmt.Sprintf("env > %q", files[2].Name())
+ testSpecs, ids := createSpecs([]string{"/bin/bash", "-c", cmd1}, []string{"/bin/bash", "-c", cmd2})
+ testSpecs[0].Process.Env = append(testSpecs[0].Process.Env, "VAR=foo", "VAR=bar")
+ testSpecs[1].Process.Env = append(testSpecs[1].Process.Env, "VAR=foo", "VAR=bar")
+
+ containers, cleanup, err := startContainers(conf, testSpecs, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+
+ // Wait for the `env` from the root container to finish.
+ expectedPL := []*control.Process{
+ newProcessBuilder().Cmd("bash").Process(),
+ newProcessBuilder().Cmd("sleep").Process(),
+ }
+ if err := waitForProcessList(containers[0], expectedPL); err != nil {
+ t.Errorf("failed to wait for sleep to start: %v", err)
+ }
+ if ws, err := containers[1].Wait(); err != nil {
+ t.Errorf("failed to wait container 1: %v", err)
+ } else if es := ws.ExitStatus(); es != 0 {
+ t.Errorf("container %s exited with non-zero status: %v", containers[1].ID, es)
+ }
+
+ execArgs := &control.ExecArgs{
+ Filename: "/bin/bash",
+ Argv: []string{"/bin/bash", "-c", cmdExec},
+ Envv: []string{"VAR=foo", "VAR=bar"},
+ }
+ if ws, err := containers[0].executeSync(execArgs); err != nil || ws.ExitStatus() != 0 {
+ t.Fatalf("exec failed, ws: %v, err: %v", ws, err)
+ }
+
+ // Now read and check that none of the env has repeated values.
+ for _, file := range files {
+ out, err := ioutil.ReadAll(file)
+ if err != nil {
+ t.Fatal(err)
+ }
+ t.Logf("Checking env %q:\n%s", file.Name(), out)
+ envs := make(map[string]string)
+ for _, line := range strings.Split(string(out), "\n") {
+ if len(line) == 0 {
+ continue
+ }
+ envVar := strings.SplitN(line, "=", 2)
+ if len(envVar) != 2 {
+ t.Fatalf("invalid env variable: %s", line)
+ }
+ key := envVar[0]
+ if val, ok := envs[key]; ok {
+ t.Errorf("env variable %q is duplicated: %q and %q", key, val, envVar[1])
+ }
+ envs[key] = envVar[1]
+ }
+ if _, ok := envs["VAR"]; !ok {
+ t.Errorf("variable VAR missing: %v", envs)
+ }
+ }
+}
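
The behavior this test pins down can be expressed as a small helper; this is an illustration of the expected semantics (later assignment wins, first-seen order preserved), not the sentry's actual implementation. Assumes a "strings" import:

    // dedupEnv collapses repeated NAME=value entries, keeping the last
    // value seen for each name while preserving first-seen ordering.
    func dedupEnv(env []string) []string {
        idx := make(map[string]int) // name -> position in out
        out := make([]string, 0, len(env))
        for _, kv := range env {
            name := kv
            if i := strings.IndexByte(kv, '='); i >= 0 {
                name = kv[:i]
            }
            if i, ok := idx[name]; ok {
                out[i] = kv // later assignment wins
                continue
            }
            idx[name] = len(out)
            out = append(out, kv)
        }
        return out
    }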
diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go
index 17a251530..dfbf1f2d3 100644
--- a/runsc/container/state_file.go
+++ b/runsc/container/state_file.go
@@ -20,58 +20,228 @@ import (
"io/ioutil"
"os"
"path/filepath"
+ "regexp"
+ "strings"
+ "syscall"
"github.com/gofrs/flock"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sync"
)
-const stateFileExtension = ".state"
+const stateFileExtension = "state"
-// StateFile handles load from/save to container state safely from multiple
-// processes. It uses a lock file to provide synchronization between operations.
+// LoadOpts provides options for Load()ing a container.
+type LoadOpts struct {
+ // Exact tells whether the search should be exact. See Load() for more.
+ Exact bool
+
+ // SkipCheck tells Load() to skip checking if the container is running.
+ SkipCheck bool
+}
+
+// Load loads a container with the given id from a metadata file. "id" may
+// be an abbreviation of the full container id when LoadOpts.Exact is not
+// set. It also checks if the container is still running, in order to return
+// an error to the caller earlier. This check is skipped if LoadOpts.SkipCheck
+// is set.
//
-// The lock file is located at: "${s.RootDir}/${s.ID}.lock".
-// The state file is located at: "${s.RootDir}/${s.ID}.state".
-type StateFile struct {
- // RootDir is the directory containing the container metadata file.
- RootDir string `json:"rootDir"`
+// Returns ErrNotExist if no container is found. Returns an error if more
+// than one container matches the ID prefix.
+func Load(rootDir string, id FullID, opts LoadOpts) (*Container, error) {
+ log.Debugf("Load container, rootDir: %q, id: %+v, opts: %+v", rootDir, id, opts)
+ if !opts.Exact {
+ var err error
+ id, err = findContainerID(rootDir, id.ContainerID)
+ if err != nil {
+ // Preserve error so that callers can distinguish 'not found' errors.
+ return nil, err
+ }
+ }
- // ID is the container ID.
- ID string `json:"id"`
+ if err := id.validate(); err != nil {
+ return nil, fmt.Errorf("invalid container id: %v", err)
+ }
+ state := StateFile{
+ RootDir: rootDir,
+ ID: id,
+ }
+ defer state.close()
- //
- // Fields below this line are not saved in the state file and will not
- // be preserved across commands.
- //
+ c := &Container{}
+ if err := state.load(c); err != nil {
+ if os.IsNotExist(err) {
+ // Preserve error so that callers can distinguish 'not found' errors.
+ return nil, err
+ }
+ return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err)
+ }
- once sync.Once
- flock *flock.Flock
+ if !opts.SkipCheck {
+ // If the status is "Running" or "Created", check that the sandbox/container
+ // is still running, setting it to Stopped if not.
+ //
+ // This is inherently racy.
+ switch c.Status {
+ case Created:
+ if !c.IsSandboxRunning() {
+ // Sandbox no longer exists, so this container definitely does not exist.
+ c.changeStatus(Stopped)
+ }
+ case Running:
+ if err := c.SignalContainer(syscall.Signal(0), false); err != nil {
+ c.changeStatus(Stopped)
+ }
+ }
+ }
+
+ return c, nil
}
// List returns all container ids in the given root directory.
-func List(rootDir string) ([]string, error) {
+func List(rootDir string) ([]FullID, error) {
log.Debugf("List containers %q", rootDir)
- list, err := filepath.Glob(filepath.Join(rootDir, "*"+stateFileExtension))
+ return listMatch(rootDir, FullID{})
+}
+
+// listMatch returns all container ids that match the provided id.
+func listMatch(rootDir string, id FullID) ([]FullID, error) {
+ id.SandboxID += "*"
+ id.ContainerID += "*"
+ pattern := buildPath(rootDir, id, stateFileExtension)
+ list, err := filepath.Glob(pattern)
if err != nil {
return nil, err
}
- var out []string
+ var out []FullID
for _, path := range list {
- // Filter out files that do no belong to a container.
- fileName := filepath.Base(path)
- if len(fileName) < len(stateFileExtension) {
- panic(fmt.Sprintf("invalid file match %q", path))
- }
- // Remove the extension.
- cid := fileName[:len(fileName)-len(stateFileExtension)]
- if validateID(cid) == nil {
- out = append(out, cid)
+ id, err := parseFileName(filepath.Base(path))
+ if err == nil {
+ out = append(out, id)
}
}
return out, nil
}
+// loadSandbox loads all containers that belong to the sandbox with the given
+// ID.
+func loadSandbox(rootDir, id string) ([]*Container, error) {
+ cids, err := listMatch(rootDir, FullID{SandboxID: id})
+ if err != nil {
+ return nil, err
+ }
+
+ // Load the container metadata.
+ var containers []*Container
+ for _, cid := range cids {
+ container, err := Load(rootDir, cid, LoadOpts{Exact: true, SkipCheck: true})
+ if err != nil {
+ // Container file may not exist if it raced with creation/deletion or
+ // directory was left behind. Load provides a snapshot in time, so it's
+ // fine to skip it.
+ if os.IsNotExist(err) {
+ continue
+ }
+ return nil, fmt.Errorf("loading sandbox %q, failed to load container %q: %v", id, cid, err)
+ }
+ containers = append(containers, container)
+ }
+ return containers, nil
+}
+
+func findContainerID(rootDir, partialID string) (FullID, error) {
+ // Check whether the id fully specifies an existing container.
+ pattern := buildPath(rootDir, FullID{SandboxID: "*", ContainerID: partialID + "*"}, stateFileExtension)
+ list, err := filepath.Glob(pattern)
+ if err != nil {
+ return FullID{}, err
+ }
+ switch len(list) {
+ case 0:
+ return FullID{}, os.ErrNotExist
+ case 1:
+ return parseFileName(filepath.Base(list[0]))
+ }
+
+ // Now see whether id could be an abbreviation of exactly 1 of the
+ // container ids. If id is ambiguous (it could match more than 1
+ // container), it is an error.
+ ids, err := List(rootDir)
+ if err != nil {
+ return FullID{}, err
+ }
+ var rv *FullID
+ for _, id := range ids {
+ if strings.HasPrefix(id.ContainerID, partialID) {
+ if rv != nil {
+ return FullID{}, fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, rv, id)
+ }
+ rv = &id
+ }
+ }
+ if rv == nil {
+ return FullID{}, os.ErrNotExist
+ }
+ log.Debugf("abbreviated id %q resolves to full id %v", partialID, *rv)
+ return *rv, nil
+}
+
+func parseFileName(name string) (FullID, error) {
+ re := regexp.MustCompile(`([\w+-\.]+)_sandbox:([\w+-\.]+)\.` + stateFileExtension)
+ groups := re.FindStringSubmatch(name)
+ if len(groups) != 3 {
+ return FullID{}, fmt.Errorf("invalid state file name format: %q", name)
+ }
+ id := FullID{
+ SandboxID: groups[2],
+ ContainerID: groups[1],
+ }
+ if err := id.validate(); err != nil {
+ return FullID{}, fmt.Errorf("invalid state file name %q: %w", name, err)
+ }
+ return id, nil
+}
+
+// FullID combines sandbox and container ID to identify a container. Sandbox ID
+// is used to allow all containers for a given sandbox to be loaded by matching
+// sandbox ID in the file name.
+type FullID struct {
+ SandboxID string `json:"sandboxId"`
+ ContainerID string `json:"containerId"`
+}
+
+func (f *FullID) String() string {
+ return f.SandboxID + "/" + f.ContainerID
+}
+
+func (f *FullID) validate() error {
+ if err := validateID(f.SandboxID); err != nil {
+ return err
+ }
+ return validateID(f.ContainerID)
+}
+
+// StateFile handles load from/save to container state safely from multiple
+// processes. It uses a lock file to provide synchronization between operations.
+//
+// The lock file is located at: "${s.RootDir}/${container-id}_sandbox:${sandbox-id}.lock".
+// The state file is located at: "${s.RootDir}/${container-id}_sandbox:${sandbox-id}.state".
+type StateFile struct {
+ // RootDir is the directory containing the container metadata file.
+ RootDir string `json:"rootDir"`
+
+ // ID is the sandbox+container ID.
+ ID FullID `json:"id"`
+
+ //
+ // Fields below this line are not saved in the state file and will not
+ // be preserved across commands.
+ //
+
+ once sync.Once
+ flock *flock.Flock
+}
+
// lock globally locks all locking operations for the container.
func (s *StateFile) lock() error {
s.once.Do(func() {
@@ -157,18 +327,20 @@ func (s *StateFile) close() error {
return s.flock.Close()
}
-func buildStatePath(rootDir, id string) string {
- return filepath.Join(rootDir, id+stateFileExtension)
+func buildPath(rootDir string, id FullID, extension string) string {
+ // Note: "_" and ":" are not valid in IDs.
+ name := fmt.Sprintf("%s_sandbox:%s.%s", id.ContainerID, id.SandboxID, extension)
+ return filepath.Join(rootDir, name)
}
// statePath is the full path to the state file.
func (s *StateFile) statePath() string {
- return buildStatePath(s.RootDir, s.ID)
+ return buildPath(s.RootDir, s.ID, stateFileExtension)
}
// lockPath is the full path to the lock file.
func (s *StateFile) lockPath() string {
- return filepath.Join(s.RootDir, s.ID+".lock")
+ return buildPath(s.RootDir, s.ID, "lock")
}
// destroy deletes all state created by the stateFile. It may be called with the
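
Putting the new naming scheme together: buildPath and parseFileName must round-trip, since listMatch and findContainerID glob on the "<container-id>_sandbox:<sandbox-id>.state" pattern. A sketch written as if inside this package, assuming "fmt" and "path/filepath" imports:

    func ExampleStateFileRoundTrip() {
        id := FullID{SandboxID: "sb1", ContainerID: "c1"}
        p := buildPath("/var/run/runsc", id, stateFileExtension)
        fmt.Println(p)

        // parseFileName must recover the same FullID from the file name.
        parsed, err := parseFileName(filepath.Base(p))
        if err != nil || parsed != id {
            panic(fmt.Sprintf("round trip failed: %+v, %v", parsed, err))
        }
        // Output: /var/run/runsc/c1_sandbox:sb1.state
    }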
diff --git a/runsc/flag/flag.go b/runsc/flag/flag.go
index 775325c06..f921a8107 100644
--- a/runsc/flag/flag.go
+++ b/runsc/flag/flag.go
@@ -19,8 +19,10 @@ import (
"flag"
)
+// FlagSet is an alias for flag.FlagSet.
type FlagSet = flag.FlagSet
+// Aliases for flag functions.
var (
Bool = flag.Bool
CommandLine = flag.CommandLine
@@ -32,6 +34,7 @@ var (
Var = flag.Var
)
+// ContinueOnError is an alias for flag.ContinueOnError.
const ContinueOnError = flag.ContinueOnError
// Get returns the flag's underlying object.
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index 96c57a426..c56e1d4d0 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -29,9 +29,12 @@ go_test(
srcs = ["fsgofer_test.go"],
library = ":fsgofer",
deps = [
+ "//pkg/fd",
"//pkg/log",
"//pkg/p9",
"//pkg/test/testutil",
+ "//runsc/specutils",
+ "@com_github_syndtr_gocapability//capability:go_default_library",
"@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 0b628c8ce..c3bba0973 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -49,6 +49,21 @@ const (
allowedOpenFlags = unix.O_TRUNC
)
+var (
+ // Remember the process uid/gid to skip chown calls when file owner/group
+ // doesn't need to be changed.
+ processUID = p9.UID(os.Getuid())
+ processGID = p9.GID(os.Getgid())
+)
+
+// join is equivalent to path.Join() but skips path.Clean(), which is expensive.
+func join(parent, child string) string {
+ if child == "." || child == ".." {
+ panic(fmt.Sprintf("invalid child path %q", child))
+ }
+ return parent + "/" + child
+}
+
// Config sets configuration options for each attach point.
type Config struct {
// ROMount is set to true if this is a readonly mount.
@@ -115,7 +130,7 @@ func (a *attachPoint) Attach() (p9.File, error) {
return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err)
}
- lf, err := newLocalFile(a, f, a.prefix, readable, stat)
+ lf, err := newLocalFile(a, f, a.prefix, readable, &stat)
if err != nil {
return nil, fmt.Errorf("unable to create localFile %q: %v", a.prefix, err)
}
@@ -124,7 +139,7 @@ func (a *attachPoint) Attach() (p9.File, error) {
}
// makeQID returns a unique QID for the given stat buffer.
-func (a *attachPoint) makeQID(stat unix.Stat_t) p9.QID {
+func (a *attachPoint) makeQID(stat *unix.Stat_t) p9.QID {
a.deviceMu.Lock()
defer a.deviceMu.Unlock()
@@ -245,7 +260,7 @@ func reopenProcFd(f *fd.FD, mode int) (*fd.FD, error) {
}
func openAnyFileFromParent(parent *localFile, name string) (*fd.FD, string, bool, error) {
- pathDebug := path.Join(parent.hostPath, name)
+ pathDebug := join(parent.hostPath, name)
f, readable, err := openAnyFile(pathDebug, func(mode int) (*fd.FD, error) {
return fd.OpenAt(parent.file, name, openFlags|mode, 0)
})
@@ -297,8 +312,8 @@ func openAnyFile(pathDebug string, fn func(mode int) (*fd.FD, error)) (*fd.FD, b
return nil, false, extractErrno(err)
}
-func checkSupportedFileType(stat unix.Stat_t, permitSocket bool) error {
- switch stat.Mode & unix.S_IFMT {
+func checkSupportedFileType(mode uint32, permitSocket bool) error {
+ switch mode & unix.S_IFMT {
case unix.S_IFREG, unix.S_IFDIR, unix.S_IFLNK:
return nil
@@ -313,8 +328,8 @@ func checkSupportedFileType(stat unix.Stat_t, permitSocket bool) error {
}
}
-func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat unix.Stat_t) (*localFile, error) {
- if err := checkSupportedFileType(stat, a.conf.HostUDS); err != nil {
+func newLocalFile(a *attachPoint, file *fd.FD, path string, readable bool, stat *unix.Stat_t) (*localFile, error) {
+ if err := checkSupportedFileType(stat.Mode, a.conf.HostUDS); err != nil {
return nil, err
}
@@ -442,8 +457,10 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode,
})
defer cu.Clean()
- if err := fchown(child.FD(), uid, gid); err != nil {
- return nil, nil, p9.QID{}, 0, extractErrno(err)
+ if uid != processUID || gid != processGID {
+ if err := fchown(child.FD(), uid, gid); err != nil {
+ return nil, nil, p9.QID{}, 0, extractErrno(err)
+ }
}
stat, err := fstat(child.FD())
if err != nil {
@@ -452,11 +469,11 @@ func (l *localFile) Create(name string, p9Flags p9.OpenFlags, perm p9.FileMode,
c := &localFile{
attachPoint: l.attachPoint,
- hostPath: path.Join(l.hostPath, name),
+ hostPath: join(l.hostPath, name),
file: child,
mode: mode,
fileType: unix.S_IFREG,
- qid: l.attachPoint.makeQID(stat),
+ qid: l.attachPoint.makeQID(&stat),
}
cu.Release()
@@ -488,8 +505,10 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
}
defer f.Close()
- if err := fchown(f.FD(), uid, gid); err != nil {
- return p9.QID{}, extractErrno(err)
+ if uid != processUID || gid != processGID {
+ if err := fchown(f.FD(), uid, gid); err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
}
stat, err := fstat(f.FD())
if err != nil {
@@ -497,7 +516,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID)
}
cu.Release()
- return l.attachPoint.makeQID(stat), nil
+ return l.attachPoint.makeQID(&stat), nil
}
// Walk implements p9.File.
@@ -512,7 +531,7 @@ func (l *localFile) WalkGetAttr(names []string) ([]p9.QID, p9.File, p9.AttrMask,
if err != nil {
return nil, nil, p9.AttrMask{}, p9.Attr{}, err
}
- mask, attr := l.fillAttr(stat)
+ mask, attr := l.fillAttr(&stat)
return qids, file, mask, attr, nil
}
@@ -538,13 +557,13 @@ func (l *localFile) walk(names []string) ([]p9.QID, p9.File, unix.Stat_t, error)
file: newFile,
mode: invalidMode,
fileType: l.fileType,
- qid: l.attachPoint.makeQID(stat),
+ qid: l.attachPoint.makeQID(&stat),
controlReadable: readable,
}
return []p9.QID{c.qid}, c, stat, nil
}
- var qids []p9.QID
+ qids := make([]p9.QID, 0, len(names))
var lastStat unix.Stat_t
last := l
for _, name := range names {
@@ -560,7 +579,7 @@ func (l *localFile) walk(names []string) ([]p9.QID, p9.File, unix.Stat_t, error)
_ = f.Close()
return nil, nil, unix.Stat_t{}, extractErrno(err)
}
- c, err := newLocalFile(last.attachPoint, f, path, readable, lastStat)
+ c, err := newLocalFile(last.attachPoint, f, path, readable, &lastStat)
if err != nil {
_ = f.Close()
return nil, nil, unix.Stat_t{}, extractErrno(err)
@@ -609,11 +628,11 @@ func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error)
if err != nil {
return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err)
}
- mask, attr := l.fillAttr(stat)
+ mask, attr := l.fillAttr(&stat)
return l.qid, mask, attr, nil
}
-func (l *localFile) fillAttr(stat unix.Stat_t) (p9.AttrMask, p9.Attr) {
+func (l *localFile) fillAttr(stat *unix.Stat_t) (p9.AttrMask, p9.Attr) {
attr := p9.Attr{
Mode: p9.FileMode(stat.Mode),
UID: p9.UID(stat.Uid),
@@ -739,15 +758,15 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
// utimensat operates differently than other syscalls. To operate on a
// symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty
// name.
- parent, err := unix.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0)
- if err != nil {
- return extractErrno(err)
+ parent, oErr := unix.Open(path.Dir(l.hostPath), openFlags|unix.O_PATH, 0)
+ if oErr != nil {
+ return extractErrno(oErr)
}
defer unix.Close(parent)
- if terr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); terr != nil {
- log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, terr)
- err = extractErrno(terr)
+ if tErr := utimensat(parent, path.Base(l.hostPath), utimes, linux.AT_SYMLINK_NOFOLLOW); tErr != nil {
+ log.Debugf("SetAttr utimens failed %q, err: %v", l.hostPath, tErr)
+ err = extractErrno(tErr)
}
} else {
// Directories and regular files can operate directly on the fd
@@ -768,9 +787,9 @@ func (l *localFile) SetAttr(valid p9.SetAttrMask, attr p9.SetAttr) error {
if valid.GID {
gid = int(attr.GID)
}
- if oerr := unix.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oerr != nil {
- log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oerr)
- err = extractErrno(oerr)
+ if oErr := unix.Fchownat(f.FD(), "", uid, gid, linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW); oErr != nil {
+ log.Debugf("SetAttr fchownat failed %q, err: %v", l.hostPath, oErr)
+ err = extractErrno(oErr)
}
}
@@ -881,8 +900,10 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
}
defer f.Close()
- if err := fchown(f.FD(), uid, gid); err != nil {
- return p9.QID{}, extractErrno(err)
+ if uid != processUID || gid != processGID {
+ if err := fchown(f.FD(), uid, gid); err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
}
stat, err := fstat(f.FD())
if err != nil {
@@ -890,7 +911,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9.
}
cu.Release()
- return l.attachPoint.makeQID(stat), nil
+ return l.attachPoint.makeQID(&stat), nil
}
// Link implements p9.File.
@@ -938,8 +959,10 @@ func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid
}
defer child.Close()
- if err := fchown(child.FD(), uid, gid); err != nil {
- return p9.QID{}, extractErrno(err)
+ if uid != processUID || gid != processGID {
+ if err := fchown(child.FD(), uid, gid); err != nil {
+ return p9.QID{}, extractErrno(err)
+ }
}
stat, err := fstat(child.FD())
if err != nil {
@@ -947,7 +970,7 @@ func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid
}
cu.Release()
- return l.attachPoint.makeQID(stat), nil
+ return l.attachPoint.makeQID(&stat), nil
}
// UnlinkAt implements p9.File.
@@ -1045,7 +1068,7 @@ func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64)
log.Warningf("Readdir is skipping file with failed stat %q, err: %v", l.hostPath, err)
continue
}
- qid := l.attachPoint.makeQID(stat)
+ qid := l.attachPoint.makeQID(&stat)
offset++
dirents = append(dirents, p9.Dirent{
QID: qid,
@@ -1139,7 +1162,7 @@ func (l *localFile) isOpen() bool {
// Renamed implements p9.Renamed.
func (l *localFile) Renamed(newDir p9.File, newName string) {
- l.hostPath = path.Join(newDir.(*localFile).hostPath, newName)
+ l.hostPath = join(newDir.(*localFile).hostPath, newName)
}
// extractErrno tries to determine the errno.
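
The join helper above is only safe because gofer child names are single, already-clean path elements; path.Join would run path.Clean on every call. A sketch of that invariant as a guard that returns an error instead of panicking (a hypothetical variant, not the gofer's code; assumes "fmt" and "strings" imports):

    // safeJoin concatenates parent and child without path.Clean, which is
    // valid only when child is one clean path element.
    func safeJoin(parent, child string) (string, error) {
        if child == "." || child == ".." || strings.ContainsRune(child, '/') {
            return "", fmt.Errorf("invalid child path element %q", child)
        }
        return parent + "/" + child, nil
    }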
diff --git a/runsc/fsgofer/fsgofer_test.go b/runsc/fsgofer/fsgofer_test.go
index a84206686..c5daebe5e 100644
--- a/runsc/fsgofer/fsgofer_test.go
+++ b/runsc/fsgofer/fsgofer_test.go
@@ -23,10 +23,13 @@ import (
"path/filepath"
"testing"
+ "github.com/syndtr/gocapability/capability"
"golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/test/testutil"
+ "gvisor.dev/gvisor/runsc/specutils"
)
var allOpenFlags = []p9.OpenFlags{p9.ReadOnly, p9.WriteOnly, p9.ReadWrite}
@@ -197,10 +200,13 @@ func setup(fileType uint32) (string, string, error) {
switch fileType {
case unix.S_IFREG:
name = "file"
- _, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
+ fd, f, _, _, err := root.Create(name, p9.ReadWrite, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid()))
if err != nil {
return "", "", fmt.Errorf("createFile(root, %q) failed, err: %v", "test", err)
}
+ if fd != nil {
+ fd.Close()
+ }
defer f.Close()
case unix.S_IFDIR:
name = "dir"
@@ -556,7 +562,28 @@ func TestROMountChecks(t *testing.T) {
func TestWalkNotFound(t *testing.T) {
runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) {
if _, _, err := s.file.Walk([]string{"nobody-here"}); err != unix.ENOENT {
- t.Errorf("%v: Walk(%q) should have failed, got: %v, expected: unix.ENOENT", s, "nobody-here", err)
+ t.Errorf("Walk(%q) should have failed, got: %v, expected: unix.ENOENT", "nobody-here", err)
+ }
+ if _, _, err := s.file.Walk([]string{"nobody", "here"}); err != unix.ENOENT {
+ t.Errorf("Walk(%q) should have failed, got: %v, expected: unix.ENOENT", "nobody/here", err)
+ }
+ if !s.conf.ROMount {
+ if _, err := s.file.Mkdir("dir", 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+ t.Fatalf("MkDir(dir) failed, err: %v", err)
+ }
+ if _, _, err := s.file.Walk([]string{"dir", "nobody-here"}); err != unix.ENOENT {
+ t.Errorf("Walk(%q) should have failed, got: %v, expected: unix.ENOENT", "dir/nobody-here", err)
+ }
+ }
+ })
+}
+
+func TestWalkPanic(t *testing.T) {
+ runCustom(t, []uint32{unix.S_IFDIR}, allConfs, func(t *testing.T, s state) {
+ for _, name := range []string{".", ".."} {
+ assertPanic(t, func() {
+ s.file.Walk([]string{name})
+ })
}
})
}
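
TestWalkPanic relies on an assertPanic helper that is not part of this diff. A plausible shape for it, assuming only the standard testing package; the real helper may differ:

    // assertPanic runs f, recovers the expected panic, and fails the test
    // if f returned normally.
    func assertPanic(t *testing.T, f func()) {
        t.Helper()
        defer func() {
            if recover() == nil {
                t.Errorf("function did not panic")
            }
        }()
        f()
    }
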
@@ -574,6 +601,27 @@ func TestWalkDup(t *testing.T) {
})
}
+func TestWalkMultiple(t *testing.T) {
+ runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
+ var names []string
+ var parent p9.File = s.file
+ for i := 0; i < 5; i++ {
+ name := fmt.Sprintf("dir%d", i)
+ names = append(names, name)
+
+ if _, err := parent.Mkdir(name, 0777, p9.UID(os.Getuid()), p9.GID(os.Getgid())); err != nil {
+ t.Fatalf("MkDir(%q) failed, err: %v", name, err)
+ }
+
+ var err error
+ _, parent, err = s.file.Walk(names)
+ if err != nil {
+ t.Errorf("Walk(%q): %v", name, err)
+ }
+ }
+ })
+}
+
func TestReaddir(t *testing.T) {
runCustom(t, []uint32{unix.S_IFDIR}, rwConfs, func(t *testing.T, s state) {
name := "dir"
@@ -819,3 +867,168 @@ func TestMknod(t *testing.T) {
}
})
}
+
+func BenchmarkWalkOne(b *testing.B) {
+ path, name, err := setup(unix.S_IFDIR)
+ if err != nil {
+ b.Fatalf("%v", err)
+ }
+ defer os.RemoveAll(path)
+
+ a, err := NewAttachPoint(path, Config{})
+ if err != nil {
+ b.Fatalf("NewAttachPoint failed: %v", err)
+ }
+ root, err := a.Attach()
+ if err != nil {
+ b.Fatalf("Attach failed, err: %v", err)
+ }
+ defer root.Close()
+
+ names := []string{name}
+ files := make([]p9.File, 0, 1000)
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ _, file, err := root.Walk(names)
+ if err != nil {
+ b.Fatalf("Walk(%q): %v", name, err)
+ }
+ files = append(files, file)
+
+ // Avoid running out of FDs.
+ if len(files) == cap(files) {
+ b.StopTimer()
+ for _, file := range files {
+ file.Close()
+ }
+ files = files[:0]
+ b.StartTimer()
+ }
+ }
+
+ b.StopTimer()
+ for _, file := range files {
+ file.Close()
+ }
+}
+
+func BenchmarkCreate(b *testing.B) {
+ path, _, err := setup(unix.S_IFDIR)
+ if err != nil {
+ b.Fatalf("%v", err)
+ }
+ defer os.RemoveAll(path)
+
+ a, err := NewAttachPoint(path, Config{})
+ if err != nil {
+ b.Fatalf("NewAttachPoint failed: %v", err)
+ }
+ root, err := a.Attach()
+ if err != nil {
+ b.Fatalf("Attach failed, err: %v", err)
+ }
+ defer root.Close()
+
+ files := make([]p9.File, 0, 500)
+ fds := make([]*fd.FD, 0, 500)
+ uid := p9.UID(os.Getuid())
+ gid := p9.GID(os.Getgid())
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ name := fmt.Sprintf("same-%d", i)
+ fd, file, _, _, err := root.Create(name, p9.ReadOnly, 0777, uid, gid)
+ if err != nil {
+ b.Fatalf("Create(%q): %v", name, err)
+ }
+ files = append(files, file)
+ if fd != nil {
+ fds = append(fds, fd)
+ }
+
+ // Avoid running out of FDs.
+ if len(files) == cap(files) {
+ b.StopTimer()
+ for _, file := range files {
+ file.Close()
+ }
+ files = files[:0]
+ for _, fd := range fds {
+ fd.Close()
+ }
+ fds = fds[:0]
+ b.StartTimer()
+ }
+ }
+
+ b.StopTimer()
+ for _, file := range files {
+ file.Close()
+ }
+ for _, fd := range fds {
+ fd.Close()
+ }
+}
+
+func BenchmarkCreateDiffOwner(b *testing.B) {
+ if !specutils.HasCapabilities(capability.CAP_CHOWN) {
+ b.Skipf("Test requires CAP_CHOWN")
+ }
+
+ path, _, err := setup(unix.S_IFDIR)
+ if err != nil {
+ b.Fatalf("%v", err)
+ }
+ defer os.RemoveAll(path)
+
+ a, err := NewAttachPoint(path, Config{})
+ if err != nil {
+ b.Fatalf("NewAttachPoint failed: %v", err)
+ }
+ root, err := a.Attach()
+ if err != nil {
+ b.Fatalf("Attach failed, err: %v", err)
+ }
+ defer root.Close()
+
+ files := make([]p9.File, 0, 500)
+ fds := make([]*fd.FD, 0, 500)
+ gid := p9.GID(os.Getgid())
+ const nobody = 65534
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ name := fmt.Sprintf("diff-%d", i)
+ fd, file, _, _, err := root.Create(name, p9.ReadOnly, 0777, nobody, gid)
+ if err != nil {
+ b.Fatalf("Create(%q): %v", name, err)
+ }
+ files = append(files, file)
+ if fd != nil {
+ fds = append(fds, fd)
+ }
+
+ // Avoid running out of FDs.
+ if len(files) == cap(files) {
+ b.StopTimer()
+ for _, file := range files {
+ file.Close()
+ }
+ files = files[:0]
+ for _, fd := range fds {
+ fd.Close()
+ }
+ fds = fds[:0]
+ b.StartTimer()
+ }
+ }
+
+ b.StopTimer()
+ for _, file := range files {
+ file.Close()
+ }
+ for _, fd := range fds {
+ fd.Close()
+ }
+}
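
The three benchmarks above share a close-in-batches pattern: open files accumulate up to a fixed cap, then get closed with the timer stopped so the cleanup does not pollute the measurement. A sketch of that pattern factored into a helper (hypothetical, not part of the change):

    // flushFiles closes the accumulated files outside the timed region and
    // returns the reusable, emptied slice.
    func flushFiles(b *testing.B, files []p9.File) []p9.File {
        b.StopTimer()
        for _, f := range files {
            f.Close()
        }
        b.StartTimer()
        return files[:0]
    }

Each benchmark body would then replace its inline batch with files = flushFiles(b, files) once len(files) reaches cap(files).
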
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 8f66dd1f8..9e429f7d5 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -127,7 +127,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
// Get all interfaces in the namespace.
ifaces, err := net.Interfaces()
if err != nil {
- return fmt.Errorf("querying interfaces: %v", err)
+ return fmt.Errorf("querying interfaces: %w", err)
}
isRoot, err := isRootNS()
@@ -148,14 +148,14 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
allAddrs, err := iface.Addrs()
if err != nil {
- return fmt.Errorf("fetching interface addresses for %q: %v", iface.Name, err)
+ return fmt.Errorf("fetching interface addresses for %q: %w", iface.Name, err)
}
// We build our own loopback device.
if iface.Flags&net.FlagLoopback != 0 {
link, err := loopbackLink(iface, allAddrs)
if err != nil {
- return fmt.Errorf("getting loopback link for iface %q: %v", iface.Name, err)
+ return fmt.Errorf("getting loopback link for iface %q: %w", iface.Name, err)
}
args.LoopbackLinks = append(args.LoopbackLinks, link)
continue
@@ -209,7 +209,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
// Get the link for the interface.
ifaceLink, err := netlink.LinkByName(iface.Name)
if err != nil {
- return fmt.Errorf("getting link for interface %q: %v", iface.Name, err)
+ return fmt.Errorf("getting link for interface %q: %w", iface.Name, err)
}
link.LinkAddress = ifaceLink.Attrs().HardwareAddr
@@ -219,7 +219,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
log.Debugf("Creating Channel %d", i)
socketEntry, err := createSocket(iface, ifaceLink, hardwareGSO)
if err != nil {
- return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err)
+ return fmt.Errorf("failed to createSocket for %s : %w", iface.Name, err)
}
if i == 0 {
link.GSOMaxSize = socketEntry.gsoMaxSize
@@ -241,11 +241,12 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
// Collect the addresses for the interface, enable forwarding,
// and remove them from the host.
for _, addr := range ipAddrs {
- link.Addresses = append(link.Addresses, addr.IP)
+ prefix, _ := addr.Mask.Size()
+ link.Addresses = append(link.Addresses, boot.IPWithPrefix{Address: addr.IP, PrefixLen: prefix})
// Steal IP address from NIC.
if err := removeAddress(ifaceLink, addr.String()); err != nil {
- return fmt.Errorf("removing address %v from device %q: %v", iface.Name, addr, err)
+ return fmt.Errorf("removing address %v from device %q: %w", addr, iface.Name, err)
}
}
@@ -254,7 +255,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
log.Debugf("Setting up network, config: %+v", args)
if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &args, nil); err != nil {
- return fmt.Errorf("creating links and routes: %v", err)
+ return fmt.Errorf("creating links and routes: %w", err)
}
return nil
}
@@ -278,8 +279,6 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (
ll := syscall.SockaddrLinklayer{
Protocol: protocol,
Ifindex: iface.Index,
- Hatype: 0, // No ARP type.
- Pkttype: syscall.PACKET_OTHERHOST,
}
if err := syscall.Bind(fd, &ll); err != nil {
return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
@@ -339,9 +338,15 @@ func loopbackLink(iface net.Interface, addrs []net.Addr) (boot.LoopbackLink, err
if !ok {
return boot.LoopbackLink{}, fmt.Errorf("address is not IPNet: %+v", addr)
}
+
+ prefix, _ := ipNet.Mask.Size()
+ link.Addresses = append(link.Addresses, boot.IPWithPrefix{
+ Address: ipNet.IP,
+ PrefixLen: prefix,
+ })
+
dst := *ipNet
dst.IP = dst.IP.Mask(dst.Mask)
- link.Addresses = append(link.Addresses, ipNet.IP)
link.Routes = append(link.Routes, boot.Route{
Destination: dst,
})
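
The switch from bare net.IP values to boot.IPWithPrefix above hinges on net.IPMask.Size, which reports the number of leading one bits in the mask (the prefix length) alongside the total mask width. A standalone illustration:

    package main

    import (
        "fmt"
        "net"
    )

    func main() {
        _, ipNet, err := net.ParseCIDR("192.0.2.10/24")
        if err != nil {
            panic(err)
        }
        prefix, bits := ipNet.Mask.Size()
        fmt.Println(prefix, bits) // 24 32: prefix length and total mask bits
    }
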
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 4a4110477..266bc0bdc 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -173,7 +173,7 @@ func New(conf *config.Config, args *Args) (*Sandbox, error) {
}
// CreateContainer creates a non-root container inside the sandbox.
-func (s *Sandbox) CreateContainer(cid string) error {
+func (s *Sandbox) CreateContainer(cid string, tty *os.File) error {
log.Debugf("Create non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid)
sandboxConn, err := s.sandboxConnect()
if err != nil {
@@ -181,7 +181,16 @@ func (s *Sandbox) CreateContainer(cid string) error {
}
defer sandboxConn.Close()
- if err := sandboxConn.Call(boot.ContainerCreate, &cid, nil); err != nil {
+ var files []*os.File
+ if tty != nil {
+ files = []*os.File{tty}
+ }
+
+ args := boot.CreateArgs{
+ CID: cid,
+ FilePayload: urpc.FilePayload{Files: files},
+ }
+ if err := sandboxConn.Call(boot.ContainerCreate, &args, nil); err != nil {
return fmt.Errorf("creating non-root container %q: %v", cid, err)
}
return nil
@@ -211,11 +220,7 @@ func (s *Sandbox) StartRoot(spec *specs.Spec, conf *config.Config) error {
}
// StartContainer starts running a non-root container inside the sandbox.
-func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, goferFiles []*os.File) error {
- for _, f := range goferFiles {
- defer f.Close()
- }
-
+func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles []*os.File) error {
log.Debugf("Start non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid)
sandboxConn, err := s.sandboxConnect()
if err != nil {
@@ -223,15 +228,18 @@ func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid stri
}
defer sandboxConn.Close()
- // The payload must container stdin/stdout/stderr followed by gofer
- // files.
- files := append([]*os.File{os.Stdin, os.Stdout, os.Stderr}, goferFiles...)
+ // The payload must contain stdin/stdout/stderr (which may be empty if using
+ // TTY) followed by gofer files.
+ payload := urpc.FilePayload{}
+ payload.Files = append(payload.Files, stdios...)
+ payload.Files = append(payload.Files, goferFiles...)
+
// Start running the container.
args := boot.StartArgs{
Spec: spec,
Conf: conf,
CID: cid,
- FilePayload: urpc.FilePayload{Files: files},
+ FilePayload: payload,
}
if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil {
return fmt.Errorf("starting non-root container %v: %v", spec.Process.Args, err)
@@ -711,6 +719,8 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn
nextFD++
}
+ _ = nextFD // All FD assignment is finished.
+
if args.Attached {
// Kill sandbox if parent process exits in attached mode.
cmd.SysProcAttr.Pdeathsig = syscall.SIGKILL
@@ -983,7 +993,7 @@ func (s *Sandbox) Stacks() (string, error) {
}
// HeapProfile writes a heap profile to the given file.
-func (s *Sandbox) HeapProfile(f *os.File) error {
+func (s *Sandbox) HeapProfile(f *os.File, delay time.Duration) error {
log.Debugf("Heap profile %q", s.ID)
conn, err := s.sandboxConnect()
if err != nil {
@@ -991,54 +1001,31 @@ func (s *Sandbox) HeapProfile(f *os.File) error {
}
defer conn.Close()
- opts := control.ProfileOpts{
- FilePayload: urpc.FilePayload{
- Files: []*os.File{f},
- },
- }
- if err := conn.Call(boot.HeapProfile, &opts, nil); err != nil {
- return fmt.Errorf("getting sandbox %q heap profile: %v", s.ID, err)
- }
- return nil
-}
-
-// StartCPUProfile start CPU profile writing to the given file.
-func (s *Sandbox) StartCPUProfile(f *os.File) error {
- log.Debugf("CPU profile start %q", s.ID)
- conn, err := s.sandboxConnect()
- if err != nil {
- return err
- }
- defer conn.Close()
-
- opts := control.ProfileOpts{
- FilePayload: urpc.FilePayload{
- Files: []*os.File{f},
- },
+ opts := control.HeapProfileOpts{
+ FilePayload: urpc.FilePayload{Files: []*os.File{f}},
+ Delay: delay,
}
- if err := conn.Call(boot.StartCPUProfile, &opts, nil); err != nil {
- return fmt.Errorf("starting sandbox %q CPU profile: %v", s.ID, err)
- }
- return nil
+ return conn.Call(boot.HeapProfile, &opts, nil)
}
-// StopCPUProfile stops a previously started CPU profile.
-func (s *Sandbox) StopCPUProfile() error {
- log.Debugf("CPU profile stop %q", s.ID)
+// CPUProfile collects a CPU profile.
+func (s *Sandbox) CPUProfile(f *os.File, duration time.Duration) error {
+ log.Debugf("CPU profile %q", s.ID)
conn, err := s.sandboxConnect()
if err != nil {
return err
}
defer conn.Close()
- if err := conn.Call(boot.StopCPUProfile, nil, nil); err != nil {
- return fmt.Errorf("stopping sandbox %q CPU profile: %v", s.ID, err)
+ opts := control.CPUProfileOpts{
+ FilePayload: urpc.FilePayload{Files: []*os.File{f}},
+ Duration: duration,
}
- return nil
+ return conn.Call(boot.CPUProfile, &opts, nil)
}
// BlockProfile writes a block profile to the given file.
-func (s *Sandbox) BlockProfile(f *os.File) error {
+func (s *Sandbox) BlockProfile(f *os.File, duration time.Duration) error {
log.Debugf("Block profile %q", s.ID)
conn, err := s.sandboxConnect()
if err != nil {
@@ -1046,19 +1033,15 @@ func (s *Sandbox) BlockProfile(f *os.File) error {
}
defer conn.Close()
- opts := control.ProfileOpts{
- FilePayload: urpc.FilePayload{
- Files: []*os.File{f},
- },
- }
- if err := conn.Call(boot.BlockProfile, &opts, nil); err != nil {
- return fmt.Errorf("getting sandbox %q block profile: %v", s.ID, err)
+ opts := control.BlockProfileOpts{
+ FilePayload: urpc.FilePayload{Files: []*os.File{f}},
+ Duration: duration,
}
- return nil
+ return conn.Call(boot.BlockProfile, &opts, nil)
}
// MutexProfile writes a mutex profile to the given file.
-func (s *Sandbox) MutexProfile(f *os.File) error {
+func (s *Sandbox) MutexProfile(f *os.File, duration time.Duration) error {
log.Debugf("Mutex profile %q", s.ID)
conn, err := s.sandboxConnect()
if err != nil {
@@ -1066,50 +1049,27 @@ func (s *Sandbox) MutexProfile(f *os.File) error {
}
defer conn.Close()
- opts := control.ProfileOpts{
- FilePayload: urpc.FilePayload{
- Files: []*os.File{f},
- },
- }
- if err := conn.Call(boot.MutexProfile, &opts, nil); err != nil {
- return fmt.Errorf("getting sandbox %q mutex profile: %v", s.ID, err)
- }
- return nil
-}
-
-// StartTrace start trace writing to the given file.
-func (s *Sandbox) StartTrace(f *os.File) error {
- log.Debugf("Trace start %q", s.ID)
- conn, err := s.sandboxConnect()
- if err != nil {
- return err
- }
- defer conn.Close()
-
- opts := control.ProfileOpts{
- FilePayload: urpc.FilePayload{
- Files: []*os.File{f},
- },
+ opts := control.MutexProfileOpts{
+ FilePayload: urpc.FilePayload{Files: []*os.File{f}},
+ Duration: duration,
}
- if err := conn.Call(boot.StartTrace, &opts, nil); err != nil {
- return fmt.Errorf("starting sandbox %q trace: %v", s.ID, err)
- }
- return nil
+ return conn.Call(boot.MutexProfile, &opts, nil)
}
-// StopTrace stops a previously started trace.
-func (s *Sandbox) StopTrace() error {
- log.Debugf("Trace stop %q", s.ID)
+// Trace collects an execution trace.
+func (s *Sandbox) Trace(f *os.File, duration time.Duration) error {
+ log.Debugf("Trace %q", s.ID)
conn, err := s.sandboxConnect()
if err != nil {
return err
}
defer conn.Close()
- if err := conn.Call(boot.StopTrace, nil, nil); err != nil {
- return fmt.Errorf("stopping sandbox %q trace: %v", s.ID, err)
+ opts := control.TraceProfileOpts{
+ FilePayload: urpc.FilePayload{Files: []*os.File{f}},
+ Duration: duration,
}
- return nil
+ return conn.Call(boot.Trace, &opts, nil)
}
// ChangeLogging changes logging options.
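
With the Start/Stop pairs gone, each profile becomes a single blocking call parameterized by a duration (or a delay, for heap profiles). A hedged usage sketch built on the new CPUProfile signature, assuming the caller (e.g. runsc debug) already holds a *sandbox.Sandbox; collectCPUProfile is a hypothetical helper:

    import (
        "os"
        "time"

        "gvisor.dev/gvisor/runsc/sandbox"
    )

    // collectCPUProfile opens the output file and blocks for d while the
    // sandbox samples, after which the pprof data has been written to path.
    func collectCPUProfile(s *sandbox.Sandbox, path string, d time.Duration) error {
        f, err := os.Create(path)
        if err != nil {
            return err
        }
        defer f.Close()
        return s.CPUProfile(f, d)
    }
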
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index fdbba1832..ea55bbc7d 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -493,6 +493,31 @@ func EnvVar(env []string, name string) (string, bool) {
return "", false
}
+// ResolveEnvs transforms lists of environment variables into a single list of
+// environment variables. If a variable is defined multiple times, the last
+// value is used.
+func ResolveEnvs(envs ...[]string) ([]string, error) {
+ // First create a map of variable names to values. This removes any
+ // duplicates.
+ envMap := make(map[string]string)
+ for _, env := range envs {
+ for _, str := range env {
+ parts := strings.SplitN(str, "=", 2)
+ if len(parts) != 2 {
+ return nil, fmt.Errorf("invalid variable: %s", str)
+ }
+ envMap[parts[0]] = parts[1]
+ }
+ }
+ // Reassemble envMap into a list of environment variables of the form
+ // NAME=VALUE.
+ env := make([]string, 0, len(envMap))
+ for k, v := range envMap {
+ env = append(env, fmt.Sprintf("%s=%s", k, v))
+ }
+ return env, nil
+}
+
// FaqErrorMsg returns an error message pointing to the FAQ.
func FaqErrorMsg(anchor, msg string) string {
return fmt.Sprintf("%s; see https://gvisor.dev/faq#%s for more details", msg, anchor)
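
A quick usage example of the last-one-wins merge implemented by ResolveEnvs above. Note the result order is unspecified, since the merge goes through a map:

    package main

    import (
        "fmt"

        "gvisor.dev/gvisor/runsc/specutils"
    )

    func main() {
        env, err := specutils.ResolveEnvs(
            []string{"PATH=/usr/bin", "FOO=1"},
            []string{"FOO=2"},
        )
        if err != nil {
            panic(err)
        }
        fmt.Println(env) // contains PATH=/usr/bin and FOO=2; FOO=1 was overridden
    }
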