17 files changed, 315 insertions, 206 deletions
diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go
index c0bc8f064..124198239 100644
--- a/runsc/cmd/checkpoint.go
+++ b/runsc/cmd/checkpoint.go
@@ -75,7 +75,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa
 	conf := args[0].(*config.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
-	cont, err := container.LoadAndCheck(conf.RootDir, id)
+	cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index 609e8231c..1e5a7471a 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -90,8 +90,10 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 			f.Usage()
 			return subcommands.ExitUsageError
 		}
+		id := f.Arg(0)
+
 		var err error
-		c, err = container.LoadAndCheck(conf.RootDir, f.Arg(0))
+		c, err = container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 		if err != nil {
 			return Errorf("loading container %q: %v", f.Arg(0), err)
 		}
@@ -106,9 +108,10 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 			return Errorf("listing containers: %v", err)
 		}
 		for _, id := range ids {
-			candidate, err := container.LoadAndCheck(conf.RootDir, id)
+			candidate, err := container.Load(conf.RootDir, id, container.LoadOpts{Exact: true, SkipCheck: true})
 			if err != nil {
-				return Errorf("loading container %q: %v", id, err)
+				log.Warningf("Skipping container %q: %v", id, err)
+				continue
 			}
 			if candidate.SandboxPid() == d.pid {
 				c = candidate
@@ -120,7 +123,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		}
 	}
 
-	if c.Sandbox == nil || !c.Sandbox.IsRunning() {
+	if !c.IsSandboxRunning() {
 		return Errorf("container sandbox is not running")
 	}
 	log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid)
diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go
index a25637265..a750be131 100644
--- a/runsc/cmd/delete.go
+++ b/runsc/cmd/delete.go
@@ -68,7 +68,7 @@ func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 
 func (d *Delete) execute(ids []string, conf *config.Config) error {
 	for _, id := range ids {
-		c, err := container.LoadAndCheck(conf.RootDir, id)
+		c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 		if err != nil {
 			if os.IsNotExist(err) && d.force {
 				log.Warningf("couldn't find container %q: %v", id, err)
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
index 3836b7b4e..75b0aac8d 100644
--- a/runsc/cmd/events.go
+++ b/runsc/cmd/events.go
@@ -74,7 +74,7 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa
 	id := f.Arg(0)
 	conf := args[0].(*config.Config)
 
-	c, err := container.LoadAndCheck(conf.RootDir, id)
+	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading sandbox: %v", err)
 	}
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index eafd6285c..8558d34ae 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -112,7 +112,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	}
 	waitStatus := args[1].(*syscall.WaitStatus)
 
-	c, err := container.LoadAndCheck(conf.RootDir, id)
+	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading sandbox: %v", err)
 	}
diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go
index fe69e2a08..aecf0b7ab 100644
--- a/runsc/cmd/kill.go
+++ b/runsc/cmd/kill.go
@@ -69,7 +69,7 @@ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		Fatalf("it is invalid to specify both --all and --pid")
 	}
 
-	c, err := container.LoadAndCheck(conf.RootDir, id)
+	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go
index 6907eb16a..9f9a47bd8 100644
--- a/runsc/cmd/list.go
+++ b/runsc/cmd/list.go
@@ -24,6 +24,7 @@ import (
 
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
@@ -71,7 +72,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 
 	if l.quiet {
 		for _, id := range ids {
-			fmt.Println(id)
+			fmt.Println(id.ContainerID)
 		}
 		return subcommands.ExitSuccess
 	}
@@ -79,9 +80,10 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	// Collect the containers.
 	var containers []*container.Container
 	for _, id := range ids {
-		c, err := container.LoadAndCheck(conf.RootDir, id)
+		c, err := container.Load(conf.RootDir, id, container.LoadOpts{Exact: true})
 		if err != nil {
-			Fatalf("loading container %q: %v", id, err)
+			log.Warningf("Skipping container %q: %v", id, err)
+			continue
 		}
 		containers = append(containers, c)
 	}
diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go
index fe7d4e257..15ef7b577 100644
--- a/runsc/cmd/pause.go
+++ b/runsc/cmd/pause.go
@@ -55,7 +55,7 @@ func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	id := f.Arg(0)
 	conf := args[0].(*config.Config)
 
-	cont, err := container.LoadAndCheck(conf.RootDir, id)
+	cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go
index 18d7a1436..04e3e0bdd 100644
--- a/runsc/cmd/ps.go
+++ b/runsc/cmd/ps.go
@@ -60,7 +60,7 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{})
 	id := f.Arg(0)
 	conf := args[0].(*config.Config)
 
-	c, err := container.LoadAndCheck(conf.RootDir, id)
+	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading sandbox: %v", err)
 	}
diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go
index a00928204..856469252 100644
--- a/runsc/cmd/resume.go
+++ b/runsc/cmd/resume.go
@@ -56,7 +56,7 @@ func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 	id := f.Arg(0)
 	conf := args[0].(*config.Config)
 
-	cont, err := container.LoadAndCheck(conf.RootDir, id)
+	cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
index f6499cc44..964a65064 100644
--- a/runsc/cmd/start.go
+++ b/runsc/cmd/start.go
@@ -55,7 +55,7 @@ func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	id := f.Arg(0)
 	conf := args[0].(*config.Config)
 
-	c, err := container.LoadAndCheck(conf.RootDir, id)
+	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go
index d8a70dd7f..1f7913d5a 100644
--- a/runsc/cmd/state.go
+++ b/runsc/cmd/state.go
@@ -57,7 +57,7 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	id := f.Arg(0)
 	conf := args[0].(*config.Config)
 
-	c, err := container.LoadAndCheck(conf.RootDir, id)
+	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go
index c1d6aeae2..5d55422c7 100644
--- a/runsc/cmd/wait.go
+++ b/runsc/cmd/wait.go
@@ -72,7 +72,7 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	id := f.Arg(0)
 	conf := args[0].(*config.Config)
 
-	c, err := container.LoadAndCheck(conf.RootDir, id)
+	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 418a27beb..8b78660f7 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -128,125 +128,6 @@ type Container struct {
 	goferIsChild bool
 }
 
-// loadSandbox loads all containers that belong to the sandbox with the given
-// ID.
-func loadSandbox(rootDir, id string) ([]*Container, error) {
-	cids, err := List(rootDir)
-	if err != nil {
-		return nil, err
-	}
-
-	// Load the container metadata.
-	var containers []*Container
-	for _, cid := range cids {
-		container, err := Load(rootDir, cid)
-		if err != nil {
-			// Container file may not exist if it raced with creation/deletion or
-			// directory was left behind. Load provides a snapshot in time, so it's
-			// fine to skip it.
-			if os.IsNotExist(err) {
-				continue
-			}
-			return nil, fmt.Errorf("loading container %q: %v", id, err)
-		}
-		if container.Sandbox.ID == id {
-			containers = append(containers, container)
-		}
-	}
-	return containers, nil
-}
-
-// Load loads a container with the given id from a metadata file. partialID may
-// be an abbreviation of the full container id, in which case Load loads the
-// container to which id unambiguously refers to. Returns ErrNotExist if
-// container doesn't exist.
-func Load(rootDir, partialID string) (*Container, error) {
-	log.Debugf("Load container, rootDir: %q, partial cid: %s", rootDir, partialID)
-	if err := validateID(partialID); err != nil {
-		return nil, fmt.Errorf("invalid container id: %v", err)
-	}
-
-	id, err := findContainerID(rootDir, partialID)
-	if err != nil {
-		// Preserve error so that callers can distinguish 'not found' errors.
-		return nil, err
-	}
-
-	state := StateFile{
-		RootDir: rootDir,
-		ID:      id,
-	}
-	defer state.close()
-
-	c := &Container{}
-	if err := state.load(c); err != nil {
-		if os.IsNotExist(err) {
-			// Preserve error so that callers can distinguish 'not found' errors.
-			return nil, err
-		}
-		return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err)
-	}
-	return c, nil
-}
-
-// LoadAndCheck is similar to Load(), but also checks if the container is still
-// running to get an error earlier to the caller.
-func LoadAndCheck(rootDir, partialID string) (*Container, error) {
-	c, err := Load(rootDir, partialID)
-	if err != nil {
-		// Preserve error so that callers can distinguish 'not found' errors.
-		return nil, err
-	}
-
-	// If the status is "Running" or "Created", check that the sandbox/container
-	// is still running, setting it to Stopped if not.
-	//
-	// This is inherently racy.
-	switch c.Status {
-	case Created:
-		if !c.isSandboxRunning() {
-			// Sandbox no longer exists, so this container definitely does not exist.
-			c.changeStatus(Stopped)
-		}
-	case Running:
-		if err := c.SignalContainer(syscall.Signal(0), false); err != nil {
-			c.changeStatus(Stopped)
-		}
-	}
-
-	return c, nil
-}
-
-func findContainerID(rootDir, partialID string) (string, error) {
-	// Check whether the id fully specifies an existing container.
-	stateFile := buildStatePath(rootDir, partialID)
-	if _, err := os.Stat(stateFile); err == nil {
-		return partialID, nil
-	}
-
-	// Now see whether id could be an abbreviation of exactly 1 of the
-	// container ids. If id is ambiguous (it could match more than 1
-	// container), it is an error.
-	ids, err := List(rootDir)
-	if err != nil {
-		return "", err
-	}
-	rv := ""
-	for _, id := range ids {
-		if strings.HasPrefix(id, partialID) {
-			if rv != "" {
-				return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, rv, id)
-			}
-			rv = id
-		}
-	}
-	if rv == "" {
-		return "", os.ErrNotExist
-	}
-	log.Debugf("abbreviated id %q resolves to full id %q", partialID, rv)
-	return rv, nil
-}
-
 // Args is used to configure a new container.
 type Args struct {
 	// ID is the container unique identifier.
@@ -291,6 +172,15 @@ func New(conf *config.Config, args Args) (*Container, error) {
 		return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err)
 	}
 
+	sandboxID := args.ID
+	if !isRoot(args.Spec) {
+		var ok bool
+		sandboxID, ok = specutils.SandboxID(args.Spec)
+		if !ok {
+			return nil, fmt.Errorf("no sandbox ID found when creating container")
+		}
+	}
+
 	c := &Container{
 		ID:            args.ID,
 		Spec:          args.Spec,
@@ -301,7 +191,10 @@ func New(conf *config.Config, args Args) (*Container, error) {
 		Owner:         os.Getenv("USER"),
 		Saver: StateFile{
 			RootDir: conf.RootDir,
-			ID:      args.ID,
+			ID: FullID{
+				SandboxID:   sandboxID,
+				ContainerID: args.ID,
+			},
 		},
 	}
 	// The Cleanup object cleans up partially created containers when an error
@@ -316,10 +209,17 @@ func New(conf *config.Config, args Args) (*Container, error) {
 	}
 	defer c.Saver.unlock()
 
-	// If the metadata annotations indicate that this container should be
-	// started in an existing sandbox, we must do so. The metadata will
-	// indicate the ID of the sandbox, which is the same as the ID of the
-	// init container in the sandbox.
+	// If the metadata annotations indicate that this container should be started
+	// in an existing sandbox, we must do so. These are the possible metadata
+	// annotation states:
+	//   1. No annotations: it means that there is a single container and this
+	//      container is obviously the root. Both container and sandbox share the
+	//      ID.
+	//   2. Container type == sandbox: it means this is the root container
+	//      starting the sandbox. Both container and sandbox share the same ID.
+	//   3. Container type == container: it means this is a subcontainer of an
+	//      already started sandbox. In this case, container ID is different than
+	//      the sandbox ID.
 	if isRoot(args.Spec) {
 		log.Debugf("Creating new sandbox for container, cid: %s", args.ID)
 
@@ -358,7 +258,7 @@ func New(conf *config.Config, args Args) (*Container, error) {
 			// Start a new sandbox for this container. Any errors after this point
 			// must destroy the container.
 			sandArgs := &sandbox.Args{
-				ID:            args.ID,
+				ID:            sandboxID,
 				Spec:          args.Spec,
 				BundleDir:     args.BundleDir,
 				ConsoleSocket: args.ConsoleSocket,
@@ -379,22 +279,14 @@ func New(conf *config.Config, args Args) (*Container, error) {
 			return nil, err
 		}
 	} else {
-		// This is sort of confusing. For a sandbox with a root
-		// container and a child container in it, runsc sees:
-		// * A container struct whose sandbox ID is equal to the
-		//   container ID. This is the root container that is tied to
-		//   the creation of the sandbox.
-		// * A container struct whose sandbox ID is equal to the above
-		//   container/sandbox ID, but that has a different container
-		//   ID. This is the child container.
-		sbid, ok := specutils.SandboxID(args.Spec)
-		if !ok {
-			return nil, fmt.Errorf("no sandbox ID found when creating container")
-		}
-		log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sbid)
+		log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sandboxID)
 
 		// Find the sandbox associated with this ID.
-		sb, err := LoadAndCheck(conf.RootDir, sbid)
+		fullID := FullID{
+			SandboxID:   sandboxID,
+			ContainerID: sandboxID,
+		}
+		sb, err := Load(conf.RootDir, fullID, LoadOpts{Exact: true})
 		if err != nil {
 			return nil, err
 		}
@@ -628,7 +520,7 @@ func (c *Container) Wait() (syscall.WaitStatus, error) {
 // returns its WaitStatus.
 func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) {
 	log.Debugf("Wait on process %d in sandbox, cid: %s", pid, c.Sandbox.ID)
-	if !c.isSandboxRunning() {
+	if !c.IsSandboxRunning() {
 		return 0, fmt.Errorf("sandbox is not running")
 	}
 	return c.Sandbox.WaitPID(c.Sandbox.ID, pid)
@@ -638,7 +530,7 @@ func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) {
 // its WaitStatus.
 func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) {
 	log.Debugf("Wait on process %d in container, cid: %s", pid, c.ID)
-	if !c.isSandboxRunning() {
+	if !c.IsSandboxRunning() {
 		return 0, fmt.Errorf("sandbox is not running")
 	}
 	return c.Sandbox.WaitPID(c.ID, pid)
@@ -658,7 +550,7 @@ func (c *Container) SignalContainer(sig syscall.Signal, all bool) error {
 	if err := c.requireStatus("signal", Running, Stopped); err != nil {
 		return err
 	}
-	if !c.isSandboxRunning() {
+	if !c.IsSandboxRunning() {
 		return fmt.Errorf("sandbox is not running")
 	}
 	return c.Sandbox.SignalContainer(c.ID, sig, all)
@@ -670,7 +562,7 @@ func (c *Container) SignalProcess(sig syscall.Signal, pid int32) error {
 	if err := c.requireStatus("signal a process inside", Running); err != nil {
 		return err
 	}
-	if !c.isSandboxRunning() {
+	if !c.IsSandboxRunning() {
 		return fmt.Errorf("sandbox is not running")
 	}
 	return c.Sandbox.SignalProcess(c.ID, int32(pid), sig, false)
@@ -889,7 +781,7 @@ func (c *Container) waitForStopped() error {
 	defer cancel()
 	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
 	op := func() error {
-		if c.isSandboxRunning() {
+		if c.IsSandboxRunning() {
 			if err := c.SignalContainer(syscall.Signal(0), false); err == nil {
 				return fmt.Errorf("container is still running")
 			}
@@ -1091,7 +983,7 @@ func (c *Container) changeStatus(s Status) {
 	c.Status = s
 }
 
-func (c *Container) isSandboxRunning() bool {
+// IsSandboxRunning returns true if the container has a sandbox attached and
+// that sandbox reports itself as running.
+func (c *Container) IsSandboxRunning() bool {
 	return c.Sandbox != nil && c.Sandbox.IsRunning()
 }
 
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index fa99e403a..a92ae046d 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -364,7 +364,7 @@ func TestLifecycle(t *testing.T) {
 			defer c.Destroy()
 
 			// Load the container from disk and check the status.
-			c, err = LoadAndCheck(rootDir, args.ID)
+			c, err = Load(rootDir, FullID{ContainerID: args.ID}, LoadOpts{})
 			if err != nil {
 				t.Fatalf("error loading container: %v", err)
 			}
@@ -377,7 +377,11 @@ func TestLifecycle(t *testing.T) {
 			if err != nil {
 				t.Fatalf("error listing containers: %v", err)
 			}
-			if got, want := ids, []string{args.ID}; !reflect.DeepEqual(got, want) {
+			fullID := FullID{
+				SandboxID:   args.ID,
+				ContainerID: args.ID,
+			}
+			if got, want := ids, []FullID{fullID}; !reflect.DeepEqual(got, want) {
 				t.Errorf("container list got %v, want %v", got, want)
 			}
 
@@ -387,7 +391,7 @@ func TestLifecycle(t *testing.T) {
 			}
 
 			// Load the container from disk and check the status.
-			c, err = LoadAndCheck(rootDir, args.ID)
+			c, err = Load(rootDir, fullID, LoadOpts{Exact: true})
 			if err != nil {
 				t.Fatalf("error loading container: %v", err)
 			}
@@ -428,7 +432,7 @@ func TestLifecycle(t *testing.T) {
 			}
 
 			// Load the container from disk and check the status.
-			c, err = LoadAndCheck(rootDir, args.ID)
+			c, err = Load(rootDir, fullID, LoadOpts{Exact: true})
 			if err != nil {
 				t.Fatalf("error loading container: %v", err)
 			}
@@ -451,7 +455,7 @@ func TestLifecycle(t *testing.T) {
 			}
 
 			// Loading the container by id should fail.
-			if _, err = LoadAndCheck(rootDir, args.ID); err == nil {
+			if _, err = Load(rootDir, fullID, LoadOpts{Exact: true}); err == nil {
 				t.Errorf("expected loading destroyed container to fail, but it did not")
 			}
 		})
@@ -1738,7 +1742,7 @@ func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) {
 		cids[2]: cids[2],
 	}
 	for shortid, longid := range unambiguous {
-		if _, err := LoadAndCheck(rootDir, shortid); err != nil {
+		if _, err := Load(rootDir, FullID{ContainerID: shortid}, LoadOpts{}); err != nil {
 			t.Errorf("%q should resolve to %q: %v", shortid, longid, err)
 		}
 	}
@@ -1749,7 +1753,7 @@ func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) {
 		"ba",
 	}
 	for _, shortid := range ambiguous {
-		if s, err := LoadAndCheck(rootDir, shortid); err == nil {
+		if s, err := Load(rootDir, FullID{ContainerID: shortid}, LoadOpts{}); err == nil {
 			t.Errorf("%q should be ambiguous, but resolved to %q", shortid, s.ID)
 		}
 	}
@@ -2007,7 +2011,7 @@ func doDestroyStartingTest(t *testing.T, vfs2 bool) {
 
 		// Container is not thread safe, so load another instance to run in
 		// concurrently.
-		startCont, err := LoadAndCheck(rootDir, args.ID)
+		startCont, err := Load(rootDir, FullID{ContainerID: args.ID}, LoadOpts{})
 		if err != nil {
 			t.Fatalf("error loading container: %v", err)
 		}
@@ -2332,6 +2336,42 @@ func TestTTYField(t *testing.T) {
 	}
 }
 
+// TestCreateWithCorruptedStateFile checks that a container can still be run
+// when the root directory holds a corrupt state file that belongs to some
+// other container.
+func TestCreateWithCorruptedStateFile(t *testing.T) {
+	conf := testutil.TestConfig(t)
+	spec := testutil.NewSpecWithArgs("/bin/true")
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	// Plant a state file that is not valid JSON under a fresh container ID.
+	// It is a plain data file, so it gets no execute or world-write bits.
+	corruptID := testutil.RandomContainerID()
+	corruptState := buildPath(conf.RootDir, FullID{SandboxID: corruptID, ContainerID: corruptID}, stateFileExtension)
+	if err := ioutil.WriteFile(corruptState, []byte("this{file(is;not[valid.json"), 0644); err != nil {
+		t.Fatalf("writing corrupt state file %q: %v", corruptState, err)
+	}
+	defer os.Remove(corruptState)
+
+	// Loading the corrupt container itself must fail...
+	if _, err := Load(conf.RootDir, FullID{ContainerID: corruptID}, LoadOpts{SkipCheck: true}); err == nil {
+		t.Fatalf("loading corrupted state file should have failed")
+	}
+
+	// ...while an unrelated container must still run to completion.
+	args := Args{
+		ID:        testutil.RandomContainerID(),
+		Spec:      spec,
+		BundleDir: bundleDir,
+		Attached:  true,
+	}
+	if ws, err := Run(conf, args); err != nil {
+		t.Errorf("running container: %v", err)
+	} else if !ws.Exited() || ws.ExitStatus() != 0 {
+		t.Errorf("container failed, waitStatus: %v", ws)
+	}
+}
+
 func execute(cont *Container, name string, arg ...string) (syscall.WaitStatus, error) {
 	args := &control.ExecArgs{
 		Filename: name,
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 45d4e6e6e..29db1b7e8 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -730,7 +730,7 @@ func TestMultiContainerKillAll(t *testing.T) {
 			// processes still running inside.
 			containers[1].SignalContainer(syscall.SIGKILL, false)
 			op := func() error {
-				c, err := LoadAndCheck(conf.RootDir, ids[1])
+				c, err := Load(conf.RootDir, FullID{ContainerID: ids[1]}, LoadOpts{})
 				if err != nil {
 					return err
 				}
@@ -744,7 +744,7 @@ func TestMultiContainerKillAll(t *testing.T) {
 			}
 		}
 
-		c, err := LoadAndCheck(conf.RootDir, ids[1])
+		c, err := Load(conf.RootDir, FullID{ContainerID: ids[1]}, LoadOpts{})
 		if err != nil {
 			t.Fatalf("failed to load child container %q: %v", c.ID, err)
 		}
@@ -867,7 +867,7 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
 
 		// Container is not thread safe, so load another instance to run in
 		// concurrently.
-		startCont, err := LoadAndCheck(rootDir, ids[i])
+		startCont, err := Load(rootDir, FullID{ContainerID: ids[i]}, LoadOpts{})
 		if err != nil {
 			t.Fatalf("error loading container: %v", err)
 		}
diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go
index 17a251530..dfbf1f2d3 100644
--- a/runsc/container/state_file.go
+++ b/runsc/container/state_file.go
@@ -20,58 +20,228 @@ import (
 	"io/ioutil"
 	"os"
 	"path/filepath"
+	"regexp"
+	"strings"
+	"syscall"
 
 	"github.com/gofrs/flock"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
-const stateFileExtension = ".state"
+const stateFileExtension = "state"
 
-// StateFile handles load from/save to container state safely from multiple
-// processes. It uses a lock file to provide synchronization between operations.
+// LoadOpts provides options for Load()ing a container.
+type LoadOpts struct {
+	// Exact tells Load() to use the given FullID as is. When unset, Load()
+	// treats the container ID as a possibly abbreviated ID and resolves it to
+	// a full ID first.
+	Exact bool
+
+	// SkipCheck tells Load() to skip checking if the container is running.
+	SkipCheck bool
+}
+
+// Load loads a container with the given id from a metadata file. "id" may
+// be an abbreviation of the full container id if LoadOpts.Exact is not
+// set. It also checks if the container is still running, in order to return
+// an error to the caller earlier. This check is skipped if LoadOpts.SkipCheck
+// is set.
 //
-// The lock file is located at: "${s.RootDir}/${s.ID}.lock".
-// The state file is located at: "${s.RootDir}/${s.ID}.state".
-type StateFile struct {
-	// RootDir is the directory containing the container metadata file.
-	RootDir string `json:"rootDir"`
+// Returns ErrNotExist if no container is found. Returns an error if more than
+// one container matches the ID prefix.
+func Load(rootDir string, id FullID, opts LoadOpts) (*Container, error) {
+	log.Debugf("Load container, rootDir: %q, id: %+v, opts: %+v", rootDir, id, opts)
+	if !opts.Exact {
+		var err error
+		id, err = findContainerID(rootDir, id.ContainerID)
+		if err != nil {
+			// Preserve error so that callers can distinguish 'not found' errors.
+			return nil, err
+		}
+	}
 
-	// ID is the container ID.
-	ID string `json:"id"`
+	if err := id.validate(); err != nil {
+		return nil, fmt.Errorf("invalid container id: %v", err)
+	}
+	state := StateFile{
+		RootDir: rootDir,
+		ID:      id,
+	}
+	defer state.close()
 
-	//
-	// Fields below this line are not saved in the state file and will not
-	// be preserved across commands.
-	//
+	c := &Container{}
+	if err := state.load(c); err != nil {
+		if os.IsNotExist(err) {
+			// Preserve error so that callers can distinguish 'not found' errors.
+			return nil, err
+		}
+		return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err)
+	}
 
-	once  sync.Once
-	flock *flock.Flock
+	if !opts.SkipCheck {
+		// If the status is "Running" or "Created", check that the sandbox/container
+		// is still running, setting it to Stopped if not.
+		//
+		// This is inherently racy.
+		switch c.Status {
+		case Created:
+			if !c.IsSandboxRunning() {
+				// Sandbox no longer exists, so this container definitely does not exist.
+				c.changeStatus(Stopped)
+			}
+		case Running:
+			if err := c.SignalContainer(syscall.Signal(0), false); err != nil {
+				c.changeStatus(Stopped)
+			}
+		}
+	}
+
+	return c, nil
 }
 
 // List returns all container ids in the given root directory.
-func List(rootDir string) ([]string, error) {
+func List(rootDir string) ([]FullID, error) {
 	log.Debugf("List containers %q", rootDir)
-	list, err := filepath.Glob(filepath.Join(rootDir, "*"+stateFileExtension))
+	return listMatch(rootDir, FullID{})
+}
+
+// listMatch returns all container ids that match the provided id.
+func listMatch(rootDir string, id FullID) ([]FullID, error) {
+	id.SandboxID += "*"
+	id.ContainerID += "*"
+	pattern := buildPath(rootDir, id, stateFileExtension)
+	list, err := filepath.Glob(pattern)
 	if err != nil {
 		return nil, err
 	}
-	var out []string
+	var out []FullID
 	for _, path := range list {
-		// Filter out files that do no belong to a container.
-		fileName := filepath.Base(path)
-		if len(fileName) < len(stateFileExtension) {
-			panic(fmt.Sprintf("invalid file match %q", path))
-		}
-		// Remove the extension.
-		cid := fileName[:len(fileName)-len(stateFileExtension)]
-		if validateID(cid) == nil {
-			out = append(out, cid)
+		id, err := parseFileName(filepath.Base(path))
+		if err == nil {
+			out = append(out, id)
 		}
 	}
 	return out, nil
 }
 
+// loadSandbox loads all containers that belong to the sandbox with the given
+// ID. Containers are loaded without checking whether they are still running.
+// Containers whose state file disappears between listing and loading are
+// skipped; any other load failure aborts and is returned.
+func loadSandbox(rootDir, id string) ([]*Container, error) {
+	cids, err := listMatch(rootDir, FullID{SandboxID: id})
+	if err != nil {
+		return nil, err
+	}
+
+	// Load the container metadata.
+	var containers []*Container
+	for _, cid := range cids {
+		// listMatch does prefix matching, so it also returns containers of
+		// other sandboxes whose ID merely starts with id. Keep exact matches
+		// only.
+		if cid.SandboxID != id {
+			continue
+		}
+		container, err := Load(rootDir, cid, LoadOpts{Exact: true, SkipCheck: true})
+		if err != nil {
+			// Container file may not exist if it raced with creation/deletion or
+			// directory was left behind. Load provides a snapshot in time, so it's
+			// fine to skip it.
+			if os.IsNotExist(err) {
+				continue
+			}
+			return nil, fmt.Errorf("loading sandbox %q, failed to load container %q: %w", id, cid.ContainerID, err)
+		}
+		containers = append(containers, container)
+	}
+	return containers, nil
+}
+
+// findContainerID resolves a full or abbreviated container ID to the FullID of
+// an existing container. A container whose ID equals partialID always wins;
+// otherwise partialID must be a prefix of exactly one container ID. Returns
+// os.ErrNotExist if nothing matches and an error if the prefix is ambiguous.
+func findContainerID(rootDir, partialID string) (FullID, error) {
+	// Look for state files, in any sandbox, whose container ID starts with
+	// partialID. Zero or one hit settles the lookup right away.
+	pattern := buildPath(rootDir, FullID{SandboxID: "*", ContainerID: partialID + "*"}, stateFileExtension)
+	list, err := filepath.Glob(pattern)
+	if err != nil {
+		return FullID{}, err
+	}
+	switch len(list) {
+	case 0:
+		return FullID{}, os.ErrNotExist
+	case 1:
+		return parseFileName(filepath.Base(list[0]))
+	}
+
+	// Several files matched. Prefer an exact ID; failing that, the ID must be
+	// an abbreviation of exactly one container ID or it is ambiguous.
+	ids, err := List(rootDir)
+	if err != nil {
+		return FullID{}, err
+	}
+	// Matches are collected by value: a pointer to the range variable would be
+	// overwritten by later iterations on Go versions before 1.22.
+	var matches []FullID
+	for _, id := range ids {
+		if id.ContainerID == partialID {
+			return id, nil
+		}
+		if strings.HasPrefix(id.ContainerID, partialID) {
+			matches = append(matches, id)
+		}
+	}
+	switch len(matches) {
+	case 0:
+		return FullID{}, os.ErrNotExist
+	case 1:
+		log.Debugf("abbreviated id %q resolves to full id %v", partialID, &matches[0])
+		return matches[0], nil
+	}
+	// Pointers are passed so that fmt formats both IDs through String().
+	return FullID{}, fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, &matches[0], &matches[1])
+}
+
+// stateFileNameRE matches "<container-id>_sandbox:<sandbox-id>.state". It is
+// compiled once at package initialization and anchored at both ends so that a
+// match embedded in a longer file name is rejected.
+var stateFileNameRE = regexp.MustCompile(`^([\w+-\.]+)_sandbox:([\w+-\.]+)\.` + stateFileExtension + `$`)
+
+// parseFileName extracts the container and sandbox IDs from the base name of a
+// state file, as laid out by buildPath. Returns an error if the name does not
+// follow that format or if either ID fails validation.
+func parseFileName(name string) (FullID, error) {
+	groups := stateFileNameRE.FindStringSubmatch(name)
+	if len(groups) != 3 {
+		return FullID{}, fmt.Errorf("invalid state file name format: %q", name)
+	}
+	// The container ID comes first in the file name, the sandbox ID second.
+	id := FullID{
+		SandboxID:   groups[2],
+		ContainerID: groups[1],
+	}
+	if err := id.validate(); err != nil {
+		return FullID{}, fmt.Errorf("invalid state file name %q: %w", name, err)
+	}
+	return id, nil
+}
+
+// FullID combines sandbox and container ID to identify a container. Sandbox ID
+// is used to allow all containers for a given sandbox to be loaded by matching
+// sandbox ID in the file name. Both IDs are embedded in the names of the state
+// and lock files (see buildPath) and are serialized as "sandboxId" and
+// "containerId" in JSON.
+type FullID struct {
+	SandboxID   string `json:"sandboxId"`
+	ContainerID string `json:"containerId"`
+}
+
+// String returns the ID formatted as "<sandbox-id>/<container-id>". It has a
+// value receiver so that fmt also uses it when a FullID is passed by value,
+// e.g. to the %q and %v verbs in log and error messages.
+func (f FullID) String() string {
+	return f.SandboxID + "/" + f.ContainerID
+}
+
+// validate returns an error if either the sandbox ID or the container ID is
+// rejected by validateID. The sandbox ID is checked first.
+func (f FullID) validate() error {
+	if err := validateID(f.SandboxID); err != nil {
+		return err
+	}
+	return validateID(f.ContainerID)
+}
+
+// StateFile handles load from/save to container state safely from multiple
+// processes. It uses a lock file to provide synchronization between operations.
+//
+// The lock file is located at: "${s.RootDir}/${container-id}_sandbox:${sandbox-id}.lock".
+// The state file is located at: "${s.RootDir}/${container-id}_sandbox:${sandbox-id}.state".
+type StateFile struct {
+	// RootDir is the directory containing the container metadata file.
+	RootDir string `json:"rootDir"`
+
+	// ID is the sandbox+container ID.
+	ID FullID `json:"id"`
+
+	//
+	// Fields below this line are not saved in the state file and will not
+	// be preserved across commands.
+	//
+
+	once  sync.Once
+	flock *flock.Flock
+}
+
 // lock globally locks all locking operations for the container.
 func (s *StateFile) lock() error {
 	s.once.Do(func() {
@@ -157,18 +327,20 @@ func (s *StateFile) close() error {
 	return s.flock.Close()
 }
 
-func buildStatePath(rootDir, id string) string {
-	return filepath.Join(rootDir, id+stateFileExtension)
+// buildPath returns the path, inside rootDir, of the file that belongs to the
+// container identified by id and carries the given extension. The file name
+// has the form "<container-id>_sandbox:<sandbox-id>.<extension>".
+func buildPath(rootDir string, id FullID, extension string) string {
+	// NOTE(review): this scheme relies on "_sandbox:" never occurring inside
+	// an ID. ":" is outside the character class parseFileName accepts, but
+	// "_" is covered by \w there; confirm against validateID.
+	return filepath.Join(rootDir, fmt.Sprintf("%s_sandbox:%s.%s", id.ContainerID, id.SandboxID, extension))
+}
 
 // statePath is the full path to the state file.
 func (s *StateFile) statePath() string {
-	return buildStatePath(s.RootDir, s.ID)
+	return buildPath(s.RootDir, s.ID, stateFileExtension)
 }
 
 // lockPath is the full path to the lock file.
 func (s *StateFile) lockPath() string {
-	return filepath.Join(s.RootDir, s.ID+".lock")
+	return buildPath(s.RootDir, s.ID, "lock")
 }
 
 // destroy deletes all state created by the stateFile. It may be called with the