Add sandbox ID to state file name

This allows finding all containers inside a sandbox more efficiently. This operation is required every time a container starts and stops, and previously required loading *all* container state files to check whether the container belonged to the sandbox. Apart from being inefficient, it has caused problems when state files are stale or corrupt, making it impossible to create any container. Also adjust commands `list` and `debug` to skip over files that fail to load. Resolves #5052 PiperOrigin-RevId: 348050637
author: Fabricio Voznika <fvoznika@google.com> 2020-12-17 10:44:44 -0800
committer: gVisor bot <gvisor-bot@google.com> 2020-12-17 10:52:44 -0800
commit: 8ea19b5818d0c1e9b798bd0bd288c7f51a46261d (patch)
tree: d7f9da9c969f460fc6267e6fe212e37bfdf12185 /runsc
parent: e7493a9e23325c00ad9a0db341d5887afe3ae5eb (diff)
17 files changed, 315 insertions, 206 deletions
diff --git a/runsc/cmd/checkpoint.go b/runsc/cmd/checkpoint.go
index c0bc8f064..124198239 100644
--- a/runsc/cmd/checkpoint.go
+++ b/runsc/cmd/checkpoint.go
@@ -75,7 +75,7 @@ func (c *Checkpoint) Execute(_ context.Context, f *flag.FlagSet, args ...interfa
 	conf := args[0].(*config.Config)
 	waitStatus := args[1].(*syscall.WaitStatus)
 
-	cont, err := container.LoadAndCheck(conf.RootDir, id)
+	cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index 609e8231c..1e5a7471a 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -90,8 +90,10 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 			f.Usage()
 			return subcommands.ExitUsageError
 		}
+		id := f.Arg(0)
+
 		var err error
-		c, err = container.LoadAndCheck(conf.RootDir, f.Arg(0))
+		c, err = container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 		if err != nil {
 			return Errorf("loading container %q: %v", f.Arg(0), err)
 		}
@@ -106,9 +108,10 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 			return Errorf("listing containers: %v", err)
 		}
 		for _, id := range ids {
-			candidate, err := container.LoadAndCheck(conf.RootDir, id)
+			candidate, err := container.Load(conf.RootDir, id, container.LoadOpts{Exact: true, SkipCheck: true})
 			if err != nil {
-				return Errorf("loading container %q: %v", id, err)
+				log.Warningf("Skipping container %q: %v", id, err)
+				continue
 			}
 			if candidate.SandboxPid() == d.pid {
 				c = candidate
@@ -120,7 +123,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		}
 	}
 
-	if c.Sandbox == nil || !c.Sandbox.IsRunning() {
+	if c.IsSandboxRunning() {
 		return Errorf("container sandbox is not running")
 	}
 	log.Infof("Found sandbox %q, PID: %d", c.Sandbox.ID, c.Sandbox.Pid)
diff --git a/runsc/cmd/delete.go b/runsc/cmd/delete.go
index a25637265..a750be131 100644
--- a/runsc/cmd/delete.go
+++ b/runsc/cmd/delete.go
@@ -68,7 +68,7 @@ func (d *Delete) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 
 func (d *Delete) execute(ids []string, conf *config.Config) error {
 	for _, id := range ids {
-		c, err := container.LoadAndCheck(conf.RootDir, id)
+		c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 		if err != nil {
 			if os.IsNotExist(err) && d.force {
 				log.Warningf("couldn't find container %q: %v", id, err)
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
index 3836b7b4e..75b0aac8d 100644
--- a/runsc/cmd/events.go
+++ b/runsc/cmd/events.go
@@ -74,7 +74,7 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa
 	id := f.Arg(0)
 	conf := args[0].(*config.Config)
 
-	c, err := container.LoadAndCheck(conf.RootDir, id)
+	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading sandbox: %v", err)
 	}
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index eafd6285c..8558d34ae 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -112,7 +112,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	}
 	waitStatus := args[1].(*syscall.WaitStatus)
 
-	c, err := container.LoadAndCheck(conf.RootDir, id)
+	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading sandbox: %v", err)
 	}
diff --git a/runsc/cmd/kill.go b/runsc/cmd/kill.go
index fe69e2a08..aecf0b7ab 100644
--- a/runsc/cmd/kill.go
+++ b/runsc/cmd/kill.go
@@ -69,7 +69,7 @@ func (k *Kill) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 		Fatalf("it is invalid to specify both --all and --pid")
 	}
 
-	c, err := container.LoadAndCheck(conf.RootDir, id)
+	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
diff --git a/runsc/cmd/list.go b/runsc/cmd/list.go
index 6907eb16a..9f9a47bd8 100644
--- a/runsc/cmd/list.go
+++ b/runsc/cmd/list.go
@@ -24,6 +24,7 @@ import (
 
 	"github.com/google/subcommands"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
+	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/runsc/config"
 	"gvisor.dev/gvisor/runsc/container"
 	"gvisor.dev/gvisor/runsc/flag"
@@ -71,7 +72,7 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 
 	if l.quiet {
 		for _, id := range ids {
-			fmt.Println(id)
+			fmt.Println(id.ContainerID)
 		}
 		return subcommands.ExitSuccess
 	}
@@ -79,9 +80,10 @@ func (l *List) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	// Collect the containers.
 	var containers []*container.Container
 	for _, id := range ids {
-		c, err := container.LoadAndCheck(conf.RootDir, id)
+		c, err := container.Load(conf.RootDir, id, container.LoadOpts{Exact: true})
 		if err != nil {
-			Fatalf("loading container %q: %v", id, err)
+			log.Warningf("Skipping container %q: %v", id, err)
+			continue
 		}
 		containers = append(containers, c)
 	}
diff --git a/runsc/cmd/pause.go b/runsc/cmd/pause.go
index fe7d4e257..15ef7b577 100644
--- a/runsc/cmd/pause.go
+++ b/runsc/cmd/pause.go
@@ -55,7 +55,7 @@ func (*Pause) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	id := f.Arg(0)
 	conf := args[0].(*config.Config)
 
-	cont, err := container.LoadAndCheck(conf.RootDir, id)
+	cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go
index 18d7a1436..04e3e0bdd 100644
--- a/runsc/cmd/ps.go
+++ b/runsc/cmd/ps.go
@@ -60,7 +60,7 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{})
 	id := f.Arg(0)
 	conf := args[0].(*config.Config)
 
-	c, err := container.LoadAndCheck(conf.RootDir, id)
+	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading sandbox: %v", err)
 	}
diff --git a/runsc/cmd/resume.go b/runsc/cmd/resume.go
index a00928204..856469252 100644
--- a/runsc/cmd/resume.go
+++ b/runsc/cmd/resume.go
@@ -56,7 +56,7 @@ func (r *Resume) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
 	id := f.Arg(0)
 	conf := args[0].(*config.Config)
 
-	cont, err := container.LoadAndCheck(conf.RootDir, id)
+	cont, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
index f6499cc44..964a65064 100644
--- a/runsc/cmd/start.go
+++ b/runsc/cmd/start.go
@@ -55,7 +55,7 @@ func (*Start) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	id := f.Arg(0)
 	conf := args[0].(*config.Config)
 
-	c, err := container.LoadAndCheck(conf.RootDir, id)
+	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
diff --git a/runsc/cmd/state.go b/runsc/cmd/state.go
index d8a70dd7f..1f7913d5a 100644
--- a/runsc/cmd/state.go
+++ b/runsc/cmd/state.go
@@ -57,7 +57,7 @@ func (*State) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
 	id := f.Arg(0)
 	conf := args[0].(*config.Config)
 
-	c, err := container.LoadAndCheck(conf.RootDir, id)
+	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go
index c1d6aeae2..5d55422c7 100644
--- a/runsc/cmd/wait.go
+++ b/runsc/cmd/wait.go
@@ -72,7 +72,7 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 	id := f.Arg(0)
 	conf := args[0].(*config.Config)
 
-	c, err := container.LoadAndCheck(conf.RootDir, id)
+	c, err := container.Load(conf.RootDir, container.FullID{ContainerID: id}, container.LoadOpts{})
 	if err != nil {
 		Fatalf("loading container: %v", err)
 	}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 418a27beb..8b78660f7 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -128,125 +128,6 @@ type Container struct {
 	goferIsChild bool
 }
 
-// loadSandbox loads all containers that belong to the sandbox with the given
-// ID.
-func loadSandbox(rootDir, id string) ([]*Container, error) {
-	cids, err := List(rootDir)
-	if err != nil {
-		return nil, err
-	}
-
-	// Load the container metadata.
-	var containers []*Container
-	for _, cid := range cids {
-		container, err := Load(rootDir, cid)
-		if err != nil {
-			// Container file may not exist if it raced with creation/deletion or
-			// directory was left behind. Load provides a snapshot in time, so it's
-			// fine to skip it.
-			if os.IsNotExist(err) {
-				continue
-			}
-			return nil, fmt.Errorf("loading container %q: %v", id, err)
-		}
-		if container.Sandbox.ID == id {
-			containers = append(containers, container)
-		}
-	}
-	return containers, nil
-}
-
-// Load loads a container with the given id from a metadata file. partialID may
-// be an abbreviation of the full container id, in which case Load loads the
-// container to which id unambiguously refers to. Returns ErrNotExist if
-// container doesn't exist.
-func Load(rootDir, partialID string) (*Container, error) {
-	log.Debugf("Load container, rootDir: %q, partial cid: %s", rootDir, partialID)
-	if err := validateID(partialID); err != nil {
-		return nil, fmt.Errorf("invalid container id: %v", err)
-	}
-
-	id, err := findContainerID(rootDir, partialID)
-	if err != nil {
-		// Preserve error so that callers can distinguish 'not found' errors.
-		return nil, err
-	}
-
-	state := StateFile{
-		RootDir: rootDir,
-		ID:      id,
-	}
-	defer state.close()
-
-	c := &Container{}
-	if err := state.load(c); err != nil {
-		if os.IsNotExist(err) {
-			// Preserve error so that callers can distinguish 'not found' errors.
-			return nil, err
-		}
-		return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err)
-	}
-	return c, nil
-}
-
-// LoadAndCheck is similar to Load(), but also checks if the container is still
-// running to get an error earlier to the caller.
-func LoadAndCheck(rootDir, partialID string) (*Container, error) {
-	c, err := Load(rootDir, partialID)
-	if err != nil {
-		// Preserve error so that callers can distinguish 'not found' errors.
-		return nil, err
-	}
-
-	// If the status is "Running" or "Created", check that the sandbox/container
-	// is still running, setting it to Stopped if not.
-	//
-	// This is inherently racy.
-	switch c.Status {
-	case Created:
-		if !c.isSandboxRunning() {
-			// Sandbox no longer exists, so this container definitely does not exist.
-			c.changeStatus(Stopped)
-		}
-	case Running:
-		if err := c.SignalContainer(syscall.Signal(0), false); err != nil {
-			c.changeStatus(Stopped)
-		}
-	}
-
-	return c, nil
-}
-
-func findContainerID(rootDir, partialID string) (string, error) {
-	// Check whether the id fully specifies an existing container.
-	stateFile := buildStatePath(rootDir, partialID)
-	if _, err := os.Stat(stateFile); err == nil {
-		return partialID, nil
-	}
-
-	// Now see whether id could be an abbreviation of exactly 1 of the
-	// container ids. If id is ambiguous (it could match more than 1
-	// container), it is an error.
-	ids, err := List(rootDir)
-	if err != nil {
-		return "", err
-	}
-	rv := ""
-	for _, id := range ids {
-		if strings.HasPrefix(id, partialID) {
-			if rv != "" {
-				return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, rv, id)
-			}
-			rv = id
-		}
-	}
-	if rv == "" {
-		return "", os.ErrNotExist
-	}
-	log.Debugf("abbreviated id %q resolves to full id %q", partialID, rv)
-	return rv, nil
-}
-
 // Args is used to configure a new container.
 type Args struct {
 	// ID is the container unique identifier.
@@ -291,6 +172,15 @@ func New(conf *config.Config, args Args) (*Container, error) {
 		return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err)
 	}
 
+	sandboxID := args.ID
+	if !isRoot(args.Spec) {
+		var ok bool
+		sandboxID, ok = specutils.SandboxID(args.Spec)
+		if !ok {
+			return nil, fmt.Errorf("no sandbox ID found when creating container")
+		}
+	}
+
 	c := &Container{
 		ID:            args.ID,
 		Spec:          args.Spec,
@@ -301,7 +191,10 @@ func New(conf *config.Config, args Args) (*Container, error) {
 		Owner:         os.Getenv("USER"),
 		Saver: StateFile{
 			RootDir: conf.RootDir,
-			ID:      args.ID,
+			ID: FullID{
+				SandboxID:   sandboxID,
+				ContainerID: args.ID,
+			},
 		},
 	}
 	// The Cleanup object cleans up partially created containers when an error
@@ -316,10 +209,17 @@ func New(conf *config.Config, args Args) (*Container, error) {
 	}
 	defer c.Saver.unlock()
 
-	// If the metadata annotations indicate that this container should be
-	// started in an existing sandbox, we must do so. The metadata will
-	// indicate the ID of the sandbox, which is the same as the ID of the
-	// init container in the sandbox.
+	// If the metadata annotations indicate that this container should be started
+	// in an existing sandbox, we must do so. These are the possible metadata
+	// annotation states:
+	//   1. No annotations: it means that there is a single container and this
+	//      container is obviously the root. Both container and sandbox share the
+	//      ID.
+	//   2. Container type == sandbox: it means this is the root container
+	//      starting the sandbox. Both container and sandbox share the same ID.
+	//   3. Container type == container: it means this is a subcontainer of an
+	//      already started sandbox. In this case, container ID is different than
+	//      the sandbox ID.
 	if isRoot(args.Spec) {
 		log.Debugf("Creating new sandbox for container, cid: %s", args.ID)
 
@@ -358,7 +258,7 @@ func New(conf *config.Config, args Args) (*Container, error) {
 			// Start a new sandbox for this container. Any errors after this point
 			// must destroy the container.
 			sandArgs := &sandbox.Args{
-				ID:            args.ID,
+				ID:            sandboxID,
 				Spec:          args.Spec,
 				BundleDir:     args.BundleDir,
 				ConsoleSocket: args.ConsoleSocket,
@@ -379,22 +279,14 @@ func New(conf *config.Config, args Args) (*Container, error) {
 			return nil, err
 		}
 	} else {
-		// This is sort of confusing. For a sandbox with a root
-		// container and a child container in it, runsc sees:
-		// * A container struct whose sandbox ID is equal to the
-		//   container ID. This is the root container that is tied to
-		//   the creation of the sandbox.
-		// * A container struct whose sandbox ID is equal to the above
-		//   container/sandbox ID, but that has a different container
-		//   ID. This is the child container.
-		sbid, ok := specutils.SandboxID(args.Spec)
-		if !ok {
-			return nil, fmt.Errorf("no sandbox ID found when creating container")
-		}
-		log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sbid)
+		log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sandboxID)
 
 		// Find the sandbox associated with this ID.
-		sb, err := LoadAndCheck(conf.RootDir, sbid)
+		fullID := FullID{
+			SandboxID:   sandboxID,
+			ContainerID: sandboxID,
+		}
+		sb, err := Load(conf.RootDir, fullID, LoadOpts{Exact: true})
 		if err != nil {
 			return nil, err
 		}
@@ -628,7 +520,7 @@ func (c *Container) Wait() (syscall.WaitStatus, error) {
 // returns its WaitStatus.
 func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) {
 	log.Debugf("Wait on process %d in sandbox, cid: %s", pid, c.Sandbox.ID)
-	if !c.isSandboxRunning() {
+	if !c.IsSandboxRunning() {
 		return 0, fmt.Errorf("sandbox is not running")
 	}
 	return c.Sandbox.WaitPID(c.Sandbox.ID, pid)
@@ -638,7 +530,7 @@ func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) {
 // its WaitStatus.
 func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) {
 	log.Debugf("Wait on process %d in container, cid: %s", pid, c.ID)
-	if !c.isSandboxRunning() {
+	if !c.IsSandboxRunning() {
 		return 0, fmt.Errorf("sandbox is not running")
 	}
 	return c.Sandbox.WaitPID(c.ID, pid)
@@ -658,7 +550,7 @@ func (c *Container) SignalContainer(sig syscall.Signal, all bool) error {
 	if err := c.requireStatus("signal", Running, Stopped); err != nil {
 		return err
 	}
-	if !c.isSandboxRunning() {
+	if !c.IsSandboxRunning() {
 		return fmt.Errorf("sandbox is not running")
 	}
 	return c.Sandbox.SignalContainer(c.ID, sig, all)
@@ -670,7 +562,7 @@ func (c *Container) SignalProcess(sig syscall.Signal, pid int32) error {
 	if err := c.requireStatus("signal a process inside", Running); err != nil {
 		return err
 	}
-	if !c.isSandboxRunning() {
+	if !c.IsSandboxRunning() {
 		return fmt.Errorf("sandbox is not running")
 	}
 	return c.Sandbox.SignalProcess(c.ID, int32(pid), sig, false)
@@ -889,7 +781,7 @@ func (c *Container) waitForStopped() error {
 	defer cancel()
 	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
 	op := func() error {
-		if c.isSandboxRunning() {
+		if c.IsSandboxRunning() {
 			if err := c.SignalContainer(syscall.Signal(0), false); err == nil {
 				return fmt.Errorf("container is still running")
 			}
@@ -1091,7 +983,7 @@ func (c *Container) changeStatus(s Status) {
 	c.Status = s
 }
 
-func (c *Container) isSandboxRunning() bool {
+func (c *Container) IsSandboxRunning() bool {
 	return c.Sandbox != nil && c.Sandbox.IsRunning()
 }
 
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index fa99e403a..a92ae046d 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -364,7 +364,7 @@ func TestLifecycle(t *testing.T) {
 			defer c.Destroy()
 
 			// Load the container from disk and check the status.
-			c, err = LoadAndCheck(rootDir, args.ID)
+			c, err = Load(rootDir, FullID{ContainerID: args.ID}, LoadOpts{})
 			if err != nil {
 				t.Fatalf("error loading container: %v", err)
 			}
@@ -377,7 +377,11 @@ func TestLifecycle(t *testing.T) {
 			if err != nil {
 				t.Fatalf("error listing containers: %v", err)
 			}
-			if got, want := ids, []string{args.ID}; !reflect.DeepEqual(got, want) {
+			fullID := FullID{
+				SandboxID:   args.ID,
+				ContainerID: args.ID,
+			}
+			if got, want := ids, []FullID{fullID}; !reflect.DeepEqual(got, want) {
 				t.Errorf("container list got %v, want %v", got, want)
 			}
 
@@ -387,7 +391,7 @@ func TestLifecycle(t *testing.T) {
 			}
 
 			// Load the container from disk and check the status.
-			c, err = LoadAndCheck(rootDir, args.ID)
+			c, err = Load(rootDir, fullID, LoadOpts{Exact: true})
 			if err != nil {
 				t.Fatalf("error loading container: %v", err)
 			}
@@ -428,7 +432,7 @@ func TestLifecycle(t *testing.T) {
 			}
 
 			// Load the container from disk and check the status.
-			c, err = LoadAndCheck(rootDir, args.ID)
+			c, err = Load(rootDir, fullID, LoadOpts{Exact: true})
 			if err != nil {
 				t.Fatalf("error loading container: %v", err)
 			}
@@ -451,7 +455,7 @@ func TestLifecycle(t *testing.T) {
 			}
 
 			// Loading the container by id should fail.
-			if _, err = LoadAndCheck(rootDir, args.ID); err == nil {
+			if _, err = Load(rootDir, fullID, LoadOpts{Exact: true}); err == nil {
 				t.Errorf("expected loading destroyed container to fail, but it did not")
 			}
 		})
@@ -1738,7 +1742,7 @@ func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) {
 		cids[2]: cids[2],
 	}
 	for shortid, longid := range unambiguous {
-		if _, err := LoadAndCheck(rootDir, shortid); err != nil {
+		if _, err := Load(rootDir, FullID{ContainerID: shortid}, LoadOpts{}); err != nil {
 			t.Errorf("%q should resolve to %q: %v", shortid, longid, err)
 		}
 	}
@@ -1749,7 +1753,7 @@ func doAbbreviatedIDsTest(t *testing.T, vfs2 bool) {
 		"ba",
 	}
 	for _, shortid := range ambiguous {
-		if s, err := LoadAndCheck(rootDir, shortid); err == nil {
+		if s, err := Load(rootDir, FullID{ContainerID: shortid}, LoadOpts{}); err == nil {
 			t.Errorf("%q should be ambiguous, but resolved to %q", shortid, s.ID)
 		}
 	}
@@ -2007,7 +2011,7 @@ func doDestroyStartingTest(t *testing.T, vfs2 bool) {
 
 		// Container is not thread safe, so load another instance to run in
 		// concurrently.
-		startCont, err := LoadAndCheck(rootDir, args.ID)
+		startCont, err := Load(rootDir, FullID{ContainerID: args.ID}, LoadOpts{})
 		if err != nil {
 			t.Fatalf("error loading container: %v", err)
 		}
@@ -2332,6 +2336,42 @@ func TestTTYField(t *testing.T) {
 	}
 }
 
+// Test that container can run even when there are corrupt state files in the
+// root directory.
+func TestCreateWithCorruptedStateFile(t *testing.T) {
+	conf := testutil.TestConfig(t)
+	spec := testutil.NewSpecWithArgs("/bin/true")
+	_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
+	if err != nil {
+		t.Fatalf("error setting up container: %v", err)
+	}
+	defer cleanup()
+
+	// Create corrupted state file.
+	corruptID := testutil.RandomContainerID()
+	corruptState := buildPath(conf.RootDir, FullID{SandboxID: corruptID, ContainerID: corruptID}, stateFileExtension)
+	if err := ioutil.WriteFile(corruptState, []byte("this{file(is;not[valid.json"), 0777); err != nil {
+		t.Fatalf("createCorruptStateFile(): %v", err)
+	}
+	defer os.Remove(corruptState)
+
+	if _, err := Load(conf.RootDir, FullID{ContainerID: corruptID}, LoadOpts{SkipCheck: true}); err == nil {
+		t.Fatalf("loading corrupted state file should have failed")
+	}
+
+	args := Args{
+		ID:        testutil.RandomContainerID(),
+		Spec:      spec,
+		BundleDir: bundleDir,
+		Attached:  true,
+	}
+	if ws, err := Run(conf, args); err != nil {
+		t.Errorf("running container: %v", err)
+	} else if !ws.Exited() || ws.ExitStatus() != 0 {
+		t.Errorf("container failed, waitStatus: %v", ws)
+	}
+}
+
 func execute(cont *Container, name string, arg ...string) (syscall.WaitStatus, error) {
 	args := &control.ExecArgs{
 		Filename: name,
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 45d4e6e6e..29db1b7e8 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -730,7 +730,7 @@ func TestMultiContainerKillAll(t *testing.T) {
 			// processes still running inside.
 			containers[1].SignalContainer(syscall.SIGKILL, false)
 			op := func() error {
-				c, err := LoadAndCheck(conf.RootDir, ids[1])
+				c, err := Load(conf.RootDir, FullID{ContainerID: ids[1]}, LoadOpts{})
 				if err != nil {
 					return err
 				}
@@ -744,7 +744,7 @@ func TestMultiContainerKillAll(t *testing.T) {
 			}
 		}
 
-		c, err := LoadAndCheck(conf.RootDir, ids[1])
+		c, err := Load(conf.RootDir, FullID{ContainerID: ids[1]}, LoadOpts{})
 		if err != nil {
 			t.Fatalf("failed to load child container %q: %v", c.ID, err)
 		}
@@ -867,7 +867,7 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
 
 		// Container is not thread safe, so load another instance to run in
 		// concurrently.
-		startCont, err := LoadAndCheck(rootDir, ids[i])
+		startCont, err := Load(rootDir, FullID{ContainerID: ids[i]}, LoadOpts{})
 		if err != nil {
 			t.Fatalf("error loading container: %v", err)
 		}
diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go
index 17a251530..dfbf1f2d3 100644
--- a/runsc/container/state_file.go
+++ b/runsc/container/state_file.go
@@ -20,58 +20,228 @@ import (
 	"io/ioutil"
 	"os"
 	"path/filepath"
+	"regexp"
+	"strings"
+	"syscall"
 
 	"github.com/gofrs/flock"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sync"
 )
 
-const stateFileExtension = ".state"
+const stateFileExtension = "state"
 
-// StateFile handles load from/save to container state safely from multiple
-// processes. It uses a lock file to provide synchronization between operations.
+// LoadOpts provides options for Load()ing a container.
+type LoadOpts struct {
+	// Exact tells whether the search should be exact. See Load() for more.
+	Exact bool
+
+	// SkipCheck tells Load() to skip checking if container is running.
+	SkipCheck bool
+}
+
+// Load loads a container with the given id from a metadata file. "id" may
+// be an abbreviation of the full container id in case LoadOpts.Exact is not
+// set. It also checks if the container is still running, in order to return
+// an error to the caller earlier. This check is skipped if LoadOpts.SkipCheck
+// is set.
 //
-// The lock file is located at: "${s.RootDir}/${s.ID}.lock".
-// The state file is located at: "${s.RootDir}/${s.ID}.state".
-type StateFile struct {
-	// RootDir is the directory containing the container metadata file.
-	RootDir string `json:"rootDir"`
+// Returns ErrNotExist if no container is found. Returns error in case more than
+// one container matching the ID prefix is found.
+func Load(rootDir string, id FullID, opts LoadOpts) (*Container, error) {
+	//log.Debugf("Load container, rootDir: %q, partial cid: %s", rootDir, partialID)
+	if !opts.Exact {
+		var err error
+		id, err = findContainerID(rootDir, id.ContainerID)
+		if err != nil {
+			// Preserve error so that callers can distinguish 'not found' errors.
+			return nil, err
+		}
+	}
 
-	// ID is the container ID.
-	ID string `json:"id"`
+	if err := id.validate(); err != nil {
+		return nil, fmt.Errorf("invalid container id: %v", err)
+	}
+	state := StateFile{
+		RootDir: rootDir,
+		ID:      id,
+	}
+	defer state.close()
 
-	//
-	// Fields below this line are not saved in the state file and will not
-	// be preserved across commands.
-	//
+	c := &Container{}
+	if err := state.load(c); err != nil {
+		if os.IsNotExist(err) {
+			// Preserve error so that callers can distinguish 'not found' errors.
+			return nil, err
+		}
+		return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err)
+	}
 
-	once  sync.Once
-	flock *flock.Flock
+	if !opts.SkipCheck {
+		// If the status is "Running" or "Created", check that the sandbox/container
+		// is still running, setting it to Stopped if not.
+		//
+		// This is inherently racy.
+		switch c.Status {
+		case Created:
+			if !c.IsSandboxRunning() {
+				// Sandbox no longer exists, so this container definitely does not exist.
+				c.changeStatus(Stopped)
+			}
+		case Running:
+			if err := c.SignalContainer(syscall.Signal(0), false); err != nil {
+				c.changeStatus(Stopped)
+			}
+		}
+	}
+
+	return c, nil
 }
 
 // List returns all container ids in the given root directory.
-func List(rootDir string) ([]string, error) {
+func List(rootDir string) ([]FullID, error) {
 	log.Debugf("List containers %q", rootDir)
-	list, err := filepath.Glob(filepath.Join(rootDir, "*"+stateFileExtension))
+	return listMatch(rootDir, FullID{})
+}
+
+// listMatch returns all container ids that match the provided id.
+func listMatch(rootDir string, id FullID) ([]FullID, error) {
+	id.SandboxID += "*"
+	id.ContainerID += "*"
+	pattern := buildPath(rootDir, id, stateFileExtension)
+	list, err := filepath.Glob(pattern)
 	if err != nil {
 		return nil, err
 	}
-	var out []string
+	var out []FullID
 	for _, path := range list {
-		// Filter out files that do no belong to a container.
-		fileName := filepath.Base(path)
-		if len(fileName) < len(stateFileExtension) {
-			panic(fmt.Sprintf("invalid file match %q", path))
-		}
-		// Remove the extension.
-		cid := fileName[:len(fileName)-len(stateFileExtension)]
-		if validateID(cid) == nil {
-			out = append(out, cid)
+		id, err := parseFileName(filepath.Base(path))
+		if err == nil {
+			out = append(out, id)
 		}
 	}
 	return out, nil
 }
 
+// loadSandbox loads all containers that belong to the sandbox with the given
+// ID.
+func loadSandbox(rootDir, id string) ([]*Container, error) {
+	cids, err := listMatch(rootDir, FullID{SandboxID: id})
+	if err != nil {
+		return nil, err
+	}
+
+	// Load the container metadata.
+	var containers []*Container
+	for _, cid := range cids {
+		container, err := Load(rootDir, cid, LoadOpts{Exact: true, SkipCheck: true})
+		if err != nil {
+			// Container file may not exist if it raced with creation/deletion or
+			// directory was left behind. Load provides a snapshot in time, so it's
+			// fine to skip it.
+			if os.IsNotExist(err) {
+				continue
+			}
+			return nil, fmt.Errorf("loading sandbox %q, failed to load container %q: %v", id, cid, err)
+		}
+		containers = append(containers, container)
+	}
+	return containers, nil
+}
+
+func findContainerID(rootDir, partialID string) (FullID, error) {
+	// Check whether the id fully specifies an existing container.
+	pattern := buildPath(rootDir, FullID{SandboxID: "*", ContainerID: partialID + "*"}, stateFileExtension)
+	list, err := filepath.Glob(pattern)
+	if err != nil {
+		return FullID{}, err
+	}
+	switch len(list) {
+	case 0:
+		return FullID{}, os.ErrNotExist
+	case 1:
+		return parseFileName(filepath.Base(list[0]))
+	}
+
+	// Now see whether id could be an abbreviation of exactly 1 of the
+	// container ids. If id is ambiguous (it could match more than 1
+	// container), it is an error.
+	ids, err := List(rootDir)
+	if err != nil {
+		return FullID{}, err
+	}
+	var rv *FullID
+	for _, id := range ids {
+		if strings.HasPrefix(id.ContainerID, partialID) {
+			if rv != nil {
+				return FullID{}, fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, rv, id)
+			}
+			rv = &id
+		}
+	}
+	if rv == nil {
+		return FullID{}, os.ErrNotExist
+	}
+	log.Debugf("abbreviated id %q resolves to full id %v", partialID, *rv)
+	return *rv, nil
+}
+
+func parseFileName(name string) (FullID, error) {
+	re := regexp.MustCompile(`([\w+-\.]+)_sandbox:([\w+-\.]+)\.` + stateFileExtension)
+	groups := re.FindStringSubmatch(name)
+	if len(groups) != 3 {
+		return FullID{}, fmt.Errorf("invalid state file name format: %q", name)
+	}
+	id := FullID{
+		SandboxID:   groups[2],
+		ContainerID: groups[1],
+	}
+	if err := id.validate(); err != nil {
+		return FullID{}, fmt.Errorf("invalid state file name %q: %w", name, err)
+	}
+	return id, nil
+}
+
+// FullID combines sandbox and container ID to identify a container. Sandbox ID
+// is used to allow all containers for a given sandbox to be loaded by matching
+// sandbox ID in the file name.
+type FullID struct {
+	SandboxID   string `json:"sandboxId"`
+	ContainerID string `json:"containerId"`
+}
+
+func (f *FullID) String() string {
+	return f.SandboxID + "/" + f.ContainerID
+}
+
+func (f *FullID) validate() error {
+	if err := validateID(f.SandboxID); err != nil {
+		return err
+	}
+	return validateID(f.ContainerID)
+}
+
+// StateFile handles load from/save to container state safely from multiple
+// processes. It uses a lock file to provide synchronization between operations.
+//
+// The lock file is located at: "${s.RootDir}/${container-id}_sandbox:${sandbox-id}.lock".
+// The state file is located at: "${s.RootDir}/${container-id}_sandbox:${sandbox-id}.state".
+type StateFile struct {
+	// RootDir is the directory containing the container metadata file.
+	RootDir string `json:"rootDir"`
+
+	// ID is the sandbox+container ID.
+	ID FullID `json:"id"`
+
+	//
+	// Fields below this line are not saved in the state file and will not
+	// be preserved across commands.
+	//
+
+	once  sync.Once
+	flock *flock.Flock
+}
+
 // lock globally locks all locking operations for the container.
 func (s *StateFile) lock() error {
 	s.once.Do(func() {
@@ -157,18 +327,20 @@ func (s *StateFile) close() error {
 	return s.flock.Close()
 }
 
-func buildStatePath(rootDir, id string) string {
-	return filepath.Join(rootDir, id+stateFileExtension)
+func buildPath(rootDir string, id FullID, extension string) string {
+	// Note: "_" and ":" are not valid in IDs.
+	name := fmt.Sprintf("%s_sandbox:%s.%s", id.ContainerID, id.SandboxID, extension)
+	return filepath.Join(rootDir, name)
 }
 
 // statePath is the full path to the state file.
 func (s *StateFile) statePath() string {
-	return buildStatePath(s.RootDir, s.ID)
+	return buildPath(s.RootDir, s.ID, stateFileExtension)
 }
 
 // lockPath is the full path to the lock file.
 func (s *StateFile) lockPath() string {
-	return filepath.Join(s.RootDir, s.ID+".lock")
+	return buildPath(s.RootDir, s.ID, "lock")
 }
 
 // destroy deletes all state created by the stateFile. It may be called with the
author	Fabricio Voznika <fvoznika@google.com>	2020-12-17 10:44:44 -0800
committer	gVisor bot <gvisor-bot@google.com>	2020-12-17 10:52:44 -0800
commit	8ea19b5818d0c1e9b798bd0bd288c7f51a46261d (patch)
tree	d7f9da9c969f460fc6267e6fe212e37bfdf12185 /runsc
parent	e7493a9e23325c00ad9a0db341d5887afe3ae5eb (diff)