8 files changed, 959 insertions, 350 deletions
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index bc1fa25e3..2bd12120d 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -7,6 +7,7 @@ go_library(
     srcs = [
         "container.go",
         "hook.go",
+        "state_file.go",
         "status.go",
     ],
     importpath = "gvisor.dev/gvisor/runsc/container",
@@ -47,6 +48,7 @@ go_test(
     ],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/bits",
         "//pkg/log",
         "//pkg/sentry/control",
         "//pkg/sentry/kernel",
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 7d67c3a75..5ed131a7f 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -28,6 +28,7 @@ import (
 	"github.com/kr/pty"
 	"golang.org/x/sys/unix"
 	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/pkg/urpc"
 	"gvisor.dev/gvisor/runsc/testutil"
@@ -219,9 +220,9 @@ func TestJobControlSignalExec(t *testing.T) {
 	// Make sure all the processes are running.
 	expectedPL := []*control.Process{
 		// Root container process.
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		// Bash from exec process.
-		{PID: 2, Cmd: "bash"},
+		{PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}},
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Error(err)
@@ -231,7 +232,7 @@ func TestJobControlSignalExec(t *testing.T) {
 	ptyMaster.Write([]byte("sleep 100\n"))
 
 	// Wait for it to start. Sleep's PPID is bash's PID.
-	expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep"})
+	expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}})
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Error(err)
 	}
@@ -361,7 +362,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 
 	// Wait for bash to start.
 	expectedPL := []*control.Process{
-		{PID: 1, Cmd: "bash"},
+		{PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Fatal(err)
@@ -371,7 +372,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
 	ptyMaster.Write([]byte("sleep 100\n"))
 
 	// Wait for sleep to start.
-	expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep"})
+	expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}})
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Fatal(err)
 	}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index bbb364214..68782c4be 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -17,13 +17,11 @@ package container
 
 import (
 	"context"
-	"encoding/json"
 	"fmt"
 	"io/ioutil"
 	"os"
 	"os/exec"
 	"os/signal"
-	"path/filepath"
 	"regexp"
 	"strconv"
 	"strings"
@@ -31,7 +29,6 @@ import (
 	"time"
 
 	"github.com/cenkalti/backoff"
-	"github.com/gofrs/flock"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
@@ -41,17 +38,6 @@ import (
 	"gvisor.dev/gvisor/runsc/specutils"
 )
 
-const (
-	// metadataFilename is the name of the metadata file relative to the
-	// container root directory that holds sandbox metadata.
-	metadataFilename = "meta.json"
-
-	// metadataLockFilename is the name of a lock file in the container
-	// root directory that is used to prevent concurrent modifications to
-	// the container state and metadata.
-	metadataLockFilename = "meta.lock"
-)
-
 // validateID validates the container id.
 func validateID(id string) error {
 	// See libcontainer/factory_linux.go.
@@ -99,11 +85,6 @@ type Container struct {
 	// BundleDir is the directory containing the container bundle.
 	BundleDir string `json:"bundleDir"`
 
-	// Root is the directory containing the container metadata file. If this
-	// container is the root container, Root and RootContainerDir will be the
-	// same.
-	Root string `json:"root"`
-
 	// CreatedAt is the time the container was created.
 	CreatedAt time.Time `json:"createdAt"`
 
@@ -121,21 +102,24 @@ type Container struct {
 	// be 0 if the gofer has been killed.
 	GoferPid int `json:"goferPid"`
 
+	// Sandbox is the sandbox this container is running in. It's set when the
+	// container is created and reset when the sandbox is destroyed.
+	Sandbox *sandbox.Sandbox `json:"sandbox"`
+
+	// Saver handles load from/save to the state file safely from multiple
+	// processes.
+	Saver StateFile `json:"saver"`
+
+	//
+	// Fields below this line are not saved in the state file and will not
+	// be preserved across commands.
+	//
+
 	// goferIsChild is set if a gofer process is a child of the current process.
 	//
 	// This field isn't saved to json, because only a creator of a gofer
 	// process will have it as a child process.
 	goferIsChild bool
-
-	// Sandbox is the sandbox this container is running in. It's set when the
-	// container is created and reset when the sandbox is destroyed.
-	Sandbox *sandbox.Sandbox `json:"sandbox"`
-
-	// RootContainerDir is the root directory containing the metadata file of the
-	// sandbox root container. It's used to lock in order to serialize creating
-	// and deleting this Container's metadata directory. If this container is the
-	// root container, this is the same as Root.
-	RootContainerDir string
 }
 
 // loadSandbox loads all containers that belong to the sandbox with the given
@@ -166,43 +150,35 @@ func loadSandbox(rootDir, id string) ([]*Container, error) {
 	return containers, nil
 }
 
-// Load loads a container with the given id from a metadata file. id may be an
-// abbreviation of the full container id, in which case Load loads the
-// container to which id unambiguously refers to.
-// Returns ErrNotExist if container doesn't exist.
-func Load(rootDir, id string) (*Container, error) {
-	log.Debugf("Load container %q %q", rootDir, id)
-	if err := validateID(id); err != nil {
+// Load loads a container with the given id from a metadata file. partialID may
+// be an abbreviation of the full container id, in which case Load loads the
+// container to which partialID unambiguously refers. Returns ErrNotExist if
+// the container doesn't exist.
+func Load(rootDir, partialID string) (*Container, error) {
+	log.Debugf("Load container %q %q", rootDir, partialID)
+	if err := validateID(partialID); err != nil {
 		return nil, fmt.Errorf("validating id: %v", err)
 	}
 
-	cRoot, err := findContainerRoot(rootDir, id)
+	id, err := findContainerID(rootDir, partialID)
 	if err != nil {
 		// Preserve error so that callers can distinguish 'not found' errors.
 		return nil, err
 	}
 
-	// Lock the container metadata to prevent other runsc instances from
-	// writing to it while we are reading it.
-	unlock, err := lockContainerMetadata(cRoot)
-	if err != nil {
-		return nil, err
+	state := StateFile{
+		RootDir: rootDir,
+		ID:      id,
 	}
-	defer unlock()
+	defer state.close()
 
-	// Read the container metadata file and create a new Container from it.
-	metaFile := filepath.Join(cRoot, metadataFilename)
-	metaBytes, err := ioutil.ReadFile(metaFile)
-	if err != nil {
+	c := &Container{}
+	if err := state.load(c); err != nil {
 		if os.IsNotExist(err) {
 			// Preserve error so that callers can distinguish 'not found' errors.
 			return nil, err
 		}
-		return nil, fmt.Errorf("reading container metadata file %q: %v", metaFile, err)
-	}
-	var c Container
-	if err := json.Unmarshal(metaBytes, &c); err != nil {
-		return nil, fmt.Errorf("unmarshaling container metadata from %q: %v", metaFile, err)
+		return nil, fmt.Errorf("reading container metadata file %q: %v", state.statePath(), err)
 	}
 
 	// If the status is "Running" or "Created", check that the sandbox
@@ -223,57 +199,37 @@ func Load(rootDir, id string) (*Container, error) {
 		}
 	}
 
-	return &c, nil
+	return c, nil
 }
 
-func findContainerRoot(rootDir, partialID string) (string, error) {
+func findContainerID(rootDir, partialID string) (string, error) {
 	// Check whether the id fully specifies an existing container.
-	cRoot := filepath.Join(rootDir, partialID)
-	if _, err := os.Stat(cRoot); err == nil {
-		return cRoot, nil
+	stateFile := buildStatePath(rootDir, partialID)
+	if _, err := os.Stat(stateFile); err == nil {
+		return partialID, nil
 	}
 
 	// Now see whether id could be an abbreviation of exactly 1 of the
 	// container ids. If id is ambiguous (it could match more than 1
 	// container), it is an error.
-	cRoot = ""
 	ids, err := List(rootDir)
 	if err != nil {
 		return "", err
 	}
+	rv := ""
 	for _, id := range ids {
 		if strings.HasPrefix(id, partialID) {
-			if cRoot != "" {
-				return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, cRoot, id)
+			if rv != "" {
+				return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, rv, id)
 			}
-			cRoot = id
+			rv = id
 		}
 	}
-	if cRoot == "" {
+	if rv == "" {
 		return "", os.ErrNotExist
 	}
-	log.Debugf("abbreviated id %q resolves to full id %q", partialID, cRoot)
-	return filepath.Join(rootDir, cRoot), nil
-}
-
-// List returns all container ids in the given root directory.
-func List(rootDir string) ([]string, error) {
-	log.Debugf("List containers %q", rootDir)
-	fs, err := ioutil.ReadDir(rootDir)
-	if err != nil {
-		return nil, fmt.Errorf("reading dir %q: %v", rootDir, err)
-	}
-	var out []string
-	for _, f := range fs {
-		// Filter out directories that do no belong to a container.
-		cid := f.Name()
-		if validateID(cid) == nil {
-			if _, err := os.Stat(filepath.Join(rootDir, cid, metadataFilename)); err == nil {
-				out = append(out, f.Name())
-			}
-		}
-	}
-	return out, nil
+	log.Debugf("abbreviated id %q resolves to full id %q", partialID, rv)
+	return rv, nil
 }
 
 // Args is used to configure a new container.
@@ -316,44 +272,34 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 		return nil, err
 	}
 
-	unlockRoot, err := maybeLockRootContainer(args.Spec, conf.RootDir)
-	if err != nil {
-		return nil, err
+	if err := os.MkdirAll(conf.RootDir, 0711); err != nil {
+		return nil, fmt.Errorf("creating container root directory: %v", err)
 	}
-	defer unlockRoot()
+
+	c := &Container{
+		ID:            args.ID,
+		Spec:          args.Spec,
+		ConsoleSocket: args.ConsoleSocket,
+		BundleDir:     args.BundleDir,
+		Status:        Creating,
+		CreatedAt:     time.Now(),
+		Owner:         os.Getenv("USER"),
+		Saver: StateFile{
+			RootDir: conf.RootDir,
+			ID:      args.ID,
+		},
+	}
+	// The Cleanup object cleans up partially created containers when an error
+	// occurs. Any errors occurring during cleanup itself are ignored.
+	cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
+	defer cu.Clean()
 
 	// Lock the container metadata file to prevent concurrent creations of
 	// containers with the same id.
-	containerRoot := filepath.Join(conf.RootDir, args.ID)
-	unlock, err := lockContainerMetadata(containerRoot)
-	if err != nil {
+	if err := c.Saver.lockForNew(); err != nil {
 		return nil, err
 	}
-	defer unlock()
-
-	// Check if the container already exists by looking for the metadata
-	// file.
-	if _, err := os.Stat(filepath.Join(containerRoot, metadataFilename)); err == nil {
-		return nil, fmt.Errorf("container with id %q already exists", args.ID)
-	} else if !os.IsNotExist(err) {
-		return nil, fmt.Errorf("looking for existing container in %q: %v", containerRoot, err)
-	}
-
-	c := &Container{
-		ID:               args.ID,
-		Spec:             args.Spec,
-		ConsoleSocket:    args.ConsoleSocket,
-		BundleDir:        args.BundleDir,
-		Root:             containerRoot,
-		Status:           Creating,
-		CreatedAt:        time.Now(),
-		Owner:            os.Getenv("USER"),
-		RootContainerDir: conf.RootDir,
-	}
-	// The Cleanup object cleans up partially created containers when an error occurs.
-	// Any errors occuring during cleanup itself are ignored.
-	cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
-	defer cu.Clean()
+	defer c.Saver.unlock()
 
 	// If the metadata annotations indicate that this container should be
 	// started in an existing sandbox, we must do so. The metadata will
@@ -431,7 +377,7 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 	c.changeStatus(Created)
 
 	// Save the metadata file.
-	if err := c.save(); err != nil {
+	if err := c.saveLocked(); err != nil {
 		return nil, err
 	}
 
@@ -451,17 +397,12 @@ func New(conf *boot.Config, args Args) (*Container, error) {
 func (c *Container) Start(conf *boot.Config) error {
 	log.Debugf("Start container %q", c.ID)
 
-	unlockRoot, err := maybeLockRootContainer(c.Spec, c.RootContainerDir)
-	if err != nil {
+	if err := c.Saver.lock(); err != nil {
 		return err
 	}
-	defer unlockRoot()
+	unlock := specutils.MakeCleanup(func() { c.Saver.unlock() })
+	defer unlock.Clean()
 
-	unlock, err := c.lock()
-	if err != nil {
-		return err
-	}
-	defer unlock()
 	if err := c.requireStatus("start", Created); err != nil {
 		return err
 	}
@@ -509,24 +450,31 @@ func (c *Container) Start(conf *boot.Config) error {
 	}
 
 	c.changeStatus(Running)
-	if err := c.save(); err != nil {
+	if err := c.saveLocked(); err != nil {
 		return err
 	}
 
-	// Adjust the oom_score_adj for sandbox and gofers. This must be done after
-	// save().
-	return c.adjustOOMScoreAdj(conf)
+	// Release lock before adjusting OOM score because the lock is acquired there.
+	unlock.Clean()
+
+	// Adjust the oom_score_adj for sandbox. This must be done after saveLocked().
+	if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Saver.RootDir, false); err != nil {
+		return err
+	}
+
+	// Set container's oom_score_adj to the gofer since it is dedicated to
+	// the container, in case the gofer uses up too much memory.
+	return c.adjustGoferOOMScoreAdj()
 }
 
 // Restore takes a container and replaces its kernel and file system
 // to restore a container from its state file.
 func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error {
 	log.Debugf("Restore container %q", c.ID)
-	unlock, err := c.lock()
-	if err != nil {
+	if err := c.Saver.lock(); err != nil {
 		return err
 	}
-	defer unlock()
+	defer c.Saver.unlock()
 
 	if err := c.requireStatus("restore", Created); err != nil {
 		return err
@@ -544,7 +492,7 @@ func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile str
 		return err
 	}
 	c.changeStatus(Running)
-	return c.save()
+	return c.saveLocked()
 }
 
 // Run is a helper that calls Create + Start + Wait.
@@ -704,11 +652,10 @@ func (c *Container) Checkpoint(f *os.File) error {
 // The call only succeeds if the container's status is created or running.
 func (c *Container) Pause() error {
 	log.Debugf("Pausing container %q", c.ID)
-	unlock, err := c.lock()
-	if err != nil {
+	if err := c.Saver.lock(); err != nil {
 		return err
 	}
-	defer unlock()
+	defer c.Saver.unlock()
 
 	if c.Status != Created && c.Status != Running {
 		return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status)
@@ -718,18 +665,17 @@ func (c *Container) Pause() error {
 		return fmt.Errorf("pausing container: %v", err)
 	}
 	c.changeStatus(Paused)
-	return c.save()
+	return c.saveLocked()
 }
 
 // Resume unpauses the container and its kernel.
 // The call only succeeds if the container's status is paused.
 func (c *Container) Resume() error {
 	log.Debugf("Resuming container %q", c.ID)
-	unlock, err := c.lock()
-	if err != nil {
+	if err := c.Saver.lock(); err != nil {
 		return err
 	}
-	defer unlock()
+	defer c.Saver.unlock()
 
 	if c.Status != Paused {
 		return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status)
@@ -738,7 +684,7 @@ func (c *Container) Resume() error {
 		return fmt.Errorf("resuming container: %v", err)
 	}
 	c.changeStatus(Running)
-	return c.save()
+	return c.saveLocked()
 }
 
 // State returns the metadata of the container.
@@ -766,6 +712,17 @@ func (c *Container) Processes() ([]*control.Process, error) {
 func (c *Container) Destroy() error {
 	log.Debugf("Destroy container %q", c.ID)
 
+	if err := c.Saver.lock(); err != nil {
+		return err
+	}
+	defer func() {
+		c.Saver.unlock()
+		c.Saver.close()
+	}()
+
+	// Stored for later use as stop() sets c.Sandbox to nil.
+	sb := c.Sandbox
+
 	// We must perform the following cleanup steps:
 	// * stop the container and gofer processes,
 	// * remove the container filesystem on the host, and
@@ -775,35 +732,43 @@ func (c *Container) Destroy() error {
 	// do our best to perform all of the cleanups. Hence, we keep a slice
 	// of errors return their concatenation.
 	var errs []string
-
-	unlock, err := maybeLockRootContainer(c.Spec, c.RootContainerDir)
-	if err != nil {
-		return err
-	}
-	defer unlock()
-
 	if err := c.stop(); err != nil {
 		err = fmt.Errorf("stopping container: %v", err)
 		log.Warningf("%v", err)
 		errs = append(errs, err.Error())
 	}
 
-	if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) {
-		err = fmt.Errorf("deleting container root directory %q: %v", c.Root, err)
+	if err := c.Saver.destroy(); err != nil {
+		err = fmt.Errorf("deleting container state files: %v", err)
 		log.Warningf("%v", err)
 		errs = append(errs, err.Error())
 	}
 
 	c.changeStatus(Stopped)
 
+	// Adjust oom_score_adj for the sandbox. This must be done after the container
+	// is stopped and the container state files are removed. Adjustment can be
+	// skipped if the root container is exiting, because it brings down the entire
+	// sandbox.
+	//
+	// Use 'sb' to tell whether it has been executed before because Destroy must
+	// be idempotent.
+	if sb != nil && !isRoot(c.Spec) {
+		if err := adjustSandboxOOMScoreAdj(sb, c.Saver.RootDir, true); err != nil {
+			errs = append(errs, err.Error())
+		}
+	}
+
 	// "If any poststop hook fails, the runtime MUST log a warning, but the
-	// remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec.
-	// Based on the OCI, "The post-stop hooks MUST be called after the container is
-	// deleted but before the delete operation returns"
+	// remaining hooks and lifecycle continue as if the hook had
+	// succeeded" - OCI spec.
+	//
+	// Based on the OCI, "The post-stop hooks MUST be called after the container
+	// is deleted but before the delete operation returns"
 	// Run it here to:
 	// 1) Conform to the OCI.
-	// 2) Make sure it only runs once, because the root has been deleted, the container
-	// can't be loaded again.
+	// 2) Make sure it only runs once, because the root has been deleted, the
+	// container can't be loaded again.
 	if c.Spec.Hooks != nil {
 		executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State())
 	}
@@ -814,18 +779,13 @@ func (c *Container) Destroy() error {
 	return fmt.Errorf(strings.Join(errs, "\n"))
 }
 
-// save saves the container metadata to a file.
+// saveLocked saves the container metadata to a file.
 //
 // Precondition: container must be locked with container.lock().
-func (c *Container) save() error {
+func (c *Container) saveLocked() error {
 	log.Debugf("Save container %q", c.ID)
-	metaFile := filepath.Join(c.Root, metadataFilename)
-	meta, err := json.Marshal(c)
-	if err != nil {
-		return fmt.Errorf("invalid container metadata: %v", err)
-	}
-	if err := ioutil.WriteFile(metaFile, meta, 0640); err != nil {
-		return fmt.Errorf("writing container metadata: %v", err)
+	if err := c.Saver.saveLocked(c); err != nil {
+		return fmt.Errorf("saving container metadata: %v", err)
 	}
 	return nil
 }
@@ -926,7 +886,14 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bund
 	}
 
 	if conf.DebugLog != "" {
-		debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "gofer")
+		test := ""
+		if len(conf.TestOnlyTestNameEnv) != 0 {
+			// Fetch test name if one is provided and the test only flag was set.
+			if t, ok := specutils.EnvVar(spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
+				test = t
+			}
+		}
+		debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "gofer", test)
 		if err != nil {
 			return nil, nil, fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err)
 		}
@@ -1079,50 +1046,8 @@ func (c *Container) requireStatus(action string, statuses ...Status) error {
 	return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status)
 }
 
-// lock takes a file lock on the container metadata lock file.
-func (c *Container) lock() (func() error, error) {
-	return lockContainerMetadata(filepath.Join(c.Root, c.ID))
-}
-
-// lockContainerMetadata takes a file lock on the metadata lock file in the
-// given container root directory.
-func lockContainerMetadata(containerRootDir string) (func() error, error) {
-	if err := os.MkdirAll(containerRootDir, 0711); err != nil {
-		return nil, fmt.Errorf("creating container root directory %q: %v", containerRootDir, err)
-	}
-	f := filepath.Join(containerRootDir, metadataLockFilename)
-	l := flock.NewFlock(f)
-	if err := l.Lock(); err != nil {
-		return nil, fmt.Errorf("acquiring lock on container lock file %q: %v", f, err)
-	}
-	return l.Unlock, nil
-}
-
-// maybeLockRootContainer locks the sandbox root container. It is used to
-// prevent races to create and delete child container sandboxes.
-func maybeLockRootContainer(spec *specs.Spec, rootDir string) (func() error, error) {
-	if isRoot(spec) {
-		return func() error { return nil }, nil
-	}
-
-	sbid, ok := specutils.SandboxID(spec)
-	if !ok {
-		return nil, fmt.Errorf("no sandbox ID found when locking root container")
-	}
-	sb, err := Load(rootDir, sbid)
-	if err != nil {
-		return nil, err
-	}
-
-	unlock, err := sb.lock()
-	if err != nil {
-		return nil, err
-	}
-	return unlock, nil
-}
-
 func isRoot(spec *specs.Spec) bool {
-	return specutils.ShouldCreateSandbox(spec)
+	return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer
 }
 
 // runInCgroup executes fn inside the specified cgroup. If cg is nil, execute
@@ -1139,33 +1064,85 @@ func runInCgroup(cg *cgroup.Cgroup, fn func() error) error {
 	return fn()
 }
 
-// adjustOOMScoreAdj sets the oom_score_adj for the sandbox and all gofers.
+// adjustGoferOOMScoreAdj sets the oom_score_adj for the container's gofer.
+func (c *Container) adjustGoferOOMScoreAdj() error {
+	if c.GoferPid != 0 && c.Spec.Process.OOMScoreAdj != nil {
+		if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil {
+			// Ignore NotExist error because it can be returned when the sandbox
+			// exited while OOM score was being adjusted.
+			if !os.IsNotExist(err) {
+				return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err)
+			}
+			log.Warningf("Gofer process (%d) not found setting oom_score_adj", c.GoferPid)
+		}
+	}
+
+	return nil
+}
+
+// adjustSandboxOOMScoreAdj sets the oom_score_adj for the sandbox.
 // oom_score_adj is set to the lowest oom_score_adj among the containers
 // running in the sandbox.
 //
 // TODO(gvisor.dev/issue/512): This call could race with other containers being
 // created at the same time and end up setting the wrong oom_score_adj to the
 // sandbox.
-func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error {
-	// If this container's OOMScoreAdj is nil then we can exit early as no
-	// change should be made to oom_score_adj for the sandbox.
-	if c.Spec.Process.OOMScoreAdj == nil {
-		return nil
-	}
-
-	containers, err := loadSandbox(conf.RootDir, c.Sandbox.ID)
+func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, rootDir string, destroy bool) error {
+	containers, err := loadSandbox(rootDir, s.ID)
 	if err != nil {
 		return fmt.Errorf("loading sandbox containers: %v", err)
 	}
 
+	// Do nothing if the sandbox has been terminated.
+	if len(containers) == 0 {
+		return nil
+	}
+
 	// Get the lowest score for all containers.
 	var lowScore int
 	scoreFound := false
-	for _, container := range containers {
-		if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) {
+	if len(containers) == 1 && specutils.SpecContainerType(containers[0].Spec) == specutils.ContainerTypeUnspecified {
+		// This is a single-container sandbox. Set the oom_score_adj to
+		// the value specified in the OCI bundle.
+		if containers[0].Spec.Process.OOMScoreAdj != nil {
 			scoreFound = true
-			lowScore = *container.Spec.Process.OOMScoreAdj
+			lowScore = *containers[0].Spec.Process.OOMScoreAdj
 		}
+	} else {
+		for _, container := range containers {
+			// Special multi-container support for CRI. Ignore the root
+			// container when calculating oom_score_adj for the sandbox because
+			// it is the infrastructure (pause) container and always has a very
+			// low oom_score_adj.
+			//
+			// We will use OOMScoreAdj in the single-container case where the
+			// containerd container-type annotation is not present.
+			if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox {
+				continue
+			}
+
+			if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) {
+				scoreFound = true
+				lowScore = *container.Spec.Process.OOMScoreAdj
+			}
+		}
+	}
+
+	// If the container is destroyed and remaining containers have no
+	// oomScoreAdj specified then we must revert to the oom_score_adj of the
+	// parent process.
+	if !scoreFound && destroy {
+		ppid, err := specutils.GetParentPid(s.Pid)
+		if err != nil {
+			return fmt.Errorf("getting parent pid of sandbox pid %d: %v", s.Pid, err)
+		}
+		pScore, err := specutils.GetOOMScoreAdj(ppid)
+		if err != nil {
+			return fmt.Errorf("getting oom_score_adj of parent %d: %v", ppid, err)
+		}
+
+		scoreFound = true
+		lowScore = pScore
 	}
 
 	// Only set oom_score_adj if one of the containers has oom_score_adj set
@@ -1177,15 +1154,15 @@ func (c *Container) adjustOOMScoreAdj(conf *boot.Config) error {
 	}
 
 	// Set the lowest of all containers oom_score_adj to the sandbox.
-	if err := setOOMScoreAdj(c.Sandbox.Pid, lowScore); err != nil {
-		return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", c.Sandbox.ID, err)
+	if err := setOOMScoreAdj(s.Pid, lowScore); err != nil {
+		// Ignore NotExist error because it can be returned when the sandbox
+		// exited while OOM score was being adjusted.
+		if !os.IsNotExist(err) {
+			return fmt.Errorf("setting oom_score_adj for sandbox %q: %v", s.ID, err)
+		}
+		log.Warningf("Sandbox process (%d) not found setting oom_score_adj", s.Pid)
 	}
 
-	// Set container's oom_score_adj to the gofer since it is dedicated to the
-	// container, in case the gofer uses up too much memory.
-	if err := setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj); err != nil {
-		return fmt.Errorf("setting gofer oom_score_adj for container %q: %v", c.ID, err)
-	}
 	return nil
 }
 
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 2ac12e5b6..c10f85992 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -34,8 +34,10 @@ import (
 	"github.com/cenkalti/backoff"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/bits"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/control"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/runsc/boot"
 	"gvisor.dev/gvisor/runsc/boot/platforms"
@@ -51,8 +53,9 @@ func waitForProcessList(cont *Container, want []*control.Process) error {
 			err = fmt.Errorf("error getting process data from container: %v", err)
 			return &backoff.PermanentError{Err: err}
 		}
-		if !procListsEqual(got, want) {
-			return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
+		if r, err := procListsEqual(got, want); !r {
+			return fmt.Errorf("container got process list: %s, want: %s: error: %v",
+				procListToString(got), procListToString(want), err)
 		}
 		return nil
 	}
@@ -90,22 +93,34 @@ func blockUntilWaitable(pid int) error {
 
 // procListsEqual is used to check whether 2 Process lists are equal for all
 // implemented fields.
-func procListsEqual(got, want []*control.Process) bool {
+func procListsEqual(got, want []*control.Process) (bool, error) {
 	if len(got) != len(want) {
-		return false
+		return false, nil
 	}
 	for i := range got {
 		pd1 := got[i]
 		pd2 := want[i]
-		// Zero out unimplemented and timing dependant fields.
+		// Zero out timing dependent fields.
 		pd1.Time = ""
 		pd1.STime = ""
 		pd1.C = 0
-		if *pd1 != *pd2 {
-			return false
+		// Ignore TTY field too, since it's not relevant in the cases
+		// where we use this method. Tests that care about the TTY
+		// field should check for it themselves.
+		pd1.TTY = ""
+		pd1Json, err := control.ProcessListToJSON([]*control.Process{pd1})
+		if err != nil {
+			return false, err
+		}
+		pd2Json, err := control.ProcessListToJSON([]*control.Process{pd2})
+		if err != nil {
+			return false, err
+		}
+		if pd1Json != pd2Json {
+			return false, nil
 		}
 	}
-	return true
+	return true, nil
 }
 
 // getAndCheckProcLists is similar to waitForProcessList, but does not wait and retry the
@@ -115,7 +130,11 @@ func getAndCheckProcLists(cont *Container, want []*control.Process) error {
 	if err != nil {
 		return fmt.Errorf("error getting process data from container: %v", err)
 	}
-	if procListsEqual(got, want) {
+	equal, err := procListsEqual(got, want)
+	if err != nil {
+		return err
+	}
+	if equal {
 		return nil
 	}
 	return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
@@ -287,11 +306,12 @@ func TestLifecycle(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 		}
 		// Create the container.
@@ -589,18 +609,20 @@ func TestExec(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 			{
-				UID:  uid,
-				PID:  2,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     uid,
+				PID:     2,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{2},
 			},
 		}
 
@@ -1061,18 +1083,20 @@ func TestPauseResume(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 			{
-				UID:  uid,
-				PID:  2,
-				PPID: 0,
-				C:    0,
-				Cmd:  "bash",
+				UID:     uid,
+				PID:     2,
+				PPID:    0,
+				C:       0,
+				Cmd:     "bash",
+				Threads: []kernel.ThreadID{2},
 			},
 		}
 
@@ -1125,11 +1149,12 @@ func TestPauseResume(t *testing.T) {
 
 		expectedPL2 := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 		}
 
@@ -1240,18 +1265,20 @@ func TestCapabilities(t *testing.T) {
 		// expectedPL lists the expected process state of the container.
 		expectedPL := []*control.Process{
 			{
-				UID:  0,
-				PID:  1,
-				PPID: 0,
-				C:    0,
-				Cmd:  "sleep",
+				UID:     0,
+				PID:     1,
+				PPID:    0,
+				C:       0,
+				Cmd:     "sleep",
+				Threads: []kernel.ThreadID{1},
 			},
 			{
-				UID:  uid,
-				PID:  2,
-				PPID: 0,
-				C:    0,
-				Cmd:  "exe",
+				UID:     uid,
+				PID:     2,
+				PPID:    0,
+				C:       0,
+				Cmd:     "exe",
+				Threads: []kernel.ThreadID{2},
 			},
 		}
 		if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
@@ -1547,7 +1574,8 @@ func TestAbbreviatedIDs(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfigWithRoot(rootDir)
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
 
 	cids := []string{
 		"foo-" + testutil.UniqueContainerID(),
@@ -2049,6 +2077,156 @@ func TestMountSymlink(t *testing.T) {
 	}
 }
 
+// TestNetRaw checks that CAP_NET_RAW is enabled with --net-raw and disabled without it.
+func TestNetRaw(t *testing.T) {
+	capNetRaw := strconv.FormatUint(bits.MaskOf64(int(linux.CAP_NET_RAW)), 10)
+	app, err := testutil.FindFile("runsc/container/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	for _, enableRaw := range []bool{true, false} {
+		conf := testutil.TestConfig()
+		conf.EnableRaw = enableRaw
+
+		test := "--enabled"
+		if !enableRaw {
+			test = "--disabled"
+		}
+
+		spec := testutil.NewSpecWithArgs(app, "capability", test, capNetRaw)
+		if err := run(spec, conf); err != nil {
+			t.Fatalf("Error running container: %v", err)
+		}
+	}
+}
+
+// TestOverlayfsStaleRead is the most basic test that '--overlayfs-stale-read' works.
+func TestOverlayfsStaleRead(t *testing.T) {
+	conf := testutil.TestConfig()
+	conf.OverlayfsStaleRead = true
+
+	in, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.in")
+	if err != nil {
+		t.Fatalf("ioutil.TempFile() failed: %v", err)
+	}
+	defer in.Close()
+	if _, err := in.WriteString("stale data"); err != nil {
+		t.Fatalf("in.Write() failed: %v", err)
+	}
+
+	out, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.out")
+	if err != nil {
+		t.Fatalf("ioutil.TempFile() failed: %v", err)
+	}
+	defer out.Close()
+
+	const want = "foobar"
+	cmd := fmt.Sprintf("cat %q && echo %q> %q && cp %q %q", in.Name(), want, in.Name(), in.Name(), out.Name())
+	spec := testutil.NewSpecWithArgs("/bin/bash", "-c", cmd)
+	if err := run(spec, conf); err != nil {
+		t.Fatalf("Error running container: %v", err)
+	}
+
+	gotBytes, err := ioutil.ReadAll(out)
+	if err != nil {
+		t.Fatalf("out.Read() failed: %v", err)
+	}
+	got := strings.TrimSpace(string(gotBytes))
+	if want != got {
+		t.Errorf("Wrong content in out file, got: %q. want: %q", got, want)
+	}
+}
+
+// TestTTYField checks TTY field returned by container.Processes().
+func TestTTYField(t *testing.T) {
+	stop := testutil.StartReaper()
+	defer stop()
+
+	testApp, err := testutil.FindFile("runsc/container/test_app/test_app")
+	if err != nil {
+		t.Fatal("error finding test_app:", err)
+	}
+
+	testCases := []struct {
+		name         string
+		useTTY       bool
+		wantTTYField string
+	}{
+		{
+			name:         "no tty",
+			useTTY:       false,
+			wantTTYField: "?",
+		},
+		{
+			name:         "tty used",
+			useTTY:       true,
+			wantTTYField: "pts/0",
+		},
+	}
+
+	for _, test := range testCases {
+		t.Run(test.name, func(t *testing.T) {
+			conf := testutil.TestConfig()
+
+			// We will run /bin/sleep, possibly with an open TTY.
+			cmd := []string{"/bin/sleep", "10000"}
+			if test.useTTY {
+				// Run inside the "pty-runner".
+				cmd = append([]string{testApp, "pty-runner"}, cmd...)
+			}
+
+			spec := testutil.NewSpecWithArgs(cmd...)
+			rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+			if err != nil {
+				t.Fatalf("error setting up container: %v", err)
+			}
+			defer os.RemoveAll(rootDir)
+			defer os.RemoveAll(bundleDir)
+
+			// Create and start the container.
+			args := Args{
+				ID:        testutil.UniqueContainerID(),
+				Spec:      spec,
+				BundleDir: bundleDir,
+			}
+			c, err := New(conf, args)
+			if err != nil {
+				t.Fatalf("error creating container: %v", err)
+			}
+			defer c.Destroy()
+			if err := c.Start(conf); err != nil {
+				t.Fatalf("error starting container: %v", err)
+			}
+
+			// Wait for sleep to be running, and check the TTY
+			// field.
+			var gotTTYField string
+			cb := func() error {
+				ps, err := c.Processes()
+				if err != nil {
+					err = fmt.Errorf("error getting process data from container: %v", err)
+					return &backoff.PermanentError{Err: err}
+				}
+				for _, p := range ps {
+					if strings.Contains(p.Cmd, "sleep") {
+						gotTTYField = p.TTY
+						return nil
+					}
+				}
+				return fmt.Errorf("sleep not running")
+			}
+			if err := testutil.Poll(cb, 30*time.Second); err != nil {
+				t.Fatalf("error waiting for sleep process: %v", err)
+			}
+
+			if gotTTYField != test.wantTTYField {
+				t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField)
+			}
+		})
+	}
+}
+
 // executeSync synchronously executes a new process.
 func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
 	pid, err := cont.Execute(args)
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index bd45a5118..4ad09ceab 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -60,13 +60,8 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) {
 }
 
 func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*Container, func(), error) {
-	// Setup root dir if one hasn't been provided.
 	if len(conf.RootDir) == 0 {
-		rootDir, err := testutil.SetupRootDir()
-		if err != nil {
-			return nil, nil, fmt.Errorf("error creating root dir: %v", err)
-		}
-		conf.RootDir = rootDir
+		panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.")
 	}
 
 	var containers []*Container
@@ -78,7 +73,6 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
 		for _, b := range bundles {
 			os.RemoveAll(b)
 		}
-		os.RemoveAll(conf.RootDir)
 	}
 	for i, spec := range specs {
 		bundleDir, err := testutil.SetupBundleDir(spec)
@@ -129,11 +123,11 @@ func execMany(execs []execDesc) error {
 
 func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
 	for _, spec := range pod {
-		spec.Annotations[path.Join(boot.MountPrefix, name, "source")] = mount.Source
-		spec.Annotations[path.Join(boot.MountPrefix, name, "type")] = mount.Type
-		spec.Annotations[path.Join(boot.MountPrefix, name, "share")] = "pod"
+		spec.Annotations[boot.MountPrefix+name+".source"] = mount.Source
+		spec.Annotations[boot.MountPrefix+name+".type"] = mount.Type
+		spec.Annotations[boot.MountPrefix+name+".share"] = "pod"
 		if len(mount.Options) > 0 {
-			spec.Annotations[path.Join(boot.MountPrefix, name, "options")] = strings.Join(mount.Options, ",")
+			spec.Annotations[boot.MountPrefix+name+".options"] = strings.Join(mount.Options, ",")
 		}
 	}
 }
@@ -144,6 +138,13 @@ func TestMultiContainerSanity(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		specs, ids := createSpecs(sleep, sleep)
@@ -155,13 +156,13 @@ func TestMultiContainerSanity(t *testing.T) {
 
 		// Check via ps that multiple processes are running.
 		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
 		}
 		expectedPL = []*control.Process{
-			{PID: 2, Cmd: "sleep"},
+			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 		}
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -175,6 +176,13 @@ func TestMultiPIDNS(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		testSpecs, ids := createSpecs(sleep, sleep)
@@ -194,13 +202,13 @@ func TestMultiPIDNS(t *testing.T) {
 
 		// Check via ps that multiple processes are running.
 		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
 		}
 		expectedPL = []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -213,6 +221,13 @@ func TestMultiPIDNSPath(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		testSpecs, ids := createSpecs(sleep, sleep, sleep)
@@ -249,7 +264,7 @@ func TestMultiPIDNSPath(t *testing.T) {
 
 		// Check via ps that multiple processes are running.
 		expectedPL := []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -259,7 +274,7 @@ func TestMultiPIDNSPath(t *testing.T) {
 		}
 
 		expectedPL = []*control.Process{
-			{PID: 2, Cmd: "sleep"},
+			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 		}
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -268,13 +283,21 @@ func TestMultiPIDNSPath(t *testing.T) {
 }
 
 func TestMultiContainerWait(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// The first container should run the entire duration of the test.
 	cmd1 := []string{"sleep", "100"}
 	// We'll wait on the second container, which is much shorter lived.
 	cmd2 := []string{"sleep", "1"}
 	specs, ids := createSpecs(cmd1, cmd2)
 
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -283,7 +306,7 @@ func TestMultiContainerWait(t *testing.T) {
 
 	// Check via ps that multiple processes are running.
 	expectedPL := []*control.Process{
-		{PID: 2, Cmd: "sleep"},
+		{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 	}
 	if err := waitForProcessList(containers[1], expectedPL); err != nil {
 		t.Errorf("failed to wait for sleep to start: %v", err)
@@ -328,7 +351,7 @@ func TestMultiContainerWait(t *testing.T) {
 	// After Wait returns, ensure that the root container is running and
 	// the child has finished.
 	expectedPL = []*control.Process{
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(containers[0], expectedPL); err != nil {
 		t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err)
@@ -344,12 +367,14 @@ func TestExecWait(t *testing.T) {
 	}
 	defer os.RemoveAll(rootDir)
 
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// The first container should run the entire duration of the test.
 	cmd1 := []string{"sleep", "100"}
 	// We'll wait on the second container, which is much shorter lived.
 	cmd2 := []string{"sleep", "1"}
 	specs, ids := createSpecs(cmd1, cmd2)
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -358,7 +383,7 @@ func TestExecWait(t *testing.T) {
 
 	// Check via ps that process is running.
 	expectedPL := []*control.Process{
-		{PID: 2, Cmd: "sleep"},
+		{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 	}
 	if err := waitForProcessList(containers[1], expectedPL); err != nil {
 		t.Fatalf("failed to wait for sleep to start: %v", err)
@@ -393,7 +418,7 @@ func TestExecWait(t *testing.T) {
 
 	// Wait for the exec'd process to exit.
 	expectedPL = []*control.Process{
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(containers[0], expectedPL); err != nil {
 		t.Fatalf("failed to wait for second container to stop: %v", err)
@@ -432,7 +457,15 @@ func TestMultiContainerMount(t *testing.T) {
 	})
 
 	// Setup the containers.
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
 	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	containers, cleanup, err := startContainers(conf, sps, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -454,6 +487,13 @@ func TestMultiContainerSignal(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		specs, ids := createSpecs(sleep, sleep)
@@ -465,7 +505,7 @@ func TestMultiContainerSignal(t *testing.T) {
 
 		// Check via ps that container 1 process is running.
 		expectedPL := []*control.Process{
-			{PID: 2, Cmd: "sleep"},
+			{PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
 		}
 
 		if err := waitForProcessList(containers[1], expectedPL); err != nil {
@@ -479,7 +519,7 @@ func TestMultiContainerSignal(t *testing.T) {
 
 		// Make sure process 1 is still running.
 		expectedPL = []*control.Process{
-			{PID: 1, Cmd: "sleep"},
+			{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 		}
 		if err := waitForProcessList(containers[0], expectedPL); err != nil {
 			t.Errorf("failed to wait for sleep to start: %v", err)
@@ -548,6 +588,13 @@ func TestMultiContainerDestroy(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// First container will remain intact while the second container is killed.
 		podSpecs, ids := createSpecs(
 			[]string{"sleep", "100"},
@@ -586,9 +633,10 @@ func TestMultiContainerDestroy(t *testing.T) {
 		if err != nil {
 			t.Fatalf("error getting process data from sandbox: %v", err)
 		}
-		expectedPL := []*control.Process{{PID: 1, Cmd: "sleep"}}
-		if !procListsEqual(pss, expectedPL) {
-			t.Errorf("container got process list: %s, want: %s", procListToString(pss), procListToString(expectedPL))
+		expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}}
+		if r, err := procListsEqual(pss, expectedPL); !r {
+			t.Errorf("container got process list: %s, want: %s: error: %v",
+				procListToString(pss), procListToString(expectedPL), err)
 		}
 
 		// Check that cont.Destroy is safe to call multiple times.
@@ -599,13 +647,21 @@ func TestMultiContainerDestroy(t *testing.T) {
 }
 
 func TestMultiContainerProcesses(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// Note: use curly braces to keep 'sh' process around. Otherwise, shell
 	// will just execve into 'sleep' and both containers will look the
 	// same.
 	specs, ids := createSpecs(
 		[]string{"sleep", "100"},
 		[]string{"sh", "-c", "{ sleep 100; }"})
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -614,7 +670,7 @@ func TestMultiContainerProcesses(t *testing.T) {
 
 	// Check root's container process list doesn't include other containers.
 	expectedPL0 := []*control.Process{
-		{PID: 1, Cmd: "sleep"},
+		{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
 	}
 	if err := waitForProcessList(containers[0], expectedPL0); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
@@ -622,8 +678,8 @@ func TestMultiContainerProcesses(t *testing.T) {
 
 	// Same for the other container.
 	expectedPL1 := []*control.Process{
-		{PID: 2, Cmd: "sh"},
-		{PID: 3, PPID: 2, Cmd: "sleep"},
+		{PID: 2, Cmd: "sh", Threads: []kernel.ThreadID{2}},
+		{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
 	}
 	if err := waitForProcessList(containers[1], expectedPL1); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
@@ -637,7 +693,7 @@ func TestMultiContainerProcesses(t *testing.T) {
 	if _, err := containers[1].Execute(args); err != nil {
 		t.Fatalf("error exec'ing: %v", err)
 	}
-	expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep"})
+	expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep", Threads: []kernel.ThreadID{4}})
 	if err := waitForProcessList(containers[1], expectedPL1); err != nil {
 		t.Errorf("failed to wait for process to start: %v", err)
 	}
@@ -650,6 +706,15 @@ func TestMultiContainerProcesses(t *testing.T) {
 // TestMultiContainerKillAll checks that all process that belong to a container
 // are killed when SIGKILL is sent to *all* processes in that container.
 func TestMultiContainerKillAll(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	for _, tc := range []struct {
 		killContainer bool
 	}{
@@ -665,7 +730,6 @@ func TestMultiContainerKillAll(t *testing.T) {
 		specs, ids := createSpecs(
 			[]string{app, "task-tree", "--depth=2", "--width=2"},
 			[]string{app, "task-tree", "--depth=4", "--width=2"})
-		conf := testutil.TestConfig()
 		containers, cleanup, err := startContainers(conf, specs, ids)
 		if err != nil {
 			t.Fatalf("error starting containers: %v", err)
@@ -739,19 +803,13 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) {
 	specs, ids := createSpecs(
 		[]string{"/bin/sleep", "100"},
 		[]string{"/bin/sleep", "100"})
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
 
-	conf := testutil.TestConfigWithRoot(rootDir)
-
-	// Create and start root container.
-	rootBundleDir, err := testutil.SetupBundleDir(specs[0])
+	conf := testutil.TestConfig()
+	rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
+	defer os.RemoveAll(rootDir)
 	defer os.RemoveAll(rootBundleDir)
 
 	rootArgs := Args{
@@ -800,19 +858,12 @@ func TestMultiContainerDestroyStarting(t *testing.T) {
 	}
 	specs, ids := createSpecs(cmds...)
 
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
-
-	conf := testutil.TestConfigWithRoot(rootDir)
-
-	// Create and start root container.
-	rootBundleDir, err := testutil.SetupBundleDir(specs[0])
+	conf := testutil.TestConfig()
+	rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf)
 	if err != nil {
 		t.Fatalf("error setting up container: %v", err)
 	}
+	defer os.RemoveAll(rootDir)
 	defer os.RemoveAll(rootBundleDir)
 
 	rootArgs := Args{
@@ -886,9 +937,17 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) {
 	script := fmt.Sprintf("if [ -f %q ]; then exit 1; else touch %q; fi", filename, filename)
 	cmd := []string{"sh", "-c", script}
 
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// Make sure overlay is enabled, and none of the root filesystems are
 	// read-only, otherwise we won't be able to create the file.
-	conf := testutil.TestConfig()
 	conf.Overlay = true
 	specs, ids := createSpecs(cmdRoot, cmd, cmd)
 	for _, s := range specs {
@@ -941,26 +1000,21 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) {
 	}
 	allSpecs, allIDs := createSpecs(cmds...)
 
-	rootDir, err := testutil.SetupRootDir()
-	if err != nil {
-		t.Fatalf("error creating root dir: %v", err)
-	}
-	defer os.RemoveAll(rootDir)
-
 	// Split up the specs and IDs.
 	rootSpec := allSpecs[0]
 	rootID := allIDs[0]
 	childrenSpecs := allSpecs[1:]
 	childrenIDs := allIDs[1:]
 
-	bundleDir, err := testutil.SetupBundleDir(rootSpec)
+	conf := testutil.TestConfig()
+	rootDir, bundleDir, err := testutil.SetupContainer(rootSpec, conf)
 	if err != nil {
-		t.Fatalf("error setting up bundle dir: %v", err)
+		t.Fatalf("error setting up container: %v", err)
 	}
+	defer os.RemoveAll(rootDir)
 	defer os.RemoveAll(bundleDir)
 
 	// Start root container.
-	conf := testutil.TestConfigWithRoot(rootDir)
 	rootArgs := Args{
 		ID:        rootID,
 		Spec:      rootSpec,
@@ -1029,6 +1083,13 @@ func TestMultiContainerSharedMount(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		podSpec, ids := createSpecs(sleep, sleep)
@@ -1137,6 +1198,13 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		podSpec, ids := createSpecs(sleep, sleep)
@@ -1197,6 +1265,13 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
 	for _, conf := range configs(all...) {
 		t.Logf("Running test with conf: %+v", conf)
 
+		rootDir, err := testutil.SetupRootDir()
+		if err != nil {
+			t.Fatalf("error creating root dir: %v", err)
+		}
+		defer os.RemoveAll(rootDir)
+		conf.RootDir = rootDir
+
 		// Setup the containers.
 		sleep := []string{"sleep", "100"}
 		podSpec, ids := createSpecs(sleep, sleep)
@@ -1297,6 +1372,59 @@ func TestMultiContainerSharedMountRestart(t *testing.T) {
 	}
 }
 
+// Test that unsupported pod mount options are ignored when matching master and
+// slave mounts.
+func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
+	// Setup the containers.
+	sleep := []string{"/bin/sleep", "100"}
+	podSpec, ids := createSpecs(sleep, sleep)
+	mnt0 := specs.Mount{
+		Destination: "/mydir/test",
+		Source:      "/some/dir",
+		Type:        "tmpfs",
+		Options:     []string{"rw", "rbind", "relatime"},
+	}
+	podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+
+	mnt1 := mnt0
+	mnt1.Destination = "/mydir2/test2"
+	mnt1.Options = []string{"rw", "nosuid"}
+	podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+
+	createSharedMount(mnt0, "test-mount", podSpec...)
+
+	containers, cleanup, err := startContainers(conf, podSpec, ids)
+	if err != nil {
+		t.Fatalf("error starting containers: %v", err)
+	}
+	defer cleanup()
+
+	execs := []execDesc{
+		{
+			c:    containers[0],
+			cmd:  []string{"/usr/bin/test", "-d", mnt0.Destination},
+			desc: "directory is mounted in container0",
+		},
+		{
+			c:    containers[1],
+			cmd:  []string{"/usr/bin/test", "-d", mnt1.Destination},
+			desc: "directory is mounted in container1",
+		},
+	}
+	if err := execMany(execs); err != nil {
+		t.Fatal(err.Error())
+	}
+}
+
 // Test that one container can send an FD to another container, even though
 // they have distinct MountNamespaces.
 func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
@@ -1329,6 +1457,15 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 		Type:        "tmpfs",
 	}
 
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	// Create the specs.
 	specs, ids := createSpecs(
 		[]string{"sleep", "1000"},
@@ -1339,7 +1476,6 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 	specs[1].Mounts = append(specs[2].Mounts, sharedMnt, writeableMnt)
 	specs[2].Mounts = append(specs[1].Mounts, sharedMnt)
 
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -1358,9 +1494,17 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) {
 
 // Test that container is destroyed when Gofer is killed.
 func TestMultiContainerGoferKilled(t *testing.T) {
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	sleep := []string{"sleep", "100"}
 	specs, ids := createSpecs(sleep, sleep, sleep)
-	conf := testutil.TestConfig()
 	containers, cleanup, err := startContainers(conf, specs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
@@ -1370,7 +1514,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	// Ensure container is running
 	c := containers[2]
 	expectedPL := []*control.Process{
-		{PID: 3, Cmd: "sleep"},
+		{PID: 3, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
 	}
 	if err := waitForProcessList(c, expectedPL); err != nil {
 		t.Errorf("failed to wait for sleep to start: %v", err)
@@ -1398,7 +1542,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 			continue // container[2] has been killed.
 		}
 		pl := []*control.Process{
-			{PID: kernel.ThreadID(i + 1), Cmd: "sleep"},
+			{PID: kernel.ThreadID(i + 1), Cmd: "sleep", Threads: []kernel.ThreadID{kernel.ThreadID(i + 1)}},
 		}
 		if err := waitForProcessList(c, pl); err != nil {
 			t.Errorf("Container %q was affected by another container: %v", c.ID, err)
@@ -1418,7 +1562,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 	// Wait until sandbox stops. waitForProcessList will loop until sandbox exits
 	// and RPC errors out.
 	impossiblePL := []*control.Process{
-		{PID: 100, Cmd: "non-existent-process"},
+		{PID: 100, Cmd: "non-existent-process", Threads: []kernel.ThreadID{100}},
 	}
 	if err := waitForProcessList(c, impossiblePL); err == nil {
 		t.Fatalf("Sandbox was not killed after gofer death")
@@ -1436,7 +1580,15 @@ func TestMultiContainerGoferKilled(t *testing.T) {
 func TestMultiContainerLoadSandbox(t *testing.T) {
 	sleep := []string{"sleep", "100"}
 	specs, ids := createSpecs(sleep, sleep, sleep)
+
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
 	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
 
 	// Create containers for the sandbox.
 	wants, cleanup, err := startContainers(conf, specs, ids)
@@ -1529,7 +1681,15 @@ func TestMultiContainerRunNonRoot(t *testing.T) {
 		Type:        "bind",
 	})
 
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
 	conf := testutil.TestConfig()
+	conf.RootDir = rootDir
+
 	pod, cleanup, err := startContainers(conf, podSpecs, ids)
 	if err != nil {
 		t.Fatalf("error starting containers: %v", err)
diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go
new file mode 100644
index 000000000..d95151ea5
--- /dev/null
+++ b/runsc/container/state_file.go
@@ -0,0 +1,185 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"sync"
+
+	"github.com/gofrs/flock"
+	"gvisor.dev/gvisor/pkg/log"
+)
+
+const stateFileExtension = ".state"
+
+// StateFile handles load from/save to container state safely from multiple
+// processes. It uses a lock file to provide synchronization between operations.
+//
+// The lock file is located at: "${s.RootDir}/${s.ID}.lock".
+// The state file is located at: "${s.RootDir}/${s.ID}.state".
+type StateFile struct {
+	// RootDir is the directory containing the container metadata file.
+	RootDir string `json:"rootDir"`
+
+	// ID is the container ID.
+	ID string `json:"id"`
+
+	//
+	// Fields below this line are not saved in the state file and will not
+	// be preserved across commands.
+	//
+
+	once  sync.Once    // once guards the one-time creation of flock in lock().
+	flock *flock.Flock // flock is the lock file handle; nil until lock() first runs.
+}
+
+// List returns the IDs of all containers that have a state file in rootDir.
+func List(rootDir string) ([]string, error) {
+	log.Debugf("List containers %q", rootDir)
+	pattern := filepath.Join(rootDir, "*"+stateFileExtension)
+	matches, err := filepath.Glob(pattern)
+	if err != nil {
+		return nil, err
+	}
+	var ids []string
+	for _, match := range matches {
+		base := filepath.Base(match)
+		// Glob matched on the extension, so a shorter name indicates a bug.
+		if len(base) < len(stateFileExtension) {
+			panic(fmt.Sprintf("invalid file match %q", match))
+		}
+		// Strip the extension; skip names that validateID does not accept.
+		if cid := base[:len(base)-len(stateFileExtension)]; validateID(cid) == nil {
+			ids = append(ids, cid)
+		}
+	}
+	return ids, nil
+}
+
+// lock acquires the container's lock file, creating the flock handle on first
+// use. It returns an error if the lock cannot be acquired.
+func (s *StateFile) lock() error {
+	s.once.Do(func() { s.flock = flock.NewFlock(s.lockPath()) })
+
+	err := s.flock.Lock()
+	if err != nil {
+		return fmt.Errorf("acquiring lock on %q: %v", s.flock, err)
+	}
+	return nil
+}
+
+// lockForNew acquires the lock and verifies that no state file exists yet, so
+// that racing creations cannot both create a container with the same ID. The
+// lock is released again before an error is returned.
+func (s *StateFile) lockForNew() error {
+	if err := s.lock(); err != nil {
+		return err
+	}
+	_, err := os.Stat(s.statePath())
+	switch {
+	case err == nil:
+		s.unlock()
+		return fmt.Errorf("container already exists")
+	case !os.IsNotExist(err):
+		s.unlock()
+		return fmt.Errorf("looking for existing container: %v", err)
+	}
+	return nil
+}
+
+// unlock releases the lock file, panicking if the lock is not currently held.
+func (s *StateFile) unlock() error {
+	if !s.flock.Locked() {
+		panic("unlock called without lock held")
+	}
+	err := s.flock.Unlock()
+	if err == nil {
+		return nil
+	}
+	log.Warningf("Error to release lock on %q: %v", s.flock, err)
+	return fmt.Errorf("releasing lock on %q: %v", s.flock, err)
+}
+
+// saveLocked serializes 'v' as JSON and writes it to the state file.
+//
+// Preconditions: lock() must have been called before.
+func (s *StateFile) saveLocked(v interface{}) error {
+	if !s.flock.Locked() {
+		panic("saveLocked called without lock held")
+	}
+	data, err := json.Marshal(v)
+	if err != nil {
+		return err
+	}
+	err = ioutil.WriteFile(s.statePath(), data, 0640)
+	if err != nil {
+		return fmt.Errorf("writing json file: %v", err)
+	}
+	return nil
+}
+
+// load locks the state file and decodes its JSON into 'v' (a non-nil pointer).
+func (s *StateFile) load(v interface{}) error {
+	if err := s.lock(); err != nil {
+		return err
+	}
+	defer s.unlock()
+	metaBytes, err := ioutil.ReadFile(s.statePath())
+	if err != nil {
+		return err
+	}
+	return json.Unmarshal(metaBytes, v)
+}
+
+func (s *StateFile) close() error {
+	switch {
+	case s.flock == nil: // No lock file handle was ever created; nothing to close.
+		return nil
+	case s.flock.Locked(): // The lock must be released before closing.
+		panic("Closing locked file")
+	}
+	return s.flock.Close()
+}
+
+func buildStatePath(rootDir, id string) string {
+	return filepath.Join(rootDir, id+stateFileExtension) // "<rootDir>/<id>.state"
+}
+
+// statePath returns the full path to the state file: "${RootDir}/${ID}.state".
+func (s *StateFile) statePath() string {
+	return buildStatePath(s.RootDir, s.ID)
+}
+
+// lockPath returns the full path to the lock file: "${RootDir}/${ID}.lock".
+func (s *StateFile) lockPath() string {
+	return filepath.Join(s.RootDir, s.ID+".lock")
+}
+
+// destroy removes the state file and the lock file, ignoring files that are
+// already gone. It may be called with the lock held; in that case the lock
+// must still be unlocked and properly closed after destroy returns.
+func (s *StateFile) destroy() error {
+	// The state file is removed first; stop at the first unexpected failure.
+	for _, p := range []string{s.statePath(), s.lockPath()} {
+		if err := os.Remove(p); err != nil && !os.IsNotExist(err) {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD
index 9bf9e6e9d..bfd338bb6 100644
--- a/runsc/container/test_app/BUILD
+++ b/runsc/container/test_app/BUILD
@@ -15,5 +15,6 @@ go_binary(
         "//pkg/unet",
         "//runsc/testutil",
         "@com_github_google_subcommands//:go_default_library",
+        "@com_github_kr_pty//:go_default_library",
     ],
 )
diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go
index 7f735c254..a1c8a741a 100644
--- a/runsc/container/test_app/test_app.go
+++ b/runsc/container/test_app/test_app.go
@@ -19,25 +19,31 @@ package main
 import (
 	"context"
 	"fmt"
+	"io"
+	"io/ioutil"
 	"log"
 	"net"
 	"os"
 	"os/exec"
+	"regexp"
 	"strconv"
 	sys "syscall"
 	"time"
 
 	"flag"
 	"github.com/google/subcommands"
+	"github.com/kr/pty"
 	"gvisor.dev/gvisor/runsc/testutil"
 )
 
 func main() {
 	subcommands.Register(subcommands.HelpCommand(), "")
 	subcommands.Register(subcommands.FlagsCommand(), "")
+	subcommands.Register(new(capability), "")
 	subcommands.Register(new(fdReceiver), "")
 	subcommands.Register(new(fdSender), "")
 	subcommands.Register(new(forkBomb), "")
+	subcommands.Register(new(ptyRunner), "")
 	subcommands.Register(new(reaper), "")
 	subcommands.Register(new(syscall), "")
 	subcommands.Register(new(taskTree), "")
@@ -287,3 +293,102 @@ func (s *syscall) Execute(ctx context.Context, f *flag.FlagSet, args ...interfac
 	}
 	return subcommands.ExitSuccess
 }
+
+type capability struct {
+	enabled  uint64 // bits that must all be set in the effective capability mask
+	disabled uint64 // bits that must all be clear in the effective capability mask
+}
+
+// Name implements subcommands.Command.Name.
+func (*capability) Name() string {
+	return "capability"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*capability) Synopsis() string {
+	return "checks if effective capabilities are set/unset"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*capability) Usage() string {
+	return "capability [--enabled=number] [--disabled=number]"
+}
+
+// SetFlags implements subcommands.Command.SetFlags; both masks default to 0.
+func (c *capability) SetFlags(f *flag.FlagSet) {
+	f.Uint64Var(&c.enabled, "enabled", 0, "")
+	f.Uint64Var(&c.disabled, "disabled", 0, "")
+}
+
+// Execute implements subcommands.Command. It reads the effective capability
+// mask (CapEff) from /proc/self/status and fails unless every bit in 'enabled'
+// is set and no bit in 'disabled' is set.
+func (c *capability) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+	if c.enabled == 0 && c.disabled == 0 {
+		fmt.Println("One of the flags must be set")
+		return subcommands.ExitUsageError
+	}
+
+	const statusPath = "/proc/self/status"
+	status, err := ioutil.ReadFile(statusPath)
+	if err != nil {
+		fmt.Printf("Error reading %q: %v\n", statusPath, err)
+		return subcommands.ExitFailure
+	}
+	matches := regexp.MustCompile("CapEff:\t([0-9a-f]+)\n").FindStringSubmatch(string(status))
+	if len(matches) != 2 {
+		fmt.Printf("Effective capabilities not found in\n%s\n", status)
+		return subcommands.ExitFailure
+	}
+	caps, err := strconv.ParseUint(matches[1], 16, 64)
+	if err != nil {
+		fmt.Printf("failed to convert capabilities %q: %v\n", matches[1], err)
+		return subcommands.ExitFailure
+	}
+	if c.enabled != 0 && (caps&c.enabled) != c.enabled {
+		fmt.Printf("Missing capabilities, want: %#x: got: %#x\n", c.enabled, caps)
+		return subcommands.ExitFailure
+	}
+	if c.disabled != 0 && (caps&c.disabled) != 0 {
+		fmt.Printf("Extra capabilities found, dont_want: %#x: got: %#x\n", c.disabled, caps)
+		return subcommands.ExitFailure
+	}
+	return subcommands.ExitSuccess
+}
+
+type ptyRunner struct{} // "pty-runner" subcommand; it carries no state or flags
+
+// Name implements subcommands.Command.Name.
+func (*ptyRunner) Name() string {
+	return "pty-runner"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*ptyRunner) Synopsis() string {
+	return "runs the given command with an open pty terminal"
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*ptyRunner) Usage() string {
+	return "pty-runner [command]"
+}
+
+// SetFlags implements subcommands.Command.SetFlags; pty-runner defines no flags.
+func (*ptyRunner) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command. It runs the given command on a pty.
+func (*ptyRunner) Execute(_ context.Context, fs *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus {
+	argv := fs.Args()
+	if len(argv) == 0 {
+		fmt.Println("pty-runner requires a command to run")
+		return subcommands.ExitUsageError
+	}
+	f, err := pty.Start(exec.Command(argv[0], argv[1:]...))
+	if err != nil {
+		fmt.Printf("pty.Start failed: %v\n", err)
+		return subcommands.ExitFailure
+	}
+	defer f.Close()
+	io.Copy(os.Stdout, f) // Keeps this process alive until the subprocess exits.
+	return subcommands.ExitSuccess
+}