runsc: Enable container creation within existing sandboxes.

Containers are created as processes in the sandbox. Of the many things that don't work yet, the biggest issue is that the fsgofer is launched with its root as the sandbox's root directory. Thus, when a container is started and wants to read anything (including the init binary of the container), the gofer tries to serve from sandbox's root (which basically just has pause), not the container's. PiperOrigin-RevId: 201294560 Change-Id: I6423aa8830538959c56ae908ce067e4199d627b1
author: Kevin Krakauer <krakauer@google.com> 2018-06-19 21:42:21 -0700
committer: Shentubot <shentubot@google.com> 2018-06-19 21:44:33 -0700
commit: 5397963b5d4d57bd3d3668df880b5314ca2fc3d8 (patch)
tree: 1e56b21b1248c0d74772e7daf368a6ab91e35911 /runsc
parent: db66e383c33228c43efbe16ad3b14ae9833879dc (diff)
11 files changed, 386 insertions, 97 deletions
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index ae727f144..1a598199d 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -15,9 +15,12 @@
 package boot
 
 import (
+	"errors"
 	"fmt"
 
+	specs "github.com/opencontainers/runtime-spec/specs-go"
 	"gvisor.googlesource.com/gvisor/pkg/control/server"
+	"gvisor.googlesource.com/gvisor/pkg/log"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/control"
 	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
@@ -50,6 +53,10 @@ const (
 	// ContainerSignal is used to send a signal to a container.
 	ContainerSignal = "containerManager.Signal"
 
+	// ContainerStart is the URPC endpoint for running a non-root container
+	// within a sandbox.
+	ContainerStart = "containerManager.Start"
+
 	// ContainerWait is used to wait on the init process of the container
 	// and return its ExitStatus.
 	ContainerWait = "containerManager.Wait"
@@ -127,10 +134,14 @@ type containerManager struct {
 
 	// watchdog is the kernel watchdog.
 	watchdog *watchdog.Watchdog
+
+	// l is the loader that creates containers and sandboxes.
+	l *Loader
 }
 
 // StartRoot will start the root container process.
 func (cm *containerManager) StartRoot(_, _ *struct{}) error {
+	log.Debugf("containerManager.StartRoot")
 	// Tell the root container to start and wait for the result.
 	cm.startChan <- struct{}{}
 	return <-cm.startResultChan
@@ -138,11 +149,42 @@ func (cm *containerManager) StartRoot(_, _ *struct{}) error {
 
 // Processes retrieves information about processes running in the sandbox.
 func (cm *containerManager) Processes(_, out *[]*control.Process) error {
+	log.Debugf("containerManager.Processes")
 	return control.Processes(cm.k, out)
 }
 
+// StartArgs contains arguments to the Start method.
+type StartArgs struct {
+	// Spec is the spec of the container to start.
+	Spec *specs.Spec
+
+	// TODO: Separate sandbox and container configs.
+	// Conf is the runsc-specific configuration for the sandbox.
+	Conf *Config
+}
+
+// Start runs a created container within a sandbox, returning any startup error.
+func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
+	log.Debugf("containerManager.Start")
+
+	// Validate arguments.
+	if args == nil {
+		return errors.New("start missing arguments")
+	}
+	if args.Spec == nil {
+		return errors.New("start arguments missing spec")
+	}
+	if args.Conf == nil {
+		return errors.New("start arguments missing config")
+	}
+
+	// Propagate startContainer's error rather than always reporting success.
+	return cm.l.startContainer(args, cm.k)
+}
+
 // Execute runs a command on a created or running sandbox.
 func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) error {
+	log.Debugf("containerManager.Execute")
 	proc := control.Proc{Kernel: cm.k}
 	if err := proc.Exec(e, waitStatus); err != nil {
 		return fmt.Errorf("error executing: %+v: %v", e, err)
@@ -152,6 +194,7 @@ func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) err
 
 // Checkpoint pauses a sandbox and saves its state.
 func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error {
+	log.Debugf("containerManager.Checkpoint")
 	state := control.State{
 		Kernel:   cm.k,
 		Watchdog: cm.watchdog,
@@ -173,6 +216,7 @@ func (cm *containerManager) Resume(_, _ *struct{}) error {
 
 // Wait waits for the init process in the given container.
 func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error {
+	log.Debugf("containerManager.Wait")
 	// TODO: Use the cid and wait on the init process in that
 	// container. Currently we just wait on PID 1 in the sandbox.
 	tg := cm.k.TaskSet().Root.ThreadGroupWithID(1)
@@ -195,6 +239,7 @@ type SignalArgs struct {
 
 // Signal sends a signal to the init process of the container.
 func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error {
+	log.Debugf("containerManager.Signal")
 	// TODO: Use the cid and send the signal to the init
 	// process in theat container. Currently we just signal PID 1 in the
 	// sandbox.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 526e8f8bb..d1a413cc7 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package boot loads the kernel and runs a container..
+// Package boot loads the kernel and runs a container.
 package boot
 
 import (
@@ -79,8 +79,8 @@ type Loader struct {
 	// container. It should be called when a sandbox is destroyed.
 	stopSignalForwarding func()
 
-	// procArgs refers to the root container task.
-	procArgs kernel.CreateProcessArgs
+	// rootProcArgs refers to the root sandbox init task.
+	rootProcArgs kernel.CreateProcessArgs
 }
 
 func init() {
@@ -117,12 +117,6 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in
 	}
 	tk.SetClocks(time.NewCalibratedClocks())
 
-	// Create initial limits.
-	ls, err := createLimitSet(spec)
-	if err != nil {
-		return nil, fmt.Errorf("error creating limits: %v", err)
-	}
-
 	// Create capabilities.
 	caps, err := specutils.Capabilities(spec.Process.Capabilities)
 	if err != nil {
@@ -154,13 +148,6 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in
 		return nil, fmt.Errorf("failed to enable strace: %v", err)
 	}
 
-	// Get the executable path, which is a bit tricky because we have to
-	// inspect the environment PATH which is relative to the root path.
-	exec, err := specutils.GetExecutablePath(spec.Process.Args[0], spec.Root.Path, spec.Process.Env)
-	if err != nil {
-		return nil, fmt.Errorf("error getting executable path: %v", err)
-	}
-
 	// Create an empty network stack because the network namespace may be empty at
 	// this point. Netns is configured before Run() is called. Netstack is
 	// configured using a control uRPC message. Host network is configured inside
@@ -223,16 +210,56 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in
 		return nil, fmt.Errorf("error creating control server: %v", err)
 	}
 
+	// We don't care about child signals; some platforms can generate a
+	// tremendous number of useless ones (I'm looking at you, ptrace).
+	if err := sighandling.IgnoreChildStop(); err != nil {
+		return nil, fmt.Errorf("failed to ignore child stop signals: %v", err)
+	}
+	// Ensure that most signals received in sentry context are forwarded to
+	// the emulated kernel.
+	stopSignalForwarding := sighandling.StartForwarding(k)
+
+	procArgs, err := newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create root process: %v", err)
+	}
+
+	l := &Loader{
+		k:                    k,
+		ctrl:                 ctrl,
+		conf:                 conf,
+		console:              console,
+		watchdog:             watchdog,
+		stopSignalForwarding: stopSignalForwarding,
+		rootProcArgs:         procArgs,
+	}
+	ctrl.manager.l = l
+	return l, nil
+}
+
+// newProcess creates a process that can be run with kernel.CreateProcess.
+func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) {
+	// Create initial limits.
+	ls, err := createLimitSet(spec)
+	if err != nil {
+		return kernel.CreateProcessArgs{}, fmt.Errorf("error creating limits: %v", err)
+	}
+
+	// Get the executable path, which is a bit tricky because we have to
+	// inspect the environment PATH which is relative to the root path.
+	exec, err := specutils.GetExecutablePath(spec.Process.Args[0], spec.Root.Path, spec.Process.Env)
+	if err != nil {
+		return kernel.CreateProcessArgs{}, fmt.Errorf("error getting executable path: %v", err)
+	}
+
 	// Create the process arguments.
 	procArgs := kernel.CreateProcessArgs{
-		Filename:         exec,
-		Argv:             spec.Process.Args,
-		Envv:             spec.Process.Env,
-		WorkingDirectory: spec.Process.Cwd,
-		Credentials:      creds,
-		// Creating the FDMap requires that we have kernel.Kernel.fdMapUids, so
-		// it must wait until we have a Kernel.
-		Umask:                uint(syscall.Umask(0)),
+		Filename:             exec,
+		Argv:                 spec.Process.Args,
+		Envv:                 spec.Process.Env,
+		WorkingDirectory:     spec.Process.Cwd,
+		Credentials:          creds,
+		Umask:                uint(0022),
 		Limits:               ls,
 		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
 		UTSNamespace:         utsns,
@@ -240,52 +267,42 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in
 	}
 	ctx := procArgs.NewContext(k)
 
-	// Use root user to configure mounts. The current user might not have
-	// permission to do so.
-	rootProcArgs := kernel.CreateProcessArgs{
-		WorkingDirectory:     "/",
-		Credentials:          auth.NewRootCredentials(creds.UserNamespace),
-		Umask:                uint(syscall.Umask(0022)),
-		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
-	}
-	rootCtx := rootProcArgs.NewContext(k)
-
-	// Create the virtual filesystem.
-	mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs)
-	if err != nil {
-		return nil, fmt.Errorf("error creating mounts: %v", err)
-	}
-	k.SetRootMountNamespace(mns)
-
-	// Create the FD map, which will set stdin, stdout, and stderr.  If console
-	// is true, then ioctl calls will be passed through to the host fd.
+	// Create the FD map, which will set stdin, stdout, and stderr.  If
+	// console is true, then ioctl calls will be passed through to the host
+	// fd.
 	fdm, err := createFDMap(ctx, k, ls, console)
 	if err != nil {
-		return nil, fmt.Errorf("error importing fds: %v", err)
+		return kernel.CreateProcessArgs{}, fmt.Errorf("error importing fds: %v", err)
 	}
 
 	// CreateProcess takes a reference on FDMap if successful. We
 	// won't need ours either way.
 	procArgs.FDMap = fdm
 
-	// We don't care about child signals; some platforms can generate a
-	// tremendous number of useless ones (I'm looking at you, ptrace).
-	if err := sighandling.IgnoreChildStop(); err != nil {
-		return nil, fmt.Errorf("failed to ignore child stop signals: %v", err)
+	// If this is the root container, we also need to setup the root mount
+	// namespace.
+	if k.RootMountNamespace() == nil {
+		// Use root user to configure mounts. The current user might not have
+		// permission to do so.
+		rootProcArgs := kernel.CreateProcessArgs{
+			WorkingDirectory: "/",
+			Credentials:      auth.NewRootCredentials(creds.UserNamespace),
+			// The sentry should run with a umask of 0.
+			Umask:                uint(syscall.Umask(0)),
+			MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
+		}
+		rootCtx := rootProcArgs.NewContext(k)
+
+		// Create the virtual filesystem.
+		mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs)
+		if err != nil {
+			return kernel.CreateProcessArgs{}, fmt.Errorf("error creating mounts: %v", err)
+		}
+
+		k.SetRootMountNamespace(mns)
 	}
-	// Ensure that most signals received in sentry context are forwarded to
-	// the emulated kernel.
-	stopSignalForwarding := sighandling.StartForwarding(k)
 
-	return &Loader{
-		k:                    k,
-		ctrl:                 ctrl,
-		conf:                 conf,
-		console:              console,
-		watchdog:             watchdog,
-		stopSignalForwarding: stopSignalForwarding,
-		procArgs:             procArgs,
-	}, nil
+	return procArgs, nil
 }
 
 // Destroy cleans up all resources used by the loader.
@@ -350,17 +367,69 @@ func (l *Loader) run() error {
 	}
 
 	// Create the root container init task.
-	if _, err := l.k.CreateProcess(l.procArgs); err != nil {
+	if _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
 		return fmt.Errorf("failed to create init process: %v", err)
 	}
 
 	// CreateProcess takes a reference on FDMap if successful.
-	l.procArgs.FDMap.DecRef()
+	l.rootProcArgs.FDMap.DecRef()
 
 	l.watchdog.Start()
 	return l.k.Start()
 }
 
+func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) error {
+	spec := args.Spec
+	// Create capabilities.
+	caps, err := specutils.Capabilities(spec.Process.Capabilities)
+	if err != nil {
+		return fmt.Errorf("error creating capabilities: %v", err)
+	}
+
+	// Convert the spec's additional GIDs to KGIDs.
+	extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
+	for _, GID := range spec.Process.User.AdditionalGids {
+		extraKGIDs = append(extraKGIDs, auth.KGID(GID))
+	}
+
+	// Create credentials. We reuse the root user namespace because the
+	// sentry currently supports only 1 mount namespace, which is tied to a
+	// single user namespace. Thus we must run in the same user namespace
+	// to access mounts.
+	// TODO: Create a new mount namespace for the container.
+	creds := auth.NewUserCredentials(
+		auth.KUID(spec.Process.User.UID),
+		auth.KGID(spec.Process.User.GID),
+		extraKGIDs,
+		caps,
+		l.k.RootUserNamespace())
+
+	// TODO New containers should be started in new PID namespaces
+	// when indicated by the spec.
+
+	procArgs, err := newProcess(
+		args.Spec,
+		args.Conf,
+		nil,   // ioFDs
+		false, // console
+		creds,
+		k.RootUTSNamespace(),
+		k.RootIPCNamespace(),
+		k)
+	if err != nil {
+		return fmt.Errorf("failed to create new process: %v", err)
+	}
+
+	if _, err := l.k.CreateProcess(procArgs); err != nil {
+		return fmt.Errorf("failed to create process in sentry: %v", err)
+	}
+
+	// CreateProcess takes a reference on FDMap if successful.
+	procArgs.FDMap.DecRef()
+
+	return nil
+}
+
 // WaitForStartSignal waits for a start signal from the control server.
 func (l *Loader) WaitForStartSignal() {
 	<-l.ctrl.manager.startChan
diff --git a/runsc/cmd/events.go b/runsc/cmd/events.go
index f221ad3ae..df65ea31d 100644
--- a/runsc/cmd/events.go
+++ b/runsc/cmd/events.go
@@ -76,7 +76,7 @@ func (evs *Events) Execute(ctx context.Context, f *flag.FlagSet, args ...interfa
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
-		Fatalf("error loading sandox: %v", err)
+		Fatalf("error loading sandbox: %v", err)
 	}
 
 	// Repeatedly get stats from the container.
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index 235ed9bc6..cbce07c8e 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -104,7 +104,7 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
-		Fatalf("error loading sandox: %v", err)
+		Fatalf("error loading sandbox: %v", err)
 	}
 
 	if e.WorkingDirectory == "" {
diff --git a/runsc/cmd/ps.go b/runsc/cmd/ps.go
index 9f9f4d15e..5d219bfdc 100644
--- a/runsc/cmd/ps.go
+++ b/runsc/cmd/ps.go
@@ -62,7 +62,7 @@ func (ps *PS) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{})
 
 	c, err := container.Load(conf.RootDir, id)
 	if err != nil {
-		Fatalf("error loading sandox: %v", err)
+		Fatalf("error loading sandbox: %v", err)
 	}
 	pList, err := c.Processes()
 	if err != nil {
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index fe477abf2..61e05e1c3 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -37,6 +37,7 @@ go_test(
         "//pkg/sentry/kernel/auth",
         "//pkg/unet",
         "//runsc/container",
+        "//runsc/specutils",
         "//runsc/test/testutil",
         "@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
         "@org_golang_x_sys//unix:go_default_library",
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 571784e07..3b7f95af9 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -214,22 +214,43 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo
 		Owner:         os.Getenv("USER"),
 	}
 
-	// TODO: If the metadata annotations indicate that this
-	// container should be started in another sandbox, we must do so. The
-	// metadata will indicate the ID of the sandbox, which is the same as
-	// the ID of the init container in the sandbox. We can look up that
-	// init container by ID to get the sandbox, then we need to expose a
-	// way to run a new container in the sandbox.
-
-	// Start a new sandbox for this container. Any errors after this point
-	// must destroy the container.
-	s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket)
-	if err != nil {
-		c.Destroy()
-		return nil, err
-	}
+	// If the metadata annotations indicate that this container should be
+	// started in an existing sandbox, we must do so. The metadata will
+	// indicate the ID of the sandbox, which is the same as the ID of the
+	// init container in the sandbox.
+	if specutils.ShouldCreateSandbox(spec) {
+		log.Debugf("Creating new sandbox for container %q", id)
+		// Start a new sandbox for this container. Any errors after this point
+		// must destroy the container.
+		s, err := sandbox.Create(id, spec, conf, bundleDir, consoleSocket)
+		if err != nil {
+			c.Destroy()
+			return nil, err
+		}
+		c.Sandbox = s
+	} else {
+		// This is sort of confusing. For a sandbox with a root
+		// container and a child container in it, runsc sees:
+		// * A container struct whose sandbox ID is equal to the
+		//   container ID. This is the root container that is tied to
+		//   the creation of the sandbox.
+		// * A container struct whose sandbox ID is equal to the above
+		//   container/sandbox ID, but that has a different container
+		//   ID. This is the child container.
+		sbid, ok := specutils.SandboxID(spec)
+		if !ok {
+			return nil, fmt.Errorf("no sandbox ID found when creating container")
+		}
+		log.Debugf("Creating new container %q in sandbox %q", c.ID, sbid)
 
-	c.Sandbox = s
+		// Find the sandbox associated with this ID.
+		sb, err := Load(conf.RootDir, sbid)
+		if err != nil {
+			c.Destroy()
+			return nil, err
+		}
+		c.Sandbox = sb.Sandbox
+	}
 	c.Status = Created
 
 	// Save the metadata file.
@@ -242,7 +263,7 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo
 	// this file is created, so it must be the last thing we do.
 	if pidFile != "" {
 		if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(c.Pid())), 0644); err != nil {
-			s.Destroy()
+			c.Destroy()
 			return nil, fmt.Errorf("error writing pid file: %v", err)
 		}
 	}
@@ -266,9 +287,16 @@ func (c *Container) Start(conf *boot.Config) error {
 		}
 	}
 
-	if err := c.Sandbox.Start(c.ID, c.Spec, conf); err != nil {
-		c.Destroy()
-		return err
+	if specutils.ShouldCreateSandbox(c.Spec) {
+		if err := c.Sandbox.StartRoot(c.Spec, conf); err != nil {
+			c.Destroy()
+			return err
+		}
+	} else {
+		if err := c.Sandbox.Start(c.Spec, conf); err != nil {
+			c.Destroy()
+			return err
+		}
 	}
 
 	// "If any poststart hook fails, the runtime MUST log a warning, but
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 7f87ea5ab..1116ca170 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -36,6 +36,7 @@ import (
 	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.googlesource.com/gvisor/pkg/unet"
 	"gvisor.googlesource.com/gvisor/runsc/container"
+	"gvisor.googlesource.com/gvisor/runsc/specutils"
 	"gvisor.googlesource.com/gvisor/runsc/test/testutil"
 )
 
@@ -51,7 +52,7 @@ func waitForProcessList(s *container.Container, expected []*control.Process) err
 	var got []*control.Process
 	for start := time.Now(); time.Now().Sub(start) < 10*time.Second; {
 		var err error
-		got, err := s.Processes()
+		got, err = s.Processes()
 		if err != nil {
 			return fmt.Errorf("error getting process data from container: %v", err)
 		}
@@ -946,3 +947,73 @@ func TestAbbreviatedIDs(t *testing.T) {
 		}
 	}
 }
+
+// TestMultiContainerSanity checks that it is possible to run 2 dead-simple
+// containers in the same sandbox.
+func TestMultiContainerSanity(t *testing.T) {
+	containerIDs := []string{
+		testutil.UniqueContainerID(),
+		testutil.UniqueContainerID(),
+	}
+	containerAnnotations := []map[string]string{
+		// The first container creates a sandbox.
+		map[string]string{
+			specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeSandbox,
+		},
+		// The second container creates a container within the first
+		// container's sandbox.
+		map[string]string{
+			specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer,
+			specutils.ContainerdSandboxIDAnnotation:     containerIDs[0],
+		},
+	}
+
+	rootDir, err := testutil.SetupRootDir()
+	if err != nil {
+		t.Fatalf("error creating root dir: %v", err)
+	}
+	defer os.RemoveAll(rootDir)
+
+	// Setup the containers.
+	containers := make([]*container.Container, 0, len(containerIDs))
+	for i, annotations := range containerAnnotations {
+		spec := testutil.NewSpecWithArgs("sleep", "100")
+		spec.Annotations = annotations
+		bundleDir, conf, err := testutil.SetupContainerInRoot(rootDir, spec)
+		if err != nil {
+			t.Fatalf("error setting up container: %v", err)
+		}
+		defer os.RemoveAll(bundleDir)
+		cont, err := container.Create(containerIDs[i], spec, conf, bundleDir, "", "")
+		if err != nil {
+			t.Fatalf("error creating container: %v", err)
+		}
+		defer cont.Destroy()
+		if err := cont.Start(conf); err != nil {
+			t.Fatalf("error starting container: %v", err)
+		}
+		containers = append(containers, cont)
+	}
+
+	expectedPL := []*control.Process{
+		{
+			UID:  0,
+			PID:  1,
+			PPID: 0,
+			C:    0,
+			Cmd:  "sleep",
+		},
+		{
+			UID:  0,
+			PID:  2,
+			PPID: 0,
+			C:    0,
+			Cmd:  "sleep",
+		},
+	}
+
+	// Check via ps that multiple processes are running.
+	if err := waitForProcessList(containers[0], expectedPL); err != nil {
+		t.Errorf("failed to wait for sleep to start: %v", err)
+	}
+}
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 0181dc9d4..90b46e247 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -81,9 +81,9 @@ func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSo
 	return s, nil
 }
 
-// Start starts running the containerized process inside the sandbox.
-func (s *Sandbox) Start(cid string, spec *specs.Spec, conf *boot.Config) error {
-	log.Debugf("Start sandbox %q, pid: %d", s.ID, s.Pid)
+// StartRoot starts running the root container process inside the sandbox.
+func (s *Sandbox) StartRoot(spec *specs.Spec, conf *boot.Config) error {
+	log.Debugf("Start root sandbox %q, pid: %d", s.ID, s.Pid)
 	conn, err := s.connect()
 	if err != nil {
 		return err
@@ -96,9 +96,7 @@ func (s *Sandbox) Start(cid string, spec *specs.Spec, conf *boot.Config) error {
 	}
 
 	// Send a message to the sandbox control server to start the root
-	// container..
-	//
-	// TODO: We need a way to start non-root containers.
+	// container.
 	if err := conn.Call(boot.RootContainerStart, nil, nil); err != nil {
 		return fmt.Errorf("error starting root container %v: %v", spec.Process.Args, err)
 	}
@@ -106,6 +104,26 @@ func (s *Sandbox) Start(cid string, spec *specs.Spec, conf *boot.Config) error {
 	return nil
 }
 
+// Start starts running a non-root container inside the sandbox.
+func (s *Sandbox) Start(spec *specs.Spec, conf *boot.Config) error {
+	log.Debugf("Start non-root container sandbox %q, pid: %d", s.ID, s.Pid)
+	conn, err := s.connect()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	args := boot.StartArgs{
+		Spec: spec,
+		Conf: conf,
+	}
+	if err := conn.Call(boot.ContainerStart, args, nil); err != nil {
+		return fmt.Errorf("error starting non-root container %v: %v", spec.Process.Args, err)
+	}
+
+	return nil
+}
+
 // Processes retrieves the list of processes and associated metadata for a
 // given container in this sandbox.
 func (s *Sandbox) Processes(cid string) ([]*control.Process, error) {
@@ -130,11 +148,11 @@ func (s *Sandbox) Execute(cid string, e *control.ExecArgs) (syscall.WaitStatus,
 	log.Debugf("Executing new process in container %q in sandbox %q", cid, s.ID)
 	conn, err := s.connect()
 	if err != nil {
-		return 0, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err)
+		return 0, s.connError(err)
 	}
 	defer conn.Close()
 
-	// Send a message to the sandbox control server to start the container..
+	// Send a message to the sandbox control server to start the container.
 	var waitStatus uint32
 	// TODO: Pass in the container id (cid) here. The sandbox
 	// should execute in the context of that container.
@@ -168,11 +186,15 @@ func (s *Sandbox) connect() (*urpc.Client, error) {
 	log.Debugf("Connecting to sandbox %q", s.ID)
 	conn, err := client.ConnectTo(boot.ControlSocketAddr(s.ID))
 	if err != nil {
-		return nil, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err)
+		return nil, s.connError(err)
 	}
 	return conn, nil
 }
 
+func (s *Sandbox) connError(err error) error {
+	return fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err)
+}
+
 func (s *Sandbox) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir, binPath string) ([]*os.File, error) {
 	if conf.FileAccess != boot.FileAccessProxy {
 		// Don't start a gofer. The sandbox will access host FS directly.
@@ -266,7 +288,7 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund
 	}
 
 	// If the console control socket file is provided, then create a new
-	// pty master/slave pair and set the tty on the sandox process.
+	// pty master/slave pair and set the tty on the sandbox process.
 	if consoleEnabled {
 		// setupConsole will send the master on the socket, and return
 		// the slave.
diff --git a/runsc/sandbox/sandbox_test.go b/runsc/sandbox/sandbox_test.go
index e25290d5e..fee2de283 100644
--- a/runsc/sandbox/sandbox_test.go
+++ b/runsc/sandbox/sandbox_test.go
@@ -44,7 +44,7 @@ func TestGoferExits(t *testing.T) {
 		t.Fatalf("error creating container: %v", err)
 	}
 	defer s.Destroy()
-	if err := s.Start("123", spec, conf); err != nil {
+	if err := s.StartRoot(spec, conf); err != nil {
 		t.Fatalf("error starting container: %v", err)
 	}
 
diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go
index 8dae3efb1..c552111f2 100644
--- a/runsc/specutils/specutils.go
+++ b/runsc/specutils/specutils.go
@@ -63,6 +63,26 @@ func ValidateSpec(spec *specs.Spec) error {
 	if spec.Linux != nil && spec.Linux.Seccomp != nil {
 		log.Warningf("Seccomp spec is being ignored")
 	}
+
+	// 2 annotations are used by containerd to support multi-container pods.
+	//   "io.kubernetes.cri.container-type"
+	//   "io.kubernetes.cri.sandbox-id"
+	containerType, hasContainerType := spec.Annotations[ContainerdContainerTypeAnnotation]
+	_, hasSandboxID := spec.Annotations[ContainerdSandboxIDAnnotation]
+	switch {
+	// Non-containerd use won't set a container type.
+	case !hasContainerType:
+	case containerType == ContainerdContainerTypeSandbox:
+	// When starting a container in an existing sandbox, the sandbox ID
+	// must be set.
+	case containerType == ContainerdContainerTypeContainer:
+		if !hasSandboxID {
+			return fmt.Errorf("spec has container-type of %s, but no sandbox ID set", containerType)
+		}
+	default:
+		return fmt.Errorf("unknown container-type: %s", containerType)
+	}
+
 	return nil
 }
 
@@ -82,7 +102,7 @@ func ReadSpec(bundleDir string) (*specs.Spec, error) {
 }
 
 // GetExecutablePath returns the absolute path to the executable, relative to
-// the root.  It searches the environment PATH for the first file that exists
+// the root. It searches the environment PATH for the first file that exists
 // with the given name.
 func GetExecutablePath(exec, root string, env []string) (string, error) {
 	exec = filepath.Clean(exec)
@@ -246,6 +266,39 @@ func BinPath() (string, error) {
 	return binPath, nil
 }
 
+const (
+	// ContainerdContainerTypeAnnotation is the OCI annotation set by
+	// containerd to indicate whether the container to create should have
+	// its own sandbox or a container within an existing sandbox.
+	ContainerdContainerTypeAnnotation = "io.kubernetes.cri.container-type"
+	// ContainerdContainerTypeContainer is the container type value
+	// indicating the container should be created in an existing sandbox.
+	ContainerdContainerTypeContainer = "container"
+	// ContainerdContainerTypeSandbox is the container type value
+	// indicating the container should be created in a new sandbox.
+	ContainerdContainerTypeSandbox = "sandbox"
+
+	// ContainerdSandboxIDAnnotation is the OCI annotation set to indicate
+	// which sandbox the container should be created in when the container
+	// is not the first container in the sandbox.
+	ContainerdSandboxIDAnnotation = "io.kubernetes.cri.sandbox-id"
+)
+
+// ShouldCreateSandbox returns true if the spec indicates that a new sandbox
+// should be created for the container. If false, the container should be
+// started in an existing sandbox.
+func ShouldCreateSandbox(spec *specs.Spec) bool {
+	t, ok := spec.Annotations[ContainerdContainerTypeAnnotation]
+	return !ok || t == ContainerdContainerTypeSandbox
+}
+
+// SandboxID returns the ID of the sandbox to join and whether an ID was found
+// in the spec.
+func SandboxID(spec *specs.Spec) (string, bool) {
+	id, ok := spec.Annotations[ContainerdSandboxIDAnnotation]
+	return id, ok
+}
+
 // WaitForReady waits for a process to become ready. The process is ready when
 // the 'ready' function returns true. It continues to wait if 'ready' returns
 // false. It returns error on timeout, if the process stops or if 'ready' fails.
author	Kevin Krakauer <krakauer@google.com>	2018-06-19 21:42:21 -0700
committer	Shentubot <shentubot@google.com>	2018-06-19 21:44:33 -0700
commit	5397963b5d4d57bd3d3668df880b5314ca2fc3d8 (patch)
tree	1e56b21b1248c0d74772e7daf368a6ab91e35911 /runsc
parent	db66e383c33228c43efbe16ad3b14ae9833879dc (diff)