path: root/runsc/container
Diffstat (limited to 'runsc/container')
-rw-r--r--  runsc/container/container.go  1053
-rw-r--r--  runsc/container/hook.go         111
-rw-r--r--  runsc/container/status.go        60
3 files changed, 1224 insertions, 0 deletions
diff --git a/runsc/container/container.go b/runsc/container/container.go
new file mode 100644
index 000000000..513085836
--- /dev/null
+++ b/runsc/container/container.go
@@ -0,0 +1,1053 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package container creates and manipulates containers.
+package container
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "os/signal"
+ "path/filepath"
+ "regexp"
+ "strconv"
+ "strings"
+ "syscall"
+ "time"
+
+ "github.com/cenkalti/backoff"
+ "github.com/gofrs/flock"
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/control"
+ "gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/cgroup"
+ "gvisor.googlesource.com/gvisor/runsc/sandbox"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+const (
+ // metadataFilename is the name of the metadata file relative to the
+ // container root directory that holds sandbox metadata.
+ metadataFilename = "meta.json"
+
+ // metadataLockFilename is the name of a lock file in the container
+ // root directory that is used to prevent concurrent modifications to
+ // the container state and metadata.
+ metadataLockFilename = "meta.lock"
+)
+
+// validateID validates the container id.
+func validateID(id string) error {
+ // See libcontainer/factory_linux.go.
+ idRegex := regexp.MustCompile(`^[\w+-\.]+$`)
+ if !idRegex.MatchString(id) {
+ return fmt.Errorf("invalid container id: %v", id)
+ }
+ return nil
+}
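+
+// A minimal usage sketch (illustrative): IDs made of word characters plus
+// '+', '-' and '.' pass validation, while anything containing a path
+// separator or whitespace is rejected.
+//
+//    err := validateID("nginx-01.sandbox") // nil
+//    err = validateID("../escape")         // error: '/' is not allowed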
+
+// Container represents a containerized application. When running, the
+// container is associated with a single Sandbox.
+//
+// Container metadata can be saved and loaded to disk. Within a root directory,
+// we maintain subdirectories for each container named with the container id.
+// The container metadata is stored as JSON in a file named "meta.json" within
+// the container directory. This metadata format is defined by us and is not
+// part of the OCI spec.
+//
+// Containers must write their metadata files after any change to their internal
+// states. The entire container directory is deleted when the container is
+// destroyed.
+//
+// When the container is stopped, all processes that belong to the container
+// must be stopped before Destroy() returns. containerd makes roughly the
+// following calls to stop a container:
+// - First it attempts to kill the container process with
+// 'runsc kill SIGTERM'. After some time, it escalates to SIGKILL. In a
+// separate thread, it's waiting on the container. As soon as the wait
+// returns, it moves on to the next step:
+// - It calls 'runsc kill --all SIGKILL' to stop every process that belongs to
+// the container. 'kill --all SIGKILL' waits for all processes before
+// returning.
+// - Containerd waits for stdin, stdout and stderr to drain and be closed.
+// - It calls 'runsc delete'. The runc implementation sends 'kill --all SIGKILL'
+// once again just to be sure, waits, and then proceeds with the remaining teardown.
+//
+type Container struct {
+ // ID is the container ID.
+ ID string `json:"id"`
+
+ // Spec is the OCI runtime spec that configures this container.
+ Spec *specs.Spec `json:"spec"`
+
+ // BundleDir is the directory containing the container bundle.
+ BundleDir string `json:"bundleDir"`
+
+ // Root is the directory containing the container metadata file. If this
+ // container is the root container, Root and RootContainerDir will be the
+ // same.
+ Root string `json:"root"`
+
+ // CreatedAt is the time the container was created.
+ CreatedAt time.Time `json:"createdAt"`
+
+ // Owner is the container owner.
+ Owner string `json:"owner"`
+
+ // ConsoleSocket is the path to a unix domain socket that will receive
+ // the console FD.
+ ConsoleSocket string `json:"consoleSocket"`
+
+ // Status is the current container Status.
+ Status Status `json:"status"`
+
+ // GoferPid is the PID of the gofer running alongside the sandbox. May
+ // be 0 if the gofer has been killed.
+ GoferPid int `json:"goferPid"`
+
+ // goferIsChild is set if a gofer process is a child of the current process.
+ //
+ // This field isn't saved to JSON, because only the creator of the gofer
+ // process will have it as a child process.
+ goferIsChild bool
+
+ // Sandbox is the sandbox this container is running in. It's set when the
+ // container is created and reset when the sandbox is destroyed.
+ Sandbox *sandbox.Sandbox `json:"sandbox"`
+
+ // RootContainerDir is the root directory containing the metadata file of the
+ // sandbox root container. It's used to lock in order to serialize creating
+ // and deleting this Container's metadata directory. If this container is the
+ // root container, this is the same as Root.
+ RootContainerDir string
+}
+
+// Load loads a container with the given id from a metadata file. id may be an
+// abbreviation of the full container id, in which case Load loads the
+// container to which id unambiguously refers.
+// Returns ErrNotExist if the container doesn't exist.
+func Load(rootDir, id string) (*Container, error) {
+ log.Debugf("Load container %q %q", rootDir, id)
+ if err := validateID(id); err != nil {
+ return nil, fmt.Errorf("validating id: %v", err)
+ }
+
+ cRoot, err := findContainerRoot(rootDir, id)
+ if err != nil {
+ // Preserve error so that callers can distinguish 'not found' errors.
+ return nil, err
+ }
+
+ // Lock the container metadata to prevent other runsc instances from
+ // writing to it while we are reading it.
+ unlock, err := lockContainerMetadata(cRoot)
+ if err != nil {
+ return nil, err
+ }
+ defer unlock()
+
+ // Read the container metadata file and create a new Container from it.
+ metaFile := filepath.Join(cRoot, metadataFilename)
+ metaBytes, err := ioutil.ReadFile(metaFile)
+ if err != nil {
+ if os.IsNotExist(err) {
+ // Preserve error so that callers can distinguish 'not found' errors.
+ return nil, err
+ }
+ return nil, fmt.Errorf("reading container metadata file %q: %v", metaFile, err)
+ }
+ var c Container
+ if err := json.Unmarshal(metaBytes, &c); err != nil {
+ return nil, fmt.Errorf("unmarshaling container metadata from %q: %v", metaFile, err)
+ }
+
+ // If the status is "Running" or "Created", check that the sandbox
+ // process still exists, and set it to Stopped if it does not.
+ //
+ // This is inherently racy.
+ if c.Status == Running || c.Status == Created {
+ // Check if the sandbox process is still running.
+ if !c.isSandboxRunning() {
+ // Sandbox no longer exists, so this container definitely does not exist.
+ c.changeStatus(Stopped)
+ } else if c.Status == Running {
+ // Container state should reflect the actual state of the application, so
+ // we don't consider gofer process here.
+ if err := c.SignalContainer(syscall.Signal(0), false); err != nil {
+ c.changeStatus(Stopped)
+ }
+ }
+ }
+
+ return &c, nil
+}
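+
+// A minimal sketch of looking up a container by an abbreviated id, assuming
+// rootDir is the root directory configured for this runsc instance:
+//
+//    c, err := Load(rootDir, "abc12") // "abc12" may be a prefix of the full id
+//    if os.IsNotExist(err) {
+//        // no container directory matches the (possibly partial) id
+//    }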
+
+func findContainerRoot(rootDir, partialID string) (string, error) {
+ // Check whether the id fully specifies an existing container.
+ cRoot := filepath.Join(rootDir, partialID)
+ if _, err := os.Stat(cRoot); err == nil {
+ return cRoot, nil
+ }
+
+ // Now see whether id could be an abbreviation of exactly one of the
+ // container ids. If id is ambiguous (it could match more than one
+ // container), it is an error.
+ cRoot = ""
+ ids, err := List(rootDir)
+ if err != nil {
+ return "", err
+ }
+ for _, id := range ids {
+ if strings.HasPrefix(id, partialID) {
+ if cRoot != "" {
+ return "", fmt.Errorf("id %q is ambiguous and could refer to multiple containers: %q, %q", partialID, cRoot, id)
+ }
+ cRoot = id
+ }
+ }
+ if cRoot == "" {
+ return "", os.ErrNotExist
+ }
+ log.Debugf("abbreviated id %q resolves to full id %q", partialID, cRoot)
+ return filepath.Join(rootDir, cRoot), nil
+}
+
+// List returns all container ids in the given root directory.
+func List(rootDir string) ([]string, error) {
+ log.Debugf("List containers %q", rootDir)
+ fs, err := ioutil.ReadDir(rootDir)
+ if err != nil {
+ return nil, fmt.Errorf("reading dir %q: %v", rootDir, err)
+ }
+ var out []string
+ for _, f := range fs {
+ out = append(out, f.Name())
+ }
+ return out, nil
+}
+
+// Create creates the container in a new Sandbox process, unless the metadata
+// indicates that an existing Sandbox should be used. The caller must call
+// Destroy() on the container.
+func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile, userLog string) (*Container, error) {
+ log.Debugf("Create container %q in root dir: %s", id, conf.RootDir)
+ if err := validateID(id); err != nil {
+ return nil, err
+ }
+
+ unlockRoot, err := maybeLockRootContainer(spec, conf.RootDir)
+ if err != nil {
+ return nil, err
+ }
+ defer unlockRoot()
+
+ // Lock the container metadata file to prevent concurrent creations of
+ // containers with the same id.
+ containerRoot := filepath.Join(conf.RootDir, id)
+ unlock, err := lockContainerMetadata(containerRoot)
+ if err != nil {
+ return nil, err
+ }
+ defer unlock()
+
+ // Check if the container already exists by looking for the metadata
+ // file.
+ if _, err := os.Stat(filepath.Join(containerRoot, metadataFilename)); err == nil {
+ return nil, fmt.Errorf("container with id %q already exists", id)
+ } else if !os.IsNotExist(err) {
+ return nil, fmt.Errorf("looking for existing container in %q: %v", containerRoot, err)
+ }
+
+ c := &Container{
+ ID: id,
+ Spec: spec,
+ ConsoleSocket: consoleSocket,
+ BundleDir: bundleDir,
+ Root: containerRoot,
+ Status: Creating,
+ CreatedAt: time.Now(),
+ Owner: os.Getenv("USER"),
+ RootContainerDir: conf.RootDir,
+ }
+ // The Cleanup object cleans up partially created containers when an error occurs.
+ // Any errors occurring during cleanup itself are ignored.
+ cu := specutils.MakeCleanup(func() { _ = c.Destroy() })
+ defer cu.Clean()
+
+ // If the metadata annotations indicate that this container should be
+ // started in an existing sandbox, we must do so. The metadata will
+ // indicate the ID of the sandbox, which is the same as the ID of the
+ // init container in the sandbox.
+ if isRoot(spec) {
+ log.Debugf("Creating new sandbox for container %q", id)
+
+ // Create and join cgroup before processes are created to ensure they are
+ // part of the cgroup from the start (and all their child processes).
+ cg, err := cgroup.New(spec)
+ if err != nil {
+ return nil, err
+ }
+ if cg != nil {
+ // If there is cgroup config, install it before creating sandbox process.
+ if err := cg.Install(spec.Linux.Resources); err != nil {
+ return nil, fmt.Errorf("configuring cgroup: %v", err)
+ }
+ }
+ if err := runInCgroup(cg, func() error {
+ ioFiles, specFile, err := c.createGoferProcess(spec, conf, bundleDir)
+ if err != nil {
+ return err
+ }
+
+ // Start a new sandbox for this container. Any errors after this point
+ // must destroy the container.
+ c.Sandbox, err = sandbox.New(id, spec, conf, bundleDir, consoleSocket, userLog, ioFiles, specFile, cg)
+ return err
+ }); err != nil {
+ return nil, err
+ }
+ } else {
+ // This is sort of confusing. For a sandbox with a root
+ // container and a child container in it, runsc sees:
+ // * A container struct whose sandbox ID is equal to the
+ // container ID. This is the root container that is tied to
+ // the creation of the sandbox.
+ // * A container struct whose sandbox ID is equal to the above
+ // container/sandbox ID, but that has a different container
+ // ID. This is the child container.
+ sbid, ok := specutils.SandboxID(spec)
+ if !ok {
+ return nil, fmt.Errorf("no sandbox ID found when creating container")
+ }
+ log.Debugf("Creating new container %q in sandbox %q", c.ID, sbid)
+
+ // Find the sandbox associated with this ID.
+ sb, err := Load(conf.RootDir, sbid)
+ if err != nil {
+ return nil, err
+ }
+ c.Sandbox = sb.Sandbox
+ if err := c.Sandbox.CreateContainer(c.ID); err != nil {
+ return nil, err
+ }
+ }
+ c.changeStatus(Created)
+
+ // Save the metadata file.
+ if err := c.save(); err != nil {
+ return nil, err
+ }
+
+ // Write the PID file. Containerd considers the create complete after
+ // this file is created, so it must be the last thing we do.
+ if pidFile != "" {
+ if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil {
+ return nil, fmt.Errorf("error writing PID file: %v", err)
+ }
+ }
+
+ cu.Release()
+ return c, nil
+}
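+
+// A minimal lifecycle sketch, assuming the spec, config and remaining
+// arguments have already been prepared by the caller (error handling elided):
+//
+//    c, _ := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, userLog)
+//    defer c.Destroy()        // required, even if Start fails
+//    _ = c.Start(conf)
+//    ws, _ := c.Wait()        // syscall.WaitStatus of the container's init process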
+
+// Start starts running the containerized process inside the sandbox.
+func (c *Container) Start(conf *boot.Config) error {
+ log.Debugf("Start container %q", c.ID)
+
+ unlockRoot, err := maybeLockRootContainer(c.Spec, c.RootContainerDir)
+ if err != nil {
+ return err
+ }
+ defer unlockRoot()
+
+ unlock, err := c.lock()
+ if err != nil {
+ return err
+ }
+ defer unlock()
+ if err := c.requireStatus("start", Created); err != nil {
+ return err
+ }
+
+ // "If any prestart hook fails, the runtime MUST generate an error,
+ // stop and destroy the container" -OCI spec.
+ if c.Spec.Hooks != nil {
+ if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil {
+ return err
+ }
+ }
+
+ if isRoot(c.Spec) {
+ if err := c.Sandbox.StartRoot(c.Spec, conf); err != nil {
+ return err
+ }
+ } else {
+ // Join cgroup to start the gofer process to ensure it's part of the cgroup
+ // from the start (and all its child processes).
+ if err := runInCgroup(c.Sandbox.Cgroup, func() error {
+ // Create the gofer process.
+ ioFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir)
+ if err != nil {
+ return err
+ }
+ defer mountsFile.Close()
+
+ cleanMounts, err := specutils.ReadMounts(mountsFile)
+ if err != nil {
+ return fmt.Errorf("reading mounts file: %v", err)
+ }
+ c.Spec.Mounts = cleanMounts
+
+ return c.Sandbox.StartContainer(c.Spec, conf, c.ID, ioFiles)
+ }); err != nil {
+ return err
+ }
+ }
+
+ // "If any poststart hook fails, the runtime MUST log a warning, but
+ // the remaining hooks and lifecycle continue as if the hook had
+ // succeeded" -OCI spec.
+ if c.Spec.Hooks != nil {
+ executeHooksBestEffort(c.Spec.Hooks.Poststart, c.State())
+ }
+
+ c.changeStatus(Running)
+ return c.save()
+}
+
+// Restore takes a container and replaces its kernel and file system
+// to restore it from its state file.
+func (c *Container) Restore(spec *specs.Spec, conf *boot.Config, restoreFile string) error {
+ log.Debugf("Restore container %q", c.ID)
+ unlock, err := c.lock()
+ if err != nil {
+ return err
+ }
+ defer unlock()
+
+ if err := c.requireStatus("restore", Created); err != nil {
+ return err
+ }
+
+ // "If any prestart hook fails, the runtime MUST generate an error,
+ // stop and destroy the container" -OCI spec.
+ if c.Spec.Hooks != nil {
+ if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil {
+ return err
+ }
+ }
+
+ if err := c.Sandbox.Restore(c.ID, spec, conf, restoreFile); err != nil {
+ return err
+ }
+ c.changeStatus(Running)
+ return c.save()
+}
+
+// Run is a helper that calls Create + Start + Wait.
+func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile, userLog string, detach bool) (syscall.WaitStatus, error) {
+ log.Debugf("Run container %q in root dir: %s", id, conf.RootDir)
+ c, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, userLog)
+ if err != nil {
+ return 0, fmt.Errorf("creating container: %v", err)
+ }
+ // Clean up the partially created container if an error occurs.
+ // Any errors returned by Destroy() itself are ignored.
+ cu := specutils.MakeCleanup(func() {
+ c.Destroy()
+ })
+ defer cu.Clean()
+
+ if conf.RestoreFile != "" {
+ log.Debugf("Restore: %v", conf.RestoreFile)
+ if err := c.Restore(spec, conf, conf.RestoreFile); err != nil {
+ return 0, fmt.Errorf("starting container: %v", err)
+ }
+ } else {
+ if err := c.Start(conf); err != nil {
+ return 0, fmt.Errorf("starting container: %v", err)
+ }
+ }
+ if detach {
+ cu.Release()
+ return 0, nil
+ }
+ return c.Wait()
+}
+
+// Execute runs the specified command in the container. It returns the PID of
+// the newly created process.
+func (c *Container) Execute(args *control.ExecArgs) (int32, error) {
+ log.Debugf("Execute in container %q, args: %+v", c.ID, args)
+ if err := c.requireStatus("execute in", Created, Running); err != nil {
+ return 0, err
+ }
+ args.ContainerID = c.ID
+ return c.Sandbox.Execute(args)
+}
+
+// Event returns events for the container.
+func (c *Container) Event() (*boot.Event, error) {
+ log.Debugf("Getting events for container %q", c.ID)
+ if err := c.requireStatus("get events for", Created, Running, Paused); err != nil {
+ return nil, err
+ }
+ return c.Sandbox.Event(c.ID)
+}
+
+// SandboxPid returns the PID of the sandbox the container is running in, or -1 if the
+// container is not running.
+func (c *Container) SandboxPid() int {
+ if err := c.requireStatus("get PID", Created, Running, Paused); err != nil {
+ return -1
+ }
+ return c.Sandbox.Pid
+}
+
+// Wait waits for the container to exit, and returns its WaitStatus.
+// Waiting on a stopped container returns immediately; the call is still
+// needed to retrieve the exit status.
+func (c *Container) Wait() (syscall.WaitStatus, error) {
+ log.Debugf("Wait on container %q", c.ID)
+ return c.Sandbox.Wait(c.ID)
+}
+
+// WaitRootPID waits for process 'pid' in the sandbox's PID namespace and
+// returns its WaitStatus.
+func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+ log.Debugf("Wait on PID %d in sandbox %q", pid, c.Sandbox.ID)
+ if !c.isSandboxRunning() {
+ return 0, fmt.Errorf("sandbox is not running")
+ }
+ return c.Sandbox.WaitPID(c.Sandbox.ID, pid, clearStatus)
+}
+
+// WaitPID waits for process 'pid' in the container's PID namespace and returns
+// its WaitStatus.
+func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+ log.Debugf("Wait on PID %d in container %q", pid, c.ID)
+ if !c.isSandboxRunning() {
+ return 0, fmt.Errorf("sandbox is not running")
+ }
+ return c.Sandbox.WaitPID(c.ID, pid, clearStatus)
+}
+
+// SignalContainer sends the signal to the container. If all is true and the
+// signal is SIGKILL, it waits for all processes to exit before returning.
+// SignalContainer returns an error if the container is already stopped.
+// TODO(b/113680494): Distinguish different error types.
+func (c *Container) SignalContainer(sig syscall.Signal, all bool) error {
+ log.Debugf("Signal container %q: %v", c.ID, sig)
+ // Signaling container in Stopped state is allowed. When all=false,
+ // an error will be returned anyway; when all=true, this allows
+ // sending signal to other processes inside the container even
+ // after the init process exits. This is especially useful for
+ // container cleanup.
+ if err := c.requireStatus("signal", Running, Stopped); err != nil {
+ return err
+ }
+ if !c.isSandboxRunning() {
+ return fmt.Errorf("sandbox is not running")
+ }
+ return c.Sandbox.SignalContainer(c.ID, sig, all)
+}
+
+// SignalProcess sends sig to a specific process in the container.
+func (c *Container) SignalProcess(sig syscall.Signal, pid int32) error {
+ log.Debugf("Signal process %d in container %q: %v", pid, c.ID, sig)
+ if err := c.requireStatus("signal a process inside", Running); err != nil {
+ return err
+ }
+ if !c.isSandboxRunning() {
+ return fmt.Errorf("sandbox is not running")
+ }
+ return c.Sandbox.SignalProcess(c.ID, int32(pid), sig, false)
+}
+
+// ForwardSignals forwards all signals received by the current process to the
+// container process inside the sandbox. It returns a function that will stop
+// forwarding signals.
+func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() {
+ log.Debugf("Forwarding all signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess)
+ sigCh := make(chan os.Signal, 1)
+ signal.Notify(sigCh)
+ go func() {
+ for s := range sigCh {
+ log.Debugf("Forwarding signal %d to container %q PID %d fgProcess=%t", s, c.ID, pid, fgProcess)
+ if err := c.Sandbox.SignalProcess(c.ID, pid, s.(syscall.Signal), fgProcess); err != nil {
+ log.Warningf("error forwarding signal %d to container %q: %v", s, c.ID, err)
+ }
+ }
+ log.Debugf("Done forwarding signals to container %q PID %d fgProcess=%t", c.ID, pid, fgProcess)
+ }()
+
+ return func() {
+ signal.Stop(sigCh)
+ close(sigCh)
+ }
+}
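+
+// Usage sketch: forward the caller's signals to a process in the container
+// until the returned stop function is called (pid and fgProcess are whatever
+// the caller already tracks):
+//
+//    stopForwarding := c.ForwardSignals(pid, false /* fgProcess */)
+//    defer stopForwarding()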
+
+// Checkpoint sends the checkpoint call to the container.
+// The statefile will be written to f, the file at the specified image-path.
+func (c *Container) Checkpoint(f *os.File) error {
+ log.Debugf("Checkpoint container %q", c.ID)
+ if err := c.requireStatus("checkpoint", Created, Running, Paused); err != nil {
+ return err
+ }
+ return c.Sandbox.Checkpoint(c.ID, f)
+}
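+
+// A checkpoint/restore sketch, assuming imagePath names a writable state file
+// (error handling elided):
+//
+//    f, _ := os.OpenFile(imagePath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
+//    _ = c.Checkpoint(f)
+//    // ... later, on a freshly created container c2 for the same spec:
+//    _ = c2.Restore(spec, conf, imagePath)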
+
+// Pause suspends the container and its kernel.
+// The call only succeeds if the container's status is created or running.
+func (c *Container) Pause() error {
+ log.Debugf("Pausing container %q", c.ID)
+ unlock, err := c.lock()
+ if err != nil {
+ return err
+ }
+ defer unlock()
+
+ if c.Status != Created && c.Status != Running {
+ return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status)
+ }
+
+ if err := c.Sandbox.Pause(c.ID); err != nil {
+ return fmt.Errorf("pausing container: %v", err)
+ }
+ c.changeStatus(Paused)
+ return c.save()
+}
+
+// Resume unpauses the container and its kernel.
+// The call only succeeds if the container's status is paused.
+func (c *Container) Resume() error {
+ log.Debugf("Resuming container %q", c.ID)
+ unlock, err := c.lock()
+ if err != nil {
+ return err
+ }
+ defer unlock()
+
+ if c.Status != Paused {
+ return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status)
+ }
+ if err := c.Sandbox.Resume(c.ID); err != nil {
+ return fmt.Errorf("resuming container: %v", err)
+ }
+ c.changeStatus(Running)
+ return c.save()
+}
+
+// State returns the metadata of the container.
+func (c *Container) State() specs.State {
+ return specs.State{
+ Version: specs.Version,
+ ID: c.ID,
+ Status: c.Status.String(),
+ Pid: c.SandboxPid(),
+ Bundle: c.BundleDir,
+ }
+}
+
+// Processes retrieves the list of processes and associated metadata inside a
+// container.
+func (c *Container) Processes() ([]*control.Process, error) {
+ if err := c.requireStatus("get processes of", Running, Paused); err != nil {
+ return nil, err
+ }
+ return c.Sandbox.Processes(c.ID)
+}
+
+// Destroy stops all processes and frees all resources associated with the
+// container.
+func (c *Container) Destroy() error {
+ log.Debugf("Destroy container %q", c.ID)
+
+ // We must perform the following cleanup steps:
+ // * stop the container and gofer processes,
+ // * remove the container filesystem on the host, and
+ // * delete the container metadata directory.
+ //
+ // It's possible for one or more of these steps to fail, but we should
+ // do our best to perform all of the cleanups. Hence, we keep a slice
+ // of errors and return their concatenation.
+ var errs []string
+
+ unlock, err := maybeLockRootContainer(c.Spec, c.RootContainerDir)
+ if err != nil {
+ return err
+ }
+ defer unlock()
+
+ if err := c.stop(); err != nil {
+ err = fmt.Errorf("stopping container: %v", err)
+ log.Warningf("%v", err)
+ errs = append(errs, err.Error())
+ }
+
+ if err := os.RemoveAll(c.Root); err != nil && !os.IsNotExist(err) {
+ err = fmt.Errorf("deleting container root directory %q: %v", c.Root, err)
+ log.Warningf("%v", err)
+ errs = append(errs, err.Error())
+ }
+
+ c.changeStatus(Stopped)
+
+ // "If any poststop hook fails, the runtime MUST log a warning, but the
+ // remaining hooks and lifecycle continue as if the hook had succeeded" -OCI spec.
+ // Based on the OCI spec, "The post-stop hooks MUST be called after the container
+ // is deleted but before the delete operation returns".
+ // Run them here to:
+ // 1) Conform to the OCI spec.
+ // 2) Make sure they only run once: because the root directory has been deleted,
+ // the container can't be loaded again.
+ if c.Spec.Hooks != nil {
+ executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State())
+ }
+
+ if len(errs) == 0 {
+ return nil
+ }
+ return fmt.Errorf(strings.Join(errs, "\n"))
+}
+
+// save saves the container metadata to a file.
+//
+// Precondition: container must be locked with container.lock().
+func (c *Container) save() error {
+ log.Debugf("Save container %q", c.ID)
+ metaFile := filepath.Join(c.Root, metadataFilename)
+ meta, err := json.Marshal(c)
+ if err != nil {
+ return fmt.Errorf("invalid container metadata: %v", err)
+ }
+ if err := ioutil.WriteFile(metaFile, meta, 0640); err != nil {
+ return fmt.Errorf("writing container metadata: %v", err)
+ }
+ return nil
+}
+
+// stop stops the container (for regular containers) or the sandbox (for
+// root containers), and waits for the container or sandbox and the gofer
+// to stop. If any of them doesn't stop before timeout, an error is returned.
+func (c *Container) stop() error {
+ var cgroup *cgroup.Cgroup
+
+ if c.Sandbox != nil {
+ log.Debugf("Destroying container %q", c.ID)
+ if err := c.Sandbox.DestroyContainer(c.ID); err != nil {
+ return fmt.Errorf("destroying container %q: %v", c.ID, err)
+ }
+ // Only uninstall cgroup for sandbox stop.
+ if c.Sandbox.IsRootContainer(c.ID) {
+ cgroup = c.Sandbox.Cgroup
+ }
+ // Only set sandbox to nil after it has been told to destroy the container.
+ c.Sandbox = nil
+ }
+
+ // Try killing the gofer if it does not exit with the container.
+ if c.GoferPid != 0 {
+ log.Debugf("Killing gofer for container %q, PID: %d", c.ID, c.GoferPid)
+ if err := syscall.Kill(c.GoferPid, syscall.SIGKILL); err != nil {
+ // The gofer may already be stopped, log the error.
+ log.Warningf("Error sending signal %d to gofer %d: %v", syscall.SIGKILL, c.GoferPid, err)
+ }
+ }
+
+ if err := c.waitForStopped(); err != nil {
+ return err
+ }
+
+ // The gofer runs inside the cgroup, so Cgroup.Uninstall has to be called
+ // after the gofer has stopped.
+ if cgroup != nil {
+ if err := cgroup.Uninstall(); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func (c *Container) waitForStopped() error {
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
+ op := func() error {
+ if c.isSandboxRunning() {
+ if err := c.SignalContainer(syscall.Signal(0), false); err == nil {
+ return fmt.Errorf("container is still running")
+ }
+ }
+ if c.GoferPid == 0 {
+ return nil
+ }
+ if c.goferIsChild {
+ // The gofer process is a child of the current process,
+ // so we can wait on it and collect its zombie.
+ wpid, err := syscall.Wait4(int(c.GoferPid), nil, syscall.WNOHANG, nil)
+ if err != nil {
+ return fmt.Errorf("error waiting the gofer process: %v", err)
+ }
+ if wpid == 0 {
+ return fmt.Errorf("gofer is still running")
+ }
+
+ } else if err := syscall.Kill(c.GoferPid, 0); err == nil {
+ return fmt.Errorf("gofer is still running")
+ }
+ c.GoferPid = 0
+ return nil
+ }
+ return backoff.Retry(op, b)
+}
+
+func (c *Container) createGoferProcess(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]*os.File, *os.File, error) {
+ // Start with the general config flags.
+ args := conf.ToFlags()
+
+ var goferEnds []*os.File
+
+ // nextFD is the next available file descriptor for the gofer process.
+ // It starts at 3 because 0-2 are used by stdin/stdout/stderr.
+ nextFD := 3
+
+ if conf.LogFilename != "" {
+ logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+ if err != nil {
+ return nil, nil, fmt.Errorf("opening log file %q: %v", conf.LogFilename, err)
+ }
+ defer logFile.Close()
+ goferEnds = append(goferEnds, logFile)
+ args = append(args, "--log-fd="+strconv.Itoa(nextFD))
+ nextFD++
+ }
+
+ if conf.DebugLog != "" {
+ debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "gofer")
+ if err != nil {
+ return nil, nil, fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err)
+ }
+ defer debugLogFile.Close()
+ goferEnds = append(goferEnds, debugLogFile)
+ args = append(args, "--debug-log-fd="+strconv.Itoa(nextFD))
+ nextFD++
+ }
+
+ args = append(args, "gofer", "--bundle", bundleDir)
+ if conf.Overlay {
+ args = append(args, "--panic-on-write=true")
+ }
+
+ // Open the spec file to donate to the sandbox.
+ specFile, err := specutils.OpenSpec(bundleDir)
+ if err != nil {
+ return nil, nil, fmt.Errorf("opening spec file: %v", err)
+ }
+ defer specFile.Close()
+ goferEnds = append(goferEnds, specFile)
+ args = append(args, "--spec-fd="+strconv.Itoa(nextFD))
+ nextFD++
+
+ // Create pipe that allows gofer to send mount list to sandbox after all paths
+ // have been resolved.
+ mountsSand, mountsGofer, err := os.Pipe()
+ if err != nil {
+ return nil, nil, err
+ }
+ defer mountsGofer.Close()
+ goferEnds = append(goferEnds, mountsGofer)
+ args = append(args, fmt.Sprintf("--mounts-fd=%d", nextFD))
+ nextFD++
+
+ // Add root mount and then add any other additional mounts.
+ mountCount := 1
+ for _, m := range spec.Mounts {
+ if specutils.Is9PMount(m) {
+ mountCount++
+ }
+ }
+
+ sandEnds := make([]*os.File, 0, mountCount)
+ for i := 0; i < mountCount; i++ {
+ fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
+ if err != nil {
+ return nil, nil, err
+ }
+ sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD"))
+
+ goferEnd := os.NewFile(uintptr(fds[1]), "gofer IO FD")
+ defer goferEnd.Close()
+ goferEnds = append(goferEnds, goferEnd)
+
+ args = append(args, fmt.Sprintf("--io-fds=%d", nextFD))
+ nextFD++
+ }
+
+ binPath := specutils.ExePath
+ cmd := exec.Command(binPath, args...)
+ cmd.ExtraFiles = goferEnds
+ cmd.Args[0] = "runsc-gofer"
+
+ // Enter new namespaces to isolate from the rest of the system. Don't unshare
+ // cgroup because gofer is added to a cgroup in the caller's namespace.
+ nss := []specs.LinuxNamespace{
+ {Type: specs.IPCNamespace},
+ {Type: specs.MountNamespace},
+ {Type: specs.NetworkNamespace},
+ {Type: specs.PIDNamespace},
+ {Type: specs.UTSNamespace},
+ }
+
+ // Set up any uid/gid mappings, and create or join the configured user
+ // namespace so the gofer's view of the filesystem aligns with the
+ // users in the sandbox.
+ userNS := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec)
+ nss = append(nss, userNS...)
+ specutils.SetUIDGIDMappings(cmd, spec)
+ if len(userNS) != 0 {
+ // We need to set UID and GID to have capabilities in a new user namespace.
+ cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
+ }
+
+ // Start the gofer in the given namespace.
+ log.Debugf("Starting gofer: %s %v", binPath, args)
+ if err := specutils.StartInNS(cmd, nss); err != nil {
+ return nil, nil, fmt.Errorf("Gofer: %v", err)
+ }
+ log.Infof("Gofer started, PID: %d", cmd.Process.Pid)
+ c.GoferPid = cmd.Process.Pid
+ c.goferIsChild = true
+ return sandEnds, mountsSand, nil
+}
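+
+// Note on the FD numbering above: entries appended to cmd.ExtraFiles show up
+// in the child starting at descriptor 3 (after stdin/stdout/stderr), in
+// order. That is why nextFD starts at 3 and is incremented once per donated
+// file, keeping the --*-fd flags in sync with goferEnds. For example:
+//
+//    cmd.ExtraFiles = []*os.File{logFile, specFile} // child sees FDs 3 and 4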
+
+// changeStatus transitions from one status to another ensuring that the
+// transition is valid.
+func (c *Container) changeStatus(s Status) {
+ switch s {
+ case Creating:
+ // Initial state, never transitions to it.
+ panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
+
+ case Created:
+ if c.Status != Creating {
+ panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
+ }
+ if c.Sandbox == nil {
+ panic("sandbox cannot be nil")
+ }
+
+ case Paused:
+ if c.Status != Running {
+ panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
+ }
+ if c.Sandbox == nil {
+ panic("sandbox cannot be nil")
+ }
+
+ case Running:
+ if c.Status != Created && c.Status != Paused {
+ panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
+ }
+ if c.Sandbox == nil {
+ panic("sandbox cannot be nil")
+ }
+
+ case Stopped:
+ if c.Status != Creating && c.Status != Created && c.Status != Running && c.Status != Stopped {
+ panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
+ }
+
+ default:
+ panic(fmt.Sprintf("invalid new state: %v", s))
+ }
+ c.Status = s
+}
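+
+// The transitions enforced above form a small state machine:
+//
+//    Creating -> Created -> Running <-> Paused
+//    {Creating, Created, Running, Stopped} -> Stopped
+//
+// Every state that requires a live sandbox (Created, Running, Paused) also
+// checks that c.Sandbox is non-nil.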
+
+func (c *Container) isSandboxRunning() bool {
+ return c.Sandbox != nil && c.Sandbox.IsRunning()
+}
+
+func (c *Container) requireStatus(action string, statuses ...Status) error {
+ for _, s := range statuses {
+ if c.Status == s {
+ return nil
+ }
+ }
+ return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status)
+}
+
+// lock takes a file lock on the container metadata lock file.
+func (c *Container) lock() (func() error, error) {
+ return lockContainerMetadata(filepath.Join(c.Root, c.ID))
+}
+
+// lockContainerMetadata takes a file lock on the metadata lock file in the
+// given container root directory.
+func lockContainerMetadata(containerRootDir string) (func() error, error) {
+ if err := os.MkdirAll(containerRootDir, 0711); err != nil {
+ return nil, fmt.Errorf("creating container root directory %q: %v", containerRootDir, err)
+ }
+ f := filepath.Join(containerRootDir, metadataLockFilename)
+ l := flock.NewFlock(f)
+ if err := l.Lock(); err != nil {
+ return nil, fmt.Errorf("acquiring lock on container lock file %q: %v", f, err)
+ }
+ return l.Unlock, nil
+}
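+
+// Callers follow the same idiom: take the lock, defer the unlock, then read
+// or write meta.json while holding it. For example:
+//
+//    unlock, err := lockContainerMetadata(containerRoot)
+//    if err != nil {
+//        return err
+//    }
+//    defer unlock()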
+
+// maybeLockRootContainer locks the sandbox root container. It is used to
+// prevent races to create and delete child container sandboxes.
+func maybeLockRootContainer(spec *specs.Spec, rootDir string) (func() error, error) {
+ if isRoot(spec) {
+ return func() error { return nil }, nil
+ }
+
+ sbid, ok := specutils.SandboxID(spec)
+ if !ok {
+ return nil, fmt.Errorf("no sandbox ID found when locking root container")
+ }
+ sb, err := Load(rootDir, sbid)
+ if err != nil {
+ return nil, err
+ }
+
+ unlock, err := sb.lock()
+ if err != nil {
+ return nil, err
+ }
+ return unlock, nil
+}
+
+func isRoot(spec *specs.Spec) bool {
+ return specutils.ShouldCreateSandbox(spec)
+}
+
+// runInCgroup executes fn inside the specified cgroup. If cg is nil, fn is
+// executed in the current context.
+func runInCgroup(cg *cgroup.Cgroup, fn func() error) error {
+ if cg == nil {
+ return fn()
+ }
+ restore, err := cg.Join()
+ defer restore()
+ if err != nil {
+ return err
+ }
+ return fn()
+}
diff --git a/runsc/container/hook.go b/runsc/container/hook.go
new file mode 100644
index 000000000..acae6781e
--- /dev/null
+++ b/runsc/container/hook.go
@@ -0,0 +1,111 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+ "os/exec"
+ "path/filepath"
+ "strings"
+ "time"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// This file implements hooks as defined in the OCI spec:
+// https://github.com/opencontainers/runtime-spec/blob/master/config.md#toc22
+//
+// "hooks":{
+// "prestart":[{
+// "path":"/usr/bin/dockerd",
+// "args":[
+// "libnetwork-setkey", "arg2",
+// ]
+// }]
+// },
+
+// executeHooksBestEffort executes hooks and logs a warning in case they fail.
+// It always runs all hooks.
+func executeHooksBestEffort(hooks []specs.Hook, s specs.State) {
+ for _, h := range hooks {
+ if err := executeHook(h, s); err != nil {
+ log.Warningf("Failure to execute hook %+v, err: %v", h, err)
+ }
+ }
+}
+
+// executeHooks executes hooks until the first one fails or they all execute.
+func executeHooks(hooks []specs.Hook, s specs.State) error {
+ for _, h := range hooks {
+ if err := executeHook(h, s); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func executeHook(h specs.Hook, s specs.State) error {
+ log.Debugf("Executing hook %+v, state: %+v", h, s)
+
+ if strings.TrimSpace(h.Path) == "" {
+ return fmt.Errorf("empty path for hook")
+ }
+ if !filepath.IsAbs(h.Path) {
+ return fmt.Errorf("path for hook is not absolute: %q", h.Path)
+ }
+
+ b, err := json.Marshal(s)
+ if err != nil {
+ return err
+ }
+ var stdout, stderr bytes.Buffer
+ cmd := exec.Cmd{
+ Path: h.Path,
+ Args: h.Args,
+ Env: h.Env,
+ Stdin: bytes.NewReader(b),
+ Stdout: &stdout,
+ Stderr: &stderr,
+ }
+ if err := cmd.Start(); err != nil {
+ return err
+ }
+
+ c := make(chan error, 1)
+ go func() {
+ c <- cmd.Wait()
+ }()
+
+ var timer <-chan time.Time
+ if h.Timeout != nil {
+ timer = time.After(time.Duration(*h.Timeout) * time.Second)
+ }
+ select {
+ case err := <-c:
+ if err != nil {
+ return fmt.Errorf("failure executing hook %q, err: %v\nstdout: %s\nstderr: %s", h.Path, err, stdout.String(), stderr.String())
+ }
+ case <-timer:
+ cmd.Process.Kill()
+ cmd.Wait()
+ return fmt.Errorf("timeout executing hook %q\nstdout: %s\nstderr: %s", h.Path, stdout.String(), stderr.String())
+ }
+
+ log.Debugf("Execute hook %q success!", h.Path)
+ return nil
+}
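+
+// A minimal sketch of driving these helpers with a single prestart hook,
+// assuming a hook binary exists at /usr/bin/true and s is the container state
+// (for example from Container.State()):
+//
+//    timeout := 5 // seconds
+//    h := specs.Hook{Path: "/usr/bin/true", Args: []string{"true"}, Timeout: &timeout}
+//    if err := executeHooks([]specs.Hook{h}, s); err != nil {
+//        // the hook failed or timed out; stop and destroy the container
+//    }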
diff --git a/runsc/container/status.go b/runsc/container/status.go
new file mode 100644
index 000000000..91d9112f1
--- /dev/null
+++ b/runsc/container/status.go
@@ -0,0 +1,60 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package container
+
+// Status enumerates container statuses. The statuses and their semantics are
+// part of the runtime CLI spec.
+type Status int
+
+const (
+ // Created indicates "the runtime has finished the create operation and
+ // the container process has neither exited nor executed the
+ // user-specified program".
+ Created Status = iota
+
+ // Creating indicates "the container is being created".
+ Creating
+
+ // Paused indicates that the process within the container has been
+ // suspended.
+ Paused
+
+ // Running indicates "the container process has executed the
+ // user-specified program but has not exited".
+ Running
+
+ // Stopped indicates "the container process has exited".
+ Stopped
+)
+
+// String converts a Status to a string. These strings are part of the runtime
+// CLI spec and should not be changed.
+func (s Status) String() string {
+ switch s {
+ case Created:
+ return "created"
+ case Creating:
+ return "creating"
+ case Paused:
+ return "paused"
+ case Running:
+ return "running"
+ case Stopped:
+ return "stopped"
+ default:
+ return "unknown"
+ }
+
+}