diff options
Diffstat (limited to 'runsc/sandbox/sandbox.go')
-rw-r--r-- | runsc/sandbox/sandbox.go | 666 |
1 files changed, 666 insertions, 0 deletions
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go new file mode 100644 index 000000000..b2fa1d58e --- /dev/null +++ b/runsc/sandbox/sandbox.go @@ -0,0 +1,666 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sandbox creates and manipulates sandboxes. +package sandbox + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "regexp" + "strconv" + "syscall" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/control/client" + "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/urpc" + "gvisor.googlesource.com/gvisor/runsc/boot" + "gvisor.googlesource.com/gvisor/runsc/specutils" +) + +// metadataFilename is the name of the metadata file relative to sandboxRoot +// that holds sandbox metadata. +const metadataFilename = "meta.json" + +// See libcontainer/factory_linux.go +var idRegex = regexp.MustCompile(`^[\w+-\.]+$`) + +// validateID validates the sandbox id. +func validateID(id string) error { + if !idRegex.MatchString(id) { + return fmt.Errorf("invalid sandbox id: %v", id) + } + return nil +} + +// Sandbox wraps a child sandbox process, and is responsible for saving and +// loading sandbox metadata to disk. +// +// Within a root directory, we maintain subdirectories for each sandbox named +// with the sandbox id. The sandbox metadata is is stored as json within the +// sandbox directoy in a file named "meta.json". This metadata format is +// defined by us, and is not part of the OCI spec. +// +// Sandboxes must write this metadata file after any change to their internal +// state. The entire sandbox directory is deleted when the sandbox is +// destroyed. +// +// TODO: Protect against concurrent changes to the sandbox metadata +// file. +type Sandbox struct { + // ID is the sandbox ID. + ID string `json:"id"` + + // Spec is the OCI runtime spec that configures this sandbox. + Spec *specs.Spec `json:"spec"` + + // BundleDir is the directory containing the sandbox bundle. + BundleDir string `json:"bundleDir"` + + // SandboxRoot is the directory containing the sandbox metadata file. + SandboxRoot string `json:"sandboxRoot"` + + // CreatedAt is the time the sandbox was created. + CreatedAt time.Time `json:"createdAt"` + + // Owner is the sandbox owner. + Owner string `json:"owner"` + + // ConsoleSocket is the path to a unix domain socket that will receive + // the console FD. It is only used during create, so we don't need to + // store it in the metadata. + ConsoleSocket string `json:"-"` + + // Pid is the pid of the running sandbox. Only valid if Status is + // Created or Running. + Pid int `json:"pid"` + + // GoferPid is the pid of the gofer running along side the sandbox. May be 0 + // if the gofer has been killed or it's not being used. + GoferPid int `json:"goferPid"` + + // Status is the current sandbox Status. + Status Status `json:"status"` +} + +// Create creates the sandbox subprocess and writes the metadata file. Args +// are additional arguments that will be passed to the sandbox process. +func Create(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, args []string) (*Sandbox, error) { + log.Debugf("Create sandbox %q in root dir: %s", id, conf.RootDir) + if err := validateID(id); err != nil { + return nil, err + } + + sandboxRoot := filepath.Join(conf.RootDir, id) + if exists(sandboxRoot) { + return nil, fmt.Errorf("sandbox with id %q already exists: %q ", id, sandboxRoot) + } + + s := &Sandbox{ + ID: id, + Spec: spec, + ConsoleSocket: consoleSocket, + BundleDir: bundleDir, + SandboxRoot: sandboxRoot, + Status: Creating, + Owner: os.Getenv("USER"), + } + + // Create sandbox process. If anything errors between now and the end of this + // function, we MUST clean up all sandbox resources. + if err := s.createProcesses(conf, args); err != nil { + s.Destroy() + return nil, err + } + + // Wait for the control server to come up (or timeout). The sandbox is + // not "created" until that happens. + if err := s.waitForCreated(10 * time.Second); err != nil { + s.Destroy() + return nil, err + } + + s.Status = Created + s.CreatedAt = time.Now() + + // Save the metadata file. + if err := s.save(); err != nil { + s.Destroy() + return nil, err + } + + // Write the pid file. Containerd consideres the create complete after + // this file is created, so it must be the last thing we do. + if pidFile != "" { + if err := ioutil.WriteFile(pidFile, []byte(strconv.Itoa(s.Pid)), 0644); err != nil { + s.Destroy() + return nil, fmt.Errorf("error writing pid file: %v", err) + } + } + + return s, nil +} + +// Run is a helper that calls Create + Start + Wait. +func Run(id string, spec *specs.Spec, conf *boot.Config, bundleDir, consoleSocket, pidFile string, args []string) (syscall.WaitStatus, error) { + s, err := Create(id, spec, conf, bundleDir, consoleSocket, pidFile, args) + if err != nil { + return 0, fmt.Errorf("error creating sandbox: %v", err) + } + if err := s.Start(conf); err != nil { + return 0, fmt.Errorf("error starting sandbox: %v", err) + } + return s.Wait() +} + +// Load loads a sandbox from with the given id from a metadata file. +func Load(rootDir, id string) (*Sandbox, error) { + log.Debugf("Load sandbox %q %q", rootDir, id) + if err := validateID(id); err != nil { + return nil, err + } + sandboxRoot := filepath.Join(rootDir, id) + if !exists(sandboxRoot) { + return nil, fmt.Errorf("sandbox with id %q does not exist", id) + } + metaFile := filepath.Join(sandboxRoot, metadataFilename) + if !exists(metaFile) { + return nil, fmt.Errorf("sandbox with id %q does not have metadata file %q", id, metaFile) + } + metaBytes, err := ioutil.ReadFile(metaFile) + if err != nil { + return nil, fmt.Errorf("error reading sandbox metadata file %q: %v", metaFile, err) + } + var s Sandbox + if err := json.Unmarshal(metaBytes, &s); err != nil { + return nil, fmt.Errorf("error unmarshaling sandbox metadata from %q: %v", metaFile, err) + } + + // If the status is "Running" or "Created", check that the process + // still exists, and set it to Stopped if it does not. + // + // This is inherintly racey. + if s.Status == Running || s.Status == Created { + // Send signal 0 to check if process exists. + if err := s.Signal(0); err != nil { + // Process no longer exists. + s.Status = Stopped + s.Pid = 0 + } + } + + return &s, nil +} + +// List returns all sandbox ids in the given root directory. +func List(rootDir string) ([]string, error) { + log.Debugf("List sandboxes %q", rootDir) + fs, err := ioutil.ReadDir(rootDir) + if err != nil { + return nil, fmt.Errorf("ReadDir(%s) failed: %v", rootDir, err) + } + var out []string + for _, f := range fs { + out = append(out, f.Name()) + } + return out, nil +} + +// State returns the metadata of the sandbox. +func (s *Sandbox) State() specs.State { + return specs.State{ + Version: specs.Version, + ID: s.ID, + Status: s.Status.String(), + Pid: s.Pid, + Bundle: s.BundleDir, + } +} + +// Start starts running the containerized process inside the sandbox. +func (s *Sandbox) Start(conf *boot.Config) error { + log.Debugf("Start sandbox %q, pid: %d", s.ID, s.Pid) + if s.Status != Created { + return fmt.Errorf("cannot start container in state %s", s.Status) + } + + // "If any prestart hook fails, the runtime MUST generate an error, + // stop and destroy the container". + if s.Spec.Hooks != nil { + if err := executeHooks(s.Spec.Hooks.Prestart, s.State()); err != nil { + s.Destroy() + return err + } + } + + c, err := s.connect() + if err != nil { + s.Destroy() + return err + } + defer c.Close() + + // Configure the network. + if err := setupNetwork(c, s.Pid, s.Spec, conf); err != nil { + s.Destroy() + return fmt.Errorf("error setting up network: %v", err) + } + + // Send a message to the sandbox control server to start the + // application. + if err := c.Call(boot.ApplicationStart, nil, nil); err != nil { + s.Destroy() + return fmt.Errorf("error starting sandbox: %v", err) + } + + // "If any poststart hook fails, the runtime MUST log a warning, but + // the remaining hooks and lifecycle continue as if the hook had + // succeeded". + if s.Spec.Hooks != nil { + executeHooksBestEffort(s.Spec.Hooks.Poststart, s.State()) + } + + s.Status = Running + return s.save() +} + +// Processes retrieves the list of processes and associated metadata inside a +// sandbox. +func (s *Sandbox) Processes() ([]*control.Process, error) { + if s.Status != Running { + return nil, fmt.Errorf("cannot get processes of container %q because it isn't running. It is in state %v", s.ID, s.Status) + } + + c, err := s.connect() + if err != nil { + return nil, err + } + defer c.Close() + + var pl []*control.Process + if err := c.Call(boot.ApplicationProcesses, nil, &pl); err != nil { + return nil, fmt.Errorf("error retrieving process data from sandbox: %v", err) + } + return pl, nil +} + +// Execute runs the specified command in the sandbox. +func (s *Sandbox) Execute(e *control.ExecArgs) (syscall.WaitStatus, error) { + log.Debugf("Execute in sandbox %q, pid: %d, args: %+v", s.ID, s.Pid, e) + if s.Status != Created && s.Status != Running { + return 0, fmt.Errorf("cannot exec in container in state %s", s.Status) + } + + log.Debugf("Connecting to sandbox...") + c, err := s.connect() + if err != nil { + return 0, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) + } + defer c.Close() + + // Send a message to the sandbox control server to start the application. + var waitStatus uint32 + if err := c.Call(boot.ApplicationExecute, e, &waitStatus); err != nil { + return 0, fmt.Errorf("error executing in sandbox: %v", err) + } + + return syscall.WaitStatus(waitStatus), nil +} + +// Event retrieves stats about the sandbox such as memory and CPU utilization. +func (s *Sandbox) Event() (*boot.Event, error) { + if s.Status != Running && s.Status != Created { + return nil, fmt.Errorf("cannot get events for container in state: %s", s.Status) + } + + c, err := s.connect() + if err != nil { + return nil, err + } + defer c.Close() + + var e boot.Event + if err := c.Call(boot.ApplicationEvent, nil, &e); err != nil { + return nil, fmt.Errorf("error retrieving event data from sandbox: %v", err) + } + e.ID = s.ID + return &e, nil +} + +func (s *Sandbox) connect() (*urpc.Client, error) { + log.Debugf("Connecting to sandbox...") + c, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)) + if err != nil { + return nil, fmt.Errorf("error connecting to control server at pid %d: %v", s.Pid, err) + } + return c, nil +} + +func (s *Sandbox) createProcesses(conf *boot.Config, args []string) error { + binPath, err := specutils.BinPath() + if err != nil { + return err + } + + ioFiles, err := s.createGoferProcess(conf, binPath, args) + if err != nil { + return err + } + return s.createSandboxProcess(conf, binPath, args, ioFiles) +} + +func (s *Sandbox) createGoferProcess(conf *boot.Config, binPath string, commonArgs []string) ([]*os.File, error) { + if conf.FileAccess != boot.FileAccessProxy { + // Don't start a gofer. The sandbox will access host FS directly. + return nil, nil + } + + var args []string + args = append(args, commonArgs...) + args = append(args, "gofer", "--bundle", s.BundleDir) + + // Start with root mount and then add any other additional mount. + mountCount := 1 + for _, m := range s.Spec.Mounts { + if specutils.Is9PMount(m) { + mountCount++ + } + } + + sandEnds := make([]*os.File, 0, mountCount) + goferEnds := make([]*os.File, 0, mountCount) + for i := 0; i < mountCount; i++ { + // Create socket that connects the sandbox and gofer. + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + return nil, err + } + sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox io fd")) + + goferEnd := os.NewFile(uintptr(fds[1]), "gofer io fd") + defer goferEnd.Close() + goferEnds = append(goferEnds, goferEnd) + + args = append(args, fmt.Sprintf("--io-fds=%d", 3+i)) + } + + cmd := exec.Command(binPath, args...) + cmd.ExtraFiles = goferEnds + + // Setup any uid/gid mappings, and create or join the configured user + // namespace so the gofer's view of the filesystem aligns with the + // users in the sandbox. + setUIDGIDMappings(cmd, s.Spec) + nss := filterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, s.Spec) + + // Start the gofer in the given namespace. + log.Debugf("Starting gofer: %s %v", binPath, args) + if err := startInNS(cmd, nss); err != nil { + return nil, err + } + s.GoferPid = cmd.Process.Pid + log.Infof("Gofer started, pid: %d", cmd.Process.Pid) + return sandEnds, nil +} + +// createSandboxProcess starts the sandbox as a subprocess by running the "boot" +// command, passing in the bundle dir. +func (s *Sandbox) createSandboxProcess(conf *boot.Config, binPath string, commonArgs []string, ioFiles []*os.File) error { + // nextFD is used to get unused FDs that we can pass to the sandbox. It + // starts at 3 because 0, 1, and 2 are taken by stdin/out/err. + nextFD := 3 + + // Create control server socket here and donate FD to child process because + // it may be in a different network namespace and won't be reachable from + // outside. + fd, err := server.CreateSocket(boot.ControlSocketAddr(s.ID)) + if err != nil { + return fmt.Errorf("error creating control server socket for sandbox %q: %v", s.ID, err) + } + + consoleEnabled := s.ConsoleSocket != "" + + cmd := exec.Command(binPath, commonArgs...) + cmd.SysProcAttr = &syscall.SysProcAttr{} + cmd.Args = append(cmd.Args, + "boot", + "--bundle", s.BundleDir, + "--controller-fd="+strconv.Itoa(nextFD), + fmt.Sprintf("--console=%t", consoleEnabled)) + nextFD++ + + controllerFile := os.NewFile(uintptr(fd), "control_server_socket") + defer controllerFile.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile) + + // If there is a gofer, sends all socket ends to the sandbox. + for _, f := range ioFiles { + defer f.Close() + cmd.ExtraFiles = append(cmd.ExtraFiles, f) + cmd.Args = append(cmd.Args, "--io-fds="+strconv.Itoa(nextFD)) + nextFD++ + } + + // If the console control socket file is provided, then create a new + // pty master/slave pair and set the tty on the sandox process. + if consoleEnabled { + // setupConsole will send the master on the socket, and return + // the slave. + tty, err := setupConsole(s.ConsoleSocket) + if err != nil { + return fmt.Errorf("error setting up control socket %q: %v", s.ConsoleSocket, err) + } + defer tty.Close() + + cmd.Stdin = tty + cmd.Stdout = tty + cmd.Stderr = tty + cmd.SysProcAttr.Setctty = true + cmd.SysProcAttr.Ctty = int(tty.Fd()) + } else { + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + + // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT + // when re-parented. + cmd.SysProcAttr.Setsid = true + + // nss is the set of namespaces to join or create before starting the sandbox + // process. IPC and UTS namespaces from the host are not used as they + // are virtualized inside the sandbox. Be paranoid and run inside an empty + // namespace for these. + log.Infof("Sandbox will be started in empty IPC and UTS namespaces") + nss := []specs.LinuxNamespace{ + specs.LinuxNamespace{Type: specs.IPCNamespace}, + specs.LinuxNamespace{Type: specs.UTSNamespace}, + } + + if conf.Platform == boot.PlatformPtrace { + // TODO: Also set an empty PID namespace so that we limit + // access to other host processes. + log.Infof("Sandbox will be started in the current PID namespace") + } else { + log.Infof("Sandbox will be started in empty PID namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) + } + + if conf.FileAccess == boot.FileAccessProxy { + log.Infof("Sandbox will be started in empty mount namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.MountNamespace}) + } else { + log.Infof("Sandbox will be started in the current mount namespace") + } + + // Joins the network namespace if network is enabled. the sandbox talks + // directly to the host network, which may have been configured in the + // namespace. + if ns, ok := getNS(specs.NetworkNamespace, s.Spec); ok && conf.Network != boot.NetworkNone { + log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) + nss = append(nss, ns) + } else { + log.Infof("Sandbox will be started in empty network namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace}) + } + + // User namespace depends on the following options: + // - Host network/filesystem: requires to run inside the user namespace + // specified in the spec or the current namespace if none is configured. + // - Gofer: when using a Gofer, the sandbox process can run isolated in an + // empty namespace. + if conf.Network == boot.NetworkHost || conf.FileAccess == boot.FileAccessDirect { + if userns, ok := getNS(specs.UserNamespace, s.Spec); ok { + log.Infof("Sandbox will be started in container's user namespace: %+v", userns) + nss = append(nss, userns) + setUIDGIDMappings(cmd, s.Spec) + } else { + // TODO: Retrict capabilities since it's using current user + // namespace, i.e. root. + log.Infof("Sandbox will be started in the current user namespace") + } + // When running in the caller's defined user namespace, apply the same + // capabilities to the sandbox process to ensure it abides to the same + // rules. + cmd.Args = append(cmd.Args, "--apply-caps=true") + + } else { + log.Infof("Sandbox will be started in empty user namespace") + nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) + } + + log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args) + if err := startInNS(cmd, nss); err != nil { + return err + } + s.Pid = cmd.Process.Pid + log.Infof("Sandbox started, pid: %d", s.Pid) + return nil +} + +// waitForCreated waits for the sandbox subprocess control server to be +// running, at which point the sandbox is in Created state. +func (s *Sandbox) waitForCreated(timeout time.Duration) error { + log.Debugf("Waiting for sandbox %q creation", s.ID) + tchan := time.After(timeout) + for { + select { + case <-tchan: + return fmt.Errorf("timed out waiting for sandbox control server") + default: + if c, err := client.ConnectTo(boot.ControlSocketAddr(s.ID)); err == nil { + // It's alive! + c.Close() + return nil + } + } + } +} + +// Wait waits for the containerized process to exit, and returns its WaitStatus. +func (s *Sandbox) Wait() (syscall.WaitStatus, error) { + log.Debugf("Wait on sandbox %q with pid %d", s.ID, s.Pid) + p, err := os.FindProcess(s.Pid) + if err != nil { + // "On Unix systems, FindProcess always succeeds and returns a + // Process for the given pid." + panic(err) + } + ps, err := p.Wait() + if err != nil { + return 0, err + } + return ps.Sys().(syscall.WaitStatus), nil +} + +// Destroy frees all resources associated with the sandbox. +func (s *Sandbox) Destroy() error { + log.Debugf("Destroy sandbox %q", s.ID) + if s.Pid != 0 { + // TODO: Too harsh? + log.Debugf("Killing sandbox %q", s.ID) + sendSignal(s.Pid, unix.SIGKILL) + s.Pid = 0 + } + if s.GoferPid != 0 { + log.Debugf("Killing gofer for sandbox %q", s.ID) + sendSignal(s.GoferPid, unix.SIGKILL) + s.GoferPid = 0 + } + if err := os.RemoveAll(s.SandboxRoot); err != nil { + log.Warningf("Failed to delete sandbox root directory %q, err: %v", s.SandboxRoot, err) + } + + // "If any poststop hook fails, the runtime MUST log a warning, but the + // remaining hooks and lifecycle continue as if the hook had succeeded". + if s.Spec.Hooks != nil && (s.Status == Created || s.Status == Running) { + executeHooksBestEffort(s.Spec.Hooks.Poststop, s.State()) + } + + s.Status = Stopped + return nil +} + +// Signal sends the signal to the sandbox. +func (s *Sandbox) Signal(sig syscall.Signal) error { + log.Debugf("Signal sandbox %q", s.ID) + if s.Status == Stopped { + log.Warningf("sandbox %q not running, not sending signal %v to pid %d", s.ID, sig, s.Pid) + return nil + } + return sendSignal(s.Pid, sig) +} + +func sendSignal(pid int, sig syscall.Signal) error { + if err := syscall.Kill(pid, sig); err != nil { + return fmt.Errorf("error sending signal %d to pid %d: %v", sig, pid, err) + } + return nil +} + +// save saves the sandbox metadata to a file. +func (s *Sandbox) save() error { + log.Debugf("Save sandbox %q", s.ID) + if err := os.MkdirAll(s.SandboxRoot, 0711); err != nil { + return fmt.Errorf("error creating sandbox root directory %q: %v", s.SandboxRoot, err) + } + meta, err := json.Marshal(s) + if err != nil { + return fmt.Errorf("error marshaling sandbox metadata: %v", err) + } + metaFile := filepath.Join(s.SandboxRoot, metadataFilename) + if err := ioutil.WriteFile(metaFile, meta, 0640); err != nil { + return fmt.Errorf("error writing sandbox metadata: %v", err) + } + return nil +} + +// exists returns true if the given file exists. +func exists(f string) bool { + if _, err := os.Stat(f); err == nil { + return true + } else if !os.IsNotExist(err) { + log.Warningf("error checking for file %q: %v", f, err) + } + return false +} |