diff options
Diffstat (limited to 'runsc/boot/controller.go')
-rw-r--r-- | runsc/boot/controller.go | 491 |
1 files changed, 491 insertions, 0 deletions
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go new file mode 100644 index 000000000..72ab9ef86 --- /dev/null +++ b/runsc/boot/controller.go @@ -0,0 +1,491 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "errors" + "fmt" + "os" + "path" + "syscall" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket" + "gvisor.googlesource.com/gvisor/pkg/sentry/state" + "gvisor.googlesource.com/gvisor/pkg/sentry/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" + "gvisor.googlesource.com/gvisor/pkg/tcpip/stack" + "gvisor.googlesource.com/gvisor/pkg/urpc" +) + +const ( + // ContainerCheckpoint checkpoints a container. + ContainerCheckpoint = "containerManager.Checkpoint" + + // ContainerCreate creates a container. + ContainerCreate = "containerManager.Create" + + // ContainerDestroy is used to stop a non-root container and free all + // associated resources in the sandbox. + ContainerDestroy = "containerManager.Destroy" + + // ContainerEvent is the URPC endpoint for getting stats about the + // container used by "runsc events". + ContainerEvent = "containerManager.Event" + + // ContainerExecuteAsync is the URPC endpoint for executing a command in a + // container.. + ContainerExecuteAsync = "containerManager.ExecuteAsync" + + // ContainerPause pauses the container. + ContainerPause = "containerManager.Pause" + + // ContainerProcesses is the URPC endpoint for getting the list of + // processes running in a container. + ContainerProcesses = "containerManager.Processes" + + // ContainerRestore restores a container from a statefile. + ContainerRestore = "containerManager.Restore" + + // ContainerResume unpauses the paused container. + ContainerResume = "containerManager.Resume" + + // ContainerSignal is used to send a signal to a container. + ContainerSignal = "containerManager.Signal" + + // ContainerSignalProcess is used to send a signal to a particular + // process in a container. + ContainerSignalProcess = "containerManager.SignalProcess" + + // ContainerStart is the URPC endpoint for running a non-root container + // within a sandbox. + ContainerStart = "containerManager.Start" + + // ContainerWait is used to wait on the init process of the container + // and return its ExitStatus. + ContainerWait = "containerManager.Wait" + + // ContainerWaitPID is used to wait on a process with a certain PID in + // the sandbox and return its ExitStatus. + ContainerWaitPID = "containerManager.WaitPID" + + // NetworkCreateLinksAndRoutes is the URPC endpoint for creating links + // and routes in a network stack. + NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes" + + // RootContainerStart is the URPC endpoint for starting a new sandbox + // with root container. + RootContainerStart = "containerManager.StartRoot" + + // SandboxStacks collects sandbox stacks for debugging. + SandboxStacks = "debug.Stacks" + + // Profiling related commands (see pprof.go for more details). + StartCPUProfile = "Profile.StartCPUProfile" + StopCPUProfile = "Profile.StopCPUProfile" + HeapProfile = "Profile.HeapProfile" + StartTrace = "Profile.StartTrace" + StopTrace = "Profile.StopTrace" +) + +// ControlSocketAddr generates an abstract unix socket name for the given ID. +func ControlSocketAddr(id string) string { + return fmt.Sprintf("\x00runsc-sandbox.%s", id) +} + +// controller holds the control server, and is used for communication into the +// sandbox. +type controller struct { + // srv is the control server. + srv *server.Server + + // manager holds the containerManager methods. + manager *containerManager +} + +// newController creates a new controller. The caller must call +// controller.srv.StartServing() to start the controller. +func newController(fd int, l *Loader) (*controller, error) { + srv, err := server.CreateFromFD(fd) + if err != nil { + return nil, err + } + + manager := &containerManager{ + startChan: make(chan struct{}), + startResultChan: make(chan error), + l: l, + } + srv.Register(manager) + + if eps, ok := l.k.NetworkStack().(*epsocket.Stack); ok { + net := &Network{ + Stack: eps.Stack, + } + srv.Register(net) + } + + srv.Register(&debug{}) + if l.conf.ProfileEnable { + srv.Register(&control.Profile{}) + } + + return &controller{ + srv: srv, + manager: manager, + }, nil +} + +// containerManager manages sandboes containers. +type containerManager struct { + // startChan is used to signal when the root container process should + // be started. + startChan chan struct{} + + // startResultChan is used to signal when the root container has + // started. Any errors encountered during startup will be sent to the + // channel. A nil value indicates success. + startResultChan chan error + + // l is the loader that creates containers and sandboxes. + l *Loader +} + +// StartRoot will start the root container process. +func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { + log.Debugf("containerManager.StartRoot %q", *cid) + // Tell the root container to start and wait for the result. + cm.startChan <- struct{}{} + if err := <-cm.startResultChan; err != nil { + return fmt.Errorf("starting sandbox: %v", err) + } + return nil +} + +// Processes retrieves information about processes running in the sandbox. +func (cm *containerManager) Processes(cid *string, out *[]*control.Process) error { + log.Debugf("containerManager.Processes: %q", *cid) + return control.Processes(cm.l.k, *cid, out) +} + +// Create creates a container within a sandbox. +func (cm *containerManager) Create(cid *string, _ *struct{}) error { + log.Debugf("containerManager.Create: %q", *cid) + return cm.l.createContainer(*cid) +} + +// StartArgs contains arguments to the Start method. +type StartArgs struct { + // Spec is the spec of the container to start. + Spec *specs.Spec + + // Config is the runsc-specific configuration for the sandbox. + Conf *Config + + // CID is the ID of the container to start. + CID string + + // FilePayload contains, in order: + // * stdin, stdout, and stderr. + // * the file descriptor over which the sandbox will + // request files from its root filesystem. + urpc.FilePayload +} + +// Start runs a created container within a sandbox. +func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { + log.Debugf("containerManager.Start: %+v", args) + + // Validate arguments. + if args == nil { + return errors.New("start missing arguments") + } + if args.Spec == nil { + return errors.New("start arguments missing spec") + } + if args.Conf == nil { + return errors.New("start arguments missing config") + } + if args.CID == "" { + return errors.New("start argument missing container ID") + } + // Prevent CIDs containing ".." from confusing the sentry when creating + // /containers/<cid> directory. + // TODO(b/129293409): Once we have multiple independent roots, this + // check won't be necessary. + if path.Clean(args.CID) != args.CID { + return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) + } + if len(args.FilePayload.Files) < 4 { + return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer") + } + + err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files) + if err != nil { + log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err) + return err + } + log.Debugf("Container %q started", args.CID) + + return nil +} + +// Destroy stops a container if it is still running and cleans up its +// filesystem. +func (cm *containerManager) Destroy(cid *string, _ *struct{}) error { + log.Debugf("containerManager.destroy %q", *cid) + return cm.l.destroyContainer(*cid) +} + +// ExecuteAsync starts running a command on a created or running sandbox. It +// returns the PID of the new process. +func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error { + log.Debugf("containerManager.ExecuteAsync: %+v", args) + tgid, err := cm.l.executeAsync(args) + if err != nil { + log.Debugf("containerManager.ExecuteAsync failed: %+v: %v", args, err) + return err + } + *pid = int32(tgid) + return nil +} + +// Checkpoint pauses a sandbox and saves its state. +func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { + log.Debugf("containerManager.Checkpoint") + state := control.State{ + Kernel: cm.l.k, + Watchdog: cm.l.watchdog, + } + return state.Save(o, nil) +} + +// Pause suspends a container. +func (cm *containerManager) Pause(_, _ *struct{}) error { + log.Debugf("containerManager.Pause") + cm.l.k.Pause() + return nil +} + +// RestoreOpts contains options related to restoring a container's file system. +type RestoreOpts struct { + // FilePayload contains the state file to be restored, followed by the + // platform device file if necessary. + urpc.FilePayload + + // SandboxID contains the ID of the sandbox. + SandboxID string +} + +// Restore loads a container from a statefile. +// The container's current kernel is destroyed, a restore environment is +// created, and the kernel is recreated with the restore state file. The +// container then sends the signal to start. +func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { + log.Debugf("containerManager.Restore") + + var specFile, deviceFile *os.File + switch numFiles := len(o.FilePayload.Files); numFiles { + case 2: + // The device file is donated to the platform. + // Can't take ownership away from os.File. dup them to get a new FD. + fd, err := syscall.Dup(int(o.FilePayload.Files[1].Fd())) + if err != nil { + return fmt.Errorf("failed to dup file: %v", err) + } + deviceFile = os.NewFile(uintptr(fd), "platform device") + fallthrough + case 1: + specFile = o.FilePayload.Files[0] + case 0: + return fmt.Errorf("at least one file must be passed to Restore") + default: + return fmt.Errorf("at most two files may be passed to Restore") + } + + networkStack := cm.l.k.NetworkStack() + // Destroy the old kernel and create a new kernel. + cm.l.k.Pause() + cm.l.k.Destroy() + + p, err := createPlatform(cm.l.conf, deviceFile) + if err != nil { + return fmt.Errorf("creating platform: %v", err) + } + k := &kernel.Kernel{ + Platform: p, + } + mf, err := createMemoryFile() + if err != nil { + return fmt.Errorf("creating memory file: %v", err) + } + k.SetMemoryFile(mf) + cm.l.k = k + + // Set up the restore environment. + fds := &fdDispenser{fds: cm.l.goferFDs} + renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds) + if err != nil { + return fmt.Errorf("creating RestoreEnvironment: %v", err) + } + fs.SetRestoreEnvironment(*renv) + + // Prepare to load from the state file. + if eps, ok := networkStack.(*epsocket.Stack); ok { + stack.StackFromEnv = eps.Stack // FIXME(b/36201077) + } + info, err := specFile.Stat() + if err != nil { + return err + } + if info.Size() == 0 { + return fmt.Errorf("file cannot be empty") + } + + // Load the state. + loadOpts := state.LoadOpts{Source: specFile} + if err := loadOpts.Load(k, networkStack); err != nil { + return err + } + + // Set timekeeper. + k.Timekeeper().SetClocks(time.NewCalibratedClocks()) + + // Since we have a new kernel we also must make a new watchdog. + watchdog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction) + + // Change the loader fields to reflect the changes made when restoring. + cm.l.k = k + cm.l.watchdog = watchdog + cm.l.rootProcArgs = kernel.CreateProcessArgs{} + cm.l.restore = true + + // Reinitialize the sandbox ID and processes map. Note that it doesn't + // restore the state of multiple containers, nor exec processes. + cm.l.sandboxID = o.SandboxID + cm.l.mu.Lock() + eid := execID{cid: o.SandboxID} + cm.l.processes = map[execID]*execProcess{ + eid: { + tg: cm.l.k.GlobalInit(), + }, + } + cm.l.mu.Unlock() + + // Tell the root container to start and wait for the result. + cm.startChan <- struct{}{} + if err := <-cm.startResultChan; err != nil { + return fmt.Errorf("starting sandbox: %v", err) + } + + return nil +} + +// Resume unpauses a container. +func (cm *containerManager) Resume(_, _ *struct{}) error { + log.Debugf("containerManager.Resume") + cm.l.k.Unpause() + return nil +} + +// Wait waits for the init process in the given container. +func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { + log.Debugf("containerManager.Wait") + err := cm.l.waitContainer(*cid, waitStatus) + log.Debugf("containerManager.Wait returned, waitStatus: %v: %v", waitStatus, err) + return err +} + +// WaitPIDArgs are arguments to the WaitPID method. +type WaitPIDArgs struct { + // PID is the PID in the container's PID namespace. + PID int32 + + // CID is the container ID. + CID string + + // ClearStatus determines whether the exit status of the process should + // be cleared when WaitPID returns. + ClearStatus bool +} + +// WaitPID waits for the process with PID 'pid' in the sandbox. +func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error { + log.Debugf("containerManager.Wait") + return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, args.ClearStatus, waitStatus) +} + +// SignalDeliveryMode enumerates different signal delivery modes. +type SignalDeliveryMode int + +const ( + // DeliverToProcess delivers the signal to the container process with + // the specified PID. If PID is 0, then the container init process is + // signaled. + DeliverToProcess SignalDeliveryMode = iota + + // DeliverToAllProcesses delivers the signal to all processes in the + // container. PID must be 0. + DeliverToAllProcesses + + // DeliverToForegroundProcessGroup delivers the signal to the + // foreground process group in the same TTY session as the specified + // process. If PID is 0, then the signal is delivered to the foreground + // process group for the TTY for the init process. + DeliverToForegroundProcessGroup +) + +func (s SignalDeliveryMode) String() string { + switch s { + case DeliverToProcess: + return "Process" + case DeliverToAllProcesses: + return "All" + case DeliverToForegroundProcessGroup: + return "Foreground Process Group" + } + return fmt.Sprintf("unknown signal delivery mode: %d", s) +} + +// SignalArgs are arguments to the Signal method. +type SignalArgs struct { + // CID is the container ID. + CID string + + // Signo is the signal to send to the process. + Signo int32 + + // PID is the process ID in the given container that will be signaled. + // If 0, the root container will be signalled. + PID int32 + + // Mode is the signal delivery mode. + Mode SignalDeliveryMode +} + +// Signal sends a signal to one or more processes in a container. If args.PID +// is 0, then the container init process is used. Depending on the +// args.SignalDeliveryMode option, the signal may be sent directly to the +// indicated process, to all processes in the container, or to the foreground +// process group. +func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { + log.Debugf("containerManager.Signal %+v", args) + return cm.l.signal(args.CID, args.PID, args.Signo, args.Mode) +} |