From c05660373e8bda36ddf5181220c76f4327f2abc6 Mon Sep 17 00:00:00 2001 From: Justine Olshan Date: Wed, 18 Jul 2018 16:57:29 -0700 Subject: Moved restore code out of create and made to be called after create. Docker expects containers to be created before they are restored. However, gVisor restoring requires specific actions regarding the kernel and the file system. These actions were originally in booting the sandbox. Now setting up the file system is deferred until a call to runsc start. In the restore case, the kernel is destroyed and a new kernel is created in the same process, as we need the same process for Docker. These changes required careful execution of concurrent processes which required the use of a channel. Full docker integration still needs the ability to restore into the same container. PiperOrigin-RevId: 205161441 Change-Id: Ie1d2304ead7e06855319d5dc310678f701bd099f --- runsc/boot/controller.go | 131 ++++++++++++++++++++++----- runsc/boot/events.go | 4 +- runsc/boot/fs.go | 45 ++++++++++ runsc/boot/loader.go | 222 +++++++++++++++++++--------------------- runsc/boot/loader_test.go | 3 +- 5 files changed, 253 insertions(+), 152 deletions(-) (limited to 'runsc/boot') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index ff75a382e..c6e934e66 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -23,9 +23,13 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket" + "gvisor.googlesource.com/gvisor/pkg/sentry/state" + "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" + "gvisor.googlesource.com/gvisor/pkg/urpc" ) const ( @@ -47,9 +51,15 @@ const ( // processes running in a container. 
ContainerProcesses = "containerManager.Processes" + // ContainerRestore restores a container from a statefile. + ContainerRestore = "containerManager.Restore" + // ContainerResume unpauses the paused container. ContainerResume = "containerManager.Resume" + // ContainerWaitForLoader blocks until the container's loader has been created. + ContainerWaitForLoader = "containerManager.WaitForLoader" + // ContainerSignal is used to send a signal to a container. ContainerSignal = "containerManager.Signal" @@ -85,7 +95,7 @@ func ControlSocketAddr(id string) string { // controller holds the control server, and is used for communication into the // sandbox. type controller struct { - // srv is the contorl server. + // srv is the control server. srv *server.Server // manager holds the containerManager methods. @@ -100,10 +110,9 @@ func newController(fd int, k *kernel.Kernel, w *watchdog.Watchdog) (*controller, } manager := &containerManager{ - startChan: make(chan struct{}), - startResultChan: make(chan error), - k: k, - watchdog: w, + startChan: make(chan struct{}), + startResultChan: make(chan error), + loaderCreatedChan: make(chan struct{}), } srv.Register(manager) @@ -137,15 +146,13 @@ type containerManager struct { // channel. A nil value indicates success. startResultChan chan error - // k is the emulated linux kernel on which the sandboxed - // containers run. - k *kernel.Kernel - - // watchdog is the kernel watchdog. - watchdog *watchdog.Watchdog - // l is the loader that creates containers and sandboxes. l *Loader + + // loaderCreatedChan is used to signal when the loader has been created. + // After a loader is created, a notify method is called that writes to + // this channel. + loaderCreatedChan chan struct{} } // StartRoot will start the root container process. @@ -160,7 +167,7 @@ func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error { // Processes retrieves information about processes running in the sandbox. 
func (cm *containerManager) Processes(_, out *[]*control.Process) error { log.Debugf("containerManager.Processes") - return control.Processes(cm.k, out) + return control.Processes(cm.l.k, out) } // StartArgs contains arguments to the Start method. @@ -194,7 +201,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { return errors.New("start argument missing container ID") } - tgid, err := cm.l.startContainer(args, cm.k) + tgid, err := cm.l.startContainer(args, cm.l.k) if err != nil { return err } @@ -206,7 +213,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { // Execute runs a command on a created or running sandbox. func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) error { log.Debugf("containerManager.Execute") - proc := control.Proc{Kernel: cm.k} + proc := control.Proc{Kernel: cm.l.k} if err := proc.Exec(e, waitStatus); err != nil { return fmt.Errorf("error executing: %+v: %v", e, err) } @@ -217,21 +224,105 @@ func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) err func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { log.Debugf("containerManager.Checkpoint") state := control.State{ - Kernel: cm.k, - Watchdog: cm.watchdog, + Kernel: cm.l.k, + Watchdog: cm.l.watchdog, } return state.Save(o, nil) } // Pause suspends a container. func (cm *containerManager) Pause(_, _ *struct{}) error { - cm.k.Pause() + cm.l.k.Pause() return nil } +// WaitForLoader blocks until the container's loader has been created. +func (cm *containerManager) WaitForLoader(_, _ *struct{}) error { + log.Debugf("containerManager.WaitForLoader") + <-cm.loaderCreatedChan + return nil +} + +// RestoreOpts contains options related to restoring a container's file system. +type RestoreOpts struct { + // FilePayload contains the state file to be restored. + urpc.FilePayload + + // SandboxID contains the ID of the sandbox. 
+ SandboxID string +} + +// Restore loads a container from a statefile. +// The container's current kernel is destroyed, a restore environment is created, +// and the kernel is recreated with the restore state file. The container then sends the +// signal to start. +func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { + log.Debugf("containerManager.Restore") + if len(o.FilePayload.Files) != 1 { + return fmt.Errorf("exactly one file must be provided") + } + defer o.FilePayload.Files[0].Close() + + // Destroy the old kernel and create a new kernel. + cm.l.k.Pause() + cm.l.k.Destroy() + + p, err := createPlatform(cm.l.conf) + if err != nil { + return fmt.Errorf("error creating platform: %v", err) + } + k := &kernel.Kernel{ + Platform: p, + } + cm.l.k = k + + // Set up the restore environment. + fds := &fdDispenser{fds: cm.l.ioFDs} + renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds) + if err != nil { + return fmt.Errorf("error creating RestoreEnvironment: %v", err) + } + fs.SetRestoreEnvironment(*renv) + + // Prepare to load from the state file. + networkStack := newEmptyNetworkStack(cm.l.conf, k) + info, err := o.FilePayload.Files[0].Stat() + if err != nil { + return err + } + if info.Size() == 0 { + return fmt.Errorf("error file was empty") + } + + // Load the state. + loadOpts := state.LoadOpts{ + Source: o.FilePayload.Files[0], + } + if err := loadOpts.Load(k, p, networkStack); err != nil { + return err + } + + // Set timekeeper. + k.Timekeeper().SetClocks(time.NewCalibratedClocks()) + + // Since we have a new kernel we also must make a new watchdog. + watchdog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction) + + // Change the loader fields to reflect the changes made when restoring. + cm.l.k = k + cm.l.watchdog = watchdog + cm.l.rootProcArgs = kernel.CreateProcessArgs{} + cm.l.setRootContainerID(o.SandboxID) + cm.l.restore = true + + // Tell the root container to start and wait for the result. 
+ cm.startChan <- struct{}{} + return <-cm.startResultChan +} + // Resume unpauses a container. func (cm *containerManager) Resume(_, _ *struct{}) error { - cm.k.Unpause() + cm.l.k.Unpause() return nil } @@ -272,7 +363,7 @@ func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { // process in theat container. Currently we just signal PID 1 in the // sandbox. si := arch.SignalInfo{Signo: args.Signo} - t := cm.k.TaskSet().Root.TaskWithID(1) + t := cm.l.k.TaskSet().Root.TaskWithID(1) if t == nil { return fmt.Errorf("cannot signal: no task with id 1") } diff --git a/runsc/boot/events.go b/runsc/boot/events.go index 0eb75c14c..832339cf4 100644 --- a/runsc/boot/events.go +++ b/runsc/boot/events.go @@ -62,8 +62,8 @@ type Memory struct { // Event gets the events from the container. func (cm *containerManager) Event(_ *struct{}, out *Event) error { stats := &Stats{} - stats.populateMemory(cm.k) - stats.populatePIDs(cm.k) + stats.populateMemory(cm.l.k) + stats.populatePIDs(cm.l.k) *out = Event{Type: "stats", Data: stats} return nil } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 51c8d620d..e596c739f 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -27,6 +27,9 @@ import ( _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/abi/linux" @@ -563,3 +566,45 @@ func subtargets(root string, mnts []specs.Mount) []string { } return targets } + +// setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly. +// procArgs are passed by reference and the FDMap field is modified. 
+func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel) error { + ctx := procArgs.NewContext(k) + + // Create the FD map, which will set stdin, stdout, and stderr. If + // console is true, then ioctl calls will be passed through to the host + // fd. + fdm, err := createFDMap(ctx, k, ls, console) + if err != nil { + return fmt.Errorf("error importing fds: %v", err) + } + + // CreateProcess takes a reference on FDMap if successful. We + // won't need ours either way. + procArgs.FDMap = fdm + + // If this is the root container, we also need to setup the root mount + // namespace. + if k.RootMountNamespace() == nil { + // Use root user to configure mounts. The current user might not have + // permission to do so. + rootProcArgs := kernel.CreateProcessArgs{ + WorkingDirectory: "/", + Credentials: auth.NewRootCredentials(creds.UserNamespace), + Umask: 0022, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + } + rootCtx := rootProcArgs.NewContext(k) + + // Create the virtual filesystem. 
+ mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) + if err != nil { + return fmt.Errorf("error creating mounts: %v", err) + } + + k.SetRootMountNamespace(mns) + } + + return nil +} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 706910d8a..66394cdf8 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -19,7 +19,6 @@ import ( "errors" "fmt" "math/rand" - "os" "runtime" "sync" "sync/atomic" @@ -29,7 +28,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/cpuid" "gvisor.googlesource.com/gvisor/pkg/log" - "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/inet" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" @@ -38,7 +36,6 @@ import ( "gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm" "gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace" "gvisor.googlesource.com/gvisor/pkg/sentry/sighandling" - "gvisor.googlesource.com/gvisor/pkg/sentry/state" slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/time" "gvisor.googlesource.com/gvisor/pkg/sentry/watchdog" @@ -77,6 +74,12 @@ type Loader struct { watchdog *watchdog.Watchdog + // ioFDs are the FDs that attach the sandbox to the gofers. + ioFDs []int + + // spec is the base configuration for the root container. + spec *specs.Spec + // stopSignalForwarding disables forwarding of signals to the sandboxed // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() @@ -111,16 +114,7 @@ func init() { // New initializes a new kernel loader configured by spec. // New also handles setting up a kernel for restoring a container. 
-func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []int, console bool) (*Loader, error) { - var ( - tk *kernel.Timekeeper - creds *auth.Credentials - vdso *loader.VDSO - utsns *kernel.UTSNamespace - ipcns *kernel.IPCNamespace - restoreFile *os.File - procArgs kernel.CreateProcessArgs - ) +func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console bool) (*Loader, error) { // Create kernel and platform. p, err := createPlatform(conf) if err != nil { @@ -130,60 +124,47 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in Platform: p, } - if restoreFD == -1 { - // Create VDSO. - // - // Pass k as the platform since it is savable, unlike the actual platform. - vdso, err = loader.PrepareVDSO(k) - if err != nil { - return nil, fmt.Errorf("error creating vdso: %v", err) - } + // Create VDSO. + // + // Pass k as the platform since it is savable, unlike the actual platform. + vdso, err := loader.PrepareVDSO(k) + if err != nil { + return nil, fmt.Errorf("error creating vdso: %v", err) + } - // Create timekeeper. - tk, err = kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) - if err != nil { - return nil, fmt.Errorf("error creating timekeeper: %v", err) - } - tk.SetClocks(time.NewCalibratedClocks()) + // Create timekeeper. + tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) + if err != nil { + return nil, fmt.Errorf("error creating timekeeper: %v", err) + } + tk.SetClocks(time.NewCalibratedClocks()) - // Create capabilities. - caps, err := specutils.Capabilities(spec.Process.Capabilities) - if err != nil { - return nil, fmt.Errorf("error creating capabilities: %v", err) - } + // Create capabilities. + caps, err := specutils.Capabilities(spec.Process.Capabilities) + if err != nil { + return nil, fmt.Errorf("error creating capabilities: %v", err) + } - // Convert the spec's additional GIDs to KGIDs. 
- extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) - for _, GID := range spec.Process.User.AdditionalGids { - extraKGIDs = append(extraKGIDs, auth.KGID(GID)) - } + // Convert the spec's additional GIDs to KGIDs. + extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) + for _, GID := range spec.Process.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } - // Create credentials. - creds = auth.NewUserCredentials( - auth.KUID(spec.Process.User.UID), - auth.KGID(spec.Process.User.GID), - extraKGIDs, - caps, - auth.NewRootUserNamespace()) + // Create credentials. + creds := auth.NewUserCredentials( + auth.KUID(spec.Process.User.UID), + auth.KGID(spec.Process.User.GID), + extraKGIDs, + caps, + auth.NewRootUserNamespace()) - // Create user namespace. - // TODO: Not clear what domain name should be here. It is - // not configurable from runtime spec. - utsns = kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace) + // Create user namespace. + // TODO: Not clear what domain name should be here. It is + // not configurable from runtime spec. + utsns := kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace) - ipcns = kernel.NewIPCNamespace(creds.UserNamespace) - } else { - // Create and set RestoreEnvironment - fds := &fdDispenser{fds: ioFDs} - renv, err := createRestoreEnvironment(spec, conf, fds) - if err != nil { - return nil, fmt.Errorf("error creating RestoreEnvironment: %v", err) - } - fs.SetRestoreEnvironment(*renv) - - restoreFile = os.NewFile(uintptr(restoreFD), "restore_file") - defer restoreFile.Close() - } + ipcns := kernel.NewIPCNamespace(creds.UserNamespace) if err := enableStrace(conf); err != nil { return nil, fmt.Errorf("failed to enable strace: %v", err) @@ -195,33 +176,20 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in // Run(). 
networkStack := newEmptyNetworkStack(conf, k) - if restoreFile == nil { - // Initiate the Kernel object, which is required by the Context passed - // to createVFS in order to mount (among other things) procfs. - if err = k.Init(kernel.InitKernelArgs{ - FeatureSet: cpuid.HostFeatureSet(), - Timekeeper: tk, - RootUserNamespace: creds.UserNamespace, - NetworkStack: networkStack, - // TODO: use number of logical processors from cgroups. - ApplicationCores: uint(runtime.NumCPU()), - Vdso: vdso, - RootUTSNamespace: utsns, - RootIPCNamespace: ipcns, - }); err != nil { - return nil, fmt.Errorf("error initializing kernel: %v", err) - } - } else { - // Load the state. - loadOpts := state.LoadOpts{ - Source: restoreFile, - } - if err := loadOpts.Load(k, p, networkStack); err != nil { - return nil, err - } - - // Set timekeeper. - k.Timekeeper().SetClocks(time.NewCalibratedClocks()) + // Initiate the Kernel object, which is required by the Context passed + // to createVFS in order to mount (among other things) procfs. + if err = k.Init(kernel.InitKernelArgs{ + FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, + RootUserNamespace: creds.UserNamespace, + NetworkStack: networkStack, + // TODO: use number of logical processors from cgroups. + ApplicationCores: uint(runtime.NumCPU()), + Vdso: vdso, + RootUTSNamespace: utsns, + RootIPCNamespace: ipcns, + }); err != nil { + return nil, fmt.Errorf("error initializing kernel: %v", err) } // Turn on packet logging if enabled. @@ -258,11 +226,9 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in // Ensure that signals received are forwarded to the emulated kernel. 
stopSignalForwarding := sighandling.PrepareForwarding(k, false)() - if restoreFile == nil { - procArgs, err = newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) - if err != nil { - return nil, fmt.Errorf("failed to create root process: %v", err) - } + procArgs, err := newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) + if err != nil { + return nil, fmt.Errorf("failed to create root process: %v", err) } l := &Loader{ @@ -271,9 +237,10 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in conf: conf, console: console, watchdog: watchdog, + ioFDs: ioFDs, + spec: spec, stopSignalForwarding: stopSignalForwarding, rootProcArgs: procArgs, - restore: restoreFile != nil, } ctrl.manager.l = l return l, nil @@ -307,41 +274,6 @@ func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds UTSNamespace: utsns, IPCNamespace: ipcns, } - ctx := procArgs.NewContext(k) - - // Create the FD map, which will set stdin, stdout, and stderr. If - // console is true, then ioctl calls will be passed through to the host - // fd. - fdm, err := createFDMap(ctx, k, ls, console) - if err != nil { - return kernel.CreateProcessArgs{}, fmt.Errorf("error importing fds: %v", err) - } - - // CreateProcess takes a reference on FDMap if successful. We - // won't need ours either way. - procArgs.FDMap = fdm - - // If this is the root container, we also need to setup the root mount - // namespace. - if k.RootMountNamespace() == nil { - // Use root user to configure mounts. The current user might not have - // permission to do so. - rootProcArgs := kernel.CreateProcessArgs{ - WorkingDirectory: "/", - Credentials: auth.NewRootCredentials(creds.UserNamespace), - Umask: 0022, - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - } - rootCtx := rootProcArgs.NewContext(k) - - // Create the virtual filesystem. 
- mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) - if err != nil { - return kernel.CreateProcessArgs{}, fmt.Errorf("error creating mounts: %v", err) - } - - k.SetRootMountNamespace(mns) - } return procArgs, nil } @@ -411,7 +343,20 @@ func (l *Loader) run() error { } // If we are restoring, we do not want to create a process. + // l.restore is set by the container manager when a restore call is made. if !l.restore { + err := setFileSystemForProcess( + &l.rootProcArgs, + l.spec, + l.conf, + l.ioFDs, + l.console, + l.rootProcArgs.Credentials, + l.rootProcArgs.Limits, + l.k) + if err != nil { + return err + } // Create the root container init task. if _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { return fmt.Errorf("failed to create init process: %v", err) @@ -421,6 +366,7 @@ func (l *Loader) run() error { l.rootProcArgs.FDMap.DecRef() } + log.Infof("Process should have started...") l.watchdog.Start() return l.k.Start() } @@ -468,6 +414,18 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } + err = setFileSystemForProcess( + &procArgs, + args.Spec, + args.Conf, + nil, + false, + creds, + procArgs.Limits, + k) + if err != nil { + return 0, fmt.Errorf("failed to create new process: %v", err) + } tg, err := l.k.CreateProcess(procArgs) if err != nil { @@ -553,6 +511,12 @@ func (l *Loader) WaitForStartSignal() { <-l.ctrl.manager.startChan } +// NotifyLoaderCreated sends a signal to the container manager that this +// loader has been created. +func (l *Loader) NotifyLoaderCreated() { + l.ctrl.manager.loaderCreatedChan <- struct{}{} +} + // WaitExit waits for the root container to exit, and returns its exit status. func (l *Loader) WaitExit() kernel.ExitStatus { // Wait for container. 
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 30ec236e4..7ea2e1ee5 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -61,7 +61,8 @@ func createLoader() (*Loader, error) { FileAccess: FileAccessDirect, DisableSeccomp: true, } - return New(testSpec(), conf, fd, -1, nil, false) + spec := testSpec() + return New(spec, conf, fd, nil, false) } // TestRun runs a simple application in a sandbox and checks that it succeeds. -- cgit v1.2.3