diff options
author | Kevin Krakauer <krakauer@google.com> | 2018-06-19 21:42:21 -0700 |
---|---|---|
committer | Shentubot <shentubot@google.com> | 2018-06-19 21:44:33 -0700 |
commit | 5397963b5d4d57bd3d3668df880b5314ca2fc3d8 (patch) | |
tree | 1e56b21b1248c0d74772e7daf368a6ab91e35911 /runsc/boot | |
parent | db66e383c33228c43efbe16ad3b14ae9833879dc (diff) |
runsc: Enable container creation within existing sandboxes.
Containers are created as processes in the sandbox. Of the many things that
don't work yet, the biggest issue is that the fsgofer is launched with its root
as the sandbox's root directory. Thus, when a container is started and wants to
read anything (including the init binary of the container), the gofer tries to
serve from the sandbox's root (which basically just has pause), not the container's.
PiperOrigin-RevId: 201294560
Change-Id: I6423aa8830538959c56ae908ce067e4199d627b1
Diffstat (limited to 'runsc/boot')
-rw-r--r-- | runsc/boot/controller.go | 45 | ||||
-rw-r--r-- | runsc/boot/loader.go | 193 |
2 files changed, 176 insertions, 62 deletions
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index ae727f144..1a598199d 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -15,9 +15,12 @@ package boot import ( + "errors" "fmt" + specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/control/server" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/control" "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" @@ -50,6 +53,10 @@ const ( // ContainerSignal is used to send a signal to a container. ContainerSignal = "containerManager.Signal" + // ContainerStart is the URPC endpoint for running a non-root container + // within a sandbox. + ContainerStart = "containerManager.Start" + // ContainerWait is used to wait on the init process of the container // and return its ExitStatus. ContainerWait = "containerManager.Wait" @@ -127,10 +134,14 @@ type containerManager struct { // watchdog is the kernel watchdog. watchdog *watchdog.Watchdog + + // l is the loader that creates containers and sandboxes. + l *Loader } // StartRoot will start the root container process. func (cm *containerManager) StartRoot(_, _ *struct{}) error { + log.Debugf("containerManager.StartRoot") // Tell the root container to start and wait for the result. cm.startChan <- struct{}{} return <-cm.startResultChan @@ -138,11 +149,42 @@ func (cm *containerManager) StartRoot(_, _ *struct{}) error { // Processes retrieves information about processes running in the sandbox. func (cm *containerManager) Processes(_, out *[]*control.Process) error { + log.Debugf("containerManager.Processes") return control.Processes(cm.k, out) } +// StartArgs contains arguments to the Start method. +type StartArgs struct { + // Spec is the spec of the container to start. + Spec *specs.Spec + + // TODO: Separate sandbox and container configs. + // Config is the runsc-specific configuration for the sandbox. 
+ Conf *Config +} + +// Start runs a created container within a sandbox. +func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { + log.Debugf("containerManager.Start") + + // Validate arguments. + if args == nil { + return errors.New("start missing arguments") + } + if args.Spec == nil { + return errors.New("start arguments missing spec") + } + if args.Conf == nil { + return errors.New("start arguments missing config") + } + + cm.l.startContainer(args, cm.k) + return nil +} + // Execute runs a command on a created or running sandbox. func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) error { + log.Debugf("containerManager.Execute") proc := control.Proc{Kernel: cm.k} if err := proc.Exec(e, waitStatus); err != nil { return fmt.Errorf("error executing: %+v: %v", e, err) @@ -152,6 +194,7 @@ func (cm *containerManager) Execute(e *control.ExecArgs, waitStatus *uint32) err // Checkpoint pauses a sandbox and saves its state. func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error { + log.Debugf("containerManager.Checkpoint") state := control.State{ Kernel: cm.k, Watchdog: cm.watchdog, @@ -173,6 +216,7 @@ func (cm *containerManager) Resume(_, _ *struct{}) error { // Wait waits for the init process in the given container. func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error { + log.Debugf("containerManager.Wait") // TODO: Use the cid and wait on the init process in that // container. Currently we just wait on PID 1 in the sandbox. tg := cm.k.TaskSet().Root.ThreadGroupWithID(1) @@ -195,6 +239,7 @@ type SignalArgs struct { // Signal sends a signal to the init process of the container. func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error { + log.Debugf("containerManager.Signal") // TODO: Use the cid and send the signal to the init // process in theat container. Currently we just signal PID 1 in the // sandbox. 
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 526e8f8bb..d1a413cc7 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package boot loads the kernel and runs a container.. +// Package boot loads the kernel and runs a container. package boot import ( @@ -79,8 +79,8 @@ type Loader struct { // container. It should be called when a sandbox is destroyed. stopSignalForwarding func() - // procArgs refers to the root container task. - procArgs kernel.CreateProcessArgs + // rootProcArgs refers to the root sandbox init task. + rootProcArgs kernel.CreateProcessArgs } func init() { @@ -117,12 +117,6 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in } tk.SetClocks(time.NewCalibratedClocks()) - // Create initial limits. - ls, err := createLimitSet(spec) - if err != nil { - return nil, fmt.Errorf("error creating limits: %v", err) - } - // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { @@ -154,13 +148,6 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in return nil, fmt.Errorf("failed to enable strace: %v", err) } - // Get the executable path, which is a bit tricky because we have to - // inspect the environment PATH which is relative to the root path. - exec, err := specutils.GetExecutablePath(spec.Process.Args[0], spec.Root.Path, spec.Process.Env) - if err != nil { - return nil, fmt.Errorf("error getting executable path: %v", err) - } - // Create an empty network stack because the network namespace may be empty at // this point. Netns is configured before Run() is called. Netstack is // configured using a control uRPC message. 
Host network is configured inside @@ -223,16 +210,56 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in return nil, fmt.Errorf("error creating control server: %v", err) } + // We don't care about child signals; some platforms can generate a + // tremendous number of useless ones (I'm looking at you, ptrace). + if err := sighandling.IgnoreChildStop(); err != nil { + return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) + } + // Ensure that most signals received in sentry context are forwarded to + // the emulated kernel. + stopSignalForwarding := sighandling.StartForwarding(k) + + procArgs, err := newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) + if err != nil { + return nil, fmt.Errorf("failed to create root process: %v", err) + } + + l := &Loader{ + k: k, + ctrl: ctrl, + conf: conf, + console: console, + watchdog: watchdog, + stopSignalForwarding: stopSignalForwarding, + rootProcArgs: procArgs, + } + ctrl.manager.l = l + return l, nil +} + +// newProcess creates a process that can be run with kernel.CreateProcess. +func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { + // Create initial limits. + ls, err := createLimitSet(spec) + if err != nil { + return kernel.CreateProcessArgs{}, fmt.Errorf("error creating limits: %v", err) + } + + // Get the executable path, which is a bit tricky because we have to + // inspect the environment PATH which is relative to the root path. + exec, err := specutils.GetExecutablePath(spec.Process.Args[0], spec.Root.Path, spec.Process.Env) + if err != nil { + return kernel.CreateProcessArgs{}, fmt.Errorf("error getting executable path: %v", err) + } + // Create the process arguments. 
procArgs := kernel.CreateProcessArgs{ - Filename: exec, - Argv: spec.Process.Args, - Envv: spec.Process.Env, - WorkingDirectory: spec.Process.Cwd, - Credentials: creds, - // Creating the FDMap requires that we have kernel.Kernel.fdMapUids, so - // it must wait until we have a Kernel. - Umask: uint(syscall.Umask(0)), + Filename: exec, + Argv: spec.Process.Args, + Envv: spec.Process.Env, + WorkingDirectory: spec.Process.Cwd, + Credentials: creds, + Umask: uint(0022), Limits: ls, MaxSymlinkTraversals: linux.MaxSymlinkTraversals, UTSNamespace: utsns, @@ -240,52 +267,42 @@ func New(spec *specs.Spec, conf *Config, controllerFD, restoreFD int, ioFDs []in } ctx := procArgs.NewContext(k) - // Use root user to configure mounts. The current user might not have - // permission to do so. - rootProcArgs := kernel.CreateProcessArgs{ - WorkingDirectory: "/", - Credentials: auth.NewRootCredentials(creds.UserNamespace), - Umask: uint(syscall.Umask(0022)), - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - } - rootCtx := rootProcArgs.NewContext(k) - - // Create the virtual filesystem. - mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) - if err != nil { - return nil, fmt.Errorf("error creating mounts: %v", err) - } - k.SetRootMountNamespace(mns) - - // Create the FD map, which will set stdin, stdout, and stderr. If console - // is true, then ioctl calls will be passed through to the host fd. + // Create the FD map, which will set stdin, stdout, and stderr. If + // console is true, then ioctl calls will be passed through to the host + // fd. fdm, err := createFDMap(ctx, k, ls, console) if err != nil { - return nil, fmt.Errorf("error importing fds: %v", err) + return kernel.CreateProcessArgs{}, fmt.Errorf("error importing fds: %v", err) } // CreateProcess takes a reference on FDMap if successful. We // won't need ours either way. 
procArgs.FDMap = fdm - // We don't care about child signals; some platforms can generate a - // tremendous number of useless ones (I'm looking at you, ptrace). - if err := sighandling.IgnoreChildStop(); err != nil { - return nil, fmt.Errorf("failed to ignore child stop signals: %v", err) + // If this is the root container, we also need to setup the root mount + // namespace. + if k.RootMountNamespace() == nil { + // Use root user to configure mounts. The current user might not have + // permission to do so. + rootProcArgs := kernel.CreateProcessArgs{ + WorkingDirectory: "/", + Credentials: auth.NewRootCredentials(creds.UserNamespace), + // The sentry should run with a umask of 0. + Umask: uint(syscall.Umask(0)), + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + } + rootCtx := rootProcArgs.NewContext(k) + + // Create the virtual filesystem. + mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) + if err != nil { + return kernel.CreateProcessArgs{}, fmt.Errorf("error creating mounts: %v", err) + } + + k.SetRootMountNamespace(mns) } - // Ensure that most signals received in sentry context are forwarded to - // the emulated kernel. - stopSignalForwarding := sighandling.StartForwarding(k) - return &Loader{ - k: k, - ctrl: ctrl, - conf: conf, - console: console, - watchdog: watchdog, - stopSignalForwarding: stopSignalForwarding, - procArgs: procArgs, - }, nil + return procArgs, nil } // Destroy cleans up all resources used by the loader. @@ -350,17 +367,69 @@ func (l *Loader) run() error { } // Create the root container init task. - if _, err := l.k.CreateProcess(l.procArgs); err != nil { + if _, err := l.k.CreateProcess(l.rootProcArgs); err != nil { return fmt.Errorf("failed to create init process: %v", err) } // CreateProcess takes a reference on FDMap if successful. 
- l.procArgs.FDMap.DecRef() + l.rootProcArgs.FDMap.DecRef() l.watchdog.Start() return l.k.Start() } +func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) error { + spec := args.Spec + // Create capabilities. + caps, err := specutils.Capabilities(spec.Process.Capabilities) + if err != nil { + return fmt.Errorf("error creating capabilities: %v", err) + } + + // Convert the spec's additional GIDs to KGIDs. + extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids)) + for _, GID := range spec.Process.User.AdditionalGids { + extraKGIDs = append(extraKGIDs, auth.KGID(GID)) + } + + // Create credentials. We reuse the root user namespace because the + // sentry currently supports only 1 mount namespace, which is tied to a + // single user namespace. Thus we must run in the same user namespace + // to access mounts. + // TODO: Create a new mount namespace for the container. + creds := auth.NewUserCredentials( + auth.KUID(spec.Process.User.UID), + auth.KGID(spec.Process.User.GID), + extraKGIDs, + caps, + l.k.RootUserNamespace()) + + // TODO New containers should be started in new PID namespaces + // when indicated by the spec. + + procArgs, err := newProcess( + args.Spec, + args.Conf, + nil, // ioFDs + false, // console + creds, + k.RootUTSNamespace(), + k.RootIPCNamespace(), + k) + if err != nil { + return fmt.Errorf("failed to create new process: %v", err) + } + + if _, err := l.k.CreateProcess(procArgs); err != nil { + return fmt.Errorf("failed to create process in sentry: %v", err) + } + + // CreateProcess takes a reference on FDMap if successful. + procArgs.FDMap.DecRef() + + return nil +} + // WaitForStartSignal waits for a start signal from the control server. func (l *Loader) WaitForStartSignal() { <-l.ctrl.manager.startChan |