From 635b0c45933cd841298b0c21a513a9169e849594 Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 15 Aug 2018 16:24:07 -0700 Subject: runsc fsgofer: Support dynamic serving of filesystems. When multiple containers run inside a sentry, each container has its own root filesystem and set of mounts. Containers are also added after sentry boot rather than all configured and known at boot time. The fsgofer needs to be able to serve the root filesystem of each container. Thus, it must be possible to add filesystems after the fsgofer has already started. This change: * Creates a URPC endpoint within the gofer process that listens for requests to serve new content. * Enables the sentry, when starting a new container, to add the new container's filesystem. * Mounts those new filesystems at separate roots within the sentry. PiperOrigin-RevId: 208903248 Change-Id: Ifa91ec9c8caf5f2f0a9eead83c4a57090ce92068 --- runsc/boot/controller.go | 19 +++++- runsc/boot/fs.go | 159 ++++++++++++++++++++++++++++++++++------------- runsc/boot/loader.go | 36 +++++------ 3 files changed, 150 insertions(+), 64 deletions(-) (limited to 'runsc/boot') diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index fc6ea326a..69e88d8e0 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -17,6 +17,7 @@ package boot import ( "errors" "fmt" + "path" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/control/server" @@ -181,11 +182,15 @@ type StartArgs struct { // CID is the ID of the container to start. CID string + + // FilePayload contains the file descriptor over which the sandbox will + // request files from its root filesystem. + urpc.FilePayload } // Start runs a created container within a sandbox. func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { - log.Debugf("containerManager.Start") + log.Debugf("containerManager.Start: %+v", args) // Validate arguments. if args == nil { @@ -200,8 +205,18 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error { if args.CID == "" { return errors.New("start argument missing container ID") } + // Prevent CIDs containing ".." from confusing the sentry when creating + // /containers/ directory. + // TODO: Once we have multiple independant roots, this + // check won't be necessary. + if path.Clean(args.CID) != args.CID { + return fmt.Errorf("container ID shouldn't contain directory traversals such as \"..\": %q", args.CID) + } + if len(args.FilePayload.Files) != 1 { + return fmt.Errorf("start arguments must contain one file for the container root") + } - tgid, err := cm.l.startContainer(args, cm.l.k) + tgid, err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files[0]) if err != nil { return err } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index eea2ec1f5..8996b1398 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -48,6 +48,19 @@ const ( // Device name for root mount. rootDevice = "9pfs-/" + + // childContainersDir is the directory where child container root + // filesystems are mounted. + childContainersDir = "/__runsc_containers__" + + // Filesystems that runsc supports. + bind = "bind" + devpts = "devpts" + devtmpfs = "devtmpfs" + proc = "proc" + sysfs = "sysfs" + tmpfs = "tmpfs" + nonefs = "none" ) type fdDispenser struct { @@ -70,8 +83,15 @@ func (f *fdDispenser) empty() bool { // createMountNamespace creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { + mounts := compileMounts(spec) + // Create a tmpfs mount where we create and mount a root filesystem for + // each child container. + mounts = append(mounts, specs.Mount{ + Type: tmpfs, + Destination: childContainersDir, + }) fds := &fdDispenser{fds: ioFDs} - rootInode, err := createRootMount(rootCtx, spec, conf, fds) + rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts) if err != nil { return nil, fmt.Errorf("failed to create root mount: %v", err) } @@ -79,7 +99,7 @@ func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec if err != nil { return nil, fmt.Errorf("failed to create root mount namespace: %v", err) } - mounts := compileMounts(spec) + if err := setMounts(rootCtx, conf, mns, fds, mounts); err != nil { return nil, fmt.Errorf("failed to configure mounts: %v", err) } @@ -98,12 +118,12 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // Always mount /dev. mounts = append(mounts, specs.Mount{ - Type: "devtmpfs", + Type: devtmpfs, Destination: "/dev", }) mounts = append(mounts, specs.Mount{ - Type: "devpts", + Type: devpts, Destination: "/dev/pts", }) @@ -129,13 +149,13 @@ func compileMounts(spec *specs.Spec) []specs.Mount { var mandatoryMounts []specs.Mount if !procMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "proc", + Type: proc, Destination: "/proc", }) } if !sysMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "sysfs", + Type: sysfs, Destination: "/sys", }) } @@ -149,7 +169,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // that. Until then, the /tmp mount will always appear empty at // container creation. mandatoryMounts = append(mandatoryMounts, specs.Mount{ - Type: "tmpfs", + Type: tmpfs, Destination: "/tmp", }) } @@ -165,7 +185,7 @@ func compileMounts(spec *specs.Spec) []specs.Mount { // mount namespace. func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, mounts []specs.Mount) error { for _, m := range mounts { - if err := mountSubmount(ctx, conf, mns, fds, m, mounts); err != nil { + if err := mountSubmount(ctx, conf, mns, fds, m, mounts, m.Destination); err != nil { return err } } @@ -173,7 +193,7 @@ func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *f } // createRootMount creates the root filesystem. -func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.Inode, error) { +func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly} @@ -207,7 +227,7 @@ func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *f // We need to overlay the root on top of a ramfs with stub directories // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always // mounted even if they are not in the spec. - submounts := append(subtargets("/", spec.Mounts), "/dev", "/sys", "/proc", "/tmp") + submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp") rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) if err != nil { return nil, fmt.Errorf("error adding submount overlay: %v", err) @@ -256,17 +276,17 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri ) switch m.Type { - case "devpts", "devtmpfs", "proc", "sysfs": + case devpts, devtmpfs, proc, sysfs: fsName = m.Type - case "none": - fsName = "sysfs" - case "tmpfs": + case nonefs: + fsName = sysfs + case tmpfs: fsName = m.Type // tmpfs has some extra supported options that we must pass through. opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") - case "bind": + case bind: switch conf.FileAccess { case FileAccessProxy, FileAccessProxyExclusive: fd := fds.remove() @@ -291,7 +311,7 @@ func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (stri return fsName, opts, useOverlay, err } -func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error { +func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount, dest string) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds) @@ -342,51 +362,52 @@ func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fd // in the right location, e.g. // mount: /var/run/secrets, may be created in '/run/secrets' if // '/var/run' => '/var'. - if err := mkdirAll(ctx, mns, m.Destination); err != nil { + if err := mkdirAll(ctx, mns, dest); err != nil { return err } root := mns.Root() defer root.DecRef() - dirent, err := mns.FindInode(ctx, root, nil, m.Destination, linux.MaxSymlinkTraversals) + dirent, err := mns.FindInode(ctx, root, nil, dest, linux.MaxSymlinkTraversals) if err != nil { - return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err) + return fmt.Errorf("failed to find mount destination %q: %v", dest, err) } defer dirent.DecRef() if err := mns.Mount(ctx, dirent, inode); err != nil { - return fmt.Errorf("failed to mount at destination %q: %v", m.Destination, err) + return fmt.Errorf("failed to mount at destination %q: %v", dest, err) } - log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) + log.Infof("Mounted %q to %q type %s", m.Source, dest, m.Type) return nil } func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error { + log.Infof("mkdirAll called with path %s", path) root := mns.Root() defer root.DecRef() // Starting at the root, walk the path. parent := root ps := strings.Split(filepath.Clean(path), string(filepath.Separator)) - for i := 0; i < len(ps); i++ { - if ps[i] == "" { + for _, pathElem := range ps { + if pathElem == "" { // This will be case for the first and last element, if the path // begins or ends with '/'. Note that we always treat the path as // absolute, regardless of what the first character contains. continue } - d, err := mns.FindInode(ctx, root, parent, ps[i], fs.DefaultTraversalLimit) + d, err := mns.FindInode(ctx, root, parent, pathElem, fs.DefaultTraversalLimit) if err == syserror.ENOENT { // If we encounter a path that does not exist, then // create it. - if err := parent.CreateDirectory(ctx, root, ps[i], fs.FilePermsFromMode(0755)); err != nil { - return fmt.Errorf("failed to create directory %q: %v", ps[i], err) + if err := parent.CreateDirectory(ctx, root, pathElem, fs.FilePermsFromMode(0755)); err != nil { + return fmt.Errorf("failed to create directory %q: %v", pathElem, err) } - if d, err = parent.Walk(ctx, root, ps[i]); err != nil { - return fmt.Errorf("walk to %q failed: %v", ps[i], err) + if d, err = parent.Walk(ctx, root, pathElem); err != nil { + return fmt.Errorf("walk to %q failed: %v", pathElem, err) } } else if err != nil { - return fmt.Errorf("failed to find inode %q: %v", ps[i], err) + return fmt.Errorf("failed to find inode %q: %v", pathElem, err) } parent = d } @@ -444,7 +465,7 @@ func destinations(mounts []specs.Mount, extra ...string) []string { // mountDevice returns a device string based on the fs type and target // of the mount. func mountDevice(m specs.Mount) string { - if m.Type == "bind" { + if m.Type == bind { // Make a device string that includes the target, which is consistent across // S/R and uniquely identifies the connection. return "9pfs-" + m.Destination @@ -589,7 +610,7 @@ func subtargets(root string, mnts []specs.Mount) []string { // setFileSystemForProcess is used to set up the file system and amend the procArgs accordingly. // procArgs are passed by reference and the FDMap field is modified. -func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel) error { +func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error { ctx := procArgs.NewContext(k) // Create the FD map, which will set stdin, stdout, and stderr. If @@ -604,27 +625,79 @@ func setFileSystemForProcess(procArgs *kernel.CreateProcessArgs, spec *specs.Spe // won't need ours either way. procArgs.FDMap = fdm + // Use root user to configure mounts. The current user might not have + // permission to do so. + rootProcArgs := kernel.CreateProcessArgs{ + WorkingDirectory: "/", + Credentials: auth.NewRootCredentials(creds.UserNamespace), + Umask: 0022, + MaxSymlinkTraversals: linux.MaxSymlinkTraversals, + } + rootCtx := rootProcArgs.NewContext(k) + // If this is the root container, we also need to setup the root mount // namespace. - if k.RootMountNamespace() == nil { - // Use root user to configure mounts. The current user might not have - // permission to do so. - rootProcArgs := kernel.CreateProcessArgs{ - WorkingDirectory: "/", - Credentials: auth.NewRootCredentials(creds.UserNamespace), - Umask: 0022, - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - } - rootCtx := rootProcArgs.NewContext(k) - + mns := k.RootMountNamespace() + if mns == nil { // Create the virtual filesystem. mns, err := createMountNamespace(ctx, rootCtx, spec, conf, ioFDs) if err != nil { return fmt.Errorf("error creating mounts: %v", err) } - k.SetRootMountNamespace(mns) + return nil + } + + // Setup a child container. + + // Create the container's root filesystem mount. + log.Infof("Creating new process in child container.") + fds := &fdDispenser{fds: append([]int{}, ioFDs...)} + rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil) + if err != nil { + return fmt.Errorf("error creating filesystem for container: %v", err) + } + + // Make directories for submounts within the container. + rootDir := mns.Root() + defer rootDir.DecRef() + containerRoot := filepath.Join(childContainersDir, cid) + mkdirAll(ctx, mns, containerRoot) + + // Mount the container's root filesystem to the newly created + // mount point. + containerRootDirent, err := mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("failed to find mount destination: %q: %v", containerRoot, err) + } + if err := mns.Mount(ctx, containerRootDirent, rootInode); err != nil { + return fmt.Errorf("failed to mount at destination %q: %v", containerRoot, err) + } + containerRootDirent.DecRef() + + // We have to re-walk to the dirent to find the mounted + // directory. The old dirent is invalid at this point. + containerRootDirent, err = mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("failed to find mount destination2: %q: %v", containerRoot, err) + } + log.Infof("Mounted child's root fs to %q", containerRoot) + + // Mount all submounts. + mounts := compileMounts(spec) + for _, m := range mounts { + // TODO: Enable bind mounts in child containers. + if m.Type == bind { + log.Infof("Bind mounts in child containers are not yet supported: %+v", m) + continue + } + dest := filepath.Join(containerRoot, m.Destination) + if err := mountSubmount(rootCtx, conf, k.RootMountNamespace(), fds, m, mounts, dest); err != nil { + return fmt.Errorf("error mounting filesystem for container: %v", err) + } } + // Set the procArgs root directory. + procArgs.Root = containerRootDirent return nil } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index f6c7bf223..7debf0ac2 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -19,6 +19,7 @@ import ( "errors" "fmt" "math/rand" + "os" "runtime" "sync" "sync/atomic" @@ -229,7 +230,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console // Ensure that signals received are forwarded to the emulated kernel. stopSignalForwarding := sighandling.PrepareForwarding(k, false)() - procArgs, err := newProcess(spec, conf, ioFDs, console, creds, utsns, ipcns, k) + procArgs, err := newProcess(spec, creds, utsns, ipcns, k) if err != nil { return nil, fmt.Errorf("failed to create root process: %v", err) } @@ -250,7 +251,7 @@ func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console } // newProcess creates a process that can be run with kernel.CreateProcess. -func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { +func newProcess(spec *specs.Spec, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) { // Create initial limits. ls, err := createLimitSet(spec) if err != nil { @@ -277,7 +278,6 @@ func newProcess(spec *specs.Spec, conf *Config, ioFDs []int, console bool, creds UTSNamespace: utsns, IPCNamespace: ipcns, } - return procArgs, nil } @@ -356,7 +356,8 @@ func (l *Loader) run() error { l.console, l.rootProcArgs.Credentials, l.rootProcArgs.Limits, - l.k) + l.k, + "" /* CID, which isn't needed for the root container */) if err != nil { return err } @@ -376,8 +377,7 @@ func (l *Loader) run() error { // startContainer starts a child container. It returns the thread group ID of // the newly created process. -func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.ThreadID, error) { - spec := args.Spec +func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, file *os.File) (kernel.ThreadID, error) { // Create capabilities. caps, err := specutils.Capabilities(spec.Process.Capabilities) if err != nil { @@ -406,26 +406,24 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa // when indicated by the spec. procArgs, err := newProcess( - args.Spec, - args.Conf, - nil, // ioFDs - false, // console + spec, creds, - k.RootUTSNamespace(), - k.RootIPCNamespace(), - k) + l.k.RootUTSNamespace(), + l.k.RootIPCNamespace(), + l.k) if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } err = setFileSystemForProcess( &procArgs, - args.Spec, - args.Conf, - nil, + spec, + conf, + []int{int(file.Fd())}, // ioFDs false, creds, procArgs.Limits, - k) + k, + cid) if err != nil { return 0, fmt.Errorf("failed to create new process: %v", err) } @@ -435,7 +433,7 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa return 0, fmt.Errorf("failed to create process in sentry: %v", err) } - ts := k.TaskSet() + ts := l.k.TaskSet() tgid := ts.Root.IDOfThreadGroup(tg) if tgid == 0 { return 0, errors.New("failed to get thread group ID of new process") @@ -446,7 +444,7 @@ func (l *Loader) startContainer(args *StartArgs, k *kernel.Kernel) (kernel.Threa l.mu.Lock() defer l.mu.Unlock() - l.containerRootTGIDs[args.CID] = tgid + l.containerRootTGIDs[cid] = tgid return tgid, nil } -- cgit v1.2.3