// Copyright 2018 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "fmt" "path/filepath" "strings" // Include filesystem types that OCI spec might mount. _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/runsc/specutils" ) type fdDispenser struct { fds []int } func (f *fdDispenser) remove() int { rv := f.fds[0] f.fds = f.fds[1:] return rv } func (f *fdDispenser) empty() bool { return len(f.fds) == 0 } // createMountNamespace creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { fds := &fdDispenser{fds: ioFDs} rootInode, err := createRootMount(rootCtx, spec, conf, fds) if err != nil { return nil, fmt.Errorf("failed to create root mount: %v", err) } mns, err := fs.NewMountNamespace(userCtx, rootInode) if err != nil { return nil, fmt.Errorf("failed to create root mount namespace: %v", err) } if err := configureMounts(rootCtx, spec, conf, mns, fds); err != nil { return nil, fmt.Errorf("failed to configure mounts: %v", err) } if !fds.empty() { return nil, fmt.Errorf("not all mount points were consumed, remaining: %v", fds) } return mns, nil } // configureMounts iterates over Spec.Mounts and mounts them in the specified // mount namespace. func configureMounts(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs.MountNamespace, fds *fdDispenser) error { // Keep track of whether proc, sys, and tmp were mounted. var procMounted, sysMounted, tmpMounted bool // Always mount /dev. if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ Type: "devtmpfs", Destination: "/dev", }); err != nil { return err } // Always mount /dev/pts. if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ Type: "devpts", Destination: "/dev/pts", }); err != nil { return err } // Mount all submounts from the spec. for _, m := range spec.Mounts { if !specutils.IsSupportedDevMount(m) { log.Warningf("ignoring dev mount at %q", m.Destination) continue } switch filepath.Clean(m.Destination) { case "/proc": procMounted = true case "/sys": sysMounted = true case "/tmp": tmpMounted = true } if err := mountSubmount(ctx, spec, conf, mns, fds, m); err != nil { return err } } // Mount proc and sys even if the user did not ask for it, as the spec // says we SHOULD. if !procMounted { if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ Type: "proc", Destination: "/proc", }); err != nil { return err } } if !sysMounted { if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ Type: "sysfs", Destination: "/sys", }); err != nil { return err } } // Technically we don't have to mount tmpfs at /tmp, as we could just // rely on the host /tmp, but this is a nice optimization, and fixes // some apps that call mknod in /tmp. if !tmpMounted { if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ Type: "tmpfs", Destination: "/tmp", }); err != nil { return err } } return nil } // createRootMount creates the root filesystem. func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly} var ( rootInode *fs.Inode err error ) switch conf.FileAccess { case FileAccessProxy: fd := fds.remove() log.Infof("Mounting root over 9P, ioFD: %d", fd) hostFS := mustFindFilesystem("9p") rootInode, err = hostFS.Mount(ctx, "root", mf, fmt.Sprintf("trans=fd,rfdno=%d,wfdno=%d,privateunixsocket=true", fd, fd)) if err != nil { return nil, fmt.Errorf("failed to generate root mount point: %v", err) } case FileAccessDirect: hostFS := mustFindFilesystem("whitelistfs") rootInode, err = hostFS.Mount(ctx, "root", mf, "root="+spec.Root.Path+",dont_translate_ownership=true") if err != nil { return nil, fmt.Errorf("failed to generate root mount point: %v", err) } default: return nil, fmt.Errorf("invalid file access type: %v", conf.FileAccess) } // We need to overlay the root on top of a ramfs with stub directories // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always // mounted even if they are not in the spec. submounts := append(subtargets("/", spec.Mounts), "/dev", "/sys", "/proc", "/tmp") rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) if err != nil { return nil, fmt.Errorf("error adding submount overlay: %v", err) } if conf.Overlay { log.Debugf("Adding overlay on top of root mount") // Overlay a tmpfs filesystem on top of the root. rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf) if err != nil { return nil, err } } log.Infof("Mounted %q to \"/\" type root", spec.Root.Path) return rootInode, nil } func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { // Upper layer uses the same flags as lower, but it must be read-write. lowerFlags.ReadOnly = false tmpFS := mustFindFilesystem("tmpfs") if !fs.IsDir(lower.StableAttr) { // Create overlay on top of mount file, e.g. /etc/hostname. msrc := fs.NewCachingMountSource(tmpFS, lowerFlags) return fs.NewOverlayRootFile(ctx, msrc, lower, lowerFlags) } // Create overlay on top of mount dir. upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "") if err != nil { return nil, fmt.Errorf("failed to create tmpfs overlay: %v", err) } return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags) } // getMountNameAndOptions retrieves the fsName, data, and useOverlay values // used for mounts. func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) { var fsName string var data []string var useOverlay bool var err error switch m.Type { case "devpts", "devtmpfs", "proc", "sysfs": fsName = m.Type case "none": fsName = "sysfs" case "tmpfs": fsName = m.Type // tmpfs has some extra supported options that we must pass through. data, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") case "bind": switch conf.FileAccess { case FileAccessProxy: fd := fds.remove() fsName = "9p" data = []string{"trans=fd", fmt.Sprintf("rfdno=%d", fd), fmt.Sprintf("wfdno=%d", fd), "privateunixsocket=true"} case FileAccessDirect: fsName = "whitelistfs" data = []string{"root=" + m.Source, "dont_translate_ownership=true"} default: err = fmt.Errorf("invalid file access type: %v", conf.FileAccess) } // If configured, add overlay to all writable mounts. useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly default: // TODO: Support all the mount types and make this a // fatal error. Most applications will "just work" without // them, so this is a warning for now. // we do not support. log.Warningf("ignoring unknown filesystem type %q", m.Type) } return fsName, data, useOverlay, err } func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, data, useOverlay, err := getMountNameAndOptions(conf, m, fds) // Return the error or nil that corresponds to the default case in getMountNameAndOptions. if err != nil { return err } if fsName == "" { return nil } // All filesystem names should have been mapped to something we know. filesystem := mustFindFilesystem(fsName) mf := mountFlags(m.Options) if useOverlay { // All writes go to upper, be paranoid and make lower readonly. mf.ReadOnly = true } inode, err := filesystem.Mount(ctx, m.Type, mf, strings.Join(data, ",")) if err != nil { return fmt.Errorf("failed to create mount with source %q: %v", m.Source, err) } // If there are submounts, we need to overlay the mount on top of a // ramfs with stub directories for submount paths. mounts := specutils.SupportedMounts(spec.Mounts) submounts := subtargets(m.Destination, mounts) if len(submounts) > 0 { log.Infof("Adding submount overlay over %q", m.Destination) inode, err = addSubmountOverlay(ctx, inode, submounts) if err != nil { return fmt.Errorf("error adding submount overlay: %v", err) } } if useOverlay { log.Debugf("Adding overlay on top of mount %q", m.Destination) inode, err = addOverlay(ctx, conf, inode, m.Type, mf) if err != nil { return err } } // Create destination in case it doesn't exist. This is required, in addition // to 'addSubmountOverlay', in case there are symlinks to create directories // in the right location, e.g. // mount: /var/run/secrets, may be created in '/run/secrets' if // '/var/run' => '/var'. if err := mkdirAll(ctx, mns, m.Destination); err != nil { return err } root := mns.Root() defer root.DecRef() dirent, err := mns.FindInode(ctx, root, nil, m.Destination, linux.MaxSymlinkTraversals) if err != nil { return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err) } defer dirent.DecRef() if err := mns.Mount(ctx, dirent, inode); err != nil { return fmt.Errorf("failed to mount at destination %q: %v", m.Destination, err) } log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) return nil } func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error { root := mns.Root() defer root.DecRef() // Starting at the root, walk the path. parent := root ps := strings.Split(filepath.Clean(path), string(filepath.Separator)) for i := 0; i < len(ps); i++ { if ps[i] == "" { // This will be case for the first and last element, if the path // begins or ends with '/'. Note that we always treat the path as // absolute, regardless of what the first character contains. continue } d, err := mns.FindInode(ctx, root, parent, ps[i], fs.DefaultTraversalLimit) if err == syserror.ENOENT { // If we encounter a path that does not exist, then // create it. if err := parent.CreateDirectory(ctx, root, ps[i], fs.FilePermsFromMode(0755)); err != nil { return fmt.Errorf("failed to create directory %q: %v", ps[i], err) } if d, err = parent.Walk(ctx, root, ps[i]); err != nil { return fmt.Errorf("walk to %q failed: %v", ps[i], err) } } else if err != nil { return fmt.Errorf("failed to find inode %q: %v", ps[i], err) } parent = d } return nil } // parseAndFilterOptions parses a MountOptions slice and filters by the allowed // keys. func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) { var out []string for _, o := range opts { kv := strings.Split(o, "=") switch len(kv) { case 1: if contains(allowedKeys, o) { out = append(out, o) continue } log.Warningf("ignoring unsupported key %q", kv) case 2: if contains(allowedKeys, kv[0]) { out = append(out, o) continue } log.Warningf("ignoring unsupported key %q", kv[0]) default: return nil, fmt.Errorf("invalid option %q", o) } } return out, nil } func destinations(mounts []specs.Mount, extra ...string) []string { var ds []string for _, m := range mounts { ds = append(ds, m.Destination) } return append(ds, extra...) } // mountDevice returns a device string based on the fs type and target // of the mount. func mountDevice(m specs.Mount) string { if m.Type == "bind" { // Make a device string that includes the target, which is consistent across // S/R and uniquely identifies the connection. return "p9fs-" + m.Destination } // All other fs types use device "none". return "none" } // addRestoreMount adds a mount to the MountSources map used for restoring a // checkpointed container. func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error { fsName, data, _, err := getMountNameAndOptions(conf, m, fds) dataString := strings.Join(data, ",") if err != nil { return err } renv.MountSources[fsName] = append(renv.MountSources[fsName], fs.MountArgs{ Dev: mountDevice(m), Flags: mountFlags(m.Options), Data: dataString, }) return nil } // createRestoreEnviroment builds a fs.RestoreEnvironment called renv by adding the mounts // to the environment. func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) { if conf.FileAccess == FileAccessDirect { return nil, fmt.Errorf("host filesystem with whitelist not supported with S/R") } renv := &fs.RestoreEnvironment{ MountSources: make(map[string][]fs.MountArgs), } // Add root mount. fd := fds.remove() dataString := strings.Join([]string{"trans=fd", fmt.Sprintf("rfdno=%d", fd), fmt.Sprintf("wfdno=%d", fd), "privateunixsocket=true"}, ",") mf := fs.MountSourceFlags{} if spec.Root.Readonly { mf.ReadOnly = true } const rootFSName = "9p" renv.MountSources[rootFSName] = append(renv.MountSources[rootFSName], fs.MountArgs{ Dev: "p9fs-/", Flags: mf, Data: dataString, }) // Add submounts for _, m := range spec.Mounts { if err := addRestoreMount(conf, renv, m, fds); err != nil { return nil, err } } return renv, nil } func mountFlags(opts []string) fs.MountSourceFlags { mf := fs.MountSourceFlags{} for _, o := range opts { switch o { case "rw": mf.ReadOnly = false case "ro": mf.ReadOnly = true case "noatime": mf.NoAtime = true default: log.Warningf("ignoring unknown mount option %q", o) } } return mf } func contains(strs []string, str string) bool { for _, s := range strs { if s == str { return true } } return false } func mustFindFilesystem(name string) fs.Filesystem { fs, ok := fs.FindFilesystem(name) if !ok { panic(fmt.Sprintf("could not find filesystem %q", name)) } return fs } // addSubmountOverlay overlays the inode over a ramfs tree containing the given // paths. func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) { // There is no real filesystem backing this ramfs tree, so we pass in // "nil" here. mountTree, err := ramfs.MakeDirectoryTree(ctx, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), submounts) if err != nil { return nil, fmt.Errorf("error creating mount tree: %v", err) } overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{}) if err != nil { return nil, fmt.Errorf("failed to make mount overlay: %v", err) } return overlayInode, err } // subtargets takes a set of Mounts and returns only the targets that are // children of the given root. The returned paths are relative to the root. func subtargets(root string, mnts []specs.Mount) []string { r := filepath.Clean(root) if len(r) > 0 && r[len(r)-1] != '/' { r += "/" } var targets []string for _, mnt := range mnts { t := filepath.Clean(mnt.Destination) if strings.HasPrefix(t, r) { // Make the mnt path relative to the root path. If the // result is empty, then mnt IS the root mount, not a // submount. We don't want to include those. if t := strings.TrimPrefix(t, r); t != "" { targets = append(targets, t) } } } return targets }