diff options
Diffstat (limited to 'runsc/boot/fs.go')
-rw-r--r-- | runsc/boot/fs.go | 441 |
1 files changed, 441 insertions, 0 deletions
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go new file mode 100644 index 000000000..2073bd0b1 --- /dev/null +++ b/runsc/boot/fs.go @@ -0,0 +1,441 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + // Include filesystem types that OCI spec might mount. + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +type fdDispenser struct { + fds []int +} + +func (f *fdDispenser) remove() int { + rv := f.fds[0] + f.fds = f.fds[1:] + return rv +} + +func (f *fdDispenser) empty() bool { + return len(f.fds) == 0 +} + +// createMountNamespace creates a mount manager containing the root filesystem +// and all mounts. +func createMountNamespace(ctx context.Context, spec *specs.Spec, conf *Config, ioFDs []int) (*fs.MountNamespace, error) { + fds := &fdDispenser{fds: ioFDs} + + // Create the MountNamespace from the root. + rootInode, err := createRootMount(ctx, spec, conf, fds) + if err != nil { + return nil, fmt.Errorf("failed to create root overlay: %v", err) + } + mns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + return nil, fmt.Errorf("failed to construct MountNamespace: %v", err) + } + + // Keep track of whether proc, sys, and tmp were mounted. + var procMounted, sysMounted, tmpMounted bool + + // Mount all submounts from the spec. + for _, m := range spec.Mounts { + // OCI spec uses many different mounts for the things inside of '/dev'. We + // have a single mount at '/dev' that is always mounted, regardless of + // whether it was asked for, as the spec says we SHOULD. + if strings.HasPrefix(m.Destination, "/dev") { + log.Warningf("ignoring dev mount at %q", m.Destination) + continue + } + switch m.Destination { + case "/proc": + procMounted = true + case "/sys": + sysMounted = true + case "/tmp": + tmpMounted = true + } + + if err := mountSubmount(ctx, spec, conf, mns, fds, m); err != nil { + return nil, err + } + } + + // Always mount /dev. + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "devtmpfs", + Destination: "/dev", + }); err != nil { + return nil, err + } + + // Mount proc and sys even if the user did not ask for it, as the spec + // says we SHOULD. + if !procMounted { + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "proc", + Destination: "/proc", + }); err != nil { + return nil, err + } + } + if !sysMounted { + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "sysfs", + Destination: "/sys", + }); err != nil { + return nil, err + } + } + + // Technically we don't have to mount tmpfs at /tmp, as we could just + // rely on the host /tmp, but this is a nice optimization, and fixes + // some apps that call mknod in /tmp. + if !tmpMounted { + if err := mountSubmount(ctx, spec, conf, mns, nil, specs.Mount{ + Type: "tmpfs", + Destination: "/tmp", + }); err != nil { + return nil, err + } + } + + if !fds.empty() { + return nil, fmt.Errorf("not all mount points were consumed, remaining: %v", fds) + } + + return mns, nil +} + +// createRootMount creates the root filesystem. +func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.Inode, error) { + // First construct the filesystem from the spec.Root. + mf := fs.MountSourceFlags{ + ReadOnly: spec.Root.Readonly, + NoAtime: true, + } + + var ( + rootInode *fs.Inode + err error + ) + switch conf.FileAccess { + case FileAccessProxy: + fd := fds.remove() + log.Infof("Mounting root over 9P, ioFD: %d", fd) + hostFS := mustFindFilesystem("9p") + rootInode, err = hostFS.Mount(ctx, "root", mf, fmt.Sprintf("trans=fd,rfdno=%d,wfdno=%d,privateunixsocket=true", fd, fd)) + if err != nil { + return nil, fmt.Errorf("failed to generate root mount point: %v", err) + } + + case FileAccessDirect: + hostFS := mustFindFilesystem("whitelistfs") + rootInode, err = hostFS.Mount(ctx, "root", mf, "root="+spec.Root.Path+",dont_translate_ownership=true") + if err != nil { + return nil, fmt.Errorf("failed to generate root mount point: %v", err) + } + + default: + return nil, fmt.Errorf("invalid file access type: %v", conf.FileAccess) + } + + // We need to overlay the root on top of a ramfs with stub directories + // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always + // mounted even if they are not in the spec. + submounts := append(subtargets("/", spec.Mounts), "/dev", "/sys", "/proc", "/tmp") + rootInode, err = addSubmountOverlay(ctx, rootInode, submounts) + if err != nil { + return nil, fmt.Errorf("error adding submount overlay: %v", err) + } + + if conf.Overlay { + log.Debugf("Adding overlay on top of root mount") + // Overlay a tmpfs filesystem on top of the root. + rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf) + if err != nil { + return nil, err + } + } + + log.Infof("Mounted %q to \"/\" type root", spec.Root.Path) + return rootInode, nil +} + +func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { + // Upper layer uses the same flags as lower, but it must be read-write. + lowerFlags.ReadOnly = false + + tmpFS := mustFindFilesystem("tmpfs") + upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "") + if err != nil { + return nil, fmt.Errorf("failed to create tmpfs overlay: %v", err) + } + return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags) +} + +func mountSubmount(ctx context.Context, spec *specs.Spec, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount) error { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + var data []string + var fsName string + var useOverlay bool + switch m.Type { + case "proc", "sysfs", "devtmpfs": + fsName = m.Type + case "none": + fsName = "sysfs" + case "tmpfs": + fsName = m.Type + + // tmpfs has some extra supported options that we must pass through. + var err error + data, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid") + if err != nil { + return err + } + case "bind": + switch conf.FileAccess { + case FileAccessProxy: + fd := fds.remove() + fsName = "9p" + data = []string{"trans=fd", fmt.Sprintf("rfdno=%d", fd), fmt.Sprintf("wfdno=%d", fd), "privateunixsocket=true"} + case FileAccessDirect: + fsName = "whitelistfs" + data = []string{"root=" + m.Source, "dont_translate_ownership=true"} + default: + return fmt.Errorf("invalid file access type: %v", conf.FileAccess) + } + + fi, err := os.Stat(m.Source) + if err != nil { + return err + } + // Add overlay to all writable mounts, except when mapping an individual file. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly && fi.Mode().IsDir() + default: + // TODO: Support all the mount types and make this a + // fatal error. Most applications will "just work" without + // them, so this is a warning for now. + // we do not support. + log.Warningf("ignoring unknown filesystem type %q", m.Type) + return nil + } + + // All filesystem names should have been mapped to something we know. + filesystem := mustFindFilesystem(fsName) + + mf := mountFlags(m.Options) + if useOverlay { + // All writes go to upper, be paranoid and make lower readonly. + mf.ReadOnly = true + } + mf.NoAtime = true + + inode, err := filesystem.Mount(ctx, m.Type, mf, strings.Join(data, ",")) + if err != nil { + return fmt.Errorf("failed to create mount with source %q: %v", m.Source, err) + } + + // If there are submounts, we need to overlay the mount on top of a + // ramfs with stub directories for submount paths. + // + // We do not do this for /dev, since there will usually be submounts in + // the spec, but our devfs implementation contains all the necessary + // directories and files (well, most of them anyways). + if m.Destination != "/dev" { + submounts := subtargets(m.Destination, spec.Mounts) + if len(submounts) > 0 { + log.Infof("Adding submount overlay over %q", m.Destination) + inode, err = addSubmountOverlay(ctx, inode, submounts) + if err != nil { + return fmt.Errorf("error adding submount overlay: %v", err) + } + } + } + + if useOverlay { + log.Debugf("Adding overlay on top of mount %q", m.Destination) + if inode, err = addOverlay(ctx, conf, inode, m.Type, mf); err != nil { + return err + } + } + + root := mns.Root() + defer root.DecRef() + dirent, err := mns.FindInode(ctx, root, nil, m.Destination, linux.MaxSymlinkTraversals) + if err != nil { + return fmt.Errorf("failed to find mount destination %q: %v", m.Destination, err) + } + defer dirent.DecRef() + if err := mns.Mount(ctx, dirent, inode); err != nil { + return fmt.Errorf("failed to mount at destination %q: %v", m.Destination, err) + } + + log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type) + return nil +} + +func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error { + root := mns.Root() + defer root.DecRef() + + // Starting at the root, walk the path. + parent := root + ps := strings.Split(filepath.Clean(path), string(filepath.Separator)) + for i := 0; i < len(ps); i++ { + if ps[i] == "" { + // This will be case for the first and last element, if the path + // begins or ends with '/'. Note that we always treat the path as + // absolute, regardless of what the first character contains. + continue + } + d, err := mns.FindInode(ctx, root, parent, ps[i], fs.DefaultTraversalLimit) + if err == syserror.ENOENT { + // If we encounter a path that does not exist, then + // create it. + if err := parent.CreateDirectory(ctx, root, ps[i], fs.FilePermsFromMode(0755)); err != nil { + return fmt.Errorf("failed to create directory %q: %v", ps[i], err) + } + if d, err = parent.Walk(ctx, root, ps[i]); err != nil { + return fmt.Errorf("walk to %q failed: %v", ps[i], err) + } + } else if err != nil { + return fmt.Errorf("failed to find inode %q: %v", ps[i], err) + } + parent = d + } + return nil +} + +// parseAndFilterOptions parses a MountOptions slice and filters by the allowed +// keys. +func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) { + var out []string + for _, o := range opts { + kv := strings.Split(o, "=") + switch len(kv) { + case 1: + if contains(allowedKeys, o) { + out = append(out, o) + continue + } + log.Warningf("ignoring unsupported key %q", kv) + case 2: + if contains(allowedKeys, kv[0]) { + out = append(out, o) + continue + } + log.Warningf("ignoring unsupported key %q", kv[0]) + default: + return nil, fmt.Errorf("invalid option %q", o) + } + } + return out, nil +} + +func destinations(mounts []specs.Mount, extra ...string) []string { + var ds []string + for _, m := range mounts { + ds = append(ds, m.Destination) + } + return append(ds, extra...) +} + +func mountFlags(opts []string) fs.MountSourceFlags { + mf := fs.MountSourceFlags{} + for _, o := range opts { + switch o { + case "ro": + mf.ReadOnly = true + case "noatime": + mf.NoAtime = true + default: + log.Warningf("ignorning unknown mount option %q", o) + } + } + return mf +} + +func contains(strs []string, str string) bool { + for _, s := range strs { + if s == str { + return true + } + } + return false +} + +func mustFindFilesystem(name string) fs.Filesystem { + fs, ok := fs.FindFilesystem(name) + if !ok { + panic(fmt.Sprintf("could not find filesystem %q", name)) + } + return fs +} + +// addSubmountOverlay overlays the inode over a ramfs tree containing the given +// paths. +func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) { + // There is no real filesystem backing this ramfs tree, so we pass in + // "nil" here. + mountTree, err := ramfs.MakeDirectoryTree(ctx, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), submounts) + if err != nil { + return nil, fmt.Errorf("error creating mount tree: %v", err) + } + overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{}) + if err != nil { + return nil, fmt.Errorf("failed to make mount overlay: %v", err) + } + return overlayInode, err +} + +// subtargets takes a set of Mounts and returns only the targets that are +// children of the given root. The returned paths are relative to the root. +func subtargets(root string, mnts []specs.Mount) []string { + r := filepath.Clean(root) + var targets []string + for _, mnt := range mnts { + t := filepath.Clean(mnt.Destination) + if strings.HasPrefix(t, r) { + // Make the mnt path relative to the root path. If the + // result is empty, then mnt IS the root mount, not a + // submount. We don't want to include those. + if t := strings.TrimPrefix(t, r); t != "" { + targets = append(targets, t) + } + } + } + return targets +} |