// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package boot import ( "fmt" "path/filepath" "sort" "strconv" "strings" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/gofer" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/fs/user" "gvisor.dev/gvisor/pkg/sentry/fsimpl/cgroupfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" gofervfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" procvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" sysvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" tmpfsvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/runsc/config" "gvisor.dev/gvisor/runsc/specutils" // Include filesystem types that OCI spec might mount. _ "gvisor.dev/gvisor/pkg/sentry/fs/dev" _ "gvisor.dev/gvisor/pkg/sentry/fs/host" _ "gvisor.dev/gvisor/pkg/sentry/fs/proc" _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" ) const ( // Device name for root mount. rootDevice = "9pfs-/" // MountPrefix is the annotation prefix for mount hints. MountPrefix = "dev.gvisor.spec.mount." // Supported filesystems that map to different internal filesystem. bind = "bind" nonefs = "none" ) // tmpfs has some extra supported options that we must pass through. var tmpfsAllowedData = []string{"mode", "uid", "gid"} func addOverlay(ctx context.Context, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) { // Upper layer uses the same flags as lower, but it must be read-write. upperFlags := lowerFlags upperFlags.ReadOnly = false tmpFS := mustFindFilesystem("tmpfs") if !fs.IsDir(lower.StableAttr) { // Create overlay on top of mount file, e.g. /etc/hostname. msrc := fs.NewCachingMountSource(ctx, tmpFS, upperFlags) return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags) } // Create overlay on top of mount dir. upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil) if err != nil { return nil, fmt.Errorf("creating tmpfs overlay: %v", err) } // Replicate permissions and owner from lower to upper mount point. attr, err := lower.UnstableAttr(ctx) if err != nil { return nil, fmt.Errorf("reading attributes from lower mount point: %v", err) } if !upper.InodeOperations.SetPermissions(ctx, upper, attr.Perms) { return nil, fmt.Errorf("error setting permission to upper mount point") } if err := upper.InodeOperations.SetOwner(ctx, upper, attr.Owner); err != nil { return nil, fmt.Errorf("setting owner to upper mount point: %v", err) } return fs.NewOverlayRoot(ctx, upper, lower, upperFlags) } // compileMounts returns the supported mounts from the mount spec, adding any // mandatory mounts that are required by the OCI specification. func compileMounts(spec *specs.Spec, conf *config.Config, vfs2Enabled bool) []specs.Mount { // Keep track of whether proc and sys were mounted. var procMounted, sysMounted, devMounted, devptsMounted bool var mounts []specs.Mount // Mount all submounts from the spec. for _, m := range spec.Mounts { if !specutils.IsSupportedDevMount(m, vfs2Enabled) { log.Warningf("ignoring dev mount at %q", m.Destination) continue } // Unconditionally drop any cgroupfs mounts. If requested, we'll add our // own below. if m.Type == cgroupfs.Name { continue } switch filepath.Clean(m.Destination) { case "/proc": procMounted = true case "/sys": sysMounted = true case "/dev": m.Type = devtmpfs.Name devMounted = true case "/dev/pts": m.Type = devpts.Name devptsMounted = true } mounts = append(mounts, m) } // Mount proc and sys even if the user did not ask for it, as the spec // says we SHOULD. var mandatoryMounts []specs.Mount if conf.Cgroupfs { mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: tmpfsvfs2.Name, Destination: "/sys/fs/cgroup", }) mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: cgroupfs.Name, Destination: "/sys/fs/cgroup/memory", Options: []string{"memory"}, }) mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: cgroupfs.Name, Destination: "/sys/fs/cgroup/cpu", Options: []string{"cpu"}, }) } if !procMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: procvfs2.Name, Destination: "/proc", }) } if !sysMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: sysvfs2.Name, Destination: "/sys", }) } if !devMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: devtmpfs.Name, Destination: "/dev", }) } if !devptsMounted { mandatoryMounts = append(mandatoryMounts, specs.Mount{ Type: devpts.Name, Destination: "/dev/pts", }) } // The mandatory mounts should be ordered right after the root, in case // there are submounts of these mandatory mounts already in the spec. mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...) return mounts } // p9MountData creates a slice of p9 mount data. func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string { opts := []string{ "trans=fd", "rfdno=" + strconv.Itoa(fd), "wfdno=" + strconv.Itoa(fd), } if !vfs2 { // privateunixsocket is always enabled in VFS2. VFS1 requires explicit // enablement. opts = append(opts, "privateunixsocket=true") } if fa == config.FileAccessShared { opts = append(opts, "cache=remote_revalidating") } return opts } // parseAndFilterOptions parses a MountOptions slice and filters by the allowed // keys. func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) { var out []string for _, o := range opts { ok, err := parseMountOption(o, allowedKeys...) if err != nil { return nil, err } if ok { out = append(out, o) } } return out, nil } func parseMountOption(opt string, allowedKeys ...string) (bool, error) { kv := strings.SplitN(opt, "=", 3) if len(kv) > 2 { return false, fmt.Errorf("invalid option %q", opt) } return specutils.ContainsStr(allowedKeys, kv[0]), nil } // mountDevice returns a device string based on the fs type and target // of the mount. func mountDevice(m *specs.Mount) string { if m.Type == bind { // Make a device string that includes the target, which is consistent across // S/R and uniquely identifies the connection. return "9pfs-" + m.Destination } // All other fs types use device "none". return "none" } func mountFlags(opts []string) fs.MountSourceFlags { mf := fs.MountSourceFlags{} // Note: changes to supported options must be reflected in // isSupportedMountFlag() as well. for _, o := range opts { switch o { case "rw": mf.ReadOnly = false case "ro": mf.ReadOnly = true case "noatime": mf.NoAtime = true case "noexec": mf.NoExec = true case "bind", "rbind": // These are the same as a mount with type="bind". default: log.Warningf("ignoring unknown mount option %q", o) } } return mf } func isSupportedMountFlag(fstype, opt string) bool { switch opt { case "rw", "ro", "noatime", "noexec": return true } if fstype == tmpfsvfs2.Name { ok, err := parseMountOption(opt, tmpfsAllowedData...) return ok && err == nil } if fstype == cgroupfs.Name { ok, err := parseMountOption(opt, cgroupfs.SupportedMountOptions...) return ok && err == nil } return false } func mustFindFilesystem(name string) fs.Filesystem { fs, ok := fs.FindFilesystem(name) if !ok { panic(fmt.Sprintf("could not find filesystem %q", name)) } return fs } // addSubmountOverlay overlays the inode over a ramfs tree containing the given // paths. func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) { // Construct a ramfs tree of mount points. The contents never // change, so this can be fully caching. There's no real // filesystem backing this tree, so we set the filesystem to // nil. msrc := fs.NewCachingMountSource(ctx, nil, fs.MountSourceFlags{}) mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts) if err != nil { return nil, fmt.Errorf("creating mount tree: %v", err) } overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf) if err != nil { return nil, fmt.Errorf("adding mount overlay: %v", err) } return overlayInode, err } // subtargets takes a set of Mounts and returns only the targets that are // children of the given root. The returned paths are relative to the root. func subtargets(root string, mnts []specs.Mount) []string { var targets []string for _, mnt := range mnts { if relPath, isSubpath := fs.IsSubpath(mnt.Destination, root); isSubpath { targets = append(targets, relPath) } } return targets } func setupContainerFS(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { if conf.VFS2 { return setupContainerVFS2(ctx, conf, mntr, procArgs) } mns, err := mntr.setupFS(conf, procArgs) if err != nil { return err } // Set namespace here so that it can be found in ctx. procArgs.MountNamespace = mns // Resolve the executable path from working dir and environment. resolved, err := user.ResolveExecutablePath(ctx, procArgs) if err != nil { return err } procArgs.Filename = resolved return nil } func adjustDirentCache(k *kernel.Kernel) error { var hl unix.Rlimit if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &hl); err != nil { return fmt.Errorf("getting RLIMIT_NOFILE: %v", err) } if hl.Cur != unix.RLIM_INFINITY { newSize := hl.Cur / 2 if newSize < gofer.DefaultDirentCacheSize { log.Infof("Setting gofer dirent cache size to %d", newSize) gofer.DefaultDirentCacheSize = newSize k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize) } } return nil } type fdDispenser struct { fds []*fd.FD } func (f *fdDispenser) remove() int { if f.empty() { panic("fdDispenser out of fds") } rv := f.fds[0].Release() f.fds = f.fds[1:] return rv } func (f *fdDispenser) empty() bool { return len(f.fds) == 0 } type shareType int const ( invalid shareType = iota // container shareType indicates that the mount is used by a single container. container // pod shareType indicates that the mount is used by more than one container // inside the pod. pod // shared shareType indicates that the mount can also be shared with a process // outside the pod, e.g. NFS. shared ) func parseShare(val string) (shareType, error) { switch val { case "container": return container, nil case "pod": return pod, nil case "shared": return shared, nil default: return 0, fmt.Errorf("invalid share value %q", val) } } func (s shareType) String() string { switch s { case invalid: return "invalid" case container: return "container" case pod: return "pod" case shared: return "shared" default: return fmt.Sprintf("invalid share value %d", s) } } // mountHint represents extra information about mounts that are provided via // annotations. They can override mount type, and provide sharing information // so that mounts can be correctly shared inside the pod. type mountHint struct { name string share shareType mount specs.Mount // root is the inode where the volume is mounted. For mounts with 'pod' share // the volume is mounted once and then bind mounted inside the containers. root *fs.Inode // vfsMount is the master mount for the volume. For mounts with 'pod' share // the master volume is bind mounted inside the containers. vfsMount *vfs.Mount } func (m *mountHint) setField(key, val string) error { switch key { case "source": if len(val) == 0 { return fmt.Errorf("source cannot be empty") } m.mount.Source = val case "type": return m.setType(val) case "share": share, err := parseShare(val) if err != nil { return err } m.share = share case "options": return m.setOptions(val) default: return fmt.Errorf("invalid mount annotation: %s=%s", key, val) } return nil } func (m *mountHint) setType(val string) error { switch val { case "tmpfs", "bind": m.mount.Type = val default: return fmt.Errorf("invalid type %q", val) } return nil } func (m *mountHint) setOptions(val string) error { opts := strings.Split(val, ",") if err := specutils.ValidateMountOptions(opts); err != nil { return err } // Sort options so it can be compared with container mount options later on. sort.Strings(opts) m.mount.Options = opts return nil } func (m *mountHint) isSupported() bool { return m.mount.Type == tmpfsvfs2.Name && m.share == pod } // checkCompatible verifies that shared mount is compatible with master. // For now enforce that all options are the same. Once bind mount is properly // supported, then we should ensure the master is less restrictive than the // container, e.g. master can be 'rw' while container mounts as 'ro'. func (m *mountHint) checkCompatible(mount *specs.Mount) error { // Remove options that don't affect to mount's behavior. masterOpts := filterUnsupportedOptions(&m.mount) replicaOpts := filterUnsupportedOptions(mount) if len(masterOpts) != len(replicaOpts) { return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts) } sort.Strings(masterOpts) sort.Strings(replicaOpts) for i, opt := range masterOpts { if opt != replicaOpts[i] { return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts) } } return nil } func (m *mountHint) fileAccessType() config.FileAccessType { if m.share == container { return config.FileAccessExclusive } return config.FileAccessShared } func filterUnsupportedOptions(mount *specs.Mount) []string { rv := make([]string, 0, len(mount.Options)) for _, o := range mount.Options { if isSupportedMountFlag(mount.Type, o) { rv = append(rv, o) } } return rv } // podMountHints contains a collection of mountHints for the pod. type podMountHints struct { mounts map[string]*mountHint } func newPodMountHints(spec *specs.Spec) (*podMountHints, error) { mnts := make(map[string]*mountHint) for k, v := range spec.Annotations { // Look for 'dev.gvisor.spec.mount' annotations and parse them. if strings.HasPrefix(k, MountPrefix) { // Remove the prefix and split the rest. parts := strings.Split(k[len(MountPrefix):], ".") if len(parts) != 2 { return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v) } name := parts[0] if len(name) == 0 { return nil, fmt.Errorf("invalid mount name: %s", name) } mnt := mnts[name] if mnt == nil { mnt = &mountHint{name: name} mnts[name] = mnt } if err := mnt.setField(parts[1], v); err != nil { return nil, err } } } // Validate all hints after done parsing. for name, m := range mnts { log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share) if m.share == invalid { return nil, fmt.Errorf("share field for %q has not been set", m.name) } if len(m.mount.Source) == 0 { return nil, fmt.Errorf("source field for %q has not been set", m.name) } if len(m.mount.Type) == 0 { return nil, fmt.Errorf("type field for %q has not been set", m.name) } // Check for duplicate mount sources. for name2, m2 := range mnts { if name != name2 && m.mount.Source == m2.mount.Source { return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source) } } } return &podMountHints{mounts: mnts}, nil } func (p *podMountHints) findMount(mount *specs.Mount) *mountHint { for _, m := range p.mounts { if m.mount.Source == mount.Source { return m } } return nil } type containerMounter struct { root *specs.Root // mounts is the set of submounts for the container. It's a copy from the spec // that may be freely modified without affecting the original spec. mounts []specs.Mount // fds is the list of FDs to be dispensed for mounts that require it. fds fdDispenser k *kernel.Kernel hints *podMountHints } func newContainerMounter(info *containerInfo, k *kernel.Kernel, hints *podMountHints, vfs2Enabled bool) *containerMounter { return &containerMounter{ root: info.spec.Root, mounts: compileMounts(info.spec, info.conf, vfs2Enabled), fds: fdDispenser{fds: info.goferFDs}, k: k, hints: hints, } } // processHints processes annotations that container hints about how volumes // should be mounted (e.g. a volume shared between containers). It must be // called for the root container only. func (c *containerMounter) processHints(conf *config.Config, creds *auth.Credentials) error { if conf.VFS2 { return c.processHintsVFS2(conf, creds) } ctx := c.k.SupervisorContext() for _, hint := range c.hints.mounts { // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a // common gofer to mount all shared volumes. if hint.mount.Type != tmpfsvfs2.Name { continue } log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) inode, err := c.mountSharedMaster(ctx, conf, hint) if err != nil { return fmt.Errorf("mounting shared master %q: %v", hint.name, err) } hint.root = inode } return nil } // setupFS is used to set up the file system for all containers. This is the // main entry point method, with most of the other being internal only. It // returns the mount namespace that is created for the container. func (c *containerMounter) setupFS(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) { log.Infof("Configuring container's file system") // Create context with root credentials to mount the filesystem (the current // user may not be privileged enough). rootProcArgs := *procArgs rootProcArgs.WorkingDirectory = "/" rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace) rootProcArgs.Umask = 0022 rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals rootCtx := rootProcArgs.NewContext(c.k) mns, err := c.createMountNamespace(rootCtx, conf) if err != nil { return nil, err } // Set namespace here so that it can be found in rootCtx. rootProcArgs.MountNamespace = mns if err := c.mountSubmounts(rootCtx, conf, mns); err != nil { return nil, err } return mns, nil } func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config) (*fs.MountNamespace, error) { rootInode, err := c.createRootMount(ctx, conf) if err != nil { return nil, fmt.Errorf("creating filesystem for container: %v", err) } mns, err := fs.NewMountNamespace(ctx, rootInode) if err != nil { return nil, fmt.Errorf("creating new mount namespace for container: %v", err) } return mns, nil } func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *fs.MountNamespace) error { root := mns.Root() defer root.DecRef(ctx) for i := range c.mounts { m := &c.mounts[i] log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options) if hint := c.hints.findMount(m); hint != nil && hint.isSupported() { if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil { return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err) } } else { if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil { return fmt.Errorf("mount submount %q: %v", m.Destination, err) } } } if err := c.mountTmp(ctx, conf, mns, root); err != nil { return fmt.Errorf("mount submount %q: %v", "tmp", err) } if err := c.checkDispenser(); err != nil { return err } return nil } func (c *containerMounter) checkDispenser() error { if !c.fds.empty() { return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds) } return nil } // mountSharedMaster mounts the master of a volume that is shared among // containers in a pod. It returns the root mount's inode. func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, &hint.mount) if err != nil { return nil, err } if len(fsName) == 0 { return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) } // Mount with revalidate because it's shared among containers. opts = append(opts, "cache=revalidate") // All filesystem names should have been mapped to something we know. filesystem := mustFindFilesystem(fsName) mf := mountFlags(hint.mount.Options) if useOverlay { // All writes go to upper, be paranoid and make lower readonly. mf.ReadOnly = true } inode, err := filesystem.Mount(ctx, mountDevice(&hint.mount), mf, strings.Join(opts, ","), nil) if err != nil { return nil, fmt.Errorf("creating mount %q: %v", hint.name, err) } if useOverlay { log.Debugf("Adding overlay on top of shared mount %q", hint.name) inode, err = addOverlay(ctx, inode, hint.mount.Type, mf) if err != nil { return nil, err } } return inode, nil } // createRootMount creates the root filesystem. func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Config) (*fs.Inode, error) { // First construct the filesystem from the spec.Root. mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay} fd := c.fds.remove() log.Infof("Mounting root over 9P, ioFD: %d", fd) p9FS := mustFindFilesystem("9p") opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */) // We can't check for overlayfs here because sandbox is chroot'ed and gofer // can only send mount options for specs.Mounts (specs.Root is missing // Options field). So assume root is always on top of overlayfs. opts = append(opts, "overlayfs_stale_read") rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil) if err != nil { return nil, fmt.Errorf("creating root mount point: %v", err) } // We need to overlay the root on top of a ramfs with stub directories // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always // mounted even if they are not in the spec. submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp") rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf) if err != nil { return nil, fmt.Errorf("adding submount overlay: %v", err) } if conf.Overlay && !c.root.Readonly { log.Debugf("Adding overlay on top of root mount") // Overlay a tmpfs filesystem on top of the root. rootInode, err = addOverlay(ctx, rootInode, "root-overlay-upper", mf) if err != nil { return nil, err } } log.Infof("Mounted %q to %q type root", c.root.Path, "/") return rootInode, nil } // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values // used for mounts. func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m *specs.Mount) (string, []string, bool, error) { specutils.MaybeConvertToBindMount(m) var ( fsName string opts []string useOverlay bool ) switch m.Type { case devpts.Name, devtmpfs.Name, procvfs2.Name, sysvfs2.Name: fsName = m.Type case nonefs: fsName = sysvfs2.Name case tmpfsvfs2.Name: fsName = m.Type var err error opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) if err != nil { return "", nil, false, err } case bind: fd := c.fds.remove() fsName = gofervfs2.Name opts = p9MountData(fd, c.getMountAccessType(conf, m), conf.VFS2) // If configured, add overlay to all writable mounts. useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly case cgroupfs.Name: fsName = m.Type var err error opts, err = parseAndFilterOptions(m.Options, cgroupfs.SupportedMountOptions...) if err != nil { return "", nil, false, err } default: log.Warningf("ignoring unknown filesystem type %q", m.Type) } return fsName, opts, useOverlay, nil } func (c *containerMounter) getMountAccessType(conf *config.Config, mount *specs.Mount) config.FileAccessType { if hint := c.hints.findMount(mount); hint != nil { return hint.fileAccessType() } return conf.FileAccessMounts } // mountSubmount mounts volumes inside the container's root. Because mounts may // be readonly, a lower ramfs overlay is added to create the mount point dir. // Another overlay is added with tmpfs on top if Config.Overlay is true. // 'm.Destination' must be an absolute path with '..' and symlinks resolved. func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m *specs.Mount) error { // Map mount type to filesystem name, and parse out the options that we are // capable of dealing with. fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) if err != nil { return err } if fsName == "" { // Filesystem is not supported (e.g. cgroup), just skip it. return nil } // All filesystem names should have been mapped to something we know. filesystem := mustFindFilesystem(fsName) mf := mountFlags(m.Options) if useOverlay { // All writes go to upper, be paranoid and make lower readonly. mf.ReadOnly = true } inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil) if err != nil { err := fmt.Errorf("creating mount with source %q: %v", m.Source, err) // Check to see if this is a common error due to a Linux bug. // This error is generated here in order to cause it to be // printed to the user using Docker via 'runsc create' etc. rather // than simply printed to the logs for the 'runsc boot' command. // // We check the error message string rather than type because the // actual error types (unix.EIO, unix.EPIPE) are lost by file system // implementation (e.g. p9). // TODO(gvisor.dev/issue/1765): Remove message when bug is resolved. if strings.Contains(err.Error(), unix.EIO.Error()) || strings.Contains(err.Error(), unix.EPIPE.Error()) { return fmt.Errorf("%v: %s", err, specutils.FaqErrorMsg("memlock", "you may be encountering a Linux kernel bug")) } return err } // If there are submounts, we need to overlay the mount on top of a ramfs // with stub directories for submount paths. submounts := subtargets(m.Destination, c.mounts) if len(submounts) > 0 { log.Infof("Adding submount overlay over %q", m.Destination) inode, err = addSubmountOverlay(ctx, inode, submounts, mf) if err != nil { return fmt.Errorf("adding submount overlay: %v", err) } } if useOverlay { log.Debugf("Adding overlay on top of mount %q", m.Destination) inode, err = addOverlay(ctx, inode, m.Type, mf) if err != nil { return err } } maxTraversals := uint(0) dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals) if err != nil { return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err) } defer dirent.DecRef(ctx) if err := mns.Mount(ctx, dirent, inode); err != nil { return fmt.Errorf("mount %q error: %v", m.Destination, err) } log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts) return nil } // mountSharedSubmount binds mount to a previously mounted volume that is shared // among containers in the same pod. func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount *specs.Mount, source *mountHint) error { if err := source.checkCompatible(mount); err != nil { return err } maxTraversals := uint(0) target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals) if err != nil { return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err) } defer target.DecRef(ctx) // Take a ref on the inode that is about to be (re)-mounted. source.root.IncRef() if err := mns.Mount(ctx, target, source.root); err != nil { source.root.DecRef(ctx) return fmt.Errorf("bind mount %q error: %v", mount.Destination, err) } log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) return nil } // addRestoreMount adds a mount to the MountSources map used for restoring a // checkpointed container. func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m *specs.Mount) error { fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m) if err != nil { return err } if fsName == "" { // Filesystem is not supported (e.g. cgroup), just skip it. return nil } newMount := fs.MountArgs{ Dev: mountDevice(m), Flags: mountFlags(m.Options), DataString: strings.Join(opts, ","), } if useOverlay { newMount.Flags.ReadOnly = true } renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount) log.Infof("Added mount at %q: %+v", fsName, newMount) return nil } // createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding // the mounts to the environment. func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.RestoreEnvironment, error) { renv := &fs.RestoreEnvironment{ MountSources: make(map[string][]fs.MountArgs), } // Add root mount. fd := c.fds.remove() opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */) mf := fs.MountSourceFlags{} if c.root.Readonly || conf.Overlay { mf.ReadOnly = true } rootMount := fs.MountArgs{ Dev: rootDevice, Flags: mf, DataString: strings.Join(opts, ","), } renv.MountSources[gofervfs2.Name] = append(renv.MountSources[gofervfs2.Name], rootMount) // Add submounts. var tmpMounted bool for i := range c.mounts { m := &c.mounts[i] if err := c.addRestoreMount(conf, renv, m); err != nil { return nil, err } if filepath.Clean(m.Destination) == "/tmp" { tmpMounted = true } } // TODO(b/67958150): handle '/tmp' properly (see mountTmp()). if !tmpMounted { tmpMount := specs.Mount{ Type: tmpfsvfs2.Name, Destination: "/tmp", } if err := c.addRestoreMount(conf, renv, &tmpMount); err != nil { return nil, err } } return renv, nil } // mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so. // Technically we don't have to mount tmpfs at /tmp, as we could just rely on // the host /tmp, but this is a nice optimization, and fixes some apps that call // mknod in /tmp. It's unsafe to mount tmpfs if: // 1. /tmp is mounted explicitly: we should not override user's wish // 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp // // Note that when there are submounts inside of '/tmp', directories for the // mount points must be present, making '/tmp' not empty anymore. func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent) error { for _, m := range c.mounts { if filepath.Clean(m.Destination) == "/tmp" { log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m) return nil } } maxTraversals := uint(0) tmp, err := mns.FindInode(ctx, root, root, "tmp", &maxTraversals) switch { case err == nil: // Found '/tmp' in filesystem, check if it's empty. defer tmp.DecRef(ctx) f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true}) if err != nil { return err } defer f.DecRef(ctx) serializer := &fs.CollectEntriesSerializer{} if err := f.Readdir(ctx, serializer); err != nil { return err } // If more than "." and ".." is found, skip internal tmpfs to prevent hiding // existing files. if len(serializer.Order) > 2 { log.Infof("Skipping internal tmpfs on top %q, because it's not empty", "/tmp") return nil } log.Infof("Mounting internal tmpfs on top of empty %q", "/tmp") fallthrough case linuxerr.Equals(linuxerr.ENOENT, err): // No '/tmp' found (or fallthrough from above). Safe to mount internal // tmpfs. tmpMount := specs.Mount{ Type: tmpfsvfs2.Name, Destination: "/tmp", // Sticky bit is added to prevent accidental deletion of files from // another user. This is normally done for /tmp. Options: []string{"mode=01777"}, } return c.mountSubmount(ctx, conf, mns, root, &tmpMount) default: return err } }