summaryrefslogtreecommitdiffhomepage
path: root/runsc/boot/fs.go
diff options
context:
space:
mode:
Diffstat (limited to 'runsc/boot/fs.go')
-rw-r--r--runsc/boot/fs.go774
1 files changed, 774 insertions, 0 deletions
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
new file mode 100644
index 000000000..4b1557b9a
--- /dev/null
+++ b/runsc/boot/fs.go
@@ -0,0 +1,774 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+ "path"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "syscall"
+
+ // Include filesystem types that OCI spec might mount.
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
+ _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
+)
+
+const (
+ // Filesystem name for 9p gofer mounts.
+ rootFsName = "9p"
+
+ // Device name for root mount.
+ rootDevice = "9pfs-/"
+
+ // ChildContainersDir is the directory where child container root
+ // filesystems are mounted.
+ ChildContainersDir = "/__runsc_containers__"
+
+ // Filesystems that runsc supports.
+ bind = "bind"
+ devpts = "devpts"
+ devtmpfs = "devtmpfs"
+ proc = "proc"
+ sysfs = "sysfs"
+ tmpfs = "tmpfs"
+ nonefs = "none"
+)
+
+type fdDispenser struct {
+ fds []int
+}
+
+func (f *fdDispenser) remove() int {
+ if f.empty() {
+ panic("fdDispenser out of fds")
+ }
+ rv := f.fds[0]
+ f.fds = f.fds[1:]
+ return rv
+}
+
+func (f *fdDispenser) empty() bool {
+ return len(f.fds) == 0
+}
+
+func adjustDirentCache(k *kernel.Kernel) error {
+ var hl syscall.Rlimit
+ if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
+ return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
+ }
+ if int64(hl.Cur) != syscall.RLIM_INFINITY {
+ newSize := hl.Cur / 2
+ if newSize < gofer.DefaultDirentCacheSize {
+ log.Infof("Setting gofer dirent cache size to %d", newSize)
+ gofer.DefaultDirentCacheSize = newSize
+ k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
+ }
+ }
+ return nil
+}
+
+// setupRootContainerFS creates a mount namespace containing the root filesystem
+// and all mounts. 'rootCtx' is used to walk directories to find mount points.
+// 'setMountNS' is called after namespace is created. It must set the mount NS
+// to 'rootCtx'.
+func setupRootContainerFS(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int, setMountNS func(*fs.MountNamespace)) error {
+ mounts := compileMounts(spec)
+
+ // Create a tmpfs mount where we create and mount a root filesystem for
+ // each child container.
+ mounts = append(mounts, specs.Mount{
+ Type: tmpfs,
+ Destination: ChildContainersDir,
+ })
+
+ fds := &fdDispenser{fds: goferFDs}
+ rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts)
+ if err != nil {
+ return fmt.Errorf("creating root mount: %v", err)
+ }
+ mns, err := fs.NewMountNamespace(userCtx, rootInode)
+ if err != nil {
+ return fmt.Errorf("creating root mount namespace: %v", err)
+ }
+ setMountNS(mns)
+
+ root := mns.Root()
+ defer root.DecRef()
+ return mountSubmounts(rootCtx, conf, mns, root, mounts, fds)
+}
+
+// compileMounts returns the supported mounts from the mount spec, adding any
+// mandatory mounts that are required by the OCI specification.
+func compileMounts(spec *specs.Spec) []specs.Mount {
+ // Keep track of whether proc and sys were mounted.
+ var procMounted, sysMounted bool
+ var mounts []specs.Mount
+
+ // Always mount /dev.
+ mounts = append(mounts, specs.Mount{
+ Type: devtmpfs,
+ Destination: "/dev",
+ })
+
+ mounts = append(mounts, specs.Mount{
+ Type: devpts,
+ Destination: "/dev/pts",
+ })
+
+ // Mount all submounts from the spec.
+ for _, m := range spec.Mounts {
+ if !specutils.IsSupportedDevMount(m) {
+ log.Warningf("ignoring dev mount at %q", m.Destination)
+ continue
+ }
+ mounts = append(mounts, m)
+ switch filepath.Clean(m.Destination) {
+ case "/proc":
+ procMounted = true
+ case "/sys":
+ sysMounted = true
+ }
+ }
+
+ // Mount proc and sys even if the user did not ask for it, as the spec
+ // says we SHOULD.
+ var mandatoryMounts []specs.Mount
+ if !procMounted {
+ mandatoryMounts = append(mandatoryMounts, specs.Mount{
+ Type: proc,
+ Destination: "/proc",
+ })
+ }
+ if !sysMounted {
+ mandatoryMounts = append(mandatoryMounts, specs.Mount{
+ Type: sysfs,
+ Destination: "/sys",
+ })
+ }
+
+ // The mandatory mounts should be ordered right after the root, in case
+ // there are submounts of these mandatory mounts already in the spec.
+ mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)
+
+ return mounts
+}
+
+// createRootMount creates the root filesystem.
+func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) {
+ // First construct the filesystem from the spec.Root.
+ mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly || conf.Overlay}
+
+ var (
+ rootInode *fs.Inode
+ err error
+ )
+
+ fd := fds.remove()
+ log.Infof("Mounting root over 9P, ioFD: %d", fd)
+ p9FS := mustFindFilesystem("9p")
+ opts := p9MountOptions(fd, conf.FileAccess)
+ rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
+ if err != nil {
+ return nil, fmt.Errorf("creating root mount point: %v", err)
+ }
+
+ // We need to overlay the root on top of a ramfs with stub directories
+ // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always
+ // mounted even if they are not in the spec.
+ submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp")
+ rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+ if err != nil {
+ return nil, fmt.Errorf("adding submount overlay: %v", err)
+ }
+
+ if conf.Overlay && !spec.Root.Readonly {
+ log.Debugf("Adding overlay on top of root mount")
+ // Overlay a tmpfs filesystem on top of the root.
+ rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ log.Infof("Mounted %q to %q type root", spec.Root.Path, "/")
+ return rootInode, nil
+}
+
+func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+ // Upper layer uses the same flags as lower, but it must be read-write.
+ lowerFlags.ReadOnly = false
+
+ tmpFS := mustFindFilesystem("tmpfs")
+ if !fs.IsDir(lower.StableAttr) {
+ // Create overlay on top of mount file, e.g. /etc/hostname.
+ msrc := fs.NewCachingMountSource(tmpFS, lowerFlags)
+ return fs.NewOverlayRootFile(ctx, msrc, lower, lowerFlags)
+ }
+
+ // Create overlay on top of mount dir.
+ upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "", nil)
+ if err != nil {
+ return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
+ }
+ return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags)
+}
+
+// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
+// used for mounts.
+func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) {
+ var (
+ fsName string
+ opts []string
+ useOverlay bool
+ err error
+ )
+
+ switch m.Type {
+ case devpts, devtmpfs, proc, sysfs:
+ fsName = m.Type
+ case nonefs:
+ fsName = sysfs
+ case tmpfs:
+ fsName = m.Type
+
+ // tmpfs has some extra supported options that we must pass through.
+ opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
+
+ case bind:
+ fd := fds.remove()
+ fsName = "9p"
+ // Non-root bind mounts are always shared.
+ opts = p9MountOptions(fd, FileAccessShared)
+ // If configured, add overlay to all writable mounts.
+ useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
+ default:
+ // TODO(nlacasse): Support all the mount types and make this a
+ // fatal error. Most applications will "just work" without
+ // them, so this is a warning for now.
+ // we do not support.
+ log.Warningf("ignoring unknown filesystem type %q", m.Type)
+ }
+ return fsName, opts, useOverlay, err
+}
+
+func mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount, fds *fdDispenser) error {
+ for _, m := range mounts {
+ if err := mountSubmount(ctx, conf, mns, root, fds, m, mounts); err != nil {
+ return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+ }
+ }
+
+ if err := mountTmp(ctx, conf, mns, root, mounts); err != nil {
+ return fmt.Errorf("mount submount %q: %v", "tmp", err)
+ }
+
+ if !fds.empty() {
+ return fmt.Errorf("not all mount points were consumed, remaining: %v", fds)
+ }
+ return nil
+}
+
+// mountSubmount mounts volumes inside the container's root. Because mounts may
+// be readonly, a lower ramfs overlay is added to create the mount point dir.
+// Another overlay is added with tmpfs on top if Config.Overlay is true.
+// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
+func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error {
+ // Map mount type to filesystem name, and parse out the options that we are
+ // capable of dealing with.
+ fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
+
+ // Return the error or nil that corresponds to the default case in getMountNameAndOptions.
+ if err != nil {
+ return err
+ }
+ if fsName == "" {
+ return nil
+ }
+
+ // All filesystem names should have been mapped to something we know.
+ filesystem := mustFindFilesystem(fsName)
+
+ mf := mountFlags(m.Options)
+ if useOverlay {
+ // All writes go to upper, be paranoid and make lower readonly.
+ mf.ReadOnly = true
+ }
+
+ inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
+ if err != nil {
+ return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+ }
+
+ // If there are submounts, we need to overlay the mount on top of a
+ // ramfs with stub directories for submount paths.
+ submounts := subtargets(m.Destination, mounts)
+ if len(submounts) > 0 {
+ log.Infof("Adding submount overlay over %q", m.Destination)
+ inode, err = addSubmountOverlay(ctx, inode, submounts)
+ if err != nil {
+ return fmt.Errorf("adding submount overlay: %v", err)
+ }
+ }
+
+ if useOverlay {
+ log.Debugf("Adding overlay on top of mount %q", m.Destination)
+ inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
+ if err != nil {
+ return err
+ }
+ }
+
+ maxTraversals := uint(0)
+ dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
+ if err != nil {
+ return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
+ }
+ defer dirent.DecRef()
+ if err := mns.Mount(ctx, dirent, inode); err != nil {
+ return fmt.Errorf("mount %q error: %v", m.Destination, err)
+ }
+
+ log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+ return nil
+}
+
+// p9MountOptions creates a slice of options for a p9 mount.
+func p9MountOptions(fd int, fa FileAccessType) []string {
+ opts := []string{
+ "trans=fd",
+ "rfdno=" + strconv.Itoa(fd),
+ "wfdno=" + strconv.Itoa(fd),
+ "privateunixsocket=true",
+ }
+ if fa == FileAccessShared {
+ opts = append(opts, "cache=remote_revalidating")
+ }
+ return opts
+}
+
+// parseAndFilterOptions parses a MountOptions slice and filters by the allowed
+// keys.
+func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
+ var out []string
+ for _, o := range opts {
+ kv := strings.Split(o, "=")
+ switch len(kv) {
+ case 1:
+ if specutils.ContainsStr(allowedKeys, o) {
+ out = append(out, o)
+ continue
+ }
+ log.Warningf("ignoring unsupported key %q", kv)
+ case 2:
+ if specutils.ContainsStr(allowedKeys, kv[0]) {
+ out = append(out, o)
+ continue
+ }
+ log.Warningf("ignoring unsupported key %q", kv[0])
+ default:
+ return nil, fmt.Errorf("invalid option %q", o)
+ }
+ }
+ return out, nil
+}
+
+// mountDevice returns a device string based on the fs type and target
+// of the mount.
+func mountDevice(m specs.Mount) string {
+ if m.Type == bind {
+ // Make a device string that includes the target, which is consistent across
+ // S/R and uniquely identifies the connection.
+ return "9pfs-" + m.Destination
+ }
+ // All other fs types use device "none".
+ return "none"
+}
+
+// addRestoreMount adds a mount to the MountSources map used for restoring a
+// checkpointed container.
+func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error {
+ fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
+
+ // Return the error or nil that corresponds to the default case in getMountNameAndOptions.
+ if err != nil {
+ return err
+ }
+ // TODO(nlacasse): Fix this when we support all the mount types and
+ // make this a fatal error.
+ if fsName == "" {
+ return nil
+ }
+
+ newMount := fs.MountArgs{
+ Dev: mountDevice(m),
+ Flags: mountFlags(m.Options),
+ DataString: strings.Join(opts, ","),
+ }
+ if useOverlay {
+ newMount.Flags.ReadOnly = true
+ }
+ renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
+ log.Infof("Added mount at %q: %+v", fsName, newMount)
+ return nil
+}
+
+// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts
+// to the environment.
+func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) {
+ renv := &fs.RestoreEnvironment{
+ MountSources: make(map[string][]fs.MountArgs),
+ }
+
+ // Add root mount.
+ fd := fds.remove()
+ opts := p9MountOptions(fd, conf.FileAccess)
+
+ mf := fs.MountSourceFlags{}
+ if spec.Root.Readonly || conf.Overlay {
+ mf.ReadOnly = true
+ }
+
+ rootMount := fs.MountArgs{
+ Dev: rootDevice,
+ Flags: mf,
+ DataString: strings.Join(opts, ","),
+ }
+ renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
+
+ // Add submounts.
+ var tmpMounted bool
+ for _, m := range compileMounts(spec) {
+ if err := addRestoreMount(conf, renv, m, fds); err != nil {
+ return nil, err
+ }
+ if filepath.Clean(m.Destination) == "/tmp" {
+ tmpMounted = true
+ }
+ }
+
+ // TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
+ if !tmpMounted {
+ tmpMount := specs.Mount{
+ Type: tmpfs,
+ Destination: "/tmp",
+ }
+ if err := addRestoreMount(conf, renv, tmpMount, fds); err != nil {
+ return nil, err
+ }
+ }
+
+ return renv, nil
+}
+
+func mountFlags(opts []string) fs.MountSourceFlags {
+ mf := fs.MountSourceFlags{}
+ for _, o := range opts {
+ switch o {
+ case "rw":
+ mf.ReadOnly = false
+ case "ro":
+ mf.ReadOnly = true
+ case "noatime":
+ mf.NoAtime = true
+ case "noexec":
+ mf.NoExec = true
+ default:
+ log.Warningf("ignoring unknown mount option %q", o)
+ }
+ }
+ return mf
+}
+
+func mustFindFilesystem(name string) fs.Filesystem {
+ fs, ok := fs.FindFilesystem(name)
+ if !ok {
+ panic(fmt.Sprintf("could not find filesystem %q", name))
+ }
+ return fs
+}
+
+// addSubmountOverlay overlays the inode over a ramfs tree containing the given
+// paths.
+func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
+ msrc := fs.NewPseudoMountSource()
+ mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts)
+ if err != nil {
+ return nil, fmt.Errorf("creating mount tree: %v", err)
+ }
+ overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
+ if err != nil {
+ return nil, fmt.Errorf("adding mount overlay: %v", err)
+ }
+ return overlayInode, err
+}
+
+// subtargets takes a set of Mounts and returns only the targets that are
+// children of the given root. The returned paths are relative to the root.
+func subtargets(root string, mnts []specs.Mount) []string {
+ var targets []string
+ for _, mnt := range mnts {
+ if relPath, isSubpath := fs.IsSubpath(mnt.Destination, root); isSubpath {
+ targets = append(targets, relPath)
+ }
+ }
+ return targets
+}
+
+// setupContainerFS is used to set up the file system and amend the procArgs accordingly.
+// procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs.
+func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error {
+ ctx := procArgs.NewContext(k)
+
+ // Create the FD map, which will set stdin, stdout, and stderr. If console
+ // is true, then ioctl calls will be passed through to the host fd.
+ fdm, err := createFDMap(ctx, k, ls, console, stdioFDs)
+ if err != nil {
+ return fmt.Errorf("importing fds: %v", err)
+ }
+
+ // CreateProcess takes a reference on FDMap if successful. We
+ // won't need ours either way.
+ procArgs.FDMap = fdm
+
+ // Use root user to configure mounts. The current user might not have
+ // permission to do so.
+ rootProcArgs := kernel.CreateProcessArgs{
+ WorkingDirectory: "/",
+ Credentials: auth.NewRootCredentials(creds.UserNamespace),
+ Umask: 0022,
+ MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
+ }
+ rootCtx := rootProcArgs.NewContext(k)
+
+ // If this is the root container, we also need to setup the root mount
+ // namespace.
+ mns := k.RootMountNamespace()
+ if mns == nil {
+ // Setup the root container.
+ return setupRootContainerFS(ctx, rootCtx, spec, conf, goferFDs, func(mns *fs.MountNamespace) {
+ k.SetRootMountNamespace(mns)
+ })
+ }
+
+ // Setup a child container.
+ log.Infof("Creating new process in child container.")
+ globalRoot := mns.Root()
+ defer globalRoot.DecRef()
+
+ // Create mount point for the container's rootfs.
+ maxTraversals := uint(0)
+ contDir, err := mns.FindInode(ctx, globalRoot, nil, ChildContainersDir, &maxTraversals)
+ if err != nil {
+ return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err)
+ }
+ if err := contDir.CreateDirectory(ctx, globalRoot, cid, fs.FilePermsFromMode(0755)); err != nil {
+ return fmt.Errorf("create directory %q: %v", cid, err)
+ }
+ containerRoot, err := contDir.Walk(ctx, globalRoot, cid)
+ if err != nil {
+ return fmt.Errorf("walk to %q failed: %v", cid, err)
+ }
+ defer containerRoot.DecRef()
+
+ // Create the container's root filesystem mount.
+ fds := &fdDispenser{fds: goferFDs}
+ rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil)
+ if err != nil {
+ return fmt.Errorf("creating filesystem for container: %v", err)
+ }
+
+ // Mount the container's root filesystem to the newly created mount point.
+ if err := mns.Mount(ctx, containerRoot, rootInode); err != nil {
+ return fmt.Errorf("mount container root: %v", err)
+ }
+
+ // We have to re-walk to the dirent to find the mounted
+ // directory. The old dirent is invalid at this point.
+ containerRoot, err = contDir.Walk(ctx, globalRoot, cid)
+ if err != nil {
+ return fmt.Errorf("find container mount point %q: %v", cid, err)
+ }
+ cu := specutils.MakeCleanup(func() { containerRoot.DecRef() })
+ defer cu.Clean()
+
+ log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, cid))
+
+ // Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
+ procArgs.Root = containerRoot
+
+ // Mount all submounts.
+ mounts := compileMounts(spec)
+ if err := mountSubmounts(rootCtx, conf, mns, containerRoot, mounts, fds); err != nil {
+ return err
+ }
+ cu.Release()
+ return nil
+}
+
+// setExecutablePath sets the procArgs.Filename by searching the PATH for an
+// executable matching the procArgs.Argv[0].
+func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
+ paths := fs.GetPath(procArgs.Envv)
+ exe := procArgs.Argv[0]
+ f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
+ if err != nil {
+ return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
+ }
+ procArgs.Filename = f
+ return nil
+}
+
+// destroyContainerFS cleans up the filesystem by unmounting all mounts for the
+// given container and deleting the container root directory.
+func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error {
+ defer func() {
+ // Flushing dirent references triggers many async close
+ // operations. We must wait for those to complete before
+ // returning, otherwise the caller may kill the gofer before
+ // they complete, causing a cascade of failing RPCs.
+ //
+ // This must take place in the first deferred function, so that
+ // it runs after all the other deferred DecRef() calls in this
+ // function.
+ log.Infof("Waiting for async filesystem operations to complete")
+ fs.AsyncBarrier()
+ }()
+
+ // First get a reference to the container root directory.
+ mns := k.RootMountNamespace()
+ mnsRoot := mns.Root()
+ defer mnsRoot.DecRef()
+ containerRoot := path.Join(ChildContainersDir, cid)
+ maxTraversals := uint(0)
+ containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, &maxTraversals)
+ if err == syserror.ENOENT {
+ // Container must have been destroyed already. That's fine.
+ return nil
+ }
+ if err != nil {
+ return fmt.Errorf("finding container root directory %q: %v", containerRoot, err)
+ }
+ defer containerRootDirent.DecRef()
+
+ // Iterate through all submounts and unmount them. We unmount lazily by
+ // setting detach=true, so we can unmount in any order.
+ mnt := mns.FindMount(containerRootDirent)
+ for _, m := range mns.AllMountsUnder(mnt) {
+ root := m.Root()
+ defer root.DecRef()
+
+ // Do a best-effort unmount by flushing the refs and unmount
+ // with "detach only = true". Unmount returns EINVAL when the mount point
+ // doesn't exist, i.e. it has already been unmounted.
+ log.Debugf("Unmounting container mount %q", root.BaseName())
+ root.Inode.MountSource.FlushDirentRefs()
+ if err := mns.Unmount(ctx, root, true /* detach only */); err != nil && err != syserror.EINVAL {
+ return fmt.Errorf("unmounting container mount %q: %v", root.BaseName(), err)
+ }
+ }
+
+ // Get a reference to the parent directory and remove the root
+ // container directory.
+ maxTraversals = 0
+ containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, &maxTraversals)
+ if err != nil {
+ return fmt.Errorf("finding containers directory %q: %v", ChildContainersDir, err)
+ }
+ defer containersDirDirent.DecRef()
+ log.Debugf("Deleting container root %q", containerRoot)
+ if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil {
+ return fmt.Errorf("removing directory %q: %v", containerRoot, err)
+ }
+
+ return nil
+}
+
+// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
+// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
+// the host /tmp, but this is a nice optimization, and fixes some apps that call
+// mknod in /tmp. It's unsafe to mount tmpfs if:
+// 1. /tmp is mounted explictly: we should not override user's wish
+// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
+//
+// Note that when there are submounts inside of '/tmp', directories for the
+// mount points must be present, making '/tmp' not empty anymore.
+func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount) error {
+ for _, m := range mounts {
+ if filepath.Clean(m.Destination) == "/tmp" {
+ log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
+ return nil
+ }
+ }
+
+ maxTraversals := uint(0)
+ tmp, err := mns.FindInode(ctx, root, root, "tmp", &maxTraversals)
+ switch err {
+ case nil:
+ // Found '/tmp' in filesystem, check if it's empty.
+ defer tmp.DecRef()
+ f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
+ if err != nil {
+ return err
+ }
+ defer f.DecRef()
+ serializer := &fs.CollectEntriesSerializer{}
+ if err := f.Readdir(ctx, serializer); err != nil {
+ return err
+ }
+ // If more than "." and ".." is found, skip internal tmpfs to prevent hiding
+ // existing files.
+ if len(serializer.Order) > 2 {
+ log.Infof("Skipping internal tmpfs on top %q, because it's not empty", "/tmp")
+ return nil
+ }
+ log.Infof("Mounting internal tmpfs on top of empty %q", "/tmp")
+ fallthrough
+
+ case syserror.ENOENT:
+ // No '/tmp' found (or fallthrough from above). Safe to mount internal
+ // tmpfs.
+ tmpMount := specs.Mount{
+ Type: tmpfs,
+ Destination: "/tmp",
+ // Sticky bit is added to prevent accidental deletion of files from
+ // another user. This is normally done for /tmp.
+ Options: []string{"mode=1777"},
+ }
+ return mountSubmount(ctx, conf, mns, root, nil, tmpMount, mounts)
+
+ default:
+ return err
+ }
+}