diff options
author | Nicolas Lacasse <nlacasse@google.com> | 2019-07-23 14:35:50 -0700 |
---|---|---|
committer | gVisor bot <gvisor-bot@google.com> | 2019-07-23 14:37:07 -0700 |
commit | 04cbb13ce9b151cf906f42e3f18ce3a875f01f63 (patch) | |
tree | 3c68885355ff140b59f5aee4b149911bcb72c439 /runsc/boot/fs.go | |
parent | 57745994384ee1ff94fc7bed4f814ba75e39d48e (diff) |
Give each container a distinct MountNamespace.
This keeps all container filesystem completely separate from eachother
(including from the root container filesystem), and allows us to get rid of the
"__runsc_containers__" directory.
It also simplifies container startup/teardown as we don't have to muck around
in the root container's filesystem.
PiperOrigin-RevId: 259613346
Diffstat (limited to 'runsc/boot/fs.go')
-rw-r--r-- | runsc/boot/fs.go | 140 |
1 files changed, 21 insertions, 119 deletions
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index d3e3196fd..55bfc27ff 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -54,10 +54,6 @@ const ( // MountPrefix is the annotation prefix for mount hints. MountPrefix = "gvisor.dev/spec/mount" - // ChildContainersDir is the directory where child container root - // filesystems are mounted. - ChildContainersDir = "/__runsc_containers__" - // Filesystems that runsc supports. bind = "bind" devpts = "devpts" @@ -256,10 +252,10 @@ func subtargets(root string, mnts []specs.Mount) []string { // setExecutablePath sets the procArgs.Filename by searching the PATH for an // executable matching the procArgs.Argv[0]. -func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error { +func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error { paths := fs.GetPath(procArgs.Envv) exe := procArgs.Argv[0] - f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths) + f, err := procArgs.MountNamespace.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths) if err != nil { return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err) } @@ -514,11 +510,16 @@ func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs * // If this is the root container, we also need to setup the root mount // namespace. - mns := c.k.RootMountNamespace() - if mns == nil { + rootMNS := c.k.RootMountNamespace() + if rootMNS == nil { // Setup the root container. - if err := c.setupRootContainer(ctx, rootCtx, conf, func(mns *fs.MountNamespace) { - c.k.SetRootMountNamespace(mns) + if err := c.setupRootContainer(ctx, rootCtx, conf, func(rootMNS *fs.MountNamespace) { + // The callback to setupRootContainer inherits a + // reference on the rootMNS, so we don't need to take + // an additional reference here. + procArgs.MountNamespace = rootMNS + procArgs.Root = rootMNS.Root() + c.k.SetRootMountNamespace(rootMNS) }); err != nil { return err } @@ -527,54 +528,26 @@ func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs * // Setup a child container. log.Infof("Creating new process in child container.") - globalRoot := mns.Root() - defer globalRoot.DecRef() - - // Create mount point for the container's rootfs. - maxTraversals := uint(0) - contDir, err := mns.FindInode(ctx, globalRoot, nil, ChildContainersDir, &maxTraversals) - if err != nil { - return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err) - } - if err := contDir.CreateDirectory(ctx, globalRoot, c.cid, fs.FilePermsFromMode(0755)); err != nil { - return fmt.Errorf("create directory %q: %v", c.cid, err) - } - containerRoot, err := contDir.Walk(ctx, globalRoot, c.cid) - if err != nil { - return fmt.Errorf("walk to %q failed: %v", c.cid, err) - } - defer containerRoot.DecRef() - // Create the container's root filesystem mount. + // Create a new root inode and mount namespace for the container. rootInode, err := c.createRootMount(rootCtx, conf) if err != nil { return fmt.Errorf("creating filesystem for container: %v", err) } - - // Mount the container's root filesystem to the newly created mount point. - if err := mns.Mount(ctx, containerRoot, rootInode); err != nil { - return fmt.Errorf("mount container root: %v", err) - } - - // We have to re-walk to the dirent to find the mounted directory. The old - // dirent is invalid at this point. - containerRoot, err = contDir.Walk(ctx, globalRoot, c.cid) + mns, err := fs.NewMountNamespace(rootCtx, rootInode) if err != nil { - return fmt.Errorf("find container mount point %q: %v", c.cid, err) + return fmt.Errorf("creating new mount namespace for container: %v", err) } - cu := specutils.MakeCleanup(func() { containerRoot.DecRef() }) - defer cu.Clean() - - log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, c.cid)) // Set process root here, so 'rootCtx.Value(CtxRoot)' will return it. - procArgs.Root = containerRoot + // This will also donate a reference to procArgs, as required. + procArgs.MountNamespace = mns + procArgs.Root = mns.Root() // Mount all submounts. - if err := c.mountSubmounts(rootCtx, conf, mns, containerRoot); err != nil { + if err := c.mountSubmounts(rootCtx, conf, mns, procArgs.Root); err != nil { return err } - cu.Release() return c.checkDispenser() } @@ -585,75 +558,11 @@ func (c *containerMounter) checkDispenser() error { return nil } -// destroyContainerFS cleans up the filesystem by unmounting all mounts for the -// given container and deleting the container root directory. -func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error { - defer func() { - // Flushing dirent references triggers many async close - // operations. We must wait for those to complete before - // returning, otherwise the caller may kill the gofer before - // they complete, causing a cascade of failing RPCs. - // - // This must take place in the first deferred function, so that - // it runs after all the other deferred DecRef() calls in this - // function. - log.Infof("Waiting for async filesystem operations to complete") - fs.AsyncBarrier() - }() - - // First get a reference to the container root directory. - mns := k.RootMountNamespace() - mnsRoot := mns.Root() - defer mnsRoot.DecRef() - containerRoot := path.Join(ChildContainersDir, cid) - maxTraversals := uint(0) - containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, &maxTraversals) - if err == syserror.ENOENT { - // Container must have been destroyed already. That's fine. - return nil - } - if err != nil { - return fmt.Errorf("finding container root directory %q: %v", containerRoot, err) - } - defer containerRootDirent.DecRef() - - // Iterate through all submounts and unmount them. We unmount lazily by - // setting detach=true, so we can unmount in any order. - mnt := mns.FindMount(containerRootDirent) - for _, m := range mns.AllMountsUnder(mnt) { - root := m.Root() - defer root.DecRef() - - // Do a best-effort unmount by flushing the refs and unmount - // with "detach only = true". Unmount returns EINVAL when the mount point - // doesn't exist, i.e. it has already been unmounted. - log.Debugf("Unmounting container mount %q", root.BaseName()) - root.Inode.MountSource.FlushDirentRefs() - if err := mns.Unmount(ctx, root, true /* detach only */); err != nil && err != syserror.EINVAL { - return fmt.Errorf("unmounting container mount %q: %v", root.BaseName(), err) - } - } - - // Get a reference to the parent directory and remove the root - // container directory. - maxTraversals = 0 - containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, &maxTraversals) - if err != nil { - return fmt.Errorf("finding containers directory %q: %v", ChildContainersDir, err) - } - defer containersDirDirent.DecRef() - log.Debugf("Deleting container root %q", containerRoot) - if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil { - return fmt.Errorf("removing directory %q: %v", containerRoot, err) - } - - return nil -} - // setupRootContainer creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. -// 'setMountNS' is called after namespace is created. It must set the mount NS -// to 'rootCtx'. +// The 'setMountNS' callback is called after the mount namespace is created and +// will get a reference on that namespace. The callback must ensure that the +// rootCtx has the provided mount namespace. func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error { for _, hint := range c.hints.mounts { log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) @@ -664,13 +573,6 @@ func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx c hint.root = inode } - // Create a tmpfs mount where we create and mount a root filesystem for - // each child container. - c.mounts = append(c.mounts, specs.Mount{ - Type: tmpfs, - Destination: ChildContainersDir, - }) - rootInode, err := c.createRootMount(rootCtx, conf) if err != nil { return fmt.Errorf("creating root mount: %v", err) |