diff options
Diffstat (limited to 'runsc')
-rw-r--r-- | runsc/boot/fs.go | 140 | ||||
-rw-r--r-- | runsc/boot/loader.go | 48 |
2 files changed, 52 insertions, 136 deletions
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index d3e3196fd..55bfc27ff 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -54,10 +54,6 @@ const ( // MountPrefix is the annotation prefix for mount hints. MountPrefix = "gvisor.dev/spec/mount" - // ChildContainersDir is the directory where child container root - // filesystems are mounted. - ChildContainersDir = "/__runsc_containers__" - // Filesystems that runsc supports. bind = "bind" devpts = "devpts" @@ -256,10 +252,10 @@ func subtargets(root string, mnts []specs.Mount) []string { // setExecutablePath sets the procArgs.Filename by searching the PATH for an // executable matching the procArgs.Argv[0]. -func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error { +func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error { paths := fs.GetPath(procArgs.Envv) exe := procArgs.Argv[0] - f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths) + f, err := procArgs.MountNamespace.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths) if err != nil { return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err) } @@ -514,11 +510,16 @@ func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs * // If this is the root container, we also need to setup the root mount // namespace. - mns := c.k.RootMountNamespace() - if mns == nil { + rootMNS := c.k.RootMountNamespace() + if rootMNS == nil { // Setup the root container. - if err := c.setupRootContainer(ctx, rootCtx, conf, func(mns *fs.MountNamespace) { - c.k.SetRootMountNamespace(mns) + if err := c.setupRootContainer(ctx, rootCtx, conf, func(rootMNS *fs.MountNamespace) { + // The callback to setupRootContainer inherits a + // reference on the rootMNS, so we don't need to take + // an additional reference here. + procArgs.MountNamespace = rootMNS + procArgs.Root = rootMNS.Root() + c.k.SetRootMountNamespace(rootMNS) }); err != nil { return err } @@ -527,54 +528,26 @@ func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs * // Setup a child container. log.Infof("Creating new process in child container.") - globalRoot := mns.Root() - defer globalRoot.DecRef() - - // Create mount point for the container's rootfs. - maxTraversals := uint(0) - contDir, err := mns.FindInode(ctx, globalRoot, nil, ChildContainersDir, &maxTraversals) - if err != nil { - return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err) - } - if err := contDir.CreateDirectory(ctx, globalRoot, c.cid, fs.FilePermsFromMode(0755)); err != nil { - return fmt.Errorf("create directory %q: %v", c.cid, err) - } - containerRoot, err := contDir.Walk(ctx, globalRoot, c.cid) - if err != nil { - return fmt.Errorf("walk to %q failed: %v", c.cid, err) - } - defer containerRoot.DecRef() - // Create the container's root filesystem mount. + // Create a new root inode and mount namespace for the container. rootInode, err := c.createRootMount(rootCtx, conf) if err != nil { return fmt.Errorf("creating filesystem for container: %v", err) } - - // Mount the container's root filesystem to the newly created mount point. - if err := mns.Mount(ctx, containerRoot, rootInode); err != nil { - return fmt.Errorf("mount container root: %v", err) - } - - // We have to re-walk to the dirent to find the mounted directory. The old - // dirent is invalid at this point. - containerRoot, err = contDir.Walk(ctx, globalRoot, c.cid) + mns, err := fs.NewMountNamespace(rootCtx, rootInode) if err != nil { - return fmt.Errorf("find container mount point %q: %v", c.cid, err) + return fmt.Errorf("creating new mount namespace for container: %v", err) } - cu := specutils.MakeCleanup(func() { containerRoot.DecRef() }) - defer cu.Clean() - - log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, c.cid)) // Set process root here, so 'rootCtx.Value(CtxRoot)' will return it. - procArgs.Root = containerRoot + // This will also donate a reference to procArgs, as required. + procArgs.MountNamespace = mns + procArgs.Root = mns.Root() // Mount all submounts. - if err := c.mountSubmounts(rootCtx, conf, mns, containerRoot); err != nil { + if err := c.mountSubmounts(rootCtx, conf, mns, procArgs.Root); err != nil { return err } - cu.Release() return c.checkDispenser() } @@ -585,75 +558,11 @@ func (c *containerMounter) checkDispenser() error { return nil } -// destroyContainerFS cleans up the filesystem by unmounting all mounts for the -// given container and deleting the container root directory. -func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error { - defer func() { - // Flushing dirent references triggers many async close - // operations. We must wait for those to complete before - // returning, otherwise the caller may kill the gofer before - // they complete, causing a cascade of failing RPCs. - // - // This must take place in the first deferred function, so that - // it runs after all the other deferred DecRef() calls in this - // function. - log.Infof("Waiting for async filesystem operations to complete") - fs.AsyncBarrier() - }() - - // First get a reference to the container root directory. - mns := k.RootMountNamespace() - mnsRoot := mns.Root() - defer mnsRoot.DecRef() - containerRoot := path.Join(ChildContainersDir, cid) - maxTraversals := uint(0) - containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, &maxTraversals) - if err == syserror.ENOENT { - // Container must have been destroyed already. That's fine. - return nil - } - if err != nil { - return fmt.Errorf("finding container root directory %q: %v", containerRoot, err) - } - defer containerRootDirent.DecRef() - - // Iterate through all submounts and unmount them. We unmount lazily by - // setting detach=true, so we can unmount in any order. - mnt := mns.FindMount(containerRootDirent) - for _, m := range mns.AllMountsUnder(mnt) { - root := m.Root() - defer root.DecRef() - - // Do a best-effort unmount by flushing the refs and unmount - // with "detach only = true". Unmount returns EINVAL when the mount point - // doesn't exist, i.e. it has already been unmounted. - log.Debugf("Unmounting container mount %q", root.BaseName()) - root.Inode.MountSource.FlushDirentRefs() - if err := mns.Unmount(ctx, root, true /* detach only */); err != nil && err != syserror.EINVAL { - return fmt.Errorf("unmounting container mount %q: %v", root.BaseName(), err) - } - } - - // Get a reference to the parent directory and remove the root - // container directory. - maxTraversals = 0 - containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, &maxTraversals) - if err != nil { - return fmt.Errorf("finding containers directory %q: %v", ChildContainersDir, err) - } - defer containersDirDirent.DecRef() - log.Debugf("Deleting container root %q", containerRoot) - if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil { - return fmt.Errorf("removing directory %q: %v", containerRoot, err) - } - - return nil -} - // setupRootContainer creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. -// 'setMountNS' is called after namespace is created. It must set the mount NS -// to 'rootCtx'. +// The 'setMountNS' callback is called after the mount namespace is created and +// will get a reference on that namespace. The callback must ensure that the +// rootCtx has the provided mount namespace. func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error { for _, hint := range c.hints.mounts { log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) @@ -664,13 +573,6 @@ func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx c hint.root = inode } - // Create a tmpfs mount where we create and mount a root filesystem for - // each child container. - c.mounts = append(c.mounts, specs.Mount{ - Type: tmpfs, - Destination: ChildContainersDir, - }) - rootInode, err := c.createRootMount(rootCtx, conf) if err != nil { return fmt.Errorf("creating root mount: %v", err) diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 8e8c6105b..a8adaf292 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -35,6 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -525,8 +526,7 @@ func (l *Loader) run() error { } rootCtx := l.rootProcArgs.NewContext(l.k) - rootMns := l.k.RootMountNamespace() - if err := setExecutablePath(rootCtx, rootMns, &l.rootProcArgs); err != nil { + if err := setExecutablePath(rootCtx, &l.rootProcArgs); err != nil { return err } @@ -540,7 +540,7 @@ func (l *Loader) run() error { } } if !hasHomeEnvv { - homeDir, err := getExecUserHome(rootCtx, rootMns, uint32(l.rootProcArgs.Credentials.RealKUID)) + homeDir, err := getExecUserHome(rootCtx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID)) if err != nil { return fmt.Errorf("error reading exec user: %v", err) } @@ -663,8 +663,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file return fmt.Errorf("configuring container FS: %v", err) } - mns := l.k.RootMountNamespace() - if err := setExecutablePath(ctx, mns, &procArgs); err != nil { + if err := setExecutablePath(ctx, &procArgs); err != nil { return fmt.Errorf("setting executable path for %+v: %v", procArgs, err) } @@ -689,8 +688,10 @@ func (l *Loader) destroyContainer(cid string) error { defer l.mu.Unlock() // Has the container started? - if _, _, err := l.threadGroupFromIDLocked(execID{cid: cid}); err == nil { - // If the container has started, kill and wait for all processes. + _, _, err := l.threadGroupFromIDLocked(execID{cid: cid}) + + // If the container has started, kill and wait for all processes. + if err == nil { if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { return fmt.Errorf("sending SIGKILL to all container processes: %v", err) } @@ -703,12 +704,17 @@ func (l *Loader) destroyContainer(cid string) error { } } - ctx := l.rootProcArgs.NewContext(l.k) - if err := destroyContainerFS(ctx, cid, l.k); err != nil { - return fmt.Errorf("destroying filesystem for container %q: %v", cid, err) - } + // At this point, all processes inside of the container have exited, + // releasing all references to the container's MountNamespace and + // causing all submounts and overlays to be unmounted. + // + // Since the container's MountNamespace has been released, + // MountNamespace.destroy() will have executed, but that function may + // trigger async close operations. We must wait for those to complete + // before returning, otherwise the caller may kill the gofer before + // they complete, causing a cascade of failing RPCs. + fs.AsyncBarrier() - // We made it! log.Debugf("Container destroyed %q", cid) return nil } @@ -724,14 +730,22 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { return 0, fmt.Errorf("no such container: %q", args.ContainerID) } - // Get the container Root Dirent from the Task, since we must run this - // process with the same Root. + // Get the container Root Dirent and MountNamespace from the Task. tg.Leader().WithMuLocked(func(t *kernel.Task) { + // FSContext.RootDirectory() will take an extra ref for us. args.Root = t.FSContext().RootDirectory() + + // task.MountNamespace() does not take a ref, so we must do so + // ourselves. + args.MountNamespace = t.MountNamespace() + args.MountNamespace.IncRef() }) - if args.Root != nil { - defer args.Root.DecRef() - } + defer func() { + if args.Root != nil { + args.Root.DecRef() + } + args.MountNamespace.DecRef() + }() // Start the process. proc := control.Proc{Kernel: l.k} |