From aaaefdf9cadf033fa281b612315c3227f5ab1c7a Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 2 Aug 2019 11:21:50 -0700 Subject: Remove kernel.mounts. We can get the mount namespace from the CreateProcessArgs in all cases where we need it. This also gets rid of kernel.Destroy method, since the only thing it was doing was DecRefing the mounts. Removing the need to call kernel.SetRootMountNamespace also allowed for some more simplifications in the container fs setup code. PiperOrigin-RevId: 261357060 --- pkg/sentry/control/proc.go | 17 ++------- pkg/sentry/fs/context.go | 24 +++++++++++++ pkg/sentry/fs/mounts.go | 11 ++++++ pkg/sentry/kernel/kernel.go | 86 ++++++++------------------------------------- runsc/boot/controller.go | 5 ++- runsc/boot/fs.go | 55 +++++++---------------------- runsc/boot/loader.go | 25 ++++++------- 7 files changed, 78 insertions(+), 145 deletions(-) diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 3f9772b87..c35faeb4c 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -56,15 +56,10 @@ type ExecArgs struct { // MountNamespace is the mount namespace to execute the new process in. // A reference on MountNamespace must be held for the lifetime of the - // ExecArgs. If MountNamespace is nil, it will default to the kernel's - // root MountNamespace. + // ExecArgs. If MountNamespace is nil, it will default to the init + // process's MountNamespace. MountNamespace *fs.MountNamespace - // Root defines the root directory for the new process. A reference on - // Root must be held for the lifetime of the ExecArgs. If Root is nil, - // it will default to the VFS root. - Root *fs.Dirent - // WorkingDirectory defines the working directory for the new process. WorkingDirectory string `json:"wd"` @@ -155,7 +150,6 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI Envv: args.Envv, WorkingDirectory: args.WorkingDirectory, MountNamespace: args.MountNamespace, - Root: args.Root, Credentials: creds, FDTable: fdTable, Umask: 0022, @@ -167,11 +161,6 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI ContainerID: args.ContainerID, PIDNamespace: args.PIDNamespace, } - if initArgs.Root != nil { - // initArgs must hold a reference on Root, which will be - // donated to the new process in CreateProcess. - initArgs.Root.IncRef() - } if initArgs.MountNamespace != nil { // initArgs must hold a reference on MountNamespace, which will // be donated to the new process in CreateProcess. @@ -184,7 +173,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI paths := fs.GetPath(initArgs.Envv) mns := initArgs.MountNamespace if mns == nil { - mns = proc.Kernel.RootMountNamespace() + mns = proc.Kernel.GlobalInit().Leader().MountNamespace() } f, err := mns.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) if err != nil { diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go index 51b4c7ee1..dd427de5d 100644 --- a/pkg/sentry/fs/context.go +++ b/pkg/sentry/fs/context.go @@ -112,3 +112,27 @@ func DirentCacheLimiterFromContext(ctx context.Context) *DirentCacheLimiter { } return nil } + +type rootContext struct { + context.Context + root *Dirent +} + +// WithRoot returns a copy of ctx with the given root. +func WithRoot(ctx context.Context, root *Dirent) context.Context { + return &rootContext{ + Context: ctx, + root: root, + } +} + +// Value implements Context.Value. +func (rc rootContext) Value(key interface{}) interface{} { + switch key { + case CtxRoot: + rc.root.IncRef() + return rc.root + default: + return rc.Context.Value(key) + } +} diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 728575864..9b713e785 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -219,6 +219,13 @@ func (mns *MountNamespace) flushMountSourceRefsLocked() { } } + if mns.root == nil { + // No root? This MountSource must have already been destroyed. + // This can happen when a Save is triggered while a process is + // exiting. There is nothing to flush. + return + } + // Flush root's MountSource references. mns.root.Inode.MountSource.FlushDirentRefs() } @@ -249,6 +256,10 @@ func (mns *MountNamespace) destroy() { // Drop reference on the root. mns.root.DecRef() + // Ensure that root cannot be accessed via this MountNamespace any + // more. + mns.root = nil + // Wait for asynchronous work (queued by dropping Dirent references // above) to complete before destroying this MountNamespace. AsyncBarrier() diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 53c25e49e..56a329f83 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -112,11 +112,6 @@ type Kernel struct { rootIPCNamespace *IPCNamespace rootAbstractSocketNamespace *AbstractSocketNamespace - // mounts holds the state of the virtual filesystem. mounts is initially - // nil, and must be set by calling Kernel.SetRootMountNamespace before - // Kernel.CreateProcess can succeed. - mounts *fs.MountNamespace - // futexes is the "root" futex.Manager, from which all others are forked. // This is necessary to ensure that shared futexes are coherent across all // tasks, including those created by CreateProcess. @@ -392,11 +387,7 @@ func (k *Kernel) SaveTo(w io.Writer) error { // flushMountSourceRefs flushes the MountSources for all mounted filesystems // and open FDs. func (k *Kernel) flushMountSourceRefs() error { - // Flush all mount sources for currently mounted filesystems in the - // root mount namespace. - k.mounts.FlushMountSourceRefs() - - // Some tasks may have other mount namespaces; flush those as well. + // Flush all mount sources for currently mounted filesystems in each task. flushed := make(map[*fs.MountNamespace]struct{}) k.tasks.mu.RLock() k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) { @@ -573,16 +564,6 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error { return nil } -// Destroy releases resources owned by k. -// -// Preconditions: There must be no task goroutines running in k. -func (k *Kernel) Destroy() { - if k.mounts != nil { - k.mounts.DecRef() - k.mounts = nil - } -} - // UniqueID returns a unique identifier. func (k *Kernel) UniqueID() uint64 { id := atomic.AddUint64(&k.uniqueID, 1) @@ -646,19 +627,12 @@ type CreateProcessArgs struct { AbstractSocketNamespace *AbstractSocketNamespace // MountNamespace optionally contains the mount namespace for this - // process. If nil, the kernel's mount namespace is used. + // process. If nil, the init process's mount namespace is used. // // Anyone setting MountNamespace must donate a reference (i.e. // increment it). MountNamespace *fs.MountNamespace - // Root optionally contains the dirent that serves as the root for the - // process. If nil, the mount namespace's root is used as the process' - // root. - // - // Anyone setting Root must donate a reference (i.e. increment it). - Root *fs.Dirent - // ContainerID is the container that the process belongs to. ContainerID string } @@ -696,16 +670,10 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { case auth.CtxCredentials: return ctx.args.Credentials case fs.CtxRoot: - if ctx.args.Root != nil { - // Take a reference on the root dirent that will be - // given to the caller. - ctx.args.Root.IncRef() - return ctx.args.Root - } - if ctx.k.mounts != nil { - // MountNamespace.Root() will take a reference on the - // root dirent for us. - return ctx.k.mounts.Root() + if ctx.args.MountNamespace != nil { + // MountNamespace.Root() will take a reference on the root + // dirent for us. + return ctx.args.MountNamespace.Root() } return nil case fs.CtxDirentCacheLimiter: @@ -749,30 +717,18 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, defer k.extMu.Unlock() log.Infof("EXEC: %v", args.Argv) - if k.mounts == nil { - return nil, 0, fmt.Errorf("no kernel MountNamespace") - } - // Grab the mount namespace. mounts := args.MountNamespace if mounts == nil { - // If no MountNamespace was configured, then use the kernel's - // root mount namespace, with an extra reference that will be - // donated to the task. - mounts = k.mounts + mounts = k.GlobalInit().Leader().MountNamespace() mounts.IncRef() } tg := k.newThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) ctx := args.NewContext(k) - // Grab the root directory. - root := args.Root - if root == nil { - // If no Root was configured, then get it from the - // MountNamespace. - root = mounts.Root() - } + // Get the root directory from the MountNamespace. + root := mounts.Root() // The call to newFSContext below will take a reference on root, so we // don't need to hold this one. defer root.DecRef() @@ -782,7 +738,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, wd := root // Default. if args.WorkingDirectory != "" { var err error - wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) + wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } @@ -811,8 +767,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, // Create a fresh task context. remainingTraversals = uint(args.MaxSymlinkTraversals) - tc, se := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet) - + tc, se := k.LoadTaskImage(ctx, mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet) if se != nil { return nil, 0, errors.New(se.String()) } @@ -1056,20 +1011,6 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace { return k.rootAbstractSocketNamespace } -// RootMountNamespace returns the MountNamespace. -func (k *Kernel) RootMountNamespace() *fs.MountNamespace { - k.extMu.Lock() - defer k.extMu.Unlock() - return k.mounts -} - -// SetRootMountNamespace sets the MountNamespace. -func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace) { - k.extMu.Lock() - defer k.extMu.Unlock() - k.mounts = mounts -} - // NetworkStack returns the network stack. NetworkStack may return nil if no // network stack is available. func (k *Kernel) NetworkStack() inet.Stack { @@ -1260,7 +1201,10 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { // The supervisor context is global root. return auth.NewRootCredentials(ctx.k.rootUserNamespace) case fs.CtxRoot: - return ctx.k.mounts.Root() + if ctx.k.globalInit != nil { + return ctx.k.globalInit.mounts.Root() + } + return nil case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case ktime.CtxRealtimeClock: diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index d79aaff60..f1e4a9ba4 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -328,10 +328,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return fmt.Errorf("at most two files may be passed to Restore") } - networkStack := cm.l.k.NetworkStack() - // Destroy the old kernel and create a new kernel. + // Pause the kernel while we build a new one. cm.l.k.Pause() - cm.l.k.Destroy() p, err := createPlatform(cm.l.conf, deviceFile) if err != nil { @@ -345,6 +343,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return fmt.Errorf("creating memory file: %v", err) } k.SetMemoryFile(mf) + networkStack := cm.l.k.NetworkStack() cm.l.k = k // Set up the restore environment. diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 7e95e1f41..98ce40991 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -34,12 +34,10 @@ import ( _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/runsc/specutils" ) @@ -506,44 +504,16 @@ func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel } } -// setupFS is used to set up the file system for containers and amend -// the procArgs accordingly. This is the main entry point for this rest of -// functions in this file. procArgs are passed by reference and the FDMap field -// is modified. It dups stdioFDs. -func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs, creds *auth.Credentials) error { - // Use root user to configure mounts. The current user might not have - // permission to do so. - rootProcArgs := kernel.CreateProcessArgs{ - WorkingDirectory: "/", - Credentials: auth.NewRootCredentials(creds.UserNamespace), - Umask: 0022, - MaxSymlinkTraversals: linux.MaxSymlinkTraversals, - PIDNamespace: procArgs.PIDNamespace, - } - rootCtx := rootProcArgs.NewContext(c.k) - - // If this is the root container, we also need to setup the root mount - // namespace. - rootMNS := c.k.RootMountNamespace() - if rootMNS == nil { - // Setup the root container. - if err := c.setupRootContainer(ctx, rootCtx, conf, func(rootMNS *fs.MountNamespace) { - // The callback to setupRootContainer inherits a - // reference on the rootMNS, so we don't need to take - // an additional reference here. - procArgs.MountNamespace = rootMNS - procArgs.Root = rootMNS.Root() - c.k.SetRootMountNamespace(rootMNS) - }); err != nil { - return err - } - return c.checkDispenser() - } - +// setupChildContainer is used to set up the file system for non-root containers +// and amend the procArgs accordingly. This is the main entry point for this +// rest of functions in this file. procArgs are passed by reference and the +// FDMap field is modified. It dups stdioFDs. +func (c *containerMounter) setupChildContainer(conf *Config, procArgs *kernel.CreateProcessArgs) error { // Setup a child container. log.Infof("Creating new process in child container.") // Create a new root inode and mount namespace for the container. + rootCtx := c.k.SupervisorContext() rootInode, err := c.createRootMount(rootCtx, conf) if err != nil { return fmt.Errorf("creating filesystem for container: %v", err) @@ -552,14 +522,12 @@ func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs * if err != nil { return fmt.Errorf("creating new mount namespace for container: %v", err) } - - // Set process root here, so 'rootCtx.Value(CtxRoot)' will return it. - // This will also donate a reference to procArgs, as required. procArgs.MountNamespace = mns - procArgs.Root = mns.Root() + root := mns.Root() + defer root.DecRef() // Mount all submounts. - if err := c.mountSubmounts(rootCtx, conf, mns, procArgs.Root); err != nil { + if err := c.mountSubmounts(rootCtx, conf, mns, root); err != nil { return err } return c.checkDispenser() @@ -599,7 +567,10 @@ func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx c root := mns.Root() defer root.DecRef() - return c.mountSubmounts(rootCtx, conf, mns, root) + if err := c.mountSubmounts(rootCtx, conf, mns, root); err != nil { + return fmt.Errorf("mounting submounts: %v", err) + } + return c.checkDispenser() } // mountSharedMaster mounts the master of a volume that is shared among diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 50cac0433..77e1aa456 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -527,12 +527,15 @@ func (l *Loader) run() error { // cid for root container can be empty. Only subcontainers need it to set // the mount location. mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k, l.mountHints) - if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil { + + // Setup the root container. + if err := mntr.setupRootContainer(ctx, ctx, l.conf, func(mns *fs.MountNamespace) { + l.rootProcArgs.MountNamespace = mns + }); err != nil { return err } - rootCtx := l.rootProcArgs.NewContext(l.k) - if err := setExecutablePath(rootCtx, &l.rootProcArgs); err != nil { + if err := setExecutablePath(ctx, &l.rootProcArgs); err != nil { return err } @@ -546,7 +549,7 @@ func (l *Loader) run() error { } } if !hasHomeEnvv { - homeDir, err := getExecUserHome(rootCtx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID)) + homeDir, err := getExecUserHome(ctx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID)) if err != nil { return fmt.Errorf("error reading exec user: %v", err) } @@ -685,7 +688,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file } mntr := newContainerMounter(spec, cid, goferFDs, l.k, l.mountHints) - if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil { + if err := mntr.setupChildContainer(conf, &procArgs); err != nil { return fmt.Errorf("configuring container FS: %v", err) } @@ -756,22 +759,14 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { return 0, fmt.Errorf("no such container: %q", args.ContainerID) } - // Get the container Root Dirent and MountNamespace from the Task. + // Get the container MountNamespace from the Task. tg.Leader().WithMuLocked(func(t *kernel.Task) { - // FSContext.RootDirectory() will take an extra ref for us. - args.Root = t.FSContext().RootDirectory() - // task.MountNamespace() does not take a ref, so we must do so // ourselves. args.MountNamespace = t.MountNamespace() args.MountNamespace.IncRef() }) - defer func() { - if args.Root != nil { - args.Root.DecRef() - } - args.MountNamespace.DecRef() - }() + defer args.MountNamespace.DecRef() // Start the process. proc := control.Proc{Kernel: l.k} -- cgit v1.2.3