summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--pkg/sentry/control/proc.go17
-rw-r--r--pkg/sentry/fs/context.go24
-rw-r--r--pkg/sentry/fs/mounts.go11
-rw-r--r--pkg/sentry/kernel/kernel.go86
-rwxr-xr-xpkg/sentry/kernel/kernel_state_autogen.go2
-rwxr-xr-xpkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go2
-rwxr-xr-xpkg/sentry/platform/ring0/defs_impl.go2
-rwxr-xr-xpkg/sentry/time/seqatomic_parameters_unsafe.go2
-rw-r--r--runsc/boot/controller.go5
-rw-r--r--runsc/boot/fs.go55
-rw-r--r--runsc/boot/loader.go25
11 files changed, 81 insertions, 150 deletions
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 3f9772b87..c35faeb4c 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -56,15 +56,10 @@ type ExecArgs struct {
// MountNamespace is the mount namespace to execute the new process in.
// A reference on MountNamespace must be held for the lifetime of the
- // ExecArgs. If MountNamespace is nil, it will default to the kernel's
- // root MountNamespace.
+ // ExecArgs. If MountNamespace is nil, it will default to the init
+ // process's MountNamespace.
MountNamespace *fs.MountNamespace
- // Root defines the root directory for the new process. A reference on
- // Root must be held for the lifetime of the ExecArgs. If Root is nil,
- // it will default to the VFS root.
- Root *fs.Dirent
-
// WorkingDirectory defines the working directory for the new process.
WorkingDirectory string `json:"wd"`
@@ -155,7 +150,6 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
Envv: args.Envv,
WorkingDirectory: args.WorkingDirectory,
MountNamespace: args.MountNamespace,
- Root: args.Root,
Credentials: creds,
FDTable: fdTable,
Umask: 0022,
@@ -167,11 +161,6 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
ContainerID: args.ContainerID,
PIDNamespace: args.PIDNamespace,
}
- if initArgs.Root != nil {
- // initArgs must hold a reference on Root, which will be
- // donated to the new process in CreateProcess.
- initArgs.Root.IncRef()
- }
if initArgs.MountNamespace != nil {
// initArgs must hold a reference on MountNamespace, which will
// be donated to the new process in CreateProcess.
@@ -184,7 +173,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
paths := fs.GetPath(initArgs.Envv)
mns := initArgs.MountNamespace
if mns == nil {
- mns = proc.Kernel.RootMountNamespace()
+ mns = proc.Kernel.GlobalInit().Leader().MountNamespace()
}
f, err := mns.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
if err != nil {
diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go
index 51b4c7ee1..dd427de5d 100644
--- a/pkg/sentry/fs/context.go
+++ b/pkg/sentry/fs/context.go
@@ -112,3 +112,27 @@ func DirentCacheLimiterFromContext(ctx context.Context) *DirentCacheLimiter {
}
return nil
}
+
+type rootContext struct {
+ context.Context
+ root *Dirent
+}
+
+// WithRoot returns a copy of ctx with the given root.
+func WithRoot(ctx context.Context, root *Dirent) context.Context {
+ return &rootContext{
+ Context: ctx,
+ root: root,
+ }
+}
+
+// Value implements Context.Value.
+func (rc rootContext) Value(key interface{}) interface{} {
+ switch key {
+ case CtxRoot:
+ rc.root.IncRef()
+ return rc.root
+ default:
+ return rc.Context.Value(key)
+ }
+}
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index 728575864..9b713e785 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -219,6 +219,13 @@ func (mns *MountNamespace) flushMountSourceRefsLocked() {
}
}
+ if mns.root == nil {
+ // No root? This MountSource must have already been destroyed.
+ // This can happen when a Save is triggered while a process is
+ // exiting. There is nothing to flush.
+ return
+ }
+
// Flush root's MountSource references.
mns.root.Inode.MountSource.FlushDirentRefs()
}
@@ -249,6 +256,10 @@ func (mns *MountNamespace) destroy() {
// Drop reference on the root.
mns.root.DecRef()
+ // Ensure that root cannot be accessed via this MountNamespace any
+ // more.
+ mns.root = nil
+
// Wait for asynchronous work (queued by dropping Dirent references
// above) to complete before destroying this MountNamespace.
AsyncBarrier()
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 53c25e49e..56a329f83 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -112,11 +112,6 @@ type Kernel struct {
rootIPCNamespace *IPCNamespace
rootAbstractSocketNamespace *AbstractSocketNamespace
- // mounts holds the state of the virtual filesystem. mounts is initially
- // nil, and must be set by calling Kernel.SetRootMountNamespace before
- // Kernel.CreateProcess can succeed.
- mounts *fs.MountNamespace
-
// futexes is the "root" futex.Manager, from which all others are forked.
// This is necessary to ensure that shared futexes are coherent across all
// tasks, including those created by CreateProcess.
@@ -392,11 +387,7 @@ func (k *Kernel) SaveTo(w io.Writer) error {
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
func (k *Kernel) flushMountSourceRefs() error {
- // Flush all mount sources for currently mounted filesystems in the
- // root mount namespace.
- k.mounts.FlushMountSourceRefs()
-
- // Some tasks may have other mount namespaces; flush those as well.
+ // Flush all mount sources for currently mounted filesystems in each task.
flushed := make(map[*fs.MountNamespace]struct{})
k.tasks.mu.RLock()
k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) {
@@ -573,16 +564,6 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
return nil
}
-// Destroy releases resources owned by k.
-//
-// Preconditions: There must be no task goroutines running in k.
-func (k *Kernel) Destroy() {
- if k.mounts != nil {
- k.mounts.DecRef()
- k.mounts = nil
- }
-}
-
// UniqueID returns a unique identifier.
func (k *Kernel) UniqueID() uint64 {
id := atomic.AddUint64(&k.uniqueID, 1)
@@ -646,19 +627,12 @@ type CreateProcessArgs struct {
AbstractSocketNamespace *AbstractSocketNamespace
// MountNamespace optionally contains the mount namespace for this
- // process. If nil, the kernel's mount namespace is used.
+ // process. If nil, the init process's mount namespace is used.
//
// Anyone setting MountNamespace must donate a reference (i.e.
// increment it).
MountNamespace *fs.MountNamespace
- // Root optionally contains the dirent that serves as the root for the
- // process. If nil, the mount namespace's root is used as the process'
- // root.
- //
- // Anyone setting Root must donate a reference (i.e. increment it).
- Root *fs.Dirent
-
// ContainerID is the container that the process belongs to.
ContainerID string
}
@@ -696,16 +670,10 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
case auth.CtxCredentials:
return ctx.args.Credentials
case fs.CtxRoot:
- if ctx.args.Root != nil {
- // Take a reference on the root dirent that will be
- // given to the caller.
- ctx.args.Root.IncRef()
- return ctx.args.Root
- }
- if ctx.k.mounts != nil {
- // MountNamespace.Root() will take a reference on the
- // root dirent for us.
- return ctx.k.mounts.Root()
+ if ctx.args.MountNamespace != nil {
+ // MountNamespace.Root() will take a reference on the root
+ // dirent for us.
+ return ctx.args.MountNamespace.Root()
}
return nil
case fs.CtxDirentCacheLimiter:
@@ -749,30 +717,18 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
defer k.extMu.Unlock()
log.Infof("EXEC: %v", args.Argv)
- if k.mounts == nil {
- return nil, 0, fmt.Errorf("no kernel MountNamespace")
- }
-
// Grab the mount namespace.
mounts := args.MountNamespace
if mounts == nil {
- // If no MountNamespace was configured, then use the kernel's
- // root mount namespace, with an extra reference that will be
- // donated to the task.
- mounts = k.mounts
+ mounts = k.GlobalInit().Leader().MountNamespace()
mounts.IncRef()
}
tg := k.newThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
ctx := args.NewContext(k)
- // Grab the root directory.
- root := args.Root
- if root == nil {
- // If no Root was configured, then get it from the
- // MountNamespace.
- root = mounts.Root()
- }
+ // Get the root directory from the MountNamespace.
+ root := mounts.Root()
// The call to newFSContext below will take a reference on root, so we
// don't need to hold this one.
defer root.DecRef()
@@ -782,7 +738,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
wd := root // Default.
if args.WorkingDirectory != "" {
var err error
- wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
+ wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
if err != nil {
return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
}
@@ -811,8 +767,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
// Create a fresh task context.
remainingTraversals = uint(args.MaxSymlinkTraversals)
- tc, se := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet)
-
+ tc, se := k.LoadTaskImage(ctx, mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet)
if se != nil {
return nil, 0, errors.New(se.String())
}
@@ -1056,20 +1011,6 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
return k.rootAbstractSocketNamespace
}
-// RootMountNamespace returns the MountNamespace.
-func (k *Kernel) RootMountNamespace() *fs.MountNamespace {
- k.extMu.Lock()
- defer k.extMu.Unlock()
- return k.mounts
-}
-
-// SetRootMountNamespace sets the MountNamespace.
-func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace) {
- k.extMu.Lock()
- defer k.extMu.Unlock()
- k.mounts = mounts
-}
-
// NetworkStack returns the network stack. NetworkStack may return nil if no
// network stack is available.
func (k *Kernel) NetworkStack() inet.Stack {
@@ -1260,7 +1201,10 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
// The supervisor context is global root.
return auth.NewRootCredentials(ctx.k.rootUserNamespace)
case fs.CtxRoot:
- return ctx.k.mounts.Root()
+ if ctx.k.globalInit != nil {
+ return ctx.k.globalInit.mounts.Root()
+ }
+ return nil
case fs.CtxDirentCacheLimiter:
return ctx.k.DirentCacheLimiter
case ktime.CtxRealtimeClock:
diff --git a/pkg/sentry/kernel/kernel_state_autogen.go b/pkg/sentry/kernel/kernel_state_autogen.go
index b24036d33..bf909f2fc 100755
--- a/pkg/sentry/kernel/kernel_state_autogen.go
+++ b/pkg/sentry/kernel/kernel_state_autogen.go
@@ -131,7 +131,6 @@ func (x *Kernel) save(m state.Map) {
m.Save("rootUTSNamespace", &x.rootUTSNamespace)
m.Save("rootIPCNamespace", &x.rootIPCNamespace)
m.Save("rootAbstractSocketNamespace", &x.rootAbstractSocketNamespace)
- m.Save("mounts", &x.mounts)
m.Save("futexes", &x.futexes)
m.Save("globalInit", &x.globalInit)
m.Save("realtimeClock", &x.realtimeClock)
@@ -160,7 +159,6 @@ func (x *Kernel) load(m state.Map) {
m.Load("rootUTSNamespace", &x.rootUTSNamespace)
m.Load("rootIPCNamespace", &x.rootIPCNamespace)
m.Load("rootAbstractSocketNamespace", &x.rootAbstractSocketNamespace)
- m.Load("mounts", &x.mounts)
m.Load("futexes", &x.futexes)
m.Load("globalInit", &x.globalInit)
m.Load("realtimeClock", &x.realtimeClock)
diff --git a/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go
index c284a1b11..25ad17a4e 100755
--- a/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go
+++ b/pkg/sentry/kernel/seqatomic_taskgoroutineschedinfo_unsafe.go
@@ -1,11 +1,11 @@
package kernel
import (
+ "fmt"
"reflect"
"strings"
"unsafe"
- "fmt"
"gvisor.dev/gvisor/third_party/gvsync"
)
diff --git a/pkg/sentry/platform/ring0/defs_impl.go b/pkg/sentry/platform/ring0/defs_impl.go
index a30a9dd4a..a36a17e37 100755
--- a/pkg/sentry/platform/ring0/defs_impl.go
+++ b/pkg/sentry/platform/ring0/defs_impl.go
@@ -2,13 +2,13 @@ package ring0
import (
"gvisor.dev/gvisor/pkg/cpuid"
- "io"
"reflect"
"syscall"
"fmt"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "io"
)
var (
diff --git a/pkg/sentry/time/seqatomic_parameters_unsafe.go b/pkg/sentry/time/seqatomic_parameters_unsafe.go
index 1ec221edd..89792c56d 100755
--- a/pkg/sentry/time/seqatomic_parameters_unsafe.go
+++ b/pkg/sentry/time/seqatomic_parameters_unsafe.go
@@ -1,11 +1,11 @@
package time
import (
+ "fmt"
"reflect"
"strings"
"unsafe"
- "fmt"
"gvisor.dev/gvisor/third_party/gvsync"
)
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index d79aaff60..f1e4a9ba4 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -328,10 +328,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
return fmt.Errorf("at most two files may be passed to Restore")
}
- networkStack := cm.l.k.NetworkStack()
- // Destroy the old kernel and create a new kernel.
+ // Pause the kernel while we build a new one.
cm.l.k.Pause()
- cm.l.k.Destroy()
p, err := createPlatform(cm.l.conf, deviceFile)
if err != nil {
@@ -345,6 +343,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
return fmt.Errorf("creating memory file: %v", err)
}
k.SetMemoryFile(mf)
+ networkStack := cm.l.k.NetworkStack()
cm.l.k = k
// Set up the restore environment.
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 7e95e1f41..98ce40991 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -34,12 +34,10 @@ import (
_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
specs "github.com/opencontainers/runtime-spec/specs-go"
- "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -506,44 +504,16 @@ func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel
}
}
-// setupFS is used to set up the file system for containers and amend
-// the procArgs accordingly. This is the main entry point for this rest of
-// functions in this file. procArgs are passed by reference and the FDMap field
-// is modified. It dups stdioFDs.
-func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs, creds *auth.Credentials) error {
- // Use root user to configure mounts. The current user might not have
- // permission to do so.
- rootProcArgs := kernel.CreateProcessArgs{
- WorkingDirectory: "/",
- Credentials: auth.NewRootCredentials(creds.UserNamespace),
- Umask: 0022,
- MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
- PIDNamespace: procArgs.PIDNamespace,
- }
- rootCtx := rootProcArgs.NewContext(c.k)
-
- // If this is the root container, we also need to setup the root mount
- // namespace.
- rootMNS := c.k.RootMountNamespace()
- if rootMNS == nil {
- // Setup the root container.
- if err := c.setupRootContainer(ctx, rootCtx, conf, func(rootMNS *fs.MountNamespace) {
- // The callback to setupRootContainer inherits a
- // reference on the rootMNS, so we don't need to take
- // an additional reference here.
- procArgs.MountNamespace = rootMNS
- procArgs.Root = rootMNS.Root()
- c.k.SetRootMountNamespace(rootMNS)
- }); err != nil {
- return err
- }
- return c.checkDispenser()
- }
-
+// setupChildContainer is used to set up the file system for non-root containers
+// and amend the procArgs accordingly. This is the main entry point for this
+// rest of functions in this file. procArgs are passed by reference and the
+// FDMap field is modified. It dups stdioFDs.
+func (c *containerMounter) setupChildContainer(conf *Config, procArgs *kernel.CreateProcessArgs) error {
// Setup a child container.
log.Infof("Creating new process in child container.")
// Create a new root inode and mount namespace for the container.
+ rootCtx := c.k.SupervisorContext()
rootInode, err := c.createRootMount(rootCtx, conf)
if err != nil {
return fmt.Errorf("creating filesystem for container: %v", err)
@@ -552,14 +522,12 @@ func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *
if err != nil {
return fmt.Errorf("creating new mount namespace for container: %v", err)
}
-
- // Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
- // This will also donate a reference to procArgs, as required.
procArgs.MountNamespace = mns
- procArgs.Root = mns.Root()
+ root := mns.Root()
+ defer root.DecRef()
// Mount all submounts.
- if err := c.mountSubmounts(rootCtx, conf, mns, procArgs.Root); err != nil {
+ if err := c.mountSubmounts(rootCtx, conf, mns, root); err != nil {
return err
}
return c.checkDispenser()
@@ -599,7 +567,10 @@ func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx c
root := mns.Root()
defer root.DecRef()
- return c.mountSubmounts(rootCtx, conf, mns, root)
+ if err := c.mountSubmounts(rootCtx, conf, mns, root); err != nil {
+ return fmt.Errorf("mounting submounts: %v", err)
+ }
+ return c.checkDispenser()
}
// mountSharedMaster mounts the master of a volume that is shared among
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 50cac0433..77e1aa456 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -527,12 +527,15 @@ func (l *Loader) run() error {
// cid for root container can be empty. Only subcontainers need it to set
// the mount location.
mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k, l.mountHints)
- if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil {
+
+ // Setup the root container.
+ if err := mntr.setupRootContainer(ctx, ctx, l.conf, func(mns *fs.MountNamespace) {
+ l.rootProcArgs.MountNamespace = mns
+ }); err != nil {
return err
}
- rootCtx := l.rootProcArgs.NewContext(l.k)
- if err := setExecutablePath(rootCtx, &l.rootProcArgs); err != nil {
+ if err := setExecutablePath(ctx, &l.rootProcArgs); err != nil {
return err
}
@@ -546,7 +549,7 @@ func (l *Loader) run() error {
}
}
if !hasHomeEnvv {
- homeDir, err := getExecUserHome(rootCtx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
+ homeDir, err := getExecUserHome(ctx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
if err != nil {
return fmt.Errorf("error reading exec user: %v", err)
}
@@ -685,7 +688,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
}
mntr := newContainerMounter(spec, cid, goferFDs, l.k, l.mountHints)
- if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil {
+ if err := mntr.setupChildContainer(conf, &procArgs); err != nil {
return fmt.Errorf("configuring container FS: %v", err)
}
@@ -756,22 +759,14 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
return 0, fmt.Errorf("no such container: %q", args.ContainerID)
}
- // Get the container Root Dirent and MountNamespace from the Task.
+ // Get the container MountNamespace from the Task.
tg.Leader().WithMuLocked(func(t *kernel.Task) {
- // FSContext.RootDirectory() will take an extra ref for us.
- args.Root = t.FSContext().RootDirectory()
-
// task.MountNamespace() does not take a ref, so we must do so
// ourselves.
args.MountNamespace = t.MountNamespace()
args.MountNamespace.IncRef()
})
- defer func() {
- if args.Root != nil {
- args.Root.DecRef()
- }
- args.MountNamespace.DecRef()
- }()
+ defer args.MountNamespace.DecRef()
// Start the process.
proc := control.Proc{Kernel: l.k}