summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry
diff options
context:
space:
mode:
authorNicolas Lacasse <nlacasse@google.com>2019-06-19 09:20:10 -0700
committergVisor bot <gvisor-bot@google.com>2019-06-19 09:21:21 -0700
commitf7428af9c11cd47e6252a3fbf24db411e513c241 (patch)
tree0098df7cccea01185d1cc75e4a2f296d4160b415 /pkg/sentry
parent0d1dc50b70baf6b4a3752d5c761f608feea9f30e (diff)
Add MountNamespace to task.
This allows tasks to have distinct mount namespaces, instead of all sharing the kernel's root mount namespace. Currently, the only way for a task to get a different mount namespace than the kernel's root is by explicitly setting a different MountNamespace in CreateProcessArgs, and nothing does this (yet). In a follow-up CL, we will set CreateProcessArgs.MountNamespace when creating a new container inside runsc. Note that "MountNamespace" is a poor term for this thing. It's more like a distinct VFS tree. When we get around to adding real mount namespaces, this will need a better name. PiperOrigin-RevId: 254009310
Diffstat (limited to 'pkg/sentry')
-rw-r--r--pkg/sentry/fs/mounts.go11
-rw-r--r--pkg/sentry/kernel/kernel.go49
-rw-r--r--pkg/sentry/kernel/task.go4
-rw-r--r--pkg/sentry/kernel/task_clone.go3
-rw-r--r--pkg/sentry/kernel/thread_group.go13
5 files changed, 65 insertions, 15 deletions
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index a5d6f8b9a..281364dfc 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -124,7 +124,16 @@ func (m *Mount) IsUndo() bool {
return false
}
-// MountNamespace defines a collection of mounts.
+// MountNamespace defines a VFS root. It contains a collection of Mounts that are
+// mounted inside the Dirent tree rooted at the Root Dirent. It provides
+// methods for traversing the Dirent, and for mounting/unmounting in the tree.
+//
+// Note that this does not correspond to a "mount namespace" in Linux. It
+// is more like a unique VFS instance.
+//
+// It's possible for different processes to have different MountNamespaces. In
+// this case, the file systems exposed to the processes are completely
+// distinct.
//
// +stateify savable
type MountNamespace struct {
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 07ae592c4..9fe9eb914 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -381,9 +381,23 @@ func (k *Kernel) SaveTo(w io.Writer) error {
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
func (k *Kernel) flushMountSourceRefs() error {
- // Flush all mount sources for currently mounted filesystems.
+ // Flush all mount sources for currently mounted filesystems in the
+ // root mount namespace.
k.mounts.FlushMountSourceRefs()
+ // Some tasks may have other mount namespaces; flush those as well.
+ flushed := make(map[*fs.MountNamespace]struct{})
+ k.tasks.mu.RLock()
+ k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) {
+ if _, ok := flushed[tg.mounts]; ok {
+ // Already flushed.
+ return
+ }
+ tg.mounts.FlushMountSourceRefs()
+ flushed[tg.mounts] = struct{}{}
+ })
+ k.tasks.mu.RUnlock()
+
// There may be some open FDs whose filesystems have been unmounted. We
// must flush those as well.
return k.tasks.forEachFDPaused(func(desc descriptor) error {
@@ -611,12 +625,18 @@ type CreateProcessArgs struct {
// AbstractSocketNamespace is the initial Abstract Socket namespace.
AbstractSocketNamespace *AbstractSocketNamespace
+ // MountNamespace optionally contains the mount namespace for this
+ // process. If nil, the kernel's mount namespace is used.
+ //
+ // Anyone setting MountNamespace must donate a reference (i.e.
+ // increment it).
+ MountNamespace *fs.MountNamespace
+
// Root optionally contains the dirent that serves as the root for the
// process. If nil, the mount namespace's root is used as the process'
// root.
//
- // Anyone setting Root must donate a reference (i.e. increment it) to
- // keep it alive until it is decremented by CreateProcess.
+ // Anyone setting Root must donate a reference (i.e. increment it).
Root *fs.Dirent
// ContainerID is the container that the process belongs to.
@@ -715,20 +735,29 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
return nil, 0, fmt.Errorf("no kernel MountNamespace")
}
- tg := k.newThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
+ // Grab the mount namespace.
+ mounts := args.MountNamespace
+ if mounts == nil {
+ // If no MountNamespace was configured, then use the kernel's
+ // root mount namespace, with an extra reference that will be
+ // donated to the task.
+ mounts = k.mounts
+ mounts.IncRef()
+ }
+
+ tg := k.newThreadGroup(mounts, k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
ctx := args.NewContext(k)
// Grab the root directory.
root := args.Root
if root == nil {
- root = fs.RootFromContext(ctx)
- // Is the root STILL nil?
- if root == nil {
- return nil, 0, fmt.Errorf("CreateProcessArgs.Root was not provided, and failed to get root from context")
- }
+ // If no Root was configured, then get it from the
+ // MountNamespace.
+ root = mounts.Root()
}
+ // The call to newFSContext below will take a reference on root, so we
+ // don't need to hold this one.
defer root.DecRef()
- args.Root = nil
// Grab the working directory.
remainingTraversals := uint(args.MaxSymlinkTraversals)
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index e883696f9..7ed589a02 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -665,7 +665,7 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) IsChrooted() bool {
- realRoot := t.k.mounts.Root()
+ realRoot := t.tg.mounts.Root()
defer realRoot.DecRef()
root := t.fsc.RootDirectory()
if root != nil {
@@ -710,7 +710,7 @@ func (t *Task) WithMuLocked(f func(*Task)) {
// MountNamespace returns t's MountNamespace. MountNamespace does not take an
// additional reference on the returned MountNamespace.
func (t *Task) MountNamespace() *fs.MountNamespace {
- return t.k.mounts
+ return t.tg.mounts
}
// AbstractSockets returns t's AbstractSocketNamespace.
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index cafc9296f..0e621f0d1 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -238,11 +238,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
}
tg := t.tg
if opts.NewThreadGroup {
+ tg.mounts.IncRef()
sh := t.tg.signalHandlers
if opts.NewSignalHandlers {
sh = sh.Fork()
}
- tg = t.k.newThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock)
+ tg = t.k.newThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock)
}
cfg := &TaskConfig{
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 95346290d..3562ef179 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -19,6 +19,7 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usage"
@@ -236,13 +237,21 @@ type ThreadGroup struct {
// rscr is the thread group's RSEQ critical region.
rscr atomic.Value `state:".(*RSEQCriticalRegion)"`
+
+ // mounts is the thread group's mount namespace. This does not really
+ // correspond to a "mount namespace" in Linux, but is more like a
+ // complete VFS that need not be shared between processes. See the
+ // comment in mounts.go for more information.
+ //
+ // mounts is immutable.
+ mounts *fs.MountNamespace
}
// newThreadGroup returns a new, empty thread group in PID namespace ns. The
// thread group leader will send its parent terminationSignal when it exits.
// The new thread group isn't visible to the system until a task has been
// created inside of it by a successful call to TaskSet.NewTask.
-func (k *Kernel) newThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
+func (k *Kernel) newThreadGroup(mounts *fs.MountNamespace, ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
tg := &ThreadGroup{
threadGroupNode: threadGroupNode{
pidns: ns,
@@ -251,6 +260,7 @@ func (k *Kernel) newThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminatio
terminationSignal: terminationSignal,
ioUsage: &usage.IO{},
limits: limits,
+ mounts: mounts,
}
tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg})
tg.timers = make(map[linux.TimerID]*IntervalTimer)
@@ -298,6 +308,7 @@ func (tg *ThreadGroup) release() {
for _, it := range its {
it.DestroyTimer()
}
+ tg.mounts.DecRef()
}
// forEachChildThreadGroupLocked iterates over all child ThreadGroups.