summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fs/mounts.go
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/fs/mounts.go')
-rw-r--r--pkg/sentry/fs/mounts.go511
1 files changed, 511 insertions, 0 deletions
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
new file mode 100644
index 000000000..1e6b5b70e
--- /dev/null
+++ b/pkg/sentry/fs/mounts.go
@@ -0,0 +1,511 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fs
+
+import (
+ "fmt"
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/refs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// DefaultTraversalLimit provides a sensible default traversal limit that may
+// be passed to FindInode and FindLink. You may want to provide other options in
+// individual syscall implementations, but for internal functions this will be
+// sane.
+const DefaultTraversalLimit = 10
+
+// MountNamespace defines a collection of mounts.
+type MountNamespace struct {
+ refs.AtomicRefCount
+
+ // userns is the user namespace associated with this mount namespace.
+ //
+ // All privileged operations on this mount namespace must have
+ // appropriate capabilities in this userns.
+ //
+ // userns is immutable.
+ userns *auth.UserNamespace
+
+ // root is the root directory.
+ root *Dirent
+
+ // mu protects mounts and mountID counter.
+ mu sync.Mutex `state:"nosave"`
+
+ // mounts is a map of the last mounted Dirent -> stack of old Dirents
+ // that were mounted over, with the oldest mounted Dirent first and
+ // more recent mounted Dirents at the end of the slice.
+ //
+ // A reference to all Dirents in mounts (keys and values) must be held
+ // to ensure the Dirents are recoverable when unmounting.
+ mounts map[*Dirent][]*Dirent
+
+ // mountID is the next mount id to assign.
+ mountID uint64
+}
+
+// NewMountNamespace returns a new MountNamespace, with the provided node at the
+// root, and the given cache size. A root must always be provided.
+func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error) {
+ creds := auth.CredentialsFromContext(ctx)
+
+ root.MountSource.mu.Lock()
+ defer root.MountSource.mu.Unlock()
+
+ // Set the root dirent and id on the root mount.
+ d := NewDirent(root, "/")
+ root.MountSource.root = d
+ root.MountSource.id = 1
+
+ return &MountNamespace{
+ userns: creds.UserNamespace,
+ root: d,
+ mounts: make(map[*Dirent][]*Dirent),
+ mountID: 2,
+ }, nil
+}
+
+// UserNamespace returns the user namespace associated with this mount manager.
+func (mns *MountNamespace) UserNamespace() *auth.UserNamespace {
+ return mns.userns
+}
+
+// Root returns the MountNamespace's root Dirent and increments its reference
+// count. The caller must call DecRef when finished.
+func (mns *MountNamespace) Root() *Dirent {
+ mns.root.IncRef()
+ return mns.root
+}
+
+// FlushMountSourceRefs flushes extra references held by MountSources for all active mount points;
+// see fs/mount.go:MountSource.FlushDirentRefs.
+func (mns *MountNamespace) FlushMountSourceRefs() {
+ mns.mu.Lock()
+ defer mns.mu.Unlock()
+ mns.flushMountSourceRefsLocked()
+}
+
+func (mns *MountNamespace) flushMountSourceRefsLocked() {
+ // Flush mounts' MountSource references.
+ for current, stack := range mns.mounts {
+ current.Inode.MountSource.FlushDirentRefs()
+ for _, prev := range stack {
+ prev.Inode.MountSource.FlushDirentRefs()
+ }
+ }
+
+ // Flush root's MountSource references.
+ mns.root.Inode.MountSource.FlushDirentRefs()
+}
+
+// destroy drops root and mounts dirent references and closes any original nodes.
+//
+// After destroy is called, the MountNamespace may continue to be referenced (for
+// example via /proc/mounts), but should free all resources and shouldn't have
+// Find* methods called.
+func (mns *MountNamespace) destroy() {
+ mns.mu.Lock()
+ defer mns.mu.Unlock()
+
+ // Flush all mounts' MountSource references to Dirents. This allows for mount
+ // points to be torn down since there should be no remaining references after
+ // this and DecRef below.
+ mns.flushMountSourceRefsLocked()
+
+ // Teardown mounts.
+ for current, mp := range mns.mounts {
+ // Drop the mount reference on all mounted dirents.
+ for _, d := range mp {
+ d.DecRef()
+ }
+ current.DecRef()
+ }
+ mns.mounts = nil
+
+ // Drop reference on the root.
+ mns.root.DecRef()
+
+ // Wait for asynchronous work (queued by dropping Dirent references
+ // above) to complete before destroying this MountNamespace.
+ AsyncBarrier()
+}
+
+// DecRef implements RefCounter.DecRef with destructor mns.destroy.
+func (mns *MountNamespace) DecRef() {
+ mns.DecRefWithDestructor(mns.destroy)
+}
+
+// Freeze freezes the entire mount tree.
+func (mns *MountNamespace) Freeze() {
+ mns.mu.Lock()
+ defer mns.mu.Unlock()
+
+ // We only want to freeze Dirents with active references, not Dirents referenced
+ // by a mount's MountSource.
+ mns.flushMountSourceRefsLocked()
+
+ // Freeze the entire shebang.
+ mns.root.Freeze()
+}
+
+// withMountLocked prevents further walks to `node`, because `node` is about to
+// be a mount point.
+func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error {
+ mns.mu.Lock()
+ defer mns.mu.Unlock()
+
+ renameMu.Lock()
+ defer renameMu.Unlock()
+
+ // Linux allows mounting over the root (?). It comes with a strange set
+ // of semantics. We'll just not do this for now.
+ if node.parent == nil {
+ return syserror.EBUSY
+ }
+
+ // For both mount and unmount, we take this lock so we can swap out the
+ // appropriate child in parent.children.
+ //
+ // For unmount, this also ensures that if `node` is a mount point, the
+ // underlying mount's MountSource.direntRefs cannot increase by preventing
+ // walks to node.
+ node.parent.dirMu.Lock()
+ defer node.parent.dirMu.Unlock()
+
+ node.parent.mu.Lock()
+ defer node.parent.mu.Unlock()
+
+ // We need not take node.dirMu since we have parent.dirMu.
+
+ // We need to take node.mu, so that we can check for deletion.
+ node.mu.Lock()
+ defer node.mu.Unlock()
+
+ return fn()
+}
+
+// Mount mounts a `inode` over the subtree at `node`.
+func (mns *MountNamespace) Mount(ctx context.Context, node *Dirent, inode *Inode) error {
+ return mns.withMountLocked(node, func() error {
+ // replacement already has one reference taken; this is the mount
+ // reference.
+ replacement, err := node.mount(ctx, inode)
+ if err != nil {
+ return err
+ }
+
+ // Set child/parent dirent relationship.
+ parentMountSource := node.Inode.MountSource
+ childMountSource := inode.MountSource
+ parentMountSource.mu.Lock()
+ defer parentMountSource.mu.Unlock()
+ childMountSource.mu.Lock()
+ defer childMountSource.mu.Unlock()
+
+ parentMountSource.children[childMountSource] = struct{}{}
+ childMountSource.parent = parentMountSource
+
+ // Set the mount's root dirent and id.
+ childMountSource.root = replacement
+ childMountSource.id = mns.mountID
+ mns.mountID++
+
+ // Drop node from its dirent cache.
+ node.dropExtendedReference()
+
+ // If node is already a mount point, push node on the stack so it can
+ // be recovered on unmount.
+ if stack, ok := mns.mounts[node]; ok {
+ mns.mounts[replacement] = append(stack, node)
+ delete(mns.mounts, node)
+ return nil
+ }
+
+ // Was not already mounted, just add another mount point.
+ // Take a reference on node so it can be recovered on unmount.
+ node.IncRef()
+ mns.mounts[replacement] = []*Dirent{node}
+ return nil
+ })
+}
+
+// Unmount ensures no references to the MountSource remain and removes `node` from
+// this subtree. The subtree formerly mounted in `node`'s place will be
+// restored. node's MountSource will be destroyed as soon as the last reference to
+// `node` is dropped, as no references to Dirents within will remain.
+//
+// If detachOnly is set, Unmount merely removes `node` from the subtree, but
+// allows existing references to the MountSource remain. E.g. if an open file still
+// refers to Dirents in MountSource, the Unmount will succeed anyway and MountSource will
+// be destroyed at a later time when all references to Dirents within are
+// dropped.
+//
+// The caller must hold a reference to node from walking to it.
+func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly bool) error {
+ // This takes locks to prevent further walks to Dirents in this mount
+ // under the assumption that `node` is the root of the mount.
+ return mns.withMountLocked(node, func() error {
+ origs, ok := mns.mounts[node]
+ if !ok {
+ // node is not a mount point.
+ return syserror.EINVAL
+ }
+
+ if len(origs) == 0 {
+ panic("cannot unmount initial dirent")
+ }
+
+ if !detachOnly {
+ m := node.Inode.MountSource
+
+ // Lock the parent MountSource first, if it exists. We are
+ // holding mns.Lock, so the parent can not change out
+ // from under us.
+ parent := m.Parent()
+ if parent != nil {
+ parent.mu.Lock()
+ defer parent.mu.Unlock()
+ }
+
+ // Lock the mount that is being unmounted.
+ m.mu.Lock()
+ defer m.mu.Unlock()
+
+ if m.parent != nil {
+ // Sanity check.
+ if _, ok := m.parent.children[m]; !ok {
+ panic(fmt.Sprintf("mount %+v is not a child of parent %+v", m, m.parent))
+ }
+ delete(m.parent.children, m)
+ m.parent = nil
+ }
+
+ // Flush all references on the mounted node.
+ m.FlushDirentRefs()
+
+ // At this point, exactly two references must be held
+ // to mount: one mount reference on node, and one due
+ // to walking to node.
+ //
+ // We must also be guaranteed that no more references
+ // can be taken on mount. This is why withMountLocked
+ // must be held at this point to prevent any walks to
+ // and from node.
+ if refs := m.DirentRefs(); refs < 2 {
+ panic(fmt.Sprintf("have %d refs on unmount, expect 2 or more", refs))
+ } else if refs != 2 {
+ return syserror.EBUSY
+ }
+ }
+
+ original := origs[len(origs)-1]
+ if err := node.unmount(ctx, original); err != nil {
+ return err
+ }
+
+ switch {
+ case len(origs) > 1:
+ mns.mounts[original] = origs[:len(origs)-1]
+ case len(origs) == 1:
+ // Drop mount reference taken at the end of
+ // MountNamespace.Mount.
+ original.DecRef()
+ }
+
+ delete(mns.mounts, node)
+ return nil
+ })
+}
+
+// FindLink returns an Dirent from a given node, which may be a symlink.
+//
+// The root argument is treated as the root directory, and FindLink will not
+// return anything above that. The wd dirent provides the starting directory,
+// and may be nil which indicates the root should be used. You must call DecRef
+// on the resulting Dirent when you are no longer using the object.
+//
+// If wd is nil, then the root will be used as the working directory. If the
+// path is absolute, this has no functional impact.
+//
+// Precondition: root must be non-nil.
+// Precondition: the path must be non-empty.
+func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) {
+ if root == nil {
+ panic("MountNamespace.FindInode: root must not be nil")
+ }
+ if len(path) == 0 {
+ panic("MountNamespace.FindInode: path is empty")
+ }
+
+ // Split the path.
+ first, remainder := SplitFirst(path)
+
+ // Where does this walk originate?
+ current := wd
+ if current == nil {
+ current = root
+ }
+ for first == "/" {
+ // Special case: it's possible that we have nothing to walk at
+ // all. This is necessary since we're resplitting the path.
+ if remainder == "" {
+ root.IncRef()
+ return root, nil
+ }
+
+ // Start at the root and advance the path component so that the
+ // walk below can proceed. Note at this point, it handles the
+ // no-op walk case perfectly fine.
+ current = root
+ first, remainder = SplitFirst(remainder)
+ }
+
+ current.IncRef() // Transferred during walk.
+
+ for {
+ // Check that the file is a directory and that we have
+ // permissions to walk.
+ //
+ // Note that we elide this check for the root directory as an
+ // optimization; a non-executable root may still be walked. A
+ // non-directory root is hopeless.
+ if current != root {
+ if !IsDir(current.Inode.StableAttr) {
+ current.DecRef() // Drop reference from above.
+ return nil, syserror.ENOTDIR
+ }
+ if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil {
+ current.DecRef() // Drop reference from above.
+ return nil, err
+ }
+ }
+
+ // Move to the next level.
+ next, err := current.Walk(ctx, root, first)
+ if err != nil {
+ // Allow failed walks to cache the dirent, because no
+ // children will acquire a reference at the end.
+ current.maybeExtendReference()
+ current.DecRef()
+ return nil, err
+ }
+
+ // Drop old reference.
+ current.DecRef()
+
+ if remainder != "" {
+ // Ensure it's resolved, unless it's the last level.
+ //
+ // See resolve for reference semantics; on err next
+ // will have one dropped.
+ current, err = mns.resolve(ctx, root, next, maxTraversals)
+ if err != nil {
+ return nil, err
+ }
+ } else {
+ // Allow the file system to take an extra reference on the
+ // found child. This will hold a reference on the containing
+ // directory, so the whole tree will be implicitly cached.
+ next.maybeExtendReference()
+ return next, nil
+ }
+
+ // Move to the next element.
+ first, remainder = SplitFirst(remainder)
+ }
+}
+
+// FindInode is identical to FindLink except the return value is resolved.
+//
+//go:nosplit
+func (mns *MountNamespace) FindInode(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) {
+ d, err := mns.FindLink(ctx, root, wd, path, maxTraversals)
+ if err != nil {
+ return nil, err
+ }
+
+ // See resolve for reference semantics; on err d will have the
+ // reference dropped.
+ return mns.resolve(ctx, root, d, maxTraversals)
+}
+
+// resolve resolves the given link.
+//
+// If successful, a reference is dropped on node and one is acquired on the
+// caller's behalf for the returned dirent.
+//
+// If not successful, a reference is _also_ dropped on the node and an error
+// returned. This is for convenience in using resolve directly as a return
+// value.
+func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, maxTraversals uint) (*Dirent, error) {
+ // Resolve the path.
+ target, err := node.Inode.Getlink(ctx)
+
+ switch err {
+ case nil:
+ // Make sure we didn't exhaust the traversal budget.
+ if maxTraversals == 0 {
+ target.DecRef()
+ return nil, syscall.ELOOP
+ }
+
+ node.DecRef() // Drop the original reference.
+ return target, nil
+
+ case syscall.ENOLINK:
+ // Not a symlink.
+ return node, nil
+
+ case ErrResolveViaReadlink:
+ defer node.DecRef() // See above.
+
+ // First, check if we should traverse.
+ if maxTraversals == 0 {
+ return nil, syscall.ELOOP
+ }
+
+ // Read the target path.
+ targetPath, err := node.Inode.Readlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+
+ // Find the node; we resolve relative to the current symlink's parent.
+ d, err := mns.FindInode(ctx, root, node.parent, targetPath, maxTraversals-1)
+ if err != nil {
+ return nil, err
+ }
+
+ return d, err
+
+ default:
+ node.DecRef() // Drop for err; see above.
+
+ // Propagate the error.
+ return nil, err
+ }
+}
+
+// SyncAll calls Dirent.SyncAll on the root.
+func (mns *MountNamespace) SyncAll(ctx context.Context) {
+ mns.mu.Lock()
+ defer mns.mu.Unlock()
+ mns.root.SyncAll(ctx)
+}