diff options
Diffstat (limited to 'pkg/sentry/fs/mounts.go')
-rw-r--r-- | pkg/sentry/fs/mounts.go | 511 |
1 files changed, 511 insertions, 0 deletions
// NOTE: reconstructed from the added ("+") lines of the diff for
// pkg/sentry/fs/mounts.go (new file mode 100644, index 000000000..1e6b5b70e,
// hunk @@ -0,0 +1,511 @@).

// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fs

import (
	"fmt"
	"sync"
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/refs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

// DefaultTraversalLimit provides a sensible default traversal limit that may
// be passed as maxTraversals to FindInode and FindLink. You may want to
// provide other options in individual syscall implementations, but for
// internal functions this will be sane.
const DefaultTraversalLimit = 10

// MountNamespace defines a collection of mounts.
type MountNamespace struct {
	refs.AtomicRefCount

	// userns is the user namespace associated with this mount namespace.
	//
	// All privileged operations on this mount namespace must have
	// appropriate capabilities in this userns.
	//
	// userns is immutable.
	userns *auth.UserNamespace

	// root is the root directory.
	root *Dirent

	// mu protects mounts and mountID counter.
	mu sync.Mutex `state:"nosave"`

	// mounts is a map of the last mounted Dirent -> stack of old Dirents
	// that were mounted over, with the oldest mounted Dirent first and
	// more recent mounted Dirents at the end of the slice.
	//
	// A reference to all Dirents in mounts (keys and values) must be held
	// to ensure the Dirents are recoverable when unmounting.
	mounts map[*Dirent][]*Dirent

	// mountID is the next mount id to assign.
	mountID uint64
}

// NewMountNamespace returns a new MountNamespace with the provided node at the
// root. A root must always be provided.
//
// The root's MountSource is given a fresh root Dirent named "/" and mount id
// 1; ids handed out by later mounts therefore start at 2. The user namespace
// is taken from the credentials carried by ctx. The returned error is
// currently always nil.
func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error) {
	creds := auth.CredentialsFromContext(ctx)

	root.MountSource.mu.Lock()
	defer root.MountSource.mu.Unlock()

	// Set the root dirent and id on the root mount.
	d := NewDirent(root, "/")
	root.MountSource.root = d
	root.MountSource.id = 1

	return &MountNamespace{
		userns:  creds.UserNamespace,
		root:    d,
		mounts:  make(map[*Dirent][]*Dirent),
		mountID: 2,
	}, nil
}

// UserNamespace returns the user namespace associated with this mount
// namespace.
func (mns *MountNamespace) UserNamespace() *auth.UserNamespace {
	return mns.userns
}

// Root returns the MountNamespace's root Dirent and increments its reference
// count. The caller must call DecRef when finished.
func (mns *MountNamespace) Root() *Dirent {
	mns.root.IncRef()
	return mns.root
}

// FlushMountSourceRefs flushes extra references held by MountSources for all
// active mount points; see fs/mount.go:MountSource.FlushDirentRefs.
func (mns *MountNamespace) FlushMountSourceRefs() {
	mns.mu.Lock()
	defer mns.mu.Unlock()
	mns.flushMountSourceRefsLocked()
}

// flushMountSourceRefsLocked calls FlushDirentRefs on the MountSource of every
// Dirent recorded in mns.mounts (both the currently mounted Dirents and the
// Dirents they were mounted over) and on the root's MountSource.
//
// Precondition: mns.mu must be locked.
func (mns *MountNamespace) flushMountSourceRefsLocked() {
	// Flush mounts' MountSource references.
	for current, stack := range mns.mounts {
		current.Inode.MountSource.FlushDirentRefs()
		for _, prev := range stack {
			prev.Inode.MountSource.FlushDirentRefs()
		}
	}

	// Flush root's MountSource references.
	mns.root.Inode.MountSource.FlushDirentRefs()
}

// destroy drops root and mounts dirent references and closes any original nodes.
//
// After destroy is called, the MountNamespace may continue to be referenced (for
// example via /proc/mounts), but should free all resources and shouldn't have
// Find* methods called.
func (mns *MountNamespace) destroy() {
	mns.mu.Lock()
	defer mns.mu.Unlock()

	// Flush all mounts' MountSource references to Dirents. This allows for mount
	// points to be torn down since there should be no remaining references after
	// this and DecRef below.
	mns.flushMountSourceRefsLocked()

	// Teardown mounts.
	for current, mp := range mns.mounts {
		// Drop the mount reference on all mounted dirents.
		for _, d := range mp {
			d.DecRef()
		}
		current.DecRef()
	}
	// The map is dropped entirely; Mount must not be called after destroy.
	mns.mounts = nil

	// Drop reference on the root.
	mns.root.DecRef()

	// Wait for asynchronous work (queued by dropping Dirent references
	// above) to complete before destroying this MountNamespace.
	AsyncBarrier()
}

// DecRef implements RefCounter.DecRef with destructor mns.destroy.
func (mns *MountNamespace) DecRef() {
	mns.DecRefWithDestructor(mns.destroy)
}

// Freeze freezes the entire mount tree.
func (mns *MountNamespace) Freeze() {
	mns.mu.Lock()
	defer mns.mu.Unlock()

	// We only want to freeze Dirents with active references, not Dirents referenced
	// by a mount's MountSource.
	mns.flushMountSourceRefsLocked()

	// Freeze the entire shebang.
	mns.root.Freeze()
}

// withMountLocked prevents further walks to `node`, because `node` is about to
// be a mount point.
+func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error { + mns.mu.Lock() + defer mns.mu.Unlock() + + renameMu.Lock() + defer renameMu.Unlock() + + // Linux allows mounting over the root (?). It comes with a strange set + // of semantics. We'll just not do this for now. + if node.parent == nil { + return syserror.EBUSY + } + + // For both mount and unmount, we take this lock so we can swap out the + // appropriate child in parent.children. + // + // For unmount, this also ensures that if `node` is a mount point, the + // underlying mount's MountSource.direntRefs cannot increase by preventing + // walks to node. + node.parent.dirMu.Lock() + defer node.parent.dirMu.Unlock() + + node.parent.mu.Lock() + defer node.parent.mu.Unlock() + + // We need not take node.dirMu since we have parent.dirMu. + + // We need to take node.mu, so that we can check for deletion. + node.mu.Lock() + defer node.mu.Unlock() + + return fn() +} + +// Mount mounts a `inode` over the subtree at `node`. +func (mns *MountNamespace) Mount(ctx context.Context, node *Dirent, inode *Inode) error { + return mns.withMountLocked(node, func() error { + // replacement already has one reference taken; this is the mount + // reference. + replacement, err := node.mount(ctx, inode) + if err != nil { + return err + } + + // Set child/parent dirent relationship. + parentMountSource := node.Inode.MountSource + childMountSource := inode.MountSource + parentMountSource.mu.Lock() + defer parentMountSource.mu.Unlock() + childMountSource.mu.Lock() + defer childMountSource.mu.Unlock() + + parentMountSource.children[childMountSource] = struct{}{} + childMountSource.parent = parentMountSource + + // Set the mount's root dirent and id. + childMountSource.root = replacement + childMountSource.id = mns.mountID + mns.mountID++ + + // Drop node from its dirent cache. + node.dropExtendedReference() + + // If node is already a mount point, push node on the stack so it can + // be recovered on unmount. 
+ if stack, ok := mns.mounts[node]; ok { + mns.mounts[replacement] = append(stack, node) + delete(mns.mounts, node) + return nil + } + + // Was not already mounted, just add another mount point. + // Take a reference on node so it can be recovered on unmount. + node.IncRef() + mns.mounts[replacement] = []*Dirent{node} + return nil + }) +} + +// Unmount ensures no references to the MountSource remain and removes `node` from +// this subtree. The subtree formerly mounted in `node`'s place will be +// restored. node's MountSource will be destroyed as soon as the last reference to +// `node` is dropped, as no references to Dirents within will remain. +// +// If detachOnly is set, Unmount merely removes `node` from the subtree, but +// allows existing references to the MountSource remain. E.g. if an open file still +// refers to Dirents in MountSource, the Unmount will succeed anyway and MountSource will +// be destroyed at a later time when all references to Dirents within are +// dropped. +// +// The caller must hold a reference to node from walking to it. +func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly bool) error { + // This takes locks to prevent further walks to Dirents in this mount + // under the assumption that `node` is the root of the mount. + return mns.withMountLocked(node, func() error { + origs, ok := mns.mounts[node] + if !ok { + // node is not a mount point. + return syserror.EINVAL + } + + if len(origs) == 0 { + panic("cannot unmount initial dirent") + } + + if !detachOnly { + m := node.Inode.MountSource + + // Lock the parent MountSource first, if it exists. We are + // holding mns.Lock, so the parent can not change out + // from under us. + parent := m.Parent() + if parent != nil { + parent.mu.Lock() + defer parent.mu.Unlock() + } + + // Lock the mount that is being unmounted. + m.mu.Lock() + defer m.mu.Unlock() + + if m.parent != nil { + // Sanity check. 
+ if _, ok := m.parent.children[m]; !ok { + panic(fmt.Sprintf("mount %+v is not a child of parent %+v", m, m.parent)) + } + delete(m.parent.children, m) + m.parent = nil + } + + // Flush all references on the mounted node. + m.FlushDirentRefs() + + // At this point, exactly two references must be held + // to mount: one mount reference on node, and one due + // to walking to node. + // + // We must also be guaranteed that no more references + // can be taken on mount. This is why withMountLocked + // must be held at this point to prevent any walks to + // and from node. + if refs := m.DirentRefs(); refs < 2 { + panic(fmt.Sprintf("have %d refs on unmount, expect 2 or more", refs)) + } else if refs != 2 { + return syserror.EBUSY + } + } + + original := origs[len(origs)-1] + if err := node.unmount(ctx, original); err != nil { + return err + } + + switch { + case len(origs) > 1: + mns.mounts[original] = origs[:len(origs)-1] + case len(origs) == 1: + // Drop mount reference taken at the end of + // MountNamespace.Mount. + original.DecRef() + } + + delete(mns.mounts, node) + return nil + }) +} + +// FindLink returns an Dirent from a given node, which may be a symlink. +// +// The root argument is treated as the root directory, and FindLink will not +// return anything above that. The wd dirent provides the starting directory, +// and may be nil which indicates the root should be used. You must call DecRef +// on the resulting Dirent when you are no longer using the object. +// +// If wd is nil, then the root will be used as the working directory. If the +// path is absolute, this has no functional impact. +// +// Precondition: root must be non-nil. +// Precondition: the path must be non-empty. 
+func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) { + if root == nil { + panic("MountNamespace.FindInode: root must not be nil") + } + if len(path) == 0 { + panic("MountNamespace.FindInode: path is empty") + } + + // Split the path. + first, remainder := SplitFirst(path) + + // Where does this walk originate? + current := wd + if current == nil { + current = root + } + for first == "/" { + // Special case: it's possible that we have nothing to walk at + // all. This is necessary since we're resplitting the path. + if remainder == "" { + root.IncRef() + return root, nil + } + + // Start at the root and advance the path component so that the + // walk below can proceed. Note at this point, it handles the + // no-op walk case perfectly fine. + current = root + first, remainder = SplitFirst(remainder) + } + + current.IncRef() // Transferred during walk. + + for { + // Check that the file is a directory and that we have + // permissions to walk. + // + // Note that we elide this check for the root directory as an + // optimization; a non-executable root may still be walked. A + // non-directory root is hopeless. + if current != root { + if !IsDir(current.Inode.StableAttr) { + current.DecRef() // Drop reference from above. + return nil, syserror.ENOTDIR + } + if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil { + current.DecRef() // Drop reference from above. + return nil, err + } + } + + // Move to the next level. + next, err := current.Walk(ctx, root, first) + if err != nil { + // Allow failed walks to cache the dirent, because no + // children will acquire a reference at the end. + current.maybeExtendReference() + current.DecRef() + return nil, err + } + + // Drop old reference. + current.DecRef() + + if remainder != "" { + // Ensure it's resolved, unless it's the last level. + // + // See resolve for reference semantics; on err next + // will have one dropped. 
+ current, err = mns.resolve(ctx, root, next, maxTraversals) + if err != nil { + return nil, err + } + } else { + // Allow the file system to take an extra reference on the + // found child. This will hold a reference on the containing + // directory, so the whole tree will be implicitly cached. + next.maybeExtendReference() + return next, nil + } + + // Move to the next element. + first, remainder = SplitFirst(remainder) + } +} + +// FindInode is identical to FindLink except the return value is resolved. +// +//go:nosplit +func (mns *MountNamespace) FindInode(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) { + d, err := mns.FindLink(ctx, root, wd, path, maxTraversals) + if err != nil { + return nil, err + } + + // See resolve for reference semantics; on err d will have the + // reference dropped. + return mns.resolve(ctx, root, d, maxTraversals) +} + +// resolve resolves the given link. +// +// If successful, a reference is dropped on node and one is acquired on the +// caller's behalf for the returned dirent. +// +// If not successful, a reference is _also_ dropped on the node and an error +// returned. This is for convenience in using resolve directly as a return +// value. +func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, maxTraversals uint) (*Dirent, error) { + // Resolve the path. + target, err := node.Inode.Getlink(ctx) + + switch err { + case nil: + // Make sure we didn't exhaust the traversal budget. + if maxTraversals == 0 { + target.DecRef() + return nil, syscall.ELOOP + } + + node.DecRef() // Drop the original reference. + return target, nil + + case syscall.ENOLINK: + // Not a symlink. + return node, nil + + case ErrResolveViaReadlink: + defer node.DecRef() // See above. + + // First, check if we should traverse. + if maxTraversals == 0 { + return nil, syscall.ELOOP + } + + // Read the target path. 
+ targetPath, err := node.Inode.Readlink(ctx) + if err != nil { + return nil, err + } + + // Find the node; we resolve relative to the current symlink's parent. + d, err := mns.FindInode(ctx, root, node.parent, targetPath, maxTraversals-1) + if err != nil { + return nil, err + } + + return d, err + + default: + node.DecRef() // Drop for err; see above. + + // Propagate the error. + return nil, err + } +} + +// SyncAll calls Dirent.SyncAll on the root. +func (mns *MountNamespace) SyncAll(ctx context.Context) { + mns.mu.Lock() + defer mns.mu.Unlock() + mns.root.SyncAll(ctx) +} |