diff options
Diffstat (limited to 'pkg/sentry/fs/mounts.go')
-rw-r--r-- | pkg/sentry/fs/mounts.go | 623 |
1 files changed, 623 insertions, 0 deletions
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go new file mode 100644 index 000000000..3f2bd0e87 --- /dev/null +++ b/pkg/sentry/fs/mounts.go @@ -0,0 +1,623 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "math" + "syscall" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// DefaultTraversalLimit provides a sensible default traversal limit that may +// be passed to FindInode and FindLink. You may want to provide other options in +// individual syscall implementations, but for internal functions this will be +// sane. +const DefaultTraversalLimit = 10 + +const invalidMountID = math.MaxUint64 + +// Mount represents a mount in the file system. It holds the root dirent for the +// mount. It also points back to the dirent or mount where it was mounted over, +// so that it can be restored when unmounted. The chained mount can be either: +// - Mount: when it's mounted on top of another mount point. +// - Dirent: when it's mounted on top of a dirent. In this case the mount is +// called an "undo" mount and only 'root' is set. All other fields are +// either invalid or nil. +// +// +stateify savable +type Mount struct { + // ID is a unique id for this mount. It may be invalidMountID if this is + // used to cache a dirent that was mounted over. + ID uint64 + + // ParentID is the parent's mount unique id. It may be invalidMountID if this + // is the root mount or if this is used to cache a dirent that was mounted + // over. + ParentID uint64 + + // root is the root Dirent of this mount. A reference on this Dirent must be + // held through the lifetime of the Mount which contains it. + root *Dirent + + // previous is the existing dirent or mount that this object was mounted over. + // It's nil for the root mount and for the last entry in the chain (always an + // "undo" mount). + previous *Mount +} + +// newMount creates a new mount, taking a reference on 'root'. Caller must +// release the reference when it's done with the mount. +func newMount(id, pid uint64, root *Dirent) *Mount { + root.IncRef() + return &Mount{ + ID: id, + ParentID: pid, + root: root, + } +} + +// newRootMount creates a new root mount (no parent), taking a reference on +// 'root'. Caller must release the reference when it's done with the mount. +func newRootMount(id uint64, root *Dirent) *Mount { + root.IncRef() + return &Mount{ + ID: id, + ParentID: invalidMountID, + root: root, + } +} + +// newUndoMount creates a new undo mount, taking a reference on 'd'. Caller must +// release the reference when it's done with the mount. +func newUndoMount(d *Dirent) *Mount { + d.IncRef() + return &Mount{ + ID: invalidMountID, + ParentID: invalidMountID, + root: d, + } +} + +// Root returns the root dirent of this mount. +// +// This may return nil if the mount has already been free. Callers must handle this +// case appropriately. If non-nil, callers must call DecRef on the returned *Dirent. +func (m *Mount) Root() *Dirent { + if !m.root.TryIncRef() { + return nil + } + return m.root +} + +// IsRoot returns true if the mount has no parent. +func (m *Mount) IsRoot() bool { + return !m.IsUndo() && m.ParentID == invalidMountID +} + +// IsUndo returns true if 'm' is an undo mount that should be used to restore +// the original dirent during unmount only and it's not a valid mount. +func (m *Mount) IsUndo() bool { + if m.ID == invalidMountID { + if m.ParentID != invalidMountID { + panic(fmt.Sprintf("Undo mount with valid parentID: %+v", m)) + } + return true + } + return false +} + +// MountNamespace defines a VFS root. It contains collection of Mounts that are +// mounted inside the Dirent tree rooted at the Root Dirent. It provides +// methods for traversing the Dirent, and for mounting/unmounting in the tree. +// +// Note that this does not correspond to a "mount namespace" in the Linux. It +// is more like a unique VFS instance. +// +// It's possible for different processes to have different MountNamespaces. In +// this case, the file systems exposed to the processes are completely +// distinct. +// +// +stateify savable +type MountNamespace struct { + refs.AtomicRefCount + + // userns is the user namespace associated with this mount namespace. + // + // All privileged operations on this mount namespace must have + // appropriate capabilities in this userns. + // + // userns is immutable. + userns *auth.UserNamespace + + // root is the root directory. + root *Dirent + + // mu protects mounts and mountID counter. + mu sync.Mutex `state:"nosave"` + + // mounts is a map of mounted Dirent -> Mount object. There are three + // possible cases: + // - Dirent is mounted over a mount point: the stored Mount object will be + // the Mount for that mount point. + // - Dirent is mounted over a regular (non-mount point) Dirent: the stored + // Mount object will be an "undo" mount containing the mounted-over + // Dirent. + // - Dirent is the root mount: the stored Mount object will be a root mount + // containing the Dirent itself. + mounts map[*Dirent]*Mount + + // mountID is the next mount id to assign. + mountID uint64 +} + +// NewMountNamespace returns a new MountNamespace, with the provided node at the +// root, and the given cache size. A root must always be provided. +func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error) { + // Set the root dirent and id on the root mount. The reference returned from + // NewDirent will be donated to the MountNamespace constructed below. + d := NewDirent(ctx, root, "/") + + mnts := map[*Dirent]*Mount{ + d: newRootMount(1, d), + } + + creds := auth.CredentialsFromContext(ctx) + mns := MountNamespace{ + userns: creds.UserNamespace, + root: d, + mounts: mnts, + mountID: 2, + } + mns.EnableLeakCheck("fs.MountNamespace") + return &mns, nil +} + +// UserNamespace returns the user namespace associated with this mount manager. +func (mns *MountNamespace) UserNamespace() *auth.UserNamespace { + return mns.userns +} + +// Root returns the MountNamespace's root Dirent and increments its reference +// count. The caller must call DecRef when finished. +func (mns *MountNamespace) Root() *Dirent { + mns.root.IncRef() + return mns.root +} + +// FlushMountSourceRefs flushes extra references held by MountSources for all active mount points; +// see fs/mount.go:MountSource.FlushDirentRefs. +func (mns *MountNamespace) FlushMountSourceRefs() { + mns.mu.Lock() + defer mns.mu.Unlock() + mns.flushMountSourceRefsLocked() +} + +func (mns *MountNamespace) flushMountSourceRefsLocked() { + // Flush mounts' MountSource references. + for _, mp := range mns.mounts { + for ; mp != nil; mp = mp.previous { + mp.root.Inode.MountSource.FlushDirentRefs() + } + } + + if mns.root == nil { + // No root? This MountSource must have already been destroyed. + // This can happen when a Save is triggered while a process is + // exiting. There is nothing to flush. + return + } + + // Flush root's MountSource references. + mns.root.Inode.MountSource.FlushDirentRefs() +} + +// destroy drops root and mounts dirent references and closes any original nodes. +// +// After destroy is called, the MountNamespace may continue to be referenced (for +// example via /proc/mounts), but should free all resources and shouldn't have +// Find* methods called. +func (mns *MountNamespace) destroy() { + mns.mu.Lock() + defer mns.mu.Unlock() + + // Flush all mounts' MountSource references to Dirents. This allows for mount + // points to be torn down since there should be no remaining references after + // this and DecRef below. + mns.flushMountSourceRefsLocked() + + // Teardown mounts. + for _, mp := range mns.mounts { + // Drop the mount reference on all mounted dirents. + for ; mp != nil; mp = mp.previous { + mp.root.DecRef() + } + } + mns.mounts = nil + + // Drop reference on the root. + mns.root.DecRef() + + // Ensure that root cannot be accessed via this MountNamespace any + // more. + mns.root = nil + + // Wait for asynchronous work (queued by dropping Dirent references + // above) to complete before destroying this MountNamespace. + AsyncBarrier() +} + +// DecRef implements RefCounter.DecRef with destructor mns.destroy. +func (mns *MountNamespace) DecRef() { + mns.DecRefWithDestructor(mns.destroy) +} + +// withMountLocked prevents further walks to `node`, because `node` is about to +// be a mount point. +func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error { + mns.mu.Lock() + defer mns.mu.Unlock() + + renameMu.Lock() + defer renameMu.Unlock() + + // Linux allows mounting over the root (?). It comes with a strange set + // of semantics. We'll just not do this for now. + if node.parent == nil { + return syserror.EBUSY + } + + // For both mount and unmount, we take this lock so we can swap out the + // appropriate child in parent.children. + // + // For unmount, this also ensures that if `node` is a mount point, the + // underlying mount's MountSource.direntRefs cannot increase by preventing + // walks to node. + node.parent.dirMu.Lock() + defer node.parent.dirMu.Unlock() + + node.parent.mu.Lock() + defer node.parent.mu.Unlock() + + // We need not take node.dirMu since we have parent.dirMu. + + // We need to take node.mu, so that we can check for deletion. + node.mu.Lock() + defer node.mu.Unlock() + + return fn() +} + +// Mount mounts a `inode` over the subtree at `node`. +func (mns *MountNamespace) Mount(ctx context.Context, mountPoint *Dirent, inode *Inode) error { + return mns.withMountLocked(mountPoint, func() error { + replacement, err := mountPoint.mount(ctx, inode) + if err != nil { + return err + } + defer replacement.DecRef() + + // Set the mount's root dirent and id. + parentMnt := mns.findMountLocked(mountPoint) + childMnt := newMount(mns.mountID, parentMnt.ID, replacement) + mns.mountID++ + + // Drop mountPoint from its dirent cache. + mountPoint.dropExtendedReference() + + // If mountPoint is already a mount, push mountPoint on the stack so it can + // be recovered on unmount. + if prev := mns.mounts[mountPoint]; prev != nil { + childMnt.previous = prev + mns.mounts[replacement] = childMnt + delete(mns.mounts, mountPoint) + return nil + } + + // Was not already mounted, just add another mount point. + childMnt.previous = newUndoMount(mountPoint) + mns.mounts[replacement] = childMnt + return nil + }) +} + +// Unmount ensures no references to the MountSource remain and removes `node` from +// this subtree. The subtree formerly mounted in `node`'s place will be +// restored. node's MountSource will be destroyed as soon as the last reference to +// `node` is dropped, as no references to Dirents within will remain. +// +// If detachOnly is set, Unmount merely removes `node` from the subtree, but +// allows existing references to the MountSource remain. E.g. if an open file still +// refers to Dirents in MountSource, the Unmount will succeed anyway and MountSource will +// be destroyed at a later time when all references to Dirents within are +// dropped. +// +// The caller must hold a reference to node from walking to it. +func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly bool) error { + // This takes locks to prevent further walks to Dirents in this mount + // under the assumption that `node` is the root of the mount. + return mns.withMountLocked(node, func() error { + orig, ok := mns.mounts[node] + if !ok { + // node is not a mount point. + return syserror.EINVAL + } + + if orig.previous == nil { + panic("cannot unmount initial dirent") + } + + m := node.Inode.MountSource + if !detachOnly { + // Flush all references on the mounted node. + m.FlushDirentRefs() + + // At this point, exactly two references must be held + // to mount: one mount reference on node, and one due + // to walking to node. + // + // We must also be guaranteed that no more references + // can be taken on mount. This is why withMountLocked + // must be held at this point to prevent any walks to + // and from node. + if refs := m.DirentRefs(); refs < 2 { + panic(fmt.Sprintf("have %d refs on unmount, expect 2 or more", refs)) + } else if refs != 2 { + return syserror.EBUSY + } + } + + prev := orig.previous + if err := node.unmount(ctx, prev.root); err != nil { + return err + } + + if prev.previous == nil { + if !prev.IsUndo() { + panic(fmt.Sprintf("Last mount in the chain must be a undo mount: %+v", prev)) + } + // Drop mount reference taken at the end of MountNamespace.Mount. + prev.root.DecRef() + } else { + mns.mounts[prev.root] = prev + } + delete(mns.mounts, node) + + return nil + }) +} + +// FindMount returns the mount that 'd' belongs to. It walks the dirent back +// until a mount is found. It may return nil if no mount was found. +func (mns *MountNamespace) FindMount(d *Dirent) *Mount { + mns.mu.Lock() + defer mns.mu.Unlock() + renameMu.Lock() + defer renameMu.Unlock() + + return mns.findMountLocked(d) +} + +func (mns *MountNamespace) findMountLocked(d *Dirent) *Mount { + for { + if mnt := mns.mounts[d]; mnt != nil { + return mnt + } + if d.parent == nil { + return nil + } + d = d.parent + } +} + +// AllMountsUnder returns a slice of all mounts under the parent, including +// itself. +func (mns *MountNamespace) AllMountsUnder(parent *Mount) []*Mount { + mns.mu.Lock() + defer mns.mu.Unlock() + + var rv []*Mount + for _, mp := range mns.mounts { + if !mp.IsUndo() && mp.root.descendantOf(parent.root) { + rv = append(rv, mp) + } + } + return rv +} + +// FindLink returns an Dirent from a given node, which may be a symlink. +// +// The root argument is treated as the root directory, and FindLink will not +// return anything above that. The wd dirent provides the starting directory, +// and may be nil which indicates the root should be used. You must call DecRef +// on the resulting Dirent when you are no longer using the object. +// +// If wd is nil, then the root will be used as the working directory. If the +// path is absolute, this has no functional impact. +// +// Precondition: root must be non-nil. +// Precondition: the path must be non-empty. +func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path string, remainingTraversals *uint) (*Dirent, error) { + if root == nil { + panic("MountNamespace.FindLink: root must not be nil") + } + if len(path) == 0 { + panic("MountNamespace.FindLink: path is empty") + } + + // Split the path. + first, remainder := SplitFirst(path) + + // Where does this walk originate? + current := wd + if current == nil { + current = root + } + for first == "/" { + // Special case: it's possible that we have nothing to walk at + // all. This is necessary since we're resplitting the path. + if remainder == "" { + root.IncRef() + return root, nil + } + + // Start at the root and advance the path component so that the + // walk below can proceed. Note at this point, it handles the + // no-op walk case perfectly fine. + current = root + first, remainder = SplitFirst(remainder) + } + + current.IncRef() // Transferred during walk. + + for { + // Check that the file is a directory and that we have + // permissions to walk. + // + // Note that we elide this check for the root directory as an + // optimization; a non-executable root may still be walked. A + // non-directory root is hopeless. + if current != root { + if !IsDir(current.Inode.StableAttr) { + current.DecRef() // Drop reference from above. + return nil, syserror.ENOTDIR + } + if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil { + current.DecRef() // Drop reference from above. + return nil, err + } + } + + // Move to the next level. + next, err := current.Walk(ctx, root, first) + if err != nil { + // Allow failed walks to cache the dirent, because no + // children will acquire a reference at the end. + current.maybeExtendReference() + current.DecRef() + return nil, err + } + + // Drop old reference. + current.DecRef() + + if remainder != "" { + // Ensure it's resolved, unless it's the last level. + // + // See resolve for reference semantics; on err next + // will have one dropped. + current, err = mns.resolve(ctx, root, next, remainingTraversals) + if err != nil { + return nil, err + } + } else { + // Allow the file system to take an extra reference on the + // found child. This will hold a reference on the containing + // directory, so the whole tree will be implicitly cached. + next.maybeExtendReference() + return next, nil + } + + // Move to the next element. + first, remainder = SplitFirst(remainder) + } +} + +// FindInode is identical to FindLink except the return value is resolved. +// +//go:nosplit +func (mns *MountNamespace) FindInode(ctx context.Context, root, wd *Dirent, path string, remainingTraversals *uint) (*Dirent, error) { + d, err := mns.FindLink(ctx, root, wd, path, remainingTraversals) + if err != nil { + return nil, err + } + + // See resolve for reference semantics; on err d will have the + // reference dropped. + return mns.resolve(ctx, root, d, remainingTraversals) +} + +// resolve resolves the given link. +// +// If successful, a reference is dropped on node and one is acquired on the +// caller's behalf for the returned dirent. +// +// If not successful, a reference is _also_ dropped on the node and an error +// returned. This is for convenience in using resolve directly as a return +// value. +func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, remainingTraversals *uint) (*Dirent, error) { + // Resolve the path. + target, err := node.Inode.Getlink(ctx) + + switch err { + case nil: + // Make sure we didn't exhaust the traversal budget. + if *remainingTraversals == 0 { + target.DecRef() + return nil, syscall.ELOOP + } + + node.DecRef() // Drop the original reference. + return target, nil + + case syscall.ENOLINK: + // Not a symlink. + return node, nil + + case ErrResolveViaReadlink: + defer node.DecRef() // See above. + + // First, check if we should traverse. + if *remainingTraversals == 0 { + return nil, syscall.ELOOP + } + + // Read the target path. + targetPath, err := node.Inode.Readlink(ctx) + if err != nil { + return nil, err + } + + // Find the node; we resolve relative to the current symlink's parent. + renameMu.RLock() + parent := node.parent + renameMu.RUnlock() + *remainingTraversals-- + d, err := mns.FindInode(ctx, root, parent, targetPath, remainingTraversals) + if err != nil { + return nil, err + } + + return d, err + + default: + node.DecRef() // Drop for err; see above. + + // Propagate the error. + return nil, err + } +} + +// SyncAll calls Dirent.SyncAll on the root. +func (mns *MountNamespace) SyncAll(ctx context.Context) { + mns.mu.Lock() + defer mns.mu.Unlock() + mns.root.SyncAll(ctx) +} |