diff options
Diffstat (limited to 'pkg/sentry/vfs/mount.go')
-rw-r--r-- | pkg/sentry/vfs/mount.go | 411 |
1 files changed, 411 insertions, 0 deletions
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go new file mode 100644 index 000000000..11702f720 --- /dev/null +++ b/pkg/sentry/vfs/mount.go @@ -0,0 +1,411 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "math" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem +// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem +// (Mount.fs), which applies to path resolution in the context of a particular +// Mount (Mount.key.parent). +// +// Mounts are reference-counted. Unless otherwise specified, all Mount methods +// require that a reference is held. +// +// Mount and Filesystem are distinct types because it's possible for a single +// Filesystem to be mounted at multiple locations and/or in multiple mount +// namespaces. +// +// Mount is analogous to Linux's struct mount. (gVisor does not distinguish +// between struct mount and struct vfsmount.) +type Mount struct { + // The lower 63 bits of refs are a reference count. The MSB of refs is set + // if the Mount has been eagerly unmounted, as by umount(2) without the + // MNT_DETACH flag. refs is accessed using atomic memory operations. + refs int64 + + // The lower 63 bits of writers is the number of calls to + // Mount.CheckBeginWrite() that have not yet been paired with a call to + // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. + // writers is accessed using atomic memory operations. + writers int64 + + // key is protected by VirtualFilesystem.mountMu and + // VirtualFilesystem.mounts.seq, and may be nil. References are held on + // key.parent and key.point if they are not nil. + // + // Invariant: key.parent != nil iff key.point != nil. key.point belongs to + // key.parent.fs. + key mountKey + + // fs, root, and ns are immutable. References are held on fs and root (but + // not ns). + // + // Invariant: root belongs to fs. + fs *Filesystem + root *Dentry + ns *MountNamespace +} + +// A MountNamespace is a collection of Mounts. +// +// MountNamespaces are reference-counted. Unless otherwise specified, all +// MountNamespace methods require that a reference is held. +// +// MountNamespace is analogous to Linux's struct mnt_namespace. +type MountNamespace struct { + refs int64 // accessed using atomic memory operations + + // root is the MountNamespace's root mount. root is immutable. + root *Mount + + // mountpoints contains all Dentries which are mount points in this + // namespace. mountpoints is protected by VirtualFilesystem.mountMu. + // + // mountpoints is used to determine if a Dentry can be moved or removed + // (which requires that the Dentry is not a mount point in the calling + // namespace). + // + // mountpoints is maintained even if there are no references held on the + // MountNamespace; this is required to ensure that + // VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate + // correctly on unreferenced MountNamespaces. + mountpoints map[*Dentry]struct{} +} + +// NewMountNamespace returns a new mount namespace with a root filesystem +// configured by the given arguments. A reference is taken on the returned +// MountNamespace. +func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *NewFilesystemOptions) (*MountNamespace, error) { + fsType := vfs.getFilesystemType(fsTypeName) + if fsType == nil { + return nil, syserror.ENODEV + } + fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts) + if err != nil { + return nil, err + } + mntns := &MountNamespace{ + refs: 1, + mountpoints: make(map[*Dentry]struct{}), + } + mntns.root = &Mount{ + fs: fs, + root: root, + ns: mntns, + refs: 1, + } + return mntns, nil +} + +// NewMount creates and mounts a new Filesystem. +func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *NewFilesystemOptions) error { + fsType := vfs.getFilesystemType(fsTypeName) + if fsType == nil { + return syserror.ENODEV + } + fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts) + if err != nil { + return err + } + // We can't hold vfs.mountMu while calling FilesystemImpl methods due to + // lock ordering. + vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) + if err != nil { + root.decRef(fs) + fs.decRef() + return err + } + vfs.mountMu.Lock() + for { + if vd.dentry.IsDisowned() { + vfs.mountMu.Unlock() + vd.DecRef() + root.decRef(fs) + fs.decRef() + return syserror.ENOENT + } + // vd might have been mounted over between vfs.GetDentryAt() and + // vfs.mountMu.Lock(). + if !vd.dentry.isMounted() { + break + } + nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry) + if nextmnt == nil { + break + } + nextmnt.incRef() + nextmnt.root.incRef(nextmnt.fs) + vd.DecRef() + vd = VirtualDentry{ + mount: nextmnt, + dentry: nextmnt.root, + } + } + // TODO: Linux requires that either both the mount point and the mount root + // are directories, or neither are, and returns ENOTDIR if this is not the + // case. + mntns := vd.mount.ns + mnt := &Mount{ + fs: fs, + root: root, + ns: mntns, + refs: 1, + } + mnt.storeKey(vd.mount, vd.dentry) + atomic.AddUint32(&vd.dentry.mounts, 1) + mntns.mountpoints[vd.dentry] = struct{}{} + vfsmpmounts, ok := vfs.mountpoints[vd.dentry] + if !ok { + vfsmpmounts = make(map[*Mount]struct{}) + vfs.mountpoints[vd.dentry] = vfsmpmounts + } + vfsmpmounts[mnt] = struct{}{} + vfs.mounts.Insert(mnt) + vfs.mountMu.Unlock() + return nil +} + +// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes +// a reference on the returned Mount. If (mnt, d) is not a mount point, +// getMountAt returns nil. +// +// getMountAt is analogous to Linux's fs/namei.c:follow_mount(). +// +// Preconditions: References are held on mnt and d. +func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount { + // The first mount is special-cased: + // + // - The caller is assumed to have checked d.isMounted() already. (This + // isn't a precondition because it doesn't matter for correctness.) + // + // - We return nil, instead of mnt, if there is no mount at (mnt, d). + // + // - We don't drop the caller's references on mnt and d. +retryFirst: + next := vfs.mounts.Lookup(mnt, d) + if next == nil { + return nil + } + if !next.tryIncMountedRef() { + // Raced with umount. + goto retryFirst + } + mnt = next + d = next.root + // We don't need to take Dentry refs anywhere in this function because + // Mounts hold references on Mount.root, which is immutable. + for d.isMounted() { + next := vfs.mounts.Lookup(mnt, d) + if next == nil { + break + } + if !next.tryIncMountedRef() { + // Raced with umount. + continue + } + mnt.decRef() + mnt = next + d = next.root + } + return mnt +} + +// getMountpointAt returns the mount point for the stack of Mounts including +// mnt. It takes a reference on the returned Mount and Dentry. If no such mount +// point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil). +// +// Preconditions: References are held on mnt and root. vfsroot is not (mnt, +// mnt.root). +func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) (*Mount, *Dentry) { + // The first mount is special-cased: + // + // - The caller must have already checked mnt against vfsroot. + // + // - We return nil, instead of mnt, if there is no mount point for mnt. + // + // - We don't drop the caller's reference on mnt. +retryFirst: + epoch := vfs.mounts.seq.BeginRead() + parent, point := mnt.loadKey() + if !vfs.mounts.seq.ReadOk(epoch) { + goto retryFirst + } + if parent == nil { + return nil, nil + } + if !parent.tryIncMountedRef() { + // Raced with umount. + goto retryFirst + } + if !point.tryIncRef(parent.fs) { + // Since Mount holds a reference on Mount.key.point, this can only + // happen due to a racing change to Mount.key. + parent.decRef() + goto retryFirst + } + mnt = parent + d := point + for { + if mnt == vfsroot.mount && d == vfsroot.dentry { + break + } + if d != mnt.root { + break + } + retryNotFirst: + epoch := vfs.mounts.seq.BeginRead() + parent, point := mnt.loadKey() + if !vfs.mounts.seq.ReadOk(epoch) { + goto retryNotFirst + } + if parent == nil { + break + } + if !parent.tryIncMountedRef() { + // Raced with umount. + goto retryNotFirst + } + if !point.tryIncRef(parent.fs) { + // Since Mount holds a reference on Mount.key.point, this can + // only happen due to a racing change to Mount.key. + parent.decRef() + goto retryNotFirst + } + if !vfs.mounts.seq.ReadOk(epoch) { + point.decRef(parent.fs) + parent.decRef() + goto retryNotFirst + } + d.decRef(mnt.fs) + mnt.decRef() + mnt = parent + d = point + } + return mnt, d +} + +// tryIncMountedRef increments mnt's reference count and returns true. If mnt's +// reference count is already zero, or has been eagerly unmounted, +// tryIncMountedRef does nothing and returns false. +// +// tryIncMountedRef does not require that a reference is held on mnt. +func (mnt *Mount) tryIncMountedRef() bool { + for { + refs := atomic.LoadInt64(&mnt.refs) + if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted + return false + } + if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) { + return true + } + } +} + +func (mnt *Mount) incRef() { + // In general, negative values for mnt.refs are valid because the MSB is + // the eager-unmount bit. + atomic.AddInt64(&mnt.refs, 1) +} + +func (mnt *Mount) decRef() { + refs := atomic.AddInt64(&mnt.refs, -1) + if refs&^math.MinInt64 == 0 { // mask out MSB + parent, point := mnt.loadKey() + if point != nil { + point.decRef(parent.fs) + parent.decRef() + } + mnt.root.decRef(mnt.fs) + mnt.fs.decRef() + } +} + +// CheckBeginWrite increments the counter of in-progress write operations on +// mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns +// EROFS. +// +// If CheckBeginWrite succeeds, EndWrite must be called when the write +// operation is finished. +func (mnt *Mount) CheckBeginWrite() error { + if atomic.AddInt64(&mnt.writers, 1) < 0 { + atomic.AddInt64(&mnt.writers, -1) + return syserror.EROFS + } + return nil +} + +// EndWrite indicates that a write operation signaled by a previous successful +// call to CheckBeginWrite has finished. +func (mnt *Mount) EndWrite() { + atomic.AddInt64(&mnt.writers, -1) +} + +// Preconditions: VirtualFilesystem.mountMu must be locked for writing. +func (mnt *Mount) setReadOnlyLocked(ro bool) error { + if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro { + return nil + } + if ro { + if !atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) { + return syserror.EBUSY + } + return nil + } + // Unset MSB without dropping any temporary increments from failed calls to + // mnt.CheckBeginWrite(). + atomic.AddInt64(&mnt.writers, math.MinInt64) + return nil +} + +// Filesystem returns the mounted Filesystem. It does not take a reference on +// the returned Filesystem. +func (mnt *Mount) Filesystem() *Filesystem { + return mnt.fs +} + +// IncRef increments mntns' reference count. +func (mntns *MountNamespace) IncRef() { + if atomic.AddInt64(&mntns.refs, 1) <= 1 { + panic("MountNamespace.IncRef() called without holding a reference") + } +} + +// DecRef decrements mntns' reference count. +func (mntns *MountNamespace) DecRef() { + if refs := atomic.AddInt64(&mntns.refs, 0); refs == 0 { + // TODO: unmount mntns.root + } else if refs < 0 { + panic("MountNamespace.DecRef() called without holding a reference") + } +} + +// Root returns mntns' root. A reference is taken on the returned +// VirtualDentry. +func (mntns *MountNamespace) Root() VirtualDentry { + vd := VirtualDentry{ + mount: mntns.root, + dentry: mntns.root.root, + } + vd.IncRef() + return vd +} |