summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/vfs/mount.go
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/vfs/mount.go')
-rw-r--r--pkg/sentry/vfs/mount.go411
1 files changed, 411 insertions, 0 deletions
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
new file mode 100644
index 000000000..11702f720
--- /dev/null
+++ b/pkg/sentry/vfs/mount.go
@@ -0,0 +1,411 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "math"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
+// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
+// (Mount.fs), which applies to path resolution in the context of a particular
+// Mount (Mount.key.parent).
+//
+// Mounts are reference-counted. Unless otherwise specified, all Mount methods
+// require that a reference is held.
+//
+// Mount and Filesystem are distinct types because it's possible for a single
+// Filesystem to be mounted at multiple locations and/or in multiple mount
+// namespaces.
+//
+// Mount is analogous to Linux's struct mount. (gVisor does not distinguish
+// between struct mount and struct vfsmount.)
+type Mount struct {
+ // The lower 63 bits of refs are a reference count. The MSB of refs is set
+ // if the Mount has been eagerly unmounted, as by umount(2) without the
+ // MNT_DETACH flag. refs is accessed using atomic memory operations.
+ refs int64
+
+ // The lower 63 bits of writers is the number of calls to
+ // Mount.CheckBeginWrite() that have not yet been paired with a call to
+ // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
+ // writers is accessed using atomic memory operations.
+ writers int64
+
+ // key is protected by VirtualFilesystem.mountMu and
+ // VirtualFilesystem.mounts.seq, and may be nil. References are held on
+ // key.parent and key.point if they are not nil.
+ //
+ // Invariant: key.parent != nil iff key.point != nil. key.point belongs to
+ // key.parent.fs.
+ key mountKey
+
+ // fs, root, and ns are immutable. References are held on fs and root (but
+ // not ns).
+ //
+ // Invariant: root belongs to fs.
+ fs *Filesystem
+ root *Dentry
+ ns *MountNamespace
+}
+
+// A MountNamespace is a collection of Mounts.
+//
+// MountNamespaces are reference-counted. Unless otherwise specified, all
+// MountNamespace methods require that a reference is held.
+//
+// MountNamespace is analogous to Linux's struct mnt_namespace.
+type MountNamespace struct {
+ refs int64 // accessed using atomic memory operations
+
+ // root is the MountNamespace's root mount. root is immutable.
+ root *Mount
+
+ // mountpoints contains all Dentries which are mount points in this
+ // namespace. mountpoints is protected by VirtualFilesystem.mountMu.
+ //
+ // mountpoints is used to determine if a Dentry can be moved or removed
+ // (which requires that the Dentry is not a mount point in the calling
+ // namespace).
+ //
+ // mountpoints is maintained even if there are no references held on the
+ // MountNamespace; this is required to ensure that
+ // VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate
+ // correctly on unreferenced MountNamespaces.
+ mountpoints map[*Dentry]struct{}
+}
+
+// NewMountNamespace returns a new mount namespace with a root filesystem
+// configured by the given arguments. A reference is taken on the returned
+// MountNamespace.
+func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *NewFilesystemOptions) (*MountNamespace, error) {
+ fsType := vfs.getFilesystemType(fsTypeName)
+ if fsType == nil {
+ return nil, syserror.ENODEV
+ }
+ fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts)
+ if err != nil {
+ return nil, err
+ }
+ mntns := &MountNamespace{
+ refs: 1,
+ mountpoints: make(map[*Dentry]struct{}),
+ }
+ mntns.root = &Mount{
+ fs: fs,
+ root: root,
+ ns: mntns,
+ refs: 1,
+ }
+ return mntns, nil
+}
+
+// NewMount creates and mounts a new Filesystem.
+func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *NewFilesystemOptions) error {
+ fsType := vfs.getFilesystemType(fsTypeName)
+ if fsType == nil {
+ return syserror.ENODEV
+ }
+ fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts)
+ if err != nil {
+ return err
+ }
+ // We can't hold vfs.mountMu while calling FilesystemImpl methods due to
+ // lock ordering.
+ vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
+ if err != nil {
+ root.decRef(fs)
+ fs.decRef()
+ return err
+ }
+ vfs.mountMu.Lock()
+ for {
+ if vd.dentry.IsDisowned() {
+ vfs.mountMu.Unlock()
+ vd.DecRef()
+ root.decRef(fs)
+ fs.decRef()
+ return syserror.ENOENT
+ }
+ // vd might have been mounted over between vfs.GetDentryAt() and
+ // vfs.mountMu.Lock().
+ if !vd.dentry.isMounted() {
+ break
+ }
+ nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry)
+ if nextmnt == nil {
+ break
+ }
+ nextmnt.incRef()
+ nextmnt.root.incRef(nextmnt.fs)
+ vd.DecRef()
+ vd = VirtualDentry{
+ mount: nextmnt,
+ dentry: nextmnt.root,
+ }
+ }
+ // TODO: Linux requires that either both the mount point and the mount root
+ // are directories, or neither are, and returns ENOTDIR if this is not the
+ // case.
+ mntns := vd.mount.ns
+ mnt := &Mount{
+ fs: fs,
+ root: root,
+ ns: mntns,
+ refs: 1,
+ }
+ mnt.storeKey(vd.mount, vd.dentry)
+ atomic.AddUint32(&vd.dentry.mounts, 1)
+ mntns.mountpoints[vd.dentry] = struct{}{}
+ vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
+ if !ok {
+ vfsmpmounts = make(map[*Mount]struct{})
+ vfs.mountpoints[vd.dentry] = vfsmpmounts
+ }
+ vfsmpmounts[mnt] = struct{}{}
+ vfs.mounts.Insert(mnt)
+ vfs.mountMu.Unlock()
+ return nil
+}
+
+// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
+// a reference on the returned Mount. If (mnt, d) is not a mount point,
+// getMountAt returns nil.
+//
+// getMountAt is analogous to Linux's fs/namei.c:follow_mount().
+//
+// Preconditions: References are held on mnt and d.
+func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount {
+ // The first mount is special-cased:
+ //
+ // - The caller is assumed to have checked d.isMounted() already. (This
+ // isn't a precondition because it doesn't matter for correctness.)
+ //
+ // - We return nil, instead of mnt, if there is no mount at (mnt, d).
+ //
+ // - We don't drop the caller's references on mnt and d.
+retryFirst:
+ next := vfs.mounts.Lookup(mnt, d)
+ if next == nil {
+ return nil
+ }
+ if !next.tryIncMountedRef() {
+ // Raced with umount.
+ goto retryFirst
+ }
+ mnt = next
+ d = next.root
+ // We don't need to take Dentry refs anywhere in this function because
+ // Mounts hold references on Mount.root, which is immutable.
+ for d.isMounted() {
+ next := vfs.mounts.Lookup(mnt, d)
+ if next == nil {
+ break
+ }
+ if !next.tryIncMountedRef() {
+ // Raced with umount.
+ continue
+ }
+ mnt.decRef()
+ mnt = next
+ d = next.root
+ }
+ return mnt
+}
+
+// getMountpointAt returns the mount point for the stack of Mounts including
+// mnt. It takes a reference on the returned Mount and Dentry. If no such mount
+// point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil).
+//
+// Preconditions: References are held on mnt and root. vfsroot is not (mnt,
+// mnt.root).
+func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) (*Mount, *Dentry) {
+ // The first mount is special-cased:
+ //
+ // - The caller must have already checked mnt against vfsroot.
+ //
+ // - We return nil, instead of mnt, if there is no mount point for mnt.
+ //
+ // - We don't drop the caller's reference on mnt.
+retryFirst:
+ epoch := vfs.mounts.seq.BeginRead()
+ parent, point := mnt.loadKey()
+ if !vfs.mounts.seq.ReadOk(epoch) {
+ goto retryFirst
+ }
+ if parent == nil {
+ return nil, nil
+ }
+ if !parent.tryIncMountedRef() {
+ // Raced with umount.
+ goto retryFirst
+ }
+ if !point.tryIncRef(parent.fs) {
+ // Since Mount holds a reference on Mount.key.point, this can only
+ // happen due to a racing change to Mount.key.
+ parent.decRef()
+ goto retryFirst
+ }
+ mnt = parent
+ d := point
+ for {
+ if mnt == vfsroot.mount && d == vfsroot.dentry {
+ break
+ }
+ if d != mnt.root {
+ break
+ }
+ retryNotFirst:
+ epoch := vfs.mounts.seq.BeginRead()
+ parent, point := mnt.loadKey()
+ if !vfs.mounts.seq.ReadOk(epoch) {
+ goto retryNotFirst
+ }
+ if parent == nil {
+ break
+ }
+ if !parent.tryIncMountedRef() {
+ // Raced with umount.
+ goto retryNotFirst
+ }
+ if !point.tryIncRef(parent.fs) {
+ // Since Mount holds a reference on Mount.key.point, this can
+ // only happen due to a racing change to Mount.key.
+ parent.decRef()
+ goto retryNotFirst
+ }
+ if !vfs.mounts.seq.ReadOk(epoch) {
+ point.decRef(parent.fs)
+ parent.decRef()
+ goto retryNotFirst
+ }
+ d.decRef(mnt.fs)
+ mnt.decRef()
+ mnt = parent
+ d = point
+ }
+ return mnt, d
+}
+
+// tryIncMountedRef increments mnt's reference count and returns true. If mnt's
+// reference count is already zero, or has been eagerly unmounted,
+// tryIncMountedRef does nothing and returns false.
+//
+// tryIncMountedRef does not require that a reference is held on mnt.
+func (mnt *Mount) tryIncMountedRef() bool {
+ for {
+ refs := atomic.LoadInt64(&mnt.refs)
+ if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted
+ return false
+ }
+ if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) {
+ return true
+ }
+ }
+}
+
+func (mnt *Mount) incRef() {
+ // In general, negative values for mnt.refs are valid because the MSB is
+ // the eager-unmount bit.
+ atomic.AddInt64(&mnt.refs, 1)
+}
+
+func (mnt *Mount) decRef() {
+ refs := atomic.AddInt64(&mnt.refs, -1)
+ if refs&^math.MinInt64 == 0 { // mask out MSB
+ parent, point := mnt.loadKey()
+ if point != nil {
+ point.decRef(parent.fs)
+ parent.decRef()
+ }
+ mnt.root.decRef(mnt.fs)
+ mnt.fs.decRef()
+ }
+}
+
+// CheckBeginWrite increments the counter of in-progress write operations on
+// mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns
+// EROFS.
+//
+// If CheckBeginWrite succeeds, EndWrite must be called when the write
+// operation is finished.
+func (mnt *Mount) CheckBeginWrite() error {
+ if atomic.AddInt64(&mnt.writers, 1) < 0 {
+ atomic.AddInt64(&mnt.writers, -1)
+ return syserror.EROFS
+ }
+ return nil
+}
+
+// EndWrite indicates that a write operation signaled by a previous successful
+// call to CheckBeginWrite has finished.
+func (mnt *Mount) EndWrite() {
+ atomic.AddInt64(&mnt.writers, -1)
+}
+
+// Preconditions: VirtualFilesystem.mountMu must be locked for writing.
+func (mnt *Mount) setReadOnlyLocked(ro bool) error {
+ if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro {
+ return nil
+ }
+ if ro {
+ if !atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) {
+ return syserror.EBUSY
+ }
+ return nil
+ }
+ // Unset MSB without dropping any temporary increments from failed calls to
+ // mnt.CheckBeginWrite().
+ atomic.AddInt64(&mnt.writers, math.MinInt64)
+ return nil
+}
+
+// Filesystem returns the mounted Filesystem. It does not take a reference on
+// the returned Filesystem.
+func (mnt *Mount) Filesystem() *Filesystem {
+ return mnt.fs
+}
+
+// IncRef increments mntns' reference count.
+func (mntns *MountNamespace) IncRef() {
+ if atomic.AddInt64(&mntns.refs, 1) <= 1 {
+ panic("MountNamespace.IncRef() called without holding a reference")
+ }
+}
+
+// DecRef decrements mntns' reference count.
+func (mntns *MountNamespace) DecRef() {
+ if refs := atomic.AddInt64(&mntns.refs, 0); refs == 0 {
+ // TODO: unmount mntns.root
+ } else if refs < 0 {
+ panic("MountNamespace.DecRef() called without holding a reference")
+ }
+}
+
+// Root returns mntns' root. A reference is taken on the returned
+// VirtualDentry.
+func (mntns *MountNamespace) Root() VirtualDentry {
+ vd := VirtualDentry{
+ mount: mntns.root,
+ dentry: mntns.root.root,
+ }
+ vd.IncRef()
+ return vd
+}