summary refs log tree commit diff homepage
path: root/pkg/sentry/vfs
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/vfs')
-rw-r--r-- pkg/sentry/vfs/BUILD 46
-rw-r--r-- pkg/sentry/vfs/README.md 197
-rw-r--r-- pkg/sentry/vfs/context.go 37
-rw-r--r-- pkg/sentry/vfs/debug.go 22
-rw-r--r-- pkg/sentry/vfs/dentry.go 347
-rw-r--r-- pkg/sentry/vfs/file_description.go 213
-rw-r--r-- pkg/sentry/vfs/file_description_impl_util.go 142
-rw-r--r-- pkg/sentry/vfs/filesystem.go 155
-rw-r--r-- pkg/sentry/vfs/filesystem_type.go 70
-rw-r--r-- pkg/sentry/vfs/mount.go 411
-rw-r--r-- pkg/sentry/vfs/mount_test.go 465
-rw-r--r-- pkg/sentry/vfs/mount_unsafe.go 356
-rw-r--r-- pkg/sentry/vfs/options.go 123
-rw-r--r-- pkg/sentry/vfs/permissions.go 121
-rw-r--r-- pkg/sentry/vfs/resolving_path.go 453
-rw-r--r-- pkg/sentry/vfs/syscalls.go 217
-rw-r--r-- pkg/sentry/vfs/vfs.go 135
17 files changed, 0 insertions, 3510 deletions
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
deleted file mode 100644
index 4de6c41cf..000000000
--- a/pkg/sentry/vfs/BUILD
+++ /dev/null
@@ -1,46 +0,0 @@
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
-
-package(licenses = ["notice"])
-
-go_library(
- name = "vfs",
- srcs = [
- "context.go",
- "debug.go",
- "dentry.go",
- "file_description.go",
- "file_description_impl_util.go",
- "filesystem.go",
- "filesystem_type.go",
- "mount.go",
- "mount_unsafe.go",
- "options.go",
- "permissions.go",
- "resolving_path.go",
- "syscalls.go",
- "vfs.go",
- ],
- importpath = "gvisor.dev/gvisor/pkg/sentry/vfs",
- visibility = ["//pkg/sentry:internal"],
- deps = [
- "//pkg/abi/linux",
- "//pkg/fspath",
- "//pkg/sentry/arch",
- "//pkg/sentry/context",
- "//pkg/sentry/kernel/auth",
- "//pkg/sentry/memmap",
- "//pkg/sentry/usermem",
- "//pkg/syserror",
- "//pkg/waiter",
- "//third_party/gvsync",
- ],
-)
-
-go_test(
- name = "vfs_test",
- size = "small",
- srcs = [
- "mount_test.go",
- ],
- embed = [":vfs"],
-)
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
deleted file mode 100644
index 7847854bc..000000000
--- a/pkg/sentry/vfs/README.md
+++ /dev/null
@@ -1,197 +0,0 @@
-# The gVisor Virtual Filesystem
-
-THIS PACKAGE IS CURRENTLY EXPERIMENTAL AND NOT READY OR ENABLED FOR PRODUCTION
-USE. For the filesystem implementation currently used by gVisor, see the `fs`
-package.
-
-## Implementation Notes
-
-### Reference Counting
-
-Filesystem, Dentry, Mount, MountNamespace, and FileDescription are all
-reference-counted. Mount and MountNamespace are exclusively VFS-managed; when
-their reference count reaches zero, VFS releases their resources. Filesystem and
-FileDescription management is shared between VFS and filesystem implementations;
-when their reference count reaches zero, VFS notifies the implementation by
-calling `FilesystemImpl.Release()` or `FileDescriptionImpl.Release()`
-respectively and then releases VFS-owned resources. Dentries are exclusively
-managed by filesystem implementations; reference count changes are abstracted
-through DentryImpl, which should release resources when reference count reaches
-zero.
-
-Filesystem references are held by:
-
-- Mount: Each referenced Mount holds a reference on the mounted Filesystem.
-
-Dentry references are held by:
-
-- FileDescription: Each referenced FileDescription holds a reference on the
- Dentry through which it was opened, via `FileDescription.vd.dentry`.
-
-- Mount: Each referenced Mount holds a reference on its mount point and on the
- mounted filesystem root. The mount point is mutable (`mount(MS_MOVE)`).
-
-Mount references are held by:
-
-- FileDescription: Each referenced FileDescription holds a reference on the
- Mount on which it was opened, via `FileDescription.vd.mount`.
-
-- Mount: Each referenced Mount holds a reference on its parent, which is the
- mount containing its mount point.
-
-- VirtualFilesystem: A reference is held on all Mounts that are attached
- (reachable by Mount traversal).
-
-MountNamespace and FileDescription references are held by users of VFS. The
-expectation is that each `kernel.Task` holds a reference on its corresponding
-MountNamespace, and each file descriptor holds a reference on its represented
-FileDescription.
-
-Notes:
-
-- Dentries do not hold a reference on their owning Filesystem. Instead, all
- uses of a Dentry occur in the context of a Mount, which holds a reference on
- the relevant Filesystem (see e.g. the VirtualDentry type). As a corollary,
- when releasing references on both a Dentry and its corresponding Mount, the
- Dentry's reference must be released first (because releasing the Mount's
- reference may release the last reference on the Filesystem, whose state may
- be required to release the Dentry reference).
-
-### The Inheritance Pattern
-
-Filesystem, Dentry, and FileDescription are all concepts featuring both state
-that must be shared between VFS and filesystem implementations, and operations
-that are implementation-defined. To facilitate this, each of these three
-concepts follows the same pattern, shown below for Dentry:
-
-```go
-// Dentry represents a node in a filesystem tree.
-type Dentry struct {
- // VFS-required dentry state.
- parent *Dentry
- // ...
-
- // impl is the DentryImpl associated with this Dentry. impl is immutable.
- // This should be the last field in Dentry.
- impl DentryImpl
-}
-
-// Init must be called before first use of d.
-func (d *Dentry) Init(impl DentryImpl) {
- d.impl = impl
-}
-
-// Impl returns the DentryImpl associated with d.
-func (d *Dentry) Impl() DentryImpl {
- return d.impl
-}
-
-// DentryImpl contains implementation-specific details of a Dentry.
-// Implementations of DentryImpl should contain their associated Dentry by
-// value as their first field.
-type DentryImpl interface {
- // VFS-required implementation-defined dentry operations.
- IncRef()
- // ...
-}
-```
-
-This construction, which is essentially a type-safe analogue to Linux's
-`container_of` pattern, has the following properties:
-
-- VFS works almost exclusively with pointers to Dentry rather than DentryImpl
- interface objects, such as in the type of `Dentry.parent`. This avoids
- interface method calls (which are somewhat expensive to perform, and defeat
- inlining and escape analysis), reduces the size of VFS types (since an
- interface object is two pointers in size), and allows pointers to be loaded
- and stored atomically using `sync/atomic`. Implementation-defined behavior
- is accessed via `Dentry.impl` when required.
-
-- Filesystem implementations can access the implementation-defined state
- associated with objects of VFS types by type-asserting or type-switching
- (e.g. `Dentry.Impl().(*myDentry)`). Type assertions to a concrete type
- require only an equality comparison of the interface object's type pointer
- to a static constant, and are consequently very fast.
-
-- Filesystem implementations can access the VFS state associated with objects
- of implementation-defined types directly.
-
-- VFS and implementation-defined state for a given type occupy the same
- object, minimizing memory allocations and maximizing memory locality. `impl`
- is the last field in `Dentry`, and `Dentry` is the first field in
- `DentryImpl` implementations, for similar reasons: this tends to cause
- fetching of the `Dentry.impl` interface object to also fetch `DentryImpl`
- fields, either because they are in the same cache line or via next-line
- prefetching.
-
-## Future Work
-
-- Most `mount(2)` features, and unmounting, are incomplete.
-
-- VFS1 filesystems are not directly compatible with VFS2. It may be possible
- to implement shims that implement `vfs.FilesystemImpl` for
- `fs.MountNamespace`, `vfs.DentryImpl` for `fs.Dirent`, and
- `vfs.FileDescriptionImpl` for `fs.File`, which may be adequate for
- filesystems that are not performance-critical (e.g. sysfs); however, it is
- not clear that this will be less effort than simply porting the filesystems
- in question. Practically speaking, the following filesystems will probably
- need to be ported or made compatible through a shim to evaluate filesystem
- performance on realistic workloads:
-
- - devfs/procfs/sysfs, which will realistically be necessary to execute
- most applications. (Note that procfs and sysfs do not support hard
- links, so they do not require the complexity of separate inode objects.
- Also note that Linux's /dev is actually a variant of tmpfs called
- devtmpfs.)
-
- - tmpfs. This should be relatively straightforward: copy/paste memfs,
- store regular file contents in pgalloc-allocated memory instead of
- `[]byte`, and add support for file timestamps. (In fact, it probably
- makes more sense to convert memfs to tmpfs and not keep the former.)
-
- - A remote filesystem, either lisafs (if it is ready by the time that
- other benchmarking prerequisites are) or v9fs (aka 9P, aka gofers).
-
- - epoll files.
-
- Filesystems that will need to be ported before switching to VFS2, but can
- probably be skipped for early testing:
-
- - overlayfs, which is needed for (at least) synthetic mount points.
-
- - Support for host ttys.
-
- - timerfd files.
-
- Filesystems that can probably be dropped:
-
- - ashmem, which is far too incomplete to use.
-
- - binder, which is similarly far too incomplete to use.
-
- - whitelistfs, which we are already actively attempting to remove.
-
-- Save/restore. For instance, it is unclear if the current implementation of
- the `state` package supports the inheritance pattern described above.
-
-- Many features that were previously implemented by VFS must now be
- implemented by individual filesystems (though, in most cases, this should
- consist of calls to hooks or libraries provided by `vfs` or other packages).
- This includes, but is not necessarily limited to:
-
- - Block and character device special files
-
- - Inotify
-
- - File locking
-
- - `O_ASYNC`
-
-- Reference counts in the `vfs` package do not use the `refs` package since
- `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference
- count, resulting in considerable cache bloat. 24 bytes of this overhead is
- for weak reference support, which has poor performance and will not be used
- by VFS2. The remaining 40 bytes is to store a descriptive string and stack
- trace for reference leak checking; we can support reference leak checking
- without incurring this space overhead by including the applicable
- information directly in finalizers for applicable types.
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
deleted file mode 100644
index 32cf9151b..000000000
--- a/pkg/sentry/vfs/context.go
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "gvisor.dev/gvisor/pkg/sentry/context"
-)
-
-// contextID is this package's type for context.Context.Value keys.
-type contextID int
-
-const (
- // CtxMountNamespace is a Context.Value key for a MountNamespace.
- CtxMountNamespace contextID = iota
-)
-
-// MountNamespaceFromContext returns the MountNamespace used by ctx. It does
-// not take a reference on the returned MountNamespace. If ctx is not
-// associated with a MountNamespace, MountNamespaceFromContext returns nil.
-func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
- if v := ctx.Value(CtxMountNamespace); v != nil {
- return v.(*MountNamespace)
- }
- return nil
-}
diff --git a/pkg/sentry/vfs/debug.go b/pkg/sentry/vfs/debug.go
deleted file mode 100644
index 0ed20f249..000000000
--- a/pkg/sentry/vfs/debug.go
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-const (
- // If checkInvariants is true, perform runtime checks for invariants
- // expected by the vfs package. This is normally disabled since VFS is
- // often a hot path.
- checkInvariants = false
-)
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
deleted file mode 100644
index 45912fc58..000000000
--- a/pkg/sentry/vfs/dentry.go
+++ /dev/null
@@ -1,347 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "fmt"
- "sync/atomic"
-
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-// Dentry represents a node in a Filesystem tree which may represent a file.
-//
-// Dentries are reference-counted. Unless otherwise specified, all Dentry
-// methods require that a reference is held.
-//
-// A Dentry transitions through up to 3 different states through its lifetime:
-//
-// - Dentries are initially "independent". Independent Dentries have no parent,
-// and consequently no name.
-//
-// - Dentry.InsertChild() causes an independent Dentry to become a "child" of
-// another Dentry. A child node has a parent node, and a name in that parent,
-// both of which are mutable by DentryMoveChild(). Each child Dentry's name is
-// unique within its parent.
-//
-// - Dentry.RemoveChild() causes a child Dentry to become "disowned". A
-// disowned Dentry can still refer to its former parent and its former name in
-// said parent, but the disowned Dentry is no longer reachable from its parent,
-// and a new Dentry with the same name may become a child of the parent. (This
-// is analogous to a struct dentry being "unhashed" in Linux.)
-//
-// Dentry is loosely analogous to Linux's struct dentry, but:
-//
-// - VFS does not associate Dentries with inodes. gVisor interacts primarily
-// with filesystems that are accessed through filesystem APIs (as opposed to
-// raw block devices); many such APIs support only paths and file descriptors,
-// and not inodes. Furthermore, when parties outside the scope of VFS can
-// rename inodes on such filesystems, VFS generally cannot "follow" the rename,
-// both due to synchronization issues and because it may not even be able to
-// name the destination path; this implies that it would in fact be *incorrect*
-// for Dentries to be associated with inodes on such filesystems. Consequently,
-// operations that are inode operations in Linux are FilesystemImpl methods
-// and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do
-// support inodes may store appropriate state in implementations of DentryImpl.
-//
-// - VFS does not provide synchronization for mutable Dentry fields, other than
-// mount-related ones.
-//
-// - VFS does not require that Dentries are instantiated for all paths accessed
-// through VFS, only those that are tracked beyond the scope of a single
-// Filesystem operation. This includes file descriptions, mount points, mount
-// roots, process working directories, and chroots. This avoids instantiation
-// of Dentries for operations on mutable remote filesystems that can't actually
-// cache any state in the Dentry.
-//
-// - For the reasons above, VFS is not directly responsible for managing Dentry
-// lifetime. Dentry reference counts only indicate the extent to which VFS
-// requires Dentries to exist; Filesystems may elect to cache or discard
-// Dentries with zero references.
-type Dentry struct {
- // parent is this Dentry's parent in this Filesystem. If this Dentry is
- // independent, parent is nil.
- parent *Dentry
-
- // name is this Dentry's name in parent.
- name string
-
- flags uint32
-
- // mounts is the number of Mounts for which this Dentry is Mount.point.
- // mounts is accessed using atomic memory operations.
- mounts uint32
-
- // children are child Dentries.
- children map[string]*Dentry
-
- // impl is the DentryImpl associated with this Dentry. impl is immutable.
- // This should be the last field in Dentry.
- impl DentryImpl
-}
-
-const (
- // dflagsDisownedMask is set in Dentry.flags if the Dentry has been
- // disowned.
- dflagsDisownedMask = 1 << iota
-)
-
-// Init must be called before first use of d.
-func (d *Dentry) Init(impl DentryImpl) {
- d.impl = impl
-}
-
-// Impl returns the DentryImpl associated with d.
-func (d *Dentry) Impl() DentryImpl {
- return d.impl
-}
-
-// DentryImpl contains implementation details for a Dentry. Implementations of
-// DentryImpl should contain their associated Dentry by value as their first
-// field.
-type DentryImpl interface {
- // IncRef increments the Dentry's reference count. A Dentry with a non-zero
- // reference count must remain coherent with the state of the filesystem.
- IncRef(fs *Filesystem)
-
- // TryIncRef increments the Dentry's reference count and returns true. If
- // the Dentry's reference count is zero, TryIncRef may do nothing and
- // return false. (It is also permitted to succeed if it can restore the
- // guarantee that the Dentry is coherent with the state of the filesystem.)
- //
- // TryIncRef does not require that a reference is held on the Dentry.
- TryIncRef(fs *Filesystem) bool
-
- // DecRef decrements the Dentry's reference count.
- DecRef(fs *Filesystem)
-}
-
-// IsDisowned returns true if d is disowned.
-func (d *Dentry) IsDisowned() bool {
- return atomic.LoadUint32(&d.flags)&dflagsDisownedMask != 0
-}
-
-// Preconditions: !d.IsDisowned().
-func (d *Dentry) setDisowned() {
- atomic.AddUint32(&d.flags, dflagsDisownedMask)
-}
-
-func (d *Dentry) isMounted() bool {
- return atomic.LoadUint32(&d.mounts) != 0
-}
-
-func (d *Dentry) incRef(fs *Filesystem) {
- d.impl.IncRef(fs)
-}
-
-func (d *Dentry) tryIncRef(fs *Filesystem) bool {
- return d.impl.TryIncRef(fs)
-}
-
-func (d *Dentry) decRef(fs *Filesystem) {
- d.impl.DecRef(fs)
-}
-
-// These functions are exported so that filesystem implementations can use
-// them. The vfs package, and users of VFS, should not call these functions.
-// Unless otherwise specified, these methods require that there are no
-// concurrent mutators of d.
-
-// Name returns d's name in its parent in its owning Filesystem. If d is
-// independent, Name returns an empty string.
-func (d *Dentry) Name() string {
- return d.name
-}
-
-// Parent returns d's parent in its owning Filesystem. It does not take a
-// reference on the returned Dentry. If d is independent, Parent returns nil.
-func (d *Dentry) Parent() *Dentry {
- return d.parent
-}
-
-// ParentOrSelf is equivalent to Parent, but returns d if d is independent.
-func (d *Dentry) ParentOrSelf() *Dentry {
- if d.parent == nil {
- return d
- }
- return d.parent
-}
-
-// Child returns d's child with the given name in its owning Filesystem. It
-// does not take a reference on the returned Dentry. If no such child exists,
-// Child returns nil.
-func (d *Dentry) Child(name string) *Dentry {
- return d.children[name]
-}
-
-// HasChildren returns true if d has any children.
-func (d *Dentry) HasChildren() bool {
- return len(d.children) != 0
-}
-
-// InsertChild makes child a child of d with the given name.
-//
-// InsertChild is a mutator of d and child.
-//
-// Preconditions: child must be an independent Dentry. d and child must be from
-// the same Filesystem. d must not already have a child with the given name.
-func (d *Dentry) InsertChild(child *Dentry, name string) {
- if checkInvariants {
- if _, ok := d.children[name]; ok {
- panic(fmt.Sprintf("parent already contains a child named %q", name))
- }
- if child.parent != nil || child.name != "" {
- panic(fmt.Sprintf("child is not independent: parent = %v, name = %q", child.parent, child.name))
- }
- }
- if d.children == nil {
- d.children = make(map[string]*Dentry)
- }
- d.children[name] = child
- child.parent = d
- child.name = name
-}
-
-// PrepareDeleteDentry must be called before attempting to delete the file
-// represented by d. If PrepareDeleteDentry succeeds, the caller must call
-// AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome.
-//
-// Preconditions: d is a child Dentry.
-func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error {
- if checkInvariants {
- if d.parent == nil {
- panic("d is independent")
- }
- if d.IsDisowned() {
- panic("d is already disowned")
- }
- }
- vfs.mountMu.RLock()
- if _, ok := mntns.mountpoints[d]; ok {
- vfs.mountMu.RUnlock()
- return syserror.EBUSY
- }
- // Return with vfs.mountMu locked, which will be unlocked by
- // AbortDeleteDentry or CommitDeleteDentry.
- return nil
-}
-
-// AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion
-// fails.
-func (vfs *VirtualFilesystem) AbortDeleteDentry() {
- vfs.mountMu.RUnlock()
-}
-
-// CommitDeleteDentry must be called after the file represented by d is
-// deleted, and causes d to become disowned.
-//
-// Preconditions: PrepareDeleteDentry was previously called on d.
-func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
- delete(d.parent.children, d.name)
- d.setDisowned()
- // TODO: lazily unmount mounts at d
- vfs.mountMu.RUnlock()
-}
-
-// DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as
-// appropriate for in-memory filesystems that don't need to ensure that some
-// external state change succeeds before committing the deletion.
-func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error {
- if err := vfs.PrepareDeleteDentry(mntns, d); err != nil {
- return err
- }
- vfs.CommitDeleteDentry(d)
- return nil
-}
-
-// PrepareRenameDentry must be called before attempting to rename the file
-// represented by from. If to is not nil, it represents the file that will be
-// replaced or exchanged by the rename. If PrepareRenameDentry succeeds, the
-// caller must call AbortRenameDentry, CommitRenameReplaceDentry, or
-// CommitRenameExchangeDentry depending on the rename's outcome.
-//
-// Preconditions: from is a child Dentry. If to is not nil, it must be a child
-// Dentry from the same Filesystem.
-func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error {
- if checkInvariants {
- if from.parent == nil {
- panic("from is independent")
- }
- if from.IsDisowned() {
- panic("from is already disowned")
- }
- if to != nil {
- if to.parent == nil {
- panic("to is independent")
- }
- if to.IsDisowned() {
- panic("to is already disowned")
- }
- }
- }
- vfs.mountMu.RLock()
- if _, ok := mntns.mountpoints[from]; ok {
- vfs.mountMu.RUnlock()
- return syserror.EBUSY
- }
- if to != nil {
- if _, ok := mntns.mountpoints[to]; ok {
- vfs.mountMu.RUnlock()
- return syserror.EBUSY
- }
- }
- // Return with vfs.mountMu locked, which will be unlocked by
- // AbortRenameDentry, CommitRenameReplaceDentry, or
- // CommitRenameExchangeDentry.
- return nil
-}
-
-// AbortRenameDentry must be called after PrepareRenameDentry if the rename
-// fails.
-func (vfs *VirtualFilesystem) AbortRenameDentry() {
- vfs.mountMu.RUnlock()
-}
-
-// CommitRenameReplaceDentry must be called after the file represented by from
-// is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file
-// that was replaced by from.
-//
-// Preconditions: PrepareRenameDentry was previously called on from and to.
-// newParent.Child(newName) == to.
-func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) {
- if to != nil {
- to.setDisowned()
- // TODO: lazily unmount mounts at to
- }
- if newParent.children == nil {
- newParent.children = make(map[string]*Dentry)
- }
- newParent.children[newName] = from
- from.parent = newParent
- from.name = newName
- vfs.mountMu.RUnlock()
-}
-
-// CommitRenameExchangeDentry must be called after the files represented by
-// from and to are exchanged by rename(RENAME_EXCHANGE).
-//
-// Preconditions: PrepareRenameDentry was previously called on from and to.
-func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) {
- from.parent, to.parent = to.parent, from.parent
- from.name, to.name = to.name, from.name
- from.parent.children[from.name] = from
- to.parent.children[to.name] = to
- vfs.mountMu.RUnlock()
-}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
deleted file mode 100644
index 86bde7fb3..000000000
--- a/pkg/sentry/vfs/file_description.go
+++ /dev/null
@@ -1,213 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "sync/atomic"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
- "gvisor.dev/gvisor/pkg/waiter"
-)
-
-// A FileDescription represents an open file description, which is the entity
-// referred to by a file descriptor (POSIX.1-2017 3.258 "Open File
-// Description").
-//
-// FileDescriptions are reference-counted. Unless otherwise specified, all
-// FileDescription methods require that a reference is held.
-//
-// FileDescription is analogous to Linux's struct file.
-type FileDescription struct {
- // refs is the reference count. refs is accessed using atomic memory
- // operations.
- refs int64
-
- // vd is the filesystem location at which this FileDescription was opened.
- // A reference is held on vd. vd is immutable.
- vd VirtualDentry
-
- // impl is the FileDescriptionImpl associated with this FileDescription.
- // impl is immutable. This should be the last field in FileDescription.
- impl FileDescriptionImpl
-}
-
-// Init must be called before first use of fd. It takes references on mnt and
-// d.
-func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) {
- fd.refs = 1
- fd.vd = VirtualDentry{
- mount: mnt,
- dentry: d,
- }
- fd.vd.IncRef()
- fd.impl = impl
-}
-
-// Impl returns the FileDescriptionImpl associated with fd.
-func (fd *FileDescription) Impl() FileDescriptionImpl {
- return fd.impl
-}
-
-// VirtualDentry returns the location at which fd was opened. It does not take
-// a reference on the returned VirtualDentry.
-func (fd *FileDescription) VirtualDentry() VirtualDentry {
- return fd.vd
-}
-
-// IncRef increments fd's reference count.
-func (fd *FileDescription) IncRef() {
- atomic.AddInt64(&fd.refs, 1)
-}
-
-// DecRef decrements fd's reference count.
-func (fd *FileDescription) DecRef() {
- if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
- fd.impl.Release()
- fd.vd.DecRef()
- } else if refs < 0 {
- panic("FileDescription.DecRef() called without holding a reference")
- }
-}
-
-// FileDescriptionImpl contains implementation details for a FileDescription.
-// Implementations of FileDescriptionImpl should contain their associated
-// FileDescription by value as their first field.
-//
-// For all functions that return linux.Statx, Statx.Uid and Statx.Gid will
-// be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and
-// auth.KGID respectively).
-//
-// FileDescriptionImpl is analogous to Linux's struct file_operations.
-type FileDescriptionImpl interface {
- // Release is called when the associated FileDescription reaches zero
- // references.
- Release()
-
- // OnClose is called when a file descriptor representing the
- // FileDescription is closed. Note that returning a non-nil error does not
- // prevent the file descriptor from being closed.
- OnClose() error
-
- // StatusFlags returns file description status flags, as for
- // fcntl(F_GETFL).
- StatusFlags(ctx context.Context) (uint32, error)
-
- // SetStatusFlags sets file description status flags, as for
- // fcntl(F_SETFL).
- SetStatusFlags(ctx context.Context, flags uint32) error
-
- // Stat returns metadata for the file represented by the FileDescription.
- Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)
-
- // SetStat updates metadata for the file represented by the
- // FileDescription.
- SetStat(ctx context.Context, opts SetStatOptions) error
-
- // StatFS returns metadata for the filesystem containing the file
- // represented by the FileDescription.
- StatFS(ctx context.Context) (linux.Statfs, error)
-
- // waiter.Waitable methods may be used to poll for I/O events.
- waiter.Waitable
-
- // PRead reads from the file into dst, starting at the given offset, and
- // returns the number of bytes read. PRead is permitted to return partial
- // reads with a nil error.
- PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
-
- // Read is similar to PRead, but does not specify an offset.
- //
- // For files with an implicit FileDescription offset (e.g. regular files),
- // Read begins at the FileDescription offset, and advances the offset by
- // the number of bytes read; note that POSIX 2.9.7 "Thread Interactions
- // with Regular File Operations" requires that all operations that may
- // mutate the FileDescription offset are serialized.
- Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error)
-
- // PWrite writes src to the file, starting at the given offset, and returns
- // the number of bytes written. PWrite is permitted to return partial
- // writes with a nil error.
- //
- // As in Linux (but not POSIX), if O_APPEND is in effect for the
- // FileDescription, PWrite should ignore the offset and append data to the
- // end of the file.
- PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
-
- // Write is similar to PWrite, but does not specify an offset, which is
- // implied as for Read.
- //
- // Write is a FileDescriptionImpl method, instead of a wrapper around
- // PWrite that uses a FileDescription offset, to make it possible for
- // remote filesystems to implement O_APPEND correctly (i.e. atomically with
- // respect to writers outside the scope of VFS).
- Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error)
-
- // IterDirents invokes cb on each entry in the directory represented by the
- // FileDescription. If IterDirents has been called since the last call to
- // Seek, it continues iteration from the end of the last call.
- IterDirents(ctx context.Context, cb IterDirentsCallback) error
-
- // Seek changes the FileDescription offset (assuming one exists) and
- // returns its new value.
- //
- // For directories, if whence == SEEK_SET and offset == 0, the caller is
- // rewinddir(), such that Seek "shall also cause the directory stream to
- // refer to the current state of the corresponding directory" -
- // POSIX.1-2017.
- Seek(ctx context.Context, offset int64, whence int32) (int64, error)
-
- // Sync requests that cached state associated with the file represented by
- // the FileDescription is synchronized with persistent storage, and blocks
- // until this is complete.
- Sync(ctx context.Context) error
-
- // ConfigureMMap mutates opts to implement mmap(2) for the file. Most
- // implementations that support memory mapping can call
- // GenericConfigureMMap with the appropriate memmap.Mappable.
- ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error
-
- // Ioctl implements the ioctl(2) syscall.
- Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
-
- // TODO: extended attributes; file locking
-}
-
-// Dirent holds the information contained in struct linux_dirent64.
-type Dirent struct {
- // Name is the filename.
- Name string
-
- // Type is the file type, a linux.DT_* constant.
- Type uint8
-
- // Ino is the inode number.
- Ino uint64
-
- // Off is this Dirent's offset.
- Off int64
-}
-
-// IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents.
-type IterDirentsCallback interface {
- // Handle handles the given iterated Dirent. It returns true if iteration
- // should continue, and false if FileDescriptionImpl.IterDirents should
- // terminate now and restart with the same Dirent the next time it is
- // called.
- Handle(dirent Dirent) bool
-}
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
deleted file mode 100644
index 486893e70..000000000
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
- "gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/waiter"
-)
-
-// FileDescriptionDefaultImpl may be embedded by implementations of
-// FileDescriptionImpl to obtain implementations of many FileDescriptionImpl
-// methods with default behavior analogous to Linux's.
-type FileDescriptionDefaultImpl struct{}
-
-// OnClose implements FileDescriptionImpl.OnClose analogously to
-// file_operations::flush == NULL in Linux.
-func (FileDescriptionDefaultImpl) OnClose() error {
- return nil
-}
-
-// StatFS implements FileDescriptionImpl.StatFS analogously to
-// super_operations::statfs == NULL in Linux.
-func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, error) {
- return linux.Statfs{}, syserror.ENOSYS
-}
-
-// Readiness implements waiter.Waitable.Readiness analogously to
-// file_operations::poll == NULL in Linux.
-func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask {
- // include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK
- return waiter.EventIn | waiter.EventOut
-}
-
-// EventRegister implements waiter.Waitable.EventRegister analogously to
-// file_operations::poll == NULL in Linux.
-func (FileDescriptionDefaultImpl) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
-}
-
-// EventUnregister implements waiter.Waitable.EventUnregister analogously to
-// file_operations::poll == NULL in Linux.
-func (FileDescriptionDefaultImpl) EventUnregister(e *waiter.Entry) {
-}
-
-// PRead implements FileDescriptionImpl.PRead analogously to
-// file_operations::read == file_operations::read_iter == NULL in Linux.
-func (FileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
- return 0, syserror.EINVAL
-}
-
-// Read implements FileDescriptionImpl.Read analogously to
-// file_operations::read == file_operations::read_iter == NULL in Linux.
-func (FileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
- return 0, syserror.EINVAL
-}
-
-// PWrite implements FileDescriptionImpl.PWrite analogously to
-// file_operations::write == file_operations::write_iter == NULL in Linux.
-func (FileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
- return 0, syserror.EINVAL
-}
-
-// Write implements FileDescriptionImpl.Write analogously to
-// file_operations::write == file_operations::write_iter == NULL in Linux.
-func (FileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
- return 0, syserror.EINVAL
-}
-
-// IterDirents implements FileDescriptionImpl.IterDirents analogously to
-// file_operations::iterate == file_operations::iterate_shared == NULL in
-// Linux.
-func (FileDescriptionDefaultImpl) IterDirents(ctx context.Context, cb IterDirentsCallback) error {
- return syserror.ENOTDIR
-}
-
-// Seek implements FileDescriptionImpl.Seek analogously to
-// file_operations::llseek == NULL in Linux.
-func (FileDescriptionDefaultImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
- return 0, syserror.ESPIPE
-}
-
-// Sync implements FileDescriptionImpl.Sync analogously to
-// file_operations::fsync == NULL in Linux.
-func (FileDescriptionDefaultImpl) Sync(ctx context.Context) error {
- return syserror.EINVAL
-}
-
-// ConfigureMMap implements FileDescriptionImpl.ConfigureMMap analogously to
-// file_operations::mmap == NULL in Linux.
-func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
- return syserror.ENODEV
-}
-
-// Ioctl implements FileDescriptionImpl.Ioctl analogously to
-// file_operations::unlocked_ioctl == NULL in Linux.
-func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
- return 0, syserror.ENOTTY
-}
-
-// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
-// FileDescriptionImpl that always represent directories to obtain
-// implementations of non-directory I/O methods that return EISDIR, and
-// implementations of other methods consistent with FileDescriptionDefaultImpl.
-type DirectoryFileDescriptionDefaultImpl struct {
- FileDescriptionDefaultImpl
-}
-
-// PRead implements FileDescriptionImpl.PRead.
-func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
- return 0, syserror.EISDIR
-}
-
-// Read implements FileDescriptionImpl.Read.
-func (DirectoryFileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
- return 0, syserror.EISDIR
-}
-
-// PWrite implements FileDescriptionImpl.PWrite.
-func (DirectoryFileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
- return 0, syserror.EISDIR
-}
-
-// Write implements FileDescriptionImpl.Write.
-func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
- return 0, syserror.EISDIR
-}
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
deleted file mode 100644
index 7a074b718..000000000
--- a/pkg/sentry/vfs/filesystem.go
+++ /dev/null
@@ -1,155 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "sync/atomic"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
-)
-
-// A Filesystem is a tree of nodes represented by Dentries, which forms part of
-// a VirtualFilesystem.
-//
-// Filesystems are reference-counted. Unless otherwise specified, all
-// Filesystem methods require that a reference is held.
-//
-// Filesystem is analogous to Linux's struct super_block.
-type Filesystem struct {
- // refs is the reference count. refs is accessed using atomic memory
- // operations.
- refs int64
-
- // impl is the FilesystemImpl associated with this Filesystem. impl is
- // immutable. This should be the last field in Filesystem.
- impl FilesystemImpl
-}
-
-// Init must be called before first use of fs.
-func (fs *Filesystem) Init(impl FilesystemImpl) {
- fs.refs = 1
- fs.impl = impl
-}
-
-// Impl returns the FilesystemImpl associated with fs.
-func (fs *Filesystem) Impl() FilesystemImpl {
- return fs.impl
-}
-
-func (fs *Filesystem) incRef() {
- if atomic.AddInt64(&fs.refs, 1) <= 1 {
- panic("Filesystem.incRef() called without holding a reference")
- }
-}
-
-func (fs *Filesystem) decRef() {
- if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 {
- fs.impl.Release()
- } else if refs < 0 {
- panic("Filesystem.decRef() called without holding a reference")
- }
-}
-
-// FilesystemImpl contains implementation details for a Filesystem.
-// Implementations of FilesystemImpl should contain their associated Filesystem
-// by value as their first field.
-//
-// All methods that take a ResolvingPath must resolve the path before
-// performing any other checks, including rejection of the operation if not
-// supported by the FilesystemImpl. This is because the final FilesystemImpl
-// (responsible for actually implementing the operation) isn't known until path
-// resolution is complete.
-//
-// For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid
-// should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID
-// and auth.KGID respectively).
-//
-// FilesystemImpl combines elements of Linux's struct super_operations and
-// struct inode_operations, for reasons described in the documentation for
-// Dentry.
-type FilesystemImpl interface {
- // Release is called when the associated Filesystem reaches zero
- // references.
- Release()
-
- // Sync "causes all pending modifications to filesystem metadata and cached
- // file data to be written to the underlying [filesystem]", as by syncfs(2).
- Sync(ctx context.Context) error
-
- // GetDentryAt returns a Dentry representing the file at rp. A reference is
- // taken on the returned Dentry.
- //
- // GetDentryAt does not correspond directly to a Linux syscall; it is used
- // in the implementation of:
- //
- // - Syscalls that need to resolve two paths: rename(), renameat(),
- // renameat2(), link(), linkat().
- //
- // - Syscalls that need to refer to a filesystem position outside the
- // context of a file description: chdir(), fchdir(), chroot(), mount(),
- // umount().
- GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error)
-
- // LinkAt creates a hard link at rp representing the same file as vd. It
- // does not take ownership of references on vd.
- //
- // The implementation is responsible for checking that vd.Mount() ==
- // rp.Mount(), and that vd does not represent a directory.
- LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error
-
- // MkdirAt creates a directory at rp.
- MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error
-
- // MknodAt creates a regular file, device special file, or named pipe at
- // rp.
- MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error
-
- // OpenAt returns a FileDescription providing access to the file at rp. A
- // reference is taken on the returned FileDescription.
- OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error)
-
- // ReadlinkAt returns the target of the symbolic link at rp.
- ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error)
-
- // RenameAt renames the Dentry represented by vd to rp. It does not take
- // ownership of references on vd.
- //
- // The implementation is responsible for checking that vd.Mount() ==
- // rp.Mount().
- RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error
-
- // RmdirAt removes the directory at rp.
- RmdirAt(ctx context.Context, rp *ResolvingPath) error
-
- // SetStatAt updates metadata for the file at the given path.
- SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error
-
- // StatAt returns metadata for the file at rp.
- StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error)
-
- // StatFSAt returns metadata for the filesystem containing the file at rp.
- // (This method takes a path because a FilesystemImpl may consist of any
- // number of constituent filesystems.)
- StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error)
-
- // SymlinkAt creates a symbolic link at rp referring to the given target.
- SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error
-
- // UnlinkAt removes the non-directory file at rp.
- UnlinkAt(ctx context.Context, rp *ResolvingPath) error
-
- // TODO: d_path(); extended attributes; inotify_add_watch(); bind()
-}
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
deleted file mode 100644
index f401ad7f3..000000000
--- a/pkg/sentry/vfs/filesystem_type.go
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "fmt"
-
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-)
-
-// A FilesystemType constructs filesystems.
-//
-// FilesystemType is analogous to Linux's struct file_system_type.
-type FilesystemType interface {
- // NewFilesystem returns a Filesystem configured by the given options,
- // along with its mount root. A reference is taken on the returned
- // Filesystem and Dentry.
- NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error)
-}
-
-// NewFilesystemOptions contains options to FilesystemType.NewFilesystem.
-type NewFilesystemOptions struct {
- // Data is the string passed as the 5th argument to mount(2), which is
- // usually a comma-separated list of filesystem-specific mount options.
- Data string
-
- // InternalData holds opaque FilesystemType-specific data. There is
- // intentionally no way for applications to specify InternalData; if it is
- // not nil, the call to NewFilesystem originates from within the sentry.
- InternalData interface{}
-}
-
-// RegisterFilesystemType registers the given FilesystemType in vfs with the
-// given name.
-func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType) error {
- vfs.fsTypesMu.Lock()
- defer vfs.fsTypesMu.Unlock()
- if existing, ok := vfs.fsTypes[name]; ok {
- return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing)
- }
- vfs.fsTypes[name] = fsType
- return nil
-}
-
-// MustRegisterFilesystemType is equivalent to RegisterFilesystemType but
-// panics on failure.
-func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType) {
- if err := vfs.RegisterFilesystemType(name, fsType); err != nil {
- panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err))
- }
-}
-
-func (vfs *VirtualFilesystem) getFilesystemType(name string) FilesystemType {
- vfs.fsTypesMu.RLock()
- defer vfs.fsTypesMu.RUnlock()
- return vfs.fsTypes[name]
-}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
deleted file mode 100644
index 11702f720..000000000
--- a/pkg/sentry/vfs/mount.go
+++ /dev/null
@@ -1,411 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "math"
- "sync/atomic"
-
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
-// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
-// (Mount.fs), which applies to path resolution in the context of a particular
-// Mount (Mount.key.parent).
-//
-// Mounts are reference-counted. Unless otherwise specified, all Mount methods
-// require that a reference is held.
-//
-// Mount and Filesystem are distinct types because it's possible for a single
-// Filesystem to be mounted at multiple locations and/or in multiple mount
-// namespaces.
-//
-// Mount is analogous to Linux's struct mount. (gVisor does not distinguish
-// between struct mount and struct vfsmount.)
-type Mount struct {
- // The lower 63 bits of refs are a reference count. The MSB of refs is set
- // if the Mount has been eagerly unmounted, as by umount(2) without the
- // MNT_DETACH flag. refs is accessed using atomic memory operations.
- refs int64
-
- // The lower 63 bits of writers is the number of calls to
- // Mount.CheckBeginWrite() that have not yet been paired with a call to
- // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
- // writers is accessed using atomic memory operations.
- writers int64
-
- // key is protected by VirtualFilesystem.mountMu and
- // VirtualFilesystem.mounts.seq, and may be nil. References are held on
- // key.parent and key.point if they are not nil.
- //
- // Invariant: key.parent != nil iff key.point != nil. key.point belongs to
- // key.parent.fs.
- key mountKey
-
- // fs, root, and ns are immutable. References are held on fs and root (but
- // not ns).
- //
- // Invariant: root belongs to fs.
- fs *Filesystem
- root *Dentry
- ns *MountNamespace
-}
-
-// A MountNamespace is a collection of Mounts.
-//
-// MountNamespaces are reference-counted. Unless otherwise specified, all
-// MountNamespace methods require that a reference is held.
-//
-// MountNamespace is analogous to Linux's struct mnt_namespace.
-type MountNamespace struct {
- refs int64 // accessed using atomic memory operations
-
- // root is the MountNamespace's root mount. root is immutable.
- root *Mount
-
- // mountpoints contains all Dentries which are mount points in this
- // namespace. mountpoints is protected by VirtualFilesystem.mountMu.
- //
- // mountpoints is used to determine if a Dentry can be moved or removed
- // (which requires that the Dentry is not a mount point in the calling
- // namespace).
- //
- // mountpoints is maintained even if there are no references held on the
- // MountNamespace; this is required to ensure that
- // VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate
- // correctly on unreferenced MountNamespaces.
- mountpoints map[*Dentry]struct{}
-}
-
-// NewMountNamespace returns a new mount namespace with a root filesystem
-// configured by the given arguments. A reference is taken on the returned
-// MountNamespace.
-func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *NewFilesystemOptions) (*MountNamespace, error) {
- fsType := vfs.getFilesystemType(fsTypeName)
- if fsType == nil {
- return nil, syserror.ENODEV
- }
- fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts)
- if err != nil {
- return nil, err
- }
- mntns := &MountNamespace{
- refs: 1,
- mountpoints: make(map[*Dentry]struct{}),
- }
- mntns.root = &Mount{
- fs: fs,
- root: root,
- ns: mntns,
- refs: 1,
- }
- return mntns, nil
-}
-
-// NewMount creates and mounts a new Filesystem.
-func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *NewFilesystemOptions) error {
- fsType := vfs.getFilesystemType(fsTypeName)
- if fsType == nil {
- return syserror.ENODEV
- }
- fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts)
- if err != nil {
- return err
- }
- // We can't hold vfs.mountMu while calling FilesystemImpl methods due to
- // lock ordering.
- vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
- if err != nil {
- root.decRef(fs)
- fs.decRef()
- return err
- }
- vfs.mountMu.Lock()
- for {
- if vd.dentry.IsDisowned() {
- vfs.mountMu.Unlock()
- vd.DecRef()
- root.decRef(fs)
- fs.decRef()
- return syserror.ENOENT
- }
- // vd might have been mounted over between vfs.GetDentryAt() and
- // vfs.mountMu.Lock().
- if !vd.dentry.isMounted() {
- break
- }
- nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry)
- if nextmnt == nil {
- break
- }
- nextmnt.incRef()
- nextmnt.root.incRef(nextmnt.fs)
- vd.DecRef()
- vd = VirtualDentry{
- mount: nextmnt,
- dentry: nextmnt.root,
- }
- }
- // TODO: Linux requires that either both the mount point and the mount root
- // are directories, or neither are, and returns ENOTDIR if this is not the
- // case.
- mntns := vd.mount.ns
- mnt := &Mount{
- fs: fs,
- root: root,
- ns: mntns,
- refs: 1,
- }
- mnt.storeKey(vd.mount, vd.dentry)
- atomic.AddUint32(&vd.dentry.mounts, 1)
- mntns.mountpoints[vd.dentry] = struct{}{}
- vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
- if !ok {
- vfsmpmounts = make(map[*Mount]struct{})
- vfs.mountpoints[vd.dentry] = vfsmpmounts
- }
- vfsmpmounts[mnt] = struct{}{}
- vfs.mounts.Insert(mnt)
- vfs.mountMu.Unlock()
- return nil
-}
-
-// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
-// a reference on the returned Mount. If (mnt, d) is not a mount point,
-// getMountAt returns nil.
-//
-// getMountAt is analogous to Linux's fs/namei.c:follow_mount().
-//
-// Preconditions: References are held on mnt and d.
-func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount {
- // The first mount is special-cased:
- //
- // - The caller is assumed to have checked d.isMounted() already. (This
- // isn't a precondition because it doesn't matter for correctness.)
- //
- // - We return nil, instead of mnt, if there is no mount at (mnt, d).
- //
- // - We don't drop the caller's references on mnt and d.
-retryFirst:
- next := vfs.mounts.Lookup(mnt, d)
- if next == nil {
- return nil
- }
- if !next.tryIncMountedRef() {
- // Raced with umount.
- goto retryFirst
- }
- mnt = next
- d = next.root
- // We don't need to take Dentry refs anywhere in this function because
- // Mounts hold references on Mount.root, which is immutable.
- for d.isMounted() {
- next := vfs.mounts.Lookup(mnt, d)
- if next == nil {
- break
- }
- if !next.tryIncMountedRef() {
- // Raced with umount.
- continue
- }
- mnt.decRef()
- mnt = next
- d = next.root
- }
- return mnt
-}
-
-// getMountpointAt returns the mount point for the stack of Mounts including
-// mnt. It takes a reference on the returned Mount and Dentry. If no such mount
-// point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil).
-//
-// The walk is lock-free: each (parent, point) pair is read under
-// vfs.mounts.seq, references are then acquired with the try* variants, and
-// the epoch is re-validated so that a concurrent change to Mount.key causes a
-// retry rather than the use of a stale pair.
-//
-// Preconditions: References are held on mnt and root. vfsroot is not (mnt,
-// mnt.root).
-func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) (*Mount, *Dentry) {
- // The first mount is special-cased:
- //
- // - The caller must have already checked mnt against vfsroot.
- //
- // - We return nil, instead of mnt, if there is no mount point for mnt.
- //
- // - We don't drop the caller's reference on mnt.
-retryFirst:
- epoch := vfs.mounts.seq.BeginRead()
- parent, point := mnt.loadKey()
- if !vfs.mounts.seq.ReadOk(epoch) {
- goto retryFirst
- }
- if parent == nil {
- return nil, nil
- }
- if !parent.tryIncMountedRef() {
- // Raced with umount.
- goto retryFirst
- }
- if !point.tryIncRef(parent.fs) {
- // Since Mount holds a reference on Mount.key.point, this can only
- // happen due to a racing change to Mount.key.
- parent.decRef()
- goto retryFirst
- }
- if !vfs.mounts.seq.ReadOk(epoch) {
- // Mount.key may have changed between loadKey() and taking the
- // references above, in which case (parent, point) is stale. Drop the
- // references and retry, exactly as the loop below does for every
- // subsequent mount.
- point.decRef(parent.fs)
- parent.decRef()
- goto retryFirst
- }
- mnt = parent
- d := point
- for {
- if mnt == vfsroot.mount && d == vfsroot.dentry {
- break
- }
- if d != mnt.root {
- break
- }
- retryNotFirst:
- epoch := vfs.mounts.seq.BeginRead()
- parent, point := mnt.loadKey()
- if !vfs.mounts.seq.ReadOk(epoch) {
- goto retryNotFirst
- }
- if parent == nil {
- break
- }
- if !parent.tryIncMountedRef() {
- // Raced with umount.
- goto retryNotFirst
- }
- if !point.tryIncRef(parent.fs) {
- // Since Mount holds a reference on Mount.key.point, this can
- // only happen due to a racing change to Mount.key.
- parent.decRef()
- goto retryNotFirst
- }
- if !vfs.mounts.seq.ReadOk(epoch) {
- point.decRef(parent.fs)
- parent.decRef()
- goto retryNotFirst
- }
- // Hand-over-hand: release the references on the previous (mnt, d)
- // only after the next pair has been pinned and validated.
- d.decRef(mnt.fs)
- mnt.decRef()
- mnt = parent
- d = point
- }
- return mnt, d
-}
-
-// tryIncMountedRef increments mnt's reference count and returns true. If mnt's
-// reference count is already zero, or has been eagerly unmounted,
-// tryIncMountedRef does nothing and returns false.
-//
-// tryIncMountedRef does not require that a reference is held on mnt.
-func (mnt *Mount) tryIncMountedRef() bool {
- for {
- refs := atomic.LoadInt64(&mnt.refs)
- if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted
- return false
- }
- if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) {
- return true
- }
- }
-}
-
-func (mnt *Mount) incRef() {
- // In general, negative values for mnt.refs are valid because the MSB is
- // the eager-unmount bit.
- atomic.AddInt64(&mnt.refs, 1)
-}
-
-func (mnt *Mount) decRef() {
- refs := atomic.AddInt64(&mnt.refs, -1)
- if refs&^math.MinInt64 == 0 { // mask out MSB
- parent, point := mnt.loadKey()
- if point != nil {
- point.decRef(parent.fs)
- parent.decRef()
- }
- mnt.root.decRef(mnt.fs)
- mnt.fs.decRef()
- }
-}
-
-// CheckBeginWrite increments the counter of in-progress write operations on
-// mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns
-// EROFS.
-//
-// If CheckBeginWrite succeeds, EndWrite must be called when the write
-// operation is finished.
-func (mnt *Mount) CheckBeginWrite() error {
- if atomic.AddInt64(&mnt.writers, 1) < 0 {
- atomic.AddInt64(&mnt.writers, -1)
- return syserror.EROFS
- }
- return nil
-}
-
-// EndWrite indicates that a write operation signaled by a previous successful
-// call to CheckBeginWrite has finished.
-func (mnt *Mount) EndWrite() {
- atomic.AddInt64(&mnt.writers, -1)
-}
-
-// Preconditions: VirtualFilesystem.mountMu must be locked for writing.
-func (mnt *Mount) setReadOnlyLocked(ro bool) error {
- if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro {
- return nil
- }
- if ro {
- if !atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) {
- return syserror.EBUSY
- }
- return nil
- }
- // Unset MSB without dropping any temporary increments from failed calls to
- // mnt.CheckBeginWrite().
- atomic.AddInt64(&mnt.writers, math.MinInt64)
- return nil
-}
-
-// Filesystem returns the mounted Filesystem. It does not take a reference on
-// the returned Filesystem.
-func (mnt *Mount) Filesystem() *Filesystem {
- return mnt.fs
-}
-
-// IncRef increments mntns' reference count.
-func (mntns *MountNamespace) IncRef() {
- if atomic.AddInt64(&mntns.refs, 1) <= 1 {
- panic("MountNamespace.IncRef() called without holding a reference")
- }
-}
-
-// DecRef drops one reference on mntns; it panics if no reference was held.
-func (mntns *MountNamespace) DecRef() {
- if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 {
- // TODO: unmount mntns.root
- } else if refs < 0 {
- panic("MountNamespace.DecRef() called without holding a reference")
- }
-}
-
-// Root returns mntns' root. A reference is taken on the returned
-// VirtualDentry.
-func (mntns *MountNamespace) Root() VirtualDentry {
- vd := VirtualDentry{
- mount: mntns.root,
- dentry: mntns.root.root,
- }
- vd.IncRef()
- return vd
-}
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
deleted file mode 100644
index f394d7483..000000000
--- a/pkg/sentry/vfs/mount_test.go
+++ /dev/null
@@ -1,465 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "fmt"
- "runtime"
- "sync"
- "testing"
-)
-
-// TestMountTableLookupEmpty checks that Lookup on a freshly initialized
-// mountTable returns nil.
-func TestMountTableLookupEmpty(t *testing.T) {
-	var table mountTable
-	table.Init()
-
-	got := table.Lookup(&Mount{}, &Dentry{})
-	if got != nil {
-		t.Errorf("empty mountTable lookup: got %p, wanted nil", got)
-	}
-}
-
-// TestMountTableInsertLookup inserts one Mount and checks that Lookup finds it
-// by its exact (parent, point) key and misses when either half of the key
-// differs.
-func TestMountTableInsertLookup(t *testing.T) {
-	var table mountTable
-	table.Init()
-
-	inserted := &Mount{}
-	inserted.storeKey(&Mount{}, &Dentry{})
-	table.Insert(inserted)
-
-	got := table.Lookup(inserted.parent(), inserted.point())
-	if got != inserted {
-		t.Errorf("mountTable positive lookup: got %p, wanted %p", got, inserted)
-	}
-
-	// Same mount point, different parent.
-	got = table.Lookup(&Mount{}, inserted.point())
-	if got != nil {
-		t.Errorf("mountTable lookup with wrong mount parent: got %p, wanted nil", got)
-	}
-
-	// Same parent, different mount point.
-	got = table.Lookup(inserted.parent(), &Dentry{})
-	if got != nil {
-		t.Errorf("mountTable lookup with wrong mount point: got %p, wanted nil", got)
-	}
-}
-
-// TODO: concurrent lookup/insertion/removal
-
-// must be powers of 2
-var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8}
-
-// For all of the following:
-//
-// - BenchmarkMountTableFoo tests usage pattern "Foo" for mountTable.
-//
-// - BenchmarkMountMapFoo tests usage pattern "Foo" for a
-// sync.RWMutex-protected map. (Mutator benchmarks do not use a RWMutex, since
-// mountTable also requires external synchronization between mutators.)
-//
-// - BenchmarkMountSyncMapFoo tests usage pattern "Foo" for a sync.Map.
-//
-// ParallelLookup is by far the most common and performance-sensitive operation
-// for this application. NegativeLookup is also important, but less so (only
-// relevant with multiple mount namespaces and significant differences in
-// mounts between them). Insertion and removal are benchmarked for
-// completeness.
-const enableComparativeBenchmarks = false
-
-func newBenchMount() *Mount {
- mount := &Mount{}
- mount.storeKey(&Mount{}, &Dentry{})
- return mount
-}
-
-func vdkey(mnt *Mount) VirtualDentry {
- parent, point := mnt.loadKey()
- return VirtualDentry{
- mount: parent,
- dentry: point,
- }
-}
-
-// BenchmarkMountTableParallelLookup measures successful mountTable.Lookup
-// calls issued concurrently from numG goroutines (1, 2, 4, ... up to
-// GOMAXPROCS) against tables holding each size in benchNumMounts.
-func BenchmarkMountTableParallelLookup(b *testing.B) {
-	for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 {
-		for _, numMounts := range benchNumMounts {
-			desc := fmt.Sprintf("%dx%d", numG, numMounts)
-			b.Run(desc, func(b *testing.B) {
-				var mt mountTable
-				mt.Init()
-				keys := make([]VirtualDentry, 0, numMounts)
-				for i := 0; i < numMounts; i++ {
-					mount := newBenchMount()
-					mt.Insert(mount)
-					keys = append(keys, vdkey(mount))
-				}
-
-				// ready: all workers have started. begin: closed to release
-				// them together. end: all workers have finished.
-				var ready sync.WaitGroup
-				begin := make(chan struct{})
-				var end sync.WaitGroup
-				for g := 0; g < numG; g++ {
-					ready.Add(1)
-					end.Add(1)
-					go func() {
-						defer end.Done()
-						ready.Done()
-						<-begin
-						// Failures are reported with Errorf followed by
-						// return: Fatalf (FailNow) must only be called from
-						// the goroutine running the benchmark function.
-						for i := 0; i < b.N; i++ {
-							// numMounts is a power of 2, so the mask cycles
-							// through keys.
-							k := keys[i&(numMounts-1)]
-							m := mt.Lookup(k.mount, k.dentry)
-							if m == nil {
-								b.Errorf("lookup failed")
-								return
-							}
-							if parent := m.parent(); parent != k.mount {
-								b.Errorf("lookup returned mount with parent %p, wanted %p", parent, k.mount)
-								return
-							}
-							if point := m.point(); point != k.dentry {
-								b.Errorf("lookup returned mount with point %p, wanted %p", point, k.dentry)
-								return
-							}
-						}
-					}()
-				}
-
-				ready.Wait()
-				b.ResetTimer()
-				close(begin)
-				end.Wait()
-			})
-		}
-	}
-}
-
-// BenchmarkMountMapParallelLookup is the comparative counterpart of
-// BenchmarkMountTableParallelLookup using a sync.RWMutex-protected map. It is
-// skipped unless enableComparativeBenchmarks is true.
-func BenchmarkMountMapParallelLookup(b *testing.B) {
-	if !enableComparativeBenchmarks {
-		b.Skipf("comparative benchmarks are disabled")
-	}
-
-	for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 {
-		for _, numMounts := range benchNumMounts {
-			desc := fmt.Sprintf("%dx%d", numG, numMounts)
-			b.Run(desc, func(b *testing.B) {
-				var mu sync.RWMutex
-				ms := make(map[VirtualDentry]*Mount)
-				keys := make([]VirtualDentry, 0, numMounts)
-				for i := 0; i < numMounts; i++ {
-					mount := newBenchMount()
-					key := vdkey(mount)
-					ms[key] = mount
-					keys = append(keys, key)
-				}
-
-				// ready: all workers have started. begin: closed to release
-				// them together. end: all workers have finished.
-				var ready sync.WaitGroup
-				begin := make(chan struct{})
-				var end sync.WaitGroup
-				for g := 0; g < numG; g++ {
-					ready.Add(1)
-					end.Add(1)
-					go func() {
-						defer end.Done()
-						ready.Done()
-						<-begin
-						// Failures are reported with Errorf followed by
-						// return: Fatalf (FailNow) must only be called from
-						// the goroutine running the benchmark function.
-						for i := 0; i < b.N; i++ {
-							k := keys[i&(numMounts-1)]
-							mu.RLock()
-							m := ms[k]
-							mu.RUnlock()
-							if m == nil {
-								b.Errorf("lookup failed")
-								return
-							}
-							if parent := m.parent(); parent != k.mount {
-								b.Errorf("lookup returned mount with parent %p, wanted %p", parent, k.mount)
-								return
-							}
-							if point := m.point(); point != k.dentry {
-								b.Errorf("lookup returned mount with point %p, wanted %p", point, k.dentry)
-								return
-							}
-						}
-					}()
-				}
-
-				ready.Wait()
-				b.ResetTimer()
-				close(begin)
-				end.Wait()
-			})
-		}
-	}
-}
-
-// BenchmarkMountSyncMapParallelLookup is the comparative counterpart of
-// BenchmarkMountTableParallelLookup using a sync.Map. It is skipped unless
-// enableComparativeBenchmarks is true.
-func BenchmarkMountSyncMapParallelLookup(b *testing.B) {
-	if !enableComparativeBenchmarks {
-		b.Skipf("comparative benchmarks are disabled")
-	}
-
-	for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 {
-		for _, numMounts := range benchNumMounts {
-			desc := fmt.Sprintf("%dx%d", numG, numMounts)
-			b.Run(desc, func(b *testing.B) {
-				var ms sync.Map
-				keys := make([]VirtualDentry, 0, numMounts)
-				for i := 0; i < numMounts; i++ {
-					mount := newBenchMount()
-					key := vdkey(mount)
-					ms.Store(key, mount)
-					keys = append(keys, key)
-				}
-
-				// ready: all workers have started. begin: closed to release
-				// them together. end: all workers have finished.
-				var ready sync.WaitGroup
-				begin := make(chan struct{})
-				var end sync.WaitGroup
-				for g := 0; g < numG; g++ {
-					ready.Add(1)
-					end.Add(1)
-					go func() {
-						defer end.Done()
-						ready.Done()
-						<-begin
-						// Failures are reported with Errorf followed by
-						// return: Fatalf (FailNow) must only be called from
-						// the goroutine running the benchmark function.
-						for i := 0; i < b.N; i++ {
-							k := keys[i&(numMounts-1)]
-							mi, ok := ms.Load(k)
-							if !ok {
-								b.Errorf("lookup failed")
-								return
-							}
-							m := mi.(*Mount)
-							if parent := m.parent(); parent != k.mount {
-								b.Errorf("lookup returned mount with parent %p, wanted %p", parent, k.mount)
-								return
-							}
-							if point := m.point(); point != k.dentry {
-								b.Errorf("lookup returned mount with point %p, wanted %p", point, k.dentry)
-								return
-							}
-						}
-					}()
-				}
-
-				ready.Wait()
-				b.ResetTimer()
-				close(begin)
-				end.Wait()
-			})
-		}
-	}
-}
-
-// BenchmarkMountTableNegativeLookup measures mountTable.Lookup for keys that
-// were never inserted, for each table size in benchNumMounts.
-func BenchmarkMountTableNegativeLookup(b *testing.B) {
-	for _, numMounts := range benchNumMounts {
-		b.Run(fmt.Sprintf("%d", numMounts), func(b *testing.B) {
-			var table mountTable
-			table.Init()
-			for n := numMounts; n > 0; n-- {
-				table.Insert(newBenchMount())
-			}
-			// Keys built from freshly allocated pointers cannot match any
-			// inserted mount.
-			absent := make([]VirtualDentry, numMounts)
-			for i := range absent {
-				absent[i] = VirtualDentry{mount: &Mount{}, dentry: &Dentry{}}
-			}
-			idxMask := numMounts - 1
-
-			b.ResetTimer()
-			for i := 0; i < b.N; i++ {
-				key := absent[i&idxMask]
-				if got := table.Lookup(key.mount, key.dentry); got != nil {
-					b.Fatalf("lookup got %p, wanted nil", got)
-				}
-			}
-		})
-	}
-}
-
-// BenchmarkMountMapNegativeLookup measures lookups of absent keys in a
-// sync.RWMutex-protected map. It is skipped unless
-// enableComparativeBenchmarks is true.
-func BenchmarkMountMapNegativeLookup(b *testing.B) {
-	if !enableComparativeBenchmarks {
-		b.Skipf("comparative benchmarks are disabled")
-	}
-
-	for _, numMounts := range benchNumMounts {
-		b.Run(fmt.Sprintf("%d", numMounts), func(b *testing.B) {
-			var mu sync.RWMutex
-			mounts := make(map[VirtualDentry]*Mount)
-			for n := numMounts; n > 0; n-- {
-				m := newBenchMount()
-				mounts[vdkey(m)] = m
-			}
-			// Keys built from freshly allocated pointers cannot match any
-			// stored mount.
-			absent := make([]VirtualDentry, numMounts)
-			for i := range absent {
-				absent[i] = VirtualDentry{mount: &Mount{}, dentry: &Dentry{}}
-			}
-			idxMask := numMounts - 1
-
-			b.ResetTimer()
-			for i := 0; i < b.N; i++ {
-				key := absent[i&idxMask]
-				mu.RLock()
-				got := mounts[key]
-				mu.RUnlock()
-				if got != nil {
-					b.Fatalf("lookup got %p, wanted nil", got)
-				}
-			}
-		})
-	}
-}
-
-// BenchmarkMountSyncMapNegativeLookup measures lookups of absent keys in a
-// sync.Map. It is skipped unless enableComparativeBenchmarks is true.
-func BenchmarkMountSyncMapNegativeLookup(b *testing.B) {
-	if !enableComparativeBenchmarks {
-		b.Skipf("comparative benchmarks are disabled")
-	}
-
-	for _, numMounts := range benchNumMounts {
-		b.Run(fmt.Sprintf("%d", numMounts), func(b *testing.B) {
-			var mounts sync.Map
-			for n := numMounts; n > 0; n-- {
-				m := newBenchMount()
-				mounts.Store(vdkey(m), m)
-			}
-			// Keys built from freshly allocated pointers cannot match any
-			// stored mount.
-			absent := make([]VirtualDentry, numMounts)
-			for i := range absent {
-				absent[i] = VirtualDentry{mount: &Mount{}, dentry: &Dentry{}}
-			}
-			idxMask := numMounts - 1
-
-			b.ResetTimer()
-			for i := 0; i < b.N; i++ {
-				got, _ := mounts.Load(absent[i&idxMask])
-				if got != nil {
-					b.Fatalf("lookup got %p, wanted nil", got)
-				}
-			}
-		})
-	}
-}
-
-// BenchmarkMountTableInsert measures inserting b.N distinct Mounts into a
-// single mountTable.
-func BenchmarkMountTableInsert(b *testing.B) {
-	// Build every Mount up front so the timed region excludes allocation.
-	mounts := make([]*Mount, b.N)
-	for i := range mounts {
-		mounts[i] = newBenchMount()
-	}
-
-	var table mountTable
-	table.Init()
-	b.ResetTimer()
-	for _, m := range mounts {
-		table.Insert(m)
-	}
-}
-
-// BenchmarkMountMapInsert measures inserting b.N distinct Mounts into a plain
-// map. It is skipped unless enableComparativeBenchmarks is true.
-func BenchmarkMountMapInsert(b *testing.B) {
-	if !enableComparativeBenchmarks {
-		b.Skipf("comparative benchmarks are disabled")
-	}
-
-	// Build every Mount up front so the timed region excludes allocation.
-	mounts := make([]*Mount, b.N)
-	for i := range mounts {
-		mounts[i] = newBenchMount()
-	}
-
-	byKey := make(map[VirtualDentry]*Mount)
-	b.ResetTimer()
-	for _, m := range mounts {
-		byKey[vdkey(m)] = m
-	}
-}
-
-// BenchmarkMountSyncMapInsert measures inserting b.N distinct Mounts into a
-// sync.Map. It is skipped unless enableComparativeBenchmarks is true.
-func BenchmarkMountSyncMapInsert(b *testing.B) {
-	if !enableComparativeBenchmarks {
-		b.Skipf("comparative benchmarks are disabled")
-	}
-
-	// Build every Mount up front so the timed region excludes allocation.
-	mounts := make([]*Mount, b.N)
-	for i := range mounts {
-		mounts[i] = newBenchMount()
-	}
-
-	var byKey sync.Map
-	b.ResetTimer()
-	for _, m := range mounts {
-		byKey.Store(vdkey(m), m)
-	}
-}
-
-// BenchmarkMountTableRemove measures removing b.N Mounts from a mountTable,
-// in the same order in which they were inserted.
-func BenchmarkMountTableRemove(b *testing.B) {
-	mounts := make([]*Mount, b.N)
-	for i := range mounts {
-		mounts[i] = newBenchMount()
-	}
-	var table mountTable
-	table.Init()
-	for _, m := range mounts {
-		table.Insert(m)
-	}
-
-	b.ResetTimer()
-	for _, m := range mounts {
-		table.Remove(m)
-	}
-}
-
-// BenchmarkMountMapRemove measures deleting b.N Mounts from a plain map. It is
-// skipped unless enableComparativeBenchmarks is true.
-func BenchmarkMountMapRemove(b *testing.B) {
-	if !enableComparativeBenchmarks {
-		b.Skipf("comparative benchmarks are disabled")
-	}
-
-	mounts := make([]*Mount, b.N)
-	for i := range mounts {
-		mounts[i] = newBenchMount()
-	}
-	byKey := make(map[VirtualDentry]*Mount)
-	for _, m := range mounts {
-		byKey[vdkey(m)] = m
-	}
-
-	b.ResetTimer()
-	for _, m := range mounts {
-		delete(byKey, vdkey(m))
-	}
-}
-
-// BenchmarkMountSyncMapRemove measures deleting b.N Mounts from a sync.Map. It
-// is skipped unless enableComparativeBenchmarks is true.
-func BenchmarkMountSyncMapRemove(b *testing.B) {
-	if !enableComparativeBenchmarks {
-		b.Skipf("comparative benchmarks are disabled")
-	}
-
-	mounts := make([]*Mount, b.N)
-	for i := range mounts {
-		mounts[i] = newBenchMount()
-	}
-	var byKey sync.Map
-	for _, m := range mounts {
-		byKey.Store(vdkey(m), m)
-	}
-
-	b.ResetTimer()
-	for _, m := range mounts {
-		byKey.Delete(vdkey(m))
-	}
-}
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
deleted file mode 100644
index b0511aa40..000000000
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ /dev/null
@@ -1,356 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// +build go1.12
-// +build !go1.14
-
-// Check go:linkname function signatures when updating Go version.
-
-package vfs
-
-import (
- "fmt"
- "math/bits"
- "reflect"
- "sync/atomic"
- "unsafe"
-
- "gvisor.dev/gvisor/third_party/gvsync"
-)
-
-// mountKey represents the location at which a Mount is mounted. It is
-// structurally identical to VirtualDentry, but stores its fields as
-// unsafe.Pointer since mutators synchronize with VFS path traversal using
-// seqcounts.
-type mountKey struct {
- parent unsafe.Pointer // *Mount
- point unsafe.Pointer // *Dentry
-}
-
-// storeKey records (parent, point) as the location at which mnt is mounted.
-// Each field is written with its own atomic store, parent first and then
-// point.
-//
-// Preconditions: mnt.key's fields are nil. parent and point are non-nil.
-func (mnt *Mount) storeKey(parent *Mount, point *Dentry) {
- atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent))
- atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point))
-}
-
-// loadKey returns mnt's mount parent and mount point. Each field is read with
-// its own atomic load; the pair is not read as a single atomic unit.
-func (mnt *Mount) loadKey() (*Mount, *Dentry) {
-	return mnt.parent(), mnt.point()
-}
-
-// parent atomically loads and returns the parent half of mnt's key, i.e. the
-// Mount that mnt is mounted on.
-func (mnt *Mount) parent() *Mount {
- return (*Mount)(atomic.LoadPointer(&mnt.key.parent))
-}
-
-// point atomically loads and returns the point half of mnt's key, i.e. the
-// Dentry at which mnt is mounted.
-func (mnt *Mount) point() *Dentry {
- return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
-}
-
-// mountTable maps (mount parent, mount point) pairs to mounts. It supports
-// efficient concurrent lookup, even in the presence of concurrent mutators
-// (provided mutation is sufficiently uncommon).
-//
-// mountTable.Init() must be called on new mountTables before use.
-type mountTable struct {
- // mountTable is implemented as a seqcount-protected hash table that
- // resolves collisions with linear probing, featuring Robin Hood insertion
- // and backward shift deletion. These minimize probe length variance,
- // significantly improving the performance of linear probing at high load
- // factors. (mountTable doesn't use bucketing, which is the other major
- // technique commonly used in high-performance hash tables; the efficiency
- // of bucketing is largely due to SIMD lookup, and Go lacks both SIMD
- // intrinsics and inline assembly, limiting the performance of this
- // approach.)
-
- seq gvsync.SeqCount
- seed uint32 // for hashing keys
-
- // size holds both length (number of elements) and capacity (number of
- // slots): capacity is stored as its base-2 log (referred to as order) in
- // the least significant bits of size, and length is stored in the
- // remaining bits. Go defines bit shifts >= width of shifted unsigned
- // operand as shifting to 0, which differs from x86's SHL, so the Go
- // compiler inserts a bounds check for each bit shift unless we mask order
- // anyway (cf. runtime.bucketShift()), and length isn't used by lookup;
- // thus this bit packing gets us more bits for the length (vs. storing
- // length and cap in separate uint32s) for ~free.
- size uint64
-
- slots unsafe.Pointer // []mountSlot; never nil after Init
-}
-
-type mountSlot struct {
- // We don't store keys in slots; instead, we just check Mount.parent and
- // Mount.point directly. Any practical use of lookup will need to touch
- // Mounts anyway, and comparing hashes means that false positives are
- // extremely rare, so this isn't an extra cache line touch overall.
- value unsafe.Pointer // *Mount
- hash uintptr
-}
-
-const (
- mtSizeOrderBits = 6 // log2 of pointer size in bits
- mtSizeOrderMask = (1 << mtSizeOrderBits) - 1
- mtSizeOrderOne = 1
- mtSizeLenLSB = mtSizeOrderBits
- mtSizeLenOne = 1 << mtSizeLenLSB
- mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB
-
- mountSlotBytes = unsafe.Sizeof(mountSlot{})
- mountKeyBytes = unsafe.Sizeof(mountKey{})
-
- // Tuning parameters.
- //
- // Essentially every mountTable will contain at least /proc, /sys, and
- // /dev/shm, so there is ~no reason for mtInitCap to be < 4.
- mtInitOrder = 2
- mtInitCap = 1 << mtInitOrder
- mtMaxLoadNum = 13
- mtMaxLoadDen = 16
-)
-
-// init validates, at package initialization, the constants that the
-// mountTable implementation depends on, and panics if any check fails.
-func init() {
-	// We can't just define mtSizeOrderBits as follows because Go doesn't have
-	// constexpr.
-	if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) {
-		panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits))
-	}
-	// Slot offsets are wrapped with a bit mask, which requires the slot size
-	// to be a power of 2. The message names the type being measured
-	// (mountSlot), not the constant that already holds its size.
-	if bits.OnesCount(uint(mountSlotBytes)) != 1 {
-		panic(fmt.Sprintf("sizeof(mountSlot) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes))
-	}
-	if mtInitCap <= 1 {
-		panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap))
-	}
-	// The maximum load factor must be strictly below 1.
-	if mtMaxLoadNum >= mtMaxLoadDen {
-		panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen))
-	}
-}
-
-// Init must be called exactly once on each mountTable before use.
-//
-// It picks a random hash seed, sets size to mtInitOrder (length 0, order
-// mtInitOrder), and allocates mtInitCap empty slots.
-func (mt *mountTable) Init() {
- mt.seed = rand32()
- mt.size = mtInitOrder
- mt.slots = newMountTableSlots(mtInitCap)
-}
-
-// newMountTableSlots allocates a zeroed []mountSlot of length and capacity cap
-// and returns a pointer to its backing array, obtained from the slice header's
-// Data field. Callers index the array with raw pointer arithmetic.
-func newMountTableSlots(cap uintptr) unsafe.Pointer {
- slice := make([]mountSlot, cap, cap)
- hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
- return unsafe.Pointer(hdr.Data)
-}
-
-// Lookup returns the Mount with the given parent, mounted at the given point.
-// If no such Mount exists, Lookup returns nil.
-//
-// Lookup may be called even if there are concurrent mutators of mt.
-//
-// Lookup hashes the (parent, point) key with mt.seed and probes linearly from
-// the slot selected by the hash. Every group of loads is validated against
-// mt.seq; if validation fails, the lookup restarts from the beginning.
-func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
- key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
- hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
-
-loop:
- for {
- // Take a snapshot of size and slots, and retry if the seqcount check
- // fails, since the two values might then not belong together.
- epoch := mt.seq.BeginRead()
- size := atomic.LoadUint64(&mt.size)
- slots := atomic.LoadPointer(&mt.slots)
- if !mt.seq.ReadOk(epoch) {
- continue
- }
- // The low bits of size hold log2 of the capacity. off and offmask are
- // byte offsets (pre-multiplied by mountSlotBytes), so advancing and
- // wrapping the probe position is one add and one AND.
- tcap := uintptr(1) << (size & mtSizeOrderMask)
- mask := tcap - 1
- off := (hash & mask) * mountSlotBytes
- offmask := mask * mountSlotBytes
- for {
- // This avoids bounds checking.
- slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
- slotValue := atomic.LoadPointer(&slot.value)
- slotHash := atomic.LoadUintptr(&slot.hash)
- if !mt.seq.ReadOk(epoch) {
- // The element we're looking for might have been moved into a
- // slot we've previously checked, so restart entirely.
- continue loop
- }
- // An empty slot terminates the probe sequence: the key is absent.
- if slotValue == nil {
- return nil
- }
- // Slots don't store keys, so a hash match must be confirmed against
- // the key stored in the Mount itself.
- if slotHash == hash {
- mount := (*Mount)(slotValue)
- var mountKey mountKey
- mountKey.parent = atomic.LoadPointer(&mount.key.parent)
- mountKey.point = atomic.LoadPointer(&mount.key.point)
- if !mt.seq.ReadOk(epoch) {
- continue loop
- }
- if key == mountKey {
- return mount
- }
- }
- off = (off + mountSlotBytes) & offmask
- }
- }
-}
-
-// Insert inserts the given mount into mt.
-//
-// If adding one element keeps the table at or below the maximum load factor
-// (mtMaxLoadNum/mtMaxLoadDen), the mount is inserted in place. Otherwise a
-// table with twice as many slots is built, filled with the existing elements
-// and the new mount, and then published in place of the old one.
-//
-// Preconditions: There are no concurrent mutators of mt. mt must not already
-// contain a Mount with the same mount point and parent.
-func (mt *mountTable) Insert(mount *Mount) {
- hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
-
- // We're under the maximum load factor if:
- //
- // (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen
- // (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap
- //
- // size packs the length above bit mtSizeLenLSB and the order (log2 of the
- // capacity) in the bits below it.
- tlen := mt.size >> mtSizeLenLSB
- order := mt.size & mtSizeOrderMask
- tcap := uintptr(1) << order
- if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) {
- // Atomically insert the new element into the table.
- mt.seq.BeginWrite()
- atomic.AddUint64(&mt.size, mtSizeLenOne)
- mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash)
- mt.seq.EndWrite()
- return
- }
-
- // Otherwise, we have to expand. Double the number of slots in the new
- // table.
- newOrder := order + 1
- if newOrder > mtSizeOrderMask {
- panic("mount table size overflow")
- }
- newCap := uintptr(1) << newOrder
- newSlots := newMountTableSlots(newCap)
- // Copy existing elements to the new table.
- oldCur := mt.slots
- // Go does not permit pointers to the end of allocated objects, so we
- // must use a pointer to the last element of the old table. The
- // following expression is equivalent to
- // `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2
- // arithmetic instructions instead of 3.
- oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes))
- for {
- oldSlot := (*mountSlot)(oldCur)
- if oldSlot.value != nil {
- // Don't need to lock mt.seq yet since newSlots isn't visible
- // to readers.
- mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash)
- }
- if oldCur == oldLast {
- break
- }
- oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes)
- }
- // Insert the new element into the new table.
- mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash)
- // Atomically switch to the new table. The single add bumps both the
- // length (by mtSizeLenOne) and the order (by mtSizeOrderOne).
- mt.seq.BeginWrite()
- atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne)
- atomic.StorePointer(&mt.slots, newSlots)
- mt.seq.EndWrite()
-}
-
-// mtInsertLocked stores (value, hash) into the table (slots, cap), probing
-// linearly from the slot selected by hash and applying Robin Hood swaps along
-// the way. It returns once an element has been written into an empty slot, so
-// the table must contain at least one empty slot.
-//
-// Preconditions: There are no concurrent mutators of the table (slots, cap).
-// If the table is visible to readers, then mt.seq must be in a writer critical
-// section. cap must be a power of 2.
-func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) {
- mask := cap - 1
- off := (hash & mask) * mountSlotBytes
- offmask := mask * mountSlotBytes
- // disp is how many slots the element being inserted currently sits past
- // its first-probed slot.
- disp := uintptr(0)
- for {
- slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
- slotValue := slot.value
- if slotValue == nil {
- atomic.StorePointer(&slot.value, value)
- atomic.StoreUintptr(&slot.hash, hash)
- return
- }
- // If we've been displaced farther from our first-probed slot than the
- // element stored in this one, swap elements and switch to inserting
- // the replaced one. (This is Robin Hood insertion.)
- slotHash := slot.hash
- // Displacement of the resident element: its slot index minus its hash,
- // reduced modulo the capacity.
- slotDisp := ((off / mountSlotBytes) - slotHash) & mask
- if disp > slotDisp {
- atomic.StorePointer(&slot.value, value)
- atomic.StoreUintptr(&slot.hash, hash)
- value = slotValue
- hash = slotHash
- disp = slotDisp
- }
- off = (off + mountSlotBytes) & offmask
- disp++
- }
-}
-
-// Remove removes the given mount from mt.
-//
-// It probes linearly from the slot selected by mount's key hash until it finds
-// the slot holding mount, then closes the gap by shifting later elements
-// backward. If mount is missing, Remove panics on reaching an empty slot only
-// when checkInvariants is true; otherwise the probe loop simply continues.
-//
-// Preconditions: There are no concurrent mutators of mt. mt must contain
-// mount.
-func (mt *mountTable) Remove(mount *Mount) {
- hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
- tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
- mask := tcap - 1
- slots := mt.slots
- off := (hash & mask) * mountSlotBytes
- offmask := mask * mountSlotBytes
- for {
- slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
- slotValue := slot.value
- if slotValue == unsafe.Pointer(mount) {
- // Found the element to remove. Move all subsequent elements
- // backward until we either find an empty slot, or an element that
- // is already in its first-probed slot. (This is backward shift
- // deletion.)
- mt.seq.BeginWrite()
- for {
- nextOff := (off + mountSlotBytes) & offmask
- nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff))
- nextSlotValue := nextSlot.value
- if nextSlotValue == nil {
- break
- }
- nextSlotHash := nextSlot.hash
- if (nextOff / mountSlotBytes) == (nextSlotHash & mask) {
- break
- }
- atomic.StorePointer(&slot.value, nextSlotValue)
- atomic.StoreUintptr(&slot.hash, nextSlotHash)
- off = nextOff
- slot = nextSlot
- }
- // slot is now the last slot vacated by the shift; clear it. Adding
- // mtSizeLenNegOne decrements the length stored in the upper bits of
- // size and leaves the order bits unchanged.
- atomic.StorePointer(&slot.value, nil)
- atomic.AddUint64(&mt.size, mtSizeLenNegOne)
- mt.seq.EndWrite()
- return
- }
- if checkInvariants && slotValue == nil {
- panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount))
- }
- off = (off + mountSlotBytes) & offmask
- }
-}
-
-//go:linkname memhash runtime.memhash
-func memhash(p unsafe.Pointer, seed, s uintptr) uintptr
-
-//go:linkname rand32 runtime.fastrand
-func rand32() uint32
-
-// noescape returns p unchanged. The value is round-tripped through uintptr
-// (x ^ 0 is the identity), presumably so that escape analysis does not treat
-// the result as derived from p.
-//
-// This is copy/pasted from runtime.noescape(), and is needed because arguments
-// apparently escape from all functions defined by linkname.
-//
-//go:nosplit
-func noescape(p unsafe.Pointer) unsafe.Pointer {
- x := uintptr(p)
- return unsafe.Pointer(x ^ 0)
-}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
deleted file mode 100644
index 187e5410c..000000000
--- a/pkg/sentry/vfs/options.go
+++ /dev/null
@@ -1,123 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "gvisor.dev/gvisor/pkg/abi/linux"
-)
-
-// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and
-// FilesystemImpl.GetDentryAt().
-type GetDentryOptions struct {
- // If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that
- // the returned Dentry is a directory for which creds has search
- // permission.
- CheckSearchable bool
-}
-
-// MkdirOptions contains options to VirtualFilesystem.MkdirAt() and
-// FilesystemImpl.MkdirAt().
-type MkdirOptions struct {
- // Mode is the file mode bits for the created directory.
- Mode uint16
-}
-
-// MknodOptions contains options to VirtualFilesystem.MknodAt() and
-// FilesystemImpl.MknodAt().
-type MknodOptions struct {
- // Mode is the file type and mode bits for the created file.
- Mode uint16
-
- // If Mode specifies a character or block device special file, DevMajor and
- // DevMinor are the major and minor device numbers for the created device.
- DevMajor uint32
- DevMinor uint32
-}
-
-// OpenOptions contains options to VirtualFilesystem.OpenAt() and
-// FilesystemImpl.OpenAt().
-type OpenOptions struct {
- // Flags contains access mode and flags as specified for open(2).
- //
- // FilesystemImpls are responsible for implementing the following flags:
- // O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC,
- // O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and
- // O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and
- // O_NOFOLLOW. VFS users are responsible for handling O_CLOEXEC, since file
- // descriptors are mostly outside the scope of VFS.
- Flags uint32
-
- // If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the
- // created file.
- Mode uint16
-}
-
-// ReadOptions contains options to FileDescription.PRead(),
-// FileDescriptionImpl.PRead(), FileDescription.Read(), and
-// FileDescriptionImpl.Read().
-type ReadOptions struct {
- // Flags contains flags as specified for preadv2(2).
- Flags uint32
-}
-
-// RenameOptions contains options to VirtualFilesystem.RenameAt() and
-// FilesystemImpl.RenameAt().
-type RenameOptions struct {
- // Flags contains flags as specified for renameat2(2).
- Flags uint32
-}
-
-// SetStatOptions contains options to VirtualFilesystem.SetStatAt(),
-// FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and
-// FileDescriptionImpl.SetStat().
-type SetStatOptions struct {
- // Stat is the metadata that should be set. Only fields indicated by
- // Stat.Mask should be set.
- //
- // If Stat specifies that a timestamp should be set,
- // FilesystemImpl.SetStatAt() and FileDescriptionImpl.SetStat() must
- // special-case StatxTimestamp.Nsec == UTIME_NOW as described by
- // utimensat(2); however, they do not need to check for StatxTimestamp.Nsec
- // == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask
- // instead).
- Stat linux.Statx
-}
-
-// StatOptions contains options to VirtualFilesystem.StatAt(),
-// FilesystemImpl.StatAt(), FileDescription.Stat(), and
-// FileDescriptionImpl.Stat().
-type StatOptions struct {
- // Mask is the set of fields in the returned Statx that the FilesystemImpl
- // or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask.
- //
- // The FilesystemImpl or FileDescriptionImpl may return fields not
- // requested in Mask, and may fail to return fields requested in Mask that
- // are not supported by the underlying filesystem implementation, without
- // returning an error.
- Mask uint32
-
- // Sync specifies the synchronization required, and is one of
- // linux.AT_STATX_SYNC_AS_STAT (which is 0, and therefore the default),
- // linux.AT_STATX_SYNC_FORCE_SYNC, or linux.AT_STATX_SYNC_DONT_SYNC.
- Sync uint32
-}
-
-// WriteOptions contains options to FileDescription.PWrite(),
-// FileDescriptionImpl.PWrite(), FileDescription.Write(), and
-// FileDescriptionImpl.Write().
-type WriteOptions struct {
- // Flags contains flags as specified for pwritev2(2).
- Flags uint32
-}
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
deleted file mode 100644
index f8e74355c..000000000
--- a/pkg/sentry/vfs/permissions.go
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-// AccessTypes is a bitmask of Unix file permissions.
-type AccessTypes uint16
-
-// Bits in AccessTypes. The values match the Unix rwx permission bits.
-//
-// Every constant carries the AccessTypes type explicitly. A constant declared
-// with only a value would be untyped, so a variable initialized from it would
-// be inferred as int rather than AccessTypes.
-const (
-	MayRead  AccessTypes = 4
-	MayWrite AccessTypes = 2
-	MayExec  AccessTypes = 1
-)
-
-// GenericCheckPermissions checks that creds has the given access rights on a
-// file with the given permissions, UID, and GID, subject to the rules of
-// fs/namei.c:generic_permission(). isDir is true if the file is a directory.
-func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir bool, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
- // Check permission bits.
- perms := mode
- if creds.EffectiveKUID == kuid {
- perms >>= 6
- } else if creds.InGroup(kgid) {
- perms >>= 3
- }
- if uint16(ats)&perms == uint16(ats) {
- return nil
- }
-
- // Caller capabilities require that the file's KUID and KGID are mapped in
- // the caller's user namespace; compare
- // kernel/capability.c:privileged_wrt_inode_uidgid().
- if !kuid.In(creds.UserNamespace).Ok() || !kgid.In(creds.UserNamespace).Ok() {
- return syserror.EACCES
- }
- // CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary
- // directories, and read arbitrary non-directory files.
- if (isDir && (ats&MayWrite == 0)) || ats == MayRead {
- if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) {
- return nil
- }
- }
- // CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write
- // access to non-directory files, and execute access to non-directory files
- // for which at least one execute bit is set.
- if isDir || (ats&MayExec == 0) || (mode&0111 != 0) {
- if creds.HasCapability(linux.CAP_DAC_OVERRIDE) {
- return nil
- }
- }
- return syserror.EACCES
-}
-
-// AccessTypesForOpenFlags returns the access types required to open a file
-// with the given OpenOptions.Flags. Note that this is NOT the same thing as
-// the set of accesses permitted for the opened file:
-//
-// - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it
-// mutates the file), but does not permit the opened file to be written to
-// thereafter.
-//
-// - "Linux reserves the special, nonstandard access mode 3 (binary 11) in
-// flags to mean: check for read and write permission on the file and return a
-// file descriptor that can't be used for reading or writing." - open(2). Thus
-// AccessTypesForOpenFlags returns MayRead|MayWrite in this case, but
-// filesystems are responsible for ensuring that access is denied.
-//
-// Use May{Read,Write}FileWithOpenFlags() for these checks instead.
-func AccessTypesForOpenFlags(flags uint32) AccessTypes {
-	accmode := flags & linux.O_ACCMODE
-	if accmode == linux.O_WRONLY {
-		return MayWrite
-	}
-	if accmode == linux.O_RDONLY && flags&linux.O_TRUNC == 0 {
-		return MayRead
-	}
-	// O_RDWR, access mode 3, or O_RDONLY combined with O_TRUNC.
-	return MayRead | MayWrite
-}
-
-// MayReadFileWithOpenFlags returns true if a file with the given open flags
-// should be readable, i.e. if its access mode is O_RDONLY or O_RDWR.
-func MayReadFileWithOpenFlags(flags uint32) bool {
-	accmode := flags & linux.O_ACCMODE
-	return accmode == linux.O_RDONLY || accmode == linux.O_RDWR
-}
-
-// MayWriteFileWithOpenFlags returns true if a file with the given open flags
-// should be writable, i.e. if its access mode is O_WRONLY or O_RDWR.
-func MayWriteFileWithOpenFlags(flags uint32) bool {
-	accmode := flags & linux.O_ACCMODE
-	return accmode == linux.O_WRONLY || accmode == linux.O_RDWR
-}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
deleted file mode 100644
index 8d05c8583..000000000
--- a/pkg/sentry/vfs/resolving_path.go
+++ /dev/null
@@ -1,453 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "fmt"
- "sync"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/fspath"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-// ResolvingPath represents the state of an in-progress path resolution, shared
-// between VFS and FilesystemImpl methods that take a path.
-//
-// From the perspective of FilesystemImpl methods, a ResolvingPath represents a
-// starting Dentry on the associated Filesystem (on which a reference is
-// already held) and a stream of path components relative to that Dentry.
-//
-// ResolvingPath is loosely analogous to Linux's struct nameidata.
-type ResolvingPath struct {
- vfs *VirtualFilesystem
- root VirtualDentry // refs borrowed from PathOperation
- mount *Mount
- start *Dentry
- pit fspath.Iterator
-
- flags uint16
- mustBeDir bool // final file must be a directory?
- mustBeDirOrig bool
- symlinks uint8 // number of symlinks traversed
- symlinksOrig uint8
- curPart uint8 // index into parts
- numOrigParts uint8
-
- creds *auth.Credentials
-
- // Data associated with resolve*Errors, stored in ResolvingPath so that
- // those errors don't need to allocate.
- nextMount *Mount // ref held if not nil
- nextStart *Dentry // ref held if not nil
- absSymlinkTarget fspath.Path
-
- // ResolvingPath must track up to two relative paths: the "current"
- // relative path, which is updated whenever a relative symlink is
- // encountered, and the "original" relative path, which is updated from the
- // current relative path by handleError() when resolution must change
- // filesystems (due to reaching a mount boundary or absolute symlink) and
- // overwrites the current relative path when Restart() is called.
- parts [1 + linux.MaxSymlinkTraversals]fspath.Iterator
- origParts [1 + linux.MaxSymlinkTraversals]fspath.Iterator
-}
-
-const (
- rpflagsHaveMountRef = 1 << iota // do we hold a reference on mount?
- rpflagsHaveStartRef // do we hold a reference on start?
- rpflagsFollowFinalSymlink // same as PathOperation.FollowFinalSymlink
-)
-
-func init() {
- if maxParts := len(ResolvingPath{}.parts); maxParts > 255 {
- panic(fmt.Sprintf("uint8 is insufficient to accommodate len(ResolvingPath.parts) (%d)", maxParts))
- }
-}
-
-// Error types that communicate state from the FilesystemImpl-caller,
-// VFS-callee side of path resolution (i.e. errors returned by
-// ResolvingPath.Resolve*()) to the VFS-caller, FilesystemImpl-callee side
-// (i.e. VFS methods => ResolvingPath.handleError()). These are empty structs
-// rather than error values because Go doesn't support non-primitive constants,
-// so error "constants" are really mutable vars, necessitating somewhat
-// expensive interface object comparisons.
-
-type resolveMountRootError struct{}
-
-// Error implements error.Error.
-func (resolveMountRootError) Error() string {
- return "resolving mount root"
-}
-
-type resolveMountPointError struct{}
-
-// Error implements error.Error.
-func (resolveMountPointError) Error() string {
- return "resolving mount point"
-}
-
-type resolveAbsSymlinkError struct{}
-
-// Error implements error.Error.
-func (resolveAbsSymlinkError) Error() string {
- return "resolving absolute symlink"
-}
-
-var resolvingPathPool = sync.Pool{
- New: func() interface{} {
- return &ResolvingPath{}
- },
-}
-
-func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) (*ResolvingPath, error) {
- path, err := fspath.Parse(pop.Pathname)
- if err != nil {
- return nil, err
- }
- rp := resolvingPathPool.Get().(*ResolvingPath)
- rp.vfs = vfs
- rp.root = pop.Root
- rp.mount = pop.Start.mount
- rp.start = pop.Start.dentry
- rp.pit = path.Begin
- rp.flags = 0
- if pop.FollowFinalSymlink {
- rp.flags |= rpflagsFollowFinalSymlink
- }
- rp.mustBeDir = path.Dir
- rp.mustBeDirOrig = path.Dir
- rp.symlinks = 0
- rp.curPart = 0
- rp.numOrigParts = 1
- rp.creds = creds
- rp.parts[0] = path.Begin
- rp.origParts[0] = path.Begin
- return rp, nil
-}
-
-func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) {
- rp.root = VirtualDentry{}
- rp.decRefStartAndMount()
- rp.mount = nil
- rp.start = nil
- rp.releaseErrorState()
- resolvingPathPool.Put(rp)
-}
-
-func (rp *ResolvingPath) decRefStartAndMount() {
- if rp.flags&rpflagsHaveStartRef != 0 {
- rp.start.decRef(rp.mount.fs)
- }
- if rp.flags&rpflagsHaveMountRef != 0 {
- rp.mount.decRef()
- }
-}
-
-func (rp *ResolvingPath) releaseErrorState() {
- if rp.nextStart != nil {
- rp.nextStart.decRef(rp.nextMount.fs)
- rp.nextStart = nil
- }
- if rp.nextMount != nil {
- rp.nextMount.decRef()
- rp.nextMount = nil
- }
-}
-
-// VirtualFilesystem returns the containing VirtualFilesystem.
-func (rp *ResolvingPath) VirtualFilesystem() *VirtualFilesystem {
- return rp.vfs
-}
-
-// Credentials returns the credentials of rp's provider.
-func (rp *ResolvingPath) Credentials() *auth.Credentials {
- return rp.creds
-}
-
-// Mount returns the Mount on which path resolution is currently occurring. It
-// does not take a reference on the returned Mount.
-func (rp *ResolvingPath) Mount() *Mount {
- return rp.mount
-}
-
-// Start returns the starting Dentry represented by rp. It does not take a
-// reference on the returned Dentry.
-func (rp *ResolvingPath) Start() *Dentry {
- return rp.start
-}
-
-// Done returns true if there are no remaining path components in the stream
-// represented by rp.
-func (rp *ResolvingPath) Done() bool {
- // We don't need to check for rp.curPart == 0 because rp.Advance() won't
- // set rp.pit to a terminal iterator otherwise.
- return !rp.pit.Ok()
-}
-
-// Final returns true if there is exactly one remaining path component in the
-// stream represented by rp.
-//
-// Preconditions: !rp.Done().
-func (rp *ResolvingPath) Final() bool {
- return rp.curPart == 0 && !rp.pit.NextOk()
-}
-
-// Component returns the current path component in the stream represented by
-// rp.
-//
-// Preconditions: !rp.Done().
-func (rp *ResolvingPath) Component() string {
- if checkInvariants {
- if !rp.pit.Ok() {
- panic("ResolvingPath.Component() called at end of relative path")
- }
- }
- return rp.pit.String()
-}
-
-// Advance advances the stream of path components represented by rp.
-//
-// Preconditions: !rp.Done().
-func (rp *ResolvingPath) Advance() {
- if checkInvariants {
- if !rp.pit.Ok() {
- panic("ResolvingPath.Advance() called at end of relative path")
- }
- }
- next := rp.pit.Next()
- if next.Ok() || rp.curPart == 0 { // have next component, or at end of path
- rp.pit = next
- } else { // at end of path segment, continue with next one
- rp.curPart--
- rp.pit = rp.parts[rp.curPart-1]
- }
-}
-
-// Restart resets the stream of path components represented by rp to its state
-// on entry to the current FilesystemImpl method.
-func (rp *ResolvingPath) Restart() {
- rp.pit = rp.origParts[rp.numOrigParts-1]
- rp.mustBeDir = rp.mustBeDirOrig
- rp.symlinks = rp.symlinksOrig
- rp.curPart = rp.numOrigParts - 1
- copy(rp.parts[:], rp.origParts[:rp.numOrigParts])
- rp.releaseErrorState()
-}
-
-func (rp *ResolvingPath) relpathCommit() {
- rp.mustBeDirOrig = rp.mustBeDir
- rp.symlinksOrig = rp.symlinks
- rp.numOrigParts = rp.curPart + 1
- copy(rp.origParts[:rp.curPart], rp.parts[:])
- rp.origParts[rp.curPart] = rp.pit
-}
-
-// ResolveParent returns the VFS parent of d. It does not take a reference on
-// the returned Dentry.
-//
-// Preconditions: There are no concurrent mutators of d.
-//
-// Postconditions: If the returned error is nil, then the returned Dentry is
-// not nil.
-func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) {
- var parent *Dentry
- if d == rp.root.dentry && rp.mount == rp.root.mount {
- // At contextual VFS root.
- parent = d
- } else if d == rp.mount.root {
- // At mount root ...
- mnt, mntpt := rp.vfs.getMountpointAt(rp.mount, rp.root)
- if mnt != nil {
- // ... of non-root mount.
- rp.nextMount = mnt
- rp.nextStart = mntpt
- return nil, resolveMountRootError{}
- }
- // ... of root mount.
- parent = d
- } else if d.parent == nil {
- // At filesystem root.
- parent = d
- } else {
- parent = d.parent
- }
- if parent.isMounted() {
- if mnt := rp.vfs.getMountAt(rp.mount, parent); mnt != nil {
- rp.nextMount = mnt
- return nil, resolveMountPointError{}
- }
- }
- return parent, nil
-}
-
-// ResolveChild returns the VFS child of d with the given name. It does not
-// take a reference on the returned Dentry. If no such child exists,
-// ResolveChild returns (nil, nil).
-//
-// Preconditions: There are no concurrent mutators of d.
-func (rp *ResolvingPath) ResolveChild(d *Dentry, name string) (*Dentry, error) {
- child := d.children[name]
- if child == nil {
- return nil, nil
- }
- if child.isMounted() {
- if mnt := rp.vfs.getMountAt(rp.mount, child); mnt != nil {
- rp.nextMount = mnt
- return nil, resolveMountPointError{}
- }
- }
- return child, nil
-}
-
-// ResolveComponent returns the Dentry reached by starting at d and resolving
-// the current path component in the stream represented by rp. It does not
-// advance the stream. It does not take a reference on the returned Dentry. If
-// no such Dentry exists, ResolveComponent returns (nil, nil).
-//
-// Preconditions: !rp.Done(). There are no concurrent mutators of d.
-func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) {
- switch pc := rp.Component(); pc {
- case ".":
- return d, nil
- case "..":
- return rp.ResolveParent(d)
- default:
- return rp.ResolveChild(d, pc)
- }
-}
-
-// ShouldFollowSymlink returns true if, supposing that the current path
-// component in pcs represents a symbolic link, the symbolic link should be
-// followed.
-//
-// Preconditions: !rp.Done().
-func (rp *ResolvingPath) ShouldFollowSymlink() bool {
- // Non-final symlinks are always followed.
- return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final()
-}
-
-// HandleSymlink is called when the current path component is a symbolic link
-// to the given target. If the calling Filesystem method should continue path
-// traversal, HandleSymlink updates the path component stream to reflect the
-// symlink target and returns nil. Otherwise it returns a non-nil error.
-//
-// Preconditions: !rp.Done().
-func (rp *ResolvingPath) HandleSymlink(target string) error {
- if rp.symlinks >= linux.MaxSymlinkTraversals {
- return syserror.ELOOP
- }
- targetPath, err := fspath.Parse(target)
- if err != nil {
- return err
- }
- rp.symlinks++
- if targetPath.Absolute {
- rp.absSymlinkTarget = targetPath
- return resolveAbsSymlinkError{}
- }
- if !targetPath.Begin.Ok() {
- panic(fmt.Sprintf("symbolic link has non-empty target %q that is both relative and has no path components?", target))
- }
- // Consume the path component that represented the symlink.
- rp.Advance()
- // Prepend the symlink target to the relative path.
- rp.relpathPrepend(targetPath)
- return nil
-}
-
-func (rp *ResolvingPath) relpathPrepend(path fspath.Path) {
- if rp.pit.Ok() {
- rp.parts[rp.curPart] = rp.pit
- rp.pit = path.Begin
- rp.curPart++
- } else {
- // The symlink was the final path component, so now the symlink target
- // is the whole path.
- rp.pit = path.Begin
- // Symlink targets can set rp.mustBeDir (if they end in a trailing /),
- // but can't unset it.
- if path.Dir {
- rp.mustBeDir = true
- }
- }
-}
-
-func (rp *ResolvingPath) handleError(err error) bool {
- switch err.(type) {
- case resolveMountRootError:
- // Switch to the new Mount. We hold references on the Mount and Dentry
- // (from VFS.getMountpointAt()).
- rp.decRefStartAndMount()
- rp.mount = rp.nextMount
- rp.start = rp.nextStart
- rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef
- rp.nextMount = nil
- rp.nextStart = nil
- // Commit the previous FileystemImpl's progress through the relative
- // path. (Don't consume the path component that caused us to traverse
- // through the mount root - i.e. the ".." - because we still need to
- // resolve the mount point's parent in the new FilesystemImpl.)
- rp.relpathCommit()
- // Restart path resolution on the new Mount. Don't bother calling
- // rp.releaseErrorState() since we already set nextMount and nextStart
- // to nil above.
- return true
-
- case resolveMountPointError:
- // Switch to the new Mount. We hold a reference on the Mount (from
- // VFS.getMountAt()), but borrow the reference on the mount root from
- // the Mount.
- rp.decRefStartAndMount()
- rp.mount = rp.nextMount
- rp.start = rp.nextMount.root
- rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef
- rp.nextMount = nil
- // Consume the path component that represented the mount point.
- rp.Advance()
- // Commit the previous FilesystemImpl's progress through the relative
- // path.
- rp.relpathCommit()
- // Restart path resolution on the new Mount.
- rp.releaseErrorState()
- return true
-
- case resolveAbsSymlinkError:
- // Switch to the new Mount. References are borrowed from rp.root.
- rp.decRefStartAndMount()
- rp.mount = rp.root.mount
- rp.start = rp.root.dentry
- rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef
- // Consume the path component that represented the symlink.
- rp.Advance()
- // Prepend the symlink target to the relative path.
- rp.relpathPrepend(rp.absSymlinkTarget)
- // Commit the previous FilesystemImpl's progress through the relative
- // path, including the symlink target we just prepended.
- rp.relpathCommit()
- // Restart path resolution on the new Mount.
- rp.releaseErrorState()
- return true
-
- default:
- // Not an error we can handle.
- return false
- }
-}
-
-// MustBeDir returns true if the file traversed by rp must be a directory.
-func (rp *ResolvingPath) MustBeDir() bool {
- return rp.mustBeDir
-}
diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go
deleted file mode 100644
index 23f2b9e08..000000000
--- a/pkg/sentry/vfs/syscalls.go
+++ /dev/null
@@ -1,217 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-// PathOperation specifies the path operated on by a VFS method.
-//
-// PathOperation is passed to VFS methods by pointer to reduce memory copying:
-// it's somewhat large and should never escape. (Options structs are passed by
-// pointer to VFS and FileDescription methods for the same reason.)
-type PathOperation struct {
- // Root is the VFS root. References on Root are borrowed from the provider
- // of the PathOperation.
- //
- // Invariants: Root.Ok().
- Root VirtualDentry
-
- // Start is the starting point for the path traversal. References on Start
- // are borrowed from the provider of the PathOperation (i.e. the caller of
- // the VFS method to which the PathOperation was passed).
- //
- // Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root.
- Start VirtualDentry
-
- // Path is the pathname traversed by this operation.
- Pathname string
-
- // If FollowFinalSymlink is true, and the Dentry traversed by the final
- // path component represents a symbolic link, the symbolic link should be
- // followed.
- FollowFinalSymlink bool
-}
-
-// GetDentryAt returns a VirtualDentry representing the given path, at which a
-// file must exist. A reference is taken on the returned VirtualDentry.
-func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
- rp, err := vfs.getResolvingPath(creds, pop)
- if err != nil {
- return VirtualDentry{}, err
- }
- for {
- d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
- if err == nil {
- vd := VirtualDentry{
- mount: rp.mount,
- dentry: d,
- }
- rp.mount.incRef()
- vfs.putResolvingPath(rp)
- return vd, nil
- }
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- return VirtualDentry{}, err
- }
- }
-}
-
-// MkdirAt creates a directory at the given path.
-func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
- // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
- // also honored." - mkdir(2)
- opts.Mode &= 01777
- rp, err := vfs.getResolvingPath(creds, pop)
- if err != nil {
- return err
- }
- for {
- err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
- if err == nil {
- vfs.putResolvingPath(rp)
- return nil
- }
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- return err
- }
- }
-}
-
-// OpenAt returns a FileDescription providing access to the file at the given
-// path. A reference is taken on the returned FileDescription.
-func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
- // Remove:
- //
- // - O_LARGEFILE, which we always report in FileDescription status flags
- // since only 64-bit architectures are supported at this time.
- //
- // - O_CLOEXEC, which affects file descriptors and therefore must be
- // handled outside of VFS.
- //
- // - Unknown flags.
- opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
- // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
- if opts.Flags&linux.O_SYNC != 0 {
- opts.Flags |= linux.O_DSYNC
- }
- // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
- // with O_DIRECTORY and a writable access mode (to ensure that it fails on
- // filesystem implementations that do not support it).
- if opts.Flags&linux.O_TMPFILE != 0 {
- if opts.Flags&linux.O_DIRECTORY == 0 {
- return nil, syserror.EINVAL
- }
- if opts.Flags&linux.O_CREAT != 0 {
- return nil, syserror.EINVAL
- }
- if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY {
- return nil, syserror.EINVAL
- }
- }
- // O_PATH causes most other flags to be ignored.
- if opts.Flags&linux.O_PATH != 0 {
- opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
- }
- // "On Linux, the following bits are also honored in mode: [S_ISUID,
- // S_ISGID, S_ISVTX]" - open(2)
- opts.Mode &= 07777
-
- if opts.Flags&linux.O_NOFOLLOW != 0 {
- pop.FollowFinalSymlink = false
- }
- rp, err := vfs.getResolvingPath(creds, pop)
- if err != nil {
- return nil, err
- }
- if opts.Flags&linux.O_DIRECTORY != 0 {
- rp.mustBeDir = true
- rp.mustBeDirOrig = true
- }
- for {
- fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
- if err == nil {
- vfs.putResolvingPath(rp)
- return fd, nil
- }
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- return nil, err
- }
- }
-}
-
-// StatAt returns metadata for the file at the given path.
-func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
- rp, err := vfs.getResolvingPath(creds, pop)
- if err != nil {
- return linux.Statx{}, err
- }
- for {
- stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
- if err == nil {
- vfs.putResolvingPath(rp)
- return stat, nil
- }
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- return linux.Statx{}, err
- }
- }
-}
-
-// StatusFlags returns file description status flags.
-func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) {
- flags, err := fd.impl.StatusFlags(ctx)
- flags |= linux.O_LARGEFILE
- return flags, err
-}
-
-// SetStatusFlags sets file description status flags.
-func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
- return fd.impl.SetStatusFlags(ctx, flags)
-}
-
-// TODO:
-//
-// - VFS.SyncAllFilesystems() for sync(2)
-//
-// - Something for syncfs(2)
-//
-// - VFS.LinkAt()
-//
-// - VFS.MknodAt()
-//
-// - VFS.ReadlinkAt()
-//
-// - VFS.RenameAt()
-//
-// - VFS.RmdirAt()
-//
-// - VFS.SetStatAt()
-//
-// - VFS.StatFSAt()
-//
-// - VFS.SymlinkAt()
-//
-// - VFS.UnlinkAt()
-//
-// - FileDescription.(almost everything)
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
deleted file mode 100644
index 4a8a69540..000000000
--- a/pkg/sentry/vfs/vfs.go
+++ /dev/null
@@ -1,135 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package vfs implements a virtual filesystem layer.
-//
-// Lock order:
-//
-// Filesystem implementation locks
-// VirtualFilesystem.mountMu
-// VirtualFilesystem.fsTypesMu
-package vfs
-
-import (
- "sync"
-)
-
-// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts.
-//
-// There is no analogue to the VirtualFilesystem type in Linux, as the
-// equivalent state in Linux is global.
-type VirtualFilesystem struct {
- // mountMu serializes mount mutations.
- //
- // mountMu is analogous to Linux's namespace_sem.
- mountMu sync.RWMutex
-
- // mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
- // are uniquely namespaced, including mount parent in the key correctly
- // handles both bind mounts and mount namespaces; Linux does the same.)
- // Synchronization between mutators and readers is provided by mounts.seq;
- // synchronization between mutators is provided by mountMu.
- //
- // mounts is used to follow mount points during path traversal. We use a
- // single table rather than per-Dentry tables to reduce size (and therefore
- // cache footprint) for the vast majority of Dentries that are not mount
- // points.
- //
- // mounts is analogous to Linux's mount_hashtable.
- mounts mountTable
-
- // mountpoints maps mount points to mounts at those points in all
- // namespaces. mountpoints is protected by mountMu.
- //
- // mountpoints is used to find mounts that must be unmounted due to
- // removal of a mount point Dentry from another mount namespace. ("A file
- // or directory that is a mount point in one namespace that is not a mount
- // point in another namespace, may be renamed, unlinked, or removed
- // (rmdir(2)) in the mount namespace in which it is not a mount point
- // (subject to the usual permission checks)." - mount_namespaces(7))
- //
- // mountpoints is analogous to Linux's mountpoint_hashtable.
- mountpoints map[*Dentry]map[*Mount]struct{}
-
- // fsTypes contains all FilesystemTypes that are usable in the
- // VirtualFilesystem. fsTypes is protected by fsTypesMu.
- fsTypesMu sync.RWMutex
- fsTypes map[string]FilesystemType
-}
-
-// New returns a new VirtualFilesystem with no mounts or FilesystemTypes.
-func New() *VirtualFilesystem {
- vfs := &VirtualFilesystem{
- mountpoints: make(map[*Dentry]map[*Mount]struct{}),
- fsTypes: make(map[string]FilesystemType),
- }
- vfs.mounts.Init()
- return vfs
-}
-
-// A VirtualDentry represents a node in a VFS tree, by combining a Dentry
-// (which represents a node in a Filesystem's tree) and a Mount (which
-// represents the Filesystem's position in a VFS mount tree).
-//
-// VirtualDentry's semantics are similar to that of a Go interface object
-// representing a pointer: it is a copyable value type that represents
-// references to another entity. The zero value of VirtualDentry is an "empty
-// VirtualDentry", directly analogous to a nil interface object.
-// VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless
-// otherwise specified, all other VirtualDentry methods require
-// VirtualDentry.Ok() == true.
-//
-// Mounts and Dentries are reference-counted, requiring that users call
-// VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to
-// references on the Mount and Dentry referred to by a VirtualDentry as
-// references on the VirtualDentry itself. Unless otherwise specified, all
-// VirtualDentry methods require that a reference is held on the VirtualDentry.
-//
-// VirtualDentry is analogous to Linux's struct path.
-type VirtualDentry struct {
- mount *Mount
- dentry *Dentry
-}
-
-// Ok returns true if vd is not empty. It does not require that a reference is
-// held.
-func (vd VirtualDentry) Ok() bool {
- return vd.mount != nil
-}
-
-// IncRef increments the reference counts on the Mount and Dentry represented
-// by vd.
-func (vd VirtualDentry) IncRef() {
- vd.mount.incRef()
- vd.dentry.incRef(vd.mount.fs)
-}
-
-// DecRef decrements the reference counts on the Mount and Dentry represented
-// by vd.
-func (vd VirtualDentry) DecRef() {
- vd.dentry.decRef(vd.mount.fs)
- vd.mount.decRef()
-}
-
-// Mount returns the Mount associated with vd. It does not take a reference on
-// the returned Mount.
-func (vd VirtualDentry) Mount() *Mount {
- return vd.mount
-}
-
-// Dentry returns the Dentry associated with vd. It does not take a reference
-// on the returned Dentry.
-func (vd VirtualDentry) Dentry() *Dentry {
- return vd.dentry
-}