diff options
Diffstat (limited to 'pkg/sentry/vfs')
-rw-r--r-- | pkg/sentry/vfs/BUILD | 57 | ||||
-rw-r--r-- | pkg/sentry/vfs/README.md | 197 | ||||
-rw-r--r-- | pkg/sentry/vfs/context.go | 37 | ||||
-rw-r--r-- | pkg/sentry/vfs/debug.go | 22 | ||||
-rw-r--r-- | pkg/sentry/vfs/dentry.go | 347 | ||||
-rw-r--r-- | pkg/sentry/vfs/file_description.go | 213 | ||||
-rw-r--r-- | pkg/sentry/vfs/file_description_impl_util.go | 254 | ||||
-rw-r--r-- | pkg/sentry/vfs/file_description_impl_util_test.go | 141 | ||||
-rw-r--r-- | pkg/sentry/vfs/filesystem.go | 155 | ||||
-rw-r--r-- | pkg/sentry/vfs/filesystem_type.go | 70 | ||||
-rw-r--r-- | pkg/sentry/vfs/mount.go | 411 | ||||
-rw-r--r-- | pkg/sentry/vfs/mount_test.go | 465 | ||||
-rw-r--r-- | pkg/sentry/vfs/mount_unsafe.go | 356 | ||||
-rw-r--r-- | pkg/sentry/vfs/options.go | 123 | ||||
-rw-r--r-- | pkg/sentry/vfs/permissions.go | 121 | ||||
-rw-r--r-- | pkg/sentry/vfs/resolving_path.go | 453 | ||||
-rw-r--r-- | pkg/sentry/vfs/syscalls.go | 217 | ||||
-rw-r--r-- | pkg/sentry/vfs/testutil.go | 139 | ||||
-rw-r--r-- | pkg/sentry/vfs/vfs.go | 135 |
19 files changed, 0 insertions, 3913 deletions
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD deleted file mode 100644 index eff4b44f6..000000000 --- a/pkg/sentry/vfs/BUILD +++ /dev/null @@ -1,57 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") - -package(licenses = ["notice"]) - -go_library( - name = "vfs", - srcs = [ - "context.go", - "debug.go", - "dentry.go", - "file_description.go", - "file_description_impl_util.go", - "filesystem.go", - "filesystem_type.go", - "mount.go", - "mount_unsafe.go", - "options.go", - "permissions.go", - "resolving_path.go", - "syscalls.go", - "testutil.go", - "vfs.go", - ], - importpath = "gvisor.dev/gvisor/pkg/sentry/vfs", - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/fspath", - "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/memmap", - "//pkg/sentry/usermem", - "//pkg/syserror", - "//pkg/waiter", - "//third_party/gvsync", - ], -) - -go_test( - name = "vfs_test", - size = "small", - srcs = [ - "file_description_impl_util_test.go", - "mount_test.go", - ], - embed = [":vfs"], - deps = [ - "//pkg/abi/linux", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/usermem", - "//pkg/syserror", - ], -) diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md deleted file mode 100644 index 7847854bc..000000000 --- a/pkg/sentry/vfs/README.md +++ /dev/null @@ -1,197 +0,0 @@ -# The gVisor Virtual Filesystem - -THIS PACKAGE IS CURRENTLY EXPERIMENTAL AND NOT READY OR ENABLED FOR PRODUCTION -USE. For the filesystem implementation currently used by gVisor, see the `fs` -package. - -## Implementation Notes - -### Reference Counting - -Filesystem, Dentry, Mount, MountNamespace, and FileDescription are all -reference-counted. Mount and MountNamespace are exclusively VFS-managed; when -their reference count reaches zero, VFS releases their resources. 
Filesystem and -FileDescription management is shared between VFS and filesystem implementations; -when their reference count reaches zero, VFS notifies the implementation by -calling `FilesystemImpl.Release()` or `FileDescriptionImpl.Release()` -respectively and then releases VFS-owned resources. Dentries are exclusively -managed by filesystem implementations; reference count changes are abstracted -through DentryImpl, which should release resources when reference count reaches -zero. - -Filesystem references are held by: - -- Mount: Each referenced Mount holds a reference on the mounted Filesystem. - -Dentry references are held by: - -- FileDescription: Each referenced FileDescription holds a reference on the - Dentry through which it was opened, via `FileDescription.vd.dentry`. - -- Mount: Each referenced Mount holds a reference on its mount point and on the - mounted filesystem root. The mount point is mutable (`mount(MS_MOVE)`). - -Mount references are held by: - -- FileDescription: Each referenced FileDescription holds a reference on the - Mount on which it was opened, via `FileDescription.vd.mount`. - -- Mount: Each referenced Mount holds a reference on its parent, which is the - mount containing its mount point. - -- VirtualFilesystem: A reference is held on all Mounts that are attached - (reachable by Mount traversal). - -MountNamespace and FileDescription references are held by users of VFS. The -expectation is that each `kernel.Task` holds a reference on its corresponding -MountNamespace, and each file descriptor holds a reference on its represented -FileDescription. - -Notes: - -- Dentries do not hold a reference on their owning Filesystem. Instead, all - uses of a Dentry occur in the context of a Mount, which holds a reference on - the relevant Filesystem (see e.g. the VirtualDentry type). 
As a corollary, - when releasing references on both a Dentry and its corresponding Mount, the - Dentry's reference must be released first (because releasing the Mount's - reference may release the last reference on the Filesystem, whose state may - be required to release the Dentry reference). - -### The Inheritance Pattern - -Filesystem, Dentry, and FileDescription are all concepts featuring both state -that must be shared between VFS and filesystem implementations, and operations -that are implementation-defined. To facilitate this, each of these three -concepts follows the same pattern, shown below for Dentry: - -```go -// Dentry represents a node in a filesystem tree. -type Dentry struct { - // VFS-required dentry state. - parent *Dentry - // ... - - // impl is the DentryImpl associated with this Dentry. impl is immutable. - // This should be the last field in Dentry. - impl DentryImpl -} - -// Init must be called before first use of d. -func (d *Dentry) Init(impl DentryImpl) { - d.impl = impl -} - -// Impl returns the DentryImpl associated with d. -func (d *Dentry) Impl() DentryImpl { - return d.impl -} - -// DentryImpl contains implementation-specific details of a Dentry. -// Implementations of DentryImpl should contain their associated Dentry by -// value as their first field. -type DentryImpl interface { - // VFS-required implementation-defined dentry operations. - IncRef() - // ... -} -``` - -This construction, which is essentially a type-safe analogue to Linux's -`container_of` pattern, has the following properties: - -- VFS works almost exclusively with pointers to Dentry rather than DentryImpl - interface objects, such as in the type of `Dentry.parent`. This avoids - interface method calls (which are somewhat expensive to perform, and defeat - inlining and escape analysis), reduces the size of VFS types (since an - interface object is two pointers in size), and allows pointers to be loaded - and stored atomically using `sync/atomic`. 
Implementation-defined behavior - is accessed via `Dentry.impl` when required. - -- Filesystem implementations can access the implementation-defined state - associated with objects of VFS types by type-asserting or type-switching - (e.g. `Dentry.Impl().(*myDentry)`). Type assertions to a concrete type - require only an equality comparison of the interface object's type pointer - to a static constant, and are consequently very fast. - -- Filesystem implementations can access the VFS state associated with objects - of implementation-defined types directly. - -- VFS and implementation-defined state for a given type occupy the same - object, minimizing memory allocations and maximizing memory locality. `impl` - is the last field in `Dentry`, and `Dentry` is the first field in - `DentryImpl` implementations, for similar reasons: this tends to cause - fetching of the `Dentry.impl` interface object to also fetch `DentryImpl` - fields, either because they are in the same cache line or via next-line - prefetching. - -## Future Work - -- Most `mount(2)` features, and unmounting, are incomplete. - -- VFS1 filesystems are not directly compatible with VFS2. It may be possible - to implement shims that implement `vfs.FilesystemImpl` for - `fs.MountNamespace`, `vfs.DentryImpl` for `fs.Dirent`, and - `vfs.FileDescriptionImpl` for `fs.File`, which may be adequate for - filesystems that are not performance-critical (e.g. sysfs); however, it is - not clear that this will be less effort than simply porting the filesystems - in question. Practically speaking, the following filesystems will probably - need to be ported or made compatible through a shim to evaluate filesystem - performance on realistic workloads: - - - devfs/procfs/sysfs, which will realistically be necessary to execute - most applications. (Note that procfs and sysfs do not support hard - links, so they do not require the complexity of separate inode objects. 
- Also note that Linux's /dev is actually a variant of tmpfs called - devtmpfs.) - - - tmpfs. This should be relatively straightforward: copy/paste memfs, - store regular file contents in pgalloc-allocated memory instead of - `[]byte`, and add support for file timestamps. (In fact, it probably - makes more sense to convert memfs to tmpfs and not keep the former.) - - - A remote filesystem, either lisafs (if it is ready by the time that - other benchmarking prerequisites are) or v9fs (aka 9P, aka gofers). - - - epoll files. - - Filesystems that will need to be ported before switching to VFS2, but can - probably be skipped for early testing: - - - overlayfs, which is needed for (at least) synthetic mount points. - - - Support for host ttys. - - - timerfd files. - - Filesystems that can probably be dropped: - - - ashmem, which is far too incomplete to use. - - - binder, which is similarly far too incomplete to use. - - - whitelistfs, which we are already actively attempting to remove. - -- Save/restore. For instance, it is unclear if the current implementation of - the `state` package supports the inheritance pattern described above. - -- Many features that were previously implemented by VFS must now be - implemented by individual filesystems (though, in most cases, this should - consist of calls to hooks or libraries provided by `vfs` or other packages). - This includes, but is not necessarily limited to: - - - Block and character device special files - - - Inotify - - - File locking - - - `O_ASYNC` - -- Reference counts in the `vfs` package do not use the `refs` package since - `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference - count, resulting in considerable cache bloat. 24 bytes of this overhead is - for weak reference support, which has poor performance and will not be used - by VFS2.
The remaining 40 bytes is to store a descriptive string and stack - trace for reference leak checking; we can support reference leak checking - without incurring this space overhead by including the applicable - information directly in finalizers for applicable types. diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go deleted file mode 100644 index 32cf9151b..000000000 --- a/pkg/sentry/vfs/context.go +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "gvisor.dev/gvisor/pkg/sentry/context" -) - -// contextID is this package's type for context.Context.Value keys. -type contextID int - -const ( - // CtxMountNamespace is a Context.Value key for a MountNamespace. - CtxMountNamespace contextID = iota -) - -// MountNamespaceFromContext returns the MountNamespace used by ctx. It does -// not take a reference on the returned MountNamespace. If ctx is not -// associated with a MountNamespace, MountNamespaceFromContext returns nil. -func MountNamespaceFromContext(ctx context.Context) *MountNamespace { - if v := ctx.Value(CtxMountNamespace); v != nil { - return v.(*MountNamespace) - } - return nil -} diff --git a/pkg/sentry/vfs/debug.go b/pkg/sentry/vfs/debug.go deleted file mode 100644 index 0ed20f249..000000000 --- a/pkg/sentry/vfs/debug.go +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2019 The gVisor Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -const ( - // If checkInvariants is true, perform runtime checks for invariants - // expected by the vfs package. This is normally disabled since VFS is - // often a hot path. - checkInvariants = false -) diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go deleted file mode 100644 index 45912fc58..000000000 --- a/pkg/sentry/vfs/dentry.go +++ /dev/null @@ -1,347 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "fmt" - "sync/atomic" - - "gvisor.dev/gvisor/pkg/syserror" -) - -// Dentry represents a node in a Filesystem tree which may represent a file. -// -// Dentries are reference-counted. Unless otherwise specified, all Dentry -// methods require that a reference is held. 
-// -// A Dentry transitions through up to 3 different states through its lifetime: -// -// - Dentries are initially "independent". Independent Dentries have no parent, -// and consequently no name. -// -// - Dentry.InsertChild() causes an independent Dentry to become a "child" of -// another Dentry. A child node has a parent node, and a name in that parent, -// both of which are mutable by DentryMoveChild(). Each child Dentry's name is -// unique within its parent. -// -// - Dentry.RemoveChild() causes a child Dentry to become "disowned". A -// disowned Dentry can still refer to its former parent and its former name in -// said parent, but the disowned Dentry is no longer reachable from its parent, -// and a new Dentry with the same name may become a child of the parent. (This -// is analogous to a struct dentry being "unhashed" in Linux.) -// -// Dentry is loosely analogous to Linux's struct dentry, but: -// -// - VFS does not associate Dentries with inodes. gVisor interacts primarily -// with filesystems that are accessed through filesystem APIs (as opposed to -// raw block devices); many such APIs support only paths and file descriptors, -// and not inodes. Furthermore, when parties outside the scope of VFS can -// rename inodes on such filesystems, VFS generally cannot "follow" the rename, -// both due to synchronization issues and because it may not even be able to -// name the destination path; this implies that it would in fact be *incorrect* -// for Dentries to be associated with inodes on such filesystems. Consequently, -// operations that are inode operations in Linux are FilesystemImpl methods -// and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do -// support inodes may store appropriate state in implementations of DentryImpl. -// -// - VFS does not provide synchronization for mutable Dentry fields, other than -// mount-related ones. 
-// -// - VFS does not require that Dentries are instantiated for all paths accessed -// through VFS, only those that are tracked beyond the scope of a single -// Filesystem operation. This includes file descriptions, mount points, mount -// roots, process working directories, and chroots. This avoids instantiation -// of Dentries for operations on mutable remote filesystems that can't actually -// cache any state in the Dentry. -// -// - For the reasons above, VFS is not directly responsible for managing Dentry -// lifetime. Dentry reference counts only indicate the extent to which VFS -// requires Dentries to exist; Filesystems may elect to cache or discard -// Dentries with zero references. -type Dentry struct { - // parent is this Dentry's parent in this Filesystem. If this Dentry is - // independent, parent is nil. - parent *Dentry - - // name is this Dentry's name in parent. - name string - - flags uint32 - - // mounts is the number of Mounts for which this Dentry is Mount.point. - // mounts is accessed using atomic memory operations. - mounts uint32 - - // children are child Dentries. - children map[string]*Dentry - - // impl is the DentryImpl associated with this Dentry. impl is immutable. - // This should be the last field in Dentry. - impl DentryImpl -} - -const ( - // dflagsDisownedMask is set in Dentry.flags if the Dentry has been - // disowned. - dflagsDisownedMask = 1 << iota -) - -// Init must be called before first use of d. -func (d *Dentry) Init(impl DentryImpl) { - d.impl = impl -} - -// Impl returns the DentryImpl associated with d. -func (d *Dentry) Impl() DentryImpl { - return d.impl -} - -// DentryImpl contains implementation details for a Dentry. Implementations of -// DentryImpl should contain their associated Dentry by value as their first -// field. -type DentryImpl interface { - // IncRef increments the Dentry's reference count. A Dentry with a non-zero - // reference count must remain coherent with the state of the filesystem. 
- IncRef(fs *Filesystem) - - // TryIncRef increments the Dentry's reference count and returns true. If - // the Dentry's reference count is zero, TryIncRef may do nothing and - // return false. (It is also permitted to succeed if it can restore the - // guarantee that the Dentry is coherent with the state of the filesystem.) - // - // TryIncRef does not require that a reference is held on the Dentry. - TryIncRef(fs *Filesystem) bool - - // DecRef decrements the Dentry's reference count. - DecRef(fs *Filesystem) -} - -// IsDisowned returns true if d is disowned. -func (d *Dentry) IsDisowned() bool { - return atomic.LoadUint32(&d.flags)&dflagsDisownedMask != 0 -} - -// Preconditions: !d.IsDisowned(). -func (d *Dentry) setDisowned() { - atomic.AddUint32(&d.flags, dflagsDisownedMask) -} - -func (d *Dentry) isMounted() bool { - return atomic.LoadUint32(&d.mounts) != 0 -} - -func (d *Dentry) incRef(fs *Filesystem) { - d.impl.IncRef(fs) -} - -func (d *Dentry) tryIncRef(fs *Filesystem) bool { - return d.impl.TryIncRef(fs) -} - -func (d *Dentry) decRef(fs *Filesystem) { - d.impl.DecRef(fs) -} - -// These functions are exported so that filesystem implementations can use -// them. The vfs package, and users of VFS, should not call these functions. -// Unless otherwise specified, these methods require that there are no -// concurrent mutators of d. - -// Name returns d's name in its parent in its owning Filesystem. If d is -// independent, Name returns an empty string. -func (d *Dentry) Name() string { - return d.name -} - -// Parent returns d's parent in its owning Filesystem. It does not take a -// reference on the returned Dentry. If d is independent, Parent returns nil. -func (d *Dentry) Parent() *Dentry { - return d.parent -} - -// ParentOrSelf is equivalent to Parent, but returns d if d is independent. 
-func (d *Dentry) ParentOrSelf() *Dentry { - if d.parent == nil { - return d - } - return d.parent -} - -// Child returns d's child with the given name in its owning Filesystem. It -// does not take a reference on the returned Dentry. If no such child exists, -// Child returns nil. -func (d *Dentry) Child(name string) *Dentry { - return d.children[name] -} - -// HasChildren returns true if d has any children. -func (d *Dentry) HasChildren() bool { - return len(d.children) != 0 -} - -// InsertChild makes child a child of d with the given name. -// -// InsertChild is a mutator of d and child. -// -// Preconditions: child must be an independent Dentry. d and child must be from -// the same Filesystem. d must not already have a child with the given name. -func (d *Dentry) InsertChild(child *Dentry, name string) { - if checkInvariants { - if _, ok := d.children[name]; ok { - panic(fmt.Sprintf("parent already contains a child named %q", name)) - } - if child.parent != nil || child.name != "" { - panic(fmt.Sprintf("child is not independent: parent = %v, name = %q", child.parent, child.name)) - } - } - if d.children == nil { - d.children = make(map[string]*Dentry) - } - d.children[name] = child - child.parent = d - child.name = name -} - -// PrepareDeleteDentry must be called before attempting to delete the file -// represented by d. If PrepareDeleteDentry succeeds, the caller must call -// AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. -// -// Preconditions: d is a child Dentry. -func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error { - if checkInvariants { - if d.parent == nil { - panic("d is independent") - } - if d.IsDisowned() { - panic("d is already disowned") - } - } - vfs.mountMu.RLock() - if _, ok := mntns.mountpoints[d]; ok { - vfs.mountMu.RUnlock() - return syserror.EBUSY - } - // Return with vfs.mountMu locked, which will be unlocked by - // AbortDeleteDentry or CommitDeleteDentry. 
- return nil -} - -// AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion -// fails. -func (vfs *VirtualFilesystem) AbortDeleteDentry() { - vfs.mountMu.RUnlock() -} - -// CommitDeleteDentry must be called after the file represented by d is -// deleted, and causes d to become disowned. -// -// Preconditions: PrepareDeleteDentry was previously called on d. -func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { - delete(d.parent.children, d.name) - d.setDisowned() - // TODO: lazily unmount mounts at d - vfs.mountMu.RUnlock() -} - -// DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as -// appropriate for in-memory filesystems that don't need to ensure that some -// external state change succeeds before committing the deletion. -func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error { - if err := vfs.PrepareDeleteDentry(mntns, d); err != nil { - return err - } - vfs.CommitDeleteDentry(d) - return nil -} - -// PrepareRenameDentry must be called before attempting to rename the file -// represented by from. If to is not nil, it represents the file that will be -// replaced or exchanged by the rename. If PrepareRenameDentry succeeds, the -// caller must call AbortRenameDentry, CommitRenameReplaceDentry, or -// CommitRenameExchangeDentry depending on the rename's outcome. -// -// Preconditions: from is a child Dentry. If to is not nil, it must be a child -// Dentry from the same Filesystem. 
-func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { - if checkInvariants { - if from.parent == nil { - panic("from is independent") - } - if from.IsDisowned() { - panic("from is already disowned") - } - if to != nil { - if to.parent == nil { - panic("to is independent") - } - if to.IsDisowned() { - panic("to is already disowned") - } - } - } - vfs.mountMu.RLock() - if _, ok := mntns.mountpoints[from]; ok { - vfs.mountMu.RUnlock() - return syserror.EBUSY - } - if to != nil { - if _, ok := mntns.mountpoints[to]; ok { - vfs.mountMu.RUnlock() - return syserror.EBUSY - } - } - // Return with vfs.mountMu locked, which will be unlocked by - // AbortRenameDentry, CommitRenameReplaceDentry, or - // CommitRenameExchangeDentry. - return nil -} - -// AbortRenameDentry must be called after PrepareRenameDentry if the rename -// fails. -func (vfs *VirtualFilesystem) AbortRenameDentry() { - vfs.mountMu.RUnlock() -} - -// CommitRenameReplaceDentry must be called after the file represented by from -// is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file -// that was replaced by from. -// -// Preconditions: PrepareRenameDentry was previously called on from and to. -// newParent.Child(newName) == to. -func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) { - if to != nil { - to.setDisowned() - // TODO: lazily unmount mounts at d - } - if newParent.children == nil { - newParent.children = make(map[string]*Dentry) - } - newParent.children[newName] = from - from.parent = newParent - from.name = newName - vfs.mountMu.RUnlock() -} - -// CommitRenameExchangeDentry must be called after the files represented by -// from and to are exchanged by rename(RENAME_EXCHANGE). -// -// Preconditions: PrepareRenameDentry was previously called on from and to. 
-func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { - from.parent, to.parent = to.parent, from.parent - from.name, to.name = to.name, from.name - from.parent.children[from.name] = from - to.parent.children[to.name] = to - vfs.mountMu.RUnlock() -} diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go deleted file mode 100644 index 86bde7fb3..000000000 --- a/pkg/sentry/vfs/file_description.go +++ /dev/null @@ -1,213 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -// A FileDescription represents an open file description, which is the entity -// referred to by a file descriptor (POSIX.1-2017 3.258 "Open File -// Description"). -// -// FileDescriptions are reference-counted. Unless otherwise specified, all -// FileDescription methods require that a reference is held. -// -// FileDescription is analogous to Linux's struct file. -type FileDescription struct { - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 - - // vd is the filesystem location at which this FileDescription was opened. - // A reference is held on vd. vd is immutable. 
- vd VirtualDentry - - // impl is the FileDescriptionImpl associated with this FileDescription. impl is - // immutable. This should be the last field in FileDescription. - impl FileDescriptionImpl -} - -// Init must be called before first use of fd. It takes references on mnt and -// d. -func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) { - fd.refs = 1 - fd.vd = VirtualDentry{ - mount: mnt, - dentry: d, - } - fd.vd.IncRef() - fd.impl = impl -} - -// Impl returns the FileDescriptionImpl associated with fd. -func (fd *FileDescription) Impl() FileDescriptionImpl { - return fd.impl -} - -// VirtualDentry returns the location at which fd was opened. It does not take -// a reference on the returned VirtualDentry. -func (fd *FileDescription) VirtualDentry() VirtualDentry { - return fd.vd -} - -// IncRef increments fd's reference count. -func (fd *FileDescription) IncRef() { - atomic.AddInt64(&fd.refs, 1) -} - -// DecRef decrements fd's reference count. -func (fd *FileDescription) DecRef() { - if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { - fd.impl.Release() - fd.vd.DecRef() - } else if refs < 0 { - panic("FileDescription.DecRef() called without holding a reference") - } -} - -// FileDescriptionImpl contains implementation details for a FileDescription. -// Implementations of FileDescriptionImpl should contain their associated -// FileDescription by value as their first field. -// -// For all functions that return linux.Statx, Statx.Uid and Statx.Gid will -// be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and -// auth.KGID respectively). -// -// FileDescriptionImpl is analogous to Linux's struct file_operations. -type FileDescriptionImpl interface { - // Release is called when the associated FileDescription reaches zero - // references. - Release() - - // OnClose is called when a file descriptor representing the - // FileDescription is closed.
Note that returning a non-nil error does not - // prevent the file descriptor from being closed. - OnClose() error - - // StatusFlags returns file description status flags, as for - // fcntl(F_GETFL). - StatusFlags(ctx context.Context) (uint32, error) - - // SetStatusFlags sets file description status flags, as for - // fcntl(F_SETFL). - SetStatusFlags(ctx context.Context, flags uint32) error - - // Stat returns metadata for the file represented by the FileDescription. - Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) - - // SetStat updates metadata for the file represented by the - // FileDescription. - SetStat(ctx context.Context, opts SetStatOptions) error - - // StatFS returns metadata for the filesystem containing the file - // represented by the FileDescription. - StatFS(ctx context.Context) (linux.Statfs, error) - - // waiter.Waitable methods may be used to poll for I/O events. - waiter.Waitable - - // PRead reads from the file into dst, starting at the given offset, and - // returns the number of bytes read. PRead is permitted to return partial - // reads with a nil error. - PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) - - // Read is similar to PRead, but does not specify an offset. - // - // For files with an implicit FileDescription offset (e.g. regular files), - // Read begins at the FileDescription offset, and advances the offset by - // the number of bytes read; note that POSIX 2.9.7 "Thread Interactions - // with Regular File Operations" requires that all operations that may - // mutate the FileDescription offset are serialized. - Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) - - // PWrite writes src to the file, starting at the given offset, and returns - // the number of bytes written. PWrite is permitted to return partial - // writes with a nil error. 
- // - // As in Linux (but not POSIX), if O_APPEND is in effect for the - // FileDescription, PWrite should ignore the offset and append data to the - // end of the file. - PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) - - // Write is similar to PWrite, but does not specify an offset, which is - // implied as for Read. - // - // Write is a FileDescriptionImpl method, instead of a wrapper around - // PWrite that uses a FileDescription offset, to make it possible for - // remote filesystems to implement O_APPEND correctly (i.e. atomically with - // respect to writers outside the scope of VFS). - Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) - - // IterDirents invokes cb on each entry in the directory represented by the - // FileDescription. If IterDirents has been called since the last call to - // Seek, it continues iteration from the end of the last call. - IterDirents(ctx context.Context, cb IterDirentsCallback) error - - // Seek changes the FileDescription offset (assuming one exists) and - // returns its new value. - // - // For directories, if whence == SEEK_SET and offset == 0, the caller is - // rewinddir(), such that Seek "shall also cause the directory stream to - // refer to the current state of the corresponding directory" - - // POSIX.1-2017. - Seek(ctx context.Context, offset int64, whence int32) (int64, error) - - // Sync requests that cached state associated with the file represented by - // the FileDescription is synchronized with persistent storage, and blocks - // until this is complete. - Sync(ctx context.Context) error - - // ConfigureMMap mutates opts to implement mmap(2) for the file. Most - // implementations that support memory mapping can call - // GenericConfigureMMap with the appropriate memmap.Mappable. - ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error - - // Ioctl implements the ioctl(2) syscall. 
- Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) - - // TODO: extended attributes; file locking -} - -// Dirent holds the information contained in struct linux_dirent64. -type Dirent struct { - // Name is the filename. - Name string - - // Type is the file type, a linux.DT_* constant. - Type uint8 - - // Ino is the inode number. - Ino uint64 - - // Off is this Dirent's offset. - Off int64 -} - -// IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents. -type IterDirentsCallback interface { - // Handle handles the given iterated Dirent. It returns true if iteration - // should continue, and false if FileDescriptionImpl.IterDirents should - // terminate now and restart with the same Dirent the next time it is - // called. - Handle(dirent Dirent) bool -} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go deleted file mode 100644 index ba230da72..000000000 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package vfs - -import ( - "bytes" - "io" - "sync" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/waiter" -) - -// The following design pattern is strongly recommended for filesystem -// implementations to adapt: -// - Have a local fileDescription struct (containing FileDescription) which -// embeds FileDescriptionDefaultImpl and overrides the default methods -// which are common to all fd implementations for that for that filesystem -// like StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc. -// - This should be embedded in all file description implementations as the -// first field by value. -// - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl. - -// FileDescriptionDefaultImpl may be embedded by implementations of -// FileDescriptionImpl to obtain implementations of many FileDescriptionImpl -// methods with default behavior analogous to Linux's. -type FileDescriptionDefaultImpl struct{} - -// OnClose implements FileDescriptionImpl.OnClose analogously to -// file_operations::flush == NULL in Linux. -func (FileDescriptionDefaultImpl) OnClose() error { - return nil -} - -// StatFS implements FileDescriptionImpl.StatFS analogously to -// super_operations::statfs == NULL in Linux. -func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, error) { - return linux.Statfs{}, syserror.ENOSYS -} - -// Readiness implements waiter.Waitable.Readiness analogously to -// file_operations::poll == NULL in Linux. -func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask { - // include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK - return waiter.EventIn | waiter.EventOut -} - -// EventRegister implements waiter.Waitable.EventRegister analogously to -// file_operations::poll == NULL in Linux. 
-func (FileDescriptionDefaultImpl) EventRegister(e *waiter.Entry, mask waiter.EventMask) { -} - -// EventUnregister implements waiter.Waitable.EventUnregister analogously to -// file_operations::poll == NULL in Linux. -func (FileDescriptionDefaultImpl) EventUnregister(e *waiter.Entry) { -} - -// PRead implements FileDescriptionImpl.PRead analogously to -// file_operations::read == file_operations::read_iter == NULL in Linux. -func (FileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { - return 0, syserror.EINVAL -} - -// Read implements FileDescriptionImpl.Read analogously to -// file_operations::read == file_operations::read_iter == NULL in Linux. -func (FileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { - return 0, syserror.EINVAL -} - -// PWrite implements FileDescriptionImpl.PWrite analogously to -// file_operations::write == file_operations::write_iter == NULL in Linux. -func (FileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { - return 0, syserror.EINVAL -} - -// Write implements FileDescriptionImpl.Write analogously to -// file_operations::write == file_operations::write_iter == NULL in Linux. -func (FileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { - return 0, syserror.EINVAL -} - -// IterDirents implements FileDescriptionImpl.IterDirents analogously to -// file_operations::iterate == file_operations::iterate_shared == NULL in -// Linux. -func (FileDescriptionDefaultImpl) IterDirents(ctx context.Context, cb IterDirentsCallback) error { - return syserror.ENOTDIR -} - -// Seek implements FileDescriptionImpl.Seek analogously to -// file_operations::llseek == NULL in Linux. 
-func (FileDescriptionDefaultImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - return 0, syserror.ESPIPE -} - -// Sync implements FileDescriptionImpl.Sync analogously to -// file_operations::fsync == NULL in Linux. -func (FileDescriptionDefaultImpl) Sync(ctx context.Context) error { - return syserror.EINVAL -} - -// ConfigureMMap implements FileDescriptionImpl.ConfigureMMap analogously to -// file_operations::mmap == NULL in Linux. -func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error { - return syserror.ENODEV -} - -// Ioctl implements FileDescriptionImpl.Ioctl analogously to -// file_operations::unlocked_ioctl == NULL in Linux. -func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { - return 0, syserror.ENOTTY -} - -// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of -// FileDescriptionImpl that always represent directories to obtain -// implementations of non-directory I/O methods that return EISDIR. -type DirectoryFileDescriptionDefaultImpl struct{} - -// PRead implements FileDescriptionImpl.PRead. -func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { - return 0, syserror.EISDIR -} - -// Read implements FileDescriptionImpl.Read. -func (DirectoryFileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { - return 0, syserror.EISDIR -} - -// PWrite implements FileDescriptionImpl.PWrite. -func (DirectoryFileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { - return 0, syserror.EISDIR -} - -// Write implements FileDescriptionImpl.Write. 
-func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { - return 0, syserror.EISDIR -} - -// DynamicBytesFileDescriptionImpl may be embedded by implementations of -// FileDescriptionImpl that represent read-only regular files whose contents -// are backed by a bytes.Buffer that is regenerated when necessary, consistent -// with Linux's fs/seq_file.c:single_open(). -// -// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first -// use. -type DynamicBytesFileDescriptionImpl struct { - data DynamicBytesSource // immutable - mu sync.Mutex // protects the following fields - buf bytes.Buffer - off int64 - lastRead int64 // offset at which the last Read, PRead, or Seek ended -} - -// DynamicBytesSource represents a data source for a -// DynamicBytesFileDescriptionImpl. -type DynamicBytesSource interface { - // Generate writes the file's contents to buf. - Generate(ctx context.Context, buf *bytes.Buffer) error -} - -// SetDataSource must be called exactly once on fd before first use. -func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) { - fd.data = data -} - -// Preconditions: fd.mu must be locked. -func (fd *DynamicBytesFileDescriptionImpl) preadLocked(ctx context.Context, dst usermem.IOSequence, offset int64, opts *ReadOptions) (int64, error) { - // Regenerate the buffer if it's empty, or before pread() at a new offset. - // Compare fs/seq_file.c:seq_read() => traverse(). - switch { - case offset != fd.lastRead: - fd.buf.Reset() - fallthrough - case fd.buf.Len() == 0: - if err := fd.data.Generate(ctx, &fd.buf); err != nil { - fd.buf.Reset() - // fd.off is not updated in this case. 
- fd.lastRead = 0 - return 0, err - } - } - bs := fd.buf.Bytes() - if offset >= int64(len(bs)) { - return 0, io.EOF - } - n, err := dst.CopyOut(ctx, bs[offset:]) - fd.lastRead = offset + int64(n) - return int64(n), err -} - -// PRead implements FileDescriptionImpl.PRead. -func (fd *DynamicBytesFileDescriptionImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { - fd.mu.Lock() - n, err := fd.preadLocked(ctx, dst, offset, &opts) - fd.mu.Unlock() - return n, err -} - -// Read implements FileDescriptionImpl.Read. -func (fd *DynamicBytesFileDescriptionImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { - fd.mu.Lock() - n, err := fd.preadLocked(ctx, dst, fd.off, &opts) - fd.off += n - fd.mu.Unlock() - return n, err -} - -// Seek implements FileDescriptionImpl.Seek. -func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - fd.mu.Lock() - defer fd.mu.Unlock() - switch whence { - case linux.SEEK_SET: - // Use offset as given. - case linux.SEEK_CUR: - offset += fd.off - default: - // fs/seq_file:seq_lseek() rejects SEEK_END etc. - return 0, syserror.EINVAL - } - if offset < 0 { - return 0, syserror.EINVAL - } - if offset != fd.lastRead { - // Regenerate the file's contents immediately. Compare - // fs/seq_file.c:seq_lseek() => traverse(). - fd.buf.Reset() - if err := fd.data.Generate(ctx, &fd.buf); err != nil { - fd.buf.Reset() - fd.off = 0 - fd.lastRead = 0 - return 0, err - } - fd.lastRead = offset - } - fd.off = offset - return offset, nil -} diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go deleted file mode 100644 index 511b829fc..000000000 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright 2019 The gVisor Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "bytes" - "fmt" - "io" - "sync/atomic" - "testing" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syserror" -) - -// fileDescription is the common fd struct which a filesystem implementation -// embeds in all of its file description implementations as required. -type fileDescription struct { - vfsfd FileDescription - FileDescriptionDefaultImpl -} - -// genCountFD is a read-only FileDescriptionImpl representing a regular file -// that contains the number of times its DynamicBytesSource.Generate() -// implementation has been called. -type genCountFD struct { - fileDescription - DynamicBytesFileDescriptionImpl - - count uint64 // accessed using atomic memory ops -} - -func newGenCountFD(mnt *Mount, vfsd *Dentry) *FileDescription { - var fd genCountFD - fd.vfsfd.Init(&fd, mnt, vfsd) - fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd) - return &fd.vfsfd -} - -// Release implements FileDescriptionImpl.Release. -func (fd *genCountFD) Release() { -} - -// StatusFlags implements FileDescriptionImpl.StatusFlags. -func (fd *genCountFD) StatusFlags(ctx context.Context) (uint32, error) { - return 0, nil -} - -// SetStatusFlags implements FileDescriptionImpl.SetStatusFlags. 
-func (fd *genCountFD) SetStatusFlags(ctx context.Context, flags uint32) error { - return syserror.EPERM -} - -// Stat implements FileDescriptionImpl.Stat. -func (fd *genCountFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { - // Note that Statx.Mask == 0 in the return value. - return linux.Statx{}, nil -} - -// SetStat implements FileDescriptionImpl.SetStat. -func (fd *genCountFD) SetStat(ctx context.Context, opts SetStatOptions) error { - return syserror.EPERM -} - -// Generate implements DynamicBytesSource.Generate. -func (fd *genCountFD) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "%d", atomic.AddUint64(&fd.count, 1)) - return nil -} - -func TestGenCountFD(t *testing.T) { - ctx := contexttest.Context(t) - creds := auth.CredentialsFromContext(ctx) - - vfsObj := New() // vfs.New() - vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &NewFilesystemOptions{}) - if err != nil { - t.Fatalf("failed to create testfs root mount: %v", err) - } - vd := mntns.Root() - defer vd.DecRef() - - fd := newGenCountFD(vd.Mount(), vd.Dentry()) - defer fd.DecRef() - - // The first read causes Generate to be called to fill the FD's buffer. - buf := make([]byte, 2) - ioseq := usermem.BytesIOSequence(buf) - n, err := fd.Impl().Read(ctx, ioseq, ReadOptions{}) - if n != 1 || (err != nil && err != io.EOF) { - t.Fatalf("first Read: got (%d, %v), wanted (1, nil or EOF)", n, err) - } - if want := byte('1'); buf[0] != want { - t.Errorf("first Read: got byte %c, wanted %c", buf[0], want) - } - - // A second read without seeking is still at EOF. - n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{}) - if n != 0 || err != io.EOF { - t.Fatalf("second Read: got (%d, %v), wanted (0, EOF)", n, err) - } - - // Seeking to the beginning of the file causes it to be regenerated. 
- n, err = fd.Impl().Seek(ctx, 0, linux.SEEK_SET) - if n != 0 || err != nil { - t.Fatalf("Seek: got (%d, %v), wanted (0, nil)", n, err) - } - n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{}) - if n != 1 || (err != nil && err != io.EOF) { - t.Fatalf("Read after Seek: got (%d, %v), wanted (1, nil or EOF)", n, err) - } - if want := byte('2'); buf[0] != want { - t.Errorf("Read after Seek: got byte %c, wanted %c", buf[0], want) - } - - // PRead at the beginning of the file also causes it to be regenerated. - n, err = fd.Impl().PRead(ctx, ioseq, 0, ReadOptions{}) - if n != 1 || (err != nil && err != io.EOF) { - t.Fatalf("PRead: got (%d, %v), wanted (1, nil or EOF)", n, err) - } - if want := byte('3'); buf[0] != want { - t.Errorf("PRead: got byte %c, wanted %c", buf[0], want) - } -} diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go deleted file mode 100644 index 7a074b718..000000000 --- a/pkg/sentry/vfs/filesystem.go +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" -) - -// A Filesystem is a tree of nodes represented by Dentries, which forms part of -// a VirtualFilesystem. -// -// Filesystems are reference-counted. Unless otherwise specified, all -// Filesystem methods require that a reference is held. 
-// -// Filesystem is analogous to Linux's struct super_block. -type Filesystem struct { - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 - - // impl is the FilesystemImpl associated with this Filesystem. impl is - // immutable. This should be the last field in Dentry. - impl FilesystemImpl -} - -// Init must be called before first use of fs. -func (fs *Filesystem) Init(impl FilesystemImpl) { - fs.refs = 1 - fs.impl = impl -} - -// Impl returns the FilesystemImpl associated with fs. -func (fs *Filesystem) Impl() FilesystemImpl { - return fs.impl -} - -func (fs *Filesystem) incRef() { - if atomic.AddInt64(&fs.refs, 1) <= 1 { - panic("Filesystem.incRef() called without holding a reference") - } -} - -func (fs *Filesystem) decRef() { - if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { - fs.impl.Release() - } else if refs < 0 { - panic("Filesystem.decRef() called without holding a reference") - } -} - -// FilesystemImpl contains implementation details for a Filesystem. -// Implementations of FilesystemImpl should contain their associated Filesystem -// by value as their first field. -// -// All methods that take a ResolvingPath must resolve the path before -// performing any other checks, including rejection of the operation if not -// supported by the FilesystemImpl. This is because the final FilesystemImpl -// (responsible for actually implementing the operation) isn't known until path -// resolution is complete. -// -// For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid -// should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID -// and auth.KGID respectively). -// -// FilesystemImpl combines elements of Linux's struct super_operations and -// struct inode_operations, for reasons described in the documentation for -// Dentry. -type FilesystemImpl interface { - // Release is called when the associated Filesystem reaches zero - // references. 
- Release() - - // Sync "causes all pending modifications to filesystem metadata and cached - // file data to be written to the underlying [filesystem]", as by syncfs(2). - Sync(ctx context.Context) error - - // GetDentryAt returns a Dentry representing the file at rp. A reference is - // taken on the returned Dentry. - // - // GetDentryAt does not correspond directly to a Linux syscall; it is used - // in the implementation of: - // - // - Syscalls that need to resolve two paths: rename(), renameat(), - // renameat2(), link(), linkat(). - // - // - Syscalls that need to refer to a filesystem position outside the - // context of a file description: chdir(), fchdir(), chroot(), mount(), - // umount(). - GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) - - // LinkAt creates a hard link at rp representing the same file as vd. It - // does not take ownership of references on vd. - // - // The implementation is responsible for checking that vd.Mount() == - // rp.Mount(), and that vd does not represent a directory. - LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error - - // MkdirAt creates a directory at rp. - MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error - - // MknodAt creates a regular file, device special file, or named pipe at - // rp. - MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error - - // OpenAt returns an FileDescription providing access to the file at rp. A - // reference is taken on the returned FileDescription. - OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) - - // ReadlinkAt returns the target of the symbolic link at rp. - ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) - - // RenameAt renames the Dentry represented by vd to rp. It does not take - // ownership of references on vd. - // - // The implementation is responsible for checking that vd.Mount() == - // rp.Mount(). 
- RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error - - // RmdirAt removes the directory at rp. - RmdirAt(ctx context.Context, rp *ResolvingPath) error - - // SetStatAt updates metadata for the file at the given path. - SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error - - // StatAt returns metadata for the file at rp. - StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) - - // StatFSAt returns metadata for the filesystem containing the file at rp. - // (This method takes a path because a FilesystemImpl may consist of any - // number of constituent filesystems.) - StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) - - // SymlinkAt creates a symbolic link at rp referring to the given target. - SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error - - // UnlinkAt removes the non-directory file at rp. - UnlinkAt(ctx context.Context, rp *ResolvingPath) error - - // TODO: d_path(); extended attributes; inotify_add_watch(); bind() -} diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go deleted file mode 100644 index f401ad7f3..000000000 --- a/pkg/sentry/vfs/filesystem_type.go +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package vfs - -import ( - "fmt" - - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" -) - -// A FilesystemType constructs filesystems. -// -// FilesystemType is analogous to Linux's struct file_system_type. -type FilesystemType interface { - // NewFilesystem returns a Filesystem configured by the given options, - // along with its mount root. A reference is taken on the returned - // Filesystem and Dentry. - NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) -} - -// NewFilesystemOptions contains options to FilesystemType.NewFilesystem. -type NewFilesystemOptions struct { - // Data is the string passed as the 5th argument to mount(2), which is - // usually a comma-separated list of filesystem-specific mount options. - Data string - - // InternalData holds opaque FilesystemType-specific data. There is - // intentionally no way for applications to specify InternalData; if it is - // not nil, the call to NewFilesystem originates from within the sentry. - InternalData interface{} -} - -// RegisterFilesystemType registers the given FilesystemType in vfs with the -// given name. -func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType) error { - vfs.fsTypesMu.Lock() - defer vfs.fsTypesMu.Unlock() - if existing, ok := vfs.fsTypes[name]; ok { - return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing) - } - vfs.fsTypes[name] = fsType - return nil -} - -// MustRegisterFilesystemType is equivalent to RegisterFilesystemType but -// panics on failure. 
-func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType) { - if err := vfs.RegisterFilesystemType(name, fsType); err != nil { - panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err)) - } -} - -func (vfs *VirtualFilesystem) getFilesystemType(name string) FilesystemType { - vfs.fsTypesMu.RLock() - defer vfs.fsTypesMu.RUnlock() - return vfs.fsTypes[name] -} diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go deleted file mode 100644 index 11702f720..000000000 --- a/pkg/sentry/vfs/mount.go +++ /dev/null @@ -1,411 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "math" - "sync/atomic" - - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" -) - -// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem -// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem -// (Mount.fs), which applies to path resolution in the context of a particular -// Mount (Mount.key.parent). -// -// Mounts are reference-counted. Unless otherwise specified, all Mount methods -// require that a reference is held. -// -// Mount and Filesystem are distinct types because it's possible for a single -// Filesystem to be mounted at multiple locations and/or in multiple mount -// namespaces. -// -// Mount is analogous to Linux's struct mount. 
(gVisor does not distinguish -// between struct mount and struct vfsmount.) -type Mount struct { - // The lower 63 bits of refs are a reference count. The MSB of refs is set - // if the Mount has been eagerly unmounted, as by umount(2) without the - // MNT_DETACH flag. refs is accessed using atomic memory operations. - refs int64 - - // The lower 63 bits of writers is the number of calls to - // Mount.CheckBeginWrite() that have not yet been paired with a call to - // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. - // writers is accessed using atomic memory operations. - writers int64 - - // key is protected by VirtualFilesystem.mountMu and - // VirtualFilesystem.mounts.seq, and may be nil. References are held on - // key.parent and key.point if they are not nil. - // - // Invariant: key.parent != nil iff key.point != nil. key.point belongs to - // key.parent.fs. - key mountKey - - // fs, root, and ns are immutable. References are held on fs and root (but - // not ns). - // - // Invariant: root belongs to fs. - fs *Filesystem - root *Dentry - ns *MountNamespace -} - -// A MountNamespace is a collection of Mounts. -// -// MountNamespaces are reference-counted. Unless otherwise specified, all -// MountNamespace methods require that a reference is held. -// -// MountNamespace is analogous to Linux's struct mnt_namespace. -type MountNamespace struct { - refs int64 // accessed using atomic memory operations - - // root is the MountNamespace's root mount. root is immutable. - root *Mount - - // mountpoints contains all Dentries which are mount points in this - // namespace. mountpoints is protected by VirtualFilesystem.mountMu. - // - // mountpoints is used to determine if a Dentry can be moved or removed - // (which requires that the Dentry is not a mount point in the calling - // namespace). 
- // - // mountpoints is maintained even if there are no references held on the - // MountNamespace; this is required to ensure that - // VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate - // correctly on unreferenced MountNamespaces. - mountpoints map[*Dentry]struct{} -} - -// NewMountNamespace returns a new mount namespace with a root filesystem -// configured by the given arguments. A reference is taken on the returned -// MountNamespace. -func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *NewFilesystemOptions) (*MountNamespace, error) { - fsType := vfs.getFilesystemType(fsTypeName) - if fsType == nil { - return nil, syserror.ENODEV - } - fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts) - if err != nil { - return nil, err - } - mntns := &MountNamespace{ - refs: 1, - mountpoints: make(map[*Dentry]struct{}), - } - mntns.root = &Mount{ - fs: fs, - root: root, - ns: mntns, - refs: 1, - } - return mntns, nil -} - -// NewMount creates and mounts a new Filesystem. -func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *NewFilesystemOptions) error { - fsType := vfs.getFilesystemType(fsTypeName) - if fsType == nil { - return syserror.ENODEV - } - fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts) - if err != nil { - return err - } - // We can't hold vfs.mountMu while calling FilesystemImpl methods due to - // lock ordering. - vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) - if err != nil { - root.decRef(fs) - fs.decRef() - return err - } - vfs.mountMu.Lock() - for { - if vd.dentry.IsDisowned() { - vfs.mountMu.Unlock() - vd.DecRef() - root.decRef(fs) - fs.decRef() - return syserror.ENOENT - } - // vd might have been mounted over between vfs.GetDentryAt() and - // vfs.mountMu.Lock(). 
- if !vd.dentry.isMounted() { - break - } - nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry) - if nextmnt == nil { - break - } - nextmnt.incRef() - nextmnt.root.incRef(nextmnt.fs) - vd.DecRef() - vd = VirtualDentry{ - mount: nextmnt, - dentry: nextmnt.root, - } - } - // TODO: Linux requires that either both the mount point and the mount root - // are directories, or neither are, and returns ENOTDIR if this is not the - // case. - mntns := vd.mount.ns - mnt := &Mount{ - fs: fs, - root: root, - ns: mntns, - refs: 1, - } - mnt.storeKey(vd.mount, vd.dentry) - atomic.AddUint32(&vd.dentry.mounts, 1) - mntns.mountpoints[vd.dentry] = struct{}{} - vfsmpmounts, ok := vfs.mountpoints[vd.dentry] - if !ok { - vfsmpmounts = make(map[*Mount]struct{}) - vfs.mountpoints[vd.dentry] = vfsmpmounts - } - vfsmpmounts[mnt] = struct{}{} - vfs.mounts.Insert(mnt) - vfs.mountMu.Unlock() - return nil -} - -// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes -// a reference on the returned Mount. If (mnt, d) is not a mount point, -// getMountAt returns nil. -// -// getMountAt is analogous to Linux's fs/namei.c:follow_mount(). -// -// Preconditions: References are held on mnt and d. -func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount { - // The first mount is special-cased: - // - // - The caller is assumed to have checked d.isMounted() already. (This - // isn't a precondition because it doesn't matter for correctness.) - // - // - We return nil, instead of mnt, if there is no mount at (mnt, d). - // - // - We don't drop the caller's references on mnt and d. -retryFirst: - next := vfs.mounts.Lookup(mnt, d) - if next == nil { - return nil - } - if !next.tryIncMountedRef() { - // Raced with umount. - goto retryFirst - } - mnt = next - d = next.root - // We don't need to take Dentry refs anywhere in this function because - // Mounts hold references on Mount.root, which is immutable. 
- for d.isMounted() { - next := vfs.mounts.Lookup(mnt, d) - if next == nil { - break - } - if !next.tryIncMountedRef() { - // Raced with umount. - continue - } - mnt.decRef() - mnt = next - d = next.root - } - return mnt -} - -// getMountpointAt returns the mount point for the stack of Mounts including -// mnt. It takes a reference on the returned Mount and Dentry. If no such mount -// point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil). -// -// Preconditions: References are held on mnt and root. vfsroot is not (mnt, -// mnt.root). -func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) (*Mount, *Dentry) { - // The first mount is special-cased: - // - // - The caller must have already checked mnt against vfsroot. - // - // - We return nil, instead of mnt, if there is no mount point for mnt. - // - // - We don't drop the caller's reference on mnt. -retryFirst: - epoch := vfs.mounts.seq.BeginRead() - parent, point := mnt.loadKey() - if !vfs.mounts.seq.ReadOk(epoch) { - goto retryFirst - } - if parent == nil { - return nil, nil - } - if !parent.tryIncMountedRef() { - // Raced with umount. - goto retryFirst - } - if !point.tryIncRef(parent.fs) { - // Since Mount holds a reference on Mount.key.point, this can only - // happen due to a racing change to Mount.key. - parent.decRef() - goto retryFirst - } - if !vfs.mounts.seq.ReadOk(epoch) { - // Mount.key changed while the references were being taken, so - // (parent, point) may be stale; drop them and retry, exactly as the - // retryNotFirst path below does. - point.decRef(parent.fs) - parent.decRef() - goto retryFirst - } - mnt = parent - d := point - for { - if mnt == vfsroot.mount && d == vfsroot.dentry { - break - } - if d != mnt.root { - break - } - retryNotFirst: - epoch := vfs.mounts.seq.BeginRead() - parent, point := mnt.loadKey() - if !vfs.mounts.seq.ReadOk(epoch) { - goto retryNotFirst - } - if parent == nil { - break - } - if !parent.tryIncMountedRef() { - // Raced with umount. - goto retryNotFirst - } - if !point.tryIncRef(parent.fs) { - // Since Mount holds a reference on Mount.key.point, this can - // only happen due to a racing change to Mount.key. 
- parent.decRef() - goto retryNotFirst - } - if !vfs.mounts.seq.ReadOk(epoch) { - point.decRef(parent.fs) - parent.decRef() - goto retryNotFirst - } - d.decRef(mnt.fs) - mnt.decRef() - mnt = parent - d = point - } - return mnt, d -} - -// tryIncMountedRef increments mnt's reference count and returns true. If mnt's -// reference count is already zero, or has been eagerly unmounted, -// tryIncMountedRef does nothing and returns false. -// -// tryIncMountedRef does not require that a reference is held on mnt. -func (mnt *Mount) tryIncMountedRef() bool { - for { - refs := atomic.LoadInt64(&mnt.refs) - if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted - return false - } - if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) { - return true - } - } -} - -func (mnt *Mount) incRef() { - // In general, negative values for mnt.refs are valid because the MSB is - // the eager-unmount bit. - atomic.AddInt64(&mnt.refs, 1) -} - -func (mnt *Mount) decRef() { - refs := atomic.AddInt64(&mnt.refs, -1) - if refs&^math.MinInt64 == 0 { // mask out MSB - parent, point := mnt.loadKey() - if point != nil { - point.decRef(parent.fs) - parent.decRef() - } - mnt.root.decRef(mnt.fs) - mnt.fs.decRef() - } -} - -// CheckBeginWrite increments the counter of in-progress write operations on -// mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns -// EROFS. -// -// If CheckBeginWrite succeeds, EndWrite must be called when the write -// operation is finished. -func (mnt *Mount) CheckBeginWrite() error { - if atomic.AddInt64(&mnt.writers, 1) < 0 { - atomic.AddInt64(&mnt.writers, -1) - return syserror.EROFS - } - return nil -} - -// EndWrite indicates that a write operation signaled by a previous successful -// call to CheckBeginWrite has finished. -func (mnt *Mount) EndWrite() { - atomic.AddInt64(&mnt.writers, -1) -} - -// Preconditions: VirtualFilesystem.mountMu must be locked for writing. 
-func (mnt *Mount) setReadOnlyLocked(ro bool) error { - if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro { - return nil - } - if ro { - if !atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) { - return syserror.EBUSY - } - return nil - } - // Unset MSB without dropping any temporary increments from failed calls to - // mnt.CheckBeginWrite(). - atomic.AddInt64(&mnt.writers, math.MinInt64) - return nil -} - -// Filesystem returns the mounted Filesystem. It does not take a reference on -// the returned Filesystem. -func (mnt *Mount) Filesystem() *Filesystem { - return mnt.fs -} - -// IncRef increments mntns' reference count. -func (mntns *MountNamespace) IncRef() { - if atomic.AddInt64(&mntns.refs, 1) <= 1 { - panic("MountNamespace.IncRef() called without holding a reference") - } -} - -// DecRef decrements mntns' reference count. It panics if the count drops -// below zero, i.e. if the caller did not hold a reference. -func (mntns *MountNamespace) DecRef() { - if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { - // TODO: unmount mntns.root - } else if refs < 0 { - panic("MountNamespace.DecRef() called without holding a reference") - } -} - -// Root returns mntns' root. A reference is taken on the returned -// VirtualDentry. -func (mntns *MountNamespace) Root() VirtualDentry { - vd := VirtualDentry{ - mount: mntns.root, - dentry: mntns.root.root, - } - vd.IncRef() - return vd -} diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go deleted file mode 100644 index f394d7483..000000000 --- a/pkg/sentry/vfs/mount_test.go +++ /dev/null @@ -1,465 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "fmt" - "runtime" - "sync" - "testing" -) - -func TestMountTableLookupEmpty(t *testing.T) { - var mt mountTable - mt.Init() - - parent := &Mount{} - point := &Dentry{} - if m := mt.Lookup(parent, point); m != nil { - t.Errorf("empty mountTable lookup: got %p, wanted nil", m) - } -} - -func TestMountTableInsertLookup(t *testing.T) { - var mt mountTable - mt.Init() - - mount := &Mount{} - mount.storeKey(&Mount{}, &Dentry{}) - mt.Insert(mount) - - if m := mt.Lookup(mount.parent(), mount.point()); m != mount { - t.Errorf("mountTable positive lookup: got %p, wanted %p", m, mount) - } - - otherParent := &Mount{} - if m := mt.Lookup(otherParent, mount.point()); m != nil { - t.Errorf("mountTable lookup with wrong mount parent: got %p, wanted nil", m) - } - otherPoint := &Dentry{} - if m := mt.Lookup(mount.parent(), otherPoint); m != nil { - t.Errorf("mountTable lookup with wrong mount point: got %p, wanted nil", m) - } -} - -// TODO: concurrent lookup/insertion/removal - -// must be powers of 2 -var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8} - -// For all of the following: -// -// - BenchmarkMountTableFoo tests usage pattern "Foo" for mountTable. -// -// - BenchmarkMountMapFoo tests usage pattern "Foo" for a -// sync.RWMutex-protected map. (Mutator benchmarks do not use a RWMutex, since -// mountTable also requires external synchronization between mutators.) -// -// - BenchmarkMountSyncMapFoo tests usage pattern "Foo" for a sync.Map. 
-// -// ParallelLookup is by far the most common and performance-sensitive operation -// for this application. NegativeLookup is also important, but less so (only -// relevant with multiple mount namespaces and significant differences in -// mounts between them). Insertion and removal are benchmarked for -// completeness. -const enableComparativeBenchmarks = false - -func newBenchMount() *Mount { - mount := &Mount{} - mount.storeKey(&Mount{}, &Dentry{}) - return mount -} - -func vdkey(mnt *Mount) VirtualDentry { - parent, point := mnt.loadKey() - return VirtualDentry{ - mount: parent, - dentry: point, - } -} - -func BenchmarkMountTableParallelLookup(b *testing.B) { - for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { - for _, numMounts := range benchNumMounts { - desc := fmt.Sprintf("%dx%d", numG, numMounts) - b.Run(desc, func(b *testing.B) { - var mt mountTable - mt.Init() - keys := make([]VirtualDentry, 0, numMounts) - for i := 0; i < numMounts; i++ { - mount := newBenchMount() - mt.Insert(mount) - keys = append(keys, vdkey(mount)) - } - - var ready sync.WaitGroup - begin := make(chan struct{}) - var end sync.WaitGroup - for g := 0; g < numG; g++ { - ready.Add(1) - end.Add(1) - go func() { - defer end.Done() - ready.Done() - <-begin - for i := 0; i < b.N; i++ { - k := keys[i&(numMounts-1)] - m := mt.Lookup(k.mount, k.dentry) - if m == nil { - b.Fatalf("lookup failed") - } - if parent := m.parent(); parent != k.mount { - b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) - } - if point := m.point(); point != k.dentry { - b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) - } - } - }() - } - - ready.Wait() - b.ResetTimer() - close(begin) - end.Wait() - }) - } - } -} - -func BenchmarkMountMapParallelLookup(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; 
numG *= 2 { - for _, numMounts := range benchNumMounts { - desc := fmt.Sprintf("%dx%d", numG, numMounts) - b.Run(desc, func(b *testing.B) { - var mu sync.RWMutex - ms := make(map[VirtualDentry]*Mount) - keys := make([]VirtualDentry, 0, numMounts) - for i := 0; i < numMounts; i++ { - mount := newBenchMount() - key := vdkey(mount) - ms[key] = mount - keys = append(keys, key) - } - - var ready sync.WaitGroup - begin := make(chan struct{}) - var end sync.WaitGroup - for g := 0; g < numG; g++ { - ready.Add(1) - end.Add(1) - go func() { - defer end.Done() - ready.Done() - <-begin - for i := 0; i < b.N; i++ { - k := keys[i&(numMounts-1)] - mu.RLock() - m := ms[k] - mu.RUnlock() - if m == nil { - b.Fatalf("lookup failed") - } - if parent := m.parent(); parent != k.mount { - b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) - } - if point := m.point(); point != k.dentry { - b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) - } - } - }() - } - - ready.Wait() - b.ResetTimer() - close(begin) - end.Wait() - }) - } - } -} - -func BenchmarkMountSyncMapParallelLookup(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { - for _, numMounts := range benchNumMounts { - desc := fmt.Sprintf("%dx%d", numG, numMounts) - b.Run(desc, func(b *testing.B) { - var ms sync.Map - keys := make([]VirtualDentry, 0, numMounts) - for i := 0; i < numMounts; i++ { - mount := newBenchMount() - key := vdkey(mount) - ms.Store(key, mount) - keys = append(keys, key) - } - - var ready sync.WaitGroup - begin := make(chan struct{}) - var end sync.WaitGroup - for g := 0; g < numG; g++ { - ready.Add(1) - end.Add(1) - go func() { - defer end.Done() - ready.Done() - <-begin - for i := 0; i < b.N; i++ { - k := keys[i&(numMounts-1)] - mi, ok := ms.Load(k) - if !ok { - b.Fatalf("lookup failed") - } - m := mi.(*Mount) - if parent 
:= m.parent(); parent != k.mount { - b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) - } - if point := m.point(); point != k.dentry { - b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) - } - } - }() - } - - ready.Wait() - b.ResetTimer() - close(begin) - end.Wait() - }) - } - } -} - -func BenchmarkMountTableNegativeLookup(b *testing.B) { - for _, numMounts := range benchNumMounts { - desc := fmt.Sprintf("%d", numMounts) - b.Run(desc, func(b *testing.B) { - var mt mountTable - mt.Init() - for i := 0; i < numMounts; i++ { - mt.Insert(newBenchMount()) - } - negkeys := make([]VirtualDentry, 0, numMounts) - for i := 0; i < numMounts; i++ { - negkeys = append(negkeys, VirtualDentry{ - mount: &Mount{}, - dentry: &Dentry{}, - }) - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - k := negkeys[i&(numMounts-1)] - m := mt.Lookup(k.mount, k.dentry) - if m != nil { - b.Fatalf("lookup got %p, wanted nil", m) - } - } - }) - } -} - -func BenchmarkMountMapNegativeLookup(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - for _, numMounts := range benchNumMounts { - desc := fmt.Sprintf("%d", numMounts) - b.Run(desc, func(b *testing.B) { - var mu sync.RWMutex - ms := make(map[VirtualDentry]*Mount) - for i := 0; i < numMounts; i++ { - mount := newBenchMount() - ms[vdkey(mount)] = mount - } - negkeys := make([]VirtualDentry, 0, numMounts) - for i := 0; i < numMounts; i++ { - negkeys = append(negkeys, VirtualDentry{ - mount: &Mount{}, - dentry: &Dentry{}, - }) - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - k := negkeys[i&(numMounts-1)] - mu.RLock() - m := ms[k] - mu.RUnlock() - if m != nil { - b.Fatalf("lookup got %p, wanted nil", m) - } - } - }) - } -} - -func BenchmarkMountSyncMapNegativeLookup(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - for _, numMounts := range benchNumMounts { - desc := 
fmt.Sprintf("%d", numMounts) - b.Run(desc, func(b *testing.B) { - var ms sync.Map - for i := 0; i < numMounts; i++ { - mount := newBenchMount() - ms.Store(vdkey(mount), mount) - } - negkeys := make([]VirtualDentry, 0, numMounts) - for i := 0; i < numMounts; i++ { - negkeys = append(negkeys, VirtualDentry{ - mount: &Mount{}, - dentry: &Dentry{}, - }) - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - k := negkeys[i&(numMounts-1)] - m, _ := ms.Load(k) - if m != nil { - b.Fatalf("lookup got %p, wanted nil", m) - } - } - }) - } -} - -func BenchmarkMountTableInsert(b *testing.B) { - // Preallocate Mounts so that allocation time isn't included in the - // benchmark. - mounts := make([]*Mount, 0, b.N) - for i := 0; i < b.N; i++ { - mounts = append(mounts, newBenchMount()) - } - - var mt mountTable - mt.Init() - b.ResetTimer() - for i := range mounts { - mt.Insert(mounts[i]) - } -} - -func BenchmarkMountMapInsert(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - // Preallocate Mounts so that allocation time isn't included in the - // benchmark. - mounts := make([]*Mount, 0, b.N) - for i := 0; i < b.N; i++ { - mounts = append(mounts, newBenchMount()) - } - - ms := make(map[VirtualDentry]*Mount) - b.ResetTimer() - for i := range mounts { - mount := mounts[i] - ms[vdkey(mount)] = mount - } -} - -func BenchmarkMountSyncMapInsert(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - // Preallocate Mounts so that allocation time isn't included in the - // benchmark. 
- mounts := make([]*Mount, 0, b.N) - for i := 0; i < b.N; i++ { - mounts = append(mounts, newBenchMount()) - } - - var ms sync.Map - b.ResetTimer() - for i := range mounts { - mount := mounts[i] - ms.Store(vdkey(mount), mount) - } -} - -func BenchmarkMountTableRemove(b *testing.B) { - mounts := make([]*Mount, 0, b.N) - for i := 0; i < b.N; i++ { - mounts = append(mounts, newBenchMount()) - } - var mt mountTable - mt.Init() - for i := range mounts { - mt.Insert(mounts[i]) - } - - b.ResetTimer() - for i := range mounts { - mt.Remove(mounts[i]) - } -} - -func BenchmarkMountMapRemove(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - mounts := make([]*Mount, 0, b.N) - for i := 0; i < b.N; i++ { - mounts = append(mounts, newBenchMount()) - } - ms := make(map[VirtualDentry]*Mount) - for i := range mounts { - mount := mounts[i] - ms[vdkey(mount)] = mount - } - - b.ResetTimer() - for i := range mounts { - mount := mounts[i] - delete(ms, vdkey(mount)) - } -} - -func BenchmarkMountSyncMapRemove(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - mounts := make([]*Mount, 0, b.N) - for i := 0; i < b.N; i++ { - mounts = append(mounts, newBenchMount()) - } - var ms sync.Map - for i := range mounts { - mount := mounts[i] - ms.Store(vdkey(mount), mount) - } - - b.ResetTimer() - for i := range mounts { - mount := mounts[i] - ms.Delete(vdkey(mount)) - } -} diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go deleted file mode 100644 index b0511aa40..000000000 --- a/pkg/sentry/vfs/mount_unsafe.go +++ /dev/null @@ -1,356 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build go1.12 -// +build !go1.14 - -// Check go:linkname function signatures when updating Go version. - -package vfs - -import ( - "fmt" - "math/bits" - "reflect" - "sync/atomic" - "unsafe" - - "gvisor.dev/gvisor/third_party/gvsync" -) - -// mountKey represents the location at which a Mount is mounted. It is -// structurally identical to VirtualDentry, but stores its fields as -// unsafe.Pointer since mutators synchronize with VFS path traversal using -// seqcounts. -type mountKey struct { - parent unsafe.Pointer // *Mount - point unsafe.Pointer // *Dentry -} - -// Invariant: mnt.key's fields are nil. parent and point are non-nil. -func (mnt *Mount) storeKey(parent *Mount, point *Dentry) { - atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent)) - atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point)) -} - -func (mnt *Mount) loadKey() (*Mount, *Dentry) { - return (*Mount)(atomic.LoadPointer(&mnt.key.parent)), (*Dentry)(atomic.LoadPointer(&mnt.key.point)) -} - -func (mnt *Mount) parent() *Mount { - return (*Mount)(atomic.LoadPointer(&mnt.key.parent)) -} - -func (mnt *Mount) point() *Dentry { - return (*Dentry)(atomic.LoadPointer(&mnt.key.point)) -} - -// mountTable maps (mount parent, mount point) pairs to mounts. It supports -// efficient concurrent lookup, even in the presence of concurrent mutators -// (provided mutation is sufficiently uncommon). -// -// mountTable.Init() must be called on new mountTables before use. 
-type mountTable struct { - // mountTable is implemented as a seqcount-protected hash table that - // resolves collisions with linear probing, featuring Robin Hood insertion - // and backward shift deletion. These minimize probe length variance, - // significantly improving the performance of linear probing at high load - // factors. (mountTable doesn't use bucketing, which is the other major - // technique commonly used in high-performance hash tables; the efficiency - // of bucketing is largely due to SIMD lookup, and Go lacks both SIMD - // intrinsics and inline assembly, limiting the performance of this - // approach.) - - seq gvsync.SeqCount - seed uint32 // for hashing keys - - // size holds both length (number of elements) and capacity (number of - // slots): capacity is stored as its base-2 log (referred to as order) in - // the least significant bits of size, and length is stored in the - // remaining bits. Go defines bit shifts >= width of shifted unsigned - // operand as shifting to 0, which differs from x86's SHL, so the Go - // compiler inserts a bounds check for each bit shift unless we mask order - // anyway (cf. runtime.bucketShift()), and length isn't used by lookup; - // thus this bit packing gets us more bits for the length (vs. storing - // length and cap in separate uint32s) for ~free. - size uint64 - - slots unsafe.Pointer // []mountSlot; never nil after Init -} - -type mountSlot struct { - // We don't store keys in slots; instead, we just check Mount.parent and - // Mount.point directly. Any practical use of lookup will need to touch - // Mounts anyway, and comparing hashes means that false positives are - // extremely rare, so this isn't an extra cache line touch overall. 
- value unsafe.Pointer // *Mount - hash uintptr -} - -const ( - mtSizeOrderBits = 6 // log2 of pointer size in bits - mtSizeOrderMask = (1 << mtSizeOrderBits) - 1 - mtSizeOrderOne = 1 - mtSizeLenLSB = mtSizeOrderBits - mtSizeLenOne = 1 << mtSizeLenLSB - mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB - - mountSlotBytes = unsafe.Sizeof(mountSlot{}) - mountKeyBytes = unsafe.Sizeof(mountKey{}) - - // Tuning parameters. - // - // Essentially every mountTable will contain at least /proc, /sys, and - // /dev/shm, so there is ~no reason for mtInitCap to be < 4. - mtInitOrder = 2 - mtInitCap = 1 << mtInitOrder - mtMaxLoadNum = 13 - mtMaxLoadDen = 16 -) - -func init() { - // We can't just define mtSizeOrderBits as follows because Go doesn't have - // constexpr. - if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) { - panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits)) - } - if bits.OnesCount(uint(mountSlotBytes)) != 1 { - panic(fmt.Sprintf("sizeof(mountSlotBytes) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes)) - } - if mtInitCap <= 1 { - panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap)) - } - if mtMaxLoadNum >= mtMaxLoadDen { - panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen)) - } -} - -// Init must be called exactly once on each mountTable before use. -func (mt *mountTable) Init() { - mt.seed = rand32() - mt.size = mtInitOrder - mt.slots = newMountTableSlots(mtInitCap) -} - -func newMountTableSlots(cap uintptr) unsafe.Pointer { - slice := make([]mountSlot, cap, cap) - hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - return unsafe.Pointer(hdr.Data) -} - -// Lookup returns the Mount with the given parent, mounted at the given point. 
-// If no such Mount exists, Lookup returns nil. -// -// Lookup may be called even if there are concurrent mutators of mt. -func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount { - key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)} - hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes) - -loop: - for { - epoch := mt.seq.BeginRead() - size := atomic.LoadUint64(&mt.size) - slots := atomic.LoadPointer(&mt.slots) - if !mt.seq.ReadOk(epoch) { - continue - } - tcap := uintptr(1) << (size & mtSizeOrderMask) - mask := tcap - 1 - off := (hash & mask) * mountSlotBytes - offmask := mask * mountSlotBytes - for { - // This avoids bounds checking. - slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) - slotValue := atomic.LoadPointer(&slot.value) - slotHash := atomic.LoadUintptr(&slot.hash) - if !mt.seq.ReadOk(epoch) { - // The element we're looking for might have been moved into a - // slot we've previously checked, so restart entirely. - continue loop - } - if slotValue == nil { - return nil - } - if slotHash == hash { - mount := (*Mount)(slotValue) - var mountKey mountKey - mountKey.parent = atomic.LoadPointer(&mount.key.parent) - mountKey.point = atomic.LoadPointer(&mount.key.point) - if !mt.seq.ReadOk(epoch) { - continue loop - } - if key == mountKey { - return mount - } - } - off = (off + mountSlotBytes) & offmask - } - } -} - -// Insert inserts the given mount into mt. -// -// Preconditions: There are no concurrent mutators of mt. mt must not already -// contain a Mount with the same mount point and parent. 
-func (mt *mountTable) Insert(mount *Mount) { - hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) - - // We're under the maximum load factor if: - // - // (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen - // (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap - tlen := mt.size >> mtSizeLenLSB - order := mt.size & mtSizeOrderMask - tcap := uintptr(1) << order - if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) { - // Atomically insert the new element into the table. - mt.seq.BeginWrite() - atomic.AddUint64(&mt.size, mtSizeLenOne) - mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash) - mt.seq.EndWrite() - return - } - - // Otherwise, we have to expand. Double the number of slots in the new - // table. - newOrder := order + 1 - if newOrder > mtSizeOrderMask { - panic("mount table size overflow") - } - newCap := uintptr(1) << newOrder - newSlots := newMountTableSlots(newCap) - // Copy existing elements to the new table. - oldCur := mt.slots - // Go does not permit pointers to the end of allocated objects, so we - // must use a pointer to the last element of the old table. The - // following expression is equivalent to - // `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2 - // arithmetic instructions instead of 3. - oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes)) - for { - oldSlot := (*mountSlot)(oldCur) - if oldSlot.value != nil { - // Don't need to lock mt.seq yet since newSlots isn't visible - // to readers. - mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash) - } - if oldCur == oldLast { - break - } - oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes) - } - // Insert the new element into the new table. - mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash) - // Atomically switch to the new table. 
- mt.seq.BeginWrite() - atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne) - atomic.StorePointer(&mt.slots, newSlots) - mt.seq.EndWrite() -} - -// Preconditions: There are no concurrent mutators of the table (slots, cap). -// If the table is visible to readers, then mt.seq must be in a writer critical -// section. cap must be a power of 2. -func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) { - mask := cap - 1 - off := (hash & mask) * mountSlotBytes - offmask := mask * mountSlotBytes - disp := uintptr(0) - for { - slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) - slotValue := slot.value - if slotValue == nil { - atomic.StorePointer(&slot.value, value) - atomic.StoreUintptr(&slot.hash, hash) - return - } - // If we've been displaced farther from our first-probed slot than the - // element stored in this one, swap elements and switch to inserting - // the replaced one. (This is Robin Hood insertion.) - slotHash := slot.hash - slotDisp := ((off / mountSlotBytes) - slotHash) & mask - if disp > slotDisp { - atomic.StorePointer(&slot.value, value) - atomic.StoreUintptr(&slot.hash, hash) - value = slotValue - hash = slotHash - disp = slotDisp - } - off = (off + mountSlotBytes) & offmask - disp++ - } -} - -// Remove removes the given mount from mt. -// -// Preconditions: There are no concurrent mutators of mt. mt must contain -// mount. -func (mt *mountTable) Remove(mount *Mount) { - hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) - tcap := uintptr(1) << (mt.size & mtSizeOrderMask) - mask := tcap - 1 - slots := mt.slots - off := (hash & mask) * mountSlotBytes - offmask := mask * mountSlotBytes - for { - slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) - slotValue := slot.value - if slotValue == unsafe.Pointer(mount) { - // Found the element to remove. 
Move all subsequent elements - // backward until we either find an empty slot, or an element that - // is already in its first-probed slot. (This is backward shift - // deletion.) - mt.seq.BeginWrite() - for { - nextOff := (off + mountSlotBytes) & offmask - nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff)) - nextSlotValue := nextSlot.value - if nextSlotValue == nil { - break - } - nextSlotHash := nextSlot.hash - if (nextOff / mountSlotBytes) == (nextSlotHash & mask) { - break - } - atomic.StorePointer(&slot.value, nextSlotValue) - atomic.StoreUintptr(&slot.hash, nextSlotHash) - off = nextOff - slot = nextSlot - } - atomic.StorePointer(&slot.value, nil) - atomic.AddUint64(&mt.size, mtSizeLenNegOne) - mt.seq.EndWrite() - return - } - if checkInvariants && slotValue == nil { - panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount)) - } - off = (off + mountSlotBytes) & offmask - } -} - -//go:linkname memhash runtime.memhash -func memhash(p unsafe.Pointer, seed, s uintptr) uintptr - -//go:linkname rand32 runtime.fastrand -func rand32() uint32 - -// This is copy/pasted from runtime.noescape(), and is needed because arguments -// apparently escape from all functions defined by linkname. -// -//go:nosplit -func noescape(p unsafe.Pointer) unsafe.Pointer { - x := uintptr(p) - return unsafe.Pointer(x ^ 0) -} diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go deleted file mode 100644 index 187e5410c..000000000 --- a/pkg/sentry/vfs/options.go +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" -) - -// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and -// FilesystemImpl.GetDentryAt(). -type GetDentryOptions struct { - // If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that - // the returned Dentry is a directory for which creds has search - // permission. - CheckSearchable bool -} - -// MkdirOptions contains options to VirtualFilesystem.MkdirAt() and -// FilesystemImpl.MkdirAt(). -type MkdirOptions struct { - // Mode is the file mode bits for the created directory. - Mode uint16 -} - -// MknodOptions contains options to VirtualFilesystem.MknodAt() and -// FilesystemImpl.MknodAt(). -type MknodOptions struct { - // Mode is the file type and mode bits for the created file. - Mode uint16 - - // If Mode specifies a character or block device special file, DevMajor and - // DevMinor are the major and minor device numbers for the created device. - DevMajor uint32 - DevMinor uint32 -} - -// OpenOptions contains options to VirtualFilesystem.OpenAt() and -// FilesystemImpl.OpenAt(). -type OpenOptions struct { - // Flags contains access mode and flags as specified for open(2). - // - // FilesystemImpls are responsible for implementing the following flags: - // O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC, - // O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and - // O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and - // O_NOFOLLOW. 
VFS users are responsible for handling O_CLOEXEC, since file - // descriptors are mostly outside the scope of VFS. - Flags uint32 - - // If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the - // created file. - Mode uint16 -} - -// ReadOptions contains options to FileDescription.PRead(), -// FileDescriptionImpl.PRead(), FileDescription.Read(), and -// FileDescriptionImpl.Read(). -type ReadOptions struct { - // Flags contains flags as specified for preadv2(2). - Flags uint32 -} - -// RenameOptions contains options to VirtualFilesystem.RenameAt() and -// FilesystemImpl.RenameAt(). -type RenameOptions struct { - // Flags contains flags as specified for renameat2(2). - Flags uint32 -} - -// SetStatOptions contains options to VirtualFilesystem.SetStatAt(), -// FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and -// FileDescriptionImpl.SetStat(). -type SetStatOptions struct { - // Stat is the metadata that should be set. Only fields indicated by - // Stat.Mask should be set. - // - // If Stat specifies that a timestamp should be set, - // FilesystemImpl.SetStatAt() and FileDescriptionImpl.SetStat() must - // special-case StatxTimestamp.Nsec == UTIME_NOW as described by - // utimensat(2); however, they do not need to check for StatxTimestamp.Nsec - // == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask - // instead). - Stat linux.Statx -} - -// StatOptions contains options to VirtualFilesystem.StatAt(), -// FilesystemImpl.StatAt(), FileDescription.Stat(), and -// FileDescriptionImpl.Stat(). -type StatOptions struct { - // Mask is the set of fields in the returned Statx that the FilesystemImpl - // or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask. - // - // The FilesystemImpl or FileDescriptionImpl may return fields not - // requested in Mask, and may fail to return fields requested in Mask that - // are not supported by the underlying filesystem implementation, without - // returning an error. 
- Mask uint32 - - // Sync specifies the synchronization required, and is one of - // linux.AT_STATX_SYNC_AS_STAT (which is 0, and therefore the default), - // linux.AT_STATX_SYNC_FORCE_SYNC, or linux.AT_STATX_SYNC_DONT_SYNC. - Sync uint32 -} - -// WriteOptions contains options to FileDescription.PWrite(), -// FileDescriptionImpl.PWrite(), FileDescription.Write(), and -// FileDescriptionImpl.Write(). -type WriteOptions struct { - // Flags contains flags as specified for pwritev2(2). - Flags uint32 -} diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go deleted file mode 100644 index f8e74355c..000000000 --- a/pkg/sentry/vfs/permissions.go +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" -) - -// AccessTypes is a bitmask of Unix file permissions. -type AccessTypes uint16 - -// Bits in AccessTypes. -const ( - MayRead AccessTypes = 4 - MayWrite = 2 - MayExec = 1 -) - -// GenericCheckPermissions checks that creds has the given access rights on a -// file with the given permissions, UID, and GID, subject to the rules of -// fs/namei.c:generic_permission(). isDir is true if the file is a directory. 
-func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir bool, mode uint16, kuid auth.KUID, kgid auth.KGID) error { - // Check permission bits. - perms := mode - if creds.EffectiveKUID == kuid { - perms >>= 6 - } else if creds.InGroup(kgid) { - perms >>= 3 - } - if uint16(ats)&perms == uint16(ats) { - return nil - } - - // Caller capabilities require that the file's KUID and KGID are mapped in - // the caller's user namespace; compare - // kernel/capability.c:privileged_wrt_inode_uidgid(). - if !kuid.In(creds.UserNamespace).Ok() || !kgid.In(creds.UserNamespace).Ok() { - return syserror.EACCES - } - // CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary - // directories, and read arbitrary non-directory files. - if (isDir && (ats&MayWrite == 0)) || ats == MayRead { - if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) { - return nil - } - } - // CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write - // access to non-directory files, and execute access to non-directory files - // for which at least one execute bit is set. - if isDir || (ats&MayExec == 0) || (mode&0111 != 0) { - if creds.HasCapability(linux.CAP_DAC_OVERRIDE) { - return nil - } - } - return syserror.EACCES -} - -// AccessTypesForOpenFlags returns the access types required to open a file -// with the given OpenOptions.Flags. Note that this is NOT the same thing as -// the set of accesses permitted for the opened file: -// -// - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it -// mutates the file), but does not permit the opened to write to the file -// thereafter. -// -// - "Linux reserves the special, nonstandard access mode 3 (binary 11) in -// flags to mean: check for read and write permission on the file and return a -// file descriptor that can't be used for reading or writing." - open(2). 
Thus -// AccessTypesForOpenFlags returns MayRead|MayWrite in this case, but -// filesystems are responsible for ensuring that access is denied. -// -// Use May{Read,Write}FileWithOpenFlags() for these checks instead. -func AccessTypesForOpenFlags(flags uint32) AccessTypes { - switch flags & linux.O_ACCMODE { - case linux.O_RDONLY: - if flags&linux.O_TRUNC != 0 { - return MayRead | MayWrite - } - return MayRead - case linux.O_WRONLY: - return MayWrite - default: - return MayRead | MayWrite - } -} - -// MayReadFileWithOpenFlags returns true if a file with the given open flags -// should be readable. -func MayReadFileWithOpenFlags(flags uint32) bool { - switch flags & linux.O_ACCMODE { - case linux.O_RDONLY, linux.O_RDWR: - return true - default: - return false - } -} - -// MayWriteFileWithOpenFlags returns true if a file with the given open flags -// should be writable. -func MayWriteFileWithOpenFlags(flags uint32) bool { - switch flags & linux.O_ACCMODE { - case linux.O_WRONLY, linux.O_RDWR: - return true - default: - return false - } -} diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go deleted file mode 100644 index 8d05c8583..000000000 --- a/pkg/sentry/vfs/resolving_path.go +++ /dev/null @@ -1,453 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package vfs - -import ( - "fmt" - "sync" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" -) - -// ResolvingPath represents the state of an in-progress path resolution, shared -// between VFS and FilesystemImpl methods that take a path. -// -// From the perspective of FilesystemImpl methods, a ResolvingPath represents a -// starting Dentry on the associated Filesystem (on which a reference is -// already held) and a stream of path components relative to that Dentry. -// -// ResolvingPath is loosely analogous to Linux's struct nameidata. -type ResolvingPath struct { - vfs *VirtualFilesystem - root VirtualDentry // refs borrowed from PathOperation - mount *Mount - start *Dentry - pit fspath.Iterator - - flags uint16 - mustBeDir bool // final file must be a directory? - mustBeDirOrig bool - symlinks uint8 // number of symlinks traversed - symlinksOrig uint8 - curPart uint8 // index into parts - numOrigParts uint8 - - creds *auth.Credentials - - // Data associated with resolve*Errors, stored in ResolvingPath so that - // those errors don't need to allocate. - nextMount *Mount // ref held if not nil - nextStart *Dentry // ref held if not nil - absSymlinkTarget fspath.Path - - // ResolvingPath must track up to two relative paths: the "current" - // relative path, which is updated whenever a relative symlink is - // encountered, and the "original" relative path, which is updated from the - // current relative path by handleError() when resolution must change - // filesystems (due to reaching a mount boundary or absolute symlink) and - // overwrites the current relative path when Restart() is called. - parts [1 + linux.MaxSymlinkTraversals]fspath.Iterator - origParts [1 + linux.MaxSymlinkTraversals]fspath.Iterator -} - -const ( - rpflagsHaveMountRef = 1 << iota // do we hold a reference on mount? - rpflagsHaveStartRef // do we hold a reference on start? 
- rpflagsFollowFinalSymlink // same as PathOperation.FollowFinalSymlink -) - -func init() { - if maxParts := len(ResolvingPath{}.parts); maxParts > 255 { - panic(fmt.Sprintf("uint8 is insufficient to accommodate len(ResolvingPath.parts) (%d)", maxParts)) - } -} - -// Error types that communicate state from the FilesystemImpl-caller, -// VFS-callee side of path resolution (i.e. errors returned by -// ResolvingPath.Resolve*()) to the VFS-caller, FilesystemImpl-callee side -// (i.e. VFS methods => ResolvingPath.handleError()). These are empty structs -// rather than error values because Go doesn't support non-primitive constants, -// so error "constants" are really mutable vars, necessitating somewhat -// expensive interface object comparisons. - -type resolveMountRootError struct{} - -// Error implements error.Error. -func (resolveMountRootError) Error() string { - return "resolving mount root" -} - -type resolveMountPointError struct{} - -// Error implements error.Error. -func (resolveMountPointError) Error() string { - return "resolving mount point" -} - -type resolveAbsSymlinkError struct{} - -// Error implements error.Error. 
-func (resolveAbsSymlinkError) Error() string { - return "resolving absolute symlink" -} - -var resolvingPathPool = sync.Pool{ - New: func() interface{} { - return &ResolvingPath{} - }, -} - -func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) (*ResolvingPath, error) { - path, err := fspath.Parse(pop.Pathname) - if err != nil { - return nil, err - } - rp := resolvingPathPool.Get().(*ResolvingPath) - rp.vfs = vfs - rp.root = pop.Root - rp.mount = pop.Start.mount - rp.start = pop.Start.dentry - rp.pit = path.Begin - rp.flags = 0 - if pop.FollowFinalSymlink { - rp.flags |= rpflagsFollowFinalSymlink - } - rp.mustBeDir = path.Dir - rp.mustBeDirOrig = path.Dir - rp.symlinks = 0 - rp.curPart = 0 - rp.numOrigParts = 1 - rp.creds = creds - rp.parts[0] = path.Begin - rp.origParts[0] = path.Begin - return rp, nil -} - -func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) { - rp.root = VirtualDentry{} - rp.decRefStartAndMount() - rp.mount = nil - rp.start = nil - rp.releaseErrorState() - resolvingPathPool.Put(rp) -} - -func (rp *ResolvingPath) decRefStartAndMount() { - if rp.flags&rpflagsHaveStartRef != 0 { - rp.start.decRef(rp.mount.fs) - } - if rp.flags&rpflagsHaveMountRef != 0 { - rp.mount.decRef() - } -} - -func (rp *ResolvingPath) releaseErrorState() { - if rp.nextStart != nil { - rp.nextStart.decRef(rp.nextMount.fs) - rp.nextStart = nil - } - if rp.nextMount != nil { - rp.nextMount.decRef() - rp.nextMount = nil - } -} - -// VirtualFilesystem returns the containing VirtualFilesystem. -func (rp *ResolvingPath) VirtualFilesystem() *VirtualFilesystem { - return rp.vfs -} - -// Credentials returns the credentials of rp's provider. -func (rp *ResolvingPath) Credentials() *auth.Credentials { - return rp.creds -} - -// Mount returns the Mount on which path resolution is currently occurring. It -// does not take a reference on the returned Mount. 
-func (rp *ResolvingPath) Mount() *Mount { - return rp.mount -} - -// Start returns the starting Dentry represented by rp. It does not take a -// reference on the returned Dentry. -func (rp *ResolvingPath) Start() *Dentry { - return rp.start -} - -// Done returns true if there are no remaining path components in the stream -// represented by rp. -func (rp *ResolvingPath) Done() bool { - // We don't need to check for rp.curPart == 0 because rp.Advance() won't - // set rp.pit to a terminal iterator otherwise. - return !rp.pit.Ok() -} - -// Final returns true if there is exactly one remaining path component in the -// stream represented by rp. -// -// Preconditions: !rp.Done(). -func (rp *ResolvingPath) Final() bool { - return rp.curPart == 0 && !rp.pit.NextOk() -} - -// Component returns the current path component in the stream represented by -// rp. -// -// Preconditions: !rp.Done(). -func (rp *ResolvingPath) Component() string { - if checkInvariants { - if !rp.pit.Ok() { - panic("ResolvingPath.Component() called at end of relative path") - } - } - return rp.pit.String() -} - -// Advance advances the stream of path components represented by rp. -// -// Preconditions: !rp.Done(). -func (rp *ResolvingPath) Advance() { - if checkInvariants { - if !rp.pit.Ok() { - panic("ResolvingPath.Advance() called at end of relative path") - } - } - next := rp.pit.Next() - if next.Ok() || rp.curPart == 0 { // have next component, or at end of path - rp.pit = next - } else { // at end of path segment, continue with next one - rp.curPart-- - rp.pit = rp.parts[rp.curPart-1] - } -} - -// Restart resets the stream of path components represented by rp to its state -// on entry to the current FilesystemImpl method. 
-func (rp *ResolvingPath) Restart() { - rp.pit = rp.origParts[rp.numOrigParts-1] - rp.mustBeDir = rp.mustBeDirOrig - rp.symlinks = rp.symlinksOrig - rp.curPart = rp.numOrigParts - 1 - copy(rp.parts[:], rp.origParts[:rp.numOrigParts]) - rp.releaseErrorState() -} - -func (rp *ResolvingPath) relpathCommit() { - rp.mustBeDirOrig = rp.mustBeDir - rp.symlinksOrig = rp.symlinks - rp.numOrigParts = rp.curPart + 1 - copy(rp.origParts[:rp.curPart], rp.parts[:]) - rp.origParts[rp.curPart] = rp.pit -} - -// ResolveParent returns the VFS parent of d. It does not take a reference on -// the returned Dentry. -// -// Preconditions: There are no concurrent mutators of d. -// -// Postconditions: If the returned error is nil, then the returned Dentry is -// not nil. -func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) { - var parent *Dentry - if d == rp.root.dentry && rp.mount == rp.root.mount { - // At contextual VFS root. - parent = d - } else if d == rp.mount.root { - // At mount root ... - mnt, mntpt := rp.vfs.getMountpointAt(rp.mount, rp.root) - if mnt != nil { - // ... of non-root mount. - rp.nextMount = mnt - rp.nextStart = mntpt - return nil, resolveMountRootError{} - } - // ... of root mount. - parent = d - } else if d.parent == nil { - // At filesystem root. - parent = d - } else { - parent = d.parent - } - if parent.isMounted() { - if mnt := rp.vfs.getMountAt(rp.mount, parent); mnt != nil { - rp.nextMount = mnt - return nil, resolveMountPointError{} - } - } - return parent, nil -} - -// ResolveChild returns the VFS child of d with the given name. It does not -// take a reference on the returned Dentry. If no such child exists, -// ResolveChild returns (nil, nil). -// -// Preconditions: There are no concurrent mutators of d. 
-func (rp *ResolvingPath) ResolveChild(d *Dentry, name string) (*Dentry, error) { - child := d.children[name] - if child == nil { - return nil, nil - } - if child.isMounted() { - if mnt := rp.vfs.getMountAt(rp.mount, child); mnt != nil { - rp.nextMount = mnt - return nil, resolveMountPointError{} - } - } - return child, nil -} - -// ResolveComponent returns the Dentry reached by starting at d and resolving -// the current path component in the stream represented by rp. It does not -// advance the stream. It does not take a reference on the returned Dentry. If -// no such Dentry exists, ResolveComponent returns (nil, nil). -// -// Preconditions: !rp.Done(). There are no concurrent mutators of d. -func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) { - switch pc := rp.Component(); pc { - case ".": - return d, nil - case "..": - return rp.ResolveParent(d) - default: - return rp.ResolveChild(d, pc) - } -} - -// ShouldFollowSymlink returns true if, supposing that the current path -// component in pcs represents a symbolic link, the symbolic link should be -// followed. -// -// Preconditions: !rp.Done(). -func (rp *ResolvingPath) ShouldFollowSymlink() bool { - // Non-final symlinks are always followed. - return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() -} - -// HandleSymlink is called when the current path component is a symbolic link -// to the given target. If the calling Filesystem method should continue path -// traversal, HandleSymlink updates the path component stream to reflect the -// symlink target and returns nil. Otherwise it returns a non-nil error. -// -// Preconditions: !rp.Done(). 
-func (rp *ResolvingPath) HandleSymlink(target string) error { - if rp.symlinks >= linux.MaxSymlinkTraversals { - return syserror.ELOOP - } - targetPath, err := fspath.Parse(target) - if err != nil { - return err - } - rp.symlinks++ - if targetPath.Absolute { - rp.absSymlinkTarget = targetPath - return resolveAbsSymlinkError{} - } - if !targetPath.Begin.Ok() { - panic(fmt.Sprintf("symbolic link has non-empty target %q that is both relative and has no path components?", target)) - } - // Consume the path component that represented the symlink. - rp.Advance() - // Prepend the symlink target to the relative path. - rp.relpathPrepend(targetPath) - return nil -} - -func (rp *ResolvingPath) relpathPrepend(path fspath.Path) { - if rp.pit.Ok() { - rp.parts[rp.curPart] = rp.pit - rp.pit = path.Begin - rp.curPart++ - } else { - // The symlink was the final path component, so now the symlink target - // is the whole path. - rp.pit = path.Begin - // Symlink targets can set rp.mustBeDir (if they end in a trailing /), - // but can't unset it. - if path.Dir { - rp.mustBeDir = true - } - } -} - -func (rp *ResolvingPath) handleError(err error) bool { - switch err.(type) { - case resolveMountRootError: - // Switch to the new Mount. We hold references on the Mount and Dentry - // (from VFS.getMountpointAt()). - rp.decRefStartAndMount() - rp.mount = rp.nextMount - rp.start = rp.nextStart - rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef - rp.nextMount = nil - rp.nextStart = nil - // Commit the previous FileystemImpl's progress through the relative - // path. (Don't consume the path component that caused us to traverse - // through the mount root - i.e. the ".." - because we still need to - // resolve the mount point's parent in the new FilesystemImpl.) - rp.relpathCommit() - // Restart path resolution on the new Mount. Don't bother calling - // rp.releaseErrorState() since we already set nextMount and nextStart - // to nil above. 
- return true - - case resolveMountPointError: - // Switch to the new Mount. We hold a reference on the Mount (from - // VFS.getMountAt()), but borrow the reference on the mount root from - // the Mount. - rp.decRefStartAndMount() - rp.mount = rp.nextMount - rp.start = rp.nextMount.root - rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef - rp.nextMount = nil - // Consume the path component that represented the mount point. - rp.Advance() - // Commit the previous FilesystemImpl's progress through the relative - // path. - rp.relpathCommit() - // Restart path resolution on the new Mount. - rp.releaseErrorState() - return true - - case resolveAbsSymlinkError: - // Switch to the new Mount. References are borrowed from rp.root. - rp.decRefStartAndMount() - rp.mount = rp.root.mount - rp.start = rp.root.dentry - rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef - // Consume the path component that represented the symlink. - rp.Advance() - // Prepend the symlink target to the relative path. - rp.relpathPrepend(rp.absSymlinkTarget) - // Commit the previous FilesystemImpl's progress through the relative - // path, including the symlink target we just prepended. - rp.relpathCommit() - // Restart path resolution on the new Mount. - rp.releaseErrorState() - return true - - default: - // Not an error we can handle. - return false - } -} - -// MustBeDir returns true if the file traversed by rp must be a directory. -func (rp *ResolvingPath) MustBeDir() bool { - return rp.mustBeDir -} diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go deleted file mode 100644 index 23f2b9e08..000000000 --- a/pkg/sentry/vfs/syscalls.go +++ /dev/null @@ -1,217 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" -) - -// PathOperation specifies the path operated on by a VFS method. -// -// PathOperation is passed to VFS methods by pointer to reduce memory copying: -// it's somewhat large and should never escape. (Options structs are passed by -// pointer to VFS and FileDescription methods for the same reason.) -type PathOperation struct { - // Root is the VFS root. References on Root are borrowed from the provider - // of the PathOperation. - // - // Invariants: Root.Ok(). - Root VirtualDentry - - // Start is the starting point for the path traversal. References on Start - // are borrowed from the provider of the PathOperation (i.e. the caller of - // the VFS method to which the PathOperation was passed). - // - // Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root. - Start VirtualDentry - - // Path is the pathname traversed by this operation. - Pathname string - - // If FollowFinalSymlink is true, and the Dentry traversed by the final - // path component represents a symbolic link, the symbolic link should be - // followed. - FollowFinalSymlink bool -} - -// GetDentryAt returns a VirtualDentry representing the given path, at which a -// file must exist. A reference is taken on the returned VirtualDentry. 
-func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return VirtualDentry{}, err - } - for { - d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) - if err == nil { - vd := VirtualDentry{ - mount: rp.mount, - dentry: d, - } - rp.mount.incRef() - vfs.putResolvingPath(rp) - return vd, nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return VirtualDentry{}, err - } - } -} - -// MkdirAt creates a directory at the given path. -func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { - // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is - // also honored." - mkdir(2) - opts.Mode &= 01777 - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return err - } - for { - err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) - if err == nil { - vfs.putResolvingPath(rp) - return nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return err - } - } -} - -// OpenAt returns a FileDescription providing access to the file at the given -// path. A reference is taken on the returned FileDescription. -func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { - // Remove: - // - // - O_LARGEFILE, which we always report in FileDescription status flags - // since only 64-bit architectures are supported at this time. - // - // - O_CLOEXEC, which affects file descriptors and therefore must be - // handled outside of VFS. - // - // - Unknown flags. 
- opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE - // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. - if opts.Flags&linux.O_SYNC != 0 { - opts.Flags |= linux.O_DSYNC - } - // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified - // with O_DIRECTORY and a writable access mode (to ensure that it fails on - // filesystem implementations that do not support it). - if opts.Flags&linux.O_TMPFILE != 0 { - if opts.Flags&linux.O_DIRECTORY == 0 { - return nil, syserror.EINVAL - } - if opts.Flags&linux.O_CREAT != 0 { - return nil, syserror.EINVAL - } - if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { - return nil, syserror.EINVAL - } - } - // O_PATH causes most other flags to be ignored. - if opts.Flags&linux.O_PATH != 0 { - opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH - } - // "On Linux, the following bits are also honored in mode: [S_ISUID, - // S_ISGID, S_ISVTX]" - open(2) - opts.Mode &= 07777 - - if opts.Flags&linux.O_NOFOLLOW != 0 { - pop.FollowFinalSymlink = false - } - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return nil, err - } - if opts.Flags&linux.O_DIRECTORY != 0 { - rp.mustBeDir = true - rp.mustBeDirOrig = true - } - for { - fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) - if err == nil { - vfs.putResolvingPath(rp) - return fd, nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return nil, err - } - } -} - -// StatAt returns metadata for the file at the given path. 
-func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return linux.Statx{}, err - } - for { - stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) - if err == nil { - vfs.putResolvingPath(rp) - return stat, nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return linux.Statx{}, err - } - } -} - -// StatusFlags returns file description status flags. -func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) { - flags, err := fd.impl.StatusFlags(ctx) - flags |= linux.O_LARGEFILE - return flags, err -} - -// SetStatusFlags sets file description status flags. -func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { - return fd.impl.SetStatusFlags(ctx, flags) -} - -// TODO: -// -// - VFS.SyncAllFilesystems() for sync(2) -// -// - Something for syncfs(2) -// -// - VFS.LinkAt() -// -// - VFS.MknodAt() -// -// - VFS.ReadlinkAt() -// -// - VFS.RenameAt() -// -// - VFS.RmdirAt() -// -// - VFS.SetStatAt() -// -// - VFS.StatFSAt() -// -// - VFS.SymlinkAt() -// -// - VFS.UnlinkAt() -// -// - FileDescription.(almost everything) diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go deleted file mode 100644 index 70b192ece..000000000 --- a/pkg/sentry/vfs/testutil.go +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" -) - -// FDTestFilesystemType is a test-only FilesystemType that produces Filesystems -// for which all FilesystemImpl methods taking a path return EPERM. It is used -// to produce Mounts and Dentries for testing of FileDescriptionImpls that do -// not depend on their originating Filesystem. -type FDTestFilesystemType struct{} - -// FDTestFilesystem is a test-only FilesystemImpl produced by -// FDTestFilesystemType. -type FDTestFilesystem struct { - vfsfs Filesystem -} - -// NewFilesystem implements FilesystemType.NewFilesystem. -func (fstype FDTestFilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) { - var fs FDTestFilesystem - fs.vfsfs.Init(&fs) - return &fs.vfsfs, fs.NewDentry(), nil -} - -// Release implements FilesystemImpl.Release. -func (fs *FDTestFilesystem) Release() { -} - -// Sync implements FilesystemImpl.Sync. -func (fs *FDTestFilesystem) Sync(ctx context.Context) error { - return nil -} - -// GetDentryAt implements FilesystemImpl.GetDentryAt. -func (fs *FDTestFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) { - return nil, syserror.EPERM -} - -// LinkAt implements FilesystemImpl.LinkAt. -func (fs *FDTestFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error { - return syserror.EPERM -} - -// MkdirAt implements FilesystemImpl.MkdirAt. -func (fs *FDTestFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error { - return syserror.EPERM -} - -// MknodAt implements FilesystemImpl.MknodAt. 
-func (fs *FDTestFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error { - return syserror.EPERM -} - -// OpenAt implements FilesystemImpl.OpenAt. -func (fs *FDTestFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) { - return nil, syserror.EPERM -} - -// ReadlinkAt implements FilesystemImpl.ReadlinkAt. -func (fs *FDTestFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) { - return "", syserror.EPERM -} - -// RenameAt implements FilesystemImpl.RenameAt. -func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error { - return syserror.EPERM -} - -// RmdirAt implements FilesystemImpl.RmdirAt. -func (fs *FDTestFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error { - return syserror.EPERM -} - -// SetStatAt implements FilesystemImpl.SetStatAt. -func (fs *FDTestFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error { - return syserror.EPERM -} - -// StatAt implements FilesystemImpl.StatAt. -func (fs *FDTestFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) { - return linux.Statx{}, syserror.EPERM -} - -// StatFSAt implements FilesystemImpl.StatFSAt. -func (fs *FDTestFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) { - return linux.Statfs{}, syserror.EPERM -} - -// SymlinkAt implements FilesystemImpl.SymlinkAt. -func (fs *FDTestFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error { - return syserror.EPERM -} - -// UnlinkAt implements FilesystemImpl.UnlinkAt. -func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error { - return syserror.EPERM -} - -type fdTestDentry struct { - vfsd Dentry -} - -// NewDentry returns a new Dentry. 
-func (fs *FDTestFilesystem) NewDentry() *Dentry { - var d fdTestDentry - d.vfsd.Init(&d) - return &d.vfsd -} - -// IncRef implements DentryImpl.IncRef. -func (d *fdTestDentry) IncRef(vfsfs *Filesystem) { -} - -// TryIncRef implements DentryImpl.TryIncRef. -func (d *fdTestDentry) TryIncRef(vfsfs *Filesystem) bool { - return true -} - -// DecRef implements DentryImpl.DecRef. -func (d *fdTestDentry) DecRef(vfsfs *Filesystem) { -} diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go deleted file mode 100644 index 4a8a69540..000000000 --- a/pkg/sentry/vfs/vfs.go +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package vfs implements a virtual filesystem layer. -// -// Lock order: -// -// Filesystem implementation locks -// VirtualFilesystem.mountMu -// VirtualFilesystem.fsTypesMu -package vfs - -import ( - "sync" -) - -// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts. -// -// There is no analogue to the VirtualFilesystem type in Linux, as the -// equivalent state in Linux is global. -type VirtualFilesystem struct { - // mountMu serializes mount mutations. - // - // mountMu is analogous to Linux's namespace_sem. - mountMu sync.RWMutex - - // mounts maps (mount parent, mount point) pairs to mounts. 
(Since mounts - // are uniquely namespaced, including mount parent in the key correctly - // handles both bind mounts and mount namespaces; Linux does the same.) - // Synchronization between mutators and readers is provided by mounts.seq; - // synchronization between mutators is provided by mountMu. - // - // mounts is used to follow mount points during path traversal. We use a - // single table rather than per-Dentry tables to reduce size (and therefore - // cache footprint) for the vast majority of Dentries that are not mount - // points. - // - // mounts is analogous to Linux's mount_hashtable. - mounts mountTable - - // mountpoints maps mount points to mounts at those points in all - // namespaces. mountpoints is protected by mountMu. - // - // mountpoints is used to find mounts that must be unmounted due to - // removal of a mount point Dentry from another mount namespace. ("A file - // or directory that is a mount point in one namespace that is not a mount - // point in another namespace, may be renamed, unlinked, or removed - // (rmdir(2)) in the mount namespace in which it is not a mount point - // (subject to the usual permission checks)." - mount_namespaces(7)) - // - // mountpoints is analogous to Linux's mountpoint_hashtable. - mountpoints map[*Dentry]map[*Mount]struct{} - - // fsTypes contains all FilesystemTypes that are usable in the - // VirtualFilesystem. fsTypes is protected by fsTypesMu. - fsTypesMu sync.RWMutex - fsTypes map[string]FilesystemType -} - -// New returns a new VirtualFilesystem with no mounts or FilesystemTypes. -func New() *VirtualFilesystem { - vfs := &VirtualFilesystem{ - mountpoints: make(map[*Dentry]map[*Mount]struct{}), - fsTypes: make(map[string]FilesystemType), - } - vfs.mounts.Init() - return vfs -} - -// A VirtualDentry represents a node in a VFS tree, by combining a Dentry -// (which represents a node in a Filesystem's tree) and a Mount (which -// represents the Filesystem's position in a VFS mount tree). 
-// -// VirtualDentry's semantics are similar to that of a Go interface object -// representing a pointer: it is a copyable value type that represents -// references to another entity. The zero value of VirtualDentry is an "empty -// VirtualDentry", directly analogous to a nil interface object. -// VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless -// otherwise specified, all other VirtualDentry methods require -// VirtualDentry.Ok() == true. -// -// Mounts and Dentries are reference-counted, requiring that users call -// VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to -// references on the Mount and Dentry referred to by a VirtualDentry as -// references on the VirtualDentry itself. Unless otherwise specified, all -// VirtualDentry methods require that a reference is held on the VirtualDentry. -// -// VirtualDentry is analogous to Linux's struct path. -type VirtualDentry struct { - mount *Mount - dentry *Dentry -} - -// Ok returns true if vd is not empty. It does not require that a reference is -// held. -func (vd VirtualDentry) Ok() bool { - return vd.mount != nil -} - -// IncRef increments the reference counts on the Mount and Dentry represented -// by vd. -func (vd VirtualDentry) IncRef() { - vd.mount.incRef() - vd.dentry.incRef(vd.mount.fs) -} - -// DecRef decrements the reference counts on the Mount and Dentry represented -// by vd. -func (vd VirtualDentry) DecRef() { - vd.dentry.decRef(vd.mount.fs) - vd.mount.decRef() -} - -// Mount returns the Mount associated with vd. It does not take a reference on -// the returned Mount. -func (vd VirtualDentry) Mount() *Mount { - return vd.mount -} - -// Dentry returns the Dentry associated with vd. It does not take a reference -// on the returned Dentry. -func (vd VirtualDentry) Dentry() *Dentry { - return vd.dentry -} |