diff options
Diffstat (limited to 'pkg/sentry/vfs')
-rwxr-xr-x | pkg/sentry/vfs/context.go | 37 | ||||
-rwxr-xr-x | pkg/sentry/vfs/debug.go | 22 | ||||
-rwxr-xr-x | pkg/sentry/vfs/dentry.go | 347 | ||||
-rwxr-xr-x | pkg/sentry/vfs/file_description.go | 216 | ||||
-rwxr-xr-x | pkg/sentry/vfs/file_description_impl_util.go | 254 | ||||
-rwxr-xr-x | pkg/sentry/vfs/filesystem.go | 155 | ||||
-rwxr-xr-x | pkg/sentry/vfs/filesystem_type.go | 70 | ||||
-rwxr-xr-x | pkg/sentry/vfs/mount.go | 411 | ||||
-rwxr-xr-x | pkg/sentry/vfs/mount_unsafe.go | 356 | ||||
-rwxr-xr-x | pkg/sentry/vfs/options.go | 123 | ||||
-rwxr-xr-x | pkg/sentry/vfs/permissions.go | 121 | ||||
-rwxr-xr-x | pkg/sentry/vfs/resolving_path.go | 453 | ||||
-rwxr-xr-x | pkg/sentry/vfs/syscalls.go | 217 | ||||
-rwxr-xr-x | pkg/sentry/vfs/testutil.go | 139 | ||||
-rwxr-xr-x | pkg/sentry/vfs/vfs.go | 135 | ||||
-rwxr-xr-x | pkg/sentry/vfs/vfs_state_autogen.go | 4 |
16 files changed, 3060 insertions, 0 deletions
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go new file mode 100755 index 000000000..32cf9151b --- /dev/null +++ b/pkg/sentry/vfs/context.go @@ -0,0 +1,37 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/sentry/context" +) + +// contextID is this package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxMountNamespace is a Context.Value key for a MountNamespace. + CtxMountNamespace contextID = iota +) + +// MountNamespaceFromContext returns the MountNamespace used by ctx. It does +// not take a reference on the returned MountNamespace. If ctx is not +// associated with a MountNamespace, MountNamespaceFromContext returns nil. +func MountNamespaceFromContext(ctx context.Context) *MountNamespace { + if v := ctx.Value(CtxMountNamespace); v != nil { + return v.(*MountNamespace) + } + return nil +} diff --git a/pkg/sentry/vfs/debug.go b/pkg/sentry/vfs/debug.go new file mode 100755 index 000000000..0ed20f249 --- /dev/null +++ b/pkg/sentry/vfs/debug.go @@ -0,0 +1,22 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +const ( + // If checkInvariants is true, perform runtime checks for invariants + // expected by the vfs package. This is normally disabled since VFS is + // often a hot path. + checkInvariants = false +) diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go new file mode 100755 index 000000000..45912fc58 --- /dev/null +++ b/pkg/sentry/vfs/dentry.go @@ -0,0 +1,347 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/syserror" +) + +// Dentry represents a node in a Filesystem tree which may represent a file. +// +// Dentries are reference-counted. Unless otherwise specified, all Dentry +// methods require that a reference is held. +// +// A Dentry transitions through up to 3 different states through its lifetime: +// +// - Dentries are initially "independent". Independent Dentries have no parent, +// and consequently no name. +// +// - Dentry.InsertChild() causes an independent Dentry to become a "child" of +// another Dentry. A child node has a parent node, and a name in that parent, +// both of which are mutable by DentryMoveChild(). Each child Dentry's name is +// unique within its parent. +// +// - Dentry.RemoveChild() causes a child Dentry to become "disowned". A +// disowned Dentry can still refer to its former parent and its former name in +// said parent, but the disowned Dentry is no longer reachable from its parent, +// and a new Dentry with the same name may become a child of the parent. (This +// is analogous to a struct dentry being "unhashed" in Linux.) +// +// Dentry is loosely analogous to Linux's struct dentry, but: +// +// - VFS does not associate Dentries with inodes. gVisor interacts primarily +// with filesystems that are accessed through filesystem APIs (as opposed to +// raw block devices); many such APIs support only paths and file descriptors, +// and not inodes. Furthermore, when parties outside the scope of VFS can +// rename inodes on such filesystems, VFS generally cannot "follow" the rename, +// both due to synchronization issues and because it may not even be able to +// name the destination path; this implies that it would in fact be *incorrect* +// for Dentries to be associated with inodes on such filesystems. Consequently, +// operations that are inode operations in Linux are FilesystemImpl methods +// and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do +// support inodes may store appropriate state in implementations of DentryImpl. +// +// - VFS does not provide synchronization for mutable Dentry fields, other than +// mount-related ones. +// +// - VFS does not require that Dentries are instantiated for all paths accessed +// through VFS, only those that are tracked beyond the scope of a single +// Filesystem operation. This includes file descriptions, mount points, mount +// roots, process working directories, and chroots. This avoids instantiation +// of Dentries for operations on mutable remote filesystems that can't actually +// cache any state in the Dentry. +// +// - For the reasons above, VFS is not directly responsible for managing Dentry +// lifetime. Dentry reference counts only indicate the extent to which VFS +// requires Dentries to exist; Filesystems may elect to cache or discard +// Dentries with zero references. +type Dentry struct { + // parent is this Dentry's parent in this Filesystem. If this Dentry is + // independent, parent is nil. + parent *Dentry + + // name is this Dentry's name in parent. + name string + + flags uint32 + + // mounts is the number of Mounts for which this Dentry is Mount.point. + // mounts is accessed using atomic memory operations. + mounts uint32 + + // children are child Dentries. + children map[string]*Dentry + + // impl is the DentryImpl associated with this Dentry. impl is immutable. + // This should be the last field in Dentry. + impl DentryImpl +} + +const ( + // dflagsDisownedMask is set in Dentry.flags if the Dentry has been + // disowned. + dflagsDisownedMask = 1 << iota +) + +// Init must be called before first use of d. +func (d *Dentry) Init(impl DentryImpl) { + d.impl = impl +} + +// Impl returns the DentryImpl associated with d. +func (d *Dentry) Impl() DentryImpl { + return d.impl +} + +// DentryImpl contains implementation details for a Dentry. Implementations of +// DentryImpl should contain their associated Dentry by value as their first +// field. +type DentryImpl interface { + // IncRef increments the Dentry's reference count. A Dentry with a non-zero + // reference count must remain coherent with the state of the filesystem. + IncRef(fs *Filesystem) + + // TryIncRef increments the Dentry's reference count and returns true. If + // the Dentry's reference count is zero, TryIncRef may do nothing and + // return false. (It is also permitted to succeed if it can restore the + // guarantee that the Dentry is coherent with the state of the filesystem.) + // + // TryIncRef does not require that a reference is held on the Dentry. + TryIncRef(fs *Filesystem) bool + + // DecRef decrements the Dentry's reference count. + DecRef(fs *Filesystem) +} + +// IsDisowned returns true if d is disowned. +func (d *Dentry) IsDisowned() bool { + return atomic.LoadUint32(&d.flags)&dflagsDisownedMask != 0 +} + +// Preconditions: !d.IsDisowned(). +func (d *Dentry) setDisowned() { + atomic.AddUint32(&d.flags, dflagsDisownedMask) +} + +func (d *Dentry) isMounted() bool { + return atomic.LoadUint32(&d.mounts) != 0 +} + +func (d *Dentry) incRef(fs *Filesystem) { + d.impl.IncRef(fs) +} + +func (d *Dentry) tryIncRef(fs *Filesystem) bool { + return d.impl.TryIncRef(fs) +} + +func (d *Dentry) decRef(fs *Filesystem) { + d.impl.DecRef(fs) +} + +// These functions are exported so that filesystem implementations can use +// them. The vfs package, and users of VFS, should not call these functions. +// Unless otherwise specified, these methods require that there are no +// concurrent mutators of d. + +// Name returns d's name in its parent in its owning Filesystem. If d is +// independent, Name returns an empty string. +func (d *Dentry) Name() string { + return d.name +} + +// Parent returns d's parent in its owning Filesystem. It does not take a +// reference on the returned Dentry. If d is independent, Parent returns nil. +func (d *Dentry) Parent() *Dentry { + return d.parent +} + +// ParentOrSelf is equivalent to Parent, but returns d if d is independent. +func (d *Dentry) ParentOrSelf() *Dentry { + if d.parent == nil { + return d + } + return d.parent +} + +// Child returns d's child with the given name in its owning Filesystem. It +// does not take a reference on the returned Dentry. If no such child exists, +// Child returns nil. +func (d *Dentry) Child(name string) *Dentry { + return d.children[name] +} + +// HasChildren returns true if d has any children. +func (d *Dentry) HasChildren() bool { + return len(d.children) != 0 +} + +// InsertChild makes child a child of d with the given name. +// +// InsertChild is a mutator of d and child. +// +// Preconditions: child must be an independent Dentry. d and child must be from +// the same Filesystem. d must not already have a child with the given name. +func (d *Dentry) InsertChild(child *Dentry, name string) { + if checkInvariants { + if _, ok := d.children[name]; ok { + panic(fmt.Sprintf("parent already contains a child named %q", name)) + } + if child.parent != nil || child.name != "" { + panic(fmt.Sprintf("child is not independent: parent = %v, name = %q", child.parent, child.name)) + } + } + if d.children == nil { + d.children = make(map[string]*Dentry) + } + d.children[name] = child + child.parent = d + child.name = name +} + +// PrepareDeleteDentry must be called before attempting to delete the file +// represented by d. If PrepareDeleteDentry succeeds, the caller must call +// AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. +// +// Preconditions: d is a child Dentry. +func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error { + if checkInvariants { + if d.parent == nil { + panic("d is independent") + } + if d.IsDisowned() { + panic("d is already disowned") + } + } + vfs.mountMu.RLock() + if _, ok := mntns.mountpoints[d]; ok { + vfs.mountMu.RUnlock() + return syserror.EBUSY + } + // Return with vfs.mountMu locked, which will be unlocked by + // AbortDeleteDentry or CommitDeleteDentry. + return nil +} + +// AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion +// fails. +func (vfs *VirtualFilesystem) AbortDeleteDentry() { + vfs.mountMu.RUnlock() +} + +// CommitDeleteDentry must be called after the file represented by d is +// deleted, and causes d to become disowned. +// +// Preconditions: PrepareDeleteDentry was previously called on d. +func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { + delete(d.parent.children, d.name) + d.setDisowned() + // TODO: lazily unmount mounts at d + vfs.mountMu.RUnlock() +} + +// DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as +// appropriate for in-memory filesystems that don't need to ensure that some +// external state change succeeds before committing the deletion. +func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error { + if err := vfs.PrepareDeleteDentry(mntns, d); err != nil { + return err + } + vfs.CommitDeleteDentry(d) + return nil +} + +// PrepareRenameDentry must be called before attempting to rename the file +// represented by from. If to is not nil, it represents the file that will be +// replaced or exchanged by the rename. If PrepareRenameDentry succeeds, the +// caller must call AbortRenameDentry, CommitRenameReplaceDentry, or +// CommitRenameExchangeDentry depending on the rename's outcome. +// +// Preconditions: from is a child Dentry. If to is not nil, it must be a child +// Dentry from the same Filesystem. +func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { + if checkInvariants { + if from.parent == nil { + panic("from is independent") + } + if from.IsDisowned() { + panic("from is already disowned") + } + if to != nil { + if to.parent == nil { + panic("to is independent") + } + if to.IsDisowned() { + panic("to is already disowned") + } + } + } + vfs.mountMu.RLock() + if _, ok := mntns.mountpoints[from]; ok { + vfs.mountMu.RUnlock() + return syserror.EBUSY + } + if to != nil { + if _, ok := mntns.mountpoints[to]; ok { + vfs.mountMu.RUnlock() + return syserror.EBUSY + } + } + // Return with vfs.mountMu locked, which will be unlocked by + // AbortRenameDentry, CommitRenameReplaceDentry, or + // CommitRenameExchangeDentry. + return nil +} + +// AbortRenameDentry must be called after PrepareRenameDentry if the rename +// fails. +func (vfs *VirtualFilesystem) AbortRenameDentry() { + vfs.mountMu.RUnlock() +} + +// CommitRenameReplaceDentry must be called after the file represented by from +// is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file +// that was replaced by from. +// +// Preconditions: PrepareRenameDentry was previously called on from and to. +// newParent.Child(newName) == to. +func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) { + if to != nil { + to.setDisowned() + // TODO: lazily unmount mounts at d + } + if newParent.children == nil { + newParent.children = make(map[string]*Dentry) + } + newParent.children[newName] = from + from.parent = newParent + from.name = newName + vfs.mountMu.RUnlock() +} + +// CommitRenameExchangeDentry must be called after the files represented by +// from and to are exchanged by rename(RENAME_EXCHANGE). +// +// Preconditions: PrepareRenameDentry was previously called on from and to. +func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { + from.parent, to.parent = to.parent, from.parent + from.name, to.name = to.name, from.name + from.parent.children[from.name] = from + to.parent.children[to.name] = to + vfs.mountMu.RUnlock() +} diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go new file mode 100755 index 000000000..3a9665800 --- /dev/null +++ b/pkg/sentry/vfs/file_description.go @@ -0,0 +1,216 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// A FileDescription represents an open file description, which is the entity +// referred to by a file descriptor (POSIX.1-2017 3.258 "Open File +// Description"). +// +// FileDescriptions are reference-counted. Unless otherwise specified, all +// FileDescription methods require that a reference is held. +// +// FileDescription is analogous to Linux's struct file. +type FileDescription struct { + // refs is the reference count. refs is accessed using atomic memory + // operations. + refs int64 + + // vd is the filesystem location at which this FileDescription was opened. + // A reference is held on vd. vd is immutable. + vd VirtualDentry + + // impl is the FileDescriptionImpl associated with this Filesystem. impl is + // immutable. This should be the last field in FileDescription. + impl FileDescriptionImpl +} + +// Init must be called before first use of fd. It takes references on mnt and +// d. +func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) { + fd.refs = 1 + fd.vd = VirtualDentry{ + mount: mnt, + dentry: d, + } + fd.vd.IncRef() + fd.impl = impl +} + +// Impl returns the FileDescriptionImpl associated with fd. +func (fd *FileDescription) Impl() FileDescriptionImpl { + return fd.impl +} + +// VirtualDentry returns the location at which fd was opened. It does not take +// a reference on the returned VirtualDentry. +func (fd *FileDescription) VirtualDentry() VirtualDentry { + return fd.vd +} + +// IncRef increments fd's reference count. +func (fd *FileDescription) IncRef() { + atomic.AddInt64(&fd.refs, 1) +} + +// DecRef decrements fd's reference count. +func (fd *FileDescription) DecRef() { + if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { + fd.impl.Release() + fd.vd.DecRef() + } else if refs < 0 { + panic("FileDescription.DecRef() called without holding a reference") + } +} + +// FileDescriptionImpl contains implementation details for an FileDescription. +// Implementations of FileDescriptionImpl should contain their associated +// FileDescription by value as their first field. +// +// For all functions that return linux.Statx, Statx.Uid and Statx.Gid will +// be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and +// auth.KGID respectively). +// +// FileDescriptionImpl is analogous to Linux's struct file_operations. +type FileDescriptionImpl interface { + // Release is called when the associated FileDescription reaches zero + // references. + Release() + + // OnClose is called when a file descriptor representing the + // FileDescription is closed. Note that returning a non-nil error does not + // prevent the file descriptor from being closed. + OnClose(ctx context.Context) error + + // StatusFlags returns file description status flags, as for + // fcntl(F_GETFL). + StatusFlags(ctx context.Context) (uint32, error) + + // SetStatusFlags sets file description status flags, as for + // fcntl(F_SETFL). + SetStatusFlags(ctx context.Context, flags uint32) error + + // Stat returns metadata for the file represented by the FileDescription. + Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) + + // SetStat updates metadata for the file represented by the + // FileDescription. + SetStat(ctx context.Context, opts SetStatOptions) error + + // StatFS returns metadata for the filesystem containing the file + // represented by the FileDescription. + StatFS(ctx context.Context) (linux.Statfs, error) + + // waiter.Waitable methods may be used to poll for I/O events. + waiter.Waitable + + // PRead reads from the file into dst, starting at the given offset, and + // returns the number of bytes read. PRead is permitted to return partial + // reads with a nil error. + PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) + + // Read is similar to PRead, but does not specify an offset. + // + // For files with an implicit FileDescription offset (e.g. regular files), + // Read begins at the FileDescription offset, and advances the offset by + // the number of bytes read; note that POSIX 2.9.7 "Thread Interactions + // with Regular File Operations" requires that all operations that may + // mutate the FileDescription offset are serialized. + Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) + + // PWrite writes src to the file, starting at the given offset, and returns + // the number of bytes written. PWrite is permitted to return partial + // writes with a nil error. + // + // As in Linux (but not POSIX), if O_APPEND is in effect for the + // FileDescription, PWrite should ignore the offset and append data to the + // end of the file. + PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) + + // Write is similar to PWrite, but does not specify an offset, which is + // implied as for Read. + // + // Write is a FileDescriptionImpl method, instead of a wrapper around + // PWrite that uses a FileDescription offset, to make it possible for + // remote filesystems to implement O_APPEND correctly (i.e. atomically with + // respect to writers outside the scope of VFS). + Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) + + // IterDirents invokes cb on each entry in the directory represented by the + // FileDescription. If IterDirents has been called since the last call to + // Seek, it continues iteration from the end of the last call. + IterDirents(ctx context.Context, cb IterDirentsCallback) error + + // Seek changes the FileDescription offset (assuming one exists) and + // returns its new value. + // + // For directories, if whence == SEEK_SET and offset == 0, the caller is + // rewinddir(), such that Seek "shall also cause the directory stream to + // refer to the current state of the corresponding directory" - + // POSIX.1-2017. + Seek(ctx context.Context, offset int64, whence int32) (int64, error) + + // Sync requests that cached state associated with the file represented by + // the FileDescription is synchronized with persistent storage, and blocks + // until this is complete. + Sync(ctx context.Context) error + + // ConfigureMMap mutates opts to implement mmap(2) for the file. Most + // implementations that support memory mapping can call + // GenericConfigureMMap with the appropriate memmap.Mappable. + ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error + + // Ioctl implements the ioctl(2) syscall. + Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) + + // TODO: extended attributes; file locking +} + +// Dirent holds the information contained in struct linux_dirent64. +type Dirent struct { + // Name is the filename. + Name string + + // Type is the file type, a linux.DT_* constant. + Type uint8 + + // Ino is the inode number. + Ino uint64 + + // NextOff is the offset of the *next* Dirent in the directory; that is, + // FileDescription.Seek(NextOff, SEEK_SET) (as called by seekdir(3)) will + // cause the next call to FileDescription.IterDirents() to yield the next + // Dirent. (The offset of the first Dirent in a directory is always 0.) + NextOff int64 +} + +// IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents. +type IterDirentsCallback interface { + // Handle handles the given iterated Dirent. It returns true if iteration + // should continue, and false if FileDescriptionImpl.IterDirents should + // terminate now and restart with the same Dirent the next time it is + // called. + Handle(dirent Dirent) bool +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go new file mode 100755 index 000000000..4fbad7840 --- /dev/null +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -0,0 +1,254 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "bytes" + "io" + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// The following design pattern is strongly recommended for filesystem +// implementations to adapt: +// - Have a local fileDescription struct (containing FileDescription) which +// embeds FileDescriptionDefaultImpl and overrides the default methods +// which are common to all fd implementations for that for that filesystem +// like StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc. +// - This should be embedded in all file description implementations as the +// first field by value. +// - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl. + +// FileDescriptionDefaultImpl may be embedded by implementations of +// FileDescriptionImpl to obtain implementations of many FileDescriptionImpl +// methods with default behavior analogous to Linux's. +type FileDescriptionDefaultImpl struct{} + +// OnClose implements FileDescriptionImpl.OnClose analogously to +// file_operations::flush == NULL in Linux. +func (FileDescriptionDefaultImpl) OnClose(ctx context.Context) error { + return nil +} + +// StatFS implements FileDescriptionImpl.StatFS analogously to +// super_operations::statfs == NULL in Linux. +func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, error) { + return linux.Statfs{}, syserror.ENOSYS +} + +// Readiness implements waiter.Waitable.Readiness analogously to +// file_operations::poll == NULL in Linux. +func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask { + // include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK + return waiter.EventIn | waiter.EventOut +} + +// EventRegister implements waiter.Waitable.EventRegister analogously to +// file_operations::poll == NULL in Linux. +func (FileDescriptionDefaultImpl) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +} + +// EventUnregister implements waiter.Waitable.EventUnregister analogously to +// file_operations::poll == NULL in Linux. +func (FileDescriptionDefaultImpl) EventUnregister(e *waiter.Entry) { +} + +// PRead implements FileDescriptionImpl.PRead analogously to +// file_operations::read == file_operations::read_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// Read implements FileDescriptionImpl.Read analogously to +// file_operations::read == file_operations::read_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// PWrite implements FileDescriptionImpl.PWrite analogously to +// file_operations::write == file_operations::write_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// Write implements FileDescriptionImpl.Write analogously to +// file_operations::write == file_operations::write_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// IterDirents implements FileDescriptionImpl.IterDirents analogously to +// file_operations::iterate == file_operations::iterate_shared == NULL in +// Linux. +func (FileDescriptionDefaultImpl) IterDirents(ctx context.Context, cb IterDirentsCallback) error { + return syserror.ENOTDIR +} + +// Seek implements FileDescriptionImpl.Seek analogously to +// file_operations::llseek == NULL in Linux. +func (FileDescriptionDefaultImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, syserror.ESPIPE +} + +// Sync implements FileDescriptionImpl.Sync analogously to +// file_operations::fsync == NULL in Linux. +func (FileDescriptionDefaultImpl) Sync(ctx context.Context) error { + return syserror.EINVAL +} + +// ConfigureMMap implements FileDescriptionImpl.ConfigureMMap analogously to +// file_operations::mmap == NULL in Linux. +func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return syserror.ENODEV +} + +// Ioctl implements FileDescriptionImpl.Ioctl analogously to +// file_operations::unlocked_ioctl == NULL in Linux. +func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return 0, syserror.ENOTTY +} + +// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of +// FileDescriptionImpl that always represent directories to obtain +// implementations of non-directory I/O methods that return EISDIR. +type DirectoryFileDescriptionDefaultImpl struct{} + +// PRead implements FileDescriptionImpl.PRead. +func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Read implements FileDescriptionImpl.Read. +func (DirectoryFileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// PWrite implements FileDescriptionImpl.PWrite. +func (DirectoryFileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Write implements FileDescriptionImpl.Write. +func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// DynamicBytesFileDescriptionImpl may be embedded by implementations of +// FileDescriptionImpl that represent read-only regular files whose contents +// are backed by a bytes.Buffer that is regenerated when necessary, consistent +// with Linux's fs/seq_file.c:single_open(). +// +// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first +// use. +type DynamicBytesFileDescriptionImpl struct { + data DynamicBytesSource // immutable + mu sync.Mutex // protects the following fields + buf bytes.Buffer + off int64 + lastRead int64 // offset at which the last Read, PRead, or Seek ended +} + +// DynamicBytesSource represents a data source for a +// DynamicBytesFileDescriptionImpl. +type DynamicBytesSource interface { + // Generate writes the file's contents to buf. + Generate(ctx context.Context, buf *bytes.Buffer) error +} + +// SetDataSource must be called exactly once on fd before first use. +func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) { + fd.data = data +} + +// Preconditions: fd.mu must be locked. +func (fd *DynamicBytesFileDescriptionImpl) preadLocked(ctx context.Context, dst usermem.IOSequence, offset int64, opts *ReadOptions) (int64, error) { + // Regenerate the buffer if it's empty, or before pread() at a new offset. + // Compare fs/seq_file.c:seq_read() => traverse(). + switch { + case offset != fd.lastRead: + fd.buf.Reset() + fallthrough + case fd.buf.Len() == 0: + if err := fd.data.Generate(ctx, &fd.buf); err != nil { + fd.buf.Reset() + // fd.off is not updated in this case. + fd.lastRead = 0 + return 0, err + } + } + bs := fd.buf.Bytes() + if offset >= int64(len(bs)) { + return 0, io.EOF + } + n, err := dst.CopyOut(ctx, bs[offset:]) + fd.lastRead = offset + int64(n) + return int64(n), err +} + +// PRead implements FileDescriptionImpl.PRead. +func (fd *DynamicBytesFileDescriptionImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.preadLocked(ctx, dst, offset, &opts) + fd.mu.Unlock() + return n, err +} + +// Read implements FileDescriptionImpl.Read. +func (fd *DynamicBytesFileDescriptionImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.preadLocked(ctx, dst, fd.off, &opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// Seek implements FileDescriptionImpl.Seek. +func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + switch whence { + case linux.SEEK_SET: + // Use offset as given. + case linux.SEEK_CUR: + offset += fd.off + default: + // fs/seq_file:seq_lseek() rejects SEEK_END etc. + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + if offset != fd.lastRead { + // Regenerate the file's contents immediately. Compare + // fs/seq_file.c:seq_lseek() => traverse(). + fd.buf.Reset() + if err := fd.data.Generate(ctx, &fd.buf); err != nil { + fd.buf.Reset() + fd.off = 0 + fd.lastRead = 0 + return 0, err + } + fd.lastRead = offset + } + fd.off = offset + return offset, nil +} diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go new file mode 100755 index 000000000..7a074b718 --- /dev/null +++ b/pkg/sentry/vfs/filesystem.go @@ -0,0 +1,155 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" +) + +// A Filesystem is a tree of nodes represented by Dentries, which forms part of +// a VirtualFilesystem. +// +// Filesystems are reference-counted. Unless otherwise specified, all +// Filesystem methods require that a reference is held. +// +// Filesystem is analogous to Linux's struct super_block. +type Filesystem struct { + // refs is the reference count. refs is accessed using atomic memory + // operations. + refs int64 + + // impl is the FilesystemImpl associated with this Filesystem. impl is + // immutable. This should be the last field in Dentry. + impl FilesystemImpl +} + +// Init must be called before first use of fs. +func (fs *Filesystem) Init(impl FilesystemImpl) { + fs.refs = 1 + fs.impl = impl +} + +// Impl returns the FilesystemImpl associated with fs. +func (fs *Filesystem) Impl() FilesystemImpl { + return fs.impl +} + +func (fs *Filesystem) incRef() { + if atomic.AddInt64(&fs.refs, 1) <= 1 { + panic("Filesystem.incRef() called without holding a reference") + } +} + +func (fs *Filesystem) decRef() { + if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { + fs.impl.Release() + } else if refs < 0 { + panic("Filesystem.decRef() called without holding a reference") + } +} + +// FilesystemImpl contains implementation details for a Filesystem. +// Implementations of FilesystemImpl should contain their associated Filesystem +// by value as their first field. +// +// All methods that take a ResolvingPath must resolve the path before +// performing any other checks, including rejection of the operation if not +// supported by the FilesystemImpl. This is because the final FilesystemImpl +// (responsible for actually implementing the operation) isn't known until path +// resolution is complete. +// +// For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid +// should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID +// and auth.KGID respectively). +// +// FilesystemImpl combines elements of Linux's struct super_operations and +// struct inode_operations, for reasons described in the documentation for +// Dentry. +type FilesystemImpl interface { + // Release is called when the associated Filesystem reaches zero + // references. + Release() + + // Sync "causes all pending modifications to filesystem metadata and cached + // file data to be written to the underlying [filesystem]", as by syncfs(2). + Sync(ctx context.Context) error + + // GetDentryAt returns a Dentry representing the file at rp. A reference is + // taken on the returned Dentry. + // + // GetDentryAt does not correspond directly to a Linux syscall; it is used + // in the implementation of: + // + // - Syscalls that need to resolve two paths: rename(), renameat(), + // renameat2(), link(), linkat(). + // + // - Syscalls that need to refer to a filesystem position outside the + // context of a file description: chdir(), fchdir(), chroot(), mount(), + // umount(). + GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) + + // LinkAt creates a hard link at rp representing the same file as vd. It + // does not take ownership of references on vd. + // + // The implementation is responsible for checking that vd.Mount() == + // rp.Mount(), and that vd does not represent a directory. + LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error + + // MkdirAt creates a directory at rp. + MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error + + // MknodAt creates a regular file, device special file, or named pipe at + // rp. + MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error + + // OpenAt returns an FileDescription providing access to the file at rp. A + // reference is taken on the returned FileDescription. + OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) + + // ReadlinkAt returns the target of the symbolic link at rp. + ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) + + // RenameAt renames the Dentry represented by vd to rp. It does not take + // ownership of references on vd. + // + // The implementation is responsible for checking that vd.Mount() == + // rp.Mount(). + RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error + + // RmdirAt removes the directory at rp. + RmdirAt(ctx context.Context, rp *ResolvingPath) error + + // SetStatAt updates metadata for the file at the given path. + SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error + + // StatAt returns metadata for the file at rp. + StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) + + // StatFSAt returns metadata for the filesystem containing the file at rp. + // (This method takes a path because a FilesystemImpl may consist of any + // number of constituent filesystems.) + StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) + + // SymlinkAt creates a symbolic link at rp referring to the given target. + SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error + + // UnlinkAt removes the non-directory file at rp. + UnlinkAt(ctx context.Context, rp *ResolvingPath) error + + // TODO: d_path(); extended attributes; inotify_add_watch(); bind() +} diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go new file mode 100755 index 000000000..f401ad7f3 --- /dev/null +++ b/pkg/sentry/vfs/filesystem_type.go @@ -0,0 +1,70 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// A FilesystemType constructs filesystems. +// +// FilesystemType is analogous to Linux's struct file_system_type. +type FilesystemType interface { + // NewFilesystem returns a Filesystem configured by the given options, + // along with its mount root. A reference is taken on the returned + // Filesystem and Dentry. + NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) +} + +// NewFilesystemOptions contains options to FilesystemType.NewFilesystem. +type NewFilesystemOptions struct { + // Data is the string passed as the 5th argument to mount(2), which is + // usually a comma-separated list of filesystem-specific mount options. + Data string + + // InternalData holds opaque FilesystemType-specific data. There is + // intentionally no way for applications to specify InternalData; if it is + // not nil, the call to NewFilesystem originates from within the sentry. + InternalData interface{} +} + +// RegisterFilesystemType registers the given FilesystemType in vfs with the +// given name. +func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType) error { + vfs.fsTypesMu.Lock() + defer vfs.fsTypesMu.Unlock() + if existing, ok := vfs.fsTypes[name]; ok { + return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing) + } + vfs.fsTypes[name] = fsType + return nil +} + +// MustRegisterFilesystemType is equivalent to RegisterFilesystemType but +// panics on failure. +func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType) { + if err := vfs.RegisterFilesystemType(name, fsType); err != nil { + panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err)) + } +} + +func (vfs *VirtualFilesystem) getFilesystemType(name string) FilesystemType { + vfs.fsTypesMu.RLock() + defer vfs.fsTypesMu.RUnlock() + return vfs.fsTypes[name] +} diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go new file mode 100755 index 000000000..11702f720 --- /dev/null +++ b/pkg/sentry/vfs/mount.go @@ -0,0 +1,411 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "math" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem +// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem +// (Mount.fs), which applies to path resolution in the context of a particular +// Mount (Mount.key.parent). +// +// Mounts are reference-counted. Unless otherwise specified, all Mount methods +// require that a reference is held. +// +// Mount and Filesystem are distinct types because it's possible for a single +// Filesystem to be mounted at multiple locations and/or in multiple mount +// namespaces. +// +// Mount is analogous to Linux's struct mount. (gVisor does not distinguish +// between struct mount and struct vfsmount.) +type Mount struct { + // The lower 63 bits of refs are a reference count. The MSB of refs is set + // if the Mount has been eagerly unmounted, as by umount(2) without the + // MNT_DETACH flag. refs is accessed using atomic memory operations. + refs int64 + + // The lower 63 bits of writers is the number of calls to + // Mount.CheckBeginWrite() that have not yet been paired with a call to + // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. + // writers is accessed using atomic memory operations. + writers int64 + + // key is protected by VirtualFilesystem.mountMu and + // VirtualFilesystem.mounts.seq, and may be nil. References are held on + // key.parent and key.point if they are not nil. + // + // Invariant: key.parent != nil iff key.point != nil. key.point belongs to + // key.parent.fs. + key mountKey + + // fs, root, and ns are immutable. References are held on fs and root (but + // not ns). + // + // Invariant: root belongs to fs. + fs *Filesystem + root *Dentry + ns *MountNamespace +} + +// A MountNamespace is a collection of Mounts. +// +// MountNamespaces are reference-counted. Unless otherwise specified, all +// MountNamespace methods require that a reference is held. +// +// MountNamespace is analogous to Linux's struct mnt_namespace. +type MountNamespace struct { + refs int64 // accessed using atomic memory operations + + // root is the MountNamespace's root mount. root is immutable. + root *Mount + + // mountpoints contains all Dentries which are mount points in this + // namespace. mountpoints is protected by VirtualFilesystem.mountMu. + // + // mountpoints is used to determine if a Dentry can be moved or removed + // (which requires that the Dentry is not a mount point in the calling + // namespace). + // + // mountpoints is maintained even if there are no references held on the + // MountNamespace; this is required to ensure that + // VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate + // correctly on unreferenced MountNamespaces. + mountpoints map[*Dentry]struct{} +} + +// NewMountNamespace returns a new mount namespace with a root filesystem +// configured by the given arguments. A reference is taken on the returned +// MountNamespace. +func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *NewFilesystemOptions) (*MountNamespace, error) { + fsType := vfs.getFilesystemType(fsTypeName) + if fsType == nil { + return nil, syserror.ENODEV + } + fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts) + if err != nil { + return nil, err + } + mntns := &MountNamespace{ + refs: 1, + mountpoints: make(map[*Dentry]struct{}), + } + mntns.root = &Mount{ + fs: fs, + root: root, + ns: mntns, + refs: 1, + } + return mntns, nil +} + +// NewMount creates and mounts a new Filesystem. +func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *NewFilesystemOptions) error { + fsType := vfs.getFilesystemType(fsTypeName) + if fsType == nil { + return syserror.ENODEV + } + fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts) + if err != nil { + return err + } + // We can't hold vfs.mountMu while calling FilesystemImpl methods due to + // lock ordering. + vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) + if err != nil { + root.decRef(fs) + fs.decRef() + return err + } + vfs.mountMu.Lock() + for { + if vd.dentry.IsDisowned() { + vfs.mountMu.Unlock() + vd.DecRef() + root.decRef(fs) + fs.decRef() + return syserror.ENOENT + } + // vd might have been mounted over between vfs.GetDentryAt() and + // vfs.mountMu.Lock(). + if !vd.dentry.isMounted() { + break + } + nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry) + if nextmnt == nil { + break + } + nextmnt.incRef() + nextmnt.root.incRef(nextmnt.fs) + vd.DecRef() + vd = VirtualDentry{ + mount: nextmnt, + dentry: nextmnt.root, + } + } + // TODO: Linux requires that either both the mount point and the mount root + // are directories, or neither are, and returns ENOTDIR if this is not the + // case. + mntns := vd.mount.ns + mnt := &Mount{ + fs: fs, + root: root, + ns: mntns, + refs: 1, + } + mnt.storeKey(vd.mount, vd.dentry) + atomic.AddUint32(&vd.dentry.mounts, 1) + mntns.mountpoints[vd.dentry] = struct{}{} + vfsmpmounts, ok := vfs.mountpoints[vd.dentry] + if !ok { + vfsmpmounts = make(map[*Mount]struct{}) + vfs.mountpoints[vd.dentry] = vfsmpmounts + } + vfsmpmounts[mnt] = struct{}{} + vfs.mounts.Insert(mnt) + vfs.mountMu.Unlock() + return nil +} + +// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes +// a reference on the returned Mount. If (mnt, d) is not a mount point, +// getMountAt returns nil. +// +// getMountAt is analogous to Linux's fs/namei.c:follow_mount(). +// +// Preconditions: References are held on mnt and d. +func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount { + // The first mount is special-cased: + // + // - The caller is assumed to have checked d.isMounted() already. (This + // isn't a precondition because it doesn't matter for correctness.) + // + // - We return nil, instead of mnt, if there is no mount at (mnt, d). + // + // - We don't drop the caller's references on mnt and d. +retryFirst: + next := vfs.mounts.Lookup(mnt, d) + if next == nil { + return nil + } + if !next.tryIncMountedRef() { + // Raced with umount. + goto retryFirst + } + mnt = next + d = next.root + // We don't need to take Dentry refs anywhere in this function because + // Mounts hold references on Mount.root, which is immutable. + for d.isMounted() { + next := vfs.mounts.Lookup(mnt, d) + if next == nil { + break + } + if !next.tryIncMountedRef() { + // Raced with umount. + continue + } + mnt.decRef() + mnt = next + d = next.root + } + return mnt +} + +// getMountpointAt returns the mount point for the stack of Mounts including +// mnt. It takes a reference on the returned Mount and Dentry. If no such mount +// point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil). +// +// Preconditions: References are held on mnt and root. vfsroot is not (mnt, +// mnt.root). +func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) (*Mount, *Dentry) { + // The first mount is special-cased: + // + // - The caller must have already checked mnt against vfsroot. + // + // - We return nil, instead of mnt, if there is no mount point for mnt. + // + // - We don't drop the caller's reference on mnt. +retryFirst: + epoch := vfs.mounts.seq.BeginRead() + parent, point := mnt.loadKey() + if !vfs.mounts.seq.ReadOk(epoch) { + goto retryFirst + } + if parent == nil { + return nil, nil + } + if !parent.tryIncMountedRef() { + // Raced with umount. + goto retryFirst + } + if !point.tryIncRef(parent.fs) { + // Since Mount holds a reference on Mount.key.point, this can only + // happen due to a racing change to Mount.key. + parent.decRef() + goto retryFirst + } + mnt = parent + d := point + for { + if mnt == vfsroot.mount && d == vfsroot.dentry { + break + } + if d != mnt.root { + break + } + retryNotFirst: + epoch := vfs.mounts.seq.BeginRead() + parent, point := mnt.loadKey() + if !vfs.mounts.seq.ReadOk(epoch) { + goto retryNotFirst + } + if parent == nil { + break + } + if !parent.tryIncMountedRef() { + // Raced with umount. + goto retryNotFirst + } + if !point.tryIncRef(parent.fs) { + // Since Mount holds a reference on Mount.key.point, this can + // only happen due to a racing change to Mount.key. + parent.decRef() + goto retryNotFirst + } + if !vfs.mounts.seq.ReadOk(epoch) { + point.decRef(parent.fs) + parent.decRef() + goto retryNotFirst + } + d.decRef(mnt.fs) + mnt.decRef() + mnt = parent + d = point + } + return mnt, d +} + +// tryIncMountedRef increments mnt's reference count and returns true. If mnt's +// reference count is already zero, or has been eagerly unmounted, +// tryIncMountedRef does nothing and returns false. +// +// tryIncMountedRef does not require that a reference is held on mnt. +func (mnt *Mount) tryIncMountedRef() bool { + for { + refs := atomic.LoadInt64(&mnt.refs) + if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted + return false + } + if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) { + return true + } + } +} + +func (mnt *Mount) incRef() { + // In general, negative values for mnt.refs are valid because the MSB is + // the eager-unmount bit. + atomic.AddInt64(&mnt.refs, 1) +} + +func (mnt *Mount) decRef() { + refs := atomic.AddInt64(&mnt.refs, -1) + if refs&^math.MinInt64 == 0 { // mask out MSB + parent, point := mnt.loadKey() + if point != nil { + point.decRef(parent.fs) + parent.decRef() + } + mnt.root.decRef(mnt.fs) + mnt.fs.decRef() + } +} + +// CheckBeginWrite increments the counter of in-progress write operations on +// mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns +// EROFS. +// +// If CheckBeginWrite succeeds, EndWrite must be called when the write +// operation is finished. +func (mnt *Mount) CheckBeginWrite() error { + if atomic.AddInt64(&mnt.writers, 1) < 0 { + atomic.AddInt64(&mnt.writers, -1) + return syserror.EROFS + } + return nil +} + +// EndWrite indicates that a write operation signaled by a previous successful +// call to CheckBeginWrite has finished. +func (mnt *Mount) EndWrite() { + atomic.AddInt64(&mnt.writers, -1) +} + +// Preconditions: VirtualFilesystem.mountMu must be locked for writing. +func (mnt *Mount) setReadOnlyLocked(ro bool) error { + if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro { + return nil + } + if ro { + if !atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) { + return syserror.EBUSY + } + return nil + } + // Unset MSB without dropping any temporary increments from failed calls to + // mnt.CheckBeginWrite(). + atomic.AddInt64(&mnt.writers, math.MinInt64) + return nil +} + +// Filesystem returns the mounted Filesystem. It does not take a reference on +// the returned Filesystem. +func (mnt *Mount) Filesystem() *Filesystem { + return mnt.fs +} + +// IncRef increments mntns' reference count. +func (mntns *MountNamespace) IncRef() { + if atomic.AddInt64(&mntns.refs, 1) <= 1 { + panic("MountNamespace.IncRef() called without holding a reference") + } +} + +// DecRef decrements mntns' reference count. +func (mntns *MountNamespace) DecRef() { + if refs := atomic.AddInt64(&mntns.refs, 0); refs == 0 { + // TODO: unmount mntns.root + } else if refs < 0 { + panic("MountNamespace.DecRef() called without holding a reference") + } +} + +// Root returns mntns' root. A reference is taken on the returned +// VirtualDentry. +func (mntns *MountNamespace) Root() VirtualDentry { + vd := VirtualDentry{ + mount: mntns.root, + dentry: mntns.root.root, + } + vd.IncRef() + return vd +} diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go new file mode 100755 index 000000000..b0511aa40 --- /dev/null +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -0,0 +1,356 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build go1.12 +// +build !go1.14 + +// Check go:linkname function signatures when updating Go version. + +package vfs + +import ( + "fmt" + "math/bits" + "reflect" + "sync/atomic" + "unsafe" + + "gvisor.dev/gvisor/third_party/gvsync" +) + +// mountKey represents the location at which a Mount is mounted. It is +// structurally identical to VirtualDentry, but stores its fields as +// unsafe.Pointer since mutators synchronize with VFS path traversal using +// seqcounts. +type mountKey struct { + parent unsafe.Pointer // *Mount + point unsafe.Pointer // *Dentry +} + +// Invariant: mnt.key's fields are nil. parent and point are non-nil. +func (mnt *Mount) storeKey(parent *Mount, point *Dentry) { + atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent)) + atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point)) +} + +func (mnt *Mount) loadKey() (*Mount, *Dentry) { + return (*Mount)(atomic.LoadPointer(&mnt.key.parent)), (*Dentry)(atomic.LoadPointer(&mnt.key.point)) +} + +func (mnt *Mount) parent() *Mount { + return (*Mount)(atomic.LoadPointer(&mnt.key.parent)) +} + +func (mnt *Mount) point() *Dentry { + return (*Dentry)(atomic.LoadPointer(&mnt.key.point)) +} + +// mountTable maps (mount parent, mount point) pairs to mounts. It supports +// efficient concurrent lookup, even in the presence of concurrent mutators +// (provided mutation is sufficiently uncommon). +// +// mountTable.Init() must be called on new mountTables before use. +type mountTable struct { + // mountTable is implemented as a seqcount-protected hash table that + // resolves collisions with linear probing, featuring Robin Hood insertion + // and backward shift deletion. These minimize probe length variance, + // significantly improving the performance of linear probing at high load + // factors. (mountTable doesn't use bucketing, which is the other major + // technique commonly used in high-performance hash tables; the efficiency + // of bucketing is largely due to SIMD lookup, and Go lacks both SIMD + // intrinsics and inline assembly, limiting the performance of this + // approach.) + + seq gvsync.SeqCount + seed uint32 // for hashing keys + + // size holds both length (number of elements) and capacity (number of + // slots): capacity is stored as its base-2 log (referred to as order) in + // the least significant bits of size, and length is stored in the + // remaining bits. Go defines bit shifts >= width of shifted unsigned + // operand as shifting to 0, which differs from x86's SHL, so the Go + // compiler inserts a bounds check for each bit shift unless we mask order + // anyway (cf. runtime.bucketShift()), and length isn't used by lookup; + // thus this bit packing gets us more bits for the length (vs. storing + // length and cap in separate uint32s) for ~free. + size uint64 + + slots unsafe.Pointer // []mountSlot; never nil after Init +} + +type mountSlot struct { + // We don't store keys in slots; instead, we just check Mount.parent and + // Mount.point directly. Any practical use of lookup will need to touch + // Mounts anyway, and comparing hashes means that false positives are + // extremely rare, so this isn't an extra cache line touch overall. + value unsafe.Pointer // *Mount + hash uintptr +} + +const ( + mtSizeOrderBits = 6 // log2 of pointer size in bits + mtSizeOrderMask = (1 << mtSizeOrderBits) - 1 + mtSizeOrderOne = 1 + mtSizeLenLSB = mtSizeOrderBits + mtSizeLenOne = 1 << mtSizeLenLSB + mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB + + mountSlotBytes = unsafe.Sizeof(mountSlot{}) + mountKeyBytes = unsafe.Sizeof(mountKey{}) + + // Tuning parameters. + // + // Essentially every mountTable will contain at least /proc, /sys, and + // /dev/shm, so there is ~no reason for mtInitCap to be < 4. + mtInitOrder = 2 + mtInitCap = 1 << mtInitOrder + mtMaxLoadNum = 13 + mtMaxLoadDen = 16 +) + +func init() { + // We can't just define mtSizeOrderBits as follows because Go doesn't have + // constexpr. + if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) { + panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits)) + } + if bits.OnesCount(uint(mountSlotBytes)) != 1 { + panic(fmt.Sprintf("sizeof(mountSlotBytes) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes)) + } + if mtInitCap <= 1 { + panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap)) + } + if mtMaxLoadNum >= mtMaxLoadDen { + panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen)) + } +} + +// Init must be called exactly once on each mountTable before use. +func (mt *mountTable) Init() { + mt.seed = rand32() + mt.size = mtInitOrder + mt.slots = newMountTableSlots(mtInitCap) +} + +func newMountTableSlots(cap uintptr) unsafe.Pointer { + slice := make([]mountSlot, cap, cap) + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) + return unsafe.Pointer(hdr.Data) +} + +// Lookup returns the Mount with the given parent, mounted at the given point. +// If no such Mount exists, Lookup returns nil. +// +// Lookup may be called even if there are concurrent mutators of mt. +func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount { + key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)} + hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes) + +loop: + for { + epoch := mt.seq.BeginRead() + size := atomic.LoadUint64(&mt.size) + slots := atomic.LoadPointer(&mt.slots) + if !mt.seq.ReadOk(epoch) { + continue + } + tcap := uintptr(1) << (size & mtSizeOrderMask) + mask := tcap - 1 + off := (hash & mask) * mountSlotBytes + offmask := mask * mountSlotBytes + for { + // This avoids bounds checking. + slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) + slotValue := atomic.LoadPointer(&slot.value) + slotHash := atomic.LoadUintptr(&slot.hash) + if !mt.seq.ReadOk(epoch) { + // The element we're looking for might have been moved into a + // slot we've previously checked, so restart entirely. + continue loop + } + if slotValue == nil { + return nil + } + if slotHash == hash { + mount := (*Mount)(slotValue) + var mountKey mountKey + mountKey.parent = atomic.LoadPointer(&mount.key.parent) + mountKey.point = atomic.LoadPointer(&mount.key.point) + if !mt.seq.ReadOk(epoch) { + continue loop + } + if key == mountKey { + return mount + } + } + off = (off + mountSlotBytes) & offmask + } + } +} + +// Insert inserts the given mount into mt. +// +// Preconditions: There are no concurrent mutators of mt. mt must not already +// contain a Mount with the same mount point and parent. +func (mt *mountTable) Insert(mount *Mount) { + hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) + + // We're under the maximum load factor if: + // + // (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen + // (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap + tlen := mt.size >> mtSizeLenLSB + order := mt.size & mtSizeOrderMask + tcap := uintptr(1) << order + if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) { + // Atomically insert the new element into the table. + mt.seq.BeginWrite() + atomic.AddUint64(&mt.size, mtSizeLenOne) + mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash) + mt.seq.EndWrite() + return + } + + // Otherwise, we have to expand. Double the number of slots in the new + // table. + newOrder := order + 1 + if newOrder > mtSizeOrderMask { + panic("mount table size overflow") + } + newCap := uintptr(1) << newOrder + newSlots := newMountTableSlots(newCap) + // Copy existing elements to the new table. + oldCur := mt.slots + // Go does not permit pointers to the end of allocated objects, so we + // must use a pointer to the last element of the old table. The + // following expression is equivalent to + // `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2 + // arithmetic instructions instead of 3. + oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes)) + for { + oldSlot := (*mountSlot)(oldCur) + if oldSlot.value != nil { + // Don't need to lock mt.seq yet since newSlots isn't visible + // to readers. + mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash) + } + if oldCur == oldLast { + break + } + oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes) + } + // Insert the new element into the new table. + mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash) + // Atomically switch to the new table. + mt.seq.BeginWrite() + atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne) + atomic.StorePointer(&mt.slots, newSlots) + mt.seq.EndWrite() +} + +// Preconditions: There are no concurrent mutators of the table (slots, cap). +// If the table is visible to readers, then mt.seq must be in a writer critical +// section. cap must be a power of 2. +func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) { + mask := cap - 1 + off := (hash & mask) * mountSlotBytes + offmask := mask * mountSlotBytes + disp := uintptr(0) + for { + slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) + slotValue := slot.value + if slotValue == nil { + atomic.StorePointer(&slot.value, value) + atomic.StoreUintptr(&slot.hash, hash) + return + } + // If we've been displaced farther from our first-probed slot than the + // element stored in this one, swap elements and switch to inserting + // the replaced one. (This is Robin Hood insertion.) + slotHash := slot.hash + slotDisp := ((off / mountSlotBytes) - slotHash) & mask + if disp > slotDisp { + atomic.StorePointer(&slot.value, value) + atomic.StoreUintptr(&slot.hash, hash) + value = slotValue + hash = slotHash + disp = slotDisp + } + off = (off + mountSlotBytes) & offmask + disp++ + } +} + +// Remove removes the given mount from mt. +// +// Preconditions: There are no concurrent mutators of mt. mt must contain +// mount. +func (mt *mountTable) Remove(mount *Mount) { + hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) + tcap := uintptr(1) << (mt.size & mtSizeOrderMask) + mask := tcap - 1 + slots := mt.slots + off := (hash & mask) * mountSlotBytes + offmask := mask * mountSlotBytes + for { + slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) + slotValue := slot.value + if slotValue == unsafe.Pointer(mount) { + // Found the element to remove. Move all subsequent elements + // backward until we either find an empty slot, or an element that + // is already in its first-probed slot. (This is backward shift + // deletion.) + mt.seq.BeginWrite() + for { + nextOff := (off + mountSlotBytes) & offmask + nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff)) + nextSlotValue := nextSlot.value + if nextSlotValue == nil { + break + } + nextSlotHash := nextSlot.hash + if (nextOff / mountSlotBytes) == (nextSlotHash & mask) { + break + } + atomic.StorePointer(&slot.value, nextSlotValue) + atomic.StoreUintptr(&slot.hash, nextSlotHash) + off = nextOff + slot = nextSlot + } + atomic.StorePointer(&slot.value, nil) + atomic.AddUint64(&mt.size, mtSizeLenNegOne) + mt.seq.EndWrite() + return + } + if checkInvariants && slotValue == nil { + panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount)) + } + off = (off + mountSlotBytes) & offmask + } +} + +//go:linkname memhash runtime.memhash +func memhash(p unsafe.Pointer, seed, s uintptr) uintptr + +//go:linkname rand32 runtime.fastrand +func rand32() uint32 + +// This is copy/pasted from runtime.noescape(), and is needed because arguments +// apparently escape from all functions defined by linkname. +// +//go:nosplit +func noescape(p unsafe.Pointer) unsafe.Pointer { + x := uintptr(p) + return unsafe.Pointer(x ^ 0) +} diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go new file mode 100755 index 000000000..3aa73d911 --- /dev/null +++ b/pkg/sentry/vfs/options.go @@ -0,0 +1,123 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" +) + +// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and +// FilesystemImpl.GetDentryAt(). +type GetDentryOptions struct { + // If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that + // the returned Dentry is a directory for which creds has search + // permission. + CheckSearchable bool +} + +// MkdirOptions contains options to VirtualFilesystem.MkdirAt() and +// FilesystemImpl.MkdirAt(). +type MkdirOptions struct { + // Mode is the file mode bits for the created directory. + Mode linux.FileMode +} + +// MknodOptions contains options to VirtualFilesystem.MknodAt() and +// FilesystemImpl.MknodAt(). +type MknodOptions struct { + // Mode is the file type and mode bits for the created file. + Mode linux.FileMode + + // If Mode specifies a character or block device special file, DevMajor and + // DevMinor are the major and minor device numbers for the created device. + DevMajor uint32 + DevMinor uint32 +} + +// OpenOptions contains options to VirtualFilesystem.OpenAt() and +// FilesystemImpl.OpenAt(). +type OpenOptions struct { + // Flags contains access mode and flags as specified for open(2). + // + // FilesystemImpls is reponsible for implementing the following flags: + // O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC, + // O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and + // O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and + // O_NOFOLLOW. VFS users are responsible for handling O_CLOEXEC, since file + // descriptors are mostly outside the scope of VFS. + Flags uint32 + + // If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the + // created file. + Mode linux.FileMode +} + +// ReadOptions contains options to FileDescription.PRead(), +// FileDescriptionImpl.PRead(), FileDescription.Read(), and +// FileDescriptionImpl.Read(). +type ReadOptions struct { + // Flags contains flags as specified for preadv2(2). + Flags uint32 +} + +// RenameOptions contains options to VirtualFilesystem.RenameAt() and +// FilesystemImpl.RenameAt(). +type RenameOptions struct { + // Flags contains flags as specified for renameat2(2). + Flags uint32 +} + +// SetStatOptions contains options to VirtualFilesystem.SetStatAt(), +// FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and +// FileDescriptionImpl.SetStat(). +type SetStatOptions struct { + // Stat is the metadata that should be set. Only fields indicated by + // Stat.Mask should be set. + // + // If Stat specifies that a timestamp should be set, + // FilesystemImpl.SetStatAt() and FileDescriptionImpl.SetStat() must + // special-case StatxTimestamp.Nsec == UTIME_NOW as described by + // utimensat(2); however, they do not need to check for StatxTimestamp.Nsec + // == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask + // instead). + Stat linux.Statx +} + +// StatOptions contains options to VirtualFilesystem.StatAt(), +// FilesystemImpl.StatAt(), FileDescription.Stat(), and +// FileDescriptionImpl.Stat(). +type StatOptions struct { + // Mask is the set of fields in the returned Statx that the FilesystemImpl + // or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask. + // + // The FilesystemImpl or FileDescriptionImpl may return fields not + // requested in Mask, and may fail to return fields requested in Mask that + // are not supported by the underlying filesystem implementation, without + // returning an error. + Mask uint32 + + // Sync specifies the synchronization required, and is one of + // linux.AT_STATX_SYNC_AS_STAT (which is 0, and therefore the default), + // linux.AT_STATX_SYNC_FORCE_SYNC, or linux.AT_STATX_SYNC_DONT_SYNC. + Sync uint32 +} + +// WriteOptions contains options to FileDescription.PWrite(), +// FileDescriptionImpl.PWrite(), FileDescription.Write(), and +// FileDescriptionImpl.Write(). +type WriteOptions struct { + // Flags contains flags as specified for pwritev2(2). + Flags uint32 +} diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go new file mode 100755 index 000000000..f8e74355c --- /dev/null +++ b/pkg/sentry/vfs/permissions.go @@ -0,0 +1,121 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// AccessTypes is a bitmask of Unix file permissions. +type AccessTypes uint16 + +// Bits in AccessTypes. +const ( + MayRead AccessTypes = 4 + MayWrite = 2 + MayExec = 1 +) + +// GenericCheckPermissions checks that creds has the given access rights on a +// file with the given permissions, UID, and GID, subject to the rules of +// fs/namei.c:generic_permission(). isDir is true if the file is a directory. +func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir bool, mode uint16, kuid auth.KUID, kgid auth.KGID) error { + // Check permission bits. + perms := mode + if creds.EffectiveKUID == kuid { + perms >>= 6 + } else if creds.InGroup(kgid) { + perms >>= 3 + } + if uint16(ats)&perms == uint16(ats) { + return nil + } + + // Caller capabilities require that the file's KUID and KGID are mapped in + // the caller's user namespace; compare + // kernel/capability.c:privileged_wrt_inode_uidgid(). + if !kuid.In(creds.UserNamespace).Ok() || !kgid.In(creds.UserNamespace).Ok() { + return syserror.EACCES + } + // CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary + // directories, and read arbitrary non-directory files. + if (isDir && (ats&MayWrite == 0)) || ats == MayRead { + if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) { + return nil + } + } + // CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write + // access to non-directory files, and execute access to non-directory files + // for which at least one execute bit is set. + if isDir || (ats&MayExec == 0) || (mode&0111 != 0) { + if creds.HasCapability(linux.CAP_DAC_OVERRIDE) { + return nil + } + } + return syserror.EACCES +} + +// AccessTypesForOpenFlags returns the access types required to open a file +// with the given OpenOptions.Flags. Note that this is NOT the same thing as +// the set of accesses permitted for the opened file: +// +// - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it +// mutates the file), but does not permit the opened to write to the file +// thereafter. +// +// - "Linux reserves the special, nonstandard access mode 3 (binary 11) in +// flags to mean: check for read and write permission on the file and return a +// file descriptor that can't be used for reading or writing." - open(2). Thus +// AccessTypesForOpenFlags returns MayRead|MayWrite in this case, but +// filesystems are responsible for ensuring that access is denied. +// +// Use May{Read,Write}FileWithOpenFlags() for these checks instead. +func AccessTypesForOpenFlags(flags uint32) AccessTypes { + switch flags & linux.O_ACCMODE { + case linux.O_RDONLY: + if flags&linux.O_TRUNC != 0 { + return MayRead | MayWrite + } + return MayRead + case linux.O_WRONLY: + return MayWrite + default: + return MayRead | MayWrite + } +} + +// MayReadFileWithOpenFlags returns true if a file with the given open flags +// should be readable. +func MayReadFileWithOpenFlags(flags uint32) bool { + switch flags & linux.O_ACCMODE { + case linux.O_RDONLY, linux.O_RDWR: + return true + default: + return false + } +} + +// MayWriteFileWithOpenFlags returns true if a file with the given open flags +// should be writable. +func MayWriteFileWithOpenFlags(flags uint32) bool { + switch flags & linux.O_ACCMODE { + case linux.O_WRONLY, linux.O_RDWR: + return true + default: + return false + } +} diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go new file mode 100755 index 000000000..8d05c8583 --- /dev/null +++ b/pkg/sentry/vfs/resolving_path.go @@ -0,0 +1,453 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// ResolvingPath represents the state of an in-progress path resolution, shared +// between VFS and FilesystemImpl methods that take a path. +// +// From the perspective of FilesystemImpl methods, a ResolvingPath represents a +// starting Dentry on the associated Filesystem (on which a reference is +// already held) and a stream of path components relative to that Dentry. +// +// ResolvingPath is loosely analogous to Linux's struct nameidata. +type ResolvingPath struct { + vfs *VirtualFilesystem + root VirtualDentry // refs borrowed from PathOperation + mount *Mount + start *Dentry + pit fspath.Iterator + + flags uint16 + mustBeDir bool // final file must be a directory? + mustBeDirOrig bool + symlinks uint8 // number of symlinks traversed + symlinksOrig uint8 + curPart uint8 // index into parts + numOrigParts uint8 + + creds *auth.Credentials + + // Data associated with resolve*Errors, stored in ResolvingPath so that + // those errors don't need to allocate. + nextMount *Mount // ref held if not nil + nextStart *Dentry // ref held if not nil + absSymlinkTarget fspath.Path + + // ResolvingPath must track up to two relative paths: the "current" + // relative path, which is updated whenever a relative symlink is + // encountered, and the "original" relative path, which is updated from the + // current relative path by handleError() when resolution must change + // filesystems (due to reaching a mount boundary or absolute symlink) and + // overwrites the current relative path when Restart() is called. + parts [1 + linux.MaxSymlinkTraversals]fspath.Iterator + origParts [1 + linux.MaxSymlinkTraversals]fspath.Iterator +} + +const ( + rpflagsHaveMountRef = 1 << iota // do we hold a reference on mount? + rpflagsHaveStartRef // do we hold a reference on start? + rpflagsFollowFinalSymlink // same as PathOperation.FollowFinalSymlink +) + +func init() { + if maxParts := len(ResolvingPath{}.parts); maxParts > 255 { + panic(fmt.Sprintf("uint8 is insufficient to accommodate len(ResolvingPath.parts) (%d)", maxParts)) + } +} + +// Error types that communicate state from the FilesystemImpl-caller, +// VFS-callee side of path resolution (i.e. errors returned by +// ResolvingPath.Resolve*()) to the VFS-caller, FilesystemImpl-callee side +// (i.e. VFS methods => ResolvingPath.handleError()). These are empty structs +// rather than error values because Go doesn't support non-primitive constants, +// so error "constants" are really mutable vars, necessitating somewhat +// expensive interface object comparisons. + +type resolveMountRootError struct{} + +// Error implements error.Error. +func (resolveMountRootError) Error() string { + return "resolving mount root" +} + +type resolveMountPointError struct{} + +// Error implements error.Error. +func (resolveMountPointError) Error() string { + return "resolving mount point" +} + +type resolveAbsSymlinkError struct{} + +// Error implements error.Error. +func (resolveAbsSymlinkError) Error() string { + return "resolving absolute symlink" +} + +var resolvingPathPool = sync.Pool{ + New: func() interface{} { + return &ResolvingPath{} + }, +} + +func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) (*ResolvingPath, error) { + path, err := fspath.Parse(pop.Pathname) + if err != nil { + return nil, err + } + rp := resolvingPathPool.Get().(*ResolvingPath) + rp.vfs = vfs + rp.root = pop.Root + rp.mount = pop.Start.mount + rp.start = pop.Start.dentry + rp.pit = path.Begin + rp.flags = 0 + if pop.FollowFinalSymlink { + rp.flags |= rpflagsFollowFinalSymlink + } + rp.mustBeDir = path.Dir + rp.mustBeDirOrig = path.Dir + rp.symlinks = 0 + rp.curPart = 0 + rp.numOrigParts = 1 + rp.creds = creds + rp.parts[0] = path.Begin + rp.origParts[0] = path.Begin + return rp, nil +} + +func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) { + rp.root = VirtualDentry{} + rp.decRefStartAndMount() + rp.mount = nil + rp.start = nil + rp.releaseErrorState() + resolvingPathPool.Put(rp) +} + +func (rp *ResolvingPath) decRefStartAndMount() { + if rp.flags&rpflagsHaveStartRef != 0 { + rp.start.decRef(rp.mount.fs) + } + if rp.flags&rpflagsHaveMountRef != 0 { + rp.mount.decRef() + } +} + +func (rp *ResolvingPath) releaseErrorState() { + if rp.nextStart != nil { + rp.nextStart.decRef(rp.nextMount.fs) + rp.nextStart = nil + } + if rp.nextMount != nil { + rp.nextMount.decRef() + rp.nextMount = nil + } +} + +// VirtualFilesystem returns the containing VirtualFilesystem. +func (rp *ResolvingPath) VirtualFilesystem() *VirtualFilesystem { + return rp.vfs +} + +// Credentials returns the credentials of rp's provider. +func (rp *ResolvingPath) Credentials() *auth.Credentials { + return rp.creds +} + +// Mount returns the Mount on which path resolution is currently occurring. It +// does not take a reference on the returned Mount. +func (rp *ResolvingPath) Mount() *Mount { + return rp.mount +} + +// Start returns the starting Dentry represented by rp. It does not take a +// reference on the returned Dentry. +func (rp *ResolvingPath) Start() *Dentry { + return rp.start +} + +// Done returns true if there are no remaining path components in the stream +// represented by rp. +func (rp *ResolvingPath) Done() bool { + // We don't need to check for rp.curPart == 0 because rp.Advance() won't + // set rp.pit to a terminal iterator otherwise. + return !rp.pit.Ok() +} + +// Final returns true if there is exactly one remaining path component in the +// stream represented by rp. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) Final() bool { + return rp.curPart == 0 && !rp.pit.NextOk() +} + +// Component returns the current path component in the stream represented by +// rp. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) Component() string { + if checkInvariants { + if !rp.pit.Ok() { + panic("ResolvingPath.Component() called at end of relative path") + } + } + return rp.pit.String() +} + +// Advance advances the stream of path components represented by rp. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) Advance() { + if checkInvariants { + if !rp.pit.Ok() { + panic("ResolvingPath.Advance() called at end of relative path") + } + } + next := rp.pit.Next() + if next.Ok() || rp.curPart == 0 { // have next component, or at end of path + rp.pit = next + } else { // at end of path segment, continue with next one + rp.curPart-- + rp.pit = rp.parts[rp.curPart-1] + } +} + +// Restart resets the stream of path components represented by rp to its state +// on entry to the current FilesystemImpl method. +func (rp *ResolvingPath) Restart() { + rp.pit = rp.origParts[rp.numOrigParts-1] + rp.mustBeDir = rp.mustBeDirOrig + rp.symlinks = rp.symlinksOrig + rp.curPart = rp.numOrigParts - 1 + copy(rp.parts[:], rp.origParts[:rp.numOrigParts]) + rp.releaseErrorState() +} + +func (rp *ResolvingPath) relpathCommit() { + rp.mustBeDirOrig = rp.mustBeDir + rp.symlinksOrig = rp.symlinks + rp.numOrigParts = rp.curPart + 1 + copy(rp.origParts[:rp.curPart], rp.parts[:]) + rp.origParts[rp.curPart] = rp.pit +} + +// ResolveParent returns the VFS parent of d. It does not take a reference on +// the returned Dentry. +// +// Preconditions: There are no concurrent mutators of d. +// +// Postconditions: If the returned error is nil, then the returned Dentry is +// not nil. +func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) { + var parent *Dentry + if d == rp.root.dentry && rp.mount == rp.root.mount { + // At contextual VFS root. + parent = d + } else if d == rp.mount.root { + // At mount root ... + mnt, mntpt := rp.vfs.getMountpointAt(rp.mount, rp.root) + if mnt != nil { + // ... of non-root mount. + rp.nextMount = mnt + rp.nextStart = mntpt + return nil, resolveMountRootError{} + } + // ... of root mount. + parent = d + } else if d.parent == nil { + // At filesystem root. + parent = d + } else { + parent = d.parent + } + if parent.isMounted() { + if mnt := rp.vfs.getMountAt(rp.mount, parent); mnt != nil { + rp.nextMount = mnt + return nil, resolveMountPointError{} + } + } + return parent, nil +} + +// ResolveChild returns the VFS child of d with the given name. It does not +// take a reference on the returned Dentry. If no such child exists, +// ResolveChild returns (nil, nil). +// +// Preconditions: There are no concurrent mutators of d. +func (rp *ResolvingPath) ResolveChild(d *Dentry, name string) (*Dentry, error) { + child := d.children[name] + if child == nil { + return nil, nil + } + if child.isMounted() { + if mnt := rp.vfs.getMountAt(rp.mount, child); mnt != nil { + rp.nextMount = mnt + return nil, resolveMountPointError{} + } + } + return child, nil +} + +// ResolveComponent returns the Dentry reached by starting at d and resolving +// the current path component in the stream represented by rp. It does not +// advance the stream. It does not take a reference on the returned Dentry. If +// no such Dentry exists, ResolveComponent returns (nil, nil). +// +// Preconditions: !rp.Done(). There are no concurrent mutators of d. +func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) { + switch pc := rp.Component(); pc { + case ".": + return d, nil + case "..": + return rp.ResolveParent(d) + default: + return rp.ResolveChild(d, pc) + } +} + +// ShouldFollowSymlink returns true if, supposing that the current path +// component in pcs represents a symbolic link, the symbolic link should be +// followed. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) ShouldFollowSymlink() bool { + // Non-final symlinks are always followed. + return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() +} + +// HandleSymlink is called when the current path component is a symbolic link +// to the given target. If the calling Filesystem method should continue path +// traversal, HandleSymlink updates the path component stream to reflect the +// symlink target and returns nil. Otherwise it returns a non-nil error. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) HandleSymlink(target string) error { + if rp.symlinks >= linux.MaxSymlinkTraversals { + return syserror.ELOOP + } + targetPath, err := fspath.Parse(target) + if err != nil { + return err + } + rp.symlinks++ + if targetPath.Absolute { + rp.absSymlinkTarget = targetPath + return resolveAbsSymlinkError{} + } + if !targetPath.Begin.Ok() { + panic(fmt.Sprintf("symbolic link has non-empty target %q that is both relative and has no path components?", target)) + } + // Consume the path component that represented the symlink. + rp.Advance() + // Prepend the symlink target to the relative path. + rp.relpathPrepend(targetPath) + return nil +} + +func (rp *ResolvingPath) relpathPrepend(path fspath.Path) { + if rp.pit.Ok() { + rp.parts[rp.curPart] = rp.pit + rp.pit = path.Begin + rp.curPart++ + } else { + // The symlink was the final path component, so now the symlink target + // is the whole path. + rp.pit = path.Begin + // Symlink targets can set rp.mustBeDir (if they end in a trailing /), + // but can't unset it. + if path.Dir { + rp.mustBeDir = true + } + } +} + +func (rp *ResolvingPath) handleError(err error) bool { + switch err.(type) { + case resolveMountRootError: + // Switch to the new Mount. We hold references on the Mount and Dentry + // (from VFS.getMountpointAt()). + rp.decRefStartAndMount() + rp.mount = rp.nextMount + rp.start = rp.nextStart + rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef + rp.nextMount = nil + rp.nextStart = nil + // Commit the previous FileystemImpl's progress through the relative + // path. (Don't consume the path component that caused us to traverse + // through the mount root - i.e. the ".." - because we still need to + // resolve the mount point's parent in the new FilesystemImpl.) + rp.relpathCommit() + // Restart path resolution on the new Mount. Don't bother calling + // rp.releaseErrorState() since we already set nextMount and nextStart + // to nil above. + return true + + case resolveMountPointError: + // Switch to the new Mount. We hold a reference on the Mount (from + // VFS.getMountAt()), but borrow the reference on the mount root from + // the Mount. + rp.decRefStartAndMount() + rp.mount = rp.nextMount + rp.start = rp.nextMount.root + rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef + rp.nextMount = nil + // Consume the path component that represented the mount point. + rp.Advance() + // Commit the previous FilesystemImpl's progress through the relative + // path. + rp.relpathCommit() + // Restart path resolution on the new Mount. + rp.releaseErrorState() + return true + + case resolveAbsSymlinkError: + // Switch to the new Mount. References are borrowed from rp.root. + rp.decRefStartAndMount() + rp.mount = rp.root.mount + rp.start = rp.root.dentry + rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef + // Consume the path component that represented the symlink. + rp.Advance() + // Prepend the symlink target to the relative path. + rp.relpathPrepend(rp.absSymlinkTarget) + // Commit the previous FilesystemImpl's progress through the relative + // path, including the symlink target we just prepended. + rp.relpathCommit() + // Restart path resolution on the new Mount. + rp.releaseErrorState() + return true + + default: + // Not an error we can handle. + return false + } +} + +// MustBeDir returns true if the file traversed by rp must be a directory. +func (rp *ResolvingPath) MustBeDir() bool { + return rp.mustBeDir +} diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go new file mode 100755 index 000000000..23f2b9e08 --- /dev/null +++ b/pkg/sentry/vfs/syscalls.go @@ -0,0 +1,217 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// PathOperation specifies the path operated on by a VFS method. +// +// PathOperation is passed to VFS methods by pointer to reduce memory copying: +// it's somewhat large and should never escape. (Options structs are passed by +// pointer to VFS and FileDescription methods for the same reason.) +type PathOperation struct { + // Root is the VFS root. References on Root are borrowed from the provider + // of the PathOperation. + // + // Invariants: Root.Ok(). + Root VirtualDentry + + // Start is the starting point for the path traversal. References on Start + // are borrowed from the provider of the PathOperation (i.e. the caller of + // the VFS method to which the PathOperation was passed). + // + // Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root. + Start VirtualDentry + + // Path is the pathname traversed by this operation. + Pathname string + + // If FollowFinalSymlink is true, and the Dentry traversed by the final + // path component represents a symbolic link, the symbolic link should be + // followed. + FollowFinalSymlink bool +} + +// GetDentryAt returns a VirtualDentry representing the given path, at which a +// file must exist. A reference is taken on the returned VirtualDentry. +func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return VirtualDentry{}, err + } + for { + d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) + if err == nil { + vd := VirtualDentry{ + mount: rp.mount, + dentry: d, + } + rp.mount.incRef() + vfs.putResolvingPath(rp) + return vd, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return VirtualDentry{}, err + } + } +} + +// MkdirAt creates a directory at the given path. +func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { + // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is + // also honored." - mkdir(2) + opts.Mode &= 01777 + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// OpenAt returns a FileDescription providing access to the file at the given +// path. A reference is taken on the returned FileDescription. +func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { + // Remove: + // + // - O_LARGEFILE, which we always report in FileDescription status flags + // since only 64-bit architectures are supported at this time. + // + // - O_CLOEXEC, which affects file descriptors and therefore must be + // handled outside of VFS. + // + // - Unknown flags. + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE + // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. + if opts.Flags&linux.O_SYNC != 0 { + opts.Flags |= linux.O_DSYNC + } + // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified + // with O_DIRECTORY and a writable access mode (to ensure that it fails on + // filesystem implementations that do not support it). + if opts.Flags&linux.O_TMPFILE != 0 { + if opts.Flags&linux.O_DIRECTORY == 0 { + return nil, syserror.EINVAL + } + if opts.Flags&linux.O_CREAT != 0 { + return nil, syserror.EINVAL + } + if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { + return nil, syserror.EINVAL + } + } + // O_PATH causes most other flags to be ignored. + if opts.Flags&linux.O_PATH != 0 { + opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH + } + // "On Linux, the following bits are also honored in mode: [S_ISUID, + // S_ISGID, S_ISVTX]" - open(2) + opts.Mode &= 07777 + + if opts.Flags&linux.O_NOFOLLOW != 0 { + pop.FollowFinalSymlink = false + } + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return nil, err + } + if opts.Flags&linux.O_DIRECTORY != 0 { + rp.mustBeDir = true + rp.mustBeDirOrig = true + } + for { + fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return fd, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + +// StatAt returns metadata for the file at the given path. +func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return linux.Statx{}, err + } + for { + stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return stat, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return linux.Statx{}, err + } + } +} + +// StatusFlags returns file description status flags. +func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) { + flags, err := fd.impl.StatusFlags(ctx) + flags |= linux.O_LARGEFILE + return flags, err +} + +// SetStatusFlags sets file description status flags. +func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { + return fd.impl.SetStatusFlags(ctx, flags) +} + +// TODO: +// +// - VFS.SyncAllFilesystems() for sync(2) +// +// - Something for syncfs(2) +// +// - VFS.LinkAt() +// +// - VFS.MknodAt() +// +// - VFS.ReadlinkAt() +// +// - VFS.RenameAt() +// +// - VFS.RmdirAt() +// +// - VFS.SetStatAt() +// +// - VFS.StatFSAt() +// +// - VFS.SymlinkAt() +// +// - VFS.UnlinkAt() +// +// - FileDescription.(almost everything) diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go new file mode 100755 index 000000000..70b192ece --- /dev/null +++ b/pkg/sentry/vfs/testutil.go @@ -0,0 +1,139 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// FDTestFilesystemType is a test-only FilesystemType that produces Filesystems +// for which all FilesystemImpl methods taking a path return EPERM. It is used +// to produce Mounts and Dentries for testing of FileDescriptionImpls that do +// not depend on their originating Filesystem. +type FDTestFilesystemType struct{} + +// FDTestFilesystem is a test-only FilesystemImpl produced by +// FDTestFilesystemType. +type FDTestFilesystem struct { + vfsfs Filesystem +} + +// NewFilesystem implements FilesystemType.NewFilesystem. +func (fstype FDTestFilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) { + var fs FDTestFilesystem + fs.vfsfs.Init(&fs) + return &fs.vfsfs, fs.NewDentry(), nil +} + +// Release implements FilesystemImpl.Release. +func (fs *FDTestFilesystem) Release() { +} + +// Sync implements FilesystemImpl.Sync. +func (fs *FDTestFilesystem) Sync(ctx context.Context) error { + return nil +} + +// GetDentryAt implements FilesystemImpl.GetDentryAt. +func (fs *FDTestFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) { + return nil, syserror.EPERM +} + +// LinkAt implements FilesystemImpl.LinkAt. +func (fs *FDTestFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error { + return syserror.EPERM +} + +// MkdirAt implements FilesystemImpl.MkdirAt. +func (fs *FDTestFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error { + return syserror.EPERM +} + +// MknodAt implements FilesystemImpl.MknodAt. +func (fs *FDTestFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error { + return syserror.EPERM +} + +// OpenAt implements FilesystemImpl.OpenAt. +func (fs *FDTestFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) { + return nil, syserror.EPERM +} + +// ReadlinkAt implements FilesystemImpl.ReadlinkAt. +func (fs *FDTestFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) { + return "", syserror.EPERM +} + +// RenameAt implements FilesystemImpl.RenameAt. +func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error { + return syserror.EPERM +} + +// RmdirAt implements FilesystemImpl.RmdirAt. +func (fs *FDTestFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error { + return syserror.EPERM +} + +// SetStatAt implements FilesystemImpl.SetStatAt. +func (fs *FDTestFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error { + return syserror.EPERM +} + +// StatAt implements FilesystemImpl.StatAt. +func (fs *FDTestFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) { + return linux.Statx{}, syserror.EPERM +} + +// StatFSAt implements FilesystemImpl.StatFSAt. +func (fs *FDTestFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) { + return linux.Statfs{}, syserror.EPERM +} + +// SymlinkAt implements FilesystemImpl.SymlinkAt. +func (fs *FDTestFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error { + return syserror.EPERM +} + +// UnlinkAt implements FilesystemImpl.UnlinkAt. +func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error { + return syserror.EPERM +} + +type fdTestDentry struct { + vfsd Dentry +} + +// NewDentry returns a new Dentry. +func (fs *FDTestFilesystem) NewDentry() *Dentry { + var d fdTestDentry + d.vfsd.Init(&d) + return &d.vfsd +} + +// IncRef implements DentryImpl.IncRef. +func (d *fdTestDentry) IncRef(vfsfs *Filesystem) { +} + +// TryIncRef implements DentryImpl.TryIncRef. +func (d *fdTestDentry) TryIncRef(vfsfs *Filesystem) bool { + return true +} + +// DecRef implements DentryImpl.DecRef. +func (d *fdTestDentry) DecRef(vfsfs *Filesystem) { +} diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go new file mode 100755 index 000000000..4a8a69540 --- /dev/null +++ b/pkg/sentry/vfs/vfs.go @@ -0,0 +1,135 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package vfs implements a virtual filesystem layer. +// +// Lock order: +// +// Filesystem implementation locks +// VirtualFilesystem.mountMu +// VirtualFilesystem.fsTypesMu +package vfs + +import ( + "sync" +) + +// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts. +// +// There is no analogue to the VirtualFilesystem type in Linux, as the +// equivalent state in Linux is global. +type VirtualFilesystem struct { + // mountMu serializes mount mutations. + // + // mountMu is analogous to Linux's namespace_sem. + mountMu sync.RWMutex + + // mounts maps (mount parent, mount point) pairs to mounts. (Since mounts + // are uniquely namespaced, including mount parent in the key correctly + // handles both bind mounts and mount namespaces; Linux does the same.) + // Synchronization between mutators and readers is provided by mounts.seq; + // synchronization between mutators is provided by mountMu. + // + // mounts is used to follow mount points during path traversal. We use a + // single table rather than per-Dentry tables to reduce size (and therefore + // cache footprint) for the vast majority of Dentries that are not mount + // points. + // + // mounts is analogous to Linux's mount_hashtable. + mounts mountTable + + // mountpoints maps mount points to mounts at those points in all + // namespaces. mountpoints is protected by mountMu. + // + // mountpoints is used to find mounts that must be unmounted due to + // removal of a mount point Dentry from another mount namespace. ("A file + // or directory that is a mount point in one namespace that is not a mount + // point in another namespace, may be renamed, unlinked, or removed + // (rmdir(2)) in the mount namespace in which it is not a mount point + // (subject to the usual permission checks)." - mount_namespaces(7)) + // + // mountpoints is analogous to Linux's mountpoint_hashtable. + mountpoints map[*Dentry]map[*Mount]struct{} + + // fsTypes contains all FilesystemTypes that are usable in the + // VirtualFilesystem. fsTypes is protected by fsTypesMu. + fsTypesMu sync.RWMutex + fsTypes map[string]FilesystemType +} + +// New returns a new VirtualFilesystem with no mounts or FilesystemTypes. +func New() *VirtualFilesystem { + vfs := &VirtualFilesystem{ + mountpoints: make(map[*Dentry]map[*Mount]struct{}), + fsTypes: make(map[string]FilesystemType), + } + vfs.mounts.Init() + return vfs +} + +// A VirtualDentry represents a node in a VFS tree, by combining a Dentry +// (which represents a node in a Filesystem's tree) and a Mount (which +// represents the Filesystem's position in a VFS mount tree). +// +// VirtualDentry's semantics are similar to that of a Go interface object +// representing a pointer: it is a copyable value type that represents +// references to another entity. The zero value of VirtualDentry is an "empty +// VirtualDentry", directly analogous to a nil interface object. +// VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless +// otherwise specified, all other VirtualDentry methods require +// VirtualDentry.Ok() == true. +// +// Mounts and Dentries are reference-counted, requiring that users call +// VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to +// references on the Mount and Dentry referred to by a VirtualDentry as +// references on the VirtualDentry itself. Unless otherwise specified, all +// VirtualDentry methods require that a reference is held on the VirtualDentry. +// +// VirtualDentry is analogous to Linux's struct path. +type VirtualDentry struct { + mount *Mount + dentry *Dentry +} + +// Ok returns true if vd is not empty. It does not require that a reference is +// held. +func (vd VirtualDentry) Ok() bool { + return vd.mount != nil +} + +// IncRef increments the reference counts on the Mount and Dentry represented +// by vd. +func (vd VirtualDentry) IncRef() { + vd.mount.incRef() + vd.dentry.incRef(vd.mount.fs) +} + +// DecRef decrements the reference counts on the Mount and Dentry represented +// by vd. +func (vd VirtualDentry) DecRef() { + vd.dentry.decRef(vd.mount.fs) + vd.mount.decRef() +} + +// Mount returns the Mount associated with vd. It does not take a reference on +// the returned Mount. +func (vd VirtualDentry) Mount() *Mount { + return vd.mount +} + +// Dentry returns the Dentry associated with vd. It does not take a reference +// on the returned Dentry. +func (vd VirtualDentry) Dentry() *Dentry { + return vd.dentry +} diff --git a/pkg/sentry/vfs/vfs_state_autogen.go b/pkg/sentry/vfs/vfs_state_autogen.go new file mode 100755 index 000000000..0a4a7bf35 --- /dev/null +++ b/pkg/sentry/vfs/vfs_state_autogen.go @@ -0,0 +1,4 @@ +// automatically generated by stateify. + +package vfs + |