diff options
Diffstat (limited to 'pkg/sentry/vfs')
29 files changed, 8807 insertions, 0 deletions
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD new file mode 100644 index 000000000..642769e7c --- /dev/null +++ b/pkg/sentry/vfs/BUILD @@ -0,0 +1,100 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +licenses(["notice"]) + +go_template_instance( + name = "epoll_interest_list", + out = "epoll_interest_list.go", + package = "vfs", + prefix = "epollInterest", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*epollInterest", + "Linker": "*epollInterest", + }, +) + +go_template_instance( + name = "event_list", + out = "event_list.go", + package = "vfs", + prefix = "event", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Event", + "Linker": "*Event", + }, +) + +go_library( + name = "vfs", + srcs = [ + "anonfs.go", + "context.go", + "debug.go", + "dentry.go", + "device.go", + "epoll.go", + "epoll_interest_list.go", + "event_list.go", + "file_description.go", + "file_description_impl_util.go", + "filesystem.go", + "filesystem_impl_util.go", + "filesystem_type.go", + "inotify.go", + "lock.go", + "mount.go", + "mount_unsafe.go", + "options.go", + "pathname.go", + "permissions.go", + "resolving_path.go", + "vfs.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/context", + "//pkg/fd", + "//pkg/fdnotifier", + "//pkg/fspath", + "//pkg/gohacks", + "//pkg/log", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/fs", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/memmap", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/uniqueid", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "vfs_test", + size = "small", + srcs = [ + "file_description_impl_util_test.go", + "mount_test.go", + ], + library = ":vfs", + deps = [ + 
"//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/contexttest", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md new file mode 100644 index 000000000..4b9faf2ea --- /dev/null +++ b/pkg/sentry/vfs/README.md @@ -0,0 +1,195 @@ +# The gVisor Virtual Filesystem + +THIS PACKAGE IS CURRENTLY EXPERIMENTAL AND NOT READY OR ENABLED FOR PRODUCTION +USE. For the filesystem implementation currently used by gVisor, see the `fs` +package. + +## Implementation Notes + +### Reference Counting + +Filesystem, Dentry, Mount, MountNamespace, and FileDescription are all +reference-counted. Mount and MountNamespace are exclusively VFS-managed; when +their reference count reaches zero, VFS releases their resources. Filesystem and +FileDescription management is shared between VFS and filesystem implementations; +when their reference count reaches zero, VFS notifies the implementation by +calling `FilesystemImpl.Release()` or `FileDescriptionImpl.Release()` +respectively and then releases VFS-owned resources. Dentries are exclusively +managed by filesystem implementations; reference count changes are abstracted +through DentryImpl, which should release resources when reference count reaches +zero. + +Filesystem references are held by: + +- Mount: Each referenced Mount holds a reference on the mounted Filesystem. + +Dentry references are held by: + +- FileDescription: Each referenced FileDescription holds a reference on the + Dentry through which it was opened, via `FileDescription.vd.dentry`. + +- Mount: Each referenced Mount holds a reference on its mount point and on the + mounted filesystem root. The mount point is mutable (`mount(MS_MOVE)`). + +Mount references are held by: + +- FileDescription: Each referenced FileDescription holds a reference on the + Mount on which it was opened, via `FileDescription.vd.mount`. 
+ +- Mount: Each referenced Mount holds a reference on its parent, which is the + mount containing its mount point. + +- VirtualFilesystem: A reference is held on each Mount that has been connected + to a mount point, but not yet umounted. + +MountNamespace and FileDescription references are held by users of VFS. The +expectation is that each `kernel.Task` holds a reference on its corresponding +MountNamespace, and each file descriptor holds a reference on its represented +FileDescription. + +Notes: + +- Dentries do not hold a reference on their owning Filesystem. Instead, all + uses of a Dentry occur in the context of a Mount, which holds a reference on + the relevant Filesystem (see e.g. the VirtualDentry type). As a corollary, + when releasing references on both a Dentry and its corresponding Mount, the + Dentry's reference must be released first (because releasing the Mount's + reference may release the last reference on the Filesystem, whose state may + be required to release the Dentry reference). + +### The Inheritance Pattern + +Filesystem, Dentry, and FileDescription are all concepts featuring both state +that must be shared between VFS and filesystem implementations, and operations +that are implementation-defined. To facilitate this, each of these three +concepts follows the same pattern, shown below for Dentry: + +```go +// Dentry represents a node in a filesystem tree. +type Dentry struct { + // VFS-required dentry state. + parent *Dentry + // ... + + // impl is the DentryImpl associated with this Dentry. impl is immutable. + // This should be the last field in Dentry. + impl DentryImpl +} + +// Init must be called before first use of d. +func (d *Dentry) Init(impl DentryImpl) { + d.impl = impl +} + +// Impl returns the DentryImpl associated with d. +func (d *Dentry) Impl() DentryImpl { + return d.impl +} + +// DentryImpl contains implementation-specific details of a Dentry. 
+// Implementations of DentryImpl should contain their associated Dentry by +// value as their first field. +type DentryImpl interface { + // VFS-required implementation-defined dentry operations. + IncRef() + // ... +} +``` + +This construction, which is essentially a type-safe analogue to Linux's +`container_of` pattern, has the following properties: + +- VFS works almost exclusively with pointers to Dentry rather than DentryImpl + interface objects, such as in the type of `Dentry.parent`. This avoids + interface method calls (which are somewhat expensive to perform, and defeat + inlining and escape analysis), reduces the size of VFS types (since an + interface object is two pointers in size), and allows pointers to be loaded + and stored atomically using `sync/atomic`. Implementation-defined behavior + is accessed via `Dentry.impl` when required. + +- Filesystem implementations can access the implementation-defined state + associated with objects of VFS types by type-asserting or type-switching + (e.g. `Dentry.Impl().(*myDentry)`). Type assertions to a concrete type + require only an equality comparison of the interface object's type pointer + to a static constant, and are consequently very fast. + +- Filesystem implementations can access the VFS state associated with objects + of implementation-defined types directly. + +- VFS and implementation-defined state for a given type occupy the same + object, minimizing memory allocations and maximizing memory locality. `impl` + is the last field in `Dentry`, and `Dentry` is the first field in + `DentryImpl` implementations, for similar reasons: this tends to cause + fetching of the `Dentry.impl` interface object to also fetch `DentryImpl` + fields, either because they are in the same cache line or via next-line + prefetching. + +## Future Work + +- Most `mount(2)` features, and unmounting, are incomplete. + +- VFS1 filesystems are not directly compatible with VFS2. 
It may be possible + to implement shims that implement `vfs.FilesystemImpl` for + `fs.MountNamespace`, `vfs.DentryImpl` for `fs.Dirent`, and + `vfs.FileDescriptionImpl` for `fs.File`, which may be adequate for + filesystems that are not performance-critical (e.g. sysfs); however, it is + not clear that this will be less effort than simply porting the filesystems + in question. Practically speaking, the following filesystems will probably + need to be ported or made compatible through a shim to evaluate filesystem + performance on realistic workloads: + + - devfs/procfs/sysfs, which will realistically be necessary to execute + most applications. (Note that procfs and sysfs do not support hard + links, so they do not require the complexity of separate inode objects. + Also note that Linux's /dev is actually a variant of tmpfs called + devtmpfs.) + + - tmpfs. This should be relatively straightforward: copy/paste memfs, + store regular file contents in pgalloc-allocated memory instead of + `[]byte`, and add support for file timestamps. (In fact, it probably + makes more sense to convert memfs to tmpfs and not keep the former.) + + - A remote filesystem, either lisafs (if it is ready by the time that + other benchmarking prerequisites are) or v9fs (aka 9P, aka gofers). + + - epoll files. + + Filesystems that will need to be ported before switching to VFS2, but can + probably be skipped for early testing: + + - overlayfs, which is needed for (at least) synthetic mount points. + + - Support for host ttys. + + - timerfd files. + + Filesystems that can probably be dropped: + + - ashmem, which is far too incomplete to use. + + - binder, which is similarly far too incomplete to use. + +- Save/restore. For instance, it is unclear if the current implementation of + the `state` package supports the inheritance pattern described above.
+ +- Many features that were previously implemented by VFS must now be + implemented by individual filesystems (though, in most cases, this should + consist of calls to hooks or libraries provided by `vfs` or other packages). + This includes, but is not necessarily limited to: + + - Block and character device special files + + - Inotify + + - File locking + + - `O_ASYNC` + +- Reference counts in the `vfs` package do not use the `refs` package since + `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference + count, resulting in considerable cache bloat. 24 bytes of this overhead is + for weak reference support, which has poor performance and will not be used + by VFS2. The remaining 40 bytes is to store a descriptive string and stack + trace for reference leak checking; we can support reference leak checking + without incurring this space overhead by including the applicable + information directly in finalizers for applicable types. diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go new file mode 100644 index 000000000..641e3e502 --- /dev/null +++ b/pkg/sentry/vfs/anonfs.go @@ -0,0 +1,314 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +package vfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// NewAnonVirtualDentry returns a VirtualDentry with the given synthetic name, +// consistent with Linux's fs/anon_inodes.c:anon_inode_getfile(). References +// are taken on the returned VirtualDentry. +func (vfs *VirtualFilesystem) NewAnonVirtualDentry(name string) VirtualDentry { + d := anonDentry{ + name: name, + } + d.vfsd.Init(&d) + vfs.anonMount.IncRef() + // anonDentry no-ops refcounting, so only the mount reference is taken. + return VirtualDentry{ + mount: vfs.anonMount, + dentry: &d.vfsd, + } +} + +const ( + anonfsBlockSize = usermem.PageSize // via fs/libfs.c:pseudo_fs_fill_super() + + // Mode, UID, and GID for a generic anonfs file. + anonFileMode = 0600 // no type is correct + anonFileUID = auth.RootKUID + anonFileGID = auth.RootKGID +) + +// anonFilesystemType implements FilesystemType. +type anonFilesystemType struct{} + +// GetFilesystem implements FilesystemType.GetFilesystem. It always panics: an anon filesystem cannot be instantiated through its FilesystemType. +func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *auth.Credentials, string, GetFilesystemOptions) (*Filesystem, *Dentry, error) { + panic("cannot instantiate an anon filesystem") +} + +// Name implements FilesystemType.Name. +func (anonFilesystemType) Name() string { + return "none" +} + +// anonFilesystem is the implementation of FilesystemImpl that backs +// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry(). +// +// Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl +// methods that would require an anonDentry to be a directory return ENOTDIR. +type anonFilesystem struct { + vfsfs Filesystem + + devMinor uint32 +} + +type anonDentry struct { + vfsd Dentry + + name string +} + +// Release implements FilesystemImpl.Release.
+func (fs *anonFilesystem) Release() { +} + +// Sync implements FilesystemImpl.Sync. It is a no-op that always returns nil. +func (fs *anonFilesystem) Sync(ctx context.Context) error { + return nil +} + +// AccessAt implements FilesystemImpl.AccessAt. +func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error { + if !rp.Done() { + return syserror.ENOTDIR + } + return GenericCheckPermissions(creds, ats, anonFileMode, anonFileUID, anonFileGID) +} + +// GetDentryAt implements FilesystemImpl.GetDentryAt. +func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) { + if !rp.Done() { + return nil, syserror.ENOTDIR + } + if opts.CheckSearchable { + return nil, syserror.ENOTDIR + } + // anonDentry no-ops refcounting, so no IncRef is needed on the result. + return rp.Start(), nil +} + +// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt. +func (fs *anonFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) { + if !rp.Final() { + return nil, syserror.ENOTDIR + } + // anonDentry no-ops refcounting, so no IncRef is needed on the result. + return rp.Start(), nil +} + +// LinkAt implements FilesystemImpl.LinkAt. +func (fs *anonFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error { + if !rp.Final() { + return syserror.ENOTDIR + } + return syserror.EPERM +} + +// MkdirAt implements FilesystemImpl.MkdirAt. +func (fs *anonFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error { + if !rp.Final() { + return syserror.ENOTDIR + } + return syserror.EPERM +} + +// MknodAt implements FilesystemImpl.MknodAt. +func (fs *anonFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error { + if !rp.Final() { + return syserror.ENOTDIR + } + return syserror.EPERM +} + +// OpenAt implements FilesystemImpl.OpenAt.
+func (fs *anonFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) { + if !rp.Done() { + return nil, syserror.ENOTDIR + } + return nil, syserror.ENODEV +} + +// ReadlinkAt implements FilesystemImpl.ReadlinkAt. It always fails: ENOTDIR if !rp.Done(), otherwise EINVAL. +func (fs *anonFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) { + if !rp.Done() { + return "", syserror.ENOTDIR + } + return "", syserror.EINVAL +} + +// RenameAt implements FilesystemImpl.RenameAt. +func (fs *anonFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error { + if !rp.Final() { + return syserror.ENOTDIR + } + return syserror.EPERM +} + +// RmdirAt implements FilesystemImpl.RmdirAt. +func (fs *anonFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error { + if !rp.Final() { + return syserror.ENOTDIR + } + return syserror.EPERM +} + +// SetStatAt implements FilesystemImpl.SetStatAt. +func (fs *anonFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error { + if !rp.Done() { + return syserror.ENOTDIR + } + // Linux actually permits anon_inode_inode's metadata to be set, which is + // visible to all users of anon_inode_inode. We just silently ignore + // metadata changes. + return nil +} + +// StatAt implements FilesystemImpl.StatAt. The reported metadata is fixed, except for DevMinor which comes from fs.devMinor. +func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) { + if !rp.Done() { + return linux.Statx{}, syserror.ENOTDIR + } + // See fs/anon_inodes.c:anon_inode_init() => fs/libfs.c:alloc_anon_inode().
+ return linux.Statx{ + Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, + Blksize: anonfsBlockSize, + Nlink: 1, + UID: uint32(anonFileUID), + GID: uint32(anonFileGID), + Mode: anonFileMode, + Ino: 1, + Size: 0, + Blocks: 0, + DevMajor: linux.UNNAMED_MAJOR, + DevMinor: fs.devMinor, + }, nil +} + +// StatFSAt implements FilesystemImpl.StatFSAt. +func (fs *anonFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) { + if !rp.Done() { + return linux.Statfs{}, syserror.ENOTDIR + } + return linux.Statfs{ + Type: linux.ANON_INODE_FS_MAGIC, + BlockSize: anonfsBlockSize, + }, nil +} + +// SymlinkAt implements FilesystemImpl.SymlinkAt. +func (fs *anonFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error { + if !rp.Final() { + return syserror.ENOTDIR + } + return syserror.EPERM +} + +// UnlinkAt implements FilesystemImpl.UnlinkAt. +func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error { + if !rp.Final() { + return syserror.ENOTDIR + } + return syserror.EPERM +} + +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) { + if !rp.Final() { + return nil, syserror.ENOTDIR + } + if err := GenericCheckPermissions(rp.Credentials(), MayWrite, anonFileMode, anonFileUID, anonFileGID); err != nil { + return nil, err + } + return nil, syserror.ECONNREFUSED +} + +// ListxattrAt implements FilesystemImpl.ListxattrAt. It reports no extended attributes (nil list, nil error) when rp.Done(). +func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) { + if !rp.Done() { + return nil, syserror.ENOTDIR + } + return nil, nil +} + +// GetxattrAt implements FilesystemImpl.GetxattrAt.
+func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) { + if !rp.Done() { + return "", syserror.ENOTDIR + } + return "", syserror.ENOTSUP +} + +// SetxattrAt implements FilesystemImpl.SetxattrAt. +func (fs *anonFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error { + if !rp.Done() { + return syserror.ENOTDIR + } + return syserror.EPERM +} + +// RemovexattrAt implements FilesystemImpl.RemovexattrAt. +func (fs *anonFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error { + if !rp.Done() { + return syserror.ENOTDIR + } + return syserror.EPERM +} + +// PrependPath implements FilesystemImpl.PrependPath. It prepends "anon_inode:<name>" and always returns PrependPathSyntheticError. +func (fs *anonFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error { + b.PrependComponent(fmt.Sprintf("anon_inode:%s", vd.dentry.impl.(*anonDentry).name)) + return PrependPathSyntheticError{} +} + +// IncRef implements DentryImpl.IncRef. +func (d *anonDentry) IncRef() { + // no-op +} + +// TryIncRef implements DentryImpl.TryIncRef. It always reports success, since anonDentry does not count references. +func (d *anonDentry) TryIncRef() bool { + return true +} + +// DecRef implements DentryImpl.DecRef. +func (d *anonDentry) DecRef() { + // no-op +} + +// InotifyWithParent implements DentryImpl.InotifyWithParent. +// +// Although Linux technically supports inotify on pseudo filesystems (inotify +// is implemented at the vfs layer), it is not particularly useful. It is left +// unimplemented until someone actually needs it. +func (d *anonDentry) InotifyWithParent(events, cookie uint32, et EventType) {} + +// Watches implements DentryImpl.Watches. +func (d *anonDentry) Watches() *Watches { + return nil +} + +// OnZeroWatches implements DentryImpl.OnZeroWatches.
+func (d *anonDentry) OnZeroWatches() {} diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go new file mode 100644 index 000000000..c9e724fef --- /dev/null +++ b/pkg/sentry/vfs/context.go @@ -0,0 +1,75 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/context" +) + +// contextID is this package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxMountNamespace is a Context.Value key for a MountNamespace. + CtxMountNamespace contextID = iota + + // CtxRoot is a Context.Value key for a VFS root. + CtxRoot +) + +// MountNamespaceFromContext returns the MountNamespace used by ctx. If ctx is +// not associated with a MountNamespace, MountNamespaceFromContext returns nil. +// +// A reference is taken on the returned MountNamespace. NOTE(review): this function does not call IncRef itself, so the reference must be taken by ctx.Value(CtxMountNamespace) -- confirm against Context implementations. +func MountNamespaceFromContext(ctx context.Context) *MountNamespace { + if v := ctx.Value(CtxMountNamespace); v != nil { + return v.(*MountNamespace) + } + return nil +} + +// RootFromContext returns the VFS root used by ctx. It takes a reference on +// the returned VirtualDentry. If ctx does not have a specific VFS root, +// RootFromContext returns a zero-value VirtualDentry.
+func RootFromContext(ctx context.Context) VirtualDentry { + if v := ctx.Value(CtxRoot); v != nil { + return v.(VirtualDentry) + } + return VirtualDentry{} +} + +type rootContext struct { + context.Context + root VirtualDentry +} + +// WithRoot returns a copy of ctx with the given root. WithRoot itself does not take a reference on root; one is taken each time the root is fetched through Value(CtxRoot). +func WithRoot(ctx context.Context, root VirtualDentry) context.Context { + return &rootContext{ + Context: ctx, + root: root, + } +} + +// Value implements Context.Value. For CtxRoot it takes a reference on the stored root before returning it; all other keys are forwarded to the wrapped Context. +func (rc rootContext) Value(key interface{}) interface{} { + switch key { + case CtxRoot: + rc.root.IncRef() + return rc.root + default: + return rc.Context.Value(key) + } +} diff --git a/pkg/sentry/vfs/debug.go b/pkg/sentry/vfs/debug.go new file mode 100644 index 000000000..0ed20f249 --- /dev/null +++ b/pkg/sentry/vfs/debug.go @@ -0,0 +1,22 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +const ( + // If checkInvariants is true, perform runtime checks for invariants + // expected by the vfs package. This is normally disabled since VFS is + // often a hot path. + checkInvariants = false +) diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go new file mode 100644 index 000000000..cea3e6955 --- /dev/null +++ b/pkg/sentry/vfs/dentry.go @@ -0,0 +1,324 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Dentry represents a node in a Filesystem tree at which a file exists. +// +// Dentries are reference-counted. Unless otherwise specified, all Dentry +// methods require that a reference is held. +// +// Dentry is loosely analogous to Linux's struct dentry, but: +// +// - VFS does not associate Dentries with inodes. gVisor interacts primarily +// with filesystems that are accessed through filesystem APIs (as opposed to +// raw block devices); many such APIs support only paths and file descriptors, +// and not inodes. Furthermore, when parties outside the scope of VFS can +// rename inodes on such filesystems, VFS generally cannot "follow" the rename, +// both due to synchronization issues and because it may not even be able to +// name the destination path; this implies that it would in fact be incorrect +// for Dentries to be associated with inodes on such filesystems. Consequently, +// operations that are inode operations in Linux are FilesystemImpl methods +// and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do +// support inodes may store appropriate state in implementations of DentryImpl. +// +// - VFS does not require that Dentries are instantiated for all paths accessed +// through VFS, only those that are tracked beyond the scope of a single +// Filesystem operation. This includes file descriptions, mount points, mount +// roots, process working directories, and chroots.
This avoids instantiation +// of Dentries for operations on mutable remote filesystems that can't actually +// cache any state in the Dentry. +// +// - VFS does not track filesystem structure (i.e. relationships between +// Dentries), since both the relevant state and synchronization are +// filesystem-specific. +// +// - For the reasons above, VFS is not directly responsible for managing Dentry +// lifetime. Dentry reference counts only indicate the extent to which VFS +// requires Dentries to exist; Filesystems may elect to cache or discard +// Dentries with zero references. +// +// +stateify savable +type Dentry struct { + // mu synchronizes deletion/invalidation and mounting over this Dentry. + mu sync.Mutex `state:"nosave"` + + // dead is true if the file represented by this Dentry has been deleted (by + // CommitDeleteDentry or CommitRenameReplaceDentry) or invalidated (by + // InvalidateDentry). dead is protected by mu. + dead bool + + // mounts is the number of Mounts for which this Dentry is Mount.point. + // mounts is accessed using atomic memory operations. + mounts uint32 + + // impl is the DentryImpl associated with this Dentry. impl is immutable. + // This should be the last field in Dentry. + impl DentryImpl +} + +// Init must be called before first use of d. It records impl as the DentryImpl associated with d. +func (d *Dentry) Init(impl DentryImpl) { + d.impl = impl +} + +// Impl returns the DentryImpl associated with d. +func (d *Dentry) Impl() DentryImpl { + return d.impl +} + +// DentryImpl contains implementation details for a Dentry. Implementations of +// DentryImpl should contain their associated Dentry by value as their first +// field. +type DentryImpl interface { + // IncRef increments the Dentry's reference count. A Dentry with a non-zero + // reference count must remain coherent with the state of the filesystem. + IncRef() + + // TryIncRef increments the Dentry's reference count and returns true. If + // the Dentry's reference count is zero, TryIncRef may do nothing and + // return false.
(It is also permitted to succeed if it can restore the + // guarantee that the Dentry is coherent with the state of the filesystem.) + // + // TryIncRef does not require that a reference is held on the Dentry. + TryIncRef() bool + + // DecRef decrements the Dentry's reference count. + DecRef() + + // InotifyWithParent notifies all watches on the targets represented by this + // dentry and its parent. The parent's watches are notified first, followed + // by this dentry's. + // + // InotifyWithParent automatically adds the IN_ISDIR flag for dentries + // representing directories. + // + // Note that the events may not actually propagate up to the user, depending + // on the event masks. + InotifyWithParent(events, cookie uint32, et EventType) + + // Watches returns the set of inotify watches for the file corresponding to + // the Dentry. Dentries that are hard links to the same underlying file + // share the same watches. + // + // Watches may return nil if the dentry belongs to a FilesystemImpl that + // does not support inotify. If an implementation returns a non-nil watch + // set, it must always return a non-nil watch set. Likewise, if an + // implementation returns a nil watch set, it must always return a nil watch + // set. + // + // The caller does not need to hold a reference on the dentry. + Watches() *Watches + + // OnZeroWatches is called whenever the number of watches on a dentry drops + // to zero. This is needed by some FilesystemImpls (e.g. gofer) to manage + // dentry lifetime. + // + // The caller does not need to hold a reference on the dentry. OnZeroWatches + // may acquire inotify locks, so to prevent deadlock, no inotify locks should + // be held by the caller. + OnZeroWatches() +} + +// IncRef increments d's reference count by delegating to d's DentryImpl. +func (d *Dentry) IncRef() { + d.impl.IncRef() +} + +// TryIncRef increments d's reference count and returns true. If d's reference +// count is zero, TryIncRef may instead do nothing and return false.
+func (d *Dentry) TryIncRef() bool { + return d.impl.TryIncRef() +} + +// DecRef decrements d's reference count. +func (d *Dentry) DecRef() { + d.impl.DecRef() +} + +// IsDead returns true if d has been deleted or invalidated by its owning +// filesystem. +func (d *Dentry) IsDead() bool { + d.mu.Lock() + defer d.mu.Unlock() + return d.dead +} + +func (d *Dentry) isMounted() bool { + return atomic.LoadUint32(&d.mounts) != 0 +} + +// InotifyWithParent notifies all watches on the targets represented by d and +// its parent of events. +func (d *Dentry) InotifyWithParent(events, cookie uint32, et EventType) { + d.impl.InotifyWithParent(events, cookie, et) +} + +// Watches returns the set of inotify watches associated with d. +// +// Watches will return nil if d belongs to a FilesystemImpl that does not +// support inotify. +func (d *Dentry) Watches() *Watches { + return d.impl.Watches() +} + +// OnZeroWatches performs cleanup tasks whenever the number of watches on a +// dentry drops to zero. +func (d *Dentry) OnZeroWatches() { + d.impl.OnZeroWatches() +} + +// The following functions are exported so that filesystem implementations can +// use them. The vfs package, and users of VFS, should not call these +// functions. + +// PrepareDeleteDentry must be called before attempting to delete the file +// represented by d. If PrepareDeleteDentry succeeds, the caller must call +// AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. It fails with EBUSY if mntns.mountpoints[d] is non-zero. +func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error { + vfs.mountMu.Lock() + if mntns.mountpoints[d] != 0 { + vfs.mountMu.Unlock() + return syserror.EBUSY + } + d.mu.Lock() + vfs.mountMu.Unlock() + // Return with d.mu locked to block attempts to mount over it; it will be + // unlocked by AbortDeleteDentry or CommitDeleteDentry. + return nil +} + +// AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion +// fails.
+func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { + d.mu.Unlock() +} + +// CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion +// succeeds. +func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { + d.dead = true + d.mu.Unlock() + if d.isMounted() { + vfs.forgetDeadMountpoint(d) + } +} + +// InvalidateDentry is called when d ceases to represent the file it formerly +// did for reasons outside of VFS' control (e.g. d represents the local state +// of a file on a remote filesystem on which the file has already been +// deleted). +func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) { + d.mu.Lock() + d.dead = true + d.mu.Unlock() + if d.isMounted() { + vfs.forgetDeadMountpoint(d) + } +} + +// PrepareRenameDentry must be called before attempting to rename the file +// represented by from. If to is not nil, it represents the file that will be +// replaced or exchanged by the rename. If PrepareRenameDentry succeeds, the +// caller must call AbortRenameDentry, CommitRenameReplaceDentry, or +// CommitRenameExchangeDentry depending on the rename's outcome. +// +// Preconditions: If to is not nil, it must be a child Dentry from the same +// Filesystem. from != to. +func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { + vfs.mountMu.Lock() + if mntns.mountpoints[from] != 0 { + vfs.mountMu.Unlock() + return syserror.EBUSY + } + if to != nil { + if mntns.mountpoints[to] != 0 { + vfs.mountMu.Unlock() + return syserror.EBUSY + } + to.mu.Lock() + } + from.mu.Lock() + vfs.mountMu.Unlock() + // Return with from.mu and to.mu locked, which will be unlocked by + // AbortRenameDentry, CommitRenameReplaceDentry, or + // CommitRenameExchangeDentry. + return nil +} + +// AbortRenameDentry must be called after PrepareRenameDentry if the rename +// fails. 
+func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { + from.mu.Unlock() + if to != nil { + to.mu.Unlock() + } +} + +// CommitRenameReplaceDentry must be called after the file represented by from +// is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file +// that was replaced by from. +// +// Preconditions: PrepareRenameDentry was previously called on from and to. +func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, to *Dentry) { + from.mu.Unlock() + if to != nil { + to.dead = true + to.mu.Unlock() + if to.isMounted() { + vfs.forgetDeadMountpoint(to) + } + } +} + +// CommitRenameExchangeDentry must be called after the files represented by +// from and to are exchanged by rename(RENAME_EXCHANGE). +// +// Preconditions: PrepareRenameDentry was previously called on from and to. +func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { + from.mu.Unlock() + to.mu.Unlock() +} + +// forgetDeadMountpoint is called when a mount point is deleted or invalidated +// to umount all mounts using it in all other mount namespaces. +// +// forgetDeadMountpoint is analogous to Linux's +// fs/namespace.c:__detach_mounts(). +func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) { + var ( + vdsToDecRef []VirtualDentry + mountsToDecRef []*Mount + ) + vfs.mountMu.Lock() + vfs.mounts.seq.BeginWrite() + for mnt := range vfs.mountpoints[d] { + vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(mnt, &umountRecursiveOptions{}, vdsToDecRef, mountsToDecRef) + } + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + for _, vd := range vdsToDecRef { + vd.DecRef() + } + for _, mnt := range mountsToDecRef { + mnt.DecRef() + } +} diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go new file mode 100644 index 000000000..1e9dffc8f --- /dev/null +++ b/pkg/sentry/vfs/device.go @@ -0,0 +1,132 @@ +// Copyright 2019 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/syserror" +) + +// DeviceKind indicates whether a device is a block or character device. +type DeviceKind uint32 + +const ( + // BlockDevice indicates a block device. + BlockDevice DeviceKind = iota + + // CharDevice indicates a character device. + CharDevice +) + +// String implements fmt.Stringer.String. +func (kind DeviceKind) String() string { + switch kind { + case BlockDevice: + return "block" + case CharDevice: + return "character" + default: + return fmt.Sprintf("invalid device kind %d", kind) + } +} + +type devTuple struct { + kind DeviceKind + major uint32 + minor uint32 +} + +// A Device backs device special files. +type Device interface { + // Open returns a FileDescription representing this device. + Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error) +} + +// +stateify savable +type registeredDevice struct { + dev Device + opts RegisterDeviceOptions +} + +// RegisterDeviceOptions contains options to +// VirtualFilesystem.RegisterDevice(). +// +// +stateify savable +type RegisterDeviceOptions struct { + // GroupName is the name shown for this device registration in + // /proc/devices. If GroupName is empty, this registration will not be + // shown in /proc/devices. 
+ GroupName string +} + +// RegisterDevice registers the given Device in vfs with the given major and +// minor device numbers. +func (vfs *VirtualFilesystem) RegisterDevice(kind DeviceKind, major, minor uint32, dev Device, opts *RegisterDeviceOptions) error { + tup := devTuple{kind, major, minor} + vfs.devicesMu.Lock() + defer vfs.devicesMu.Unlock() + if existing, ok := vfs.devices[tup]; ok { + return fmt.Errorf("%s device number (%d, %d) is already registered to device type %T", kind, major, minor, existing.dev) + } + vfs.devices[tup] = &registeredDevice{ + dev: dev, + opts: *opts, + } + return nil +} + +// OpenDeviceSpecialFile returns a FileDescription representing the given +// device. +func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mount, d *Dentry, kind DeviceKind, major, minor uint32, opts *OpenOptions) (*FileDescription, error) { + tup := devTuple{kind, major, minor} + vfs.devicesMu.RLock() + defer vfs.devicesMu.RUnlock() + rd, ok := vfs.devices[tup] + if !ok { + return nil, syserror.ENXIO + } + return rd.dev.Open(ctx, mnt, d, *opts) +} + +// GetAnonBlockDevMinor allocates and returns an unused minor device number for +// an "anonymous" block device with major number UNNAMED_MAJOR. +func (vfs *VirtualFilesystem) GetAnonBlockDevMinor() (uint32, error) { + vfs.anonBlockDevMinorMu.Lock() + defer vfs.anonBlockDevMinorMu.Unlock() + minor := vfs.anonBlockDevMinorNext + const maxDevMinor = (1 << 20) - 1 + for minor < maxDevMinor { + if _, ok := vfs.anonBlockDevMinor[minor]; !ok { + vfs.anonBlockDevMinor[minor] = struct{}{} + vfs.anonBlockDevMinorNext = minor + 1 + return minor, nil + } + minor++ + } + return 0, syserror.EMFILE +} + +// PutAnonBlockDevMinor deallocates a minor device number returned by a +// previous call to GetAnonBlockDevMinor. 
+func (vfs *VirtualFilesystem) PutAnonBlockDevMinor(minor uint32) { + vfs.anonBlockDevMinorMu.Lock() + defer vfs.anonBlockDevMinorMu.Unlock() + delete(vfs.anonBlockDevMinor, minor) + if minor < vfs.anonBlockDevMinorNext { + vfs.anonBlockDevMinorNext = minor + } +} diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go new file mode 100644 index 000000000..599c3131c --- /dev/null +++ b/pkg/sentry/vfs/epoll.go @@ -0,0 +1,383 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// epollCycleMu serializes attempts to register EpollInstances with other +// EpollInstances in order to check for cycles. +var epollCycleMu sync.Mutex + +// EpollInstance represents an epoll instance, as described by epoll(7). +type EpollInstance struct { + vfsfd FileDescription + FileDescriptionDefaultImpl + DentryMetadataFileDescriptionImpl + NoLockFD + + // q holds waiters on this EpollInstance. + q waiter.Queue + + // interest is the set of file descriptors that are registered with the + // EpollInstance for monitoring. interest is protected by interestMu. + interestMu sync.Mutex + interest map[epollInterestKey]*epollInterest + + // mu protects fields in registered epollInterests. 
+ mu sync.Mutex + + // ready is the set of file descriptors that may be "ready" for I/O. Note + // that this must be an ordered list, not a map: "If more than maxevents + // file descriptors are ready when epoll_wait() is called, then successive + // epoll_wait() calls will round robin through the set of ready file + // descriptors. This behavior helps avoid starvation scenarios, where a + // process fails to notice that additional file descriptors are ready + // because it focuses on a set of file descriptors that are already known + // to be ready." - epoll_wait(2) + ready epollInterestList +} + +type epollInterestKey struct { + // file is the registered FileDescription. No reference is held on file; + // instead, when the last reference is dropped, FileDescription.DecRef() + // removes the FileDescription from all EpollInstances. file is immutable. + file *FileDescription + + // num is the file descriptor number with which this entry was registered. + // num is immutable. + num int32 +} + +// epollInterest represents an EpollInstance's interest in a file descriptor. +type epollInterest struct { + // epoll is the owning EpollInstance. epoll is immutable. + epoll *EpollInstance + + // key is the file to which this epollInterest applies. key is immutable. + key epollInterestKey + + // waiter is registered with key.file. waiter is protected by epoll.mu. + waiter waiter.Entry + + // mask is the event mask associated with this registration, including + // flags EPOLLET and EPOLLONESHOT. mask is protected by epoll.mu. + mask uint32 + + // ready is true if epollInterestEntry is linked into epoll.ready. ready + // and epollInterestEntry are protected by epoll.mu. + ready bool + epollInterestEntry + + // userData is the struct epoll_event::data associated with this + // epollInterest. userData is protected by epoll.mu. + userData [2]int32 +} + +// NewEpollInstanceFD returns a FileDescription representing a new epoll +// instance. 
A reference is taken on the returned FileDescription. +func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) { + vd := vfs.NewAnonVirtualDentry("[eventpoll]") + defer vd.DecRef() + ep := &EpollInstance{ + interest: make(map[epollInterestKey]*epollInterest), + } + if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &ep.vfsfd, nil +} + +// Release implements FileDescriptionImpl.Release. +func (ep *EpollInstance) Release() { + // Unregister all polled fds. + ep.interestMu.Lock() + defer ep.interestMu.Unlock() + for key, epi := range ep.interest { + file := key.file + file.epollMu.Lock() + delete(file.epolls, epi) + file.epollMu.Unlock() + file.EventUnregister(&epi.waiter) + } + ep.interest = nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask { + if mask&waiter.EventIn == 0 { + return 0 + } + ep.mu.Lock() + for epi := ep.ready.Front(); epi != nil; epi = epi.Next() { + wmask := waiter.EventMaskFromLinux(epi.mask) + if epi.key.file.Readiness(wmask)&wmask != 0 { + ep.mu.Unlock() + return waiter.EventIn + } + } + ep.mu.Unlock() + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (ep *EpollInstance) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + ep.q.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (ep *EpollInstance) EventUnregister(e *waiter.Entry) { + ep.q.EventUnregister(e) +} + +// Seek implements FileDescriptionImpl.Seek. +func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + // Linux: fs/eventpoll.c:eventpoll_fops.llseek == noop_llseek + return 0, nil +} + +// AddInterest implements the semantics of EPOLL_CTL_ADD. +// +// Preconditions: A reference must be held on file. 
+func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error { + // Check for cyclic polling if necessary. + subep, _ := file.impl.(*EpollInstance) + if subep != nil { + epollCycleMu.Lock() + // epollCycleMu must be locked for the rest of AddInterest to ensure + // that cyclic polling is not introduced after the check. + defer epollCycleMu.Unlock() + if subep.mightPoll(ep) { + return syserror.ELOOP + } + } + + ep.interestMu.Lock() + defer ep.interestMu.Unlock() + + // Fail if the key is already registered. + key := epollInterestKey{ + file: file, + num: num, + } + if _, ok := ep.interest[key]; ok { + return syserror.EEXIST + } + + // Register interest in file. + mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP + epi := &epollInterest{ + epoll: ep, + key: key, + mask: mask, + userData: event.Data, + } + epi.waiter.Callback = epi + ep.interest[key] = epi + wmask := waiter.EventMaskFromLinux(mask) + file.EventRegister(&epi.waiter, wmask) + + // Check if the file is already ready. + if file.Readiness(wmask)&wmask != 0 { + epi.Callback(nil) + } + + // Add epi to file.epolls so that it is removed when the last + // FileDescription reference is dropped. 
+ file.epollMu.Lock() + if file.epolls == nil { + file.epolls = make(map[*epollInterest]struct{}) + } + file.epolls[epi] = struct{}{} + file.epollMu.Unlock() + + return nil +} + +func (ep *EpollInstance) mightPoll(ep2 *EpollInstance) bool { + return ep.mightPollRecursive(ep2, 4) // Linux: fs/eventpoll.c:EP_MAX_NESTS +} + +func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursion int) bool { + ep.interestMu.Lock() + defer ep.interestMu.Unlock() + for key := range ep.interest { + nextep, ok := key.file.impl.(*EpollInstance) + if !ok { + continue + } + if nextep == ep2 { + return true + } + if remainingRecursion == 0 { + return true + } + if nextep.mightPollRecursive(ep2, remainingRecursion-1) { + return true + } + } + return false +} + +// ModifyInterest implements the semantics of EPOLL_CTL_MOD. +// +// Preconditions: A reference must be held on file. +func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error { + ep.interestMu.Lock() + defer ep.interestMu.Unlock() + + // Fail if the key is not already registered. + epi, ok := ep.interest[epollInterestKey{ + file: file, + num: num, + }] + if !ok { + return syserror.ENOENT + } + + // Update epi for the next call to ep.ReadEvents(). + mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP + ep.mu.Lock() + epi.mask = mask + epi.userData = event.Data + ep.mu.Unlock() + + // Re-register with the new mask. + file.EventUnregister(&epi.waiter) + wmask := waiter.EventMaskFromLinux(mask) + file.EventRegister(&epi.waiter, wmask) + + // Check if the file is already ready with the new mask. + if file.Readiness(wmask)&wmask != 0 { + epi.Callback(nil) + } + + return nil +} + +// DeleteInterest implements the semantics of EPOLL_CTL_DEL. +// +// Preconditions: A reference must be held on file. 
+func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error { + ep.interestMu.Lock() + defer ep.interestMu.Unlock() + + // Fail if the key is not already registered. + epi, ok := ep.interest[epollInterestKey{ + file: file, + num: num, + }] + if !ok { + return syserror.ENOENT + } + + // Unregister from the file so that epi will no longer be readied. + file.EventUnregister(&epi.waiter) + + // Forget about epi. + ep.removeLocked(epi) + + file.epollMu.Lock() + delete(file.epolls, epi) + file.epollMu.Unlock() + + return nil +} + +// Callback implements waiter.EntryCallback.Callback. +func (epi *epollInterest) Callback(*waiter.Entry) { + newReady := false + epi.epoll.mu.Lock() + if !epi.ready { + newReady = true + epi.ready = true + epi.epoll.ready.PushBack(epi) + } + epi.epoll.mu.Unlock() + if newReady { + epi.epoll.q.Notify(waiter.EventIn) + } +} + +// Preconditions: ep.interestMu must be locked. +func (ep *EpollInstance) removeLocked(epi *epollInterest) { + delete(ep.interest, epi.key) + ep.mu.Lock() + if epi.ready { + epi.ready = false + ep.ready.Remove(epi) + } + ep.mu.Unlock() +} + +// ReadEvents reads up to len(events) ready events into events and returns the +// number of events read. +// +// Preconditions: len(events) != 0. +func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int { + i := 0 + // Hot path: avoid defer. + ep.mu.Lock() + var next *epollInterest + var requeue epollInterestList + for epi := ep.ready.Front(); epi != nil; epi = next { + next = epi.Next() + // Regardless of what else happens, epi is initially removed from the + // ready list. + ep.ready.Remove(epi) + wmask := waiter.EventMaskFromLinux(epi.mask) + ievents := epi.key.file.Readiness(wmask) & wmask + if ievents == 0 { + // Leave epi off the ready list. + epi.ready = false + continue + } + // Determine what we should do with epi. 
+ switch { + case epi.mask&linux.EPOLLONESHOT != 0: + // Clear all events from the mask; they must be re-added by + // EPOLL_CTL_MOD. + epi.mask &= linux.EP_PRIVATE_BITS + fallthrough + case epi.mask&linux.EPOLLET != 0: + // Leave epi off the ready list. + epi.ready = false + default: + // Queue epi to be moved to the end of the ready list. + requeue.PushBack(epi) + } + // Report ievents. + events[i] = linux.EpollEvent{ + Events: ievents.ToLinux(), + Data: epi.userData, + } + i++ + if i == len(events) { + break + } + } + ep.ready.PushBackList(&requeue) + ep.mu.Unlock() + return i +} diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go new file mode 100644 index 000000000..0c42574db --- /dev/null +++ b/pkg/sentry/vfs/file_description.go @@ -0,0 +1,837 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// A FileDescription represents an open file description, which is the entity +// referred to by a file descriptor (POSIX.1-2017 3.258 "Open File +// Description"). 
+// +// FileDescriptions are reference-counted. Unless otherwise specified, all +// FileDescription methods require that a reference is held. +// +// FileDescription is analogous to Linux's struct file. +type FileDescription struct { + // refs is the reference count. refs is accessed using atomic memory + // operations. + refs int64 + + // flagsMu protects statusFlags and asyncHandler below. + flagsMu sync.Mutex + + // statusFlags contains status flags, "initialized by open(2) and possibly + // modified by fcntl()" - fcntl(2). statusFlags can be read using atomic + // memory operations when it does not need to be synchronized with an + // access to asyncHandler. + statusFlags uint32 + + // asyncHandler handles O_ASYNC signal generation. It is set with the + // F_SETOWN or F_SETOWN_EX fcntls. For asyncHandler to be used, O_ASYNC must + // also be set by fcntl(2). + asyncHandler FileAsync + + // epolls is the set of epollInterests registered for this FileDescription. + // epolls is protected by epollMu. + epollMu sync.Mutex + epolls map[*epollInterest]struct{} + + // vd is the filesystem location at which this FileDescription was opened. + // A reference is held on vd. vd is immutable. + vd VirtualDentry + + // opts contains options passed to FileDescription.Init(). opts is + // immutable. + opts FileDescriptionOptions + + // readable is MayReadFileWithOpenFlags(statusFlags). readable is + // immutable. + // + // readable is analogous to Linux's FMODE_READ. + readable bool + + // writable is MayWriteFileWithOpenFlags(statusFlags). If writable is true, + // the FileDescription holds a write count on vd.mount. writable is + // immutable. + // + // writable is analogous to Linux's FMODE_WRITE. + writable bool + + usedLockBSD uint32 + + // impl is the FileDescriptionImpl associated with this FileDescription. impl is + // immutable. This should be the last field in FileDescription. 
+ impl FileDescriptionImpl +} + +// FileDescriptionOptions contains options to FileDescription.Init(). +type FileDescriptionOptions struct { + // If AllowDirectIO is true, allow O_DIRECT to be set on the file. + AllowDirectIO bool + + // If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE. + DenyPRead bool + + // If DenyPWrite is true, calls to FileDescription.PWrite() return + // ESPIPE. + DenyPWrite bool + + // If UseDentryMetadata is true, calls to FileDescription methods that + // interact with file and filesystem metadata (Stat, SetStat, StatFS, + // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling + // the corresponding FilesystemImpl methods instead of the corresponding + // FileDescriptionImpl methods. + // + // UseDentryMetadata is intended for file descriptions that are implemented + // outside of individual filesystems, such as pipes, sockets, and device + // special files. FileDescriptions for which UseDentryMetadata is true may + // embed DentryMetadataFileDescriptionImpl to obtain appropriate + // implementations of FileDescriptionImpl methods that should not be + // called. + UseDentryMetadata bool +} + +// FileCreationFlags are the set of flags passed to FileDescription.Init() but +// omitted from FileDescription.StatusFlags(). +const FileCreationFlags = linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC + +// Init must be called before first use of fd. If it succeeds, it takes +// references on mnt and d. flags is the initial file description flags, which +// is usually the full set of flags passed to open(2). +func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error { + writable := MayWriteFileWithOpenFlags(flags) + if writable { + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + } + + fd.refs = 1 + + // Remove "file creation flags" to mirror the behavior from file.f_flags in + // fs/open.c:do_dentry_open. 
+ fd.statusFlags = flags &^ FileCreationFlags + fd.vd = VirtualDentry{ + mount: mnt, + dentry: d, + } + mnt.IncRef() + d.IncRef() + fd.opts = *opts + fd.readable = MayReadFileWithOpenFlags(flags) + fd.writable = writable + fd.impl = impl + return nil +} + +// IncRef increments fd's reference count. +func (fd *FileDescription) IncRef() { + atomic.AddInt64(&fd.refs, 1) +} + +// TryIncRef increments fd's reference count and returns true. If fd's +// reference count is already zero, TryIncRef does nothing and returns false. +// +// TryIncRef does not require that a reference is held on fd. +func (fd *FileDescription) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&fd.refs) + if refs <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) { + return true + } + } +} + +// DecRef decrements fd's reference count. +func (fd *FileDescription) DecRef() { + if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { + // Unregister fd from all epoll instances. + fd.epollMu.Lock() + epolls := fd.epolls + fd.epolls = nil + fd.epollMu.Unlock() + for epi := range epolls { + ep := epi.epoll + ep.interestMu.Lock() + // Check that epi has not been concurrently unregistered by + // EpollInstance.DeleteInterest() or EpollInstance.Release(). + if _, ok := ep.interest[epi.key]; ok { + fd.EventUnregister(&epi.waiter) + ep.removeLocked(epi) + } + ep.interestMu.Unlock() + } + + // If BSD locks were used, release any lock that it may have acquired. + if atomic.LoadUint32(&fd.usedLockBSD) != 0 { + fd.impl.UnlockBSD(context.Background(), fd) + } + + // Release implementation resources. + fd.impl.Release() + if fd.writable { + fd.vd.mount.EndWrite() + } + fd.vd.DecRef() + fd.flagsMu.Lock() + // TODO(gvisor.dev/issue/1663): We may need to unregister during save, as we do in VFS1. 
+ if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil { + fd.asyncHandler.Unregister(fd) + } + fd.asyncHandler = nil + fd.flagsMu.Unlock() + } else if refs < 0 { + panic("FileDescription.DecRef() called without holding a reference") + } +} + +// Refs returns the current number of references. The returned count +// is inherently racy and is unsafe to use without external synchronization. +func (fd *FileDescription) Refs() int64 { + return atomic.LoadInt64(&fd.refs) +} + +// Mount returns the mount on which fd was opened. It does not take a reference +// on the returned Mount. +func (fd *FileDescription) Mount() *Mount { + return fd.vd.mount +} + +// Dentry returns the dentry at which fd was opened. It does not take a +// reference on the returned Dentry. +func (fd *FileDescription) Dentry() *Dentry { + return fd.vd.dentry +} + +// VirtualDentry returns the location at which fd was opened. It does not take +// a reference on the returned VirtualDentry. +func (fd *FileDescription) VirtualDentry() VirtualDentry { + return fd.vd +} + +// Options returns the options passed to fd.Init(). +func (fd *FileDescription) Options() FileDescriptionOptions { + return fd.opts +} + +// StatusFlags returns file description status flags, as for fcntl(F_GETFL). +func (fd *FileDescription) StatusFlags() uint32 { + return atomic.LoadUint32(&fd.statusFlags) +} + +// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL). +func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error { + // Compare Linux's fs/fcntl.c:setfl(). + oldFlags := fd.StatusFlags() + // Linux documents this check as "O_APPEND cannot be cleared if the file is + // marked as append-only and the file is open for write", which would make + // sense. However, the check as actually implemented seems to be "O_APPEND + // cannot be changed if the file is marked as append-only". 
+ if (flags^oldFlags)&linux.O_APPEND != 0 { + stat, err := fd.Stat(ctx, StatOptions{ + // There is no mask bit for stx_attributes. + Mask: 0, + // Linux just reads inode::i_flags directly. + Sync: linux.AT_STATX_DONT_SYNC, + }) + if err != nil { + return err + } + if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) { + return syserror.EPERM + } + } + if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) { + stat, err := fd.Stat(ctx, StatOptions{ + Mask: linux.STATX_UID, + // Linux's inode_owner_or_capable() just reads inode::i_uid + // directly. + Sync: linux.AT_STATX_DONT_SYNC, + }) + if err != nil { + return err + } + if stat.Mask&linux.STATX_UID == 0 { + return syserror.EPERM + } + if !CanActAsOwner(creds, auth.KUID(stat.UID)) { + return syserror.EPERM + } + } + if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO { + return syserror.EINVAL + } + // TODO(jamieliu): FileDescriptionImpl.SetOAsync()? + const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK + fd.flagsMu.Lock() + if fd.asyncHandler != nil { + // Use fd.statusFlags instead of oldFlags, which may have become outdated, + // to avoid double registering/unregistering. + if fd.statusFlags&linux.O_ASYNC == 0 && flags&linux.O_ASYNC != 0 { + fd.asyncHandler.Register(fd) + } else if fd.statusFlags&linux.O_ASYNC != 0 && flags&linux.O_ASYNC == 0 { + fd.asyncHandler.Unregister(fd) + } + } + fd.statusFlags = (oldFlags &^ settableFlags) | (flags & settableFlags) + fd.flagsMu.Unlock() + return nil +} + +// IsReadable returns true if fd was opened for reading. +func (fd *FileDescription) IsReadable() bool { + return fd.readable +} + +// IsWritable returns true if fd was opened for writing. +func (fd *FileDescription) IsWritable() bool { + return fd.writable +} + +// Impl returns the FileDescriptionImpl associated with fd. 
+func (fd *FileDescription) Impl() FileDescriptionImpl { + return fd.impl +} + +// FileDescriptionImpl contains implementation details for a FileDescription. +// Implementations of FileDescriptionImpl should contain their associated +// FileDescription by value as their first field. +// +// For all functions that return linux.Statx, Statx.Uid and Statx.Gid will +// be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and +// auth.KGID respectively). +// +// All methods may return errors not specified. +// +// FileDescriptionImpl is analogous to Linux's struct file_operations. +type FileDescriptionImpl interface { + // Release is called when the associated FileDescription reaches zero + // references. + Release() + + // OnClose is called when a file descriptor representing the + // FileDescription is closed. Note that returning a non-nil error does not + // prevent the file descriptor from being closed. + OnClose(ctx context.Context) error + + // Stat returns metadata for the file represented by the FileDescription. + Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) + + // SetStat updates metadata for the file represented by the + // FileDescription. Implementations are responsible for checking if the + // operation can be performed (see vfs.CheckSetStat() for common checks). + SetStat(ctx context.Context, opts SetStatOptions) error + + // StatFS returns metadata for the filesystem containing the file + // represented by the FileDescription. + StatFS(ctx context.Context) (linux.Statfs, error) + + // Allocate grows the file represented by the FileDescription to offset + length bytes. + // Only mode == 0 is supported currently. + Allocate(ctx context.Context, mode, offset, length uint64) error + + // waiter.Waitable methods may be used to poll for I/O events. + waiter.Waitable + + // PRead reads from the file into dst, starting at the given offset, and + // returns the number of bytes read. 
PRead is permitted to return partial + // reads with a nil error. + // + // Errors: + // + // - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for reading. + // FileDescriptionOptions.DenyPRead == false. + PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) + + // Read is similar to PRead, but does not specify an offset. + // + // For files with an implicit FileDescription offset (e.g. regular files), + // Read begins at the FileDescription offset, and advances the offset by + // the number of bytes read; note that POSIX 2.9.7 "Thread Interactions + // with Regular File Operations" requires that all operations that may + // mutate the FileDescription offset are serialized. + // + // Errors: + // + // - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for reading. + Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) + + // PWrite writes src to the file, starting at the given offset, and returns + // the number of bytes written. PWrite is permitted to return partial + // writes with a nil error. + // + // As in Linux (but not POSIX), if O_APPEND is in effect for the + // FileDescription, PWrite should ignore the offset and append data to the + // end of the file. + // + // Errors: + // + // - If opts.Flags specifies unsupported options, PWrite returns + // EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for writing. + // FileDescriptionOptions.DenyPWrite == false. + PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) + + // Write is similar to PWrite, but does not specify an offset, which is + // implied as for Read. 
+ // + // Write is a FileDescriptionImpl method, instead of a wrapper around + // PWrite that uses a FileDescription offset, to make it possible for + // remote filesystems to implement O_APPEND correctly (i.e. atomically with + // respect to writers outside the scope of VFS). + // + // Errors: + // + // - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for writing. + Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) + + // IterDirents invokes cb on each entry in the directory represented by the + // FileDescription. If IterDirents has been called since the last call to + // Seek, it continues iteration from the end of the last call. + IterDirents(ctx context.Context, cb IterDirentsCallback) error + + // Seek changes the FileDescription offset (assuming one exists) and + // returns its new value. + // + // For directories, if whence == SEEK_SET and offset == 0, the caller is + // rewinddir(), such that Seek "shall also cause the directory stream to + // refer to the current state of the corresponding directory" - + // POSIX.1-2017. + Seek(ctx context.Context, offset int64, whence int32) (int64, error) + + // Sync requests that cached state associated with the file represented by + // the FileDescription is synchronized with persistent storage, and blocks + // until this is complete. + Sync(ctx context.Context) error + + // ConfigureMMap mutates opts to implement mmap(2) for the file. Most + // implementations that support memory mapping can call + // GenericConfigureMMap with the appropriate memmap.Mappable. + ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error + + // Ioctl implements the ioctl(2) syscall. + Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) + + // Listxattr returns all extended attribute names for the file. 
+ Listxattr(ctx context.Context, size uint64) ([]string, error) + + // Getxattr returns the value associated with the given extended attribute + // for the file. + Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) + + // Setxattr changes the value associated with the given extended attribute + // for the file. + Setxattr(ctx context.Context, opts SetxattrOptions) error + + // Removexattr removes the given extended attribute from the file. + Removexattr(ctx context.Context, name string) error + + // LockBSD tries to acquire a BSD-style advisory file lock. + LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error + + // UnlockBSD releases a BSD-style advisory file lock. + UnlockBSD(ctx context.Context, uid lock.UniqueID) error + + // LockPOSIX tries to acquire a POSIX-style advisory file lock. + LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, length uint64, whence int16, block lock.Blocker) error + + // UnlockPOSIX releases a POSIX-style advisory file lock. + UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, length uint64, whence int16) error +} + +// Dirent holds the information contained in struct linux_dirent64. +type Dirent struct { + // Name is the filename. + Name string + + // Type is the file type, a linux.DT_* constant. + Type uint8 + + // Ino is the inode number. + Ino uint64 + + // NextOff is the offset of the *next* Dirent in the directory; that is, + // FileDescription.Seek(NextOff, SEEK_SET) (as called by seekdir(3)) will + // cause the next call to FileDescription.IterDirents() to yield the next + // Dirent. (The offset of the first Dirent in a directory is always 0.) + NextOff int64 +} + +// IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents. +type IterDirentsCallback interface { + // Handle handles the given iterated Dirent. 
If Handle returns a non-nil + // error, FileDescriptionImpl.IterDirents must stop iteration and return + // the error; the next call to FileDescriptionImpl.IterDirents should + // restart with the same Dirent. + Handle(dirent Dirent) error +} + +// IterDirentsCallbackFunc implements IterDirentsCallback for a function with +// the semantics of IterDirentsCallback.Handle. +type IterDirentsCallbackFunc func(dirent Dirent) error + +// Handle implements IterDirentsCallback.Handle. +func (f IterDirentsCallbackFunc) Handle(dirent Dirent) error { + return f(dirent) +} + +// OnClose is called when a file descriptor representing the FileDescription is +// closed. Returning a non-nil error should not prevent the file descriptor +// from being closed. +func (fd *FileDescription) OnClose(ctx context.Context) error { + return fd.impl.OnClose(ctx) +} + +// Stat returns metadata for the file represented by fd. +func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts) + vfsObj.putResolvingPath(rp) + return stat, err + } + return fd.impl.Stat(ctx, opts) +} + +// SetStat updates metadata for the file represented by fd. +func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts) + vfsObj.putResolvingPath(rp) + return err + } + return fd.impl.SetStat(ctx, opts) +} + +// StatFS returns metadata for the filesystem containing the file represented +// by fd. 
+func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp) + vfsObj.putResolvingPath(rp) + return statfs, err + } + return fd.impl.StatFS(ctx) +} + +// Readiness implements waiter.Waitable.Readiness. +// +// It returns fd's I/O readiness. +func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + return fd.impl.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +// +// It registers e for I/O readiness events in mask. +func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.impl.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +// +// It unregisters e for I/O readiness events. +func (fd *FileDescription) EventUnregister(e *waiter.Entry) { + fd.impl.EventUnregister(e) +} + +// PRead reads from the file represented by fd into dst, starting at the given +// offset, and returns the number of bytes read. PRead is permitted to return +// partial reads with a nil error. +func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + if fd.opts.DenyPRead { + return 0, syserror.ESPIPE + } + if !fd.readable { + return 0, syserror.EBADF + } + return fd.impl.PRead(ctx, dst, offset, opts) +} + +// Read is similar to PRead, but does not specify an offset. +func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + if !fd.readable { + return 0, syserror.EBADF + } + return fd.impl.Read(ctx, dst, opts) +} + +// PWrite writes src to the file represented by fd, starting at the given +// offset, and returns the number of bytes written. 
PWrite is permitted to +// return partial writes with a nil error. +func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + if fd.opts.DenyPWrite { + return 0, syserror.ESPIPE + } + if !fd.writable { + return 0, syserror.EBADF + } + return fd.impl.PWrite(ctx, src, offset, opts) +} + +// Write is similar to PWrite, but does not specify an offset. +func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + if !fd.writable { + return 0, syserror.EBADF + } + return fd.impl.Write(ctx, src, opts) +} + +// IterDirents invokes cb on each entry in the directory represented by fd. If +// IterDirents has been called since the last call to Seek, it continues +// iteration from the end of the last call. +func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error { + return fd.impl.IterDirents(ctx, cb) +} + +// Seek changes fd's offset (assuming one exists) and returns its new value. +func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return fd.impl.Seek(ctx, offset, whence) +} + +// Sync has the semantics of fsync(2). +func (fd *FileDescription) Sync(ctx context.Context) error { + return fd.impl.Sync(ctx) +} + +// ConfigureMMap mutates opts to implement mmap(2) for the file represented by +// fd. +func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return fd.impl.ConfigureMMap(ctx, opts) +} + +// Ioctl implements the ioctl(2) syscall. +func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return fd.impl.Ioctl(ctx, uio, args) +} + +// Listxattr returns all extended attribute names for the file represented by +// fd. +// +// If the size of the list (including a NUL terminating byte after every entry) +// would exceed size, ERANGE may be returned. 
Note that implementations +// are free to ignore size entirely and return without error). In all cases, +// if size is 0, the list should be returned without error, regardless of size. +func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size) + vfsObj.putResolvingPath(rp) + return names, err + } + names, err := fd.impl.Listxattr(ctx, size) + if err == syserror.ENOTSUP { + // Linux doesn't actually return ENOTSUP in this case; instead, + // fs/xattr.c:vfs_listxattr() falls back to allowing the security + // subsystem to return security extended attributes, which by default + // don't exist. + return nil, nil + } + return names, err +} + +// Getxattr returns the value associated with the given extended attribute for +// the file represented by fd. +// +// If the size of the return value exceeds opts.Size, ERANGE may be returned +// (note that implementations are free to ignore opts.Size entirely and return +// without error). In all cases, if opts.Size is 0, the value should be +// returned without error, regardless of size. +func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts) + vfsObj.putResolvingPath(rp) + return val, err + } + return fd.impl.Getxattr(ctx, *opts) +} + +// Setxattr changes the value associated with the given extended attribute for +// the file represented by fd. 
+func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts) + vfsObj.putResolvingPath(rp) + return err + } + return fd.impl.Setxattr(ctx, *opts) +} + +// Removexattr removes the given extended attribute from the file represented +// by fd. +func (fd *FileDescription) Removexattr(ctx context.Context, name string) error { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name) + vfsObj.putResolvingPath(rp) + return err + } + return fd.impl.Removexattr(ctx, name) +} + +// SyncFS instructs the filesystem containing fd to execute the semantics of +// syncfs(2). +func (fd *FileDescription) SyncFS(ctx context.Context) error { + return fd.vd.mount.fs.impl.Sync(ctx) +} + +// MappedName implements memmap.MappingIdentity.MappedName. +func (fd *FileDescription) MappedName(ctx context.Context) string { + vfsroot := RootFromContext(ctx) + s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd) + if vfsroot.Ok() { + vfsroot.DecRef() + } + return s +} + +// DeviceID implements memmap.MappingIdentity.DeviceID. +func (fd *FileDescription) DeviceID() uint64 { + stat, err := fd.Stat(context.Background(), StatOptions{ + // There is no STATX_DEV; we assume that Stat will return it if it's + // available regardless of mask. + Mask: 0, + // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_sb->s_dev + // directly. + Sync: linux.AT_STATX_DONT_SYNC, + }) + if err != nil { + return 0 + } + return uint64(linux.MakeDeviceID(uint16(stat.DevMajor), stat.DevMinor)) +} + +// InodeID implements memmap.MappingIdentity.InodeID. 
+func (fd *FileDescription) InodeID() uint64 { + stat, err := fd.Stat(context.Background(), StatOptions{ + Mask: linux.STATX_INO, + // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly. + Sync: linux.AT_STATX_DONT_SYNC, + }) + if err != nil || stat.Mask&linux.STATX_INO == 0 { + return 0 + } + return stat.Ino +} + +// Msync implements memmap.MappingIdentity.Msync. +func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error { + return fd.Sync(ctx) +} + +// LockBSD tries to acquire a BSD-style advisory file lock. +func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, blocker lock.Blocker) error { + atomic.StoreUint32(&fd.usedLockBSD, 1) + return fd.impl.LockBSD(ctx, fd, lockType, blocker) +} + +// UnlockBSD releases a BSD-style advisory file lock. +func (fd *FileDescription) UnlockBSD(ctx context.Context) error { + return fd.impl.UnlockBSD(ctx, fd) +} + +// LockPOSIX locks a POSIX-style file range lock. +func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, end uint64, whence int16, block lock.Blocker) error { + return fd.impl.LockPOSIX(ctx, uid, t, start, end, whence, block) +} + +// UnlockPOSIX unlocks a POSIX-style file range lock. +func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, end uint64, whence int16) error { + return fd.impl.UnlockPOSIX(ctx, uid, start, end, whence) +} + +// A FileAsync sends signals to its owner when w is ready for IO. This is only +// implemented by pkg/sentry/fasync:FileAsync, but we unfortunately need this +// interface to avoid circular dependencies. +type FileAsync interface { + Register(w waiter.Waitable) + Unregister(w waiter.Waitable) +} + +// AsyncHandler returns the FileAsync for fd. 
+func (fd *FileDescription) AsyncHandler() FileAsync { + fd.flagsMu.Lock() + defer fd.flagsMu.Unlock() + return fd.asyncHandler +} + +// SetAsyncHandler sets fd.asyncHandler if it has not been set before and +// returns it. +func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsync { + fd.flagsMu.Lock() + defer fd.flagsMu.Unlock() + if fd.asyncHandler == nil { + fd.asyncHandler = newHandler() + if fd.statusFlags&linux.O_ASYNC != 0 { + fd.asyncHandler.Register(fd) + } + } + return fd.asyncHandler +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go new file mode 100644 index 000000000..6b8b4ad49 --- /dev/null +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -0,0 +1,428 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package vfs + +import ( + "bytes" + "io" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// The following design pattern is strongly recommended for filesystem +// implementations to adapt: +// - Have a local fileDescription struct (containing FileDescription) which +// embeds FileDescriptionDefaultImpl and overrides the default methods +// which are common to all fd implementations for that filesystem like +// StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc. +// - This should be embedded in all file description implementations as the +// first field by value. +// - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl. + +// FileDescriptionDefaultImpl may be embedded by implementations of +// FileDescriptionImpl to obtain implementations of many FileDescriptionImpl +// methods with default behavior analogous to Linux's. +type FileDescriptionDefaultImpl struct{} + +// OnClose implements FileDescriptionImpl.OnClose analogously to +// file_operations::flush == NULL in Linux. +func (FileDescriptionDefaultImpl) OnClose(ctx context.Context) error { + return nil +} + +// StatFS implements FileDescriptionImpl.StatFS analogously to +// super_operations::statfs == NULL in Linux. +func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, error) { + return linux.Statfs{}, syserror.ENOSYS +} + +// Allocate implements FileDescriptionImpl.Allocate analogously to +// fallocate called on regular file, directory or FIFO in Linux. 
+func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ENODEV +} + +// Readiness implements waiter.Waitable.Readiness analogously to +// file_operations::poll == NULL in Linux. +func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask { + // include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK + return waiter.EventIn | waiter.EventOut +} + +// EventRegister implements waiter.Waitable.EventRegister analogously to +// file_operations::poll == NULL in Linux. +func (FileDescriptionDefaultImpl) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +} + +// EventUnregister implements waiter.Waitable.EventUnregister analogously to +// file_operations::poll == NULL in Linux. +func (FileDescriptionDefaultImpl) EventUnregister(e *waiter.Entry) { +} + +// PRead implements FileDescriptionImpl.PRead analogously to +// file_operations::read == file_operations::read_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// Read implements FileDescriptionImpl.Read analogously to +// file_operations::read == file_operations::read_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// PWrite implements FileDescriptionImpl.PWrite analogously to +// file_operations::write == file_operations::write_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// Write implements FileDescriptionImpl.Write analogously to +// file_operations::write == file_operations::write_iter == NULL in Linux. 
+func (FileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// IterDirents implements FileDescriptionImpl.IterDirents analogously to +// file_operations::iterate == file_operations::iterate_shared == NULL in +// Linux. +func (FileDescriptionDefaultImpl) IterDirents(ctx context.Context, cb IterDirentsCallback) error { + return syserror.ENOTDIR +} + +// Seek implements FileDescriptionImpl.Seek analogously to +// file_operations::llseek == NULL in Linux. +func (FileDescriptionDefaultImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, syserror.ESPIPE +} + +// Sync implements FileDescriptionImpl.Sync analogously to +// file_operations::fsync == NULL in Linux. +func (FileDescriptionDefaultImpl) Sync(ctx context.Context) error { + return syserror.EINVAL +} + +// ConfigureMMap implements FileDescriptionImpl.ConfigureMMap analogously to +// file_operations::mmap == NULL in Linux. +func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return syserror.ENODEV +} + +// Ioctl implements FileDescriptionImpl.Ioctl analogously to +// file_operations::unlocked_ioctl == NULL in Linux. +func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return 0, syserror.ENOTTY +} + +// Listxattr implements FileDescriptionImpl.Listxattr analogously to +// inode_operations::listxattr == NULL in Linux. +func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) { + // This isn't exactly accurate; see FileDescription.Listxattr. + return nil, syserror.ENOTSUP +} + +// Getxattr implements FileDescriptionImpl.Getxattr analogously to +// inode::i_opflags & IOP_XATTR == 0 in Linux. 
+func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) { + return "", syserror.ENOTSUP +} + +// Setxattr implements FileDescriptionImpl.Setxattr analogously to +// inode::i_opflags & IOP_XATTR == 0 in Linux. +func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error { + return syserror.ENOTSUP +} + +// Removexattr implements FileDescriptionImpl.Removexattr analogously to +// inode::i_opflags & IOP_XATTR == 0 in Linux. +func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error { + return syserror.ENOTSUP +} + +// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of +// FileDescriptionImpl that always represent directories to obtain +// implementations of non-directory I/O methods that return EISDIR. +type DirectoryFileDescriptionDefaultImpl struct{} + +// Allocate implements DirectoryFileDescriptionDefaultImpl.Allocate. +func (DirectoryFileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.EISDIR +} + +// PRead implements FileDescriptionImpl.PRead. +func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Read implements FileDescriptionImpl.Read. +func (DirectoryFileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// PWrite implements FileDescriptionImpl.PWrite. +func (DirectoryFileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Write implements FileDescriptionImpl.Write. 
+func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// DentryMetadataFileDescriptionImpl may be embedded by implementations of +// FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is +// true to obtain implementations of Stat and SetStat that panic. +type DentryMetadataFileDescriptionImpl struct{} + +// Stat implements FileDescriptionImpl.Stat. +func (DentryMetadataFileDescriptionImpl) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { + panic("illegal call to DentryMetadataFileDescriptionImpl.Stat") +} + +// SetStat implements FileDescriptionImpl.SetStat. +func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetStatOptions) error { + panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat") +} + +// DynamicBytesSource represents a data source for a +// DynamicBytesFileDescriptionImpl. +type DynamicBytesSource interface { + // Generate writes the file's contents to buf. + Generate(ctx context.Context, buf *bytes.Buffer) error +} + +// StaticData implements DynamicBytesSource over a static string. +type StaticData struct { + Data string +} + +// Generate implements DynamicBytesSource. +func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(s.Data) + return nil +} + +// WritableDynamicBytesSource extends DynamicBytesSource to allow writes to the +// underlying source. +type WritableDynamicBytesSource interface { + DynamicBytesSource + + // Write sends writes to the source. + Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) +} + +// DynamicBytesFileDescriptionImpl may be embedded by implementations of +// FileDescriptionImpl that represent read-only regular files whose contents +// are backed by a bytes.Buffer that is regenerated when necessary, consistent +// with Linux's fs/seq_file.c:single_open(). 
+// +// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first +// use. +type DynamicBytesFileDescriptionImpl struct { + data DynamicBytesSource // immutable + mu sync.Mutex // protects the following fields + buf bytes.Buffer + off int64 + lastRead int64 // offset at which the last Read, PRead, or Seek ended +} + +// SetDataSource must be called exactly once on fd before first use. +func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) { + fd.data = data +} + +// Preconditions: fd.mu must be locked. +func (fd *DynamicBytesFileDescriptionImpl) preadLocked(ctx context.Context, dst usermem.IOSequence, offset int64, opts *ReadOptions) (int64, error) { + // Regenerate the buffer if it's empty, or before pread() at a new offset. + // Compare fs/seq_file.c:seq_read() => traverse(). + switch { + case offset != fd.lastRead: + fd.buf.Reset() + fallthrough + case fd.buf.Len() == 0: + if err := fd.data.Generate(ctx, &fd.buf); err != nil { + fd.buf.Reset() + // fd.off is not updated in this case. + fd.lastRead = 0 + return 0, err + } + } + bs := fd.buf.Bytes() + if offset >= int64(len(bs)) { + return 0, io.EOF + } + n, err := dst.CopyOut(ctx, bs[offset:]) + fd.lastRead = offset + int64(n) + return int64(n), err +} + +// PRead implements FileDescriptionImpl.PRead. +func (fd *DynamicBytesFileDescriptionImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.preadLocked(ctx, dst, offset, &opts) + fd.mu.Unlock() + return n, err +} + +// Read implements FileDescriptionImpl.Read. +func (fd *DynamicBytesFileDescriptionImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.preadLocked(ctx, dst, fd.off, &opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// Seek implements FileDescriptionImpl.Seek. 
+func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + switch whence { + case linux.SEEK_SET: + // Use offset as given. + case linux.SEEK_CUR: + offset += fd.off + default: + // fs/seq_file:seq_lseek() rejects SEEK_END etc. + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + if offset != fd.lastRead { + // Regenerate the file's contents immediately. Compare + // fs/seq_file.c:seq_lseek() => traverse(). + fd.buf.Reset() + if err := fd.data.Generate(ctx, &fd.buf); err != nil { + fd.buf.Reset() + fd.off = 0 + fd.lastRead = 0 + return 0, err + } + fd.lastRead = offset + } + fd.off = offset + return offset, nil +} + +// Preconditions: fd.mu must be locked. +func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { + return 0, syserror.EOPNOTSUPP + } + limit, err := CheckLimit(ctx, offset, src.NumBytes()) + if err != nil { + return 0, err + } + src = src.TakeFirst64(limit) + + writable, ok := fd.data.(WritableDynamicBytesSource) + if !ok { + return 0, syserror.EIO + } + n, err := writable.Write(ctx, src, offset) + if err != nil { + return 0, err + } + + // Invalidate cached data that might exist prior to this call. + fd.buf.Reset() + return n, nil +} + +// PWrite implements FileDescriptionImpl.PWrite. +func (fd *DynamicBytesFileDescriptionImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.pwriteLocked(ctx, src, offset, opts) + fd.mu.Unlock() + return n, err +} + +// Write implements FileDescriptionImpl.Write. 
+func (fd *DynamicBytesFileDescriptionImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.pwriteLocked(ctx, src, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// GenericConfigureMMap may be used by most implementations of +// FileDescriptionImpl.ConfigureMMap. +func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error { + opts.Mappable = m + opts.MappingIdentity = fd + fd.IncRef() + return nil +} + +// LockFD may be used by most implementations of FileDescriptionImpl.Lock* +// functions. Caller must call Init(). +type LockFD struct { + locks *FileLocks +} + +// Init initializes fd with FileLocks to use. +func (fd *LockFD) Init(locks *FileLocks) { + fd.locks = locks +} + +// Locks returns the locks associated with this file. +func (fd *LockFD) Locks() *FileLocks { + return fd.locks +} + +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + return fd.locks.LockBSD(uid, t, block) +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { + fd.locks.UnlockBSD(uid) + return nil +} + +// NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface +// returning ENOLCK. +type NoLockFD struct{} + +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + return syserror.ENOLCK +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { + return syserror.ENOLCK +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 
+func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return syserror.ENOLCK +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return syserror.ENOLCK +} diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go new file mode 100644 index 000000000..3b7e1c273 --- /dev/null +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -0,0 +1,224 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "bytes" + "fmt" + "io" + "sync/atomic" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// fileDescription is the common fd struct which a filesystem implementation +// embeds in all of its file description implementations as required. +type fileDescription struct { + vfsfd FileDescription + FileDescriptionDefaultImpl + NoLockFD +} + +// genCount contains the number of times its DynamicBytesSource.Generate() +// implementation has been called. 
+type genCount struct { + count uint64 // accessed using atomic memory ops +} + +// Generate implements DynamicBytesSource.Generate. +func (g *genCount) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%d", atomic.AddUint64(&g.count, 1)) + return nil +} + +type storeData struct { + data string +} + +var _ WritableDynamicBytesSource = (*storeData)(nil) + +// Generate implements DynamicBytesSource. +func (d *storeData) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(d.data) + return nil +} + +// Generate implements WritableDynamicBytesSource. +func (d *storeData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + buf := make([]byte, src.NumBytes()) + n, err := src.CopyIn(ctx, buf) + if err != nil { + return 0, err + } + + d.data = string(buf[:n]) + return 0, nil +} + +// testFD is a read-only FileDescriptionImpl representing a regular file. +type testFD struct { + fileDescription + DynamicBytesFileDescriptionImpl + + data DynamicBytesSource +} + +func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription { + vd := vfsObj.NewAnonVirtualDentry("genCountFD") + defer vd.DecRef() + var fd testFD + fd.vfsfd.Init(&fd, statusFlags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{}) + fd.DynamicBytesFileDescriptionImpl.SetDataSource(data) + return &fd.vfsfd +} + +// Release implements FileDescriptionImpl.Release. +func (fd *testFD) Release() { +} + +// SetStatusFlags implements FileDescriptionImpl.SetStatusFlags. +// Stat implements FileDescriptionImpl.Stat. +func (fd *testFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { + // Note that Statx.Mask == 0 in the return value. + return linux.Statx{}, nil +} + +// SetStat implements FileDescriptionImpl.SetStat. 
+func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error { + return syserror.EPERM +} + +func TestGenCountFD(t *testing.T) { + ctx := contexttest.Context(t) + + vfsObj := &VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{}) + defer fd.DecRef() + + // The first read causes Generate to be called to fill the FD's buffer. + buf := make([]byte, 2) + ioseq := usermem.BytesIOSequence(buf) + n, err := fd.Read(ctx, ioseq, ReadOptions{}) + if n != 1 || (err != nil && err != io.EOF) { + t.Fatalf("first Read: got (%d, %v), wanted (1, nil or EOF)", n, err) + } + if want := byte('1'); buf[0] != want { + t.Errorf("first Read: got byte %c, wanted %c", buf[0], want) + } + + // A second read without seeking is still at EOF. + n, err = fd.Read(ctx, ioseq, ReadOptions{}) + if n != 0 || err != io.EOF { + t.Fatalf("second Read: got (%d, %v), wanted (0, EOF)", n, err) + } + + // Seeking to the beginning of the file causes it to be regenerated. + n, err = fd.Seek(ctx, 0, linux.SEEK_SET) + if n != 0 || err != nil { + t.Fatalf("Seek: got (%d, %v), wanted (0, nil)", n, err) + } + n, err = fd.Read(ctx, ioseq, ReadOptions{}) + if n != 1 || (err != nil && err != io.EOF) { + t.Fatalf("Read after Seek: got (%d, %v), wanted (1, nil or EOF)", n, err) + } + if want := byte('2'); buf[0] != want { + t.Errorf("Read after Seek: got byte %c, wanted %c", buf[0], want) + } + + // PRead at the beginning of the file also causes it to be regenerated. + n, err = fd.PRead(ctx, ioseq, 0, ReadOptions{}) + if n != 1 || (err != nil && err != io.EOF) { + t.Fatalf("PRead: got (%d, %v), wanted (1, nil or EOF)", n, err) + } + if want := byte('3'); buf[0] != want { + t.Errorf("PRead: got byte %c, wanted %c", buf[0], want) + } + + // Write and PWrite fails. 
+ if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EIO { + t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO) + } + if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EIO { + t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO) + } +} + +func TestWritable(t *testing.T) { + ctx := contexttest.Context(t) + + vfsObj := &VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"}) + defer fd.DecRef() + + buf := make([]byte, 10) + ioseq := usermem.BytesIOSequence(buf) + if n, err := fd.Read(ctx, ioseq, ReadOptions{}); n != 4 && err != io.EOF { + t.Fatalf("Read: got (%v, %v), wanted (4, EOF)", n, err) + } + if want := "init"; want == string(buf) { + t.Fatalf("Read: got %v, wanted %v", string(buf), want) + } + + // Test PWrite. + want := "write" + writeIOSeq := usermem.BytesIOSequence([]byte(want)) + if n, err := fd.PWrite(ctx, writeIOSeq, 0, WriteOptions{}); int(n) != len(want) && err != nil { + t.Errorf("PWrite: got err (%v, %v), wanted (%v, nil)", n, err, len(want)) + } + if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF { + t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want)) + } + if want == string(buf) { + t.Fatalf("PRead: got %v, wanted %v", string(buf), want) + } + + // Test Seek to 0 followed by Write. 
+ want = "write2" + writeIOSeq = usermem.BytesIOSequence([]byte(want)) + if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 && err != nil { + t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err) + } + if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); int(n) != len(want) && err != nil { + t.Errorf("Write: got err (%v, %v), wanted (%v, nil)", n, err, len(want)) + } + if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF { + t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want)) + } + if want == string(buf) { + t.Fatalf("PRead: got %v, wanted %v", string(buf), want) + } + + // Test failure if offset != 0. + if n, err := fd.Seek(ctx, 1, linux.SEEK_SET); n != 0 && err != nil { + t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err) + } + if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); n != 0 && err != syserror.EINVAL { + t.Errorf("Write: got err (%v, %v), wanted (0, EINVAL)", n, err) + } + if n, err := fd.PWrite(ctx, writeIOSeq, 2, WriteOptions{}); n != 0 && err != syserror.EINVAL { + t.Errorf("PWrite: got err (%v, %v), wanted (0, EINVAL)", n, err) + } +} diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go new file mode 100644 index 000000000..6bb9ca180 --- /dev/null +++ b/pkg/sentry/vfs/filesystem.go @@ -0,0 +1,556 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package vfs

import (
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
)

// A Filesystem is a tree of nodes represented by Dentries, which forms part of
// a VirtualFilesystem.
//
// Filesystems are reference-counted. Unless otherwise specified, all
// Filesystem methods require that a reference is held.
//
// Filesystem is analogous to Linux's struct super_block.
//
// +stateify savable
type Filesystem struct {
	// refs is the reference count. refs is accessed using atomic memory
	// operations.
	refs int64

	// vfs is the VirtualFilesystem that uses this Filesystem. vfs is
	// immutable.
	vfs *VirtualFilesystem

	// fsType is the FilesystemType of this Filesystem.
	fsType FilesystemType

	// impl is the FilesystemImpl associated with this Filesystem. impl is
	// immutable. This should be the last field in Filesystem.
	impl FilesystemImpl
}

// Init must be called before first use of fs.
//
// It sets the reference count to 1, records vfsObj, fsType and impl, and
// adds fs to vfsObj's set of filesystems while holding
// vfsObj.filesystemsMu.
func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) {
	fs.refs = 1
	fs.vfs = vfsObj
	fs.fsType = fsType
	fs.impl = impl
	vfsObj.filesystemsMu.Lock()
	vfsObj.filesystems[fs] = struct{}{}
	vfsObj.filesystemsMu.Unlock()
}

// FilesystemType returns the FilesystemType for this Filesystem.
func (fs *Filesystem) FilesystemType() FilesystemType {
	return fs.fsType
}

// VirtualFilesystem returns the containing VirtualFilesystem.
func (fs *Filesystem) VirtualFilesystem() *VirtualFilesystem {
	return fs.vfs
}

// Impl returns the FilesystemImpl associated with fs.
func (fs *Filesystem) Impl() FilesystemImpl {
	return fs.impl
}

// IncRef increments fs' reference count.
+func (fs *Filesystem) IncRef() { + if atomic.AddInt64(&fs.refs, 1) <= 1 { + panic("Filesystem.IncRef() called without holding a reference") + } +} + +// TryIncRef increments fs' reference count and returns true. If fs' reference +// count is zero, TryIncRef does nothing and returns false. +// +// TryIncRef does not require that a reference is held on fs. +func (fs *Filesystem) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&fs.refs) + if refs <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) { + return true + } + } +} + +// DecRef decrements fs' reference count. +func (fs *Filesystem) DecRef() { + if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { + fs.vfs.filesystemsMu.Lock() + delete(fs.vfs.filesystems, fs) + fs.vfs.filesystemsMu.Unlock() + fs.impl.Release() + } else if refs < 0 { + panic("Filesystem.decRef() called without holding a reference") + } +} + +// FilesystemImpl contains implementation details for a Filesystem. +// Implementations of FilesystemImpl should contain their associated Filesystem +// by value as their first field. +// +// All methods that take a ResolvingPath must resolve the path before +// performing any other checks, including rejection of the operation if not +// supported by the FilesystemImpl. This is because the final FilesystemImpl +// (responsible for actually implementing the operation) isn't known until path +// resolution is complete. +// +// Unless otherwise specified, FilesystemImpl methods are responsible for +// performing permission checks. In many cases, vfs package functions in +// permissions.go may be used to help perform these checks. +// +// When multiple specified error conditions apply to a given method call, the +// implementation may return any applicable errno unless otherwise specified, +// but returning the earliest error specified is preferable to maximize +// compatibility with Linux. 
//
// All methods may return errors not specified, notably including:
//
// - ENOENT if a required path component does not exist.
//
// - ENOTDIR if an intermediate path component is not a directory.
//
// - Errors from vfs-package functions (ResolvingPath.Resolve*(),
// Mount.CheckBeginWrite(), permission-checking functions, etc.)
//
// For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid
// should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID
// and auth.KGID respectively).
//
// FilesystemImpl combines elements of Linux's struct super_operations and
// struct inode_operations, for reasons described in the documentation for
// Dentry.
type FilesystemImpl interface {
	// Release is called when the associated Filesystem reaches zero
	// references.
	Release()

	// Sync "causes all pending modifications to filesystem metadata and cached
	// file data to be written to the underlying [filesystem]", as by syncfs(2).
	Sync(ctx context.Context) error

	// AccessAt checks whether a user with creds can access the file at rp.
	AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error

	// GetDentryAt returns a Dentry representing the file at rp. A reference is
	// taken on the returned Dentry.
	//
	// GetDentryAt does not correspond directly to a Linux syscall; it is used
	// in the implementation of:
	//
	// - Syscalls that need to resolve two paths: link(), linkat().
	//
	// - Syscalls that need to refer to a filesystem position outside the
	// context of a file description: chdir(), fchdir(), chroot(), mount(),
	// umount().
	GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error)

	// GetParentDentryAt returns a Dentry representing the directory at the
	// second-to-last path component in rp. (Note that, despite the name, this
	// is not necessarily the parent directory of the file at rp, since the
	// last path component in rp may be "." or "..".) A reference is taken on
	// the returned Dentry.
	//
	// GetParentDentryAt does not correspond directly to a Linux syscall; it is
	// used in the implementation of the rename() family of syscalls, which
	// must resolve the parent directories of two paths.
	//
	// Preconditions: !rp.Done().
	//
	// Postconditions: If GetParentDentryAt returns a nil error, then
	// rp.Final(). If GetParentDentryAt returns an error returned by
	// ResolvingPath.Resolve*(), then !rp.Done().
	GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error)

	// LinkAt creates a hard link at rp representing the same file as vd. It
	// does not take ownership of references on vd.
	//
	// Errors:
	//
	// - If the last path component in rp is "." or "..", LinkAt returns
	// EEXIST.
	//
	// - If a file already exists at rp, LinkAt returns EEXIST.
	//
	// - If rp.MustBeDir(), LinkAt returns ENOENT.
	//
	// - If the directory in which the link would be created has been removed
	// by RmdirAt or RenameAt, LinkAt returns ENOENT.
	//
	// - If rp.Mount != vd.Mount(), LinkAt returns EXDEV.
	//
	// - If vd represents a directory, LinkAt returns EPERM.
	//
	// - If vd represents a file for which all existing links have been
	// removed, or a file created by open(O_TMPFILE|O_EXCL), LinkAt returns
	// ENOENT. Equivalently, if vd represents a file with a link count of 0 not
	// created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT.
	//
	// Preconditions: !rp.Done(). For the final path component in rp,
	// !rp.ShouldFollowSymlink().
	//
	// Postconditions: If LinkAt returns an error returned by
	// ResolvingPath.Resolve*(), then !rp.Done().
	LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error

	// MkdirAt creates a directory at rp.
	//
	// Errors:
	//
	// - If the last path component in rp is "." or "..", MkdirAt returns
	// EEXIST.
	//
	// - If a file already exists at rp, MkdirAt returns EEXIST.
	//
	// - If the directory in which the new directory would be created has been
	// removed by RmdirAt or RenameAt, MkdirAt returns ENOENT.
	//
	// Preconditions: !rp.Done(). For the final path component in rp,
	// !rp.ShouldFollowSymlink().
	//
	// Postconditions: If MkdirAt returns an error returned by
	// ResolvingPath.Resolve*(), then !rp.Done().
	MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error

	// MknodAt creates a regular file, device special file, or named pipe at
	// rp.
	//
	// Errors:
	//
	// - If the last path component in rp is "." or "..", MknodAt returns
	// EEXIST.
	//
	// - If a file already exists at rp, MknodAt returns EEXIST.
	//
	// - If rp.MustBeDir(), MknodAt returns ENOENT.
	//
	// - If the directory in which the file would be created has been removed
	// by RmdirAt or RenameAt, MknodAt returns ENOENT.
	//
	// Preconditions: !rp.Done(). For the final path component in rp,
	// !rp.ShouldFollowSymlink().
	//
	// Postconditions: If MknodAt returns an error returned by
	// ResolvingPath.Resolve*(), then !rp.Done().
	MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error

	// OpenAt returns a FileDescription providing access to the file at rp. A
	// reference is taken on the returned FileDescription.
	//
	// Errors:
	//
	// - If opts.Flags specifies O_TMPFILE and this feature is unsupported by
	// the implementation, OpenAt returns EOPNOTSUPP. (All other unsupported
	// features are silently ignored, consistently with Linux's open*(2).)
	OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error)

	// ReadlinkAt returns the target of the symbolic link at rp.
	//
	// Errors:
	//
	// - If the file at rp is not a symbolic link, ReadlinkAt returns EINVAL.
	ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error)

	// RenameAt renames the file named oldName in directory oldParentVD to rp.
	// It does not take ownership of references on oldParentVD.
	//
	// Errors [1]:
	//
	// - If opts.Flags specifies unsupported options, RenameAt returns EINVAL.
	//
	// - If the last path component in rp is "." or "..", and opts.Flags
	// contains RENAME_NOREPLACE, RenameAt returns EEXIST.
	//
	// - If the last path component in rp is "." or "..", and opts.Flags does
	// not contain RENAME_NOREPLACE, RenameAt returns EBUSY.
	//
	// - If rp.Mount != oldParentVD.Mount(), RenameAt returns EXDEV.
	//
	// - If the renamed file is not a directory, and opts.MustBeDir is true,
	// RenameAt returns ENOTDIR.
	//
	// - If renaming would replace an existing file and opts.Flags contains
	// RENAME_NOREPLACE, RenameAt returns EEXIST.
	//
	// - If there is no existing file at rp and opts.Flags contains
	// RENAME_EXCHANGE, RenameAt returns ENOENT.
	//
	// - If there is an existing non-directory file at rp, and rp.MustBeDir()
	// is true, RenameAt returns ENOTDIR.
	//
	// - If the renamed file is not a directory, opts.Flags does not contain
	// RENAME_EXCHANGE, and rp.MustBeDir() is true, RenameAt returns ENOTDIR.
	// (This check is not subsumed by the check for directory replacement below
	// since it applies even if there is no file to replace.)
	//
	// - If the renamed file is a directory, and the new parent directory of
	// the renamed file is either the renamed directory or a descendant
	// subdirectory of the renamed directory, RenameAt returns EINVAL.
	//
	// - If renaming would exchange the renamed file with an ancestor directory
	// of the renamed file, RenameAt returns EINVAL.
	//
	// - If renaming would replace an ancestor directory of the renamed file,
	// RenameAt returns ENOTEMPTY. (This check would be subsumed by the
	// non-empty directory check below; however, this check takes place before
	// the self-rename check.)
	//
	// - If the renamed file would replace or exchange with itself (i.e. the
	// source and destination paths resolve to the same file), RenameAt returns
	// nil, skipping the checks described below.
	//
	// - If the source or destination directory is not writable by the provider
	// of rp.Credentials(), RenameAt returns EACCES.
	//
	// - If the renamed file is a directory, and renaming would replace a
	// non-directory file, RenameAt returns ENOTDIR.
	//
	// - If the renamed file is not a directory, and renaming would replace a
	// directory, RenameAt returns EISDIR.
	//
	// - If the new parent directory of the renamed file has been removed by
	// RmdirAt or a preceding call to RenameAt, RenameAt returns ENOENT.
	//
	// - If the renamed file is a directory, it is not writable by the
	// provider of rp.Credentials(), and the source and destination parent
	// directories are different, RenameAt returns EACCES. (This is nominally
	// required to change the ".." entry in the renamed directory.)
	//
	// - If renaming would replace a non-empty directory, RenameAt returns
	// ENOTEMPTY.
	//
	// Preconditions: !rp.Done(). For the final path component in rp,
	// !rp.ShouldFollowSymlink(). oldParentVD.Dentry() was obtained from a
	// previous call to
	// oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). oldName is
	// not "." or "..".
	//
	// Postconditions: If RenameAt returns an error returned by
	// ResolvingPath.Resolve*(), then !rp.Done().
	//
	// [1] "The worst of all namespace operations - renaming directory.
	// "Perverted" doesn't even start to describe it. Somebody in UCB had a
	// heck of a trip..." - fs/namei.c:vfs_rename()
	RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error

	// RmdirAt removes the directory at rp.
	//
	// Errors:
	//
	// - If the last path component in rp is ".", RmdirAt returns EINVAL.
	//
	// - If the last path component in rp is "..", RmdirAt returns ENOTEMPTY.
	//
	// - If no file exists at rp, RmdirAt returns ENOENT.
	//
	// - If the file at rp exists but is not a directory, RmdirAt returns
	// ENOTDIR.
	//
	// Preconditions: !rp.Done(). For the final path component in rp,
	// !rp.ShouldFollowSymlink().
	//
	// Postconditions: If RmdirAt returns an error returned by
	// ResolvingPath.Resolve*(), then !rp.Done().
	RmdirAt(ctx context.Context, rp *ResolvingPath) error

	// SetStatAt updates metadata for the file at the given path. Implementations
	// are responsible for checking if the operation can be performed
	// (see vfs.CheckSetStat() for common checks).
	//
	// Errors:
	//
	// - If opts specifies unsupported options, SetStatAt returns EINVAL.
	SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error

	// StatAt returns metadata for the file at rp.
	StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error)

	// StatFSAt returns metadata for the filesystem containing the file at rp.
	// (This method takes a path because a FilesystemImpl may consist of any
	// number of constituent filesystems.)
	StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error)

	// SymlinkAt creates a symbolic link at rp referring to the given target.
	//
	// Errors:
	//
	// - If the last path component in rp is "." or "..", SymlinkAt returns
	// EEXIST.
	//
	// - If a file already exists at rp, SymlinkAt returns EEXIST.
	//
	// - If rp.MustBeDir(), SymlinkAt returns ENOENT.
	//
	// - If the directory in which the symbolic link would be created has been
	// removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT.
	//
	// Preconditions: !rp.Done(). For the final path component in rp,
	// !rp.ShouldFollowSymlink().
	//
	// Postconditions: If SymlinkAt returns an error returned by
	// ResolvingPath.Resolve*(), then !rp.Done().
	SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error

	// UnlinkAt removes the file at rp.
	//
	// Errors:
	//
	// - If the last path component in rp is "." or "..", UnlinkAt returns
	// EISDIR.
	//
	// - If no file exists at rp, UnlinkAt returns ENOENT.
	//
	// - If rp.MustBeDir(), and the file at rp exists and is not a directory,
	// UnlinkAt returns ENOTDIR.
	//
	// - If the file at rp exists but is a directory, UnlinkAt returns EISDIR.
	//
	// Preconditions: !rp.Done(). For the final path component in rp,
	// !rp.ShouldFollowSymlink().
	//
	// Postconditions: If UnlinkAt returns an error returned by
	// ResolvingPath.Resolve*(), then !rp.Done().
	UnlinkAt(ctx context.Context, rp *ResolvingPath) error

	// ListxattrAt returns all extended attribute names for the file at rp.
	//
	// Errors:
	//
	// - If extended attributes are not supported by the filesystem,
	// ListxattrAt returns ENOTSUP.
	//
	// - If the size of the list (including a NUL terminating byte after every
	// entry) would exceed size, ERANGE may be returned. (Note that
	// implementations are free to ignore size entirely and return without
	// error.) In all cases, if size is 0, the list should be returned without
	// error, regardless of size.
	ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)

	// GetxattrAt returns the value associated with the given extended
	// attribute for the file at rp.
	//
	// Errors:
	//
	// - If extended attributes are not supported by the filesystem, GetxattrAt
	// returns ENOTSUP.
	//
	// - If an extended attribute named opts.Name does not exist, ENODATA is
	// returned.
	//
	// - If the size of the return value exceeds opts.Size, ERANGE may be
	// returned (note that implementations are free to ignore opts.Size entirely
	// and return without error). In all cases, if opts.Size is 0, the value
	// should be returned without error, regardless of size.
	GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error)

	// SetxattrAt changes the value associated with the given extended
	// attribute for the file at rp.
	//
	// Errors:
	//
	// - If extended attributes are not supported by the filesystem, SetxattrAt
	// returns ENOTSUP.
	//
	// - If XATTR_CREATE is set in opts.Flag and opts.Name already exists,
	// EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist,
	// ENODATA is returned.
	SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error

	// RemovexattrAt removes the given extended attribute from the file at rp.
	//
	// Errors:
	//
	// - If extended attributes are not supported by the filesystem,
	// RemovexattrAt returns ENOTSUP.
	//
	// - If name does not exist, ENODATA is returned.
	RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error

	// BoundEndpointAt returns the Unix socket endpoint bound at the path rp.
	//
	// Errors:
	//
	// - If the file does not have write permissions, then BoundEndpointAt
	// returns EACCES.
	//
	// - If a non-socket file exists at rp, then BoundEndpointAt returns
	// ECONNREFUSED.
	BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error)

	// PrependPath prepends a path from vd to vd.Mount().Root() to b.
	//
	// If vfsroot.Ok(), it is the contextual VFS root; if it is encountered
	// before vd.Mount().Root(), PrependPath should stop prepending path
	// components and return a PrependPathAtVFSRootError.
	//
	// If traversal of vd.Dentry()'s ancestors encounters an independent
	// ("root") Dentry that is not vd.Mount().Root() (i.e. vd.Dentry() is not a
	// descendant of vd.Mount().Root()), PrependPath should stop prepending
	// path components and return a PrependPathAtNonMountRootError.
	//
	// Filesystems for which Dentries do not have meaningful paths may prepend
	// an arbitrary descriptive string to b and then return a
	// PrependPathSyntheticError.
	//
	// Most implementations can acquire the appropriate locks to ensure that
	// Dentry.Name() and Dentry.Parent() are fixed for vd.Dentry() and all of
	// its ancestors, then call GenericPrependPath.
	//
	// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
	PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
}

// PrependPathAtVFSRootError is returned by implementations of
// FilesystemImpl.PrependPath() when they encounter the contextual VFS root.
type PrependPathAtVFSRootError struct{}

// Error implements error.Error.
func (PrependPathAtVFSRootError) Error() string {
	return "vfs.FilesystemImpl.PrependPath() reached VFS root"
}

// PrependPathAtNonMountRootError is returned by implementations of
// FilesystemImpl.PrependPath() when they encounter an independent ancestor
// Dentry that is not the Mount root.
type PrependPathAtNonMountRootError struct{}

// Error implements error.Error.
func (PrependPathAtNonMountRootError) Error() string {
	return "vfs.FilesystemImpl.PrependPath() reached root other than Mount root"
}

// PrependPathSyntheticError is returned by implementations of
// FilesystemImpl.PrependPath() for which prepended names do not represent real
// paths.
type PrependPathSyntheticError struct{}

// Error implements error.Error.
func (PrependPathSyntheticError) Error() string {
	return "vfs.FilesystemImpl.PrependPath() prepended synthetic name"
}

// diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go new file mode 100644 index 000000000..465e610e0 --- /dev/null +++ b/pkg/sentry/vfs/filesystem_impl_util.go @@ -0,0 +1,43 @@

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"strings"
)

// GenericParseMountOptions parses a comma-separated list of options of the
// form "key" or "key=value", where neither key nor value contain commas, and
// returns it as a map. If str contains duplicate keys, then the last value
// wins. Empty options (for example from a leading, trailing or doubled
// comma) are skipped, and a value may itself contain "=" because only the
// first "=" separates key from value. For example:
//
// str = "key0=value0,key1,key2=value2,key0=value3" -> map{'key0':'value3','key1':'','key2':'value2'}
//
// GenericParseMountOptions is not appropriate if values may contain commas,
// e.g. in the case of the mpol mount option for tmpfs(5).
+func GenericParseMountOptions(str string) map[string]string { + m := make(map[string]string) + for _, opt := range strings.Split(str, ",") { + if len(opt) > 0 { + res := strings.SplitN(opt, "=", 2) + if len(res) == 2 { + m[res[0]] = res[1] + } else { + m[opt] = "" + } + } + } + return m +} diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go new file mode 100644 index 000000000..f2298f7f6 --- /dev/null +++ b/pkg/sentry/vfs/filesystem_type.go @@ -0,0 +1,117 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// A FilesystemType constructs filesystems. +// +// FilesystemType is analogous to Linux's struct file_system_type. +type FilesystemType interface { + // GetFilesystem returns a Filesystem configured by the given options, + // along with its mount root. A reference is taken on the returned + // Filesystem and Dentry. + GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error) + + // Name returns the name of this FilesystemType. + Name() string +} + +// GetFilesystemOptions contains options to FilesystemType.GetFilesystem. 
+type GetFilesystemOptions struct { + // Data is the string passed as the 5th argument to mount(2), which is + // usually a comma-separated list of filesystem-specific mount options. + Data string + + // InternalData holds opaque FilesystemType-specific data. There is + // intentionally no way for applications to specify InternalData; if it is + // not nil, the call to GetFilesystem originates from within the sentry. + InternalData interface{} +} + +// +stateify savable +type registeredFilesystemType struct { + fsType FilesystemType + opts RegisterFilesystemTypeOptions +} + +// RegisterFilesystemTypeOptions contains options to +// VirtualFilesystem.RegisterFilesystem(). +type RegisterFilesystemTypeOptions struct { + // If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt() + // for which MountOptions.InternalMount == false to use this filesystem + // type. + AllowUserMount bool + + // If AllowUserList is true, make this filesystem type visible in + // /proc/filesystems. + AllowUserList bool + + // If RequiresDevice is true, indicate that mounting this filesystem + // requires a block device as the mount source in /proc/filesystems. + RequiresDevice bool +} + +// RegisterFilesystemType registers the given FilesystemType in vfs with the +// given name. +func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) error { + vfs.fsTypesMu.Lock() + defer vfs.fsTypesMu.Unlock() + if existing, ok := vfs.fsTypes[name]; ok { + return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing.fsType) + } + vfs.fsTypes[name] = ®isteredFilesystemType{ + fsType: fsType, + opts: *opts, + } + return nil +} + +// MustRegisterFilesystemType is equivalent to RegisterFilesystemType but +// panics on failure. 
+func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) { + if err := vfs.RegisterFilesystemType(name, fsType, opts); err != nil { + panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err)) + } +} + +func (vfs *VirtualFilesystem) getFilesystemType(name string) *registeredFilesystemType { + vfs.fsTypesMu.RLock() + defer vfs.fsTypesMu.RUnlock() + return vfs.fsTypes[name] +} + +// GenerateProcFilesystems emits the contents of /proc/filesystems for vfs to +// buf. +func (vfs *VirtualFilesystem) GenerateProcFilesystems(buf *bytes.Buffer) { + vfs.fsTypesMu.RLock() + defer vfs.fsTypesMu.RUnlock() + for name, rft := range vfs.fsTypes { + if !rft.opts.AllowUserList { + continue + } + var nodev string + if !rft.opts.RequiresDevice { + nodev = "nodev" + } + fmt.Fprintf(buf, "%s\t%s\n", nodev, name) + } +} diff --git a/pkg/sentry/vfs/g3doc/inotify.md b/pkg/sentry/vfs/g3doc/inotify.md new file mode 100644 index 000000000..e7da49faa --- /dev/null +++ b/pkg/sentry/vfs/g3doc/inotify.md @@ -0,0 +1,210 @@ +# Inotify + +Inotify is a mechanism for monitoring filesystem events in Linux--see +inotify(7). An inotify instance can be used to monitor files and directories for +modifications, creation/deletion, etc. The inotify API consists of system calls +that create inotify instances (inotify_init/inotify_init1) and add/remove +watches on files to an instance (inotify_add_watch/inotify_rm_watch). Events are +generated from various places in the sentry, including the syscall layer, the +vfs layer, the process fd table, and within each filesystem implementation. This +document outlines the implementation details of inotify in VFS2. + +## Inotify Objects + +Inotify data structures are implemented in the vfs package. + +### vfs.Inotify + +Inotify instances are represented by vfs.Inotify objects, which implement +vfs.FileDescriptionImpl. 
As in Linux, inotify fds are backed by a +pseudo-filesystem (anonfs). Each inotify instance receives events from a set of +vfs.Watch objects, which can be modified with inotify_add_watch(2) and +inotify_rm_watch(2). An application can retrieve events by reading the inotify +fd. + +### vfs.Watches + +The set of all watches held on a single file (i.e., the watch target) is stored +in vfs.Watches. Each watch will belong to a different inotify instance (an +instance can only have one watch on any watch target). The watches are stored in +a map indexed by their vfs.Inotify owner’s id. Hard links and file descriptions +to a single file will all share the same vfs.Watches. Activity on the target +causes its vfs.Watches to generate notifications on its watches’ inotify +instances. + +### vfs.Watch + +A single watch, owned by one inotify instance and applied to one watch target. +Both the vfs.Inotify owner and vfs.Watches on the target will hold a vfs.Watch, +which leads to some complicated locking behavior (see Lock Ordering). Whenever a +watch is notified of an event on its target, it will queue events to its inotify +instance for delivery to the user. + +### vfs.Event + +vfs.Event is a simple struct encapsulating all the fields for an inotify event. +It is generated by vfs.Watches and forwarded to the watches' owners. It is +serialized to the user during read(2) syscalls on the associated vfs.Inotify's +fd. + +## Lock Ordering + +There are three locks related to the inotify implementation: + +Inotify.mu: the inotify instance lock. Inotify.evMu: the inotify event queue +lock. Watches.mu: the watch set lock, used to protect the collection of watches +on a target. + +The correct lock ordering for inotify code is: + +Inotify.mu -> Watches.mu -> Inotify.evMu. + +Note that we use a distinct lock to protect the inotify event queue. 
If we +simply used Inotify.mu, we could simultaneously have locks being acquired in the +order of Inotify.mu -> Watches.mu and Watches.mu -> Inotify.mu, which would +cause deadlocks. For instance, adding a watch to an inotify instance would +require locking Inotify.mu, and then adding the same watch to the target would +cause Watches.mu to be held. At the same time, generating an event on the target +would require Watches.mu to be held before iterating through each watch, and +then notifying the owner of each watch would cause Inotify.mu to be held. + +See the vfs package comment to understand how inotify locks fit into the overall +ordering of filesystem locks. + +## Watch Targets in Different Filesystem Implementations + +In Linux, watches reside on inodes at the virtual filesystem layer. As a result, +all hard links and file descriptions on a single file will all share the same +watch set. In VFS2, there is no common inode structure across filesystem types +(some may not even have inodes), so we have to plumb inotify support through +each specific filesystem implementation. Some of the technical considerations +are outlined below. + +### Tmpfs + +For filesystems with inodes, like tmpfs, the design is quite similar to that of +Linux, where watches reside on the inode. + +### Pseudo-filesystems + +Technically, because inotify is implemented at the vfs layer in Linux, +pseudo-filesystems on top of kernfs support inotify passively. However, watches +can only track explicit filesystem operations like read/write, open/close, +mknod, etc., so watches on a target like /proc/self/fd will not generate events +every time a new fd is added or removed. As of this writing, we leave inotify +unimplemented in kernfs and anonfs; it does not seem particularly useful. 
+ +### Gofer Filesystem (fsimpl/gofer) + +The gofer filesystem has several traits that make it difficult to support +inotify: + +* **There are no inodes.** A file is represented as a dentry that holds an + unopened p9 file (and possibly an open FID), through which the Sentry + interacts with the gofer. + * *Solution:* Because there is no inode structure stored in the sandbox, + inotify watches must be held on the dentry. This would be an issue in + the presence of hard links, where multiple dentries would need to share + the same set of watches, but in VFS2, we do not support the internal + creation of hard links on gofer fs. As a result, we make the assumption + that every dentry corresponds to a unique inode. However, the next point + raises an issue with this assumption: +* **The Sentry cannot always be aware of hard links on the remote + filesystem.** There is no way for us to confirm whether two files on the + remote filesystem are actually links to the same inode. QIDs and inodes are + not always 1:1. The assumption that dentries and inodes are 1:1 is + inevitably broken if there are remote hard links that we cannot detect. + * *Solution:* this is an issue with gofer fs in general, not only inotify, + and we will have to live with it. +* **Dentries can be cached, and then evicted.** Dentry lifetime does not + correspond to file lifetime. Because gofer fs is not entirely in-memory, the + absence of a dentry does not mean that the corresponding file does not + exist, nor does a dentry reaching zero references mean that the + corresponding file no longer exists. When a dentry reaches zero references, + it will be cached, in case the file at that path is needed again in the + future. However, the dentry may be evicted from the cache, which will cause + a new dentry to be created next time the same file path is used. The + existing watches will be lost. 
+ * *Solution:* When a dentry reaches zero references, do not cache it if it + has any watches, so we can avoid eviction/destruction. Note that if the + dentry was deleted or invalidated (d.vfsd.IsDead()), we should still + destroy it along with its watches. Additionally, when a dentry’s last + watch is removed, we cache it if it also has zero references. This way, + the dentry can eventually be evicted from memory if it is no longer + needed. +* **Dentries can be invalidated.** Another issue with dentry lifetime is that + the remote file at the file path represented may change from underneath the + dentry. In this case, the next time that the dentry is used, it will be + invalidated and a new dentry will replace it. In this case, it is not clear + what should be done with the watches on the old dentry. + * *Solution:* Silently destroy the watches when invalidation occurs. We + have no way of knowing exactly what happened, when it happens. Inotify + instances on NFS files in Linux probably behave in a similar fashion, + since inotify is implemented at the vfs layer and is not aware of the + complexities of remote file systems. + * An alternative would be to issue some kind of event upon invalidation, + e.g. a delete event, but this has several issues: + * We cannot discern whether the remote file was invalidated because it was + moved, deleted, etc. This information is crucial, because these cases + should result in different events. Furthermore, the watches should only + be destroyed if the file has been deleted. + * Moreover, the mechanism for detecting whether the underlying file has + changed is to check whether a new QID is given by the gofer. This may + result in false positives, e.g. suppose that the server closed and + re-opened the same file, which may result in a new QID. 
+ * Finally, the time of the event may be completely different from the time + of the file modification, since a dentry is not immediately notified + when the underlying file has changed. It would be quite unexpected to + receive the notification when invalidation was triggered, i.e. the next + time the file was accessed within the sandbox, because then the + read/write/etc. operation on the file would not result in the expected + event. + * Another point in favor of the first solution: inotify in Linux can + already be lossy on local filesystems (one of the sacrifices made so + that filesystem performance isn’t killed), and it is lossy on NFS for + similar reasons to gofer fs. Therefore, it is better for inotify to be + silent than to emit incorrect notifications. +* **There may be external users of the remote filesystem.** We can only track + operations performed on the file within the sandbox. This is sufficient + under InteropModeExclusive, but whenever there are external users, the set + of actions we are aware of is incomplete. + * *Solution:* We could either return an error or just issue a warning when + inotify is used without InteropModeExclusive. Although faulty, VFS1 + allows it when the filesystem is shared, and Linux does the same for + remote filesystems (as mentioned above, inotify sits at the vfs level). + +## Dentry Interface + +For events that must be generated above the vfs layer, we provide the following +DentryImpl methods to allow interactions with targets on any FilesystemImpl: + +* **InotifyWithParent()** generates events on the dentry’s watches as well as + its parent’s. +* **Watches()** retrieves the watch set of the target represented by the + dentry. This is used to access and modify watches on a target. +* **OnZeroWatches()** performs cleanup tasks after the last watch is removed + from a dentry. This is needed by gofer fs, which must allow a watched dentry + to be cached once it has no more watches. 
Most implementations can just do + nothing. Note that OnZeroWatches() must be called after all inotify locks + are released to preserve lock ordering, since it may acquire + FilesystemImpl-specific locks. + +## IN_EXCL_UNLINK + +There are several options that can be set for a watch, specified as part of the +mask in inotify_add_watch(2). In particular, IN_EXCL_UNLINK requires some +additional support in each filesystem. + +A watch with IN_EXCL_UNLINK will not generate events for its target if it +corresponds to a path that was unlinked. For instance, if an fd is opened on +“foo/bar” and “foo/bar” is subsequently unlinked, any reads/writes/etc. on the +fd will be ignored by watches on “foo” or “foo/bar” with IN_EXCL_UNLINK. This +requires each DentryImpl to keep track of whether it has been unlinked, in order +to determine whether events should be sent to watches with IN_EXCL_UNLINK. + +## IN_ONESHOT + +One-shot watches expire after generating a single event. When an event occurs, +all one-shot watches on the target that successfully generated an event are +removed. Lock ordering can cause the management of one-shot watches to be quite +expensive; see Watches.Notify() for more information. diff --git a/pkg/sentry/vfs/genericfstree/BUILD b/pkg/sentry/vfs/genericfstree/BUILD new file mode 100644 index 000000000..d8fd92677 --- /dev/null +++ b/pkg/sentry/vfs/genericfstree/BUILD @@ -0,0 +1,16 @@ +load("//tools/go_generics:defs.bzl", "go_template") + +package( + default_visibility = ["//:sandbox"], + licenses = ["notice"], +) + +go_template( + name = "generic_fstree", + srcs = [ + "genericfstree.go", + ], + types = [ + "Dentry", + ], +) diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go new file mode 100644 index 000000000..8882fa84a --- /dev/null +++ b/pkg/sentry/vfs/genericfstree/genericfstree.go @@ -0,0 +1,81 @@ +// Copyright 2020 The gVisor Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package genericfstree provides tools for implementing vfs.FilesystemImpls +// where a single statically-determined lock or set of locks is sufficient to +// ensure that a Dentry's name and parent are contextually immutable. +// +// Clients using this package must use the go_template_instance rule in +// tools/go_generics/defs.bzl to create an instantiation of this template +// package, providing types to use in place of Dentry. +package genericfstree + +import ( + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// Dentry is a required type parameter that is a struct with the given fields. +type Dentry struct { + // vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl. + vfsd vfs.Dentry + + // parent is the parent of this Dentry in the filesystem's tree. If this + // Dentry is a filesystem root, parent is nil. + parent *Dentry + + // name is the name of this Dentry in its parent. If this Dentry is a + // filesystem root, name is unspecified. + name string +} + +// IsAncestorDentry returns true if d is an ancestor of d2; that is, d is +// either d2's parent or an ancestor of d2's parent. +func IsAncestorDentry(d, d2 *Dentry) bool { + for d2 != nil { // Stop at root, where d2.parent == nil. + if d2.parent == d { + return true + } + if d2.parent == d2 { + return false + } + d2 = d2.parent + } + return false +} + +// ParentOrSelf returns d.parent. 
If d.parent is nil, ParentOrSelf returns d. +func ParentOrSelf(d *Dentry) *Dentry { + if d.parent != nil { + return d.parent + } + return d +} + +// PrependPath is a generic implementation of FilesystemImpl.PrependPath(). +func PrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath.Builder) error { + for { + if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { + return vfs.PrependPathAtVFSRootError{} + } + if &d.vfsd == mnt.Root() { + return nil + } + if d.parent == nil { + return vfs.PrependPathAtNonMountRootError{} + } + b.PrependComponent(d.name) + d = d.parent + } +} diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go new file mode 100644 index 000000000..c2e21ac5f --- /dev/null +++ b/pkg/sentry/vfs/inotify.go @@ -0,0 +1,774 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "bytes" + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// inotifyEventBaseSize is the base size of linux's struct inotify_event. This +// must be a power of 2 for rounding below. +const inotifyEventBaseSize = 16 + +// EventType defines different kinds of inotify events. 
+// +// The way events are labelled appears somewhat arbitrary, but they must match +// Linux so that IN_EXCL_UNLINK behaves as it does in Linux. +type EventType uint8 + +// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and +// FSNOTIFY_EVENT_INODE in Linux. +const ( + PathEvent EventType = iota + InodeEvent EventType = iota +) + +// Inotify represents an inotify instance created by inotify_init(2) or +// inotify_init1(2). Inotify implements FileDescriptionImpl. +// +// +stateify savable +type Inotify struct { + vfsfd FileDescription + FileDescriptionDefaultImpl + DentryMetadataFileDescriptionImpl + NoLockFD + + // Unique identifier for this inotify instance. We don't just reuse the + // inotify fd because fds can be duped. These should not be exposed to the + // user, since we may aggressively reuse an id on S/R. + id uint64 + + // queue is used to notify interested parties when the inotify instance + // becomes readable or writable. + queue waiter.Queue `state:"nosave"` + + // evMu *only* protects the events list. We need a separate lock while + // queuing events: using mu may violate lock ordering, since at that point + // the calling goroutine may already hold Watches.mu. + evMu sync.Mutex `state:"nosave"` + + // A list of pending events for this inotify instance. Protected by evMu. + events eventList + + // A scratch buffer, used to serialize inotify events. Allocate this + // ahead of time for the sake of performance. Protected by evMu. + scratch []byte + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // nextWatchMinusOne is used to allocate watch descriptors on this Inotify + // instance. Note that Linux starts numbering watch descriptors from 1. + nextWatchMinusOne int32 + + // Map from watch descriptors to watch objects. + watches map[int32]*Watch +} + +var _ FileDescriptionImpl = (*Inotify)(nil) + +// NewInotifyFD constructs a new Inotify instance. 
+func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) { + // O_CLOEXEC affects file descriptors, so it must be handled outside of vfs. + flags &^= linux.O_CLOEXEC + if flags&^linux.O_NONBLOCK != 0 { + return nil, syserror.EINVAL + } + + id := uniqueid.GlobalFromContext(ctx) + vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id)) + defer vd.DecRef() + fd := &Inotify{ + id: id, + scratch: make([]byte, inotifyEventBaseSize), + watches: make(map[int32]*Watch), + } + if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// Release implements FileDescriptionImpl.Release. Release removes all +// watches and frees all resources for an inotify instance. +func (i *Inotify) Release() { + var ds []*Dentry + + // We need to hold i.mu to avoid a race with concurrent calls to + // Inotify.handleDeletion from Watches. There's no risk of Watches + // accessing this Inotify after the destructor ends, because we remove all + // references to it below. + i.mu.Lock() + for _, w := range i.watches { + // Remove references to the watch from the watches set on the target. We + // don't need to worry about the references from i.watches, since this + // file description is about to be destroyed. + d := w.target + ws := d.Watches() + // Watchable dentries should never return a nil watch set. + if ws == nil { + panic("Cannot remove watch from an unwatchable dentry") + } + ws.Remove(i.id) + if ws.Size() == 0 { + ds = append(ds, d) + } + } + i.mu.Unlock() + + for _, d := range ds { + d.OnZeroWatches() + } +} + +// Allocate implements FileDescription.Allocate. +func (i *Inotify) Allocate(ctx context.Context, mode, offset, length uint64) error { + panic("Allocate should not be called on read-only inotify fds") +} + +// EventRegister implements waiter.Waitable. 
+func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + i.queue.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable. +func (i *Inotify) EventUnregister(e *waiter.Entry) { + i.queue.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +// +// Readiness indicates whether there are pending events for an inotify instance. +func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + i.evMu.Lock() + defer i.evMu.Unlock() + + if !i.events.Empty() { + ready |= waiter.EventIn + } + + return mask & ready +} + +// PRead implements FileDescriptionImpl. +func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// PWrite implements FileDescriptionImpl. +func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements FileDescriptionImpl.Write. +func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Read implements FileDescriptionImpl.Read. +func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + if dst.NumBytes() < inotifyEventBaseSize { + return 0, syserror.EINVAL + } + + i.evMu.Lock() + defer i.evMu.Unlock() + + if i.events.Empty() { + // Nothing to read yet, tell caller to block. + return 0, syserror.ErrWouldBlock + } + + var writeLen int64 + for it := i.events.Front(); it != nil; { + // Advance `it` before the element is removed from the list, or else + // it.Next() will always be nil. + event := it + it = it.Next() + + // Does the buffer have enough remaining space to hold the event we're + // about to write out? 
+ if dst.NumBytes() < int64(event.sizeOf()) { + if writeLen > 0 { + // Buffer wasn't big enough for all pending events, but we did + // write some events out. + return writeLen, nil + } + return 0, syserror.EINVAL + } + + // Linux always dequeues an available event as long as there's enough + // buffer space to copy it out, even if the copy below fails. Emulate + // this behaviour. + i.events.Remove(event) + + // Buffer has enough space, copy event to the read buffer. + n, err := event.CopyTo(ctx, i.scratch, dst) + if err != nil { + return 0, err + } + + writeLen += n + dst = dst.DropFirst64(n) + } + return writeLen, nil +} + +// Ioctl implements FileDescriptionImpl.Ioctl. +func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch args[1].Int() { + case linux.FIONREAD: + i.evMu.Lock() + defer i.evMu.Unlock() + var n uint32 + for e := i.events.Front(); e != nil; e = e.Next() { + n += uint32(e.sizeOf()) + } + var buf [4]byte + usermem.ByteOrder.PutUint32(buf[:], n) + _, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{}) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} + +func (i *Inotify) queueEvent(ev *Event) { + i.evMu.Lock() + + // Check if we should coalesce the event we're about to queue with the last + // one currently in the queue. Events are coalesced if they are identical. + if last := i.events.Back(); last != nil { + if ev.equals(last) { + // "Coalesce" the two events by simply not queuing the new one. We + // don't need to raise a waiter.EventIn notification because no new + // data is available for reading. + i.evMu.Unlock() + return + } + } + + i.events.PushBack(ev) + + // Release mutex before notifying waiters because we don't control what they + // can do. + i.evMu.Unlock() + + i.queue.Notify(waiter.EventIn) +} + +// newWatchLocked creates and adds a new watch to target. +// +// Precondition: i.mu must be locked. ws must be the watch set for target d. 
+func (i *Inotify) newWatchLocked(d *Dentry, ws *Watches, mask uint32) *Watch { + w := &Watch{ + owner: i, + wd: i.nextWatchIDLocked(), + target: d, + mask: mask, + } + + // Hold the watch in this inotify instance as well as the watch set on the + // target. + i.watches[w.wd] = w + ws.Add(w) + return w +} + +// nextWatchIDLocked allocates and returns a new watch descriptor. +// +// Precondition: i.mu must be locked. +func (i *Inotify) nextWatchIDLocked() int32 { + i.nextWatchMinusOne++ + return i.nextWatchMinusOne +} + +// AddWatch constructs a new inotify watch and adds it to the target. It +// returns the watch descriptor returned by inotify_add_watch(2). +// +// The caller must hold a reference on target. +func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) { + // Note: Locking this inotify instance protects the result returned by + // Lookup() below. With the lock held, we know for sure the lookup result + // won't become stale because it's impossible for *this* instance to + // add/remove watches on target. + i.mu.Lock() + defer i.mu.Unlock() + + ws := target.Watches() + if ws == nil { + // While Linux supports inotify watches on all filesystem types, watches on + // filesystems like kernfs are not generally useful, so we do not. + return 0, syserror.EPERM + } + // Does the target already have a watch from this inotify instance? + if existing := ws.Lookup(i.id); existing != nil { + newmask := mask + if mask&linux.IN_MASK_ADD != 0 { + // "Add (OR) events to watch mask for this pathname if it already + // exists (instead of replacing mask)." -- inotify(7) + newmask |= atomic.LoadUint32(&existing.mask) + } + atomic.StoreUint32(&existing.mask, newmask) + return existing.wd, nil + } + + // No existing watch, create a new watch. + w := i.newWatchLocked(target, ws, mask) + return w.wd, nil +} + +// RmWatch looks up an inotify watch for the given 'wd' and configures the +// target to stop sending events to this inotify instance. 
+func (i *Inotify) RmWatch(wd int32) error { + i.mu.Lock() + + // Find the watch we were asked to remove. + w, ok := i.watches[wd] + if !ok { + i.mu.Unlock() + return syserror.EINVAL + } + + // Remove the watch from this instance. + delete(i.watches, wd) + + // Remove the watch from the watch target. + ws := w.target.Watches() + // AddWatch ensures that w.target has a non-nil watch set. + if ws == nil { + panic("Watched dentry cannot have nil watch set") + } + ws.Remove(w.OwnerID()) + remaining := ws.Size() + i.mu.Unlock() + + if remaining == 0 { + w.target.OnZeroWatches() + } + + // Generate the event for the removal. + i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0)) + + return nil +} + +// Watches is the collection of all inotify watches on a single file. +// +// +stateify savable +type Watches struct { + // mu protects the fields below. + mu sync.RWMutex `state:"nosave"` + + // ws is the map of active watches in this collection, keyed by the inotify + // instance id of the owner. + ws map[uint64]*Watch +} + +// Size returns the number of watches held by w. +func (w *Watches) Size() int { + w.mu.Lock() + defer w.mu.Unlock() + return len(w.ws) +} + +// Lookup returns the watch owned by an inotify instance with the given id. +// Returns nil if no such watch exists. +// +// Precondition: the inotify instance with the given id must be locked to +// prevent the returned watch from being concurrently modified or replaced in +// Inotify.watches. +func (w *Watches) Lookup(id uint64) *Watch { + w.mu.Lock() + defer w.mu.Unlock() + return w.ws[id] +} + +// Add adds watch into this set of watches. +// +// Precondition: the inotify instance that owns watch must be locked. +func (w *Watches) Add(watch *Watch) { + w.mu.Lock() + defer w.mu.Unlock() + + owner := watch.OwnerID() + // Sanity check, we should never have two watches for one owner on the + // same target. 
+ if _, exists := w.ws[owner]; exists { + panic(fmt.Sprintf("Watch collision with ID %+v", owner)) + } + if w.ws == nil { + w.ws = make(map[uint64]*Watch) + } + w.ws[owner] = watch +} + +// Remove removes a watch with the given id from this set of watches and +// releases it. The caller is responsible for generating any watch removal +// event, as appropriate. The provided id must match an existing watch in this +// collection. +// +// Precondition: the inotify instance with the given id must be locked. +func (w *Watches) Remove(id uint64) { + w.mu.Lock() + defer w.mu.Unlock() + + if w.ws == nil { + // This watch set is being destroyed. The thread executing the + // destructor is already in the process of deleting all our watches. We + // got here with no references on the target because we raced with the + // destructor notifying all the watch owners of destruction. See the + // comment in Watches.HandleDeletion for why this race exists. + return + } + + // It is possible for w.Remove() to be called for the same watch multiple + // times. See the treatment of one-shot watches in Watches.Notify(). + if _, ok := w.ws[id]; ok { + delete(w.ws, id) + } +} + +// Notify queues a new event with watches in this set. Watches with +// IN_EXCL_UNLINK are skipped if the event is coming from a child that has been +// unlinked. +func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlinked bool) { + var hasExpired bool + w.mu.RLock() + for _, watch := range w.ws { + if unlinked && watch.ExcludeUnlinked() && et == PathEvent { + continue + } + if watch.Notify(name, events, cookie) { + hasExpired = true + } + } + w.mu.RUnlock() + + if hasExpired { + w.cleanupExpiredWatches() + } +} + +// This function is relatively expensive and should only be called where there +// are expired watches. +func (w *Watches) cleanupExpiredWatches() { + // Because of lock ordering, we cannot acquire Inotify.mu for each watch + // owner while holding w.mu. 
As a result, store expired watches locally + // before removing. + var toRemove []*Watch + w.mu.RLock() + for _, watch := range w.ws { + if atomic.LoadInt32(&watch.expired) == 1 { + toRemove = append(toRemove, watch) + } + } + w.mu.RUnlock() + for _, watch := range toRemove { + watch.owner.RmWatch(watch.wd) + } +} + +// HandleDeletion is called when the watch target is destroyed. Clear the +// watch set, detach watches from the inotify instances they belong to, and +// generate the appropriate events. +func (w *Watches) HandleDeletion() { + w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */) + + // As in Watches.Notify, we can't hold w.mu while acquiring Inotify.mu for + // the owner of each watch being deleted. Instead, atomically store the + // watches map in a local variable and set it to nil so we can iterate over + // it with the assurance that there will be no concurrent accesses. + var ws map[uint64]*Watch + w.mu.Lock() + ws = w.ws + w.ws = nil + w.mu.Unlock() + + // Remove each watch from its owner's watch set, and generate a corresponding + // watch removal event. + for _, watch := range ws { + i := watch.owner + i.mu.Lock() + _, found := i.watches[watch.wd] + delete(i.watches, watch.wd) + + // Release mutex before notifying waiters because we don't control what + // they can do. + i.mu.Unlock() + + // If watch was not found, it was removed from the inotify instance before + // we could get to it, in which case we should not generate an event. + if found { + i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0)) + } + } +} + +// Watch represent a particular inotify watch created by inotify_add_watch. +// +// +stateify savable +type Watch struct { + // Inotify instance which owns this watch. + // + // This field is immutable after creation. + owner *Inotify + + // Descriptor for this watch. This is unique across an inotify instance. + // + // This field is immutable after creation. 
+ wd int32 + + // target is a dentry representing the watch target. Its watch set contains this watch. + // + // This field is immutable after creation. + target *Dentry + + // Events being monitored via this watch. Must be accessed with atomic + // memory operations. + mask uint32 + + // expired is set to 1 to indicate that this watch is a one-shot that has + // already sent a notification and therefore can be removed. Must be accessed + // with atomic memory operations. + expired int32 +} + +// OwnerID returns the id of the inotify instance that owns this watch. +func (w *Watch) OwnerID() uint64 { + return w.owner.id +} + +// ExcludeUnlinked indicates whether the watched object should continue to be +// notified of events originating from a path that has been unlinked. +// +// For example, if "foo/bar" is opened and then unlinked, operations on the +// open fd may be ignored by watches on "foo" and "foo/bar" with IN_EXCL_UNLINK. +func (w *Watch) ExcludeUnlinked() bool { + return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0 +} + +// Notify queues a new event on this watch. Returns true if this is a one-shot +// watch that should be deleted, after this event was successfully queued. +func (w *Watch) Notify(name string, events uint32, cookie uint32) bool { + if atomic.LoadInt32(&w.expired) == 1 { + // This is a one-shot watch that is already in the process of being + // removed. This may happen if a second event reaches the watch target + // before this watch has been removed. + return false + } + + mask := atomic.LoadUint32(&w.mask) + if mask&events == 0 { + // We weren't watching for this event. + return false + } + + // Event mask should include bits matched from the watch plus all control + // event bits. 
// paddedBytes converts a Go string to a NUL-terminated C string, padded with
// NUL bytes to a total size of l bytes.
//
// l must be large enough to hold every byte of s plus at least one
// terminating NUL byte; otherwise paddedBytes panics instead of silently
// truncating the name.
func paddedBytes(s string, l uint32) []byte {
	// Compare in 64 bits. Narrowing len(s)+1 to uint32 would wrap around for
	// very large strings and let them slip past this guard.
	if uint64(l) < uint64(len(s))+1 {
		panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!")
	}

	// make zero-fills the slice, so everything after the copied bytes is
	// already NUL padding.
	b := make([]byte, l)
	copy(b, s)
	return b
}
+ e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1)) + // Make sure we haven't overflowed and wrapped around when rounding. + if unpaddedLen > int(e.len) { + panic("Overflow when rounding inotify event size, the 'name' field was too big.") + } + e.name = paddedBytes(name, e.len) +} + +func (e *Event) sizeOf() int { + s := inotifyEventBaseSize + int(e.len) + if s < inotifyEventBaseSize { + panic("Overflowed event size") + } + return s +} + +// CopyTo serializes this event to dst. buf is used as a scratch buffer to +// construct the output. We use a buffer allocated ahead of time for +// performance. buf must be at least inotifyEventBaseSize bytes. +func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) { + usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd)) + usermem.ByteOrder.PutUint32(buf[4:], e.mask) + usermem.ByteOrder.PutUint32(buf[8:], e.cookie) + usermem.ByteOrder.PutUint32(buf[12:], e.len) + + writeLen := 0 + + n, err := dst.CopyOut(ctx, buf) + if err != nil { + return 0, err + } + writeLen += n + dst = dst.DropFirst(n) + + if e.len > 0 { + n, err = dst.CopyOut(ctx, e.name) + if err != nil { + return 0, err + } + writeLen += n + } + + // Santiy check. + if writeLen != e.sizeOf() { + panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen)) + } + + return int64(writeLen), nil +} + +func (e *Event) equals(other *Event) bool { + return e.wd == other.wd && + e.mask == other.mask && + e.cookie == other.cookie && + e.len == other.len && + bytes.Equal(e.name, other.name) +} + +// InotifyEventFromStatMask generates the appropriate events for an operation +// that set the stats specified in mask. 
+func InotifyEventFromStatMask(mask uint32) uint32 { + var ev uint32 + if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 { + ev |= linux.IN_ATTRIB + } + if mask&linux.STATX_SIZE != 0 { + ev |= linux.IN_MODIFY + } + + if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) { + // Both times indicates a utime(s) call. + ev |= linux.IN_ATTRIB + } else if mask&linux.STATX_ATIME != 0 { + ev |= linux.IN_ACCESS + } else if mask&linux.STATX_MTIME != 0 { + mask |= linux.IN_MODIFY + } + return ev +} + +// InotifyRemoveChild sends the appriopriate notifications to the watch sets of +// the child being removed and its parent. Note that unlike most pairs of +// parent/child notifications, the child is notified first in this case. +func InotifyRemoveChild(self, parent *Watches, name string) { + if self != nil { + self.Notify("", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */) + } + if parent != nil { + parent.Notify(name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */) + } +} + +// InotifyRename sends the appriopriate notifications to the watch sets of the +// file being renamed and its old/new parents. +func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) { + var dirEv uint32 + if isDir { + dirEv = linux.IN_ISDIR + } + cookie := uniqueid.InotifyCookie(ctx) + if oldParent != nil { + oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */) + } + if newParent != nil { + newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */) + } + // Somewhat surprisingly, self move events do not have a cookie. 
+ if renamed != nil { + renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */) + } +} diff --git a/pkg/sentry/vfs/lock.go b/pkg/sentry/vfs/lock.go new file mode 100644 index 000000000..6c7583a81 --- /dev/null +++ b/pkg/sentry/vfs/lock.go @@ -0,0 +1,109 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package lock provides POSIX and BSD style file locking for VFS2 file +// implementations. +// +// The actual implementations can be found in the lock package under +// sentry/fs/lock. +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/syserror" +) + +// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2) +// and flock(2) respectively in Linux. It can be embedded into various file +// implementations for VFS2 that support locking. +// +// Note that in Linux these two types of locks are _not_ cooperative, because +// race and deadlock conditions make merging them prohibitive. We do the same +// and keep them oblivious to each other. +type FileLocks struct { + // bsd is a set of BSD-style advisory file wide locks, see flock(2). + bsd fslock.Locks + + // posix is a set of POSIX-style regional advisory locks, see fcntl(2). + posix fslock.Locks +} + +// LockBSD tries to acquire a BSD-style lock on the entire file. 
+func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) { + return nil + } + return syserror.ErrWouldBlock +} + +// UnlockBSD releases a BSD-style lock on the entire file. +// +// This operation is always successful, even if there did not exist a lock on +// the requested region held by uid in the first place. +func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) { + fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF}) +} + +// LockPOSIX tries to acquire a POSIX-style lock on a file region. +func (fl *FileLocks) LockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + rng, err := computeRange(ctx, fd, start, length, whence) + if err != nil { + return err + } + if fl.posix.LockRegion(uid, t, rng, block) { + return nil + } + return syserror.ErrWouldBlock +} + +// UnlockPOSIX releases a POSIX-style lock on a file region. +// +// This operation is always successful, even if there did not exist a lock on +// the requested region held by uid in the first place. +func (fl *FileLocks) UnlockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, start, length uint64, whence int16) error { + rng, err := computeRange(ctx, fd, start, length, whence) + if err != nil { + return err + } + fl.posix.UnlockRegion(uid, rng) + return nil +} + +func computeRange(ctx context.Context, fd *FileDescription, start uint64, length uint64, whence int16) (fslock.LockRange, error) { + var off int64 + switch whence { + case linux.SEEK_SET: + off = 0 + case linux.SEEK_CUR: + // Note that Linux does not hold any mutexes while retrieving the file + // offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. 
+ curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR) + if err != nil { + return fslock.LockRange{}, err + } + off = curOff + case linux.SEEK_END: + stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE}) + if err != nil { + return fslock.LockRange{}, err + } + off = int64(stat.Size) + default: + return fslock.LockRange{}, syserror.EINVAL + } + + return fslock.ComputeRange(int64(start), int64(length), off) +} diff --git a/pkg/sentry/vfs/memxattr/BUILD b/pkg/sentry/vfs/memxattr/BUILD new file mode 100644 index 000000000..d8c4d27b9 --- /dev/null +++ b/pkg/sentry/vfs/memxattr/BUILD @@ -0,0 +1,15 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "memxattr", + srcs = ["xattr.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go new file mode 100644 index 000000000..cc1e7d764 --- /dev/null +++ b/pkg/sentry/vfs/memxattr/xattr.go @@ -0,0 +1,102 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package memxattr provides a default, in-memory extended attribute +// implementation. 
+package memxattr + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// SimpleExtendedAttributes implements extended attributes using a map of +// names to values. +// +// +stateify savable +type SimpleExtendedAttributes struct { + // mu protects the below fields. + mu sync.RWMutex `state:"nosave"` + xattrs map[string]string +} + +// Getxattr returns the value at 'name'. +func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) { + x.mu.RLock() + value, ok := x.xattrs[opts.Name] + x.mu.RUnlock() + if !ok { + return "", syserror.ENODATA + } + // Check that the size of the buffer provided in getxattr(2) is large enough + // to contain the value. + if opts.Size != 0 && uint64(len(value)) > opts.Size { + return "", syserror.ERANGE + } + return value, nil +} + +// Setxattr sets 'value' at 'name'. +func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error { + x.mu.Lock() + defer x.mu.Unlock() + if x.xattrs == nil { + if opts.Flags&linux.XATTR_REPLACE != 0 { + return syserror.ENODATA + } + x.xattrs = make(map[string]string) + } + + _, ok := x.xattrs[opts.Name] + if ok && opts.Flags&linux.XATTR_CREATE != 0 { + return syserror.EEXIST + } + if !ok && opts.Flags&linux.XATTR_REPLACE != 0 { + return syserror.ENODATA + } + + x.xattrs[opts.Name] = opts.Value + return nil +} + +// Listxattr returns all names in xattrs. +func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) { + // Keep track of the size of the buffer needed in listxattr(2) for the list. + listSize := 0 + x.mu.RLock() + names := make([]string, 0, len(x.xattrs)) + for n := range x.xattrs { + names = append(names, n) + // Add one byte per null terminator. 
+ listSize += len(n) + 1 + } + x.mu.RUnlock() + if size != 0 && uint64(listSize) > size { + return nil, syserror.ERANGE + } + return names, nil +} + +// Removexattr removes the xattr at 'name'. +func (x *SimpleExtendedAttributes) Removexattr(name string) error { + x.mu.Lock() + defer x.mu.Unlock() + if _, ok := x.xattrs[name]; !ok { + return syserror.ENODATA + } + delete(x.xattrs, name) + return nil +} diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go new file mode 100644 index 000000000..32f901bd8 --- /dev/null +++ b/pkg/sentry/vfs/mount.go @@ -0,0 +1,903 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "bytes" + "fmt" + "math" + "sort" + "strings" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem +// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem +// (Mount.fs), which applies to path resolution in the context of a particular +// Mount (Mount.key.parent). +// +// Mounts are reference-counted. Unless otherwise specified, all Mount methods +// require that a reference is held. +// +// Mount and Filesystem are distinct types because it's possible for a single +// Filesystem to be mounted at multiple locations and/or in multiple mount +// namespaces. 
+// +// Mount is analogous to Linux's struct mount. (gVisor does not distinguish +// between struct mount and struct vfsmount.) +// +// +stateify savable +type Mount struct { + // vfs, fs, root are immutable. References are held on fs and root. + // + // Invariant: root belongs to fs. + vfs *VirtualFilesystem + fs *Filesystem + root *Dentry + + // ID is the immutable mount ID. + ID uint64 + + // Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except + // for MS_RDONLY which is tracked in "writers". Immutable. + Flags MountFlags + + // key is protected by VirtualFilesystem.mountMu and + // VirtualFilesystem.mounts.seq, and may be nil. References are held on + // key.parent and key.point if they are not nil. + // + // Invariant: key.parent != nil iff key.point != nil. key.point belongs to + // key.parent.fs. + key mountKey + + // ns is the namespace in which this Mount was mounted. ns is protected by + // VirtualFilesystem.mountMu. + ns *MountNamespace + + // The lower 63 bits of refs are a reference count. The MSB of refs is set + // if the Mount has been eagerly umounted, as by umount(2) without the + // MNT_DETACH flag. refs is accessed using atomic memory operations. + refs int64 + + // children is the set of all Mounts for which Mount.key.parent is this + // Mount. children is protected by VirtualFilesystem.mountMu. + children map[*Mount]struct{} + + // umounted is true if VFS.umountRecursiveLocked() has been called on this + // Mount. VirtualFilesystem does not hold a reference on Mounts for which + // umounted is true. umounted is protected by VirtualFilesystem.mountMu. + umounted bool + + // The lower 63 bits of writers is the number of calls to + // Mount.CheckBeginWrite() that have not yet been paired with a call to + // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. + // writers is accessed using atomic memory operations. 
+ writers int64 +} + +func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount { + mnt := &Mount{ + ID: atomic.AddUint64(&vfs.lastMountID, 1), + Flags: opts.Flags, + vfs: vfs, + fs: fs, + root: root, + ns: mntns, + refs: 1, + } + if opts.ReadOnly { + mnt.setReadOnlyLocked(true) + } + return mnt +} + +// Options returns a copy of the MountOptions currently applicable to mnt. +func (mnt *Mount) Options() MountOptions { + mnt.vfs.mountMu.Lock() + defer mnt.vfs.mountMu.Unlock() + return MountOptions{ + Flags: mnt.Flags, + ReadOnly: mnt.readOnly(), + } +} + +// A MountNamespace is a collection of Mounts.// +// MountNamespaces are reference-counted. Unless otherwise specified, all +// MountNamespace methods require that a reference is held. +// +// MountNamespace is analogous to Linux's struct mnt_namespace. +// +// +stateify savable +type MountNamespace struct { + // Owner is the usernamespace that owns this mount namespace. + Owner *auth.UserNamespace + + // root is the MountNamespace's root mount. root is immutable. + root *Mount + + // refs is the reference count. refs is accessed using atomic memory + // operations. + refs int64 + + // mountpoints maps all Dentries which are mount points in this namespace + // to the number of Mounts for which they are mount points. mountpoints is + // protected by VirtualFilesystem.mountMu. + // + // mountpoints is used to determine if a Dentry can be moved or removed + // (which requires that the Dentry is not a mount point in the calling + // namespace). + // + // mountpoints is maintained even if there are no references held on the + // MountNamespace; this is required to ensure that + // VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate + // correctly on unreferenced MountNamespaces. + mountpoints map[*Dentry]uint32 +} + +// NewMountNamespace returns a new mount namespace with a root filesystem +// configured by the given arguments. 
A reference is taken on the returned +// MountNamespace. +func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) { + rft := vfs.getFilesystemType(fsTypeName) + if rft == nil { + ctx.Warningf("Unknown filesystem type: %s", fsTypeName) + return nil, syserror.ENODEV + } + fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts) + if err != nil { + return nil, err + } + mntns := &MountNamespace{ + Owner: creds.UserNamespace, + refs: 1, + mountpoints: make(map[*Dentry]uint32), + } + mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{}) + return mntns, nil +} + +// NewDisconnectedMount returns a Mount representing fs with the given root +// (which may be nil). The new Mount is not associated with any MountNamespace +// and is not connected to any other Mounts. References are taken on fs and +// root. +func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) (*Mount, error) { + fs.IncRef() + if root != nil { + root.IncRef() + } + return newMount(vfs, fs, root, nil /* mntns */, opts), nil +} + +// MountDisconnected creates a Filesystem configured by the given arguments, +// then returns a Mount representing it. The new Mount is not associated with +// any MountNamespace and is not connected to any other Mounts. 
+func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) { + rft := vfs.getFilesystemType(fsTypeName) + if rft == nil { + return nil, syserror.ENODEV + } + if !opts.InternalMount && !rft.opts.AllowUserMount { + return nil, syserror.ENODEV + } + fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) + if err != nil { + return nil, err + } + defer root.DecRef() + defer fs.DecRef() + return vfs.NewDisconnectedMount(fs, root, opts) +} + +// ConnectMountAt connects mnt at the path represented by target. +// +// Preconditions: mnt must be disconnected. +func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error { + // We can't hold vfs.mountMu while calling FilesystemImpl methods due to + // lock ordering. + vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) + if err != nil { + return err + } + vfs.mountMu.Lock() + vd.dentry.mu.Lock() + for { + if vd.dentry.dead { + vd.dentry.mu.Unlock() + vfs.mountMu.Unlock() + vd.DecRef() + return syserror.ENOENT + } + // vd might have been mounted over between vfs.GetDentryAt() and + // vfs.mountMu.Lock(). + if !vd.dentry.isMounted() { + break + } + nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry) + if nextmnt == nil { + break + } + // It's possible that nextmnt has been umounted but not disconnected, + // in which case vfs no longer holds a reference on it, and the last + // reference may be concurrently dropped even though we're holding + // vfs.mountMu. + if !nextmnt.tryIncMountedRef() { + break + } + // This can't fail since we're holding vfs.mountMu. 
+ nextmnt.root.IncRef() + vd.dentry.mu.Unlock() + vd.DecRef() + vd = VirtualDentry{ + mount: nextmnt, + dentry: nextmnt.root, + } + vd.dentry.mu.Lock() + } + // TODO(gvisor.dev/issue/1035): Linux requires that either both the mount + // point and the mount root are directories, or neither are, and returns + // ENOTDIR if this is not the case. + mntns := vd.mount.ns + vfs.mounts.seq.BeginWrite() + vfs.connectLocked(mnt, vd, mntns) + vfs.mounts.seq.EndWrite() + vd.dentry.mu.Unlock() + vfs.mountMu.Unlock() + return nil +} + +// MountAt creates and mounts a Filesystem configured by the given arguments. +func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error { + mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts) + if err != nil { + return err + } + defer mnt.DecRef() + if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil { + return err + } + return nil +} + +// UmountAt removes the Mount at the given path. +func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error { + if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 { + return syserror.EINVAL + } + + // MNT_FORCE is currently unimplemented except for the permission check. + // Force unmounting specifically requires CAP_SYS_ADMIN in the root user + // namespace, and not in the owner user namespace for the target mount. See + // fs/namespace.c:SYSCALL_DEFINE2(umount, ...) 
+ if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) { + return syserror.EPERM + } + + vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) + if err != nil { + return err + } + defer vd.DecRef() + if vd.dentry != vd.mount.root { + return syserror.EINVAL + } + vfs.mountMu.Lock() + if mntns := MountNamespaceFromContext(ctx); mntns != nil { + defer mntns.DecRef() + if mntns != vd.mount.ns { + vfs.mountMu.Unlock() + return syserror.EINVAL + } + } + + // TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's + // root, which we don't implement yet (we'll just fail it since the caller + // holds a reference on it). + + vfs.mounts.seq.BeginWrite() + if opts.Flags&linux.MNT_DETACH == 0 { + if len(vd.mount.children) != 0 { + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + return syserror.EBUSY + } + // We are holding a reference on vd.mount. + expectedRefs := int64(1) + if !vd.mount.umounted { + expectedRefs = 2 + } + if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + return syserror.EBUSY + } + } + vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{ + eager: opts.Flags&linux.MNT_DETACH == 0, + disconnectHierarchy: true, + }, nil, nil) + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + for _, vd := range vdsToDecRef { + vd.DecRef() + } + for _, mnt := range mountsToDecRef { + mnt.DecRef() + } + return nil +} + +type umountRecursiveOptions struct { + // If eager is true, ensure that future calls to Mount.tryIncMountedRef() + // on umounted mounts fail. + // + // eager is analogous to Linux's UMOUNT_SYNC. + eager bool + + // If disconnectHierarchy is true, Mounts that are umounted hierarchically + // should be disconnected from their parents. 
(Mounts whose parents are not + // umounted, which in most cases means the Mount passed to the initial call + // to umountRecursiveLocked, are unconditionally disconnected for + // consistency with Linux.) + // + // disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED. + disconnectHierarchy bool +} + +// umountRecursiveLocked marks mnt and its descendants as umounted. It does not +// release mount or dentry references; instead, it appends VirtualDentries and +// Mounts on which references must be dropped to vdsToDecRef and mountsToDecRef +// respectively, and returns updated slices. (This is necessary because +// filesystem locks possibly taken by DentryImpl.DecRef() may precede +// vfs.mountMu in the lock order, and Mount.DecRef() may lock vfs.mountMu.) +// +// umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree(). +// +// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a +// writer critical section. +func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) { + if !mnt.umounted { + mnt.umounted = true + mountsToDecRef = append(mountsToDecRef, mnt) + if parent := mnt.parent(); parent != nil && (opts.disconnectHierarchy || !parent.umounted) { + vdsToDecRef = append(vdsToDecRef, vfs.disconnectLocked(mnt)) + } + } + if opts.eager { + for { + refs := atomic.LoadInt64(&mnt.refs) + if refs < 0 { + break + } + if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs|math.MinInt64) { + break + } + } + } + for child := range mnt.children { + vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(child, opts, vdsToDecRef, mountsToDecRef) + } + return vdsToDecRef, mountsToDecRef +} + +// connectLocked makes vd the mount parent/point for mnt. It consumes +// references held by vd. +// +// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a +// writer critical section. d.mu must be locked. 
mnt.parent() == nil, i.e. mnt +// must not already be connected. +func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) { + if checkInvariants { + if mnt.parent() != nil { + panic("VFS.connectLocked called on connected mount") + } + } + mnt.IncRef() // dropped by callers of umountRecursiveLocked + mnt.storeKey(vd) + if vd.mount.children == nil { + vd.mount.children = make(map[*Mount]struct{}) + } + vd.mount.children[mnt] = struct{}{} + atomic.AddUint32(&vd.dentry.mounts, 1) + mnt.ns = mntns + mntns.mountpoints[vd.dentry]++ + vfs.mounts.insertSeqed(mnt) + vfsmpmounts, ok := vfs.mountpoints[vd.dentry] + if !ok { + vfsmpmounts = make(map[*Mount]struct{}) + vfs.mountpoints[vd.dentry] = vfsmpmounts + } + vfsmpmounts[mnt] = struct{}{} +} + +// disconnectLocked makes vd have no mount parent/point and returns its old +// mount parent/point with a reference held. +// +// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a +// writer critical section. mnt.parent() != nil. +func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry { + vd := mnt.loadKey() + if checkInvariants { + if vd.mount != nil { + panic("VFS.disconnectLocked called on disconnected mount") + } + } + mnt.storeKey(VirtualDentry{}) + delete(vd.mount.children, mnt) + atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1 + mnt.ns.mountpoints[vd.dentry]-- + if mnt.ns.mountpoints[vd.dentry] == 0 { + delete(mnt.ns.mountpoints, vd.dentry) + } + vfs.mounts.removeSeqed(mnt) + vfsmpmounts := vfs.mountpoints[vd.dentry] + delete(vfsmpmounts, mnt) + if len(vfsmpmounts) == 0 { + delete(vfs.mountpoints, vd.dentry) + } + return vd +} + +// tryIncMountedRef increments mnt's reference count and returns true. If mnt's +// reference count is already zero, or has been eagerly umounted, +// tryIncMountedRef does nothing and returns false. +// +// tryIncMountedRef does not require that a reference is held on mnt. 
+func (mnt *Mount) tryIncMountedRef() bool { + for { + refs := atomic.LoadInt64(&mnt.refs) + if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted + return false + } + if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) { + return true + } + } +} + +// IncRef increments mnt's reference count. +func (mnt *Mount) IncRef() { + // In general, negative values for mnt.refs are valid because the MSB is + // the eager-unmount bit. + atomic.AddInt64(&mnt.refs, 1) +} + +// DecRef decrements mnt's reference count. +func (mnt *Mount) DecRef() { + refs := atomic.AddInt64(&mnt.refs, -1) + if refs&^math.MinInt64 == 0 { // mask out MSB + var vd VirtualDentry + if mnt.parent() != nil { + mnt.vfs.mountMu.Lock() + mnt.vfs.mounts.seq.BeginWrite() + vd = mnt.vfs.disconnectLocked(mnt) + mnt.vfs.mounts.seq.EndWrite() + mnt.vfs.mountMu.Unlock() + } + mnt.root.DecRef() + mnt.fs.DecRef() + if vd.Ok() { + vd.DecRef() + } + } +} + +// IncRef increments mntns' reference count. +func (mntns *MountNamespace) IncRef() { + if atomic.AddInt64(&mntns.refs, 1) <= 1 { + panic("MountNamespace.IncRef() called without holding a reference") + } +} + +// DecRef decrements mntns' reference count. +func (mntns *MountNamespace) DecRef() { + vfs := mntns.root.fs.VirtualFilesystem() + if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { + vfs.mountMu.Lock() + vfs.mounts.seq.BeginWrite() + vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{ + disconnectHierarchy: true, + }, nil, nil) + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + for _, vd := range vdsToDecRef { + vd.DecRef() + } + for _, mnt := range mountsToDecRef { + mnt.DecRef() + } + } else if refs < 0 { + panic("MountNamespace.DecRef() called without holding a reference") + } +} + +// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes +// a reference on the returned Mount. If (mnt, d) is not a mount point, +// getMountAt returns nil. 
//
// getMountAt is analogous to Linux's fs/namei.c:follow_mount().
//
// Preconditions: References are held on mnt and d.
func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount {
	// The first mount is special-cased:
	//
	// - The caller is assumed to have checked d.isMounted() already. (This
	// isn't a precondition because it doesn't matter for correctness.)
	//
	// - We return nil, instead of mnt, if there is no mount at (mnt, d).
	//
	// - We don't drop the caller's references on mnt and d.
retryFirst:
	next := vfs.mounts.Lookup(mnt, d)
	if next == nil {
		return nil
	}
	if !next.tryIncMountedRef() {
		// Raced with umount.
		goto retryFirst
	}
	mnt = next
	d = next.root
	// We don't need to take Dentry refs anywhere in this function because
	// Mounts hold references on Mount.root, which is immutable.
	for d.isMounted() {
		next := vfs.mounts.Lookup(mnt, d)
		if next == nil {
			break
		}
		if !next.tryIncMountedRef() {
			// Raced with umount.
			continue
		}
		// From here on mnt is a Mount this function took a reference on (not
		// the caller's), so that reference is dropped before moving up the
		// stack.
		mnt.DecRef()
		mnt = next
		d = next.root
	}
	return mnt
}

// getMountpointAt returns the mount point for the stack of Mounts including
// mnt. It takes a reference on the returned VirtualDentry. If no such mount
// point exists (i.e. mnt is a root mount), getMountpointAt returns a zero
// VirtualDentry.
//
// Preconditions: References are held on mnt and vfsroot. vfsroot is not (mnt,
// mnt.root).
func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
	// The first mount is special-cased:
	//
	// - The caller must have already checked mnt against vfsroot.
	//
	// - We return a zero VirtualDentry, instead of mnt, if there is no mount
	// point for mnt.
	//
	// - We don't drop the caller's reference on mnt.
retryFirst:
	// Read mnt's key (parent, point) inside a seqcount read section, and
	// retry if a writer may have changed it in the meantime.
	epoch := vfs.mounts.seq.BeginRead()
	parent, point := mnt.parent(), mnt.point()
	if !vfs.mounts.seq.ReadOk(epoch) {
		goto retryFirst
	}
	if parent == nil {
		return VirtualDentry{}
	}
	if !parent.tryIncMountedRef() {
		// Raced with umount.
		goto retryFirst
	}
	if !point.TryIncRef() {
		// Since Mount holds a reference on Mount.key.point, this can only
		// happen due to a racing change to Mount.key.
		parent.DecRef()
		goto retryFirst
	}
	// Re-validate after taking both references: if the key changed while the
	// references were being acquired, they are dropped and the read is
	// retried.
	if !vfs.mounts.seq.ReadOk(epoch) {
		point.DecRef()
		parent.DecRef()
		goto retryFirst
	}
	mnt = parent
	d := point
	// Keep walking upward while (mnt, d) is itself the root of a mount, i.e.
	// while d is the root of mnt, stopping at vfsroot or at a mount with no
	// parent.
	for {
		if mnt == vfsroot.mount && d == vfsroot.dentry {
			break
		}
		if d != mnt.root {
			break
		}
	retryNotFirst:
		epoch := vfs.mounts.seq.BeginRead()
		parent, point := mnt.parent(), mnt.point()
		if !vfs.mounts.seq.ReadOk(epoch) {
			goto retryNotFirst
		}
		if parent == nil {
			break
		}
		if !parent.tryIncMountedRef() {
			// Raced with umount.
			goto retryNotFirst
		}
		if !point.TryIncRef() {
			// Since Mount holds a reference on Mount.key.point, this can
			// only happen due to a racing change to Mount.key.
			parent.DecRef()
			goto retryNotFirst
		}
		if !vfs.mounts.seq.ReadOk(epoch) {
			point.DecRef()
			parent.DecRef()
			goto retryNotFirst
		}
		// Drop the references taken on the previous (mnt, d) pair now that
		// references are held on the next pair up.
		d.DecRef()
		mnt.DecRef()
		mnt = parent
		d = point
	}
	return VirtualDentry{mnt, d}
}

// CheckBeginWrite increments the counter of in-progress write operations on
// mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns
// EROFS.
//
// If CheckBeginWrite succeeds, EndWrite must be called when the write
// operation is finished.
func (mnt *Mount) CheckBeginWrite() error {
	// A negative result after the increment is treated as read-only: the
	// increment is undone and EROFS is returned.
	if atomic.AddInt64(&mnt.writers, 1) < 0 {
		atomic.AddInt64(&mnt.writers, -1)
		return syserror.EROFS
	}
	return nil
}

// EndWrite indicates that a write operation signaled by a previous successful
// call to CheckBeginWrite has finished.
+func (mnt *Mount) EndWrite() { + atomic.AddInt64(&mnt.writers, -1) +} + +// Preconditions: VirtualFilesystem.mountMu must be locked. +func (mnt *Mount) setReadOnlyLocked(ro bool) error { + if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro { + return nil + } + if ro { + if !atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) { + return syserror.EBUSY + } + return nil + } + // Unset MSB without dropping any temporary increments from failed calls to + // mnt.CheckBeginWrite(). + atomic.AddInt64(&mnt.writers, math.MinInt64) + return nil +} + +func (mnt *Mount) readOnly() bool { + return atomic.LoadInt64(&mnt.writers) < 0 +} + +// Filesystem returns the mounted Filesystem. It does not take a reference on +// the returned Filesystem. +func (mnt *Mount) Filesystem() *Filesystem { + return mnt.fs +} + +// submountsLocked returns this Mount and all Mounts that are descendents of +// it. +// +// Precondition: mnt.vfs.mountMu must be held. +func (mnt *Mount) submountsLocked() []*Mount { + mounts := []*Mount{mnt} + for m := range mnt.children { + mounts = append(mounts, m.submountsLocked()...) + } + return mounts +} + +// Root returns the mount's root. It does not take a reference on the returned +// Dentry. +func (mnt *Mount) Root() *Dentry { + return mnt.root +} + +// Root returns mntns' root. A reference is taken on the returned +// VirtualDentry. +func (mntns *MountNamespace) Root() VirtualDentry { + vd := VirtualDentry{ + mount: mntns.root, + dentry: mntns.root.root, + } + vd.IncRef() + return vd +} + +// GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf. +// +// Preconditions: taskRootDir.Ok(). 
+func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { + vfs.mountMu.Lock() + defer vfs.mountMu.Unlock() + rootMnt := taskRootDir.mount + mounts := rootMnt.submountsLocked() + sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) + for _, mnt := range mounts { + // Get the path to this mount relative to task root. + mntRootVD := VirtualDentry{ + mount: mnt, + dentry: mnt.root, + } + path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD) + if err != nil { + // For some reason we didn't get a path. Log a warning + // and run with empty path. + ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) + path = "" + } + if path == "" { + // Either an error occurred, or path is not reachable + // from root. + break + } + + opts := "rw" + if mnt.readOnly() { + opts = "ro" + } + if mnt.Flags.NoATime { + opts = ",noatime" + } + if mnt.Flags.NoExec { + opts += ",noexec" + } + + // Format: + // <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order> + // + // The "needs dump" and "fsck order" flags are always 0, which + // is allowed. + fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0) + } +} + +// GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to +// buf. +// +// Preconditions: taskRootDir.Ok(). +func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { + vfs.mountMu.Lock() + defer vfs.mountMu.Unlock() + rootMnt := taskRootDir.mount + mounts := rootMnt.submountsLocked() + sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) + for _, mnt := range mounts { + // Get the path to this mount relative to task root. 
+ mntRootVD := VirtualDentry{ + mount: mnt, + dentry: mnt.root, + } + path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD) + if err != nil { + // For some reason we didn't get a path. Log a warning + // and run with empty path. + ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) + path = "" + } + if path == "" { + // Either an error occurred, or path is not reachable + // from root. + break + } + // Stat the mount root to get the major/minor device numbers. + pop := &PathOperation{ + Root: mntRootVD, + Start: mntRootVD, + } + statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{}) + if err != nil { + // Well that's not good. Ignore this mount. + break + } + + // Format: + // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue + // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) + + // (1) Mount ID. + fmt.Fprintf(buf, "%d ", mnt.ID) + + // (2) Parent ID (or this ID if there is no parent). + pID := mnt.ID + if p := mnt.parent(); p != nil { + pID = p.ID + } + fmt.Fprintf(buf, "%d ", pID) + + // (3) Major:Minor device ID. We don't have a superblock, so we + // just use the root inode device number. + fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor) + + // (4) Root: the pathname of the directory in the filesystem + // which forms the root of this mount. + // + // NOTE(b/78135857): This will always be "/" until we implement + // bind mounts. + fmt.Fprintf(buf, "/ ") + + // (5) Mount point (relative to process root). + fmt.Fprintf(buf, "%s ", manglePath(path)) + + // (6) Mount options. + opts := "rw" + if mnt.readOnly() { + opts = "ro" + } + if mnt.Flags.NoATime { + opts = ",noatime" + } + if mnt.Flags.NoExec { + opts += ",noexec" + } + fmt.Fprintf(buf, "%s ", opts) + + // (7) Optional fields: zero or more fields of the form "tag[:value]". + // (8) Separator: the end of the optional fields is marked by a single hyphen. + fmt.Fprintf(buf, "- ") + + // (9) Filesystem type. 
+ fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name()) + + // (10) Mount source: filesystem-specific information or "none". + fmt.Fprintf(buf, "none ") + + // (11) Superblock options, and final newline. + fmt.Fprintf(buf, "%s\n", superBlockOpts(path, mnt)) + } +} + +// manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents. +// See Linux fs/seq_file.c:mangle_path. +func manglePath(p string) string { + r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134") + return r.Replace(p) +} + +// superBlockOpts returns the super block options string for the the mount at +// the given path. +func superBlockOpts(mountPath string, mnt *Mount) string { + // gVisor doesn't (yet) have a concept of super block options, so we + // use the ro/rw bit from the mount flag. + opts := "rw" + if mnt.readOnly() { + opts = "ro" + } + + // NOTE(b/147673608): If the mount is a cgroup, we also need to include + // the cgroup name in the options. For now we just read that from the + // path. + // + // TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we + // should get this value from the cgroup itself, and not rely on the + // path. + if mnt.fs.FilesystemType().Name() == "cgroup" { + splitPath := strings.Split(mountPath, "/") + cgroupType := splitPath[len(splitPath)-1] + opts += "," + cgroupType + } + return opts +} diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go new file mode 100644 index 000000000..3335e4057 --- /dev/null +++ b/pkg/sentry/vfs/mount_test.go @@ -0,0 +1,458 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + "runtime" + "testing" + + "gvisor.dev/gvisor/pkg/sync" +) + +func TestMountTableLookupEmpty(t *testing.T) { + var mt mountTable + mt.Init() + + parent := &Mount{} + point := &Dentry{} + if m := mt.Lookup(parent, point); m != nil { + t.Errorf("empty mountTable lookup: got %p, wanted nil", m) + } +} + +func TestMountTableInsertLookup(t *testing.T) { + var mt mountTable + mt.Init() + + mount := &Mount{} + mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}}) + mt.Insert(mount) + + if m := mt.Lookup(mount.parent(), mount.point()); m != mount { + t.Errorf("mountTable positive lookup: got %p, wanted %p", m, mount) + } + + otherParent := &Mount{} + if m := mt.Lookup(otherParent, mount.point()); m != nil { + t.Errorf("mountTable lookup with wrong mount parent: got %p, wanted nil", m) + } + otherPoint := &Dentry{} + if m := mt.Lookup(mount.parent(), otherPoint); m != nil { + t.Errorf("mountTable lookup with wrong mount point: got %p, wanted nil", m) + } +} + +// TODO(gvisor.dev/issue/1035): concurrent lookup/insertion/removal. + +// must be powers of 2 +var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8} + +// For all of the following: +// +// - BenchmarkMountTableFoo tests usage pattern "Foo" for mountTable. +// +// - BenchmarkMountMapFoo tests usage pattern "Foo" for a +// sync.RWMutex-protected map. (Mutator benchmarks do not use a RWMutex, since +// mountTable also requires external synchronization between mutators.) +// +// - BenchmarkMountSyncMapFoo tests usage pattern "Foo" for a sync.Map. 
+// +// ParallelLookup is by far the most common and performance-sensitive operation +// for this application. NegativeLookup is also important, but less so (only +// relevant with multiple mount namespaces and significant differences in +// mounts between them). Insertion and removal are benchmarked for +// completeness. +const enableComparativeBenchmarks = false + +func newBenchMount() *Mount { + mount := &Mount{} + mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}}) + return mount +} + +func BenchmarkMountTableParallelLookup(b *testing.B) { + for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%dx%d", numG, numMounts) + b.Run(desc, func(b *testing.B) { + var mt mountTable + mt.Init() + keys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + mount := newBenchMount() + mt.Insert(mount) + keys = append(keys, mount.loadKey()) + } + + var ready sync.WaitGroup + begin := make(chan struct{}) + var end sync.WaitGroup + for g := 0; g < numG; g++ { + ready.Add(1) + end.Add(1) + go func() { + defer end.Done() + ready.Done() + <-begin + for i := 0; i < b.N; i++ { + k := keys[i&(numMounts-1)] + m := mt.Lookup(k.mount, k.dentry) + if m == nil { + b.Fatalf("lookup failed") + } + if parent := m.parent(); parent != k.mount { + b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) + } + if point := m.point(); point != k.dentry { + b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) + } + } + }() + } + + ready.Wait() + b.ResetTimer() + close(begin) + end.Wait() + }) + } + } +} + +func BenchmarkMountMapParallelLookup(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%dx%d", numG, numMounts) + b.Run(desc, 
func(b *testing.B) { + var mu sync.RWMutex + ms := make(map[VirtualDentry]*Mount) + keys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + mount := newBenchMount() + key := mount.loadKey() + ms[key] = mount + keys = append(keys, key) + } + + var ready sync.WaitGroup + begin := make(chan struct{}) + var end sync.WaitGroup + for g := 0; g < numG; g++ { + ready.Add(1) + end.Add(1) + go func() { + defer end.Done() + ready.Done() + <-begin + for i := 0; i < b.N; i++ { + k := keys[i&(numMounts-1)] + mu.RLock() + m := ms[k] + mu.RUnlock() + if m == nil { + b.Fatalf("lookup failed") + } + if parent := m.parent(); parent != k.mount { + b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) + } + if point := m.point(); point != k.dentry { + b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) + } + } + }() + } + + ready.Wait() + b.ResetTimer() + close(begin) + end.Wait() + }) + } + } +} + +func BenchmarkMountSyncMapParallelLookup(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%dx%d", numG, numMounts) + b.Run(desc, func(b *testing.B) { + var ms sync.Map + keys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + mount := newBenchMount() + key := mount.loadKey() + ms.Store(key, mount) + keys = append(keys, key) + } + + var ready sync.WaitGroup + begin := make(chan struct{}) + var end sync.WaitGroup + for g := 0; g < numG; g++ { + ready.Add(1) + end.Add(1) + go func() { + defer end.Done() + ready.Done() + <-begin + for i := 0; i < b.N; i++ { + k := keys[i&(numMounts-1)] + mi, ok := ms.Load(k) + if !ok { + b.Fatalf("lookup failed") + } + m := mi.(*Mount) + if parent := m.parent(); parent != k.mount { + b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) 
+ } + if point := m.point(); point != k.dentry { + b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) + } + } + }() + } + + ready.Wait() + b.ResetTimer() + close(begin) + end.Wait() + }) + } + } +} + +func BenchmarkMountTableNegativeLookup(b *testing.B) { + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%d", numMounts) + b.Run(desc, func(b *testing.B) { + var mt mountTable + mt.Init() + for i := 0; i < numMounts; i++ { + mt.Insert(newBenchMount()) + } + negkeys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + negkeys = append(negkeys, VirtualDentry{ + mount: &Mount{}, + dentry: &Dentry{}, + }) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + k := negkeys[i&(numMounts-1)] + m := mt.Lookup(k.mount, k.dentry) + if m != nil { + b.Fatalf("lookup got %p, wanted nil", m) + } + } + }) + } +} + +func BenchmarkMountMapNegativeLookup(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%d", numMounts) + b.Run(desc, func(b *testing.B) { + var mu sync.RWMutex + ms := make(map[VirtualDentry]*Mount) + for i := 0; i < numMounts; i++ { + mount := newBenchMount() + ms[mount.loadKey()] = mount + } + negkeys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + negkeys = append(negkeys, VirtualDentry{ + mount: &Mount{}, + dentry: &Dentry{}, + }) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + k := negkeys[i&(numMounts-1)] + mu.RLock() + m := ms[k] + mu.RUnlock() + if m != nil { + b.Fatalf("lookup got %p, wanted nil", m) + } + } + }) + } +} + +func BenchmarkMountSyncMapNegativeLookup(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%d", numMounts) + b.Run(desc, func(b *testing.B) { + var ms sync.Map + for i := 0; i < numMounts; i++ { + 
mount := newBenchMount() + ms.Store(mount.loadKey(), mount) + } + negkeys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + negkeys = append(negkeys, VirtualDentry{ + mount: &Mount{}, + dentry: &Dentry{}, + }) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + k := negkeys[i&(numMounts-1)] + m, _ := ms.Load(k) + if m != nil { + b.Fatalf("lookup got %p, wanted nil", m) + } + } + }) + } +} + +func BenchmarkMountTableInsert(b *testing.B) { + // Preallocate Mounts so that allocation time isn't included in the + // benchmark. + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + + var mt mountTable + mt.Init() + b.ResetTimer() + for i := range mounts { + mt.Insert(mounts[i]) + } +} + +func BenchmarkMountMapInsert(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + // Preallocate Mounts so that allocation time isn't included in the + // benchmark. + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + + ms := make(map[VirtualDentry]*Mount) + b.ResetTimer() + for i := range mounts { + mount := mounts[i] + ms[mount.loadKey()] = mount + } +} + +func BenchmarkMountSyncMapInsert(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + // Preallocate Mounts so that allocation time isn't included in the + // benchmark. 
+ mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + + var ms sync.Map + b.ResetTimer() + for i := range mounts { + mount := mounts[i] + ms.Store(mount.loadKey(), mount) + } +} + +func BenchmarkMountTableRemove(b *testing.B) { + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + var mt mountTable + mt.Init() + for i := range mounts { + mt.Insert(mounts[i]) + } + + b.ResetTimer() + for i := range mounts { + mt.Remove(mounts[i]) + } +} + +func BenchmarkMountMapRemove(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + ms := make(map[VirtualDentry]*Mount) + for i := range mounts { + mount := mounts[i] + ms[mount.loadKey()] = mount + } + + b.ResetTimer() + for i := range mounts { + mount := mounts[i] + delete(ms, mount.loadKey()) + } +} + +func BenchmarkMountSyncMapRemove(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + var ms sync.Map + for i := range mounts { + mount := mounts[i] + ms.Store(mount.loadKey(), mount) + } + + b.ResetTimer() + for i := range mounts { + mount := mounts[i] + ms.Delete(mount.loadKey()) + } +} diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go new file mode 100644 index 000000000..70f850ca4 --- /dev/null +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -0,0 +1,364 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build go1.12 +// +build !go1.16 + +// Check go:linkname function signatures when updating Go version. + +package vfs + +import ( + "fmt" + "math/bits" + "reflect" + "sync/atomic" + "unsafe" + + "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/sync" +) + +// mountKey represents the location at which a Mount is mounted. It is +// structurally identical to VirtualDentry, but stores its fields as +// unsafe.Pointer since mutators synchronize with VFS path traversal using +// seqcounts. +type mountKey struct { + parent unsafe.Pointer // *Mount + point unsafe.Pointer // *Dentry +} + +func (mnt *Mount) parent() *Mount { + return (*Mount)(atomic.LoadPointer(&mnt.key.parent)) +} + +func (mnt *Mount) point() *Dentry { + return (*Dentry)(atomic.LoadPointer(&mnt.key.point)) +} + +func (mnt *Mount) loadKey() VirtualDentry { + return VirtualDentry{ + mount: mnt.parent(), + dentry: mnt.point(), + } +} + +// Invariant: mnt.key.parent == nil. vd.Ok(). +func (mnt *Mount) storeKey(vd VirtualDentry) { + atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount)) + atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry)) +} + +// mountTable maps (mount parent, mount point) pairs to mounts. It supports +// efficient concurrent lookup, even in the presence of concurrent mutators +// (provided mutation is sufficiently uncommon). +// +// mountTable.Init() must be called on new mountTables before use. 
+// +// +stateify savable +type mountTable struct { + // mountTable is implemented as a seqcount-protected hash table that + // resolves collisions with linear probing, featuring Robin Hood insertion + // and backward shift deletion. These minimize probe length variance, + // significantly improving the performance of linear probing at high load + // factors. (mountTable doesn't use bucketing, which is the other major + // technique commonly used in high-performance hash tables; the efficiency + // of bucketing is largely due to SIMD lookup, and Go lacks both SIMD + // intrinsics and inline assembly, limiting the performance of this + // approach.) + + seq sync.SeqCount `state:"nosave"` + seed uint32 // for hashing keys + + // size holds both length (number of elements) and capacity (number of + // slots): capacity is stored as its base-2 log (referred to as order) in + // the least significant bits of size, and length is stored in the + // remaining bits. Go defines bit shifts >= width of shifted unsigned + // operand as shifting to 0, which differs from x86's SHL, so the Go + // compiler inserts a bounds check for each bit shift unless we mask order + // anyway (cf. runtime.bucketShift()), and length isn't used by lookup; + // thus this bit packing gets us more bits for the length (vs. storing + // length and cap in separate uint32s) for ~free. + size uint64 + + slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init +} + +type mountSlot struct { + // We don't store keys in slots; instead, we just check Mount.parent and + // Mount.point directly. Any practical use of lookup will need to touch + // Mounts anyway, and comparing hashes means that false positives are + // extremely rare, so this isn't an extra cache line touch overall. 
+ value unsafe.Pointer // *Mount + hash uintptr +} + +const ( + mtSizeOrderBits = 6 // log2 of pointer size in bits + mtSizeOrderMask = (1 << mtSizeOrderBits) - 1 + mtSizeOrderOne = 1 + mtSizeLenLSB = mtSizeOrderBits + mtSizeLenOne = 1 << mtSizeLenLSB + mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB + + mountSlotBytes = unsafe.Sizeof(mountSlot{}) + mountKeyBytes = unsafe.Sizeof(mountKey{}) + + // Tuning parameters. + // + // Essentially every mountTable will contain at least /proc, /sys, and + // /dev/shm, so there is ~no reason for mtInitCap to be < 4. + mtInitOrder = 2 + mtInitCap = 1 << mtInitOrder + mtMaxLoadNum = 13 + mtMaxLoadDen = 16 +) + +func init() { + // We can't just define mtSizeOrderBits as follows because Go doesn't have + // constexpr. + if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) { + panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits)) + } + if bits.OnesCount(uint(mountSlotBytes)) != 1 { + panic(fmt.Sprintf("sizeof(mountSlotBytes) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes)) + } + if mtInitCap <= 1 { + panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap)) + } + if mtMaxLoadNum >= mtMaxLoadDen { + panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen)) + } +} + +// Init must be called exactly once on each mountTable before use. +func (mt *mountTable) Init() { + mt.seed = rand32() + mt.size = mtInitOrder + mt.slots = newMountTableSlots(mtInitCap) +} + +func newMountTableSlots(cap uintptr) unsafe.Pointer { + slice := make([]mountSlot, cap, cap) + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) + return unsafe.Pointer(hdr.Data) +} + +// Lookup returns the Mount with the given parent, mounted at the given point. 
+// If no such Mount exists, Lookup returns nil. +// +// Lookup may be called even if there are concurrent mutators of mt. +func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount { + key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)} + hash := memhash(gohacks.Noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes) + +loop: + for { + epoch := mt.seq.BeginRead() + size := atomic.LoadUint64(&mt.size) + slots := atomic.LoadPointer(&mt.slots) + if !mt.seq.ReadOk(epoch) { + continue + } + tcap := uintptr(1) << (size & mtSizeOrderMask) + mask := tcap - 1 + off := (hash & mask) * mountSlotBytes + offmask := mask * mountSlotBytes + for { + // This avoids bounds checking. + slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) + slotValue := atomic.LoadPointer(&slot.value) + slotHash := atomic.LoadUintptr(&slot.hash) + if !mt.seq.ReadOk(epoch) { + // The element we're looking for might have been moved into a + // slot we've previously checked, so restart entirely. + continue loop + } + if slotValue == nil { + return nil + } + if slotHash == hash { + mount := (*Mount)(slotValue) + var mountKey mountKey + mountKey.parent = atomic.LoadPointer(&mount.key.parent) + mountKey.point = atomic.LoadPointer(&mount.key.point) + if !mt.seq.ReadOk(epoch) { + continue loop + } + if key == mountKey { + return mount + } + } + off = (off + mountSlotBytes) & offmask + } + } +} + +// Insert inserts the given mount into mt. +// +// Preconditions: mt must not already contain a Mount with the same mount point +// and parent. +func (mt *mountTable) Insert(mount *Mount) { + mt.seq.BeginWrite() + mt.insertSeqed(mount) + mt.seq.EndWrite() +} + +// insertSeqed inserts the given mount into mt. +// +// Preconditions: mt.seq must be in a writer critical section. mt must not +// already contain a Mount with the same mount point and parent. 
func (mt *mountTable) insertSeqed(mount *Mount) {
	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)

	// We're under the maximum load factor if:
	//
	//          (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen
	// (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap
	//
	// mt.size packs both values: the length lives in the bits at and above
	// mtSizeLenLSB, and the log2 of the capacity ("order") in the bits
	// selected by mtSizeOrderMask.
	tlen := mt.size >> mtSizeLenLSB
	order := mt.size & mtSizeOrderMask
	tcap := uintptr(1) << order
	if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) {
		// Atomically insert the new element into the table.
		atomic.AddUint64(&mt.size, mtSizeLenOne)
		mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash)
		return
	}

	// Otherwise, we have to expand. Double the number of slots in the new
	// table.
	newOrder := order + 1
	if newOrder > mtSizeOrderMask {
		panic("mount table size overflow")
	}
	newCap := uintptr(1) << newOrder
	newSlots := newMountTableSlots(newCap)
	// Copy existing elements to the new table.
	oldCur := mt.slots
	// Go does not permit pointers to the end of allocated objects, so we
	// must use a pointer to the last element of the old table. The
	// following expression is equivalent to
	// `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2
	// arithmetic instructions instead of 3.
	oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes))
	for {
		oldSlot := (*mountSlot)(oldCur)
		if oldSlot.value != nil {
			// Reuse the stored hash so that keys are not rehashed.
			mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash)
		}
		if oldCur == oldLast {
			break
		}
		oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes)
	}
	// Insert the new element into the new table.
	mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash)
	// Switch to the new table. A single add bumps both the length and the
	// order fields of mt.size.
	atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne)
	atomic.StorePointer(&mt.slots, newSlots)
}

// mtInsertLocked stores value, whose key hashes to hash, into the
// open-addressed table of cap slots starting at slots. It probes linearly
// from the slot selected by hash, wrapping around at the end of the table.
//
// Preconditions: There are no concurrent mutators of the table (slots, cap).
// If the table is visible to readers, then mt.seq must be in a writer critical
// section. cap must be a power of 2.
func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) {
	mask := cap - 1
	// off and offmask are byte offsets rather than slot indices, so that
	// advancing to the next slot is a single add-and-mask.
	off := (hash & mask) * mountSlotBytes
	offmask := mask * mountSlotBytes
	// disp counts how many slots the element being inserted has moved from
	// its first-probed slot.
	disp := uintptr(0)
	for {
		slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
		slotValue := slot.value
		if slotValue == nil {
			atomic.StorePointer(&slot.value, value)
			atomic.StoreUintptr(&slot.hash, hash)
			return
		}
		// If we've been displaced farther from our first-probed slot than the
		// element stored in this one, swap elements and switch to inserting
		// the replaced one. (This is Robin Hood insertion.)
		slotHash := slot.hash
		slotDisp := ((off / mountSlotBytes) - slotHash) & mask
		if disp > slotDisp {
			atomic.StorePointer(&slot.value, value)
			atomic.StoreUintptr(&slot.hash, hash)
			value = slotValue
			hash = slotHash
			disp = slotDisp
		}
		off = (off + mountSlotBytes) & offmask
		disp++
	}
}

// Remove removes the given mount from mt.
//
// Preconditions: mt must contain mount.
func (mt *mountTable) Remove(mount *Mount) {
	mt.seq.BeginWrite()
	mt.removeSeqed(mount)
	mt.seq.EndWrite()
}

// removeSeqed removes the given mount from mt.
//
// Preconditions: mt.seq must be in a writer critical section. mt must contain
// mount.
func (mt *mountTable) removeSeqed(mount *Mount) {
	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
	tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
	mask := tcap - 1
	slots := mt.slots
	off := (hash & mask) * mountSlotBytes
	offmask := mask * mountSlotBytes
	for {
		slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
		slotValue := slot.value
		if slotValue == unsafe.Pointer(mount) {
			// Found the element to remove. Move all subsequent elements
			// backward until we either find an empty slot, or an element that
			// is already in its first-probed slot. (This is backward shift
			// deletion.)
			for {
				nextOff := (off + mountSlotBytes) & offmask
				nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff))
				nextSlotValue := nextSlot.value
				if nextSlotValue == nil {
					break
				}
				nextSlotHash := nextSlot.hash
				if (nextOff / mountSlotBytes) == (nextSlotHash & mask) {
					break
				}
				atomic.StorePointer(&slot.value, nextSlotValue)
				atomic.StoreUintptr(&slot.hash, nextSlotHash)
				off = nextOff
				slot = nextSlot
			}
			// slot is now the last slot of the shifted run; empty it.
			atomic.StorePointer(&slot.value, nil)
			atomic.AddUint64(&mt.size, mtSizeLenNegOne)
			return
		}
		// Without checkInvariants, a missing mount is not detected here and
		// probing simply continues.
		if checkInvariants && slotValue == nil {
			panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount))
		}
		off = (off + mountSlotBytes) & offmask
	}
}

// memhash is linked to the Go runtime's memhash; it is used above to hash
// the mountKeyBytes bytes of a mount key with the table's seed.
//
//go:linkname memhash runtime.memhash
func memhash(p unsafe.Pointer, seed, s uintptr) uintptr

// rand32 is linked to the Go runtime's fastrand.
//
//go:linkname rand32 runtime.fastrand
func rand32() uint32

diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
new file mode 100644
index 000000000..f223aeda8
--- /dev/null
+++ b/pkg/sentry/vfs/options.go
@@ -0,0 +1,235 @@
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
)

// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and
// FilesystemImpl.GetDentryAt().
type GetDentryOptions struct {
	// If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that
	// the returned Dentry is a directory for which creds has search
	// permission.
	CheckSearchable bool
}

// MkdirOptions contains options to VirtualFilesystem.MkdirAt() and
// FilesystemImpl.MkdirAt().
type MkdirOptions struct {
	// Mode is the file mode bits for the created directory.
	Mode linux.FileMode

	// If ForSyntheticMountpoint is true, FilesystemImpl.MkdirAt() may create
	// the given directory in memory only (as opposed to persistent storage).
	// The created directory should be able to support the creation of
	// subdirectories with ForSyntheticMountpoint == true. It does not need to
	// support the creation of subdirectories with ForSyntheticMountpoint ==
	// false, or files of other types.
	//
	// FilesystemImpls are permitted to ignore the ForSyntheticMountpoint
	// option.
	//
	// The ForSyntheticMountpoint option exists because, unlike mount(2), the
	// OCI Runtime Specification permits the specification of mount points that
	// do not exist, under the expectation that container runtimes will create
	// them. (More accurately, the OCI Runtime Specification completely fails
	// to document this feature, but it's implemented by runc.)
	// ForSyntheticMountpoint allows such mount points to be created even when
	// the underlying persistent filesystem is immutable.
	ForSyntheticMountpoint bool
}

// MknodOptions contains options to VirtualFilesystem.MknodAt() and
// FilesystemImpl.MknodAt().
type MknodOptions struct {
	// Mode is the file type and mode bits for the created file.
	Mode linux.FileMode

	// If Mode specifies a character or block device special file, DevMajor and
	// DevMinor are the major and minor device numbers for the created device.
	DevMajor uint32
	DevMinor uint32

	// Endpoint is the endpoint to bind to the created file, if a socket file is
	// being created for bind(2) on a Unix domain socket.
	Endpoint transport.BoundEndpoint
}

// MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC.
// MS_RDONLY is not part of MountFlags because it's tracked in Mount.writers.
type MountFlags struct {
	// NoExec is equivalent to MS_NOEXEC.
	NoExec bool

	// NoATime is equivalent to MS_NOATIME and indicates that the
	// filesystem should not update access time in-place.
	NoATime bool
}

// MountOptions contains options to VirtualFilesystem.MountAt().
type MountOptions struct {
	// Flags contains flags as specified for mount(2), e.g. MS_NOEXEC.
	Flags MountFlags

	// ReadOnly is equivalent to MS_RDONLY.
	ReadOnly bool

	// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
	GetFilesystemOptions GetFilesystemOptions

	// If InternalMount is true, allow the use of filesystem types for which
	// RegisterFilesystemTypeOptions.AllowUserMount == false.
	InternalMount bool
}

// OpenOptions contains options to VirtualFilesystem.OpenAt() and
// FilesystemImpl.OpenAt().
type OpenOptions struct {
	// Flags contains access mode and flags as specified for open(2).
	//
	// FilesystemImpls are responsible for implementing the following flags:
	// O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC,
	// O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and
	// O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and
	// O_NOFOLLOW. VFS users are responsible for handling O_CLOEXEC, since file
	// descriptors are mostly outside the scope of VFS.
	Flags uint32

	// If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the
	// created file.
	Mode linux.FileMode

	// FileExec is set when the file is being opened to be executed.
	// VirtualFilesystem.OpenAt() checks that the caller has execute permissions
	// on the file, that the file is a regular file, and that the mount doesn't
	// have MS_NOEXEC set.
	FileExec bool
}

// ReadOptions contains options to FileDescription.PRead(),
// FileDescriptionImpl.PRead(), FileDescription.Read(), and
// FileDescriptionImpl.Read().
type ReadOptions struct {
	// Flags contains flags as specified for preadv2(2).
	Flags uint32
}

// RenameOptions contains options to VirtualFilesystem.RenameAt() and
// FilesystemImpl.RenameAt().
type RenameOptions struct {
	// Flags contains flags as specified for renameat2(2).
	Flags uint32

	// If MustBeDir is true, the renamed file must be a directory.
	MustBeDir bool
}

// SetStatOptions contains options to VirtualFilesystem.SetStatAt(),
// FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and
// FileDescriptionImpl.SetStat().
type SetStatOptions struct {
	// Stat is the metadata that should be set. Only fields indicated by
	// Stat.Mask should be set.
	//
	// If Stat specifies that a timestamp should be set,
	// FilesystemImpl.SetStatAt() and FileDescriptionImpl.SetStat() must
	// special-case StatxTimestamp.Nsec == UTIME_NOW as described by
	// utimensat(2); however, they do not need to check for StatxTimestamp.Nsec
	// == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask
	// instead).
	Stat linux.Statx
}

// BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt()
// and FilesystemImpl.BoundEndpointAt().
type BoundEndpointOptions struct {
	// Addr is the path of the file whose socket endpoint is being retrieved.
	// It is generally irrelevant: most endpoints are stored at a dentry that
	// was created through a bind syscall, so the path can be stored on creation.
	// However, if the endpoint was created in FilesystemImpl.BoundEndpointAt(),
	// then we may not know what the original bind address was.
	//
	// For example, if connect(2) is called with address "foo" which corresponds
	// to a remote named socket in goferfs, we need to generate an endpoint
	// wrapping that file. In this case, we can use Addr to set the endpoint
	// address to "foo". Note that Addr is only a best-effort attempt--we still
	// do not know the exact address that was used on the remote fs to bind the
	// socket (it may have been "foo", "./foo", etc.).
	Addr string
}

// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(),
// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and
// FileDescriptionImpl.Getxattr().
type GetxattrOptions struct {
	// Name is the name of the extended attribute to retrieve.
	Name string

	// Size is the maximum value size that the caller will tolerate. If the value
	// is larger than size, getxattr methods may return ERANGE, but they are also
	// free to ignore the hint entirely (i.e. the value returned may be larger
	// than size). All size checking is done independently at the syscall layer.
	Size uint64
}

// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
// FileDescriptionImpl.Setxattr().
type SetxattrOptions struct {
	// Name is the name of the extended attribute being mutated.
	Name string

	// Value is the extended attribute's new value.
	Value string

	// Flags contains flags as specified for setxattr/lsetxattr/fsetxattr(2).
	Flags uint32
}

// StatOptions contains options to VirtualFilesystem.StatAt(),
// FilesystemImpl.StatAt(), FileDescription.Stat(), and
// FileDescriptionImpl.Stat().
type StatOptions struct {
	// Mask is the set of fields in the returned Statx that the FilesystemImpl
	// or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask.
	//
	// The FilesystemImpl or FileDescriptionImpl may return fields not
	// requested in Mask, and may fail to return fields requested in Mask that
	// are not supported by the underlying filesystem implementation, without
	// returning an error.
	Mask uint32

	// Sync specifies the synchronization required, and is one of
	// linux.AT_STATX_SYNC_AS_STAT (which is 0, and therefore the default),
	// linux.AT_STATX_SYNC_FORCE_SYNC, or linux.AT_STATX_SYNC_DONT_SYNC.
	Sync uint32
}

// UmountOptions contains options to VirtualFilesystem.UmountAt().
type UmountOptions struct {
	// Flags contains flags as specified for umount2(2).
	Flags uint32
}

// WriteOptions contains options to FileDescription.PWrite(),
// FileDescriptionImpl.PWrite(), FileDescription.Write(), and
// FileDescriptionImpl.Write().
type WriteOptions struct {
	// Flags contains flags as specified for pwritev2(2).
	Flags uint32
}

diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
new file mode 100644
index 000000000..cd78d66bc
--- /dev/null
+++ b/pkg/sentry/vfs/pathname.go
@@ -0,0 +1,195 @@
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
)

// fspathBuilderPool recycles the fspath.Builder objects used by the
// Pathname*() methods below.
var fspathBuilderPool = sync.Pool{
	New: func() interface{} {
		return &fspath.Builder{}
	},
}

// getFSPathBuilder returns a Builder from fspathBuilderPool. Builders are
// zeroed by putFSPathBuilder before they re-enter the pool.
func getFSPathBuilder() *fspath.Builder {
	return fspathBuilderPool.Get().(*fspath.Builder)
}

// putFSPathBuilder zeroes b and returns it to fspathBuilderPool.
func putFSPathBuilder(b *fspath.Builder) {
	// No methods can be called on b after b.String(), so reset it to its zero
	// value (as returned by fspathBuilderPool.New) instead.
	*b = fspath.Builder{}
	fspathBuilderPool.Put(b)
}

// PathnameWithDeleted returns an absolute pathname to vd, consistent with
// Linux's d_path(). In particular, if vd.Dentry() has been disowned,
// PathnameWithDeleted appends " (deleted)" to the returned pathname.
func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
	b := getFSPathBuilder()
	defer putFSPathBuilder(b)
	// haveRef is true once vd has been replaced by a VirtualDentry obtained
	// from getMountpointAt(); only those are released with DecRef(), never the
	// caller's original vd. The deferred closure reads vd and haveRef at
	// return time, so it releases whichever vd is current.
	haveRef := false
	defer func() {
		if haveRef {
			vd.DecRef()
		}
	}()

	// Remember the starting dentry: the " (deleted)" suffix depends on it,
	// not on whichever mount point the loop ends at.
	origD := vd.dentry
loop:
	for {
		err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
		switch err.(type) {
		case nil:
			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
				// genericfstree.PrependPath() will have returned
				// PrependPathAtVFSRootError in this case since it checks
				// against vfsroot before mnt.root, but other implementations
				// of FilesystemImpl.PrependPath() may return nil instead.
				break loop
			}
			nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
			if !nextVD.Ok() {
				break loop
			}
			if haveRef {
				vd.DecRef()
			}
			vd = nextVD
			haveRef = true
			// continue loop
		case PrependPathSyntheticError:
			// Skip prepending "/" and appending " (deleted)".
			return b.String(), nil
		case PrependPathAtVFSRootError, PrependPathAtNonMountRootError:
			break loop
		default:
			return "", err
		}
	}
	b.PrependByte('/')
	if origD.IsDead() {
		b.AppendString(" (deleted)")
	}
	return b.String(), nil
}

// PathnameReachable returns an absolute pathname to vd, consistent with
// Linux's __d_path() (as used by seq_path_root()). If vfsroot.Ok() and vd is
// not reachable from vfsroot, such that seq_path_root() would return SEQ_SKIP
// (causing the entire containing entry to be skipped), PathnameReachable
// returns ("", nil).
func (vfs *VirtualFilesystem) PathnameReachable(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
	b := getFSPathBuilder()
	defer putFSPathBuilder(b)
	// As in PathnameWithDeleted: haveRef records that the current vd came
	// from getMountpointAt() and must be released with DecRef().
	haveRef := false
	defer func() {
		if haveRef {
			vd.DecRef()
		}
	}()
loop:
	for {
		err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
		switch err.(type) {
		case nil:
			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
				break loop
			}
			nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
			if !nextVD.Ok() {
				// No mount point to continue from: report vd as unreachable.
				return "", nil
			}
			if haveRef {
				vd.DecRef()
			}
			vd = nextVD
			haveRef = true
		case PrependPathAtVFSRootError:
			break loop
		case PrependPathAtNonMountRootError, PrependPathSyntheticError:
			return "", nil
		default:
			return "", err
		}
	}
	b.PrependByte('/')
	return b.String(), nil
}

// PathnameForGetcwd returns an absolute pathname to vd, consistent with
// Linux's sys_getcwd().
func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
	if vd.dentry.IsDead() {
		return "", syserror.ENOENT
	}

	b := getFSPathBuilder()
	defer putFSPathBuilder(b)
	// haveRef is true once vd has been replaced by a VirtualDentry obtained
	// from getMountpointAt(); only those are released with DecRef().
	haveRef := false
	defer func() {
		if haveRef {
			vd.DecRef()
		}
	}()
	// unreachable records that the walk stopped without arriving at vfsroot;
	// the result is then prefixed with "(unreachable)".
	unreachable := false
loop:
	for {
		err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
		switch err.(type) {
		case nil:
			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
				break loop
			}
			nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
			if !nextVD.Ok() {
				unreachable = true
				break loop
			}
			if haveRef {
				vd.DecRef()
			}
			vd = nextVD
			haveRef = true
		case PrependPathAtVFSRootError:
			break loop
		case PrependPathAtNonMountRootError, PrependPathSyntheticError:
			unreachable = true
			break loop
		default:
			return "", err
		}
	}
	b.PrependByte('/')
	if unreachable {
		b.PrependString("(unreachable)")
	}
	return b.String(), nil
}

// As of this writing, we do not have equivalents to:
//
// - d_absolute_path(), which returns EINVAL if (effectively) any call to
// FilesystemImpl.PrependPath() would return PrependPathAtNonMountRootError.
//
// - dentry_path(), which does not walk up mounts (and only returns the path
// relative to Filesystem root), but also appends "//deleted" for disowned
// Dentries.
//
// These should be added as necessary.

diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
new file mode 100644
index 000000000..9cb050597
--- /dev/null
+++ b/pkg/sentry/vfs/permissions.go
@@ -0,0 +1,280 @@
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"math"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/syserror"
)

// AccessTypes is a bitmask of Unix file permissions.
type AccessTypes uint16

// Bits in AccessTypes. The values line up with one rwx triplet of a file
// mode (x=1, w=2, r=4), which GenericCheckPermissions relies on when it
// masks the shifted permission bits.
const (
	MayExec  AccessTypes = 1
	MayWrite AccessTypes = 2
	MayRead  AccessTypes = 4
)

// OnlyRead returns true if access _only_ allows read.
func (a AccessTypes) OnlyRead() bool {
	return a == MayRead
}

// MayRead returns true if access allows read.
func (a AccessTypes) MayRead() bool {
	return a&MayRead != 0
}

// MayWrite returns true if access allows write.
func (a AccessTypes) MayWrite() bool {
	return a&MayWrite != 0
}

// MayExec returns true if access allows exec.
func (a AccessTypes) MayExec() bool {
	return a&MayExec != 0
}

// GenericCheckPermissions checks that creds has the given access rights on a
// file with the given permissions, UID, and GID, subject to the rules of
// fs/namei.c:generic_permission().
//
// It returns nil if access is granted and syserror.EACCES otherwise.
func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
	// Check permission bits. Shift the owner or group triplet into the low
	// three bits; if neither applies, the low bits are already the "other"
	// triplet.
	perms := uint16(mode.Permissions())
	if creds.EffectiveKUID == kuid {
		perms >>= 6
	} else if creds.InGroup(kgid) {
		perms >>= 3
	}
	if uint16(ats)&perms == uint16(ats) {
		// All permission bits match, access granted.
		return nil
	}

	// Caller capabilities require that the file's KUID and KGID are mapped in
	// the caller's user namespace; compare
	// kernel/capability.c:privileged_wrt_inode_uidgid().
	if !kuid.In(creds.UserNamespace).Ok() || !kgid.In(creds.UserNamespace).Ok() {
		return syserror.EACCES
	}
	// CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary
	// directories, and read arbitrary non-directory files.
	if (mode.IsDir() && !ats.MayWrite()) || ats.OnlyRead() {
		if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) {
			return nil
		}
	}
	// CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write
	// access to non-directory files, and execute access to non-directory files
	// for which at least one execute bit is set.
	if mode.IsDir() || !ats.MayExec() || (mode.Permissions()&0111 != 0) {
		if creds.HasCapability(linux.CAP_DAC_OVERRIDE) {
			return nil
		}
	}
	return syserror.EACCES
}

// MayLink determines whether creating a hard link to a file with the given
// mode, kuid, and kgid is permitted.
//
// This corresponds to Linux's fs/namei.c:may_linkat.
func MayLink(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
	// Source inode owner can hardlink all they like; otherwise, it must be a
	// safe source.
	if CanActAsOwner(creds, kuid) {
		return nil
	}

	// Only regular files can be hard linked.
	if mode.FileType() != linux.S_IFREG {
		return syserror.EPERM
	}

	// Setuid files should not get pinned to the filesystem.
	if mode&linux.S_ISUID != 0 {
		return syserror.EPERM
	}

	// Executable setgid files should not get pinned to the filesystem, but we
	// don't support S_IXGRP anyway.

	// Hardlinking to unreadable or unwritable sources is dangerous. Any
	// failure of the permission check is reported as EPERM.
	if err := GenericCheckPermissions(creds, MayRead|MayWrite, mode, kuid, kgid); err != nil {
		return syserror.EPERM
	}
	return nil
}

// AccessTypesForOpenFlags returns the access types required to open a file
// with the given OpenOptions.Flags. Note that this is NOT the same thing as
// the set of accesses permitted for the opened file:
//
// - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it
// mutates the file), but does not permit writing to the open file description
// thereafter.
//
// - "Linux reserves the special, nonstandard access mode 3 (binary 11) in
// flags to mean: check for read and write permission on the file and return a
// file descriptor that can't be used for reading or writing." - open(2). Thus
// AccessTypesForOpenFlags returns MayRead|MayWrite in this case.
//
// Use May{Read,Write}FileWithOpenFlags() for these checks instead.
func AccessTypesForOpenFlags(opts *OpenOptions) AccessTypes {
	ats := AccessTypes(0)
	if opts.FileExec {
		ats |= MayExec
	}

	switch opts.Flags & linux.O_ACCMODE {
	case linux.O_RDONLY:
		if opts.Flags&linux.O_TRUNC != 0 {
			return ats | MayRead | MayWrite
		}
		return ats | MayRead
	case linux.O_WRONLY:
		return ats | MayWrite
	default:
		// O_RDWR and the nonstandard access mode 3 both land here.
		return ats | MayRead | MayWrite
	}
}

// MayReadFileWithOpenFlags returns true if a file with the given open flags
// should be readable.
func MayReadFileWithOpenFlags(flags uint32) bool {
	switch flags & linux.O_ACCMODE {
	case linux.O_RDONLY, linux.O_RDWR:
		return true
	default:
		return false
	}
}

// MayWriteFileWithOpenFlags returns true if a file with the given open flags
// should be writable.
func MayWriteFileWithOpenFlags(flags uint32) bool {
	switch flags & linux.O_ACCMODE {
	case linux.O_WRONLY, linux.O_RDWR:
		return true
	default:
		return false
	}
}

// CheckSetStat checks that creds has permission to change the metadata of a
// file with the given permissions, UID, and GID as specified by stat, subject
// to the rules of Linux's fs/attr.c:setattr_prepare().
//
// Only the fields selected by stat.Mask are checked. It returns nil if every
// requested change is permitted.
func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
	if stat.Mask&linux.STATX_SIZE != 0 {
		// Treat the new size as a write of stat.Size bytes at offset 0; if
		// CheckLimit allows less than that, the size exceeds the limit.
		limit, err := CheckLimit(ctx, 0, int64(stat.Size))
		if err != nil {
			return err
		}
		if limit < int64(stat.Size) {
			return syserror.ErrExceedsFileSizeLimit
		}
	}
	if stat.Mask&linux.STATX_MODE != 0 {
		if !CanActAsOwner(creds, kuid) {
			return syserror.EPERM
		}
		// TODO(b/30815691): "If the calling process is not privileged (Linux:
		// does not have the CAP_FSETID capability), and the group of the file
		// does not match the effective group ID of the process or one of its
		// supplementary group IDs, the S_ISGID bit will be turned off, but
		// this will not cause an error to be returned." - chmod(2)
	}
	if stat.Mask&linux.STATX_UID != 0 {
		// Without CAP_CHOWN, the owner may only "change" the UID to its
		// current value.
		if !((creds.EffectiveKUID == kuid && auth.KUID(stat.UID) == kuid) ||
			HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) {
			return syserror.EPERM
		}
	}
	if stat.Mask&linux.STATX_GID != 0 {
		// Without CAP_CHOWN, the owner may only set a GID that creds is a
		// member of.
		if !((creds.EffectiveKUID == kuid && creds.InGroup(auth.KGID(stat.GID))) ||
			HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) {
			return syserror.EPERM
		}
	}
	if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 {
		if !CanActAsOwner(creds, kuid) {
			// Callers that cannot act as owner may only set timestamps to
			// the current time (UTIME_NOW), and only with write permission.
			if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) ||
				(stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW) ||
				(stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) {
				return syserror.EPERM
			}
			if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil {
				return err
			}
		}
	}
	return nil
}

// CheckDeleteSticky checks whether the sticky bit is set on a directory with
// the given file mode, and if so, checks whether creds has permission to
// remove a file owned by childKUID from a directory with the given mode.
// CheckDeleteSticky is consistent with include/linux/fs.h:check_sticky().
func CheckDeleteSticky(creds *auth.Credentials, parentMode linux.FileMode, childKUID auth.KUID) error {
	if parentMode&linux.ModeSticky == 0 {
		return nil
	}
	if CanActAsOwner(creds, childKUID) {
		return nil
	}
	return syserror.EPERM
}

// CanActAsOwner returns true if creds can act as the owner of a file with the
// given owning UID, consistent with Linux's
// fs/inode.c:inode_owner_or_capable().
func CanActAsOwner(creds *auth.Credentials, kuid auth.KUID) bool {
	if creds.EffectiveKUID == kuid {
		return true
	}
	// Otherwise CAP_FOWNER is required, and kuid must be mapped in the
	// caller's user namespace.
	return creds.HasCapability(linux.CAP_FOWNER) && creds.UserNamespace.MapFromKUID(kuid).Ok()
}

// HasCapabilityOnFile returns true if creds has the given capability with
// respect to a file with the given owning UID and GID, consistent with Linux's
// kernel/capability.c:capable_wrt_inode_uidgid().
func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth.KUID, kgid auth.KGID) bool {
	return creds.HasCapability(cp) && creds.UserNamespace.MapFromKUID(kuid).Ok() && creds.UserNamespace.MapFromKGID(kgid).Ok()
}

// CheckLimit enforces file size rlimits. It returns an error if the write
// operation must not proceed. Otherwise it returns the maximum length that
// may be written at offset without violating the limit, which is at most
// size.
func CheckLimit(ctx context.Context, offset, size int64) (int64, error) {
	fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur
	// A limit that does not fit in an int64 cannot be reached by any
	// offset+size, so the whole write is allowed.
	if fileSizeLimit > math.MaxInt64 {
		return size, nil
	}
	if offset >= int64(fileSizeLimit) {
		return 0, syserror.ErrExceedsFileSizeLimit
	}
	remaining := int64(fileSizeLimit) - offset
	if remaining < size {
		return remaining, nil
	}
	return size, nil
}

diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
new file mode 100644
index 000000000..9d047ff88
--- /dev/null
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -0,0 +1,466 @@
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
)

// ResolvingPath represents the state of an in-progress path resolution, shared
// between VFS and FilesystemImpl methods that take a path.
//
// From the perspective of FilesystemImpl methods, a ResolvingPath represents a
// starting Dentry on the associated Filesystem (on which a reference is
// already held), a stream of path components relative to that Dentry, and
// elements of the invoking Context that are commonly required by
// FilesystemImpl methods.
//
// ResolvingPath is loosely analogous to Linux's struct nameidata.
type ResolvingPath struct {
	vfs   *VirtualFilesystem
	root  VirtualDentry // refs borrowed from PathOperation
	mount *Mount
	start *Dentry
	pit   fspath.Iterator

	flags         uint16 // bitmask of rpflags* constants
	mustBeDir     bool   // final file must be a directory?
	mustBeDirOrig bool
	symlinks      uint8 // number of symlinks traversed
	symlinksOrig  uint8
	curPart       uint8 // index into parts
	numOrigParts  uint8

	creds *auth.Credentials

	// Data associated with resolve*Errors, stored in ResolvingPath so that
	// those errors don't need to allocate.
	nextMount        *Mount  // ref held if not nil
	nextStart        *Dentry // ref held if not nil
	absSymlinkTarget fspath.Path

	// ResolvingPath must track up to two relative paths: the "current"
	// relative path, which is updated whenever a relative symlink is
	// encountered, and the "original" relative path, which is updated from the
	// current relative path by handleError() when resolution must change
	// filesystems (due to reaching a mount boundary or absolute symlink) and
	// overwrites the current relative path when Restart() is called.
	parts     [1 + linux.MaxSymlinkTraversals]fspath.Iterator
	origParts [1 + linux.MaxSymlinkTraversals]fspath.Iterator
}

// Bits in ResolvingPath.flags.
const (
	rpflagsHaveMountRef       = 1 << iota // do we hold a reference on mount?
	rpflagsHaveStartRef                   // do we hold a reference on start?
	rpflagsFollowFinalSymlink             // same as PathOperation.FollowFinalSymlink
)

// init verifies that indices into ResolvingPath.parts fit in the uint8
// fields curPart and numOrigParts.
func init() {
	if maxParts := len(ResolvingPath{}.parts); maxParts > 255 {
		panic(fmt.Sprintf("uint8 is insufficient to accommodate len(ResolvingPath.parts) (%d)", maxParts))
	}
}

// Error types that communicate state from the FilesystemImpl-caller,
// VFS-callee side of path resolution (i.e. errors returned by
// ResolvingPath.Resolve*()) to the VFS-caller, FilesystemImpl-callee side
// (i.e. VFS methods => ResolvingPath.handleError()). These are empty structs
// rather than error values because Go doesn't support non-primitive constants,
// so error "constants" are really mutable vars, necessitating somewhat
// expensive interface object comparisons.

type resolveMountRootOrJumpError struct{}

// Error implements error.Error.
func (resolveMountRootOrJumpError) Error() string {
	return "resolving mount root or jump"
}

type resolveMountPointError struct{}

// Error implements error.Error.
func (resolveMountPointError) Error() string {
	return "resolving mount point"
}

type resolveAbsSymlinkError struct{}

// Error implements error.Error.
+func (resolveAbsSymlinkError) Error() string { + return "resolving absolute symlink" +} + +var resolvingPathPool = sync.Pool{ + New: func() interface{} { + return &ResolvingPath{} + }, +} + +func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) *ResolvingPath { + rp := resolvingPathPool.Get().(*ResolvingPath) + rp.vfs = vfs + rp.root = pop.Root + rp.mount = pop.Start.mount + rp.start = pop.Start.dentry + rp.pit = pop.Path.Begin + rp.flags = 0 + if pop.FollowFinalSymlink { + rp.flags |= rpflagsFollowFinalSymlink + } + rp.mustBeDir = pop.Path.Dir + rp.mustBeDirOrig = pop.Path.Dir + rp.symlinks = 0 + rp.curPart = 0 + rp.numOrigParts = 1 + rp.creds = creds + rp.parts[0] = pop.Path.Begin + rp.origParts[0] = pop.Path.Begin + return rp +} + +func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) { + rp.root = VirtualDentry{} + rp.decRefStartAndMount() + rp.mount = nil + rp.start = nil + rp.releaseErrorState() + resolvingPathPool.Put(rp) +} + +func (rp *ResolvingPath) decRefStartAndMount() { + if rp.flags&rpflagsHaveStartRef != 0 { + rp.start.DecRef() + } + if rp.flags&rpflagsHaveMountRef != 0 { + rp.mount.DecRef() + } +} + +func (rp *ResolvingPath) releaseErrorState() { + if rp.nextStart != nil { + rp.nextStart.DecRef() + rp.nextStart = nil + } + if rp.nextMount != nil { + rp.nextMount.DecRef() + rp.nextMount = nil + } +} + +// VirtualFilesystem returns the containing VirtualFilesystem. +func (rp *ResolvingPath) VirtualFilesystem() *VirtualFilesystem { + return rp.vfs +} + +// Credentials returns the credentials of rp's provider. +func (rp *ResolvingPath) Credentials() *auth.Credentials { + return rp.creds +} + +// Mount returns the Mount on which path resolution is currently occurring. It +// does not take a reference on the returned Mount. +func (rp *ResolvingPath) Mount() *Mount { + return rp.mount +} + +// Start returns the starting Dentry represented by rp. It does not take a +// reference on the returned Dentry. 
+func (rp *ResolvingPath) Start() *Dentry { + return rp.start +} + +// Done returns true if there are no remaining path components in the stream +// represented by rp. +func (rp *ResolvingPath) Done() bool { + // We don't need to check for rp.curPart == 0 because rp.Advance() won't + // set rp.pit to a terminal iterator otherwise. + return !rp.pit.Ok() +} + +// Final returns true if there is exactly one remaining path component in the +// stream represented by rp. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) Final() bool { + return rp.curPart == 0 && !rp.pit.NextOk() +} + +// Component returns the current path component in the stream represented by +// rp. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) Component() string { + if checkInvariants { + if !rp.pit.Ok() { + panic("ResolvingPath.Component() called at end of relative path") + } + } + return rp.pit.String() +} + +// Advance advances the stream of path components represented by rp. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) Advance() { + if checkInvariants { + if !rp.pit.Ok() { + panic("ResolvingPath.Advance() called at end of relative path") + } + } + next := rp.pit.Next() + if next.Ok() || rp.curPart == 0 { // have next component, or at end of path + rp.pit = next + } else { // at end of path segment, continue with next one + rp.curPart-- + rp.pit = rp.parts[rp.curPart] + } +} + +// Restart resets the stream of path components represented by rp to its state +// on entry to the current FilesystemImpl method. 
+func (rp *ResolvingPath) Restart() { + rp.pit = rp.origParts[rp.numOrigParts-1] + rp.mustBeDir = rp.mustBeDirOrig + rp.symlinks = rp.symlinksOrig + rp.curPart = rp.numOrigParts - 1 + copy(rp.parts[:], rp.origParts[:rp.numOrigParts]) + rp.releaseErrorState() +} + +func (rp *ResolvingPath) relpathCommit() { + rp.mustBeDirOrig = rp.mustBeDir + rp.symlinksOrig = rp.symlinks + rp.numOrigParts = rp.curPart + 1 + copy(rp.origParts[:rp.curPart], rp.parts[:]) + rp.origParts[rp.curPart] = rp.pit +} + +// CheckRoot is called before resolving the parent of the Dentry d. If the +// Dentry is contextually a VFS root, such that path resolution should treat +// d's parent as itself, CheckRoot returns (true, nil). If the Dentry is the +// root of a non-root mount, such that path resolution should switch to another +// Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path +// resolution should resolve d's parent normally, and CheckRoot returns (false, +// nil). +func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) { + if d == rp.root.dentry && rp.mount == rp.root.mount { + // At contextual VFS root (due to e.g. chroot(2)). + return true, nil + } else if d == rp.mount.root { + // At mount root ... + vd := rp.vfs.getMountpointAt(rp.mount, rp.root) + if vd.Ok() { + // ... of non-root mount. + rp.nextMount = vd.mount + rp.nextStart = vd.dentry + return false, resolveMountRootOrJumpError{} + } + // ... of root mount. + return true, nil + } + return false, nil +} + +// CheckMount is called after resolving the parent or child of another Dentry +// to d. If d is a mount point, such that path resolution should switch to +// another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount +// returns nil. 
+func (rp *ResolvingPath) CheckMount(d *Dentry) error { + if !d.isMounted() { + return nil + } + if mnt := rp.vfs.getMountAt(rp.mount, d); mnt != nil { + rp.nextMount = mnt + return resolveMountPointError{} + } + return nil +} + +// ShouldFollowSymlink returns true if, supposing that the current path +// component in pcs represents a symbolic link, the symbolic link should be +// followed. +// +// If path is terminated with '/', the '/' is considered the last element and +// any symlink before that is followed: +// - For most non-creating walks, the last path component is handled by +// fs/namei.c:lookup_last(), which sets LOOKUP_FOLLOW if the first byte +// after the path component is non-NULL (which is only possible if it's '/') +// and the path component is of type LAST_NORM. +// +// - For open/openat/openat2 without O_CREAT, the last path component is +// handled by fs/namei.c:do_last(), which does the same, though without the +// LAST_NORM check. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) ShouldFollowSymlink() bool { + // Non-final symlinks are always followed. Paths terminated with '/' are also + // always followed. + return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() || rp.MustBeDir() +} + +// HandleSymlink is called when the current path component is a symbolic link +// to the given target. If the calling Filesystem method should continue path +// traversal, HandleSymlink updates the path component stream to reflect the +// symlink target and returns nil. Otherwise it returns a non-nil error. +// +// Preconditions: !rp.Done(). +// +// Postconditions: If HandleSymlink returns a nil error, then !rp.Done(). 
+func (rp *ResolvingPath) HandleSymlink(target string) error { + if rp.symlinks >= linux.MaxSymlinkTraversals { + return syserror.ELOOP + } + if len(target) == 0 { + return syserror.ENOENT + } + rp.symlinks++ + targetPath := fspath.Parse(target) + if targetPath.Absolute { + rp.absSymlinkTarget = targetPath + return resolveAbsSymlinkError{} + } + // Consume the path component that represented the symlink. + rp.Advance() + // Prepend the symlink target to the relative path. + if checkInvariants { + if !targetPath.HasComponents() { + panic(fmt.Sprintf("non-empty pathname %q parsed to relative path with no components", target)) + } + } + rp.relpathPrepend(targetPath) + return nil +} + +// Preconditions: path.HasComponents(). +func (rp *ResolvingPath) relpathPrepend(path fspath.Path) { + if rp.pit.Ok() { + rp.parts[rp.curPart] = rp.pit + rp.pit = path.Begin + rp.curPart++ + } else { + // The symlink was the final path component, so now the symlink target + // is the whole path. + rp.pit = path.Begin + // Symlink targets can set rp.mustBeDir (if they end in a trailing /), + // but can't unset it. + if path.Dir { + rp.mustBeDir = true + } + } +} + +// HandleJump is called when the current path component is a "magic" link to +// the given VirtualDentry, like /proc/[pid]/fd/[fd]. If the calling Filesystem +// method should continue path traversal, HandleMagicSymlink updates the path +// component stream to reflect the magic link target and returns nil. Otherwise +// it returns a non-nil error. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) HandleJump(target VirtualDentry) error { + if rp.symlinks >= linux.MaxSymlinkTraversals { + return syserror.ELOOP + } + rp.symlinks++ + // Consume the path component that represented the magic link. + rp.Advance() + // Unconditionally return a resolveMountRootOrJumpError, even if the Mount + // isn't changing, to force restarting at the new Dentry. 
+ target.IncRef() + rp.nextMount = target.mount + rp.nextStart = target.dentry + return resolveMountRootOrJumpError{} +} + +func (rp *ResolvingPath) handleError(err error) bool { + switch err.(type) { + case resolveMountRootOrJumpError: + // Switch to the new Mount. We hold references on the Mount and Dentry. + rp.decRefStartAndMount() + rp.mount = rp.nextMount + rp.start = rp.nextStart + rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef + rp.nextMount = nil + rp.nextStart = nil + // Commit the previous FileystemImpl's progress through the relative + // path. (Don't consume the path component that caused us to traverse + // through the mount root - i.e. the ".." - because we still need to + // resolve the mount point's parent in the new FilesystemImpl.) + rp.relpathCommit() + // Restart path resolution on the new Mount. Don't bother calling + // rp.releaseErrorState() since we already set nextMount and nextStart + // to nil above. + return true + + case resolveMountPointError: + // Switch to the new Mount. We hold a reference on the Mount, but + // borrow the reference on the mount root from the Mount. + rp.decRefStartAndMount() + rp.mount = rp.nextMount + rp.start = rp.nextMount.root + rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef + rp.nextMount = nil + // Consume the path component that represented the mount point. + rp.Advance() + // Commit the previous FilesystemImpl's progress through the relative + // path. + rp.relpathCommit() + // Restart path resolution on the new Mount. + rp.releaseErrorState() + return true + + case resolveAbsSymlinkError: + // Switch to the new Mount. References are borrowed from rp.root. + rp.decRefStartAndMount() + rp.mount = rp.root.mount + rp.start = rp.root.dentry + rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef + // Consume the path component that represented the symlink. + rp.Advance() + // Prepend the symlink target to the relative path. 
+ rp.relpathPrepend(rp.absSymlinkTarget) + // Commit the previous FilesystemImpl's progress through the relative + // path, including the symlink target we just prepended. + rp.relpathCommit() + // Restart path resolution on the new Mount. + rp.releaseErrorState() + return true + + default: + // Not an error we can handle. + return false + } +} + +// canHandleError returns true if err is an error returned by rp.Resolve*() +// that rp.handleError() may attempt to handle. +func (rp *ResolvingPath) canHandleError(err error) bool { + switch err.(type) { + case resolveMountRootOrJumpError, resolveMountPointError, resolveAbsSymlinkError: + return true + default: + return false + } +} + +// MustBeDir returns true if the file traversed by rp must be a directory. +func (rp *ResolvingPath) MustBeDir() bool { + return rp.mustBeDir +} diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go new file mode 100644 index 000000000..522e27475 --- /dev/null +++ b/pkg/sentry/vfs/vfs.go @@ -0,0 +1,849 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package vfs implements a virtual filesystem layer. 
+// +// Lock order: +// +// EpollInstance.interestMu +// FileDescription.epollMu +// FilesystemImpl/FileDescriptionImpl locks +// VirtualFilesystem.mountMu +// Dentry.mu +// Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry +// VirtualFilesystem.filesystemsMu +// EpollInstance.mu +// Inotify.mu +// Watches.mu +// Inotify.evMu +// VirtualFilesystem.fsTypesMu +// +// Locking Dentry.mu in multiple Dentries requires holding +// VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple +// EpollInstances requires holding epollCycleMu. +package vfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts. +// +// There is no analogue to the VirtualFilesystem type in Linux, as the +// equivalent state in Linux is global. +// +// +stateify savable +type VirtualFilesystem struct { + // mountMu serializes mount mutations. + // + // mountMu is analogous to Linux's namespace_sem. + mountMu sync.Mutex `state:"nosave"` + + // mounts maps (mount parent, mount point) pairs to mounts. (Since mounts + // are uniquely namespaced, including mount parent in the key correctly + // handles both bind mounts and mount namespaces; Linux does the same.) + // Synchronization between mutators and readers is provided by mounts.seq; + // synchronization between mutators is provided by mountMu. + // + // mounts is used to follow mount points during path traversal. We use a + // single table rather than per-Dentry tables to reduce size (and therefore + // cache footprint) for the vast majority of Dentries that are not mount + // points. + // + // mounts is analogous to Linux's mount_hashtable. 
+ mounts mountTable + + // mountpoints maps mount points to mounts at those points in all + // namespaces. mountpoints is protected by mountMu. + // + // mountpoints is used to find mounts that must be umounted due to + // removal of a mount point Dentry from another mount namespace. ("A file + // or directory that is a mount point in one namespace that is not a mount + // point in another namespace, may be renamed, unlinked, or removed + // (rmdir(2)) in the mount namespace in which it is not a mount point + // (subject to the usual permission checks)." - mount_namespaces(7)) + // + // mountpoints is analogous to Linux's mountpoint_hashtable. + mountpoints map[*Dentry]map[*Mount]struct{} + + // lastMountID is the last allocated mount ID. lastMountID is accessed + // using atomic memory operations. + lastMountID uint64 + + // anonMount is a Mount, not included in mounts or mountpoints, + // representing an anonFilesystem. anonMount is used to back + // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry(). + // anonMount is immutable. + // + // anonMount is analogous to Linux's anon_inode_mnt. + anonMount *Mount + + // devices contains all registered Devices. devices is protected by + // devicesMu. + devicesMu sync.RWMutex `state:"nosave"` + devices map[devTuple]*registeredDevice + + // anonBlockDevMinor contains all allocated anonymous block device minor + // numbers. anonBlockDevMinorNext is a lower bound for the smallest + // unallocated anonymous block device number. anonBlockDevMinorNext and + // anonBlockDevMinor are protected by anonBlockDevMinorMu. + anonBlockDevMinorMu sync.Mutex `state:"nosave"` + anonBlockDevMinorNext uint32 + anonBlockDevMinor map[uint32]struct{} + + // fsTypes contains all registered FilesystemTypes. fsTypes is protected by + // fsTypesMu. + fsTypesMu sync.RWMutex `state:"nosave"` + fsTypes map[string]*registeredFilesystemType + + // filesystems contains all Filesystems. filesystems is protected by + // filesystemsMu. 
+ filesystemsMu sync.Mutex `state:"nosave"` + filesystems map[*Filesystem]struct{} +} + +// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes. +func (vfs *VirtualFilesystem) Init() error { + if vfs.mountpoints != nil { + panic("VFS already initialized") + } + vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{}) + vfs.devices = make(map[devTuple]*registeredDevice) + vfs.anonBlockDevMinorNext = 1 + vfs.anonBlockDevMinor = make(map[uint32]struct{}) + vfs.fsTypes = make(map[string]*registeredFilesystemType) + vfs.filesystems = make(map[*Filesystem]struct{}) + vfs.mounts.Init() + + // Construct vfs.anonMount. + anonfsDevMinor, err := vfs.GetAnonBlockDevMinor() + if err != nil { + // This shouldn't be possible since anonBlockDevMinorNext was + // initialized to 1 above (no device numbers have been allocated yet). + panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err)) + } + anonfs := anonFilesystem{ + devMinor: anonfsDevMinor, + } + anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs) + defer anonfs.vfsfs.DecRef() + anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{}) + if err != nil { + // We should not be passing any MountOptions that would cause + // construction of this mount to fail. + panic(fmt.Sprintf("VirtualFilesystem.Init: anonfs mount failed: %v", err)) + } + vfs.anonMount = anonMount + + return nil +} + +// PathOperation specifies the path operated on by a VFS method. +// +// PathOperation is passed to VFS methods by pointer to reduce memory copying: +// it's somewhat large and should never escape. (Options structs are passed by +// pointer to VFS and FileDescription methods for the same reason.) +type PathOperation struct { + // Root is the VFS root. References on Root are borrowed from the provider + // of the PathOperation. + // + // Invariants: Root.Ok(). + Root VirtualDentry + + // Start is the starting point for the path traversal. 
References on Start + // are borrowed from the provider of the PathOperation (i.e. the caller of + // the VFS method to which the PathOperation was passed). + // + // Invariants: Start.Ok(). If Path.Absolute, then Start == Root. + Start VirtualDentry + + // Path is the pathname traversed by this operation. + Path fspath.Path + + // If FollowFinalSymlink is true, and the Dentry traversed by the final + // path component represents a symbolic link, the symbolic link should be + // followed. + FollowFinalSymlink bool +} + +// AccessAt checks whether a user with creds has access to the file at +// the given path. +func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error { + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// GetDentryAt returns a VirtualDentry representing the given path, at which a +// file must exist. A reference is taken on the returned VirtualDentry. +func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) + if err == nil { + vd := VirtualDentry{ + mount: rp.mount, + dentry: d, + } + rp.mount.IncRef() + vfs.putResolvingPath(rp) + return vd, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return VirtualDentry{}, err + } + } +} + +// Preconditions: pop.Path.Begin.Ok(). 
+func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp) + if err == nil { + parentVD := VirtualDentry{ + mount: rp.mount, + dentry: parent, + } + rp.mount.IncRef() + name := rp.Component() + vfs.putResolvingPath(rp) + return parentVD, name, nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return VirtualDentry{}, "", err + } + } +} + +// LinkAt creates a hard link at newpop representing the existing file at +// oldpop. +func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error { + oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{}) + if err != nil { + return err + } + + if !newpop.Path.Begin.Ok() { + oldVD.DecRef() + if newpop.Path.Absolute { + return syserror.EEXIST + } + return syserror.ENOENT + } + if newpop.FollowFinalSymlink { + oldVD.DecRef() + ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, newpop) + for { + err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) + if err == nil { + vfs.putResolvingPath(rp) + oldVD.DecRef() + return nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + oldVD.DecRef() + return err + } + } +} + +// MkdirAt creates a directory at the given path. 
+func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EEXIST + } + return syserror.ENOENT + } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink") + return syserror.EINVAL + } + // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is + // also honored." - mkdir(2) + opts.Mode &= 0777 | linux.S_ISVTX + + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// MknodAt creates a file of the given mode at the given path. It returns an +// error from the syserror package. +func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EEXIST + } + return syserror.ENOENT + } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// OpenAt returns a FileDescription providing access to the file at the given +// path. 
A reference is taken on the returned FileDescription. +func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { + // Remove: + // + // - O_CLOEXEC, which affects file descriptors and therefore must be + // handled outside of VFS. + // + // - Unknown flags. + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_LARGEFILE | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE + // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. + if opts.Flags&linux.O_SYNC != 0 { + opts.Flags |= linux.O_DSYNC + } + // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified + // with O_DIRECTORY and a writable access mode (to ensure that it fails on + // filesystem implementations that do not support it). + if opts.Flags&linux.O_TMPFILE != 0 { + if opts.Flags&linux.O_DIRECTORY == 0 { + return nil, syserror.EINVAL + } + if opts.Flags&linux.O_CREAT != 0 { + return nil, syserror.EINVAL + } + if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { + return nil, syserror.EINVAL + } + } + // O_PATH causes most other flags to be ignored. 
+ if opts.Flags&linux.O_PATH != 0 { + opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH + } + // "On Linux, the following bits are also honored in mode: [S_ISUID, + // S_ISGID, S_ISVTX]" - open(2) + opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX + + if opts.Flags&linux.O_NOFOLLOW != 0 { + pop.FollowFinalSymlink = false + } + rp := vfs.getResolvingPath(creds, pop) + if opts.Flags&linux.O_DIRECTORY != 0 { + rp.mustBeDir = true + rp.mustBeDirOrig = true + } + for { + fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + + if opts.FileExec { + if fd.Mount().Flags.NoExec { + fd.DecRef() + return nil, syserror.EACCES + } + + // Only a regular file can be executed. + stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) + if err != nil { + fd.DecRef() + return nil, err + } + if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { + fd.DecRef() + return nil, syserror.EACCES + } + } + + fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent) + return fd, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + +// ReadlinkAt returns the target of the symbolic link at the given path. +func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return target, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return "", err + } + } +} + +// RenameAt renames the file at oldpop to newpop. 
+func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error { + if !oldpop.Path.Begin.Ok() { + if oldpop.Path.Absolute { + return syserror.EBUSY + } + return syserror.ENOENT + } + if oldpop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink") + return syserror.EINVAL + } + + oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop) + if err != nil { + return err + } + if oldName == "." || oldName == ".." { + oldParentVD.DecRef() + return syserror.EBUSY + } + + if !newpop.Path.Begin.Ok() { + oldParentVD.DecRef() + if newpop.Path.Absolute { + return syserror.EBUSY + } + return syserror.ENOENT + } + if newpop.FollowFinalSymlink { + oldParentVD.DecRef() + ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, newpop) + renameOpts := *opts + if oldpop.Path.Dir { + renameOpts.MustBeDir = true + } + for { + err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts) + if err == nil { + vfs.putResolvingPath(rp) + oldParentVD.DecRef() + return nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + oldParentVD.DecRef() + return err + } + } +} + +// RmdirAt removes the directory at the given path. 
+func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EBUSY + } + return syserror.ENOENT + } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.RmdirAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// SetStatAt changes metadata for the file at the given path. +func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error { + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// StatAt returns metadata for the file at the given path. +func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return stat, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return linux.Statx{}, err + } + } +} + +// StatFSAt returns metadata for the filesystem containing the file at the +// given path. 
+func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return statfs, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return linux.Statfs{}, err + } + } +} + +// SymlinkAt creates a symbolic link at the given path with the given target. +func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EEXIST + } + return syserror.ENOENT + } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// UnlinkAt deletes the non-directory file at the given path. 
+func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EBUSY + } + return syserror.ENOENT + } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.UnlinkAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// BoundEndpointAt gets the bound endpoint at the given path, if one exists. +func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return nil, syserror.ECONNREFUSED + } + return nil, syserror.ENOENT + } + rp := vfs.getResolvingPath(creds, pop) + for { + bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return bep, nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + +// ListxattrAt returns all extended attribute names for the file at the given +// path. 
+func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size) + if err == nil { + vfs.putResolvingPath(rp) + return names, nil + } + if err == syserror.ENOTSUP { + // Linux doesn't actually return ENOTSUP in this case; instead, + // fs/xattr.c:vfs_listxattr() falls back to allowing the security + // subsystem to return security extended attributes, which by + // default don't exist. + vfs.putResolvingPath(rp) + return nil, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + +// GetxattrAt returns the value associated with the given extended attribute +// for the file at the given path. +func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return val, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return "", err + } + } +} + +// SetxattrAt changes the value associated with the given extended attribute +// for the file at the given path. +func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error { + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// RemovexattrAt removes the given extended attribute from the file at rp. 
+func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error { + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// SyncAllFilesystems has the semantics of Linux's sync(2). +func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { + fss := make(map[*Filesystem]struct{}) + vfs.filesystemsMu.Lock() + for fs := range vfs.filesystems { + if !fs.TryIncRef() { + continue + } + fss[fs] = struct{}{} + } + vfs.filesystemsMu.Unlock() + var retErr error + for fs := range fss { + if err := fs.impl.Sync(ctx); err != nil && retErr == nil { + retErr = err + } + fs.DecRef() + } + return retErr +} + +// A VirtualDentry represents a node in a VFS tree, by combining a Dentry +// (which represents a node in a Filesystem's tree) and a Mount (which +// represents the Filesystem's position in a VFS mount tree). +// +// VirtualDentry's semantics are similar to that of a Go interface object +// representing a pointer: it is a copyable value type that represents +// references to another entity. The zero value of VirtualDentry is an "empty +// VirtualDentry", directly analogous to a nil interface object. +// VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless +// otherwise specified, all other VirtualDentry methods require +// VirtualDentry.Ok() == true. +// +// Mounts and Dentries are reference-counted, requiring that users call +// VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to +// references on the Mount and Dentry referred to by a VirtualDentry as +// references on the VirtualDentry itself. Unless otherwise specified, all +// VirtualDentry methods require that a reference is held on the VirtualDentry. 
+// +// VirtualDentry is analogous to Linux's struct path. +// +// +stateify savable +type VirtualDentry struct { + mount *Mount + dentry *Dentry +} + +// MakeVirtualDentry creates a VirtualDentry. +func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry { + return VirtualDentry{ + mount: mount, + dentry: dentry, + } +} + +// Ok returns true if vd is not empty. It does not require that a reference is +// held. +func (vd VirtualDentry) Ok() bool { + return vd.mount != nil +} + +// IncRef increments the reference counts on the Mount and Dentry represented +// by vd. +func (vd VirtualDentry) IncRef() { + vd.mount.IncRef() + vd.dentry.IncRef() +} + +// DecRef decrements the reference counts on the Mount and Dentry represented +// by vd. +func (vd VirtualDentry) DecRef() { + vd.dentry.DecRef() + vd.mount.DecRef() +} + +// Mount returns the Mount associated with vd. It does not take a reference on +// the returned Mount. +func (vd VirtualDentry) Mount() *Mount { + return vd.mount +} + +// Dentry returns the Dentry associated with vd. It does not take a reference +// on the returned Dentry. +func (vd VirtualDentry) Dentry() *Dentry { + return vd.dentry +} |