summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/vfs
diff options
context:
space:
mode:
authorkevin.xu <cming.xu@gmail.com>2020-04-27 21:51:31 +0800
committerGitHub <noreply@github.com>2020-04-27 21:51:31 +0800
commite896ca54db67524afc20b644d43c72185e72dc0e (patch)
tree2a16f3a62a5cafd098f1f028c621f1b655589d69 /pkg/sentry/vfs
parent1f19624fa127d7d59cabe29593cc80b7fe6c81f8 (diff)
parent3c67754663f424f2ebbc0ff2a4c80e30618d5355 (diff)
Merge pull request #1 from google/master
catch up
Diffstat (limited to 'pkg/sentry/vfs')
-rw-r--r--pkg/sentry/vfs/BUILD54
-rw-r--r--pkg/sentry/vfs/README.md4
-rw-r--r--pkg/sentry/vfs/anonfs.go298
-rw-r--r--pkg/sentry/vfs/context.go22
-rw-r--r--pkg/sentry/vfs/dentry.go306
-rw-r--r--pkg/sentry/vfs/device.go132
-rw-r--r--pkg/sentry/vfs/epoll.go381
-rw-r--r--pkg/sentry/vfs/file_description.go588
-rw-r--r--pkg/sentry/vfs/file_description_impl_util.go156
-rw-r--r--pkg/sentry/vfs/file_description_impl_util_test.go172
-rw-r--r--pkg/sentry/vfs/filesystem.go431
-rw-r--r--pkg/sentry/vfs/filesystem_impl_util.go43
-rw-r--r--pkg/sentry/vfs/filesystem_type.go71
-rw-r--r--pkg/sentry/vfs/genericfstree/BUILD16
-rw-r--r--pkg/sentry/vfs/genericfstree/genericfstree.go80
-rw-r--r--pkg/sentry/vfs/lock/BUILD13
-rw-r--r--pkg/sentry/vfs/lock/lock.go72
-rw-r--r--pkg/sentry/vfs/memxattr/BUILD15
-rw-r--r--pkg/sentry/vfs/memxattr/xattr.go102
-rw-r--r--pkg/sentry/vfs/mount.go686
-rw-r--r--pkg/sentry/vfs/mount_test.go39
-rw-r--r--pkg/sentry/vfs/mount_unsafe.go84
-rw-r--r--pkg/sentry/vfs/options.go92
-rw-r--r--pkg/sentry/vfs/pathname.go195
-rw-r--r--pkg/sentry/vfs/permissions.go148
-rw-r--r--pkg/sentry/vfs/resolving_path.go205
-rw-r--r--pkg/sentry/vfs/syscalls.go235
-rw-r--r--pkg/sentry/vfs/testutil.go139
-rw-r--r--pkg/sentry/vfs/timerfd.go142
-rw-r--r--pkg/sentry/vfs/vfs.go743
30 files changed, 4645 insertions, 1019 deletions
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index eff4b44f6..9aeb83fb0 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -1,40 +1,66 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
-package(licenses = ["notice"])
+licenses(["notice"])
+
+go_template_instance(
+ name = "epoll_interest_list",
+ out = "epoll_interest_list.go",
+ package = "vfs",
+ prefix = "epollInterest",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*epollInterest",
+ "Linker": "*epollInterest",
+ },
+)
go_library(
name = "vfs",
srcs = [
+ "anonfs.go",
"context.go",
"debug.go",
"dentry.go",
+ "device.go",
+ "epoll.go",
+ "epoll_interest_list.go",
"file_description.go",
"file_description_impl_util.go",
"filesystem.go",
+ "filesystem_impl_util.go",
"filesystem_type.go",
"mount.go",
"mount_unsafe.go",
"options.go",
+ "pathname.go",
"permissions.go",
"resolving_path.go",
- "syscalls.go",
- "testutil.go",
+ "timerfd.go",
"vfs.go",
],
- importpath = "gvisor.dev/gvisor/pkg/sentry/vfs",
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
+ "//pkg/context",
+ "//pkg/fd",
"//pkg/fspath",
+ "//pkg/gohacks",
+ "//pkg/log",
+ "//pkg/safemem",
"//pkg/sentry/arch",
- "//pkg/sentry/context",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/time",
+ "//pkg/sentry/limits",
"//pkg/sentry/memmap",
- "//pkg/sentry/usermem",
+ "//pkg/sentry/socket/unix/transport",
+ "//pkg/sync",
"//pkg/syserror",
+ "//pkg/usermem",
"//pkg/waiter",
- "//third_party/gvsync",
+ "@org_golang_x_sys//unix:go_default_library",
],
)
@@ -45,13 +71,13 @@ go_test(
"file_description_impl_util_test.go",
"mount_test.go",
],
- embed = [":vfs"],
+ library = ":vfs",
deps = [
"//pkg/abi/linux",
- "//pkg/sentry/context",
- "//pkg/sentry/context/contexttest",
- "//pkg/sentry/kernel/auth",
- "//pkg/sentry/usermem",
+ "//pkg/context",
+ "//pkg/sentry/contexttest",
+ "//pkg/sync",
"//pkg/syserror",
+ "//pkg/usermem",
],
)
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
index 7847854bc..9aa133bcb 100644
--- a/pkg/sentry/vfs/README.md
+++ b/pkg/sentry/vfs/README.md
@@ -39,8 +39,8 @@ Mount references are held by:
- Mount: Each referenced Mount holds a reference on its parent, which is the
mount containing its mount point.
-- VirtualFilesystem: A reference is held on all Mounts that are attached
- (reachable by Mount traversal).
+- VirtualFilesystem: A reference is held on each Mount that has not been
+ umounted.
MountNamespace and FileDescription references are held by users of VFS. The
expectation is that each `kernel.Task` holds a reference on its corresponding
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
new file mode 100644
index 000000000..a64d86122
--- /dev/null
+++ b/pkg/sentry/vfs/anonfs.go
@@ -0,0 +1,298 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// NewAnonVirtualDentry returns a VirtualDentry with the given synthetic name,
+// consistent with Linux's fs/anon_inodes.c:anon_inode_getfile(). References
+// are taken on the returned VirtualDentry.
+func (vfs *VirtualFilesystem) NewAnonVirtualDentry(name string) VirtualDentry {
+ d := anonDentry{
+ name: name,
+ }
+ d.vfsd.Init(&d)
+ vfs.anonMount.IncRef()
+ // anonDentry no-ops refcounting.
+ return VirtualDentry{
+ mount: vfs.anonMount,
+ dentry: &d.vfsd,
+ }
+}
+
+const (
+ anonfsBlockSize = usermem.PageSize // via fs/libfs.c:pseudo_fs_fill_super()
+
+ // Mode, UID, and GID for a generic anonfs file.
+ anonFileMode = 0600 // no type is correct
+ anonFileUID = auth.RootKUID
+ anonFileGID = auth.RootKGID
+)
+
+// anonFilesystemType implements FilesystemType.
+type anonFilesystemType struct{}
+
+// GetFilesystem implements FilesystemType.GetFilesystem.
+func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *auth.Credentials, string, GetFilesystemOptions) (*Filesystem, *Dentry, error) {
+ panic("cannot instaniate an anon filesystem")
+}
+
+// Name implemenents FilesystemType.Name.
+func (anonFilesystemType) Name() string {
+ return "none"
+}
+
+// anonFilesystem is the implementation of FilesystemImpl that backs
+// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
+//
+// Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl
+// methods that would require an anonDentry to be a directory return ENOTDIR.
+type anonFilesystem struct {
+ vfsfs Filesystem
+
+ devMinor uint32
+}
+
+type anonDentry struct {
+ vfsd Dentry
+
+ name string
+}
+
+// Release implements FilesystemImpl.Release.
+func (fs *anonFilesystem) Release() {
+}
+
+// Sync implements FilesystemImpl.Sync.
+func (fs *anonFilesystem) Sync(ctx context.Context) error {
+ return nil
+}
+
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+//
+// TODO(gvisor.dev/issue/1965): Implement access permissions.
+func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error {
+ if !rp.Done() {
+ return syserror.ENOTDIR
+ }
+ return GenericCheckPermissions(creds, ats, anonFileMode, anonFileUID, anonFileGID)
+}
+
+// GetDentryAt implements FilesystemImpl.GetDentryAt.
+func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) {
+ if !rp.Done() {
+ return nil, syserror.ENOTDIR
+ }
+ if opts.CheckSearchable {
+ return nil, syserror.ENOTDIR
+ }
+ // anonDentry no-ops refcounting.
+ return rp.Start(), nil
+}
+
+// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt.
+func (fs *anonFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) {
+ if !rp.Final() {
+ return nil, syserror.ENOTDIR
+ }
+ // anonDentry no-ops refcounting.
+ return rp.Start(), nil
+}
+
+// LinkAt implements FilesystemImpl.LinkAt.
+func (fs *anonFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// MkdirAt implements FilesystemImpl.MkdirAt.
+func (fs *anonFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// MknodAt implements FilesystemImpl.MknodAt.
+func (fs *anonFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// OpenAt implements FilesystemImpl.OpenAt.
+func (fs *anonFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) {
+ if !rp.Done() {
+ return nil, syserror.ENOTDIR
+ }
+ return nil, syserror.ENODEV
+}
+
+// ReadlinkAt implements FilesystemImpl.ReadlinkAt.
+func (fs *anonFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) {
+ if !rp.Done() {
+ return "", syserror.ENOTDIR
+ }
+ return "", syserror.EINVAL
+}
+
+// RenameAt implements FilesystemImpl.RenameAt.
+func (fs *anonFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// RmdirAt implements FilesystemImpl.RmdirAt.
+func (fs *anonFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// SetStatAt implements FilesystemImpl.SetStatAt.
+func (fs *anonFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error {
+ if !rp.Done() {
+ return syserror.ENOTDIR
+ }
+ // Linux actually permits anon_inode_inode's metadata to be set, which is
+ // visible to all users of anon_inode_inode. We just silently ignore
+ // metadata changes.
+ return nil
+}
+
+// StatAt implements FilesystemImpl.StatAt.
+func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) {
+ if !rp.Done() {
+ return linux.Statx{}, syserror.ENOTDIR
+ }
+ // See fs/anon_inodes.c:anon_inode_init() => fs/libfs.c:alloc_anon_inode().
+ return linux.Statx{
+ Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
+ Blksize: anonfsBlockSize,
+ Nlink: 1,
+ UID: uint32(anonFileUID),
+ GID: uint32(anonFileGID),
+ Mode: anonFileMode,
+ Ino: 1,
+ Size: 0,
+ Blocks: 0,
+ DevMajor: 0,
+ DevMinor: fs.devMinor,
+ }, nil
+}
+
+// StatFSAt implements FilesystemImpl.StatFSAt.
+func (fs *anonFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) {
+ if !rp.Done() {
+ return linux.Statfs{}, syserror.ENOTDIR
+ }
+ return linux.Statfs{
+ Type: linux.ANON_INODE_FS_MAGIC,
+ BlockSize: anonfsBlockSize,
+ }, nil
+}
+
+// SymlinkAt implements FilesystemImpl.SymlinkAt.
+func (fs *anonFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// UnlinkAt implements FilesystemImpl.UnlinkAt.
+func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath) (transport.BoundEndpoint, error) {
+ if !rp.Final() {
+ return nil, syserror.ENOTDIR
+ }
+ return nil, syserror.ECONNREFUSED
+}
+
+// ListxattrAt implements FilesystemImpl.ListxattrAt.
+func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
+ if !rp.Done() {
+ return nil, syserror.ENOTDIR
+ }
+ return nil, nil
+}
+
+// GetxattrAt implements FilesystemImpl.GetxattrAt.
+func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) {
+ if !rp.Done() {
+ return "", syserror.ENOTDIR
+ }
+ return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements FilesystemImpl.SetxattrAt.
+func (fs *anonFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
+ if !rp.Done() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
+func (fs *anonFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
+ if !rp.Done() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// PrependPath implements FilesystemImpl.PrependPath.
+func (fs *anonFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
+ b.PrependComponent(fmt.Sprintf("anon_inode:%s", vd.dentry.impl.(*anonDentry).name))
+ return PrependPathSyntheticError{}
+}
+
+// IncRef implements DentryImpl.IncRef.
+func (d *anonDentry) IncRef() {
+ // no-op
+}
+
+// TryIncRef implements DentryImpl.TryIncRef.
+func (d *anonDentry) TryIncRef() bool {
+ return true
+}
+
+// DecRef implements DentryImpl.DecRef.
+func (d *anonDentry) DecRef() {
+ // no-op
+}
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index 32cf9151b..82781e6d3 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -15,7 +15,7 @@
package vfs
import (
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/context"
)
// contextID is this package's type for context.Context.Value keys.
@@ -24,14 +24,28 @@ type contextID int
const (
// CtxMountNamespace is a Context.Value key for a MountNamespace.
CtxMountNamespace contextID = iota
+
+ // CtxRoot is a Context.Value key for a VFS root.
+ CtxRoot
)
-// MountNamespaceFromContext returns the MountNamespace used by ctx. It does
-// not take a reference on the returned MountNamespace. If ctx is not
-// associated with a MountNamespace, MountNamespaceFromContext returns nil.
+// MountNamespaceFromContext returns the MountNamespace used by ctx. If ctx is
+// not associated with a MountNamespace, MountNamespaceFromContext returns nil.
+//
+// A reference is taken on the returned MountNamespace.
func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
if v := ctx.Value(CtxMountNamespace); v != nil {
return v.(*MountNamespace)
}
return nil
}
+
+// RootFromContext returns the VFS root used by ctx. It takes a reference on
+// the returned VirtualDentry. If ctx does not have a specific VFS root,
+// RootFromContext returns a zero-value VirtualDentry.
+func RootFromContext(ctx context.Context) VirtualDentry {
+ if v := ctx.Value(CtxRoot); v != nil {
+ return v.(VirtualDentry)
+ }
+ return VirtualDentry{}
+}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 45912fc58..8624dbd5d 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -15,33 +15,17 @@
package vfs
import (
- "fmt"
"sync/atomic"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
-// Dentry represents a node in a Filesystem tree which may represent a file.
+// Dentry represents a node in a Filesystem tree at which a file exists.
//
// Dentries are reference-counted. Unless otherwise specified, all Dentry
// methods require that a reference is held.
//
-// A Dentry transitions through up to 3 different states through its lifetime:
-//
-// - Dentries are initially "independent". Independent Dentries have no parent,
-// and consequently no name.
-//
-// - Dentry.InsertChild() causes an independent Dentry to become a "child" of
-// another Dentry. A child node has a parent node, and a name in that parent,
-// both of which are mutable by DentryMoveChild(). Each child Dentry's name is
-// unique within its parent.
-//
-// - Dentry.RemoveChild() causes a child Dentry to become "disowned". A
-// disowned Dentry can still refer to its former parent and its former name in
-// said parent, but the disowned Dentry is no longer reachable from its parent,
-// and a new Dentry with the same name may become a child of the parent. (This
-// is analogous to a struct dentry being "unhashed" in Linux.)
-//
// Dentry is loosely analogous to Linux's struct dentry, but:
//
// - VFS does not associate Dentries with inodes. gVisor interacts primarily
@@ -50,15 +34,12 @@ import (
// and not inodes. Furthermore, when parties outside the scope of VFS can
// rename inodes on such filesystems, VFS generally cannot "follow" the rename,
// both due to synchronization issues and because it may not even be able to
-// name the destination path; this implies that it would in fact be *incorrect*
+// name the destination path; this implies that it would in fact be incorrect
// for Dentries to be associated with inodes on such filesystems. Consequently,
// operations that are inode operations in Linux are FilesystemImpl methods
// and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do
// support inodes may store appropriate state in implementations of DentryImpl.
//
-// - VFS does not provide synchronization for mutable Dentry fields, other than
-// mount-related ones.
-//
// - VFS does not require that Dentries are instantiated for all paths accessed
// through VFS, only those that are tracked beyond the scope of a single
// Filesystem operation. This includes file descriptions, mount points, mount
@@ -66,38 +47,34 @@ import (
// of Dentries for operations on mutable remote filesystems that can't actually
// cache any state in the Dentry.
//
+// - VFS does not track filesystem structure (i.e. relationships between
+// Dentries), since both the relevant state and synchronization are
+// filesystem-specific.
+//
// - For the reasons above, VFS is not directly responsible for managing Dentry
// lifetime. Dentry reference counts only indicate the extent to which VFS
// requires Dentries to exist; Filesystems may elect to cache or discard
// Dentries with zero references.
+//
+// +stateify savable
type Dentry struct {
- // parent is this Dentry's parent in this Filesystem. If this Dentry is
- // independent, parent is nil.
- parent *Dentry
-
- // name is this Dentry's name in parent.
- name string
+ // mu synchronizes deletion/invalidation and mounting over this Dentry.
+ mu sync.Mutex `state:"nosave"`
- flags uint32
+ // dead is true if the file represented by this Dentry has been deleted (by
+ // CommitDeleteDentry or CommitRenameReplaceDentry) or invalidated (by
+ // InvalidateDentry). dead is protected by mu.
+ dead bool
// mounts is the number of Mounts for which this Dentry is Mount.point.
// mounts is accessed using atomic memory operations.
mounts uint32
- // children are child Dentries.
- children map[string]*Dentry
-
// impl is the DentryImpl associated with this Dentry. impl is immutable.
// This should be the last field in Dentry.
impl DentryImpl
}
-const (
- // dflagsDisownedMask is set in Dentry.flags if the Dentry has been
- // disowned.
- dflagsDisownedMask = 1 << iota
-)
-
// Init must be called before first use of d.
func (d *Dentry) Init(impl DentryImpl) {
d.impl = impl
@@ -114,7 +91,7 @@ func (d *Dentry) Impl() DentryImpl {
type DentryImpl interface {
// IncRef increments the Dentry's reference count. A Dentry with a non-zero
// reference count must remain coherent with the state of the filesystem.
- IncRef(fs *Filesystem)
+ IncRef()
// TryIncRef increments the Dentry's reference count and returns true. If
// the Dentry's reference count is zero, TryIncRef may do nothing and
@@ -122,148 +99,87 @@ type DentryImpl interface {
// guarantee that the Dentry is coherent with the state of the filesystem.)
//
// TryIncRef does not require that a reference is held on the Dentry.
- TryIncRef(fs *Filesystem) bool
+ TryIncRef() bool
// DecRef decrements the Dentry's reference count.
- DecRef(fs *Filesystem)
+ DecRef()
}
-// IsDisowned returns true if d is disowned.
-func (d *Dentry) IsDisowned() bool {
- return atomic.LoadUint32(&d.flags)&dflagsDisownedMask != 0
+// IncRef increments d's reference count.
+func (d *Dentry) IncRef() {
+ d.impl.IncRef()
}
-// Preconditions: !d.IsDisowned().
-func (d *Dentry) setDisowned() {
- atomic.AddUint32(&d.flags, dflagsDisownedMask)
-}
-
-func (d *Dentry) isMounted() bool {
- return atomic.LoadUint32(&d.mounts) != 0
+// TryIncRef increments d's reference count and returns true. If d's reference
+// count is zero, TryIncRef may instead do nothing and return false.
+func (d *Dentry) TryIncRef() bool {
+ return d.impl.TryIncRef()
}
-func (d *Dentry) incRef(fs *Filesystem) {
- d.impl.IncRef(fs)
+// DecRef decrements d's reference count.
+func (d *Dentry) DecRef() {
+ d.impl.DecRef()
}
-func (d *Dentry) tryIncRef(fs *Filesystem) bool {
- return d.impl.TryIncRef(fs)
+// IsDead returns true if d has been deleted or invalidated by its owning
+// filesystem.
+func (d *Dentry) IsDead() bool {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+ return d.dead
}
-func (d *Dentry) decRef(fs *Filesystem) {
- d.impl.DecRef(fs)
-}
-
-// These functions are exported so that filesystem implementations can use
-// them. The vfs package, and users of VFS, should not call these functions.
-// Unless otherwise specified, these methods require that there are no
-// concurrent mutators of d.
-
-// Name returns d's name in its parent in its owning Filesystem. If d is
-// independent, Name returns an empty string.
-func (d *Dentry) Name() string {
- return d.name
-}
-
-// Parent returns d's parent in its owning Filesystem. It does not take a
-// reference on the returned Dentry. If d is independent, Parent returns nil.
-func (d *Dentry) Parent() *Dentry {
- return d.parent
-}
-
-// ParentOrSelf is equivalent to Parent, but returns d if d is independent.
-func (d *Dentry) ParentOrSelf() *Dentry {
- if d.parent == nil {
- return d
- }
- return d.parent
-}
-
-// Child returns d's child with the given name in its owning Filesystem. It
-// does not take a reference on the returned Dentry. If no such child exists,
-// Child returns nil.
-func (d *Dentry) Child(name string) *Dentry {
- return d.children[name]
+func (d *Dentry) isMounted() bool {
+ return atomic.LoadUint32(&d.mounts) != 0
}
-// HasChildren returns true if d has any children.
-func (d *Dentry) HasChildren() bool {
- return len(d.children) != 0
-}
-
-// InsertChild makes child a child of d with the given name.
-//
-// InsertChild is a mutator of d and child.
-//
-// Preconditions: child must be an independent Dentry. d and child must be from
-// the same Filesystem. d must not already have a child with the given name.
-func (d *Dentry) InsertChild(child *Dentry, name string) {
- if checkInvariants {
- if _, ok := d.children[name]; ok {
- panic(fmt.Sprintf("parent already contains a child named %q", name))
- }
- if child.parent != nil || child.name != "" {
- panic(fmt.Sprintf("child is not independent: parent = %v, name = %q", child.parent, child.name))
- }
- }
- if d.children == nil {
- d.children = make(map[string]*Dentry)
- }
- d.children[name] = child
- child.parent = d
- child.name = name
-}
+// The following functions are exported so that filesystem implementations can
+// use them. The vfs package, and users of VFS, should not call these
+// functions.
// PrepareDeleteDentry must be called before attempting to delete the file
// represented by d. If PrepareDeleteDentry succeeds, the caller must call
// AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome.
-//
-// Preconditions: d is a child Dentry.
func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error {
- if checkInvariants {
- if d.parent == nil {
- panic("d is independent")
- }
- if d.IsDisowned() {
- panic("d is already disowned")
- }
- }
- vfs.mountMu.RLock()
- if _, ok := mntns.mountpoints[d]; ok {
- vfs.mountMu.RUnlock()
+ vfs.mountMu.Lock()
+ if mntns.mountpoints[d] != 0 {
+ vfs.mountMu.Unlock()
return syserror.EBUSY
}
- // Return with vfs.mountMu locked, which will be unlocked by
- // AbortDeleteDentry or CommitDeleteDentry.
+ d.mu.Lock()
+ vfs.mountMu.Unlock()
+ // Return with d.mu locked to block attempts to mount over it; it will be
+ // unlocked by AbortDeleteDentry or CommitDeleteDentry.
return nil
}
// AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion
// fails.
-func (vfs *VirtualFilesystem) AbortDeleteDentry() {
- vfs.mountMu.RUnlock()
+func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) {
+ d.mu.Unlock()
}
-// CommitDeleteDentry must be called after the file represented by d is
-// deleted, and causes d to become disowned.
-//
-// Preconditions: PrepareDeleteDentry was previously called on d.
+// CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion
+// succeeds.
func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
- delete(d.parent.children, d.name)
- d.setDisowned()
- // TODO: lazily unmount mounts at d
- vfs.mountMu.RUnlock()
+ d.dead = true
+ d.mu.Unlock()
+ if d.isMounted() {
+ vfs.forgetDeadMountpoint(d)
+ }
}
-// DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as
-// appropriate for in-memory filesystems that don't need to ensure that some
-// external state change succeeds before committing the deletion.
-func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error {
- if err := vfs.PrepareDeleteDentry(mntns, d); err != nil {
- return err
+// InvalidateDentry is called when d ceases to represent the file it formerly
+// did for reasons outside of VFS' control (e.g. d represents the local state
+// of a file on a remote filesystem on which the file has already been
+// deleted).
+func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) {
+ d.mu.Lock()
+ d.dead = true
+ d.mu.Unlock()
+ if d.isMounted() {
+ vfs.forgetDeadMountpoint(d)
}
- vfs.CommitDeleteDentry(d)
- return nil
}
// PrepareRenameDentry must be called before attempting to rename the file
@@ -272,37 +188,24 @@ func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) err
// caller must call AbortRenameDentry, CommitRenameReplaceDentry, or
// CommitRenameExchangeDentry depending on the rename's outcome.
//
-// Preconditions: from is a child Dentry. If to is not nil, it must be a child
-// Dentry from the same Filesystem.
+// Preconditions: If to is not nil, it must be a child Dentry from the same
+// Filesystem. from != to.
func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error {
- if checkInvariants {
- if from.parent == nil {
- panic("from is independent")
- }
- if from.IsDisowned() {
- panic("from is already disowned")
- }
- if to != nil {
- if to.parent == nil {
- panic("to is independent")
- }
- if to.IsDisowned() {
- panic("to is already disowned")
- }
- }
- }
- vfs.mountMu.RLock()
- if _, ok := mntns.mountpoints[from]; ok {
- vfs.mountMu.RUnlock()
+ vfs.mountMu.Lock()
+ if mntns.mountpoints[from] != 0 {
+ vfs.mountMu.Unlock()
return syserror.EBUSY
}
if to != nil {
- if _, ok := mntns.mountpoints[to]; ok {
- vfs.mountMu.RUnlock()
+ if mntns.mountpoints[to] != 0 {
+ vfs.mountMu.Unlock()
return syserror.EBUSY
}
+ to.mu.Lock()
}
- // Return with vfs.mountMu locked, which will be unlocked by
+ from.mu.Lock()
+ vfs.mountMu.Unlock()
+ // Return with from.mu and to.mu locked, which will be unlocked by
// AbortRenameDentry, CommitRenameReplaceDentry, or
// CommitRenameExchangeDentry.
return nil
@@ -310,8 +213,11 @@ func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, t
// AbortRenameDentry must be called after PrepareRenameDentry if the rename
// fails.
-func (vfs *VirtualFilesystem) AbortRenameDentry() {
- vfs.mountMu.RUnlock()
+func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) {
+ from.mu.Unlock()
+ if to != nil {
+ to.mu.Unlock()
+ }
}
// CommitRenameReplaceDentry must be called after the file represented by from
@@ -319,19 +225,15 @@ func (vfs *VirtualFilesystem) AbortRenameDentry() {
// that was replaced by from.
//
// Preconditions: PrepareRenameDentry was previously called on from and to.
-// newParent.Child(newName) == to.
-func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) {
+func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, to *Dentry) {
+ from.mu.Unlock()
if to != nil {
- to.setDisowned()
- // TODO: lazily unmount mounts at d
- }
- if newParent.children == nil {
- newParent.children = make(map[string]*Dentry)
+ to.dead = true
+ to.mu.Unlock()
+ if to.isMounted() {
+ vfs.forgetDeadMountpoint(to)
+ }
}
- newParent.children[newName] = from
- from.parent = newParent
- from.name = newName
- vfs.mountMu.RUnlock()
}
// CommitRenameExchangeDentry must be called after the files represented by
@@ -339,9 +241,31 @@ func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry,
//
// Preconditions: PrepareRenameDentry was previously called on from and to.
func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) {
- from.parent, to.parent = to.parent, from.parent
- from.name, to.name = to.name, from.name
- from.parent.children[from.name] = from
- to.parent.children[to.name] = to
- vfs.mountMu.RUnlock()
+ from.mu.Unlock()
+ to.mu.Unlock()
+}
+
+// forgetDeadMountpoint is called when a mount point is deleted or invalidated
+// to umount all mounts using it in all other mount namespaces.
+//
+// forgetDeadMountpoint is analogous to Linux's
+// fs/namespace.c:__detach_mounts().
+func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) {
+ var (
+ vdsToDecRef []VirtualDentry
+ mountsToDecRef []*Mount
+ )
+ vfs.mountMu.Lock()
+ vfs.mounts.seq.BeginWrite()
+ for mnt := range vfs.mountpoints[d] {
+ vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(mnt, &umountRecursiveOptions{}, vdsToDecRef, mountsToDecRef)
+ }
+ vfs.mounts.seq.EndWrite()
+ vfs.mountMu.Unlock()
+ for _, vd := range vdsToDecRef {
+ vd.DecRef()
+ }
+ for _, mnt := range mountsToDecRef {
+ mnt.DecRef()
+ }
}
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
new file mode 100644
index 000000000..bda5576fa
--- /dev/null
+++ b/pkg/sentry/vfs/device.go
@@ -0,0 +1,132 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// DeviceKind indicates whether a device is a block or character device.
+type DeviceKind uint32
+
+const (
+ // BlockDevice indicates a block device.
+ BlockDevice DeviceKind = iota
+
+ // CharDevice indicates a character device.
+ CharDevice
+)
+
+// String implements fmt.Stringer.String.
+func (kind DeviceKind) String() string {
+ switch kind {
+ case BlockDevice:
+ return "block"
+ case CharDevice:
+ return "character"
+ default:
+ return fmt.Sprintf("invalid device kind %d", kind)
+ }
+}
+
+type devTuple struct {
+ kind DeviceKind
+ major uint32
+ minor uint32
+}
+
+// A Device backs device special files.
+type Device interface {
+ // Open returns a FileDescription representing this device.
+ Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error)
+}
+
+// +stateify savable
+type registeredDevice struct {
+ dev Device
+ opts RegisterDeviceOptions
+}
+
+// RegisterDeviceOptions contains options to
+// VirtualFilesystem.RegisterDevice().
+//
+// +stateify savable
+type RegisterDeviceOptions struct {
+ // GroupName is the name shown for this device registration in
+ // /proc/devices. If GroupName is empty, this registration will not be
+ // shown in /proc/devices.
+ GroupName string
+}
+
+// RegisterDevice registers the given Device in vfs with the given major and
+// minor device numbers.
+func (vfs *VirtualFilesystem) RegisterDevice(kind DeviceKind, major, minor uint32, dev Device, opts *RegisterDeviceOptions) error {
+ tup := devTuple{kind, major, minor}
+ vfs.devicesMu.Lock()
+ defer vfs.devicesMu.Unlock()
+ if existing, ok := vfs.devices[tup]; ok {
+ return fmt.Errorf("%s device number (%d, %d) is already registered to device type %T", kind, major, minor, existing.dev)
+ }
+ vfs.devices[tup] = &registeredDevice{
+ dev: dev,
+ opts: *opts,
+ }
+ return nil
+}
+
+// OpenDeviceSpecialFile returns a FileDescription representing the given
+// device.
+func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mount, d *Dentry, kind DeviceKind, major, minor uint32, opts *OpenOptions) (*FileDescription, error) {
+ tup := devTuple{kind, major, minor}
+ vfs.devicesMu.RLock()
+ defer vfs.devicesMu.RUnlock()
+ rd, ok := vfs.devices[tup]
+ if !ok {
+ return nil, syserror.ENXIO
+ }
+ return rd.dev.Open(ctx, mnt, d, *opts)
+}
+
+// GetAnonBlockDevMinor allocates and returns an unused minor device number for
+// an "anonymous" block device with major number 0.
+func (vfs *VirtualFilesystem) GetAnonBlockDevMinor() (uint32, error) {
+ vfs.anonBlockDevMinorMu.Lock()
+ defer vfs.anonBlockDevMinorMu.Unlock()
+ minor := vfs.anonBlockDevMinorNext
+ const maxDevMinor = (1 << 20) - 1
+ for minor < maxDevMinor {
+ if _, ok := vfs.anonBlockDevMinor[minor]; !ok {
+ vfs.anonBlockDevMinor[minor] = struct{}{}
+ vfs.anonBlockDevMinorNext = minor + 1
+ return minor, nil
+ }
+ minor++
+ }
+ return 0, syserror.EMFILE
+}
+
+// PutAnonBlockDevMinor deallocates a minor device number returned by a
+// previous call to GetAnonBlockDevMinor.
+func (vfs *VirtualFilesystem) PutAnonBlockDevMinor(minor uint32) {
+ vfs.anonBlockDevMinorMu.Lock()
+ defer vfs.anonBlockDevMinorMu.Unlock()
+ delete(vfs.anonBlockDevMinor, minor)
+ if minor < vfs.anonBlockDevMinorNext {
+ vfs.anonBlockDevMinorNext = minor
+ }
+}
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
new file mode 100644
index 000000000..8e0b40841
--- /dev/null
+++ b/pkg/sentry/vfs/epoll.go
@@ -0,0 +1,381 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// epollCycleMu serializes attempts to register EpollInstances with other
+// EpollInstances in order to check for cycles.
+var epollCycleMu sync.Mutex
+
+// EpollInstance represents an epoll instance, as described by epoll(7).
+type EpollInstance struct {
+ vfsfd FileDescription
+ FileDescriptionDefaultImpl
+ DentryMetadataFileDescriptionImpl
+
+ // q holds waiters on this EpollInstance.
+ q waiter.Queue
+
+ // interest is the set of file descriptors that are registered with the
+ // EpollInstance for monitoring. interest is protected by interestMu.
+ interestMu sync.Mutex
+ interest map[epollInterestKey]*epollInterest
+
+ // mu protects fields in registered epollInterests.
+ mu sync.Mutex
+
+ // ready is the set of file descriptors that may be "ready" for I/O. Note
+ // that this must be an ordered list, not a map: "If more than maxevents
+ // file descriptors are ready when epoll_wait() is called, then successive
+ // epoll_wait() calls will round robin through the set of ready file
+ // descriptors. This behavior helps avoid starvation scenarios, where a
+ // process fails to notice that additional file descriptors are ready
+ // because it focuses on a set of file descriptors that are already known
+ // to be ready." - epoll_wait(2)
+ ready epollInterestList
+}
+
+type epollInterestKey struct {
+ // file is the registered FileDescription. No reference is held on file;
+ // instead, when the last reference is dropped, FileDescription.DecRef()
+ // removes the FileDescription from all EpollInstances. file is immutable.
+ file *FileDescription
+
+ // num is the file descriptor number with which this entry was registered.
+ // num is immutable.
+ num int32
+}
+
+// epollInterest represents an EpollInstance's interest in a file descriptor.
+type epollInterest struct {
+ // epoll is the owning EpollInstance. epoll is immutable.
+ epoll *EpollInstance
+
+ // key is the file to which this epollInterest applies. key is immutable.
+ key epollInterestKey
+
+ // waiter is registered with key.file. entry is protected by epoll.mu.
+ waiter waiter.Entry
+
+ // mask is the event mask associated with this registration, including
+ // flags EPOLLET and EPOLLONESHOT. mask is protected by epoll.mu.
+ mask uint32
+
+ // ready is true if epollInterestEntry is linked into epoll.ready. ready
+ // and epollInterestEntry are protected by epoll.mu.
+ ready bool
+ epollInterestEntry
+
+ // userData is the struct epoll_event::data associated with this
+ // epollInterest. userData is protected by epoll.mu.
+ userData [2]int32
+}
+
+// NewEpollInstanceFD returns a FileDescription representing a new epoll
+// instance. A reference is taken on the returned FileDescription.
+func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) {
+ vd := vfs.NewAnonVirtualDentry("[eventpoll]")
+ defer vd.DecRef()
+ ep := &EpollInstance{
+ interest: make(map[epollInterestKey]*epollInterest),
+ }
+ if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+ DenyPRead: true,
+ DenyPWrite: true,
+ UseDentryMetadata: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &ep.vfsfd, nil
+}
+
+// Release implements FileDescriptionImpl.Release.
+func (ep *EpollInstance) Release() {
+ // Unregister all polled fds.
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+ for key, epi := range ep.interest {
+ file := key.file
+ file.epollMu.Lock()
+ delete(file.epolls, epi)
+ file.epollMu.Unlock()
+ file.EventUnregister(&epi.waiter)
+ }
+ ep.interest = nil
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask {
+ if mask&waiter.EventIn == 0 {
+ return 0
+ }
+ ep.mu.Lock()
+ for epi := ep.ready.Front(); epi != nil; epi = epi.Next() {
+ wmask := waiter.EventMaskFromLinux(epi.mask)
+ if epi.key.file.Readiness(wmask)&wmask != 0 {
+ ep.mu.Unlock()
+ return waiter.EventIn
+ }
+ }
+ ep.mu.Unlock()
+ return 0
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (ep *EpollInstance) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ ep.q.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (ep *EpollInstance) EventUnregister(e *waiter.Entry) {
+ ep.q.EventUnregister(e)
+}
+
+// Seek implements FileDescriptionImpl.Seek.
+func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ // Linux: fs/eventpoll.c:eventpoll_fops.llseek == noop_llseek
+ return 0, nil
+}
+
+// AddInterest implements the semantics of EPOLL_CTL_ADD.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
+ // Check for cyclic polling if necessary.
+ subep, _ := file.impl.(*EpollInstance)
+ if subep != nil {
+ epollCycleMu.Lock()
+ // epollCycleMu must be locked for the rest of AddInterest to ensure
+ // that cyclic polling is not introduced after the check.
+ defer epollCycleMu.Unlock()
+ if subep.mightPoll(ep) {
+ return syserror.ELOOP
+ }
+ }
+
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+
+ // Fail if the key is already registered.
+ key := epollInterestKey{
+ file: file,
+ num: num,
+ }
+ if _, ok := ep.interest[key]; ok {
+ return syserror.EEXIST
+ }
+
+ // Register interest in file.
+ mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
+ epi := &epollInterest{
+ epoll: ep,
+ key: key,
+ mask: mask,
+ userData: event.Data,
+ }
+ ep.interest[key] = epi
+ wmask := waiter.EventMaskFromLinux(mask)
+ file.EventRegister(&epi.waiter, wmask)
+
+ // Check if the file is already ready.
+ if file.Readiness(wmask)&wmask != 0 {
+ epi.Callback(nil)
+ }
+
+ // Add epi to file.epolls so that it is removed when the last
+ // FileDescription reference is dropped.
+ file.epollMu.Lock()
+ if file.epolls == nil {
+ file.epolls = make(map[*epollInterest]struct{})
+ }
+ file.epolls[epi] = struct{}{}
+ file.epollMu.Unlock()
+
+ return nil
+}
+
+func (ep *EpollInstance) mightPoll(ep2 *EpollInstance) bool {
+ return ep.mightPollRecursive(ep2, 4) // Linux: fs/eventpoll.c:EP_MAX_NESTS
+}
+
+func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursion int) bool {
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+ for key := range ep.interest {
+ nextep, ok := key.file.impl.(*EpollInstance)
+ if !ok {
+ continue
+ }
+ if nextep == ep2 {
+ return true
+ }
+ if remainingRecursion == 0 {
+ return true
+ }
+ if nextep.mightPollRecursive(ep2, remainingRecursion-1) {
+ return true
+ }
+ }
+ return false
+}
+
+// ModifyInterest implements the semantics of EPOLL_CTL_MOD.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+
+ // Fail if the key is not already registered.
+ epi, ok := ep.interest[epollInterestKey{
+ file: file,
+ num: num,
+ }]
+ if !ok {
+ return syserror.ENOENT
+ }
+
+ // Update epi for the next call to ep.ReadEvents().
+ mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
+ ep.mu.Lock()
+ epi.mask = mask
+ epi.userData = event.Data
+ ep.mu.Unlock()
+
+ // Re-register with the new mask.
+ file.EventUnregister(&epi.waiter)
+ wmask := waiter.EventMaskFromLinux(mask)
+ file.EventRegister(&epi.waiter, wmask)
+
+ // Check if the file is already ready with the new mask.
+ if file.Readiness(wmask)&wmask != 0 {
+ epi.Callback(nil)
+ }
+
+ return nil
+}
+
+// DeleteInterest implements the semantics of EPOLL_CTL_DEL.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error {
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+
+ // Fail if the key is not already registered.
+ epi, ok := ep.interest[epollInterestKey{
+ file: file,
+ num: num,
+ }]
+ if !ok {
+ return syserror.ENOENT
+ }
+
+ // Unregister from the file so that epi will no longer be readied.
+ file.EventUnregister(&epi.waiter)
+
+ // Forget about epi.
+ ep.removeLocked(epi)
+
+ file.epollMu.Lock()
+ delete(file.epolls, epi)
+ file.epollMu.Unlock()
+
+ return nil
+}
+
+// Callback implements waiter.EntryCallback.Callback.
+func (epi *epollInterest) Callback(*waiter.Entry) {
+ newReady := false
+ epi.epoll.mu.Lock()
+ if !epi.ready {
+ newReady = true
+ epi.ready = true
+ epi.epoll.ready.PushBack(epi)
+ }
+ epi.epoll.mu.Unlock()
+ if newReady {
+ epi.epoll.q.Notify(waiter.EventIn)
+ }
+}
+
+// Preconditions: ep.interestMu must be locked.
+func (ep *EpollInstance) removeLocked(epi *epollInterest) {
+ delete(ep.interest, epi.key)
+ ep.mu.Lock()
+ if epi.ready {
+ epi.ready = false
+ ep.ready.Remove(epi)
+ }
+ ep.mu.Unlock()
+}
+
+// ReadEvents reads up to len(events) ready events into events and returns the
+// number of events read.
+//
+// Preconditions: len(events) != 0.
+func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
+ i := 0
+ // Hot path: avoid defer.
+ ep.mu.Lock()
+ var next *epollInterest
+ var requeue epollInterestList
+ for epi := ep.ready.Front(); epi != nil; epi = next {
+ next = epi.Next()
+ // Regardless of what else happens, epi is initially removed from the
+ // ready list.
+ ep.ready.Remove(epi)
+ wmask := waiter.EventMaskFromLinux(epi.mask)
+ ievents := epi.key.file.Readiness(wmask) & wmask
+ if ievents == 0 {
+ // Leave epi off the ready list.
+ epi.ready = false
+ continue
+ }
+ // Determine what we should do with epi.
+ switch {
+ case epi.mask&linux.EPOLLONESHOT != 0:
+ // Clear all events from the mask; they must be re-added by
+ // EPOLL_CTL_MOD.
+ epi.mask &= linux.EP_PRIVATE_BITS
+ fallthrough
+ case epi.mask&linux.EPOLLET != 0:
+ // Leave epi off the ready list.
+ epi.ready = false
+ default:
+ // Queue epi to be moved to the end of the ready list.
+ requeue.PushBack(epi)
+ }
+ // Report ievents.
+ events[i] = linux.EpollEvent{
+ Events: ievents.ToLinux(),
+ Data: epi.userData,
+ }
+ i++
+ if i == len(events) {
+ break
+ }
+ }
+ ep.ready.PushBackList(&requeue)
+ ep.mu.Unlock()
+ return i
+}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 3a9665800..418d69b96 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -18,10 +18,14 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -38,36 +42,101 @@ type FileDescription struct {
// operations.
refs int64
+ // statusFlags contains status flags, "initialized by open(2) and possibly
+ // modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic
+ // memory operations.
+ statusFlags uint32
+
+ // epolls is the set of epollInterests registered for this FileDescription.
+ // epolls is protected by epollMu.
+ epollMu sync.Mutex
+ epolls map[*epollInterest]struct{}
+
// vd is the filesystem location at which this FileDescription was opened.
// A reference is held on vd. vd is immutable.
vd VirtualDentry
+ // opts contains options passed to FileDescription.Init(). opts is
+ // immutable.
+ opts FileDescriptionOptions
+
+ // readable is MayReadFileWithOpenFlags(statusFlags). readable is
+ // immutable.
+ //
+ // readable is analogous to Linux's FMODE_READ.
+ readable bool
+
+ // writable is MayWriteFileWithOpenFlags(statusFlags). If writable is true,
+ // the FileDescription holds a write count on vd.mount. writable is
+ // immutable.
+ //
+ // writable is analogous to Linux's FMODE_WRITE.
+ writable bool
+
// impl is the FileDescriptionImpl associated with this Filesystem. impl is
// immutable. This should be the last field in FileDescription.
impl FileDescriptionImpl
}
-// Init must be called before first use of fd. It takes references on mnt and
-// d.
-func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) {
+// FileDescriptionOptions contains options to FileDescription.Init().
+type FileDescriptionOptions struct {
+ // If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is
+ // usually only the case if O_DIRECT would actually have an effect.
+ AllowDirectIO bool
+
+ // If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE.
+ DenyPRead bool
+
+ // If DenyPWrite is true, calls to FileDescription.PWrite() return
+ // ESPIPE.
+ DenyPWrite bool
+
+ // if InvalidWrite is true, calls to FileDescription.Write() return
+ // EINVAL.
+ InvalidWrite bool
+
+ // If UseDentryMetadata is true, calls to FileDescription methods that
+ // interact with file and filesystem metadata (Stat, SetStat, StatFS,
+ // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
+ // the corresponding FilesystemImpl methods instead of the corresponding
+ // FileDescriptionImpl methods.
+ //
+ // UseDentryMetadata is intended for file descriptions that are implemented
+ // outside of individual filesystems, such as pipes, sockets, and device
+ // special files. FileDescriptions for which UseDentryMetadata is true may
+ // embed DentryMetadataFileDescriptionImpl to obtain appropriate
+ // implementations of FileDescriptionImpl methods that should not be
+ // called.
+ UseDentryMetadata bool
+}
+
+// Init must be called before first use of fd. If it succeeds, it takes
+// references on mnt and d. flags is the initial file description flags, which
+// is usually the full set of flags passed to open(2).
+func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error {
+ writable := MayWriteFileWithOpenFlags(flags)
+ if writable {
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ }
+
fd.refs = 1
+
+ // Remove "file creation flags" to mirror the behavior from file.f_flags in
+ // fs/open.c:do_dentry_open
+ fd.statusFlags = flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC)
fd.vd = VirtualDentry{
mount: mnt,
dentry: d,
}
- fd.vd.IncRef()
+ mnt.IncRef()
+ d.IncRef()
+ fd.opts = *opts
+ fd.readable = MayReadFileWithOpenFlags(flags)
+ fd.writable = writable
fd.impl = impl
-}
-
-// Impl returns the FileDescriptionImpl associated with fd.
-func (fd *FileDescription) Impl() FileDescriptionImpl {
- return fd.impl
-}
-
-// VirtualDentry returns the location at which fd was opened. It does not take
-// a reference on the returned VirtualDentry.
-func (fd *FileDescription) VirtualDentry() VirtualDentry {
- return fd.vd
+ return nil
}
// IncRef increments fd's reference count.
@@ -75,16 +144,144 @@ func (fd *FileDescription) IncRef() {
atomic.AddInt64(&fd.refs, 1)
}
+// TryIncRef increments fd's reference count and returns true. If fd's
+// reference count is already zero, TryIncRef does nothing and returns false.
+//
+// TryIncRef does not require that a reference is held on fd.
+func (fd *FileDescription) TryIncRef() bool {
+ for {
+ refs := atomic.LoadInt64(&fd.refs)
+ if refs <= 0 {
+ return false
+ }
+ if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) {
+ return true
+ }
+ }
+}
+
// DecRef decrements fd's reference count.
func (fd *FileDescription) DecRef() {
if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
+ // Unregister fd from all epoll instances.
+ fd.epollMu.Lock()
+ epolls := fd.epolls
+ fd.epolls = nil
+ fd.epollMu.Unlock()
+ for epi := range epolls {
+ ep := epi.epoll
+ ep.interestMu.Lock()
+ // Check that epi has not been concurrently unregistered by
+ // EpollInstance.DeleteInterest() or EpollInstance.Release().
+ if _, ok := ep.interest[epi.key]; ok {
+ fd.EventUnregister(&epi.waiter)
+ ep.removeLocked(epi)
+ }
+ ep.interestMu.Unlock()
+ }
+ // Release implementation resources.
fd.impl.Release()
+ if fd.writable {
+ fd.vd.mount.EndWrite()
+ }
fd.vd.DecRef()
} else if refs < 0 {
panic("FileDescription.DecRef() called without holding a reference")
}
}
+// Refs returns the current number of references. The returned count
+// is inherently racy and is unsafe to use without external synchronization.
+func (fd *FileDescription) Refs() int64 {
+ return atomic.LoadInt64(&fd.refs)
+}
+
+// Mount returns the mount on which fd was opened. It does not take a reference
+// on the returned Mount.
+func (fd *FileDescription) Mount() *Mount {
+ return fd.vd.mount
+}
+
+// Dentry returns the dentry at which fd was opened. It does not take a
+// reference on the returned Dentry.
+func (fd *FileDescription) Dentry() *Dentry {
+ return fd.vd.dentry
+}
+
+// VirtualDentry returns the location at which fd was opened. It does not take
+// a reference on the returned VirtualDentry.
+func (fd *FileDescription) VirtualDentry() VirtualDentry {
+ return fd.vd
+}
+
+// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
+func (fd *FileDescription) StatusFlags() uint32 {
+ return atomic.LoadUint32(&fd.statusFlags)
+}
+
+// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
+func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error {
+ // Compare Linux's fs/fcntl.c:setfl().
+ oldFlags := fd.StatusFlags()
+ // Linux documents this check as "O_APPEND cannot be cleared if the file is
+ // marked as append-only and the file is open for write", which would make
+ // sense. However, the check as actually implemented seems to be "O_APPEND
+ // cannot be changed if the file is marked as append-only".
+ if (flags^oldFlags)&linux.O_APPEND != 0 {
+ stat, err := fd.Stat(ctx, StatOptions{
+ // There is no mask bit for stx_attributes.
+ Mask: 0,
+ // Linux just reads inode::i_flags directly.
+ Sync: linux.AT_STATX_DONT_SYNC,
+ })
+ if err != nil {
+ return err
+ }
+ if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) {
+ return syserror.EPERM
+ }
+ }
+ if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) {
+ stat, err := fd.Stat(ctx, StatOptions{
+ Mask: linux.STATX_UID,
+ // Linux's inode_owner_or_capable() just reads inode::i_uid
+ // directly.
+ Sync: linux.AT_STATX_DONT_SYNC,
+ })
+ if err != nil {
+ return err
+ }
+ if stat.Mask&linux.STATX_UID == 0 {
+ return syserror.EPERM
+ }
+ if !CanActAsOwner(creds, auth.KUID(stat.UID)) {
+ return syserror.EPERM
+ }
+ }
+ if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO {
+ return syserror.EINVAL
+ }
+ // TODO(jamieliu): FileDescriptionImpl.SetOAsync()?
+ const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK
+ atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags))
+ return nil
+}
+
+// IsReadable returns true if fd was opened for reading.
+func (fd *FileDescription) IsReadable() bool {
+ return fd.readable
+}
+
+// IsWritable returns true if fd was opened for writing.
+func (fd *FileDescription) IsWritable() bool {
+ return fd.writable
+}
+
+// Impl returns the FileDescriptionImpl associated with fd.
+func (fd *FileDescription) Impl() FileDescriptionImpl {
+ return fd.impl
+}
+
// FileDescriptionImpl contains implementation details for an FileDescription.
// Implementations of FileDescriptionImpl should contain their associated
// FileDescription by value as their first field.
@@ -93,6 +290,8 @@ func (fd *FileDescription) DecRef() {
// be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and
// auth.KGID respectively).
//
+// All methods may return errors not specified.
+//
// FileDescriptionImpl is analogous to Linux's struct file_operations.
type FileDescriptionImpl interface {
// Release is called when the associated FileDescription reaches zero
@@ -104,19 +303,12 @@ type FileDescriptionImpl interface {
// prevent the file descriptor from being closed.
OnClose(ctx context.Context) error
- // StatusFlags returns file description status flags, as for
- // fcntl(F_GETFL).
- StatusFlags(ctx context.Context) (uint32, error)
-
- // SetStatusFlags sets file description status flags, as for
- // fcntl(F_SETFL).
- SetStatusFlags(ctx context.Context, flags uint32) error
-
// Stat returns metadata for the file represented by the FileDescription.
Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)
// SetStat updates metadata for the file represented by the
- // FileDescription.
+ // FileDescription. Implementations are responsible for checking if the
+ // operation can be performed (see vfs.CheckSetStat() for common checks).
SetStat(ctx context.Context, opts SetStatOptions) error
// StatFS returns metadata for the filesystem containing the file
@@ -129,6 +321,13 @@ type FileDescriptionImpl interface {
// PRead reads from the file into dst, starting at the given offset, and
// returns the number of bytes read. PRead is permitted to return partial
// reads with a nil error.
+ //
+ // Errors:
+ //
+ // - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
+ //
+ // Preconditions: The FileDescription was opened for reading.
+ // FileDescriptionOptions.DenyPRead == false.
PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
// Read is similar to PRead, but does not specify an offset.
@@ -138,6 +337,12 @@ type FileDescriptionImpl interface {
// the number of bytes read; note that POSIX 2.9.7 "Thread Interactions
// with Regular File Operations" requires that all operations that may
// mutate the FileDescription offset are serialized.
+ //
+ // Errors:
+ //
+ // - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP.
+ //
+ // Preconditions: The FileDescription was opened for reading.
Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error)
// PWrite writes src to the file, starting at the given offset, and returns
@@ -147,6 +352,14 @@ type FileDescriptionImpl interface {
// As in Linux (but not POSIX), if O_APPEND is in effect for the
// FileDescription, PWrite should ignore the offset and append data to the
// end of the file.
+ //
+ // Errors:
+ //
+ // - If opts.Flags specifies unsupported options, PWrite returns
+ // EOPNOTSUPP.
+ //
+ // Preconditions: The FileDescription was opened for writing.
+ // FileDescriptionOptions.DenyPWrite == false.
PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
// Write is similar to PWrite, but does not specify an offset, which is
@@ -156,6 +369,12 @@ type FileDescriptionImpl interface {
// PWrite that uses a FileDescription offset, to make it possible for
// remote filesystems to implement O_APPEND correctly (i.e. atomically with
// respect to writers outside the scope of VFS).
+ //
+ // Errors:
+ //
+ // - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP.
+ //
+ // Preconditions: The FileDescription was opened for writing.
Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error)
// IterDirents invokes cb on each entry in the directory represented by the
@@ -185,7 +404,39 @@ type FileDescriptionImpl interface {
// Ioctl implements the ioctl(2) syscall.
Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
- // TODO: extended attributes; file locking
+ // Listxattr returns all extended attribute names for the file.
+ Listxattr(ctx context.Context, size uint64) ([]string, error)
+
+ // Getxattr returns the value associated with the given extended attribute
+ // for the file.
+ Getxattr(ctx context.Context, opts GetxattrOptions) (string, error)
+
+ // Setxattr changes the value associated with the given extended attribute
+ // for the file.
+ Setxattr(ctx context.Context, opts SetxattrOptions) error
+
+ // Removexattr removes the given extended attribute from the file.
+ Removexattr(ctx context.Context, name string) error
+
+ // LockBSD tries to acquire a BSD-style advisory file lock.
+ //
+ // TODO(gvisor.dev/issue/1480): BSD-style file locking
+ LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error
+
+ // LockBSD releases a BSD-style advisory file lock.
+ //
+ // TODO(gvisor.dev/issue/1480): BSD-style file locking
+ UnlockBSD(ctx context.Context, uid lock.UniqueID) error
+
+ // LockPOSIX tries to acquire a POSIX-style advisory file lock.
+ //
+ // TODO(gvisor.dev/issue/1480): POSIX-style file locking
+ LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error
+
+ // UnlockPOSIX releases a POSIX-style advisory file lock.
+ //
+ // TODO(gvisor.dev/issue/1480): POSIX-style file locking
+ UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error
}
// Dirent holds the information contained in struct linux_dirent64.
@@ -208,9 +459,282 @@ type Dirent struct {
// IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents.
type IterDirentsCallback interface {
- // Handle handles the given iterated Dirent. It returns true if iteration
- // should continue, and false if FileDescriptionImpl.IterDirents should
- // terminate now and restart with the same Dirent the next time it is
- // called.
- Handle(dirent Dirent) bool
+ // Handle handles the given iterated Dirent. If Handle returns a non-nil
+ // error, FileDescriptionImpl.IterDirents must stop iteration and return
+ // the error; the next call to FileDescriptionImpl.IterDirents should
+ // restart with the same Dirent.
+ Handle(dirent Dirent) error
+}
+
+// OnClose is called when a file descriptor representing the FileDescription is
+// closed. Returning a non-nil error should not prevent the file descriptor
+// from being closed.
+func (fd *FileDescription) OnClose(ctx context.Context) error {
+ return fd.impl.OnClose(ctx)
+}
+
+// Stat returns metadata for the file represented by fd.
+func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
+ vfsObj.putResolvingPath(rp)
+ return stat, err
+ }
+ return fd.impl.Stat(ctx, opts)
+}
+
+// SetStat updates metadata for the file represented by fd.
+func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts)
+ vfsObj.putResolvingPath(rp)
+ return err
+ }
+ return fd.impl.SetStat(ctx, opts)
+}
+
+// StatFS returns metadata for the filesystem containing the file represented
+// by fd.
+func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp)
+ vfsObj.putResolvingPath(rp)
+ return statfs, err
+ }
+ return fd.impl.StatFS(ctx)
+}
+
+// Readiness returns fd's I/O readiness.
+func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return fd.impl.Readiness(mask)
+}
+
+// EventRegister registers e for I/O readiness events in mask.
+func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ fd.impl.EventRegister(e, mask)
+}
+
+// EventUnregister unregisters e for I/O readiness events.
+func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
+ fd.impl.EventUnregister(e)
+}
+
+// PRead reads from the file represented by fd into dst, starting at the given
+// offset, and returns the number of bytes read. PRead is permitted to return
+// partial reads with a nil error.
+func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+ if fd.opts.DenyPRead {
+ return 0, syserror.ESPIPE
+ }
+ if !fd.readable {
+ return 0, syserror.EBADF
+ }
+ return fd.impl.PRead(ctx, dst, offset, opts)
+}
+
+// Read is similar to PRead, but does not specify an offset.
+func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+ if !fd.readable {
+ return 0, syserror.EBADF
+ }
+ return fd.impl.Read(ctx, dst, opts)
+}
+
+// PWrite writes src to the file represented by fd, starting at the given
+// offset, and returns the number of bytes written. PWrite is permitted to
+// return partial writes with a nil error.
+func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ if fd.opts.DenyPWrite {
+ return 0, syserror.ESPIPE
+ }
+ if !fd.writable {
+ return 0, syserror.EBADF
+ }
+ return fd.impl.PWrite(ctx, src, offset, opts)
+}
+
+// Write is similar to PWrite, but does not specify an offset.
+func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+ if fd.opts.InvalidWrite {
+ return 0, syserror.EINVAL
+ }
+ if !fd.writable {
+ return 0, syserror.EBADF
+ }
+ return fd.impl.Write(ctx, src, opts)
+}
+
+// IterDirents invokes cb on each entry in the directory represented by fd. If
+// IterDirents has been called since the last call to Seek, it continues
+// iteration from the end of the last call.
+func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error {
+ return fd.impl.IterDirents(ctx, cb)
+}
+
+// Seek changes fd's offset (assuming one exists) and returns its new value.
+func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ return fd.impl.Seek(ctx, offset, whence)
+}
+
+// Sync has the semantics of fsync(2).
+func (fd *FileDescription) Sync(ctx context.Context) error {
+ return fd.impl.Sync(ctx)
+}
+
+// ConfigureMMap mutates opts to implement mmap(2) for the file represented by
+// fd.
+func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ return fd.impl.ConfigureMMap(ctx, opts)
+}
+
+// Ioctl implements the ioctl(2) syscall.
+func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ return fd.impl.Ioctl(ctx, uio, args)
+}
+
+// Listxattr returns all extended attribute names for the file represented by
+// fd.
+//
+// If the size of the list (including a NUL terminating byte after every entry)
+// would exceed size, ERANGE may be returned. Note that implementations
+// are free to ignore size entirely and return without error). In all cases,
+// if size is 0, the list should be returned without error, regardless of size.
+func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size)
+ vfsObj.putResolvingPath(rp)
+ return names, err
+ }
+ names, err := fd.impl.Listxattr(ctx, size)
+ if err == syserror.ENOTSUP {
+ // Linux doesn't actually return ENOTSUP in this case; instead,
+ // fs/xattr.c:vfs_listxattr() falls back to allowing the security
+ // subsystem to return security extended attributes, which by default
+ // don't exist.
+ return nil, nil
+ }
+ return names, err
+}
+
+// Getxattr returns the value associated with the given extended attribute for
+// the file represented by fd.
+//
+// If the size of the return value exceeds opts.Size, ERANGE may be returned
+// (note that implementations are free to ignore opts.Size entirely and return
+// without error). In all cases, if opts.Size is 0, the value should be
+// returned without error, regardless of size.
+func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
+ vfsObj.putResolvingPath(rp)
+ return val, err
+ }
+ return fd.impl.Getxattr(ctx, *opts)
+}
+
+// Setxattr changes the value associated with the given extended attribute for
+// the file represented by fd.
+func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+ vfsObj.putResolvingPath(rp)
+ return err
+ }
+ return fd.impl.Setxattr(ctx, *opts)
+}
+
+// Removexattr removes the given extended attribute from the file represented
+// by fd.
+func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+ vfsObj.putResolvingPath(rp)
+ return err
+ }
+ return fd.impl.Removexattr(ctx, name)
+}
+
+// SyncFS instructs the filesystem containing fd to execute the semantics of
+// syncfs(2).
+func (fd *FileDescription) SyncFS(ctx context.Context) error {
+ return fd.vd.mount.fs.impl.Sync(ctx)
+}
+
+// MappedName implements memmap.MappingIdentity.MappedName.
+func (fd *FileDescription) MappedName(ctx context.Context) string {
+ vfsroot := RootFromContext(ctx)
+ s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd)
+ if vfsroot.Ok() {
+ vfsroot.DecRef()
+ }
+ return s
+}
+
+// DeviceID implements memmap.MappingIdentity.DeviceID.
+func (fd *FileDescription) DeviceID() uint64 {
+ stat, err := fd.Stat(context.Background(), StatOptions{
+ // There is no STATX_DEV; we assume that Stat will return it if it's
+ // available regardless of mask.
+ Mask: 0,
+ // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_sb->s_dev
+ // directly.
+ Sync: linux.AT_STATX_DONT_SYNC,
+ })
+ if err != nil {
+ return 0
+ }
+ return uint64(linux.MakeDeviceID(uint16(stat.DevMajor), stat.DevMinor))
+}
+
+// InodeID implements memmap.MappingIdentity.InodeID.
+func (fd *FileDescription) InodeID() uint64 {
+ stat, err := fd.Stat(context.Background(), StatOptions{
+ Mask: linux.STATX_INO,
+ // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly.
+ Sync: linux.AT_STATX_DONT_SYNC,
+ })
+ if err != nil || stat.Mask&linux.STATX_INO == 0 {
+ return 0
+ }
+ return stat.Ino
+}
+
+// Msync implements memmap.MappingIdentity.Msync.
+func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
+ return fd.Sync(ctx)
}
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 4fbad7840..f4c111926 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -17,14 +17,15 @@ package vfs
import (
"bytes"
"io"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -32,8 +33,8 @@ import (
// implementations to adapt:
// - Have a local fileDescription struct (containing FileDescription) which
// embeds FileDescriptionDefaultImpl and overrides the default methods
-// which are common to all fd implementations for that for that filesystem
-// like StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc.
+// which are common to all fd implementations for that filesystem like
+// StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc.
// - This should be embedded in all file description implementations as the
// first field by value.
// - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl.
@@ -127,6 +128,51 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
return 0, syserror.ENOTTY
}
+// Listxattr implements FileDescriptionImpl.Listxattr analogously to
+// inode_operations::listxattr == NULL in Linux.
+func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+ // This isn't exactly accurate; see FileDescription.Listxattr.
+ return nil, syserror.ENOTSUP
+}
+
+// Getxattr implements FileDescriptionImpl.Getxattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) {
+ return "", syserror.ENOTSUP
+}
+
+// Setxattr implements FileDescriptionImpl.Setxattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+ return syserror.ENOTSUP
+}
+
+// Removexattr implements FileDescriptionImpl.Removexattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error {
+ return syserror.ENOTSUP
+}
+
+// LockBSD implements FileDescriptionImpl.LockBSD.
+func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
+ return syserror.EBADF
+}
+
+// UnlockBSD implements FileDescriptionImpl.UnlockBSD.
+func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
+ return syserror.EBADF
+}
+
+// LockPOSIX implements FileDescriptionImpl.LockPOSIX.
+func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
+ return syserror.EBADF
+}
+
+// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX.
+func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
+ return syserror.EBADF
+}
+
// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
// FileDescriptionImpl that always represent directories to obtain
// implementations of non-directory I/O methods that return EISDIR.
@@ -152,6 +198,48 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme
return 0, syserror.EISDIR
}
+// DentryMetadataFileDescriptionImpl may be embedded by implementations of
+// FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is
+// true to obtain implementations of Stat and SetStat that panic.
+type DentryMetadataFileDescriptionImpl struct{}
+
+// Stat implements FileDescriptionImpl.Stat.
+func (DentryMetadataFileDescriptionImpl) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+ panic("illegal call to DentryMetadataFileDescriptionImpl.Stat")
+}
+
+// SetStat implements FileDescriptionImpl.SetStat.
+func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetStatOptions) error {
+ panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat")
+}
+
+// DynamicBytesSource represents a data source for a
+// DynamicBytesFileDescriptionImpl.
+type DynamicBytesSource interface {
+ // Generate writes the file's contents to buf.
+ Generate(ctx context.Context, buf *bytes.Buffer) error
+}
+
+// StaticData implements DynamicBytesSource over a static string.
+type StaticData struct {
+ Data string
+}
+
+// Generate implements DynamicBytesSource.
+func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ buf.WriteString(s.Data)
+ return nil
+}
+
+// WritableDynamicBytesSource extends DynamicBytesSource to allow writes to the
+// underlying source.
+type WritableDynamicBytesSource interface {
+ DynamicBytesSource
+
+ // Write sends writes to the source.
+ Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error)
+}
+
// DynamicBytesFileDescriptionImpl may be embedded by implementations of
// FileDescriptionImpl that represent read-only regular files whose contents
// are backed by a bytes.Buffer that is regenerated when necessary, consistent
@@ -167,13 +255,6 @@ type DynamicBytesFileDescriptionImpl struct {
lastRead int64 // offset at which the last Read, PRead, or Seek ended
}
-// DynamicBytesSource represents a data source for a
-// DynamicBytesFileDescriptionImpl.
-type DynamicBytesSource interface {
- // Generate writes the file's contents to buf.
- Generate(ctx context.Context, buf *bytes.Buffer) error
-}
-
// SetDataSource must be called exactly once on fd before first use.
func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) {
fd.data = data
@@ -252,3 +333,54 @@ func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int6
fd.off = offset
return offset, nil
}
+
+// Preconditions: fd.mu must be locked.
+func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+ return 0, syserror.EOPNOTSUPP
+ }
+ limit, err := CheckLimit(ctx, offset, src.NumBytes())
+ if err != nil {
+ return 0, err
+ }
+ src = src.TakeFirst64(limit)
+
+ writable, ok := fd.data.(WritableDynamicBytesSource)
+ if !ok {
+ return 0, syserror.EINVAL
+ }
+ n, err := writable.Write(ctx, src, offset)
+ if err != nil {
+ return 0, err
+ }
+
+ // Invalidate cached data that might exist prior to this call.
+ fd.buf.Reset()
+ return n, nil
+}
+
+// PWrite implements FileDescriptionImpl.PWrite.
+func (fd *DynamicBytesFileDescriptionImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ fd.mu.Lock()
+ n, err := fd.pwriteLocked(ctx, src, offset, opts)
+ fd.mu.Unlock()
+ return n, err
+}
+
+// Write implements FileDescriptionImpl.Write.
+func (fd *DynamicBytesFileDescriptionImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+ fd.mu.Lock()
+ n, err := fd.pwriteLocked(ctx, src, fd.off, opts)
+ fd.off += n
+ fd.mu.Unlock()
+ return n, err
+}
+
+// GenericConfigureMMap may be used by most implementations of
+// FileDescriptionImpl.ConfigureMMap.
+func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error {
+ opts.Mappable = m
+ opts.MappingIdentity = fd
+ fd.IncRef()
+ return nil
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 511b829fc..3a75d4d62 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -22,11 +22,10 @@ import (
"testing"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// fileDescription is the common fd struct which a filesystem implementation
@@ -36,74 +35,89 @@ type fileDescription struct {
FileDescriptionDefaultImpl
}
-// genCountFD is a read-only FileDescriptionImpl representing a regular file
-// that contains the number of times its DynamicBytesSource.Generate()
+// genCount contains the number of times its DynamicBytesSource.Generate()
// implementation has been called.
-type genCountFD struct {
- fileDescription
- DynamicBytesFileDescriptionImpl
-
+type genCount struct {
count uint64 // accessed using atomic memory ops
}
-func newGenCountFD(mnt *Mount, vfsd *Dentry) *FileDescription {
- var fd genCountFD
- fd.vfsfd.Init(&fd, mnt, vfsd)
- fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd)
- return &fd.vfsfd
+// Generate implements DynamicBytesSource.Generate.
+func (g *genCount) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ fmt.Fprintf(buf, "%d", atomic.AddUint64(&g.count, 1))
+ return nil
}
-// Release implements FileDescriptionImpl.Release.
-func (fd *genCountFD) Release() {
+type storeData struct {
+ data string
}
-// StatusFlags implements FileDescriptionImpl.StatusFlags.
-func (fd *genCountFD) StatusFlags(ctx context.Context) (uint32, error) {
+var _ WritableDynamicBytesSource = (*storeData)(nil)
+
+// Generate implements DynamicBytesSource.
+func (d *storeData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ buf.WriteString(d.data)
+ return nil
+}
+
+// Generate implements WritableDynamicBytesSource.
+func (d *storeData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+ buf := make([]byte, src.NumBytes())
+ n, err := src.CopyIn(ctx, buf)
+ if err != nil {
+ return 0, err
+ }
+
+ d.data = string(buf[:n])
return 0, nil
}
-// SetStatusFlags implements FileDescriptionImpl.SetStatusFlags.
-func (fd *genCountFD) SetStatusFlags(ctx context.Context, flags uint32) error {
- return syserror.EPERM
+// testFD is a read-only FileDescriptionImpl representing a regular file.
+type testFD struct {
+ fileDescription
+ DynamicBytesFileDescriptionImpl
+
+ data DynamicBytesSource
+}
+
+func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription {
+ vd := vfsObj.NewAnonVirtualDentry("genCountFD")
+ defer vd.DecRef()
+ var fd testFD
+ fd.vfsfd.Init(&fd, statusFlags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{})
+ fd.DynamicBytesFileDescriptionImpl.SetDataSource(data)
+ return &fd.vfsfd
}
+// Release implements FileDescriptionImpl.Release.
+func (fd *testFD) Release() {
+}
+
+// SetStatusFlags implements FileDescriptionImpl.SetStatusFlags.
// Stat implements FileDescriptionImpl.Stat.
-func (fd *genCountFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+func (fd *testFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
// Note that Statx.Mask == 0 in the return value.
return linux.Statx{}, nil
}
// SetStat implements FileDescriptionImpl.SetStat.
-func (fd *genCountFD) SetStat(ctx context.Context, opts SetStatOptions) error {
+func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error {
return syserror.EPERM
}
-// Generate implements DynamicBytesSource.Generate.
-func (fd *genCountFD) Generate(ctx context.Context, buf *bytes.Buffer) error {
- fmt.Fprintf(buf, "%d", atomic.AddUint64(&fd.count, 1))
- return nil
-}
-
func TestGenCountFD(t *testing.T) {
ctx := contexttest.Context(t)
- creds := auth.CredentialsFromContext(ctx)
- vfsObj := New() // vfs.New()
- vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &NewFilesystemOptions{})
- if err != nil {
- t.Fatalf("failed to create testfs root mount: %v", err)
+ vfsObj := &VirtualFilesystem{}
+ if err := vfsObj.Init(); err != nil {
+ t.Fatalf("VFS init: %v", err)
}
- vd := mntns.Root()
- defer vd.DecRef()
-
- fd := newGenCountFD(vd.Mount(), vd.Dentry())
+ fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{})
defer fd.DecRef()
// The first read causes Generate to be called to fill the FD's buffer.
buf := make([]byte, 2)
ioseq := usermem.BytesIOSequence(buf)
- n, err := fd.Impl().Read(ctx, ioseq, ReadOptions{})
+ n, err := fd.Read(ctx, ioseq, ReadOptions{})
if n != 1 || (err != nil && err != io.EOF) {
t.Fatalf("first Read: got (%d, %v), wanted (1, nil or EOF)", n, err)
}
@@ -112,17 +126,17 @@ func TestGenCountFD(t *testing.T) {
}
// A second read without seeking is still at EOF.
- n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{})
+ n, err = fd.Read(ctx, ioseq, ReadOptions{})
if n != 0 || err != io.EOF {
t.Fatalf("second Read: got (%d, %v), wanted (0, EOF)", n, err)
}
// Seeking to the beginning of the file causes it to be regenerated.
- n, err = fd.Impl().Seek(ctx, 0, linux.SEEK_SET)
+ n, err = fd.Seek(ctx, 0, linux.SEEK_SET)
if n != 0 || err != nil {
t.Fatalf("Seek: got (%d, %v), wanted (0, nil)", n, err)
}
- n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{})
+ n, err = fd.Read(ctx, ioseq, ReadOptions{})
if n != 1 || (err != nil && err != io.EOF) {
t.Fatalf("Read after Seek: got (%d, %v), wanted (1, nil or EOF)", n, err)
}
@@ -131,11 +145,79 @@ func TestGenCountFD(t *testing.T) {
}
// PRead at the beginning of the file also causes it to be regenerated.
- n, err = fd.Impl().PRead(ctx, ioseq, 0, ReadOptions{})
+ n, err = fd.PRead(ctx, ioseq, 0, ReadOptions{})
if n != 1 || (err != nil && err != io.EOF) {
t.Fatalf("PRead: got (%d, %v), wanted (1, nil or EOF)", n, err)
}
if want := byte('3'); buf[0] != want {
t.Errorf("PRead: got byte %c, wanted %c", buf[0], want)
}
+
+ // Write and PWrite fails.
+ if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EINVAL {
+ t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL)
+ }
+ if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EINVAL {
+ t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL)
+ }
+}
+
+func TestWritable(t *testing.T) {
+ ctx := contexttest.Context(t)
+
+ vfsObj := &VirtualFilesystem{}
+ if err := vfsObj.Init(); err != nil {
+ t.Fatalf("VFS init: %v", err)
+ }
+ fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"})
+ defer fd.DecRef()
+
+ buf := make([]byte, 10)
+ ioseq := usermem.BytesIOSequence(buf)
+ if n, err := fd.Read(ctx, ioseq, ReadOptions{}); n != 4 && err != io.EOF {
+ t.Fatalf("Read: got (%v, %v), wanted (4, EOF)", n, err)
+ }
+ if want := "init"; want == string(buf) {
+ t.Fatalf("Read: got %v, wanted %v", string(buf), want)
+ }
+
+ // Test PWrite.
+ want := "write"
+ writeIOSeq := usermem.BytesIOSequence([]byte(want))
+ if n, err := fd.PWrite(ctx, writeIOSeq, 0, WriteOptions{}); int(n) != len(want) && err != nil {
+ t.Errorf("PWrite: got err (%v, %v), wanted (%v, nil)", n, err, len(want))
+ }
+ if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF {
+ t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want))
+ }
+ if want == string(buf) {
+ t.Fatalf("PRead: got %v, wanted %v", string(buf), want)
+ }
+
+ // Test Seek to 0 followed by Write.
+ want = "write2"
+ writeIOSeq = usermem.BytesIOSequence([]byte(want))
+ if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 && err != nil {
+ t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err)
+ }
+ if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); int(n) != len(want) && err != nil {
+ t.Errorf("Write: got err (%v, %v), wanted (%v, nil)", n, err, len(want))
+ }
+ if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF {
+ t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want))
+ }
+ if want == string(buf) {
+ t.Fatalf("PRead: got %v, wanted %v", string(buf), want)
+ }
+
+ // Test failure if offset != 0.
+ if n, err := fd.Seek(ctx, 1, linux.SEEK_SET); n != 0 && err != nil {
+ t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err)
+ }
+ if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); n != 0 && err != syserror.EINVAL {
+ t.Errorf("Write: got err (%v, %v), wanted (0, EINVAL)", n, err)
+ }
+ if n, err := fd.PWrite(ctx, writeIOSeq, 2, WriteOptions{}); n != 0 && err != syserror.EINVAL {
+ t.Errorf("PWrite: got err (%v, %v), wanted (0, EINVAL)", n, err)
+ }
}
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 7a074b718..20e5bb072 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -18,7 +18,10 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
)
// A Filesystem is a tree of nodes represented by Dentries, which forms part of
@@ -28,20 +31,44 @@ import (
// Filesystem methods require that a reference is held.
//
// Filesystem is analogous to Linux's struct super_block.
+//
+// +stateify savable
type Filesystem struct {
// refs is the reference count. refs is accessed using atomic memory
// operations.
refs int64
+ // vfs is the VirtualFilesystem that uses this Filesystem. vfs is
+ // immutable.
+ vfs *VirtualFilesystem
+
+ // fsType is the FilesystemType of this Filesystem.
+ fsType FilesystemType
+
// impl is the FilesystemImpl associated with this Filesystem. impl is
// immutable. This should be the last field in Dentry.
impl FilesystemImpl
}
// Init must be called before first use of fs.
-func (fs *Filesystem) Init(impl FilesystemImpl) {
+func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) {
fs.refs = 1
+ fs.vfs = vfsObj
+ fs.fsType = fsType
fs.impl = impl
+ vfsObj.filesystemsMu.Lock()
+ vfsObj.filesystems[fs] = struct{}{}
+ vfsObj.filesystemsMu.Unlock()
+}
+
+// FilesystemType returns the FilesystemType for this Filesystem.
+func (fs *Filesystem) FilesystemType() FilesystemType {
+ return fs.fsType
+}
+
+// VirtualFilesystem returns the containing VirtualFilesystem.
+func (fs *Filesystem) VirtualFilesystem() *VirtualFilesystem {
+ return fs.vfs
}
// Impl returns the FilesystemImpl associated with fs.
@@ -49,14 +76,35 @@ func (fs *Filesystem) Impl() FilesystemImpl {
return fs.impl
}
-func (fs *Filesystem) incRef() {
+// IncRef increments fs' reference count.
+func (fs *Filesystem) IncRef() {
if atomic.AddInt64(&fs.refs, 1) <= 1 {
- panic("Filesystem.incRef() called without holding a reference")
+ panic("Filesystem.IncRef() called without holding a reference")
+ }
+}
+
+// TryIncRef increments fs' reference count and returns true. If fs' reference
+// count is zero, TryIncRef does nothing and returns false.
+//
+// TryIncRef does not require that a reference is held on fs.
+func (fs *Filesystem) TryIncRef() bool {
+ for {
+ refs := atomic.LoadInt64(&fs.refs)
+ if refs <= 0 {
+ return false
+ }
+ if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) {
+ return true
+ }
}
}
-func (fs *Filesystem) decRef() {
+// DecRef decrements fs' reference count.
+func (fs *Filesystem) DecRef() {
if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 {
+ fs.vfs.filesystemsMu.Lock()
+ delete(fs.vfs.filesystems, fs)
+ fs.vfs.filesystemsMu.Unlock()
fs.impl.Release()
} else if refs < 0 {
panic("Filesystem.decRef() called without holding a reference")
@@ -73,6 +121,24 @@ func (fs *Filesystem) decRef() {
// (responsible for actually implementing the operation) isn't known until path
// resolution is complete.
//
+// Unless otherwise specified, FilesystemImpl methods are responsible for
+// performing permission checks. In many cases, vfs package functions in
+// permissions.go may be used to help perform these checks.
+//
+// When multiple specified error conditions apply to a given method call, the
+// implementation may return any applicable errno unless otherwise specified,
+// but returning the earliest error specified is preferable to maximize
+// compatibility with Linux.
+//
+// All methods may return errors not specified, notably including:
+//
+// - ENOENT if a required path component does not exist.
+//
+// - ENOTDIR if an intermediate path component is not a directory.
+//
+// - Errors from vfs-package functions (ResolvingPath.Resolve*(),
+// Mount.CheckBeginWrite(), permission-checking functions, etc.)
+//
// For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid
// should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID
// and auth.KGID respectively).
@@ -89,52 +155,237 @@ type FilesystemImpl interface {
// file data to be written to the underlying [filesystem]", as by syncfs(2).
Sync(ctx context.Context) error
+ // AccessAt checks whether a user with creds can access the file at rp.
+ AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error
+
// GetDentryAt returns a Dentry representing the file at rp. A reference is
// taken on the returned Dentry.
//
// GetDentryAt does not correspond directly to a Linux syscall; it is used
// in the implementation of:
//
- // - Syscalls that need to resolve two paths: rename(), renameat(),
- // renameat2(), link(), linkat().
+ // - Syscalls that need to resolve two paths: link(), linkat().
//
// - Syscalls that need to refer to a filesystem position outside the
// context of a file description: chdir(), fchdir(), chroot(), mount(),
// umount().
GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error)
+ // GetParentDentryAt returns a Dentry representing the directory at the
+ // second-to-last path component in rp. (Note that, despite the name, this
+ // is not necessarily the parent directory of the file at rp, since the
+ // last path component in rp may be "." or "..".) A reference is taken on
+ // the returned Dentry.
+ //
+ // GetParentDentryAt does not correspond directly to a Linux syscall; it is
+ // used in the implementation of the rename() family of syscalls, which
+ // must resolve the parent directories of two paths.
+ //
+ // Preconditions: !rp.Done().
+ //
+ // Postconditions: If GetParentDentryAt returns a nil error, then
+ // rp.Final(). If GetParentDentryAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
+ GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error)
+
// LinkAt creates a hard link at rp representing the same file as vd. It
// does not take ownership of references on vd.
//
- // The implementation is responsible for checking that vd.Mount() ==
- // rp.Mount(), and that vd does not represent a directory.
+ // Errors:
+ //
+ // - If the last path component in rp is "." or "..", LinkAt returns
+ // EEXIST.
+ //
+ // - If a file already exists at rp, LinkAt returns EEXIST.
+ //
+ // - If rp.MustBeDir(), LinkAt returns ENOENT.
+ //
+ // - If the directory in which the link would be created has been removed
+ // by RmdirAt or RenameAt, LinkAt returns ENOENT.
+ //
+ // - If rp.Mount != vd.Mount(), LinkAt returns EXDEV.
+ //
+ // - If vd represents a directory, LinkAt returns EPERM.
+ //
+ // - If vd represents a file for which all existing links have been
+ // removed, or a file created by open(O_TMPFILE|O_EXCL), LinkAt returns
+ // ENOENT. Equivalently, if vd represents a file with a link count of 0 not
+ // created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT.
+ //
+ // Preconditions: !rp.Done(). For the final path component in rp,
+ // !rp.ShouldFollowSymlink().
+ //
+ // Postconditions: If LinkAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error
// MkdirAt creates a directory at rp.
+ //
+ // Errors:
+ //
+ // - If the last path component in rp is "." or "..", MkdirAt returns
+ // EEXIST.
+ //
+ // - If a file already exists at rp, MkdirAt returns EEXIST.
+ //
+ // - If the directory in which the new directory would be created has been
+ // removed by RmdirAt or RenameAt, MkdirAt returns ENOENT.
+ //
+ // Preconditions: !rp.Done(). For the final path component in rp,
+ // !rp.ShouldFollowSymlink().
+ //
+ // Postconditions: If MkdirAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error
// MknodAt creates a regular file, device special file, or named pipe at
// rp.
+ //
+ // Errors:
+ //
+ // - If the last path component in rp is "." or "..", MknodAt returns
+ // EEXIST.
+ //
+ // - If a file already exists at rp, MknodAt returns EEXIST.
+ //
+ // - If rp.MustBeDir(), MknodAt returns ENOENT.
+ //
+ // - If the directory in which the file would be created has been removed
+ // by RmdirAt or RenameAt, MknodAt returns ENOENT.
+ //
+ // Preconditions: !rp.Done(). For the final path component in rp,
+ // !rp.ShouldFollowSymlink().
+ //
+ // Postconditions: If MknodAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error
// OpenAt returns an FileDescription providing access to the file at rp. A
// reference is taken on the returned FileDescription.
+ //
+ // Errors:
+ //
+ // - If opts.Flags specifies O_TMPFILE and this feature is unsupported by
+ // the implementation, OpenAt returns EOPNOTSUPP. (All other unsupported
+ // features are silently ignored, consistently with Linux's open*(2).)
OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error)
// ReadlinkAt returns the target of the symbolic link at rp.
+ //
+ // Errors:
+ //
+ // - If the file at rp is not a symbolic link, ReadlinkAt returns EINVAL.
ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error)
- // RenameAt renames the Dentry represented by vd to rp. It does not take
- // ownership of references on vd.
+ // RenameAt renames the file named oldName in directory oldParentVD to rp.
+ // It does not take ownership of references on oldParentVD.
+ //
+ // Errors [1]:
+ //
+ // - If opts.Flags specifies unsupported options, RenameAt returns EINVAL.
+ //
+ // - If the last path component in rp is "." or "..", and opts.Flags
+ // contains RENAME_NOREPLACE, RenameAt returns EEXIST.
+ //
+ // - If the last path component in rp is "." or "..", and opts.Flags does
+ // not contain RENAME_NOREPLACE, RenameAt returns EBUSY.
+ //
+ // - If rp.Mount != oldParentVD.Mount(), RenameAt returns EXDEV.
+ //
+ // - If the renamed file is not a directory, and opts.MustBeDir is true,
+ // RenameAt returns ENOTDIR.
+ //
+ // - If renaming would replace an existing file and opts.Flags contains
+ // RENAME_NOREPLACE, RenameAt returns EEXIST.
+ //
+ // - If there is no existing file at rp and opts.Flags contains
+ // RENAME_EXCHANGE, RenameAt returns ENOENT.
+ //
+ // - If there is an existing non-directory file at rp, and rp.MustBeDir()
+ // is true, RenameAt returns ENOTDIR.
+ //
+ // - If the renamed file is not a directory, opts.Flags does not contain
+ // RENAME_EXCHANGE, and rp.MustBeDir() is true, RenameAt returns ENOTDIR.
+ // (This check is not subsumed by the check for directory replacement below
+ // since it applies even if there is no file to replace.)
+ //
+ // - If the renamed file is a directory, and the new parent directory of
+ // the renamed file is either the renamed directory or a descendant
+ // subdirectory of the renamed directory, RenameAt returns EINVAL.
+ //
+ // - If renaming would exchange the renamed file with an ancestor directory
+ // of the renamed file, RenameAt returns EINVAL.
+ //
+ // - If renaming would replace an ancestor directory of the renamed file,
+ // RenameAt returns ENOTEMPTY. (This check would be subsumed by the
+ // non-empty directory check below; however, this check takes place before
+ // the self-rename check.)
+ //
+ // - If the renamed file would replace or exchange with itself (i.e. the
+ // source and destination paths resolve to the same file), RenameAt returns
+ // nil, skipping the checks described below.
+ //
+ // - If the source or destination directory is not writable by the provider
+ // of rp.Credentials(), RenameAt returns EACCES.
+ //
+ // - If the renamed file is a directory, and renaming would replace a
+ // non-directory file, RenameAt returns ENOTDIR.
//
- // The implementation is responsible for checking that vd.Mount() ==
- // rp.Mount().
- RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error
+ // - If the renamed file is not a directory, and renaming would replace a
+ // directory, RenameAt returns EISDIR.
+ //
+ // - If the new parent directory of the renamed file has been removed by
+ // RmdirAt or a preceding call to RenameAt, RenameAt returns ENOENT.
+ //
+ // - If the renamed file is a directory, it is not writable by the
+ // provider of rp.Credentials(), and the source and destination parent
+ // directories are different, RenameAt returns EACCES. (This is nominally
+ // required to change the ".." entry in the renamed directory.)
+ //
+ // - If renaming would replace a non-empty directory, RenameAt returns
+ // ENOTEMPTY.
+ //
+ // Preconditions: !rp.Done(). For the final path component in rp,
+ // !rp.ShouldFollowSymlink(). oldParentVD.Dentry() was obtained from a
+ // previous call to
+ // oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). oldName is
+ // not "." or "..".
+ //
+ // Postconditions: If RenameAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
+ //
+ // [1] "The worst of all namespace operations - renaming directory.
+ // "Perverted" doesn't even start to describe it. Somebody in UCB had a
+ // heck of a trip..." - fs/namei.c:vfs_rename()
+ RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error
// RmdirAt removes the directory at rp.
+ //
+ // Errors:
+ //
+ // - If the last path component in rp is ".", RmdirAt returns EINVAL.
+ //
+ // - If the last path component in rp is "..", RmdirAt returns ENOTEMPTY.
+ //
+ // - If no file exists at rp, RmdirAt returns ENOENT.
+ //
+ // - If the file at rp exists but is not a directory, RmdirAt returns
+ // ENOTDIR.
+ //
+ // Preconditions: !rp.Done(). For the final path component in rp,
+ // !rp.ShouldFollowSymlink().
+ //
+ // Postconditions: If RmdirAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
RmdirAt(ctx context.Context, rp *ResolvingPath) error
- // SetStatAt updates metadata for the file at the given path.
+ // SetStatAt updates metadata for the file at the given path. Implementations
+ // are responsible for checking if the operation can be performed
+ // (see vfs.CheckSetStat() for common checks).
+ //
+ // Errors:
+ //
+ // - If opts specifies unsupported options, SetStatAt returns EINVAL.
SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error
// StatAt returns metadata for the file at rp.
@@ -146,10 +397,156 @@ type FilesystemImpl interface {
StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error)
// SymlinkAt creates a symbolic link at rp referring to the given target.
+ //
+ // Errors:
+ //
+ // - If the last path component in rp is "." or "..", SymlinkAt returns
+ // EEXIST.
+ //
+ // - If a file already exists at rp, SymlinkAt returns EEXIST.
+ //
+ // - If rp.MustBeDir(), SymlinkAt returns ENOENT.
+ //
+ // - If the directory in which the symbolic link would be created has been
+ // removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT.
+ //
+ // Preconditions: !rp.Done(). For the final path component in rp,
+ // !rp.ShouldFollowSymlink().
+ //
+ // Postconditions: If SymlinkAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error
- // UnlinkAt removes the non-directory file at rp.
+ // UnlinkAt removes the file at rp.
+ //
+ // Errors:
+ //
+ // - If the last path component in rp is "." or "..", UnlinkAt returns
+ // EISDIR.
+ //
+ // - If no file exists at rp, UnlinkAt returns ENOENT.
+ //
+ // - If rp.MustBeDir(), and the file at rp exists and is not a directory,
+ // UnlinkAt returns ENOTDIR.
+ //
+ // - If the file at rp exists but is a directory, UnlinkAt returns EISDIR.
+ //
+ // Preconditions: !rp.Done(). For the final path component in rp,
+ // !rp.ShouldFollowSymlink().
+ //
+ // Postconditions: If UnlinkAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
UnlinkAt(ctx context.Context, rp *ResolvingPath) error
- // TODO: d_path(); extended attributes; inotify_add_watch(); bind()
+ // ListxattrAt returns all extended attribute names for the file at rp.
+ //
+ // Errors:
+ //
+ // - If extended attributes are not supported by the filesystem,
+ // ListxattrAt returns ENOTSUP.
+ //
+ // - If the size of the list (including a NUL terminating byte after every
+ // entry) would exceed size, ERANGE may be returned. Note that
+ // implementations are free to ignore size entirely and return without
+ // error). In all cases, if size is 0, the list should be returned without
+ // error, regardless of size.
+ ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
+
+ // GetxattrAt returns the value associated with the given extended
+ // attribute for the file at rp.
+ //
+ // Errors:
+ //
+ // - If extended attributes are not supported by the filesystem, GetxattrAt
+ // returns ENOTSUP.
+ //
+ // - If an extended attribute named opts.Name does not exist, ENODATA is
+ // returned.
+ //
+ // - If the size of the return value exceeds opts.Size, ERANGE may be
+ // returned (note that implementations are free to ignore opts.Size entirely
+ // and return without error). In all cases, if opts.Size is 0, the value
+ // should be returned without error, regardless of size.
+ GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error)
+
+ // SetxattrAt changes the value associated with the given extended
+ // attribute for the file at rp.
+ //
+ // Errors:
+ //
+ // - If extended attributes are not supported by the filesystem, SetxattrAt
+ // returns ENOTSUP.
+ //
+ // - If XATTR_CREATE is set in opts.Flag and opts.Name already exists,
+ // EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist,
+ // ENODATA is returned.
+ SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
+
+ // RemovexattrAt removes the given extended attribute from the file at rp.
+ //
+ // Errors:
+ //
+ // - If extended attributes are not supported by the filesystem,
+ // RemovexattrAt returns ENOTSUP.
+ //
+ // - If name does not exist, ENODATA is returned.
+ RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
+
+ // BoundEndpointAt returns the Unix socket endpoint bound at the path rp.
+ //
+ // - If a non-socket file exists at rp, then BoundEndpointAt returns ECONNREFUSED.
+ BoundEndpointAt(ctx context.Context, rp *ResolvingPath) (transport.BoundEndpoint, error)
+
+ // PrependPath prepends a path from vd to vd.Mount().Root() to b.
+ //
+ // If vfsroot.Ok(), it is the contextual VFS root; if it is encountered
+ // before vd.Mount().Root(), PrependPath should stop prepending path
+ // components and return a PrependPathAtVFSRootError.
+ //
+ // If traversal of vd.Dentry()'s ancestors encounters an independent
+ // ("root") Dentry that is not vd.Mount().Root() (i.e. vd.Dentry() is not a
+ // descendant of vd.Mount().Root()), PrependPath should stop prepending
+ // path components and return a PrependPathAtNonMountRootError.
+ //
+ // Filesystems for which Dentries do not have meaningful paths may prepend
+ // an arbitrary descriptive string to b and then return a
+ // PrependPathSyntheticError.
+ //
+ // Most implementations can acquire the appropriate locks to ensure that
+ // Dentry.Name() and Dentry.Parent() are fixed for vd.Dentry() and all of
+ // its ancestors, then call GenericPrependPath.
+ //
+ // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
+ PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
+
+ // TODO(gvisor.dev/issue/1479): inotify_add_watch()
+}
+
+// PrependPathAtVFSRootError is returned by implementations of
+// FilesystemImpl.PrependPath() when they encounter the contextual VFS root.
+type PrependPathAtVFSRootError struct{}
+
+// Error implements error.Error.
+func (PrependPathAtVFSRootError) Error() string {
+ return "vfs.FilesystemImpl.PrependPath() reached VFS root"
+}
+
+// PrependPathAtNonMountRootError is returned by implementations of
+// FilesystemImpl.PrependPath() when they encounter an independent ancestor
+// Dentry that is not the Mount root.
+type PrependPathAtNonMountRootError struct{}
+
+// Error implements error.Error.
+func (PrependPathAtNonMountRootError) Error() string {
+ return "vfs.FilesystemImpl.PrependPath() reached root other than Mount root"
+}
+
+// PrependPathSyntheticError is returned by implementations of
+// FilesystemImpl.PrependPath() for which prepended names do not represent real
+// paths.
+type PrependPathSyntheticError struct{}
+
+// Error implements error.Error.
+func (PrependPathSyntheticError) Error() string {
+ return "vfs.FilesystemImpl.PrependPath() prepended synthetic name"
}
diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go
new file mode 100644
index 000000000..465e610e0
--- /dev/null
+++ b/pkg/sentry/vfs/filesystem_impl_util.go
@@ -0,0 +1,43 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "strings"
+)
+
+// GenericParseMountOptions parses a comma-separated list of options of the
+// form "key" or "key=value", where neither key nor value contain commas, and
+// returns it as a map. If str contains duplicate keys, then the last value
+// wins. For example:
+//
+// str = "key0=value0,key1,key2=value2,key0=value3" -> map{'key0':'value3','key1':'','key2':'value2'}
+//
+// GenericParseMountOptions is not appropriate if values may contain commas,
+// e.g. in the case of the mpol mount option for tmpfs(5).
+func GenericParseMountOptions(str string) map[string]string {
+ m := make(map[string]string)
+ for _, opt := range strings.Split(str, ",") {
+ if len(opt) > 0 {
+ res := strings.SplitN(opt, "=", 2)
+ if len(res) == 2 {
+ m[res[0]] = res[1]
+ } else {
+ m[opt] = ""
+ }
+ }
+ }
+ return m
+}
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index f401ad7f3..f2298f7f6 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -15,9 +15,10 @@
package vfs
import (
+ "bytes"
"fmt"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)
@@ -25,46 +26,92 @@ import (
//
// FilesystemType is analogous to Linux's struct file_system_type.
type FilesystemType interface {
- // NewFilesystem returns a Filesystem configured by the given options,
+ // GetFilesystem returns a Filesystem configured by the given options,
// along with its mount root. A reference is taken on the returned
// Filesystem and Dentry.
- NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error)
+ GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error)
+
+ // Name returns the name of this FilesystemType.
+ Name() string
}
-// NewFilesystemOptions contains options to FilesystemType.NewFilesystem.
-type NewFilesystemOptions struct {
+// GetFilesystemOptions contains options to FilesystemType.GetFilesystem.
+type GetFilesystemOptions struct {
// Data is the string passed as the 5th argument to mount(2), which is
// usually a comma-separated list of filesystem-specific mount options.
Data string
// InternalData holds opaque FilesystemType-specific data. There is
// intentionally no way for applications to specify InternalData; if it is
- // not nil, the call to NewFilesystem originates from within the sentry.
+ // not nil, the call to GetFilesystem originates from within the sentry.
InternalData interface{}
}
+// +stateify savable
+type registeredFilesystemType struct {
+ fsType FilesystemType
+ opts RegisterFilesystemTypeOptions
+}
+
+// RegisterFilesystemTypeOptions contains options to
+// VirtualFilesystem.RegisterFilesystem().
+type RegisterFilesystemTypeOptions struct {
+ // If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt()
+ // for which MountOptions.InternalMount == false to use this filesystem
+ // type.
+ AllowUserMount bool
+
+ // If AllowUserList is true, make this filesystem type visible in
+ // /proc/filesystems.
+ AllowUserList bool
+
+ // If RequiresDevice is true, indicate that mounting this filesystem
+ // requires a block device as the mount source in /proc/filesystems.
+ RequiresDevice bool
+}
+
// RegisterFilesystemType registers the given FilesystemType in vfs with the
// given name.
-func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType) error {
+func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) error {
vfs.fsTypesMu.Lock()
defer vfs.fsTypesMu.Unlock()
if existing, ok := vfs.fsTypes[name]; ok {
- return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing)
+ return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing.fsType)
+ }
+ vfs.fsTypes[name] = &registeredFilesystemType{
+ fsType: fsType,
+ opts: *opts,
}
- vfs.fsTypes[name] = fsType
return nil
}
// MustRegisterFilesystemType is equivalent to RegisterFilesystemType but
// panics on failure.
-func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType) {
- if err := vfs.RegisterFilesystemType(name, fsType); err != nil {
+func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) {
+ if err := vfs.RegisterFilesystemType(name, fsType, opts); err != nil {
panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err))
}
}
-func (vfs *VirtualFilesystem) getFilesystemType(name string) FilesystemType {
+func (vfs *VirtualFilesystem) getFilesystemType(name string) *registeredFilesystemType {
vfs.fsTypesMu.RLock()
defer vfs.fsTypesMu.RUnlock()
return vfs.fsTypes[name]
}
+
+// GenerateProcFilesystems emits the contents of /proc/filesystems for vfs to
+// buf.
+func (vfs *VirtualFilesystem) GenerateProcFilesystems(buf *bytes.Buffer) {
+ vfs.fsTypesMu.RLock()
+ defer vfs.fsTypesMu.RUnlock()
+ for name, rft := range vfs.fsTypes {
+ if !rft.opts.AllowUserList {
+ continue
+ }
+ var nodev string
+ if !rft.opts.RequiresDevice {
+ nodev = "nodev"
+ }
+ fmt.Fprintf(buf, "%s\t%s\n", nodev, name)
+ }
+}
diff --git a/pkg/sentry/vfs/genericfstree/BUILD b/pkg/sentry/vfs/genericfstree/BUILD
new file mode 100644
index 000000000..d8fd92677
--- /dev/null
+++ b/pkg/sentry/vfs/genericfstree/BUILD
@@ -0,0 +1,16 @@
+load("//tools/go_generics:defs.bzl", "go_template")
+
+package(
+ default_visibility = ["//:sandbox"],
+ licenses = ["notice"],
+)
+
+go_template(
+ name = "generic_fstree",
+ srcs = [
+ "genericfstree.go",
+ ],
+ types = [
+ "Dentry",
+ ],
+)
diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go
new file mode 100644
index 000000000..286510195
--- /dev/null
+++ b/pkg/sentry/vfs/genericfstree/genericfstree.go
@@ -0,0 +1,80 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package genericfstree provides tools for implementing vfs.FilesystemImpls
+// where a single statically-determined lock or set of locks is sufficient to
+// ensure that a Dentry's name and parent are contextually immutable.
+//
+// Clients using this package must use the go_template_instance rule in
+// tools/go_generics/defs.bzl to create an instantiation of this template
+// package, providing types to use in place of Dentry.
+package genericfstree
+
+import (
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// Dentry is a required type parameter that is a struct with the given fields.
+type Dentry struct {
+ // vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl.
+ vfsd vfs.Dentry
+
+ // parent is the parent of this Dentry in the filesystem's tree. If this
+ // Dentry is a filesystem root, parent is nil.
+ parent *Dentry
+
+ // name is the name of this Dentry in its parent. If this Dentry is a
+ // filesystem root, name is unspecified.
+ name string
+}
+
+// IsAncestorDentry returns true if d is an ancestor of d2; that is, d is
+// either d2's parent or an ancestor of d2's parent.
+func IsAncestorDentry(d, d2 *Dentry) bool {
+ for {
+ if d2.parent == d {
+ return true
+ }
+ if d2.parent == d2 {
+ return false
+ }
+ d2 = d2.parent
+ }
+}
+
+// ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d.
+func ParentOrSelf(d *Dentry) *Dentry {
+ if d.parent != nil {
+ return d.parent
+ }
+ return d
+}
+
+// PrependPath is a generic implementation of FilesystemImpl.PrependPath().
+func PrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath.Builder) error {
+ for {
+ if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
+ return vfs.PrependPathAtVFSRootError{}
+ }
+ if &d.vfsd == mnt.Root() {
+ return nil
+ }
+ if d.parent == nil {
+ return vfs.PrependPathAtNonMountRootError{}
+ }
+ b.PrependComponent(d.name)
+ d = d.parent
+ }
+}
diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD
new file mode 100644
index 000000000..d9ab063b7
--- /dev/null
+++ b/pkg/sentry/vfs/lock/BUILD
@@ -0,0 +1,13 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "lock",
+ srcs = ["lock.go"],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/sentry/fs/lock",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock/lock.go
new file mode 100644
index 000000000..724dfe743
--- /dev/null
+++ b/pkg/sentry/vfs/lock/lock.go
@@ -0,0 +1,72 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package lock provides POSIX and BSD style file locking for VFS2 file
+// implementations.
+//
+// The actual implementations can be found in the lock package under
+// sentry/fs/lock.
+package lock
+
+import (
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2)
+// and flock(2) respectively in Linux. It can be embedded into various file
+// implementations for VFS2 that support locking.
+//
+// Note that in Linux these two types of locks are _not_ cooperative, because
+// race and deadlock conditions make merging them prohibitive. We do the same
+// and keep them oblivious to each other.
+type FileLocks struct {
+ // bsd is a set of BSD-style advisory file wide locks, see flock(2).
+ bsd fslock.Locks
+
+ // posix is a set of POSIX-style regional advisory locks, see fcntl(2).
+ posix fslock.Locks
+}
+
+// LockBSD tries to acquire a BSD-style lock on the entire file.
+func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+ if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) {
+ return nil
+ }
+ return syserror.ErrWouldBlock
+}
+
+// UnlockBSD releases a BSD-style lock on the entire file.
+//
+// This operation is always successful, even if there did not exist a lock on
+// the requested region held by uid in the first place.
+func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) {
+ fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF})
+}
+
+// LockPOSIX tries to acquire a POSIX-style lock on a file region.
+func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
+ if fl.posix.LockRegion(uid, t, rng, block) {
+ return nil
+ }
+ return syserror.ErrWouldBlock
+}
+
+// UnlockPOSIX releases a POSIX-style lock on a file region.
+//
+// This operation is always successful, even if there did not exist a lock on
+// the requested region held by uid in the first place.
+func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) {
+ fl.posix.UnlockRegion(uid, rng)
+}
diff --git a/pkg/sentry/vfs/memxattr/BUILD b/pkg/sentry/vfs/memxattr/BUILD
new file mode 100644
index 000000000..d8c4d27b9
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "memxattr",
+ srcs = ["xattr.go"],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/sentry/vfs",
+ "//pkg/sync",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go
new file mode 100644
index 000000000..cc1e7d764
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/xattr.go
@@ -0,0 +1,102 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memxattr provides a default, in-memory extended attribute
+// implementation.
+package memxattr
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// SimpleExtendedAttributes implements extended attributes using a map of
+// names to values.
+//
+// +stateify savable
+type SimpleExtendedAttributes struct {
+ // mu protects the below fields.
+ mu sync.RWMutex `state:"nosave"`
+ xattrs map[string]string
+}
+
+// Getxattr returns the value at 'name'.
+func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) {
+ x.mu.RLock()
+ value, ok := x.xattrs[opts.Name]
+ x.mu.RUnlock()
+ if !ok {
+ return "", syserror.ENODATA
+ }
+ // Check that the size of the buffer provided in getxattr(2) is large enough
+ // to contain the value.
+ if opts.Size != 0 && uint64(len(value)) > opts.Size {
+ return "", syserror.ERANGE
+ }
+ return value, nil
+}
+
+// Setxattr sets 'value' at 'name'.
+func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
+ x.mu.Lock()
+ defer x.mu.Unlock()
+ if x.xattrs == nil {
+ if opts.Flags&linux.XATTR_REPLACE != 0 {
+ return syserror.ENODATA
+ }
+ x.xattrs = make(map[string]string)
+ }
+
+ _, ok := x.xattrs[opts.Name]
+ if ok && opts.Flags&linux.XATTR_CREATE != 0 {
+ return syserror.EEXIST
+ }
+ if !ok && opts.Flags&linux.XATTR_REPLACE != 0 {
+ return syserror.ENODATA
+ }
+
+ x.xattrs[opts.Name] = opts.Value
+ return nil
+}
+
+// Listxattr returns all names in xattrs.
+func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
+ // Keep track of the size of the buffer needed in listxattr(2) for the list.
+ listSize := 0
+ x.mu.RLock()
+ names := make([]string, 0, len(x.xattrs))
+ for n := range x.xattrs {
+ names = append(names, n)
+ // Add one byte per null terminator.
+ listSize += len(n) + 1
+ }
+ x.mu.RUnlock()
+ if size != 0 && uint64(listSize) > size {
+ return nil, syserror.ERANGE
+ }
+ return names, nil
+}
+
+// Removexattr removes the xattr at 'name'.
+func (x *SimpleExtendedAttributes) Removexattr(name string) error {
+ x.mu.Lock()
+ defer x.mu.Unlock()
+ if _, ok := x.xattrs[name]; !ok {
+ return syserror.ENODATA
+ }
+ delete(x.xattrs, name)
+ return nil
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 11702f720..02850b65c 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -15,14 +15,22 @@
package vfs
import (
+ "bytes"
+ "fmt"
"math"
+ "sort"
+ "strings"
"sync/atomic"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
)
+// lastMountID is used to allocate mount ids. Must be accessed atomically.
+var lastMountID uint64
+
// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
// (Mount.fs), which applies to path resolution in the context of a particular
@@ -37,17 +45,18 @@ import (
//
// Mount is analogous to Linux's struct mount. (gVisor does not distinguish
// between struct mount and struct vfsmount.)
+//
+// +stateify savable
type Mount struct {
- // The lower 63 bits of refs are a reference count. The MSB of refs is set
- // if the Mount has been eagerly unmounted, as by umount(2) without the
- // MNT_DETACH flag. refs is accessed using atomic memory operations.
- refs int64
+ // vfs, fs, root are immutable. References are held on fs and root.
+ //
+ // Invariant: root belongs to fs.
+ vfs *VirtualFilesystem
+ fs *Filesystem
+ root *Dentry
- // The lower 63 bits of writers is the number of calls to
- // Mount.CheckBeginWrite() that have not yet been paired with a call to
- // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
- // writers is accessed using atomic memory operations.
- writers int64
+ // ID is the immutable mount ID.
+ ID uint64
// key is protected by VirtualFilesystem.mountMu and
// VirtualFilesystem.mounts.seq, and may be nil. References are held on
@@ -57,13 +66,49 @@ type Mount struct {
// key.parent.fs.
key mountKey
- // fs, root, and ns are immutable. References are held on fs and root (but
- // not ns).
- //
- // Invariant: root belongs to fs.
- fs *Filesystem
- root *Dentry
- ns *MountNamespace
+ // ns is the namespace in which this Mount was mounted. ns is protected by
+ // VirtualFilesystem.mountMu.
+ ns *MountNamespace
+
+ // The lower 63 bits of refs are a reference count. The MSB of refs is set
+ // if the Mount has been eagerly umounted, as by umount(2) without the
+ // MNT_DETACH flag. refs is accessed using atomic memory operations.
+ refs int64
+
+ // children is the set of all Mounts for which Mount.key.parent is this
+ // Mount. children is protected by VirtualFilesystem.mountMu.
+ children map[*Mount]struct{}
+
+ // umounted is true if VFS.umountRecursiveLocked() has been called on this
+ // Mount. VirtualFilesystem does not hold a reference on Mounts for which
+ // umounted is true. umounted is protected by VirtualFilesystem.mountMu.
+ umounted bool
+
+ // flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
+ // for MS_RDONLY which is tracked in "writers".
+ flags MountFlags
+
+ // The lower 63 bits of writers is the number of calls to
+ // Mount.CheckBeginWrite() that have not yet been paired with a call to
+ // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
+ // writers is accessed using atomic memory operations.
+ writers int64
+}
+
+func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
+ mnt := &Mount{
+ ID: atomic.AddUint64(&lastMountID, 1),
+ vfs: vfs,
+ fs: fs,
+ root: root,
+ flags: opts.Flags,
+ ns: mntns,
+ refs: 1,
+ }
+ if opts.ReadOnly {
+ mnt.setReadOnlyLocked(true)
+ }
+ return mnt
}
// A MountNamespace is a collection of Mounts.
@@ -72,14 +117,19 @@ type Mount struct {
// MountNamespace methods require that a reference is held.
//
// MountNamespace is analogous to Linux's struct mnt_namespace.
+//
+// +stateify savable
type MountNamespace struct {
- refs int64 // accessed using atomic memory operations
-
// root is the MountNamespace's root mount. root is immutable.
root *Mount
- // mountpoints contains all Dentries which are mount points in this
- // namespace. mountpoints is protected by VirtualFilesystem.mountMu.
+ // refs is the reference count. refs is accessed using atomic memory
+ // operations.
+ refs int64
+
+ // mountpoints maps all Dentries which are mount points in this namespace
+ // to the number of Mounts for which they are mount points. mountpoints is
+ // protected by VirtualFilesystem.mountMu.
//
// mountpoints is used to determine if a Dentry can be moved or removed
// (which requires that the Dentry is not a mount point in the calling
@@ -89,59 +139,73 @@ type MountNamespace struct {
// MountNamespace; this is required to ensure that
// VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate
// correctly on unreferenced MountNamespaces.
- mountpoints map[*Dentry]struct{}
+ mountpoints map[*Dentry]uint32
}
// NewMountNamespace returns a new mount namespace with a root filesystem
// configured by the given arguments. A reference is taken on the returned
// MountNamespace.
-func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *NewFilesystemOptions) (*MountNamespace, error) {
- fsType := vfs.getFilesystemType(fsTypeName)
- if fsType == nil {
+func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
+ rft := vfs.getFilesystemType(fsTypeName)
+ if rft == nil {
+ ctx.Warningf("Unknown filesystem: %s", fsTypeName)
return nil, syserror.ENODEV
}
- fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts)
+ fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
if err != nil {
return nil, err
}
mntns := &MountNamespace{
refs: 1,
- mountpoints: make(map[*Dentry]struct{}),
- }
- mntns.root = &Mount{
- fs: fs,
- root: root,
- ns: mntns,
- refs: 1,
+ mountpoints: make(map[*Dentry]uint32),
}
+ mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{})
return mntns, nil
}
-// NewMount creates and mounts a new Filesystem.
-func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *NewFilesystemOptions) error {
- fsType := vfs.getFilesystemType(fsTypeName)
- if fsType == nil {
+// NewDisconnectedMount returns a Mount representing fs with the given root
+// (which may be nil). The new Mount is not associated with any MountNamespace
+// and is not connected to any other Mounts. References are taken on fs and
+// root.
+func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) (*Mount, error) {
+ fs.IncRef()
+ if root != nil {
+ root.IncRef()
+ }
+ return newMount(vfs, fs, root, nil /* mntns */, opts), nil
+}
+
+// MountAt creates and mounts a Filesystem configured by the given arguments.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+ rft := vfs.getFilesystemType(fsTypeName)
+ if rft == nil {
+ return syserror.ENODEV
+ }
+ if !opts.InternalMount && !rft.opts.AllowUserMount {
return syserror.ENODEV
}
- fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts)
+ fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
if err != nil {
return err
}
+
// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
// lock ordering.
vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
if err != nil {
- root.decRef(fs)
- fs.decRef()
+ root.DecRef()
+ fs.DecRef()
return err
}
vfs.mountMu.Lock()
+ vd.dentry.mu.Lock()
for {
- if vd.dentry.IsDisowned() {
+ if vd.dentry.dead {
+ vd.dentry.mu.Unlock()
vfs.mountMu.Unlock()
vd.DecRef()
- root.decRef(fs)
- fs.decRef()
+ root.DecRef()
+ fs.DecRef()
return syserror.ENOENT
}
// vd might have been mounted over between vfs.GetDentryAt() and
@@ -153,36 +217,270 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
if nextmnt == nil {
break
}
- nextmnt.incRef()
- nextmnt.root.incRef(nextmnt.fs)
+ // It's possible that nextmnt has been umounted but not disconnected,
+ // in which case vfs no longer holds a reference on it, and the last
+ // reference may be concurrently dropped even though we're holding
+ // vfs.mountMu.
+ if !nextmnt.tryIncMountedRef() {
+ break
+ }
+ // This can't fail since we're holding vfs.mountMu.
+ nextmnt.root.IncRef()
+ vd.dentry.mu.Unlock()
vd.DecRef()
vd = VirtualDentry{
mount: nextmnt,
dentry: nextmnt.root,
}
+ vd.dentry.mu.Lock()
}
- // TODO: Linux requires that either both the mount point and the mount root
- // are directories, or neither are, and returns ENOTDIR if this is not the
- // case.
+ // TODO(gvisor.dev/issue/1035): Linux requires that either both the mount
+ // point and the mount root are directories, or neither are, and returns
+ // ENOTDIR if this is not the case.
mntns := vd.mount.ns
- mnt := &Mount{
- fs: fs,
- root: root,
- ns: mntns,
- refs: 1,
+ mnt := newMount(vfs, fs, root, mntns, opts)
+ vfs.mounts.seq.BeginWrite()
+ vfs.connectLocked(mnt, vd, mntns)
+ vfs.mounts.seq.EndWrite()
+ vd.dentry.mu.Unlock()
+ vfs.mountMu.Unlock()
+ return nil
+}
+
+// UmountAt removes the Mount at the given path.
+func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
+ if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
+ return syserror.EINVAL
+ }
+
+ // MNT_FORCE is currently unimplemented except for the permission check.
+ if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
+ return syserror.EPERM
+ }
+
+ vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{})
+ if err != nil {
+ return err
}
- mnt.storeKey(vd.mount, vd.dentry)
+ defer vd.DecRef()
+ if vd.dentry != vd.mount.root {
+ return syserror.EINVAL
+ }
+ vfs.mountMu.Lock()
+ if mntns := MountNamespaceFromContext(ctx); mntns != nil {
+ defer mntns.DecRef()
+ if mntns != vd.mount.ns {
+ vfs.mountMu.Unlock()
+ return syserror.EINVAL
+ }
+ }
+
+ // TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's
+ // root, which we don't implement yet (we'll just fail it since the caller
+ // holds a reference on it).
+
+ vfs.mounts.seq.BeginWrite()
+ if opts.Flags&linux.MNT_DETACH == 0 {
+ if len(vd.mount.children) != 0 {
+ vfs.mounts.seq.EndWrite()
+ vfs.mountMu.Unlock()
+ return syserror.EBUSY
+ }
+ // We are holding a reference on vd.mount.
+ expectedRefs := int64(1)
+ if !vd.mount.umounted {
+ expectedRefs = 2
+ }
+ if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB
+ vfs.mounts.seq.EndWrite()
+ vfs.mountMu.Unlock()
+ return syserror.EBUSY
+ }
+ }
+ vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{
+ eager: opts.Flags&linux.MNT_DETACH == 0,
+ disconnectHierarchy: true,
+ }, nil, nil)
+ vfs.mounts.seq.EndWrite()
+ vfs.mountMu.Unlock()
+ for _, vd := range vdsToDecRef {
+ vd.DecRef()
+ }
+ for _, mnt := range mountsToDecRef {
+ mnt.DecRef()
+ }
+ return nil
+}
+
+type umountRecursiveOptions struct {
+ // If eager is true, ensure that future calls to Mount.tryIncMountedRef()
+ // on umounted mounts fail.
+ //
+ // eager is analogous to Linux's UMOUNT_SYNC.
+ eager bool
+
+ // If disconnectHierarchy is true, Mounts that are umounted hierarchically
+ // should be disconnected from their parents. (Mounts whose parents are not
+ // umounted, which in most cases means the Mount passed to the initial call
+ // to umountRecursiveLocked, are unconditionally disconnected for
+ // consistency with Linux.)
+ //
+ // disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED.
+ disconnectHierarchy bool
+}
+
+// umountRecursiveLocked marks mnt and its descendants as umounted. It does not
+// release mount or dentry references; instead, it appends VirtualDentries and
+// Mounts on which references must be dropped to vdsToDecRef and mountsToDecRef
+// respectively, and returns updated slices. (This is necessary because
+// filesystem locks possibly taken by DentryImpl.DecRef() may precede
+// vfs.mountMu in the lock order, and Mount.DecRef() may lock vfs.mountMu.)
+//
+// umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree().
+//
+// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
+// writer critical section.
+func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) {
+ if !mnt.umounted {
+ mnt.umounted = true
+ mountsToDecRef = append(mountsToDecRef, mnt)
+ if parent := mnt.parent(); parent != nil && (opts.disconnectHierarchy || !parent.umounted) {
+ vdsToDecRef = append(vdsToDecRef, vfs.disconnectLocked(mnt))
+ }
+ }
+ if opts.eager {
+ for {
+ refs := atomic.LoadInt64(&mnt.refs)
+ if refs < 0 {
+ break
+ }
+ if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs|math.MinInt64) {
+ break
+ }
+ }
+ }
+ for child := range mnt.children {
+ vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(child, opts, vdsToDecRef, mountsToDecRef)
+ }
+ return vdsToDecRef, mountsToDecRef
+}
+
+// connectLocked makes vd the mount parent/point for mnt. It consumes
+// references held by vd.
+//
+// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
+// writer critical section. d.mu must be locked. mnt.parent() == nil.
+func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
+ mnt.storeKey(vd)
+ if vd.mount.children == nil {
+ vd.mount.children = make(map[*Mount]struct{})
+ }
+ vd.mount.children[mnt] = struct{}{}
atomic.AddUint32(&vd.dentry.mounts, 1)
- mntns.mountpoints[vd.dentry] = struct{}{}
+ mntns.mountpoints[vd.dentry]++
+ vfs.mounts.insertSeqed(mnt)
vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
if !ok {
vfsmpmounts = make(map[*Mount]struct{})
vfs.mountpoints[vd.dentry] = vfsmpmounts
}
vfsmpmounts[mnt] = struct{}{}
- vfs.mounts.Insert(mnt)
- vfs.mountMu.Unlock()
- return nil
+}
+
+// disconnectLocked makes vd have no mount parent/point and returns its old
+// mount parent/point with a reference held.
+//
+// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
+// writer critical section. mnt.parent() != nil.
+func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
+ vd := mnt.loadKey()
+ mnt.storeKey(VirtualDentry{})
+ delete(vd.mount.children, mnt)
+ atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
+ mnt.ns.mountpoints[vd.dentry]--
+ if mnt.ns.mountpoints[vd.dentry] == 0 {
+ delete(mnt.ns.mountpoints, vd.dentry)
+ }
+ vfs.mounts.removeSeqed(mnt)
+ vfsmpmounts := vfs.mountpoints[vd.dentry]
+ delete(vfsmpmounts, mnt)
+ if len(vfsmpmounts) == 0 {
+ delete(vfs.mountpoints, vd.dentry)
+ }
+ return vd
+}
+
+// tryIncMountedRef increments mnt's reference count and returns true. If mnt's
+// reference count is already zero, or has been eagerly umounted,
+// tryIncMountedRef does nothing and returns false.
+//
+// tryIncMountedRef does not require that a reference is held on mnt.
+func (mnt *Mount) tryIncMountedRef() bool {
+ for {
+ refs := atomic.LoadInt64(&mnt.refs)
+ if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted
+ return false
+ }
+ if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) {
+ return true
+ }
+ }
+}
+
+// IncRef increments mnt's reference count.
+func (mnt *Mount) IncRef() {
+ // In general, negative values for mnt.refs are valid because the MSB is
+ // the eager-unmount bit.
+ atomic.AddInt64(&mnt.refs, 1)
+}
+
+// DecRef decrements mnt's reference count.
+func (mnt *Mount) DecRef() {
+ refs := atomic.AddInt64(&mnt.refs, -1)
+ if refs&^math.MinInt64 == 0 { // mask out MSB
+ var vd VirtualDentry
+ if mnt.parent() != nil {
+ mnt.vfs.mountMu.Lock()
+ mnt.vfs.mounts.seq.BeginWrite()
+ vd = mnt.vfs.disconnectLocked(mnt)
+ mnt.vfs.mounts.seq.EndWrite()
+ mnt.vfs.mountMu.Unlock()
+ }
+ mnt.root.DecRef()
+ mnt.fs.DecRef()
+ if vd.Ok() {
+ vd.DecRef()
+ }
+ }
+}
+
+// IncRef increments mntns' reference count.
+func (mntns *MountNamespace) IncRef() {
+ if atomic.AddInt64(&mntns.refs, 1) <= 1 {
+ panic("MountNamespace.IncRef() called without holding a reference")
+ }
+}
+
+// DecRef decrements mntns' reference count.
+func (mntns *MountNamespace) DecRef() {
+ vfs := mntns.root.fs.VirtualFilesystem()
+ if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 {
+ vfs.mountMu.Lock()
+ vfs.mounts.seq.BeginWrite()
+ vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{
+ disconnectHierarchy: true,
+ }, nil, nil)
+ vfs.mounts.seq.EndWrite()
+ vfs.mountMu.Unlock()
+ for _, vd := range vdsToDecRef {
+ vd.DecRef()
+ }
+ for _, mnt := range mountsToDecRef {
+ mnt.DecRef()
+ }
+ } else if refs < 0 {
+ panic("MountNamespace.DecRef() called without holding a reference")
+ }
}
// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
@@ -223,7 +521,7 @@ retryFirst:
// Raced with umount.
continue
}
- mnt.decRef()
+ mnt.DecRef()
mnt = next
d = next.root
}
@@ -231,12 +529,12 @@ retryFirst:
}
// getMountpointAt returns the mount point for the stack of Mounts including
-// mnt. It takes a reference on the returned Mount and Dentry. If no such mount
+// mnt. It takes a reference on the returned VirtualDentry. If no such mount
// point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil).
//
// Preconditions: References are held on mnt and root. vfsroot is not (mnt,
// mnt.root).
-func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) (*Mount, *Dentry) {
+func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
// The first mount is special-cased:
//
// - The caller must have already checked mnt against vfsroot.
@@ -246,21 +544,26 @@ func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry)
// - We don't drop the caller's reference on mnt.
retryFirst:
epoch := vfs.mounts.seq.BeginRead()
- parent, point := mnt.loadKey()
+ parent, point := mnt.parent(), mnt.point()
if !vfs.mounts.seq.ReadOk(epoch) {
goto retryFirst
}
if parent == nil {
- return nil, nil
+ return VirtualDentry{}
}
if !parent.tryIncMountedRef() {
// Raced with umount.
goto retryFirst
}
- if !point.tryIncRef(parent.fs) {
+ if !point.TryIncRef() {
// Since Mount holds a reference on Mount.key.point, this can only
// happen due to a racing change to Mount.key.
- parent.decRef()
+ parent.DecRef()
+ goto retryFirst
+ }
+ if !vfs.mounts.seq.ReadOk(epoch) {
+ point.DecRef()
+ parent.DecRef()
goto retryFirst
}
mnt = parent
@@ -274,7 +577,7 @@ retryFirst:
}
retryNotFirst:
epoch := vfs.mounts.seq.BeginRead()
- parent, point := mnt.loadKey()
+ parent, point := mnt.parent(), mnt.point()
if !vfs.mounts.seq.ReadOk(epoch) {
goto retryNotFirst
}
@@ -285,59 +588,23 @@ retryFirst:
// Raced with umount.
goto retryNotFirst
}
- if !point.tryIncRef(parent.fs) {
+ if !point.TryIncRef() {
// Since Mount holds a reference on Mount.key.point, this can
// only happen due to a racing change to Mount.key.
- parent.decRef()
+ parent.DecRef()
goto retryNotFirst
}
if !vfs.mounts.seq.ReadOk(epoch) {
- point.decRef(parent.fs)
- parent.decRef()
+ point.DecRef()
+ parent.DecRef()
goto retryNotFirst
}
- d.decRef(mnt.fs)
- mnt.decRef()
+ d.DecRef()
+ mnt.DecRef()
mnt = parent
d = point
}
- return mnt, d
-}
-
-// tryIncMountedRef increments mnt's reference count and returns true. If mnt's
-// reference count is already zero, or has been eagerly unmounted,
-// tryIncMountedRef does nothing and returns false.
-//
-// tryIncMountedRef does not require that a reference is held on mnt.
-func (mnt *Mount) tryIncMountedRef() bool {
- for {
- refs := atomic.LoadInt64(&mnt.refs)
- if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted
- return false
- }
- if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) {
- return true
- }
- }
-}
-
-func (mnt *Mount) incRef() {
- // In general, negative values for mnt.refs are valid because the MSB is
- // the eager-unmount bit.
- atomic.AddInt64(&mnt.refs, 1)
-}
-
-func (mnt *Mount) decRef() {
- refs := atomic.AddInt64(&mnt.refs, -1)
- if refs&^math.MinInt64 == 0 { // mask out MSB
- parent, point := mnt.loadKey()
- if point != nil {
- point.decRef(parent.fs)
- parent.decRef()
- }
- mnt.root.decRef(mnt.fs)
- mnt.fs.decRef()
- }
+ return VirtualDentry{mnt, d}
}
// CheckBeginWrite increments the counter of in-progress write operations on
@@ -360,7 +627,7 @@ func (mnt *Mount) EndWrite() {
atomic.AddInt64(&mnt.writers, -1)
}
-// Preconditions: VirtualFilesystem.mountMu must be locked for writing.
+// Preconditions: VirtualFilesystem.mountMu must be locked.
func (mnt *Mount) setReadOnlyLocked(ro bool) error {
if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro {
return nil
@@ -377,26 +644,32 @@ func (mnt *Mount) setReadOnlyLocked(ro bool) error {
return nil
}
+func (mnt *Mount) readOnly() bool {
+ return atomic.LoadInt64(&mnt.writers) < 0
+}
+
// Filesystem returns the mounted Filesystem. It does not take a reference on
// the returned Filesystem.
func (mnt *Mount) Filesystem() *Filesystem {
return mnt.fs
}
-// IncRef increments mntns' reference count.
-func (mntns *MountNamespace) IncRef() {
- if atomic.AddInt64(&mntns.refs, 1) <= 1 {
- panic("MountNamespace.IncRef() called without holding a reference")
+// submountsLocked returns this Mount and all Mounts that are descendents of
+// it.
+//
+// Precondition: mnt.vfs.mountMu must be held.
+func (mnt *Mount) submountsLocked() []*Mount {
+ mounts := []*Mount{mnt}
+ for m := range mnt.children {
+ mounts = append(mounts, m.submountsLocked()...)
}
+ return mounts
}
-// DecRef decrements mntns' reference count.
-func (mntns *MountNamespace) DecRef() {
- if refs := atomic.AddInt64(&mntns.refs, 0); refs == 0 {
- // TODO: unmount mntns.root
- } else if refs < 0 {
- panic("MountNamespace.DecRef() called without holding a reference")
- }
+// Root returns the mount's root. It does not take a reference on the returned
+// Dentry.
+func (mnt *Mount) Root() *Dentry {
+ return mnt.root
}
// Root returns mntns' root. A reference is taken on the returned
@@ -409,3 +682,174 @@ func (mntns *MountNamespace) Root() VirtualDentry {
vd.IncRef()
return vd
}
+
+// GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+ vfs.mountMu.Lock()
+ defer vfs.mountMu.Unlock()
+ rootMnt := taskRootDir.mount
+ mounts := rootMnt.submountsLocked()
+ sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+ for _, mnt := range mounts {
+ // Get the path to this mount relative to task root.
+ mntRootVD := VirtualDentry{
+ mount: mnt,
+ dentry: mnt.root,
+ }
+ path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+ if err != nil {
+ // For some reason we didn't get a path. Log a warning
+ // and run with empty path.
+ ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+ path = ""
+ }
+ if path == "" {
+ // Either an error occurred, or path is not reachable
+ // from root.
+ break
+ }
+
+ opts := "rw"
+ if mnt.readOnly() {
+ opts = "ro"
+ }
+ if mnt.flags.NoExec {
+ opts += ",noexec"
+ }
+
+ // Format:
+ // <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
+ //
+ // The "needs dump" and "fsck order" flags are always 0, which
+ // is allowed.
+ fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0)
+ }
+}
+
+// GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to
+// buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+ vfs.mountMu.Lock()
+ defer vfs.mountMu.Unlock()
+ rootMnt := taskRootDir.mount
+ mounts := rootMnt.submountsLocked()
+ sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+ for _, mnt := range mounts {
+ // Get the path to this mount relative to task root.
+ mntRootVD := VirtualDentry{
+ mount: mnt,
+ dentry: mnt.root,
+ }
+ path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+ if err != nil {
+ // For some reason we didn't get a path. Log a warning
+ // and run with empty path.
+ ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+ path = ""
+ }
+ if path == "" {
+ // Either an error occurred, or path is not reachable
+ // from root.
+ break
+ }
+ // Stat the mount root to get the major/minor device numbers.
+ pop := &PathOperation{
+ Root: mntRootVD,
+ Start: mntRootVD,
+ }
+ statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{})
+ if err != nil {
+ // Well that's not good. Ignore this mount.
+ break
+ }
+
+ // Format:
+ // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
+ // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11)
+
+ // (1) Mount ID.
+ fmt.Fprintf(buf, "%d ", mnt.ID)
+
+ // (2) Parent ID (or this ID if there is no parent).
+ pID := mnt.ID
+ if p := mnt.parent(); p != nil {
+ pID = p.ID
+ }
+ fmt.Fprintf(buf, "%d ", pID)
+
+ // (3) Major:Minor device ID. We don't have a superblock, so we
+ // just use the root inode device number.
+ fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor)
+
+ // (4) Root: the pathname of the directory in the filesystem
+ // which forms the root of this mount.
+ //
+ // NOTE(b/78135857): This will always be "/" until we implement
+ // bind mounts.
+ fmt.Fprintf(buf, "/ ")
+
+ // (5) Mount point (relative to process root).
+ fmt.Fprintf(buf, "%s ", manglePath(path))
+
+ // (6) Mount options.
+ opts := "rw"
+ if mnt.readOnly() {
+ opts = "ro"
+ }
+ if mnt.flags.NoExec {
+ opts += ",noexec"
+ }
+ // TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is
+ // set.
+ fmt.Fprintf(buf, "%s ", opts)
+
+ // (7) Optional fields: zero or more fields of the form "tag[:value]".
+ // (8) Separator: the end of the optional fields is marked by a single hyphen.
+ fmt.Fprintf(buf, "- ")
+
+ // (9) Filesystem type.
+ fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name())
+
+ // (10) Mount source: filesystem-specific information or "none".
+ fmt.Fprintf(buf, "none ")
+
+ // (11) Superblock options, and final newline.
+ fmt.Fprintf(buf, "%s\n", superBlockOpts(path, mnt))
+ }
+}
+
+// manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents.
+// See Linux fs/seq_file.c:mangle_path.
+func manglePath(p string) string {
+ r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
+ return r.Replace(p)
+}
+
+// superBlockOpts returns the super block options string for the the mount at
+// the given path.
+func superBlockOpts(mountPath string, mnt *Mount) string {
+ // gVisor doesn't (yet) have a concept of super block options, so we
+ // use the ro/rw bit from the mount flag.
+ opts := "rw"
+ if mnt.readOnly() {
+ opts = "ro"
+ }
+
+ // NOTE(b/147673608): If the mount is a cgroup, we also need to include
+ // the cgroup name in the options. For now we just read that from the
+ // path.
+ //
+ // TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we
+ // should get this value from the cgroup itself, and not rely on the
+ // path.
+ if mnt.fs.FilesystemType().Name() == "cgroup" {
+ splitPath := strings.Split(mountPath, "/")
+ cgroupType := splitPath[len(splitPath)-1]
+ opts += "," + cgroupType
+ }
+ return opts
+}
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index f394d7483..3335e4057 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -17,8 +17,9 @@ package vfs
import (
"fmt"
"runtime"
- "sync"
"testing"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
func TestMountTableLookupEmpty(t *testing.T) {
@@ -37,7 +38,7 @@ func TestMountTableInsertLookup(t *testing.T) {
mt.Init()
mount := &Mount{}
- mount.storeKey(&Mount{}, &Dentry{})
+ mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
mt.Insert(mount)
if m := mt.Lookup(mount.parent(), mount.point()); m != mount {
@@ -54,7 +55,7 @@ func TestMountTableInsertLookup(t *testing.T) {
}
}
-// TODO: concurrent lookup/insertion/removal
+// TODO(gvisor.dev/issue/1035): concurrent lookup/insertion/removal.
// must be powers of 2
var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8}
@@ -78,18 +79,10 @@ const enableComparativeBenchmarks = false
func newBenchMount() *Mount {
mount := &Mount{}
- mount.storeKey(&Mount{}, &Dentry{})
+ mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
return mount
}
-func vdkey(mnt *Mount) VirtualDentry {
- parent, point := mnt.loadKey()
- return VirtualDentry{
- mount: parent,
- dentry: point,
- }
-}
-
func BenchmarkMountTableParallelLookup(b *testing.B) {
for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 {
for _, numMounts := range benchNumMounts {
@@ -101,7 +94,7 @@ func BenchmarkMountTableParallelLookup(b *testing.B) {
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
mt.Insert(mount)
- keys = append(keys, vdkey(mount))
+ keys = append(keys, mount.loadKey())
}
var ready sync.WaitGroup
@@ -153,7 +146,7 @@ func BenchmarkMountMapParallelLookup(b *testing.B) {
keys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- key := vdkey(mount)
+ key := mount.loadKey()
ms[key] = mount
keys = append(keys, key)
}
@@ -208,7 +201,7 @@ func BenchmarkMountSyncMapParallelLookup(b *testing.B) {
keys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- key := vdkey(mount)
+ key := mount.loadKey()
ms.Store(key, mount)
keys = append(keys, key)
}
@@ -290,7 +283,7 @@ func BenchmarkMountMapNegativeLookup(b *testing.B) {
ms := make(map[VirtualDentry]*Mount)
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- ms[vdkey(mount)] = mount
+ ms[mount.loadKey()] = mount
}
negkeys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
@@ -325,7 +318,7 @@ func BenchmarkMountSyncMapNegativeLookup(b *testing.B) {
var ms sync.Map
for i := 0; i < numMounts; i++ {
mount := newBenchMount()
- ms.Store(vdkey(mount), mount)
+ ms.Store(mount.loadKey(), mount)
}
negkeys := make([]VirtualDentry, 0, numMounts)
for i := 0; i < numMounts; i++ {
@@ -379,7 +372,7 @@ func BenchmarkMountMapInsert(b *testing.B) {
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- ms[vdkey(mount)] = mount
+ ms[mount.loadKey()] = mount
}
}
@@ -399,7 +392,7 @@ func BenchmarkMountSyncMapInsert(b *testing.B) {
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- ms.Store(vdkey(mount), mount)
+ ms.Store(mount.loadKey(), mount)
}
}
@@ -432,13 +425,13 @@ func BenchmarkMountMapRemove(b *testing.B) {
ms := make(map[VirtualDentry]*Mount)
for i := range mounts {
mount := mounts[i]
- ms[vdkey(mount)] = mount
+ ms[mount.loadKey()] = mount
}
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- delete(ms, vdkey(mount))
+ delete(ms, mount.loadKey())
}
}
@@ -454,12 +447,12 @@ func BenchmarkMountSyncMapRemove(b *testing.B) {
var ms sync.Map
for i := range mounts {
mount := mounts[i]
- ms.Store(vdkey(mount), mount)
+ ms.Store(mount.loadKey(), mount)
}
b.ResetTimer()
for i := range mounts {
mount := mounts[i]
- ms.Delete(vdkey(mount))
+ ms.Delete(mount.loadKey())
}
}
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index b0511aa40..bc7581698 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -13,7 +13,7 @@
// limitations under the License.
// +build go1.12
-// +build !go1.14
+// +build !go1.15
// Check go:linkname function signatures when updating Go version.
@@ -26,7 +26,8 @@ import (
"sync/atomic"
"unsafe"
- "gvisor.dev/gvisor/third_party/gvsync"
+ "gvisor.dev/gvisor/pkg/gohacks"
+ "gvisor.dev/gvisor/pkg/sync"
)
// mountKey represents the location at which a Mount is mounted. It is
@@ -38,16 +39,6 @@ type mountKey struct {
point unsafe.Pointer // *Dentry
}
-// Invariant: mnt.key's fields are nil. parent and point are non-nil.
-func (mnt *Mount) storeKey(parent *Mount, point *Dentry) {
- atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent))
- atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point))
-}
-
-func (mnt *Mount) loadKey() (*Mount, *Dentry) {
- return (*Mount)(atomic.LoadPointer(&mnt.key.parent)), (*Dentry)(atomic.LoadPointer(&mnt.key.point))
-}
-
func (mnt *Mount) parent() *Mount {
return (*Mount)(atomic.LoadPointer(&mnt.key.parent))
}
@@ -56,11 +47,26 @@ func (mnt *Mount) point() *Dentry {
return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
}
+func (mnt *Mount) loadKey() VirtualDentry {
+ return VirtualDentry{
+ mount: mnt.parent(),
+ dentry: mnt.point(),
+ }
+}
+
+// Invariant: mnt.key.parent == nil. vd.Ok().
+func (mnt *Mount) storeKey(vd VirtualDentry) {
+ atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount))
+ atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry))
+}
+
// mountTable maps (mount parent, mount point) pairs to mounts. It supports
// efficient concurrent lookup, even in the presence of concurrent mutators
// (provided mutation is sufficiently uncommon).
//
// mountTable.Init() must be called on new mountTables before use.
+//
+// +stateify savable
type mountTable struct {
// mountTable is implemented as a seqcount-protected hash table that
// resolves collisions with linear probing, featuring Robin Hood insertion
@@ -72,8 +78,8 @@ type mountTable struct {
// intrinsics and inline assembly, limiting the performance of this
// approach.)
- seq gvsync.SeqCount
- seed uint32 // for hashing keys
+ seq sync.SeqCount `state:"nosave"`
+ seed uint32 // for hashing keys
// size holds both length (number of elements) and capacity (number of
// slots): capacity is stored as its base-2 log (referred to as order) in
@@ -86,7 +92,7 @@ type mountTable struct {
// length and cap in separate uint32s) for ~free.
size uint64
- slots unsafe.Pointer // []mountSlot; never nil after Init
+ slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init
}
type mountSlot struct {
@@ -155,7 +161,7 @@ func newMountTableSlots(cap uintptr) unsafe.Pointer {
// Lookup may be called even if there are concurrent mutators of mt.
func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
- hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
+ hash := memhash(gohacks.Noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
loop:
for {
@@ -201,9 +207,19 @@ loop:
// Insert inserts the given mount into mt.
//
-// Preconditions: There are no concurrent mutators of mt. mt must not already
-// contain a Mount with the same mount point and parent.
+// Preconditions: mt must not already contain a Mount with the same mount point
+// and parent.
func (mt *mountTable) Insert(mount *Mount) {
+ mt.seq.BeginWrite()
+ mt.insertSeqed(mount)
+ mt.seq.EndWrite()
+}
+
+// insertSeqed inserts the given mount into mt.
+//
+// Preconditions: mt.seq must be in a writer critical section. mt must not
+// already contain a Mount with the same mount point and parent.
+func (mt *mountTable) insertSeqed(mount *Mount) {
hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
// We're under the maximum load factor if:
@@ -215,10 +231,8 @@ func (mt *mountTable) Insert(mount *Mount) {
tcap := uintptr(1) << order
if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) {
// Atomically insert the new element into the table.
- mt.seq.BeginWrite()
atomic.AddUint64(&mt.size, mtSizeLenOne)
mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash)
- mt.seq.EndWrite()
return
}
@@ -241,8 +255,6 @@ func (mt *mountTable) Insert(mount *Mount) {
for {
oldSlot := (*mountSlot)(oldCur)
if oldSlot.value != nil {
- // Don't need to lock mt.seq yet since newSlots isn't visible
- // to readers.
mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash)
}
if oldCur == oldLast {
@@ -252,11 +264,9 @@ func (mt *mountTable) Insert(mount *Mount) {
}
// Insert the new element into the new table.
mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash)
- // Atomically switch to the new table.
- mt.seq.BeginWrite()
+ // Switch to the new table.
atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne)
atomic.StorePointer(&mt.slots, newSlots)
- mt.seq.EndWrite()
}
// Preconditions: There are no concurrent mutators of the table (slots, cap).
@@ -294,9 +304,18 @@ func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, has
// Remove removes the given mount from mt.
//
-// Preconditions: There are no concurrent mutators of mt. mt must contain
-// mount.
+// Preconditions: mt must contain mount.
func (mt *mountTable) Remove(mount *Mount) {
+ mt.seq.BeginWrite()
+ mt.removeSeqed(mount)
+ mt.seq.EndWrite()
+}
+
+// removeSeqed removes the given mount from mt.
+//
+// Preconditions: mt.seq must be in a writer critical section. mt must contain
+// mount.
+func (mt *mountTable) removeSeqed(mount *Mount) {
hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
mask := tcap - 1
@@ -311,7 +330,6 @@ func (mt *mountTable) Remove(mount *Mount) {
// backward until we either find an empty slot, or an element that
// is already in its first-probed slot. (This is backward shift
// deletion.)
- mt.seq.BeginWrite()
for {
nextOff := (off + mountSlotBytes) & offmask
nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff))
@@ -330,7 +348,6 @@ func (mt *mountTable) Remove(mount *Mount) {
}
atomic.StorePointer(&slot.value, nil)
atomic.AddUint64(&mt.size, mtSizeLenNegOne)
- mt.seq.EndWrite()
return
}
if checkInvariants && slotValue == nil {
@@ -345,12 +362,3 @@ func memhash(p unsafe.Pointer, seed, s uintptr) uintptr
//go:linkname rand32 runtime.fastrand
func rand32() uint32
-
-// This is copy/pasted from runtime.noescape(), and is needed because arguments
-// apparently escape from all functions defined by linkname.
-//
-//go:nosplit
-func noescape(p unsafe.Pointer) unsafe.Pointer {
- x := uintptr(p)
- return unsafe.Pointer(x ^ 0)
-}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 3aa73d911..022bac127 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -16,6 +16,7 @@ package vfs
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
)
// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and
@@ -32,6 +33,25 @@ type GetDentryOptions struct {
type MkdirOptions struct {
// Mode is the file mode bits for the created directory.
Mode linux.FileMode
+
+ // If ForSyntheticMountpoint is true, FilesystemImpl.MkdirAt() may create
+ // the given directory in memory only (as opposed to persistent storage).
+ // The created directory should be able to support the creation of
+ // subdirectories with ForSyntheticMountpoint == true. It does not need to
+ // support the creation of subdirectories with ForSyntheticMountpoint ==
+ // false, or files of other types.
+ //
+ // FilesystemImpls are permitted to ignore the ForSyntheticMountpoint
+ // option.
+ //
+ // The ForSyntheticMountpoint option exists because, unlike mount(2), the
+ // OCI Runtime Specification permits the specification of mount points that
+ // do not exist, under the expectation that container runtimes will create
+ // them. (More accurately, the OCI Runtime Specification completely fails
+ // to document this feature, but it's implemented by runc.)
+ // ForSyntheticMountpoint allows such mount points to be created even when
+ // the underlying persistent filesystem is immutable.
+ ForSyntheticMountpoint bool
}
// MknodOptions contains options to VirtualFilesystem.MknodAt() and
@@ -44,6 +64,33 @@ type MknodOptions struct {
// DevMinor are the major and minor device numbers for the created device.
DevMajor uint32
DevMinor uint32
+
+ // Endpoint is the endpoint to bind to the created file, if a socket file is
+ // being created for bind(2) on a Unix domain socket.
+ Endpoint transport.BoundEndpoint
+}
+
+// MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC.
+// MS_RDONLY is not part of MountFlags because it's tracked in Mount.writers.
+type MountFlags struct {
+ // NoExec is equivalent to MS_NOEXEC.
+ NoExec bool
+}
+
+// MountOptions contains options to VirtualFilesystem.MountAt().
+type MountOptions struct {
+ // Flags contains flags as specified for mount(2), e.g. MS_NOEXEC.
+ Flags MountFlags
+
+ // ReadOnly is equivalent to MS_RDONLY.
+ ReadOnly bool
+
+ // GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
+ GetFilesystemOptions GetFilesystemOptions
+
+ // If InternalMount is true, allow the use of filesystem types for which
+ // RegisterFilesystemTypeOptions.AllowUserMount == false.
+ InternalMount bool
}
// OpenOptions contains options to VirtualFilesystem.OpenAt() and
@@ -51,7 +98,7 @@ type MknodOptions struct {
type OpenOptions struct {
// Flags contains access mode and flags as specified for open(2).
//
- // FilesystemImpls is reponsible for implementing the following flags:
+ // FilesystemImpls are responsible for implementing the following flags:
// O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC,
// O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and
// O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and
@@ -62,6 +109,12 @@ type OpenOptions struct {
// If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the
// created file.
Mode linux.FileMode
+
+ // FileExec is set when the file is being opened to be executed.
+ // VirtualFilesystem.OpenAt() checks that the caller has execute permissions
+ // on the file, that the file is a regular file, and that the mount doesn't
+ // have MS_NOEXEC set.
+ FileExec bool
}
// ReadOptions contains options to FileDescription.PRead(),
@@ -77,6 +130,9 @@ type ReadOptions struct {
type RenameOptions struct {
// Flags contains flags as specified for renameat2(2).
Flags uint32
+
+ // If MustBeDir is true, the renamed file must be a directory.
+ MustBeDir bool
}
// SetStatOptions contains options to VirtualFilesystem.SetStatAt(),
@@ -95,6 +151,34 @@ type SetStatOptions struct {
Stat linux.Statx
}
+// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(),
+// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and
+// FileDescriptionImpl.Getxattr().
+type GetxattrOptions struct {
+ // Name is the name of the extended attribute to retrieve.
+ Name string
+
+ // Size is the maximum value size that the caller will tolerate. If the value
+ // is larger than size, getxattr methods may return ERANGE, but they are also
+ // free to ignore the hint entirely (i.e. the value returned may be larger
+ // than size). All size checking is done independently at the syscall layer.
+ Size uint64
+}
+
+// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
+// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
+// FileDescriptionImpl.Setxattr().
+type SetxattrOptions struct {
+ // Name is the name of the extended attribute being mutated.
+ Name string
+
+ // Value is the extended attribute's new value.
+ Value string
+
+ // Flags contains flags as specified for setxattr/lsetxattr/fsetxattr(2).
+ Flags uint32
+}
+
// StatOptions contains options to VirtualFilesystem.StatAt(),
// FilesystemImpl.StatAt(), FileDescription.Stat(), and
// FileDescriptionImpl.Stat().
@@ -114,6 +198,12 @@ type StatOptions struct {
Sync uint32
}
+// UmountOptions contains options to VirtualFilesystem.UmountAt().
+type UmountOptions struct {
+ // Flags contains flags as specified for umount2(2).
+ Flags uint32
+}
+
// WriteOptions contains options to FileDescription.PWrite(),
// FileDescriptionImpl.PWrite(), FileDescription.Write(), and
// FileDescriptionImpl.Write().
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
new file mode 100644
index 000000000..cd78d66bc
--- /dev/null
+++ b/pkg/sentry/vfs/pathname.go
@@ -0,0 +1,195 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+var fspathBuilderPool = sync.Pool{
+ New: func() interface{} {
+ return &fspath.Builder{}
+ },
+}
+
+func getFSPathBuilder() *fspath.Builder {
+ return fspathBuilderPool.Get().(*fspath.Builder)
+}
+
+func putFSPathBuilder(b *fspath.Builder) {
+ // No methods can be called on b after b.String(), so reset it to its zero
+ // value (as returned by fspathBuilderPool.New) instead.
+ *b = fspath.Builder{}
+ fspathBuilderPool.Put(b)
+}
+
+// PathnameWithDeleted returns an absolute pathname to vd, consistent with
+// Linux's d_path(). In particular, if vd.Dentry() has been disowned,
+// PathnameWithDeleted appends " (deleted)" to the returned pathname.
+func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+ b := getFSPathBuilder()
+ defer putFSPathBuilder(b)
+ haveRef := false
+ defer func() {
+ if haveRef {
+ vd.DecRef()
+ }
+ }()
+
+ origD := vd.dentry
+loop:
+ for {
+ err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
+ switch err.(type) {
+ case nil:
+ if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+ // genericfstree.PrependPath() will have returned
+ // PrependPathAtVFSRootError in this case since it checks
+ // against vfsroot before mnt.root, but other implementations
+ // of FilesystemImpl.PrependPath() may return nil instead.
+ break loop
+ }
+ nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+ if !nextVD.Ok() {
+ break loop
+ }
+ if haveRef {
+ vd.DecRef()
+ }
+ vd = nextVD
+ haveRef = true
+ // continue loop
+ case PrependPathSyntheticError:
+ // Skip prepending "/" and appending " (deleted)".
+ return b.String(), nil
+ case PrependPathAtVFSRootError, PrependPathAtNonMountRootError:
+ break loop
+ default:
+ return "", err
+ }
+ }
+ b.PrependByte('/')
+ if origD.IsDead() {
+ b.AppendString(" (deleted)")
+ }
+ return b.String(), nil
+}
+
+// PathnameReachable returns an absolute pathname to vd, consistent with
+// Linux's __d_path() (as used by seq_path_root()). If vfsroot.Ok() and vd is
+// not reachable from vfsroot, such that seq_path_root() would return SEQ_SKIP
+// (causing the entire containing entry to be skipped), PathnameReachable
+// returns ("", nil).
+func (vfs *VirtualFilesystem) PathnameReachable(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+ b := getFSPathBuilder()
+ defer putFSPathBuilder(b)
+ haveRef := false
+ defer func() {
+ if haveRef {
+ vd.DecRef()
+ }
+ }()
+loop:
+ for {
+ err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
+ switch err.(type) {
+ case nil:
+ if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+ break loop
+ }
+ nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+ if !nextVD.Ok() {
+ return "", nil
+ }
+ if haveRef {
+ vd.DecRef()
+ }
+ vd = nextVD
+ haveRef = true
+ case PrependPathAtVFSRootError:
+ break loop
+ case PrependPathAtNonMountRootError, PrependPathSyntheticError:
+ return "", nil
+ default:
+ return "", err
+ }
+ }
+ b.PrependByte('/')
+ return b.String(), nil
+}
+
+// PathnameForGetcwd returns an absolute pathname to vd, consistent with
+// Linux's sys_getcwd().
+func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+ if vd.dentry.IsDead() {
+ return "", syserror.ENOENT
+ }
+
+ b := getFSPathBuilder()
+ defer putFSPathBuilder(b)
+ haveRef := false
+ defer func() {
+ if haveRef {
+ vd.DecRef()
+ }
+ }()
+ unreachable := false
+loop:
+ for {
+ err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
+ switch err.(type) {
+ case nil:
+ if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+ break loop
+ }
+ nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+ if !nextVD.Ok() {
+ unreachable = true
+ break loop
+ }
+ if haveRef {
+ vd.DecRef()
+ }
+ vd = nextVD
+ haveRef = true
+ case PrependPathAtVFSRootError:
+ break loop
+ case PrependPathAtNonMountRootError, PrependPathSyntheticError:
+ unreachable = true
+ break loop
+ default:
+ return "", err
+ }
+ }
+ b.PrependByte('/')
+ if unreachable {
+ b.PrependString("(unreachable)")
+ }
+ return b.String(), nil
+}
+
+// As of this writing, we do not have equivalents to:
+//
+// - d_absolute_path(), which returns EINVAL if (effectively) any call to
+// FilesystemImpl.PrependPath() would return PrependPathAtNonMountRootError.
+//
+// - dentry_path(), which does not walk up mounts (and only returns the path
+// relative to Filesystem root), but also appends "//deleted" for disowned
+// Dentries.
+//
+// These should be added as necessary.
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index f8e74355c..f9647f90e 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -15,8 +15,12 @@
package vfs
import (
+ "math"
+
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -25,23 +29,44 @@ type AccessTypes uint16
// Bits in AccessTypes.
const (
+ MayExec AccessTypes = 1
+ MayWrite AccessTypes = 2
MayRead AccessTypes = 4
- MayWrite = 2
- MayExec = 1
)
+// OnlyRead returns true if access _only_ allows read.
+func (a AccessTypes) OnlyRead() bool {
+ return a == MayRead
+}
+
+// MayRead returns true if access allows read.
+func (a AccessTypes) MayRead() bool {
+ return a&MayRead != 0
+}
+
+// MayWrite returns true if access allows write.
+func (a AccessTypes) MayWrite() bool {
+ return a&MayWrite != 0
+}
+
+// MayExec returns true if access allows exec.
+func (a AccessTypes) MayExec() bool {
+ return a&MayExec != 0
+}
+
// GenericCheckPermissions checks that creds has the given access rights on a
// file with the given permissions, UID, and GID, subject to the rules of
-// fs/namei.c:generic_permission(). isDir is true if the file is a directory.
-func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir bool, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+// fs/namei.c:generic_permission().
+func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
// Check permission bits.
- perms := mode
+ perms := uint16(mode.Permissions())
if creds.EffectiveKUID == kuid {
perms >>= 6
} else if creds.InGroup(kgid) {
perms >>= 3
}
if uint16(ats)&perms == uint16(ats) {
+ // All permission bits match, access granted.
return nil
}
@@ -53,7 +78,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
}
// CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary
// directories, and read arbitrary non-directory files.
- if (isDir && (ats&MayWrite == 0)) || ats == MayRead {
+ if (mode.IsDir() && !ats.MayWrite()) || ats.OnlyRead() {
if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) {
return nil
}
@@ -61,7 +86,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
// CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write
// access to non-directory files, and execute access to non-directory files
// for which at least one execute bit is set.
- if isDir || (ats&MayExec == 0) || (mode&0111 != 0) {
+ if mode.IsDir() || !ats.MayExec() || (mode.Permissions()&0111 != 0) {
if creds.HasCapability(linux.CAP_DAC_OVERRIDE) {
return nil
}
@@ -74,27 +99,31 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
// the set of accesses permitted for the opened file:
//
// - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it
-// mutates the file), but does not permit the opened to write to the file
+// mutates the file), but does not permit writing to the open file description
// thereafter.
//
// - "Linux reserves the special, nonstandard access mode 3 (binary 11) in
// flags to mean: check for read and write permission on the file and return a
// file descriptor that can't be used for reading or writing." - open(2). Thus
-// AccessTypesForOpenFlags returns MayRead|MayWrite in this case, but
-// filesystems are responsible for ensuring that access is denied.
+// AccessTypesForOpenFlags returns MayRead|MayWrite in this case.
//
// Use May{Read,Write}FileWithOpenFlags() for these checks instead.
-func AccessTypesForOpenFlags(flags uint32) AccessTypes {
- switch flags & linux.O_ACCMODE {
+func AccessTypesForOpenFlags(opts *OpenOptions) AccessTypes {
+ ats := AccessTypes(0)
+ if opts.FileExec {
+ ats |= MayExec
+ }
+
+ switch opts.Flags & linux.O_ACCMODE {
case linux.O_RDONLY:
- if flags&linux.O_TRUNC != 0 {
- return MayRead | MayWrite
+ if opts.Flags&linux.O_TRUNC != 0 {
+ return ats | MayRead | MayWrite
}
- return MayRead
+ return ats | MayRead
case linux.O_WRONLY:
- return MayWrite
+ return ats | MayWrite
default:
- return MayRead | MayWrite
+ return ats | MayRead | MayWrite
}
}
@@ -119,3 +148,88 @@ func MayWriteFileWithOpenFlags(flags uint32) bool {
return false
}
}
+
+// CheckSetStat checks that creds has permission to change the metadata of a
+// file with the given permissions, UID, and GID as specified by stat, subject
+// to the rules of Linux's fs/attr.c:setattr_prepare().
+func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+ if stat.Mask&linux.STATX_SIZE != 0 {
+ limit, err := CheckLimit(ctx, 0, int64(stat.Size))
+ if err != nil {
+ return err
+ }
+ if limit < int64(stat.Size) {
+ return syserror.ErrExceedsFileSizeLimit
+ }
+ }
+ if stat.Mask&linux.STATX_MODE != 0 {
+ if !CanActAsOwner(creds, kuid) {
+ return syserror.EPERM
+ }
+ // TODO(b/30815691): "If the calling process is not privileged (Linux:
+ // does not have the CAP_FSETID capability), and the group of the file
+ // does not match the effective group ID of the process or one of its
+ // supplementary group IDs, the S_ISGID bit will be turned off, but
+ // this will not cause an error to be returned." - chmod(2)
+ }
+ if stat.Mask&linux.STATX_UID != 0 {
+ if !((creds.EffectiveKUID == kuid && auth.KUID(stat.UID) == kuid) ||
+ HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) {
+ return syserror.EPERM
+ }
+ }
+ if stat.Mask&linux.STATX_GID != 0 {
+ if !((creds.EffectiveKUID == kuid && creds.InGroup(auth.KGID(stat.GID))) ||
+ HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) {
+ return syserror.EPERM
+ }
+ }
+ if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 {
+ if !CanActAsOwner(creds, kuid) {
+ if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) ||
+ (stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW) ||
+ (stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) {
+ return syserror.EPERM
+ }
+ if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
+// CanActAsOwner returns true if creds can act as the owner of a file with the
+// given owning UID, consistent with Linux's
+// fs/inode.c:inode_owner_or_capable().
+func CanActAsOwner(creds *auth.Credentials, kuid auth.KUID) bool {
+ if creds.EffectiveKUID == kuid {
+ return true
+ }
+ return creds.HasCapability(linux.CAP_FOWNER) && creds.UserNamespace.MapFromKUID(kuid).Ok()
+}
+
+// HasCapabilityOnFile returns true if creds has the given capability with
+// respect to a file with the given owning UID and GID, consistent with Linux's
+// kernel/capability.c:capable_wrt_inode_uidgid().
+func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth.KUID, kgid auth.KGID) bool {
+ return creds.HasCapability(cp) && creds.UserNamespace.MapFromKUID(kuid).Ok() && creds.UserNamespace.MapFromKGID(kgid).Ok()
+}
+
+// CheckLimit enforces file size rlimits. It returns error if the write
+// operation must not proceed. Otherwise it returns the max length allowed to
+// without violating the limit.
+func CheckLimit(ctx context.Context, offset, size int64) (int64, error) {
+ fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur
+ if fileSizeLimit > math.MaxInt64 {
+ return size, nil
+ }
+ if offset >= int64(fileSizeLimit) {
+ return 0, syserror.ErrExceedsFileSizeLimit
+ }
+ remaining := int64(fileSizeLimit) - offset
+ if remaining < size {
+ return remaining, nil
+ }
+ return size, nil
+}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 8d05c8583..9d047ff88 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -16,11 +16,11 @@ package vfs
import (
"fmt"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -29,7 +29,9 @@ import (
//
// From the perspective of FilesystemImpl methods, a ResolvingPath represents a
// starting Dentry on the associated Filesystem (on which a reference is
-// already held) and a stream of path components relative to that Dentry.
+// already held), a stream of path components relative to that Dentry, and
+// elements of the invoking Context that are commonly required by
+// FilesystemImpl methods.
//
// ResolvingPath is loosely analogous to Linux's struct nameidata.
type ResolvingPath struct {
@@ -85,11 +87,11 @@ func init() {
// so error "constants" are really mutable vars, necessitating somewhat
// expensive interface object comparisons.
-type resolveMountRootError struct{}
+type resolveMountRootOrJumpError struct{}
// Error implements error.Error.
-func (resolveMountRootError) Error() string {
- return "resolving mount root"
+func (resolveMountRootOrJumpError) Error() string {
+ return "resolving mount root or jump"
}
type resolveMountPointError struct{}
@@ -112,30 +114,26 @@ var resolvingPathPool = sync.Pool{
},
}
-func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) (*ResolvingPath, error) {
- path, err := fspath.Parse(pop.Pathname)
- if err != nil {
- return nil, err
- }
+func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) *ResolvingPath {
rp := resolvingPathPool.Get().(*ResolvingPath)
rp.vfs = vfs
rp.root = pop.Root
rp.mount = pop.Start.mount
rp.start = pop.Start.dentry
- rp.pit = path.Begin
+ rp.pit = pop.Path.Begin
rp.flags = 0
if pop.FollowFinalSymlink {
rp.flags |= rpflagsFollowFinalSymlink
}
- rp.mustBeDir = path.Dir
- rp.mustBeDirOrig = path.Dir
+ rp.mustBeDir = pop.Path.Dir
+ rp.mustBeDirOrig = pop.Path.Dir
rp.symlinks = 0
rp.curPart = 0
rp.numOrigParts = 1
rp.creds = creds
- rp.parts[0] = path.Begin
- rp.origParts[0] = path.Begin
- return rp, nil
+ rp.parts[0] = pop.Path.Begin
+ rp.origParts[0] = pop.Path.Begin
+ return rp
}
func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) {
@@ -149,20 +147,20 @@ func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) {
func (rp *ResolvingPath) decRefStartAndMount() {
if rp.flags&rpflagsHaveStartRef != 0 {
- rp.start.decRef(rp.mount.fs)
+ rp.start.DecRef()
}
if rp.flags&rpflagsHaveMountRef != 0 {
- rp.mount.decRef()
+ rp.mount.DecRef()
}
}
func (rp *ResolvingPath) releaseErrorState() {
if rp.nextStart != nil {
- rp.nextStart.decRef(rp.nextMount.fs)
+ rp.nextStart.DecRef()
rp.nextStart = nil
}
if rp.nextMount != nil {
- rp.nextMount.decRef()
+ rp.nextMount.DecRef()
rp.nextMount = nil
}
}
@@ -232,7 +230,7 @@ func (rp *ResolvingPath) Advance() {
rp.pit = next
} else { // at end of path segment, continue with next one
rp.curPart--
- rp.pit = rp.parts[rp.curPart-1]
+ rp.pit = rp.parts[rp.curPart]
}
}
@@ -255,88 +253,67 @@ func (rp *ResolvingPath) relpathCommit() {
rp.origParts[rp.curPart] = rp.pit
}
-// ResolveParent returns the VFS parent of d. It does not take a reference on
-// the returned Dentry.
-//
-// Preconditions: There are no concurrent mutators of d.
-//
-// Postconditions: If the returned error is nil, then the returned Dentry is
-// not nil.
-func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) {
- var parent *Dentry
+// CheckRoot is called before resolving the parent of the Dentry d. If the
+// Dentry is contextually a VFS root, such that path resolution should treat
+// d's parent as itself, CheckRoot returns (true, nil). If the Dentry is the
+// root of a non-root mount, such that path resolution should switch to another
+// Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path
+// resolution should resolve d's parent normally, and CheckRoot returns (false,
+// nil).
+func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) {
if d == rp.root.dentry && rp.mount == rp.root.mount {
- // At contextual VFS root.
- parent = d
+ // At contextual VFS root (due to e.g. chroot(2)).
+ return true, nil
} else if d == rp.mount.root {
// At mount root ...
- mnt, mntpt := rp.vfs.getMountpointAt(rp.mount, rp.root)
- if mnt != nil {
+ vd := rp.vfs.getMountpointAt(rp.mount, rp.root)
+ if vd.Ok() {
// ... of non-root mount.
- rp.nextMount = mnt
- rp.nextStart = mntpt
- return nil, resolveMountRootError{}
+ rp.nextMount = vd.mount
+ rp.nextStart = vd.dentry
+ return false, resolveMountRootOrJumpError{}
}
// ... of root mount.
- parent = d
- } else if d.parent == nil {
- // At filesystem root.
- parent = d
- } else {
- parent = d.parent
- }
- if parent.isMounted() {
- if mnt := rp.vfs.getMountAt(rp.mount, parent); mnt != nil {
- rp.nextMount = mnt
- return nil, resolveMountPointError{}
- }
+ return true, nil
}
- return parent, nil
+ return false, nil
}
-// ResolveChild returns the VFS child of d with the given name. It does not
-// take a reference on the returned Dentry. If no such child exists,
-// ResolveChild returns (nil, nil).
-//
-// Preconditions: There are no concurrent mutators of d.
-func (rp *ResolvingPath) ResolveChild(d *Dentry, name string) (*Dentry, error) {
- child := d.children[name]
- if child == nil {
- return nil, nil
+// CheckMount is called after resolving the parent or child of another Dentry
+// to d. If d is a mount point, such that path resolution should switch to
+// another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount
+// returns nil.
+func (rp *ResolvingPath) CheckMount(d *Dentry) error {
+ if !d.isMounted() {
+ return nil
}
- if child.isMounted() {
- if mnt := rp.vfs.getMountAt(rp.mount, child); mnt != nil {
- rp.nextMount = mnt
- return nil, resolveMountPointError{}
- }
- }
- return child, nil
-}
-
-// ResolveComponent returns the Dentry reached by starting at d and resolving
-// the current path component in the stream represented by rp. It does not
-// advance the stream. It does not take a reference on the returned Dentry. If
-// no such Dentry exists, ResolveComponent returns (nil, nil).
-//
-// Preconditions: !rp.Done(). There are no concurrent mutators of d.
-func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) {
- switch pc := rp.Component(); pc {
- case ".":
- return d, nil
- case "..":
- return rp.ResolveParent(d)
- default:
- return rp.ResolveChild(d, pc)
+ if mnt := rp.vfs.getMountAt(rp.mount, d); mnt != nil {
+ rp.nextMount = mnt
+ return resolveMountPointError{}
}
+ return nil
}
// ShouldFollowSymlink returns true if, supposing that the current path
// component in pcs represents a symbolic link, the symbolic link should be
// followed.
//
+// If path is terminated with '/', the '/' is considered the last element and
+// any symlink before that is followed:
+// - For most non-creating walks, the last path component is handled by
+// fs/namei.c:lookup_last(), which sets LOOKUP_FOLLOW if the first byte
+// after the path component is non-NULL (which is only possible if it's '/')
+// and the path component is of type LAST_NORM.
+//
+// - For open/openat/openat2 without O_CREAT, the last path component is
+// handled by fs/namei.c:do_last(), which does the same, though without the
+// LAST_NORM check.
+//
// Preconditions: !rp.Done().
func (rp *ResolvingPath) ShouldFollowSymlink() bool {
- // Non-final symlinks are always followed.
- return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final()
+ // Non-final symlinks are always followed. Paths terminated with '/' are also
+ // always followed.
+ return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() || rp.MustBeDir()
}
// HandleSymlink is called when the current path component is a symbolic link
@@ -345,29 +322,34 @@ func (rp *ResolvingPath) ShouldFollowSymlink() bool {
// symlink target and returns nil. Otherwise it returns a non-nil error.
//
// Preconditions: !rp.Done().
+//
+// Postconditions: If HandleSymlink returns a nil error, then !rp.Done().
func (rp *ResolvingPath) HandleSymlink(target string) error {
if rp.symlinks >= linux.MaxSymlinkTraversals {
return syserror.ELOOP
}
- targetPath, err := fspath.Parse(target)
- if err != nil {
- return err
+ if len(target) == 0 {
+ return syserror.ENOENT
}
rp.symlinks++
+ targetPath := fspath.Parse(target)
if targetPath.Absolute {
rp.absSymlinkTarget = targetPath
return resolveAbsSymlinkError{}
}
- if !targetPath.Begin.Ok() {
- panic(fmt.Sprintf("symbolic link has non-empty target %q that is both relative and has no path components?", target))
- }
// Consume the path component that represented the symlink.
rp.Advance()
// Prepend the symlink target to the relative path.
+ if checkInvariants {
+ if !targetPath.HasComponents() {
+ panic(fmt.Sprintf("non-empty pathname %q parsed to relative path with no components", target))
+ }
+ }
rp.relpathPrepend(targetPath)
return nil
}
+// Preconditions: path.HasComponents().
func (rp *ResolvingPath) relpathPrepend(path fspath.Path) {
if rp.pit.Ok() {
rp.parts[rp.curPart] = rp.pit
@@ -385,11 +367,32 @@ func (rp *ResolvingPath) relpathPrepend(path fspath.Path) {
}
}
+// HandleJump is called when the current path component is a "magic" link to
+// the given VirtualDentry, like /proc/[pid]/fd/[fd]. If the calling Filesystem
+// method should continue path traversal, HandleMagicSymlink updates the path
+// component stream to reflect the magic link target and returns nil. Otherwise
+// it returns a non-nil error.
+//
+// Preconditions: !rp.Done().
+func (rp *ResolvingPath) HandleJump(target VirtualDentry) error {
+ if rp.symlinks >= linux.MaxSymlinkTraversals {
+ return syserror.ELOOP
+ }
+ rp.symlinks++
+ // Consume the path component that represented the magic link.
+ rp.Advance()
+ // Unconditionally return a resolveMountRootOrJumpError, even if the Mount
+ // isn't changing, to force restarting at the new Dentry.
+ target.IncRef()
+ rp.nextMount = target.mount
+ rp.nextStart = target.dentry
+ return resolveMountRootOrJumpError{}
+}
+
func (rp *ResolvingPath) handleError(err error) bool {
switch err.(type) {
- case resolveMountRootError:
- // Switch to the new Mount. We hold references on the Mount and Dentry
- // (from VFS.getMountpointAt()).
+ case resolveMountRootOrJumpError:
+ // Switch to the new Mount. We hold references on the Mount and Dentry.
rp.decRefStartAndMount()
rp.mount = rp.nextMount
rp.start = rp.nextStart
@@ -407,9 +410,8 @@ func (rp *ResolvingPath) handleError(err error) bool {
return true
case resolveMountPointError:
- // Switch to the new Mount. We hold a reference on the Mount (from
- // VFS.getMountAt()), but borrow the reference on the mount root from
- // the Mount.
+ // Switch to the new Mount. We hold a reference on the Mount, but
+ // borrow the reference on the mount root from the Mount.
rp.decRefStartAndMount()
rp.mount = rp.nextMount
rp.start = rp.nextMount.root
@@ -447,6 +449,17 @@ func (rp *ResolvingPath) handleError(err error) bool {
}
}
+// canHandleError returns true if err is an error returned by rp.Resolve*()
+// that rp.handleError() may attempt to handle.
+func (rp *ResolvingPath) canHandleError(err error) bool {
+ switch err.(type) {
+ case resolveMountRootOrJumpError, resolveMountPointError, resolveAbsSymlinkError:
+ return true
+ default:
+ return false
+ }
+}
+
// MustBeDir returns true if the file traversed by rp must be a directory.
func (rp *ResolvingPath) MustBeDir() bool {
return rp.mustBeDir
diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go
deleted file mode 100644
index abde0feaa..000000000
--- a/pkg/sentry/vfs/syscalls.go
+++ /dev/null
@@ -1,235 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-// PathOperation specifies the path operated on by a VFS method.
-//
-// PathOperation is passed to VFS methods by pointer to reduce memory copying:
-// it's somewhat large and should never escape. (Options structs are passed by
-// pointer to VFS and FileDescription methods for the same reason.)
-type PathOperation struct {
- // Root is the VFS root. References on Root are borrowed from the provider
- // of the PathOperation.
- //
- // Invariants: Root.Ok().
- Root VirtualDentry
-
- // Start is the starting point for the path traversal. References on Start
- // are borrowed from the provider of the PathOperation (i.e. the caller of
- // the VFS method to which the PathOperation was passed).
- //
- // Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root.
- Start VirtualDentry
-
- // Path is the pathname traversed by this operation.
- Pathname string
-
- // If FollowFinalSymlink is true, and the Dentry traversed by the final
- // path component represents a symbolic link, the symbolic link should be
- // followed.
- FollowFinalSymlink bool
-}
-
-// GetDentryAt returns a VirtualDentry representing the given path, at which a
-// file must exist. A reference is taken on the returned VirtualDentry.
-func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
- rp, err := vfs.getResolvingPath(creds, pop)
- if err != nil {
- return VirtualDentry{}, err
- }
- for {
- d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
- if err == nil {
- vd := VirtualDentry{
- mount: rp.mount,
- dentry: d,
- }
- rp.mount.incRef()
- vfs.putResolvingPath(rp)
- return vd, nil
- }
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- return VirtualDentry{}, err
- }
- }
-}
-
-// MkdirAt creates a directory at the given path.
-func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
- // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
- // also honored." - mkdir(2)
- opts.Mode &= 01777
- rp, err := vfs.getResolvingPath(creds, pop)
- if err != nil {
- return err
- }
- for {
- err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
- if err == nil {
- vfs.putResolvingPath(rp)
- return nil
- }
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- return err
- }
- }
-}
-
-// MknodAt creates a file of the given mode at the given path. It returns an
-// error from the syserror package.
-func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
- rp, err := vfs.getResolvingPath(creds, pop)
- if err != nil {
- return nil
- }
- for {
- if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil {
- vfs.putResolvingPath(rp)
- return nil
- }
- // Handle mount traversals.
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- return err
- }
- }
-}
-
-// OpenAt returns a FileDescription providing access to the file at the given
-// path. A reference is taken on the returned FileDescription.
-func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
- // Remove:
- //
- // - O_LARGEFILE, which we always report in FileDescription status flags
- // since only 64-bit architectures are supported at this time.
- //
- // - O_CLOEXEC, which affects file descriptors and therefore must be
- // handled outside of VFS.
- //
- // - Unknown flags.
- opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
- // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
- if opts.Flags&linux.O_SYNC != 0 {
- opts.Flags |= linux.O_DSYNC
- }
- // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
- // with O_DIRECTORY and a writable access mode (to ensure that it fails on
- // filesystem implementations that do not support it).
- if opts.Flags&linux.O_TMPFILE != 0 {
- if opts.Flags&linux.O_DIRECTORY == 0 {
- return nil, syserror.EINVAL
- }
- if opts.Flags&linux.O_CREAT != 0 {
- return nil, syserror.EINVAL
- }
- if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY {
- return nil, syserror.EINVAL
- }
- }
- // O_PATH causes most other flags to be ignored.
- if opts.Flags&linux.O_PATH != 0 {
- opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
- }
- // "On Linux, the following bits are also honored in mode: [S_ISUID,
- // S_ISGID, S_ISVTX]" - open(2)
- opts.Mode &= 07777
-
- if opts.Flags&linux.O_NOFOLLOW != 0 {
- pop.FollowFinalSymlink = false
- }
- rp, err := vfs.getResolvingPath(creds, pop)
- if err != nil {
- return nil, err
- }
- if opts.Flags&linux.O_DIRECTORY != 0 {
- rp.mustBeDir = true
- rp.mustBeDirOrig = true
- }
- for {
- fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
- if err == nil {
- vfs.putResolvingPath(rp)
- return fd, nil
- }
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- return nil, err
- }
- }
-}
-
-// StatAt returns metadata for the file at the given path.
-func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
- rp, err := vfs.getResolvingPath(creds, pop)
- if err != nil {
- return linux.Statx{}, err
- }
- for {
- stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
- if err == nil {
- vfs.putResolvingPath(rp)
- return stat, nil
- }
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- return linux.Statx{}, err
- }
- }
-}
-
-// StatusFlags returns file description status flags.
-func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) {
- flags, err := fd.impl.StatusFlags(ctx)
- flags |= linux.O_LARGEFILE
- return flags, err
-}
-
-// SetStatusFlags sets file description status flags.
-func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
- return fd.impl.SetStatusFlags(ctx, flags)
-}
-
-// TODO:
-//
-// - VFS.SyncAllFilesystems() for sync(2)
-//
-// - Something for syncfs(2)
-//
-// - VFS.LinkAt()
-//
-// - VFS.ReadlinkAt()
-//
-// - VFS.RenameAt()
-//
-// - VFS.RmdirAt()
-//
-// - VFS.SetStatAt()
-//
-// - VFS.StatFSAt()
-//
-// - VFS.SymlinkAt()
-//
-// - VFS.UnlinkAt()
-//
-// - FileDescription.(almost everything)
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
deleted file mode 100644
index 70b192ece..000000000
--- a/pkg/sentry/vfs/testutil.go
+++ /dev/null
@@ -1,139 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-// FDTestFilesystemType is a test-only FilesystemType that produces Filesystems
-// for which all FilesystemImpl methods taking a path return EPERM. It is used
-// to produce Mounts and Dentries for testing of FileDescriptionImpls that do
-// not depend on their originating Filesystem.
-type FDTestFilesystemType struct{}
-
-// FDTestFilesystem is a test-only FilesystemImpl produced by
-// FDTestFilesystemType.
-type FDTestFilesystem struct {
- vfsfs Filesystem
-}
-
-// NewFilesystem implements FilesystemType.NewFilesystem.
-func (fstype FDTestFilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) {
- var fs FDTestFilesystem
- fs.vfsfs.Init(&fs)
- return &fs.vfsfs, fs.NewDentry(), nil
-}
-
-// Release implements FilesystemImpl.Release.
-func (fs *FDTestFilesystem) Release() {
-}
-
-// Sync implements FilesystemImpl.Sync.
-func (fs *FDTestFilesystem) Sync(ctx context.Context) error {
- return nil
-}
-
-// GetDentryAt implements FilesystemImpl.GetDentryAt.
-func (fs *FDTestFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) {
- return nil, syserror.EPERM
-}
-
-// LinkAt implements FilesystemImpl.LinkAt.
-func (fs *FDTestFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
- return syserror.EPERM
-}
-
-// MkdirAt implements FilesystemImpl.MkdirAt.
-func (fs *FDTestFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error {
- return syserror.EPERM
-}
-
-// MknodAt implements FilesystemImpl.MknodAt.
-func (fs *FDTestFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error {
- return syserror.EPERM
-}
-
-// OpenAt implements FilesystemImpl.OpenAt.
-func (fs *FDTestFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) {
- return nil, syserror.EPERM
-}
-
-// ReadlinkAt implements FilesystemImpl.ReadlinkAt.
-func (fs *FDTestFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) {
- return "", syserror.EPERM
-}
-
-// RenameAt implements FilesystemImpl.RenameAt.
-func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error {
- return syserror.EPERM
-}
-
-// RmdirAt implements FilesystemImpl.RmdirAt.
-func (fs *FDTestFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error {
- return syserror.EPERM
-}
-
-// SetStatAt implements FilesystemImpl.SetStatAt.
-func (fs *FDTestFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error {
- return syserror.EPERM
-}
-
-// StatAt implements FilesystemImpl.StatAt.
-func (fs *FDTestFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) {
- return linux.Statx{}, syserror.EPERM
-}
-
-// StatFSAt implements FilesystemImpl.StatFSAt.
-func (fs *FDTestFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) {
- return linux.Statfs{}, syserror.EPERM
-}
-
-// SymlinkAt implements FilesystemImpl.SymlinkAt.
-func (fs *FDTestFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error {
- return syserror.EPERM
-}
-
-// UnlinkAt implements FilesystemImpl.UnlinkAt.
-func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error {
- return syserror.EPERM
-}
-
-type fdTestDentry struct {
- vfsd Dentry
-}
-
-// NewDentry returns a new Dentry.
-func (fs *FDTestFilesystem) NewDentry() *Dentry {
- var d fdTestDentry
- d.vfsd.Init(&d)
- return &d.vfsd
-}
-
-// IncRef implements DentryImpl.IncRef.
-func (d *fdTestDentry) IncRef(vfsfs *Filesystem) {
-}
-
-// TryIncRef implements DentryImpl.TryIncRef.
-func (d *fdTestDentry) TryIncRef(vfsfs *Filesystem) bool {
- return true
-}
-
-// DecRef implements DentryImpl.DecRef.
-func (d *fdTestDentry) DecRef(vfsfs *Filesystem) {
-}
diff --git a/pkg/sentry/vfs/timerfd.go b/pkg/sentry/vfs/timerfd.go
new file mode 100644
index 000000000..42b880656
--- /dev/null
+++ b/pkg/sentry/vfs/timerfd.go
@@ -0,0 +1,142 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/context"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// TimerFileDescription implements FileDescriptionImpl for timer fds. It also
+// implements ktime.TimerListener.
+type TimerFileDescription struct {
+ vfsfd FileDescription
+ FileDescriptionDefaultImpl
+ DentryMetadataFileDescriptionImpl
+
+ events waiter.Queue
+ timer *ktime.Timer
+
+ // val is the number of timer expirations since the last successful
+ // call to PRead, or SetTime. val must be accessed using atomic memory
+ // operations.
+ val uint64
+}
+
+var _ FileDescriptionImpl = (*TimerFileDescription)(nil)
+var _ ktime.TimerListener = (*TimerFileDescription)(nil)
+
+// NewTimerFD returns a new timer fd.
+func (vfs *VirtualFilesystem) NewTimerFD(clock ktime.Clock, flags uint32) (*FileDescription, error) {
+ vd := vfs.NewAnonVirtualDentry("[timerfd]")
+ defer vd.DecRef()
+ tfd := &TimerFileDescription{}
+ tfd.timer = ktime.NewTimer(clock, tfd)
+ if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+ UseDentryMetadata: true,
+ DenyPRead: true,
+ DenyPWrite: true,
+ InvalidWrite: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &tfd.vfsfd, nil
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+ const sizeofUint64 = 8
+ if dst.NumBytes() < sizeofUint64 {
+ return 0, syserror.EINVAL
+ }
+ if val := atomic.SwapUint64(&tfd.val, 0); val != 0 {
+ var buf [sizeofUint64]byte
+ usermem.ByteOrder.PutUint64(buf[:], val)
+ if _, err := dst.CopyOut(ctx, buf[:]); err != nil {
+ // Linux does not undo consuming the number of
+ // expirations even if writing to userspace fails.
+ return 0, err
+ }
+ return sizeofUint64, nil
+ }
+ return 0, syserror.ErrWouldBlock
+}
+
+// Clock returns the timer fd's Clock.
+func (tfd *TimerFileDescription) Clock() ktime.Clock {
+ return tfd.timer.Clock()
+}
+
+// GetTime returns the associated Timer's setting and the time at which it was
+// observed.
+func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) {
+ return tfd.timer.Get()
+}
+
+// SetTime atomically changes the associated Timer's setting, resets the number
+// of expirations to 0, and returns the previous setting and the time at which
+// it was observed.
+func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) {
+ return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) })
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+ var ready waiter.EventMask
+ if atomic.LoadUint64(&tfd.val) != 0 {
+ ready |= waiter.EventIn
+ }
+ return ready
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ tfd.events.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) {
+ tfd.events.EventUnregister(e)
+}
+
+// PauseTimer pauses the associated Timer.
+func (tfd *TimerFileDescription) PauseTimer() {
+ tfd.timer.Pause()
+}
+
+// ResumeTimer resumes the associated Timer.
+func (tfd *TimerFileDescription) ResumeTimer() {
+ tfd.timer.Resume()
+}
+
+// Release implements FileDescriptionImpl.Release()
+func (tfd *TimerFileDescription) Release() {
+ tfd.timer.Destroy()
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
+ atomic.AddUint64(&tfd.val, exp)
+ tfd.events.Notify(waiter.EventIn)
+ return ktime.Setting{}, false
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (tfd *TimerFileDescription) Destroy() {}
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 4a8a69540..9015f2cc1 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -16,24 +16,44 @@
//
// Lock order:
//
-// Filesystem implementation locks
-// VirtualFilesystem.mountMu
+// EpollInstance.interestMu
+// FileDescription.epollMu
+// FilesystemImpl/FileDescriptionImpl locks
+// VirtualFilesystem.mountMu
+// Dentry.mu
+// Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
+// VirtualFilesystem.filesystemsMu
+// EpollInstance.mu
// VirtualFilesystem.fsTypesMu
+//
+// Locking Dentry.mu in multiple Dentries requires holding
+// VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple
+// EpollInstances requires holding epollCycleMu.
package vfs
import (
- "sync"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
)
// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts.
//
// There is no analogue to the VirtualFilesystem type in Linux, as the
// equivalent state in Linux is global.
+//
+// +stateify savable
type VirtualFilesystem struct {
// mountMu serializes mount mutations.
//
// mountMu is analogous to Linux's namespace_sem.
- mountMu sync.RWMutex
+ mountMu sync.Mutex `state:"nosave"`
// mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
// are uniquely namespaced, including mount parent in the key correctly
@@ -52,7 +72,7 @@ type VirtualFilesystem struct {
// mountpoints maps mount points to mounts at those points in all
// namespaces. mountpoints is protected by mountMu.
//
- // mountpoints is used to find mounts that must be unmounted due to
+ // mountpoints is used to find mounts that must be umounted due to
// removal of a mount point Dentry from another mount namespace. ("A file
// or directory that is a mount point in one namespace that is not a mount
// point in another namespace, may be renamed, unlinked, or removed
@@ -62,20 +82,693 @@ type VirtualFilesystem struct {
// mountpoints is analogous to Linux's mountpoint_hashtable.
mountpoints map[*Dentry]map[*Mount]struct{}
- // fsTypes contains all FilesystemTypes that are usable in the
- // VirtualFilesystem. fsTypes is protected by fsTypesMu.
- fsTypesMu sync.RWMutex
- fsTypes map[string]FilesystemType
+ // anonMount is a Mount, not included in mounts or mountpoints,
+ // representing an anonFilesystem. anonMount is used to back
+ // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
+ // anonMount is immutable.
+ //
+ // anonMount is analogous to Linux's anon_inode_mnt.
+ anonMount *Mount
+
+ // devices contains all registered Devices. devices is protected by
+ // devicesMu.
+ devicesMu sync.RWMutex `state:"nosave"`
+ devices map[devTuple]*registeredDevice
+
+ // anonBlockDevMinor contains all allocated anonymous block device minor
+ // numbers. anonBlockDevMinorNext is a lower bound for the smallest
+ // unallocated anonymous block device number. anonBlockDevMinorNext and
+ // anonBlockDevMinor are protected by anonBlockDevMinorMu.
+ anonBlockDevMinorMu sync.Mutex `state:"nosave"`
+ anonBlockDevMinorNext uint32
+ anonBlockDevMinor map[uint32]struct{}
+
+ // fsTypes contains all registered FilesystemTypes. fsTypes is protected by
+ // fsTypesMu.
+ fsTypesMu sync.RWMutex `state:"nosave"`
+ fsTypes map[string]*registeredFilesystemType
+
+ // filesystems contains all Filesystems. filesystems is protected by
+ // filesystemsMu.
+ filesystemsMu sync.Mutex `state:"nosave"`
+ filesystems map[*Filesystem]struct{}
}
-// New returns a new VirtualFilesystem with no mounts or FilesystemTypes.
-func New() *VirtualFilesystem {
- vfs := &VirtualFilesystem{
- mountpoints: make(map[*Dentry]map[*Mount]struct{}),
- fsTypes: make(map[string]FilesystemType),
- }
+// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
+func (vfs *VirtualFilesystem) Init() error {
+ vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
+ vfs.devices = make(map[devTuple]*registeredDevice)
+ vfs.anonBlockDevMinorNext = 1
+ vfs.anonBlockDevMinor = make(map[uint32]struct{})
+ vfs.fsTypes = make(map[string]*registeredFilesystemType)
+ vfs.filesystems = make(map[*Filesystem]struct{})
vfs.mounts.Init()
- return vfs
+
+ // Construct vfs.anonMount.
+ anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
+ if err != nil {
+ // This shouldn't be possible since anonBlockDevMinorNext was
+ // initialized to 1 above (no device numbers have been allocated yet).
+ panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err))
+ }
+ anonfs := anonFilesystem{
+ devMinor: anonfsDevMinor,
+ }
+ anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs)
+ defer anonfs.vfsfs.DecRef()
+ anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{})
+ if err != nil {
+ // We should not be passing any MountOptions that would cause
+ // construction of this mount to fail.
+ panic(fmt.Sprintf("VirtualFilesystem.Init: anonfs mount failed: %v", err))
+ }
+ vfs.anonMount = anonMount
+
+ return nil
+}
+
+// PathOperation specifies the path operated on by a VFS method.
+//
+// PathOperation is passed to VFS methods by pointer to reduce memory copying:
+// it's somewhat large and should never escape. (Options structs are passed by
+// pointer to VFS and FileDescription methods for the same reason.)
+type PathOperation struct {
+ // Root is the VFS root. References on Root are borrowed from the provider
+ // of the PathOperation.
+ //
+ // Invariants: Root.Ok().
+ Root VirtualDentry
+
+ // Start is the starting point for the path traversal. References on Start
+ // are borrowed from the provider of the PathOperation (i.e. the caller of
+ // the VFS method to which the PathOperation was passed).
+ //
+ // Invariants: Start.Ok(). If Path.Absolute, then Start == Root.
+ Start VirtualDentry
+
+ // Path is the pathname traversed by this operation.
+ Path fspath.Path
+
+ // If FollowFinalSymlink is true, and the Dentry traversed by the final
+ // path component represents a symbolic link, the symbolic link should be
+ // followed.
+ FollowFinalSymlink bool
+}
+
+// AccessAt checks whether a user with creds has access to the file at
+// the given path.
+func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// GetDentryAt returns a VirtualDentry representing the given path, at which a
+// file must exist. A reference is taken on the returned VirtualDentry.
+func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
+ if err == nil {
+ vd := VirtualDentry{
+ mount: rp.mount,
+ dentry: d,
+ }
+ rp.mount.IncRef()
+ vfs.putResolvingPath(rp)
+ return vd, nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return VirtualDentry{}, err
+ }
+ }
+}
+
+// Preconditions: pop.Path.Begin.Ok().
+func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp)
+ if err == nil {
+ parentVD := VirtualDentry{
+ mount: rp.mount,
+ dentry: parent,
+ }
+ rp.mount.IncRef()
+ name := rp.Component()
+ vfs.putResolvingPath(rp)
+ return parentVD, name, nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return VirtualDentry{}, "", err
+ }
+ }
+}
+
+// LinkAt creates a hard link at newpop representing the existing file at
+// oldpop.
+func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error {
+ oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{})
+ if err != nil {
+ return err
+ }
+
+ if !newpop.Path.Begin.Ok() {
+ oldVD.DecRef()
+ if newpop.Path.Absolute {
+ return syserror.EEXIST
+ }
+ return syserror.ENOENT
+ }
+ if newpop.FollowFinalSymlink {
+ oldVD.DecRef()
+ ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink")
+ return syserror.EINVAL
+ }
+
+ rp := vfs.getResolvingPath(creds, newpop)
+ for {
+ err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ oldVD.DecRef()
+ return nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ oldVD.DecRef()
+ return err
+ }
+ }
+}
+
+// MkdirAt creates a directory at the given path.
+func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
+ if !pop.Path.Begin.Ok() {
+ if pop.Path.Absolute {
+ return syserror.EEXIST
+ }
+ return syserror.ENOENT
+ }
+ if pop.FollowFinalSymlink {
+ ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink")
+ return syserror.EINVAL
+ }
+ // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
+ // also honored." - mkdir(2)
+ opts.Mode &= 0777 | linux.S_ISVTX
+
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// MknodAt creates a file of the given mode at the given path. It returns an
+// error from the syserror package.
+func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
+ if !pop.Path.Begin.Ok() {
+ if pop.Path.Absolute {
+ return syserror.EEXIST
+ }
+ return syserror.ENOENT
+ }
+ if pop.FollowFinalSymlink {
+ ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink")
+ return syserror.EINVAL
+ }
+
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// BoundEndpointAt gets the bound endpoint at the given path, if one exists.
+func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (transport.BoundEndpoint, error) {
+ if !pop.Path.Begin.Ok() {
+ if pop.Path.Absolute {
+ return nil, syserror.ECONNREFUSED
+ }
+ return nil, syserror.ENOENT
+ }
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return bep, nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return nil, err
+ }
+ }
+}
+
+// OpenAt returns a FileDescription providing access to the file at the given
+// path. A reference is taken on the returned FileDescription.
+func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
+ // Remove:
+ //
+ // - O_CLOEXEC, which affects file descriptors and therefore must be
+ // handled outside of VFS.
+ //
+ // - Unknown flags.
+ opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_LARGEFILE | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
+ // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
+ if opts.Flags&linux.O_SYNC != 0 {
+ opts.Flags |= linux.O_DSYNC
+ }
+ // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
+ // with O_DIRECTORY and a writable access mode (to ensure that it fails on
+ // filesystem implementations that do not support it).
+ if opts.Flags&linux.O_TMPFILE != 0 {
+ if opts.Flags&linux.O_DIRECTORY == 0 {
+ return nil, syserror.EINVAL
+ }
+ if opts.Flags&linux.O_CREAT != 0 {
+ return nil, syserror.EINVAL
+ }
+ if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY {
+ return nil, syserror.EINVAL
+ }
+ }
+ // O_PATH causes most other flags to be ignored.
+ if opts.Flags&linux.O_PATH != 0 {
+ opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
+ }
+ // "On Linux, the following bits are also honored in mode: [S_ISUID,
+ // S_ISGID, S_ISVTX]" - open(2)
+ opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
+
+ if opts.Flags&linux.O_NOFOLLOW != 0 {
+ pop.FollowFinalSymlink = false
+ }
+ rp := vfs.getResolvingPath(creds, pop)
+ if opts.Flags&linux.O_DIRECTORY != 0 {
+ rp.mustBeDir = true
+ rp.mustBeDirOrig = true
+ }
+ for {
+ fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+
+ if opts.FileExec {
+ if fd.Mount().flags.NoExec {
+ fd.DecRef()
+ return nil, syserror.EACCES
+ }
+
+ // Only a regular file can be executed.
+ stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
+ if err != nil {
+ fd.DecRef()
+ return nil, err
+ }
+ if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG {
+ fd.DecRef()
+ return nil, syserror.EACCES
+ }
+ }
+
+ return fd, nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return nil, err
+ }
+ }
+}
+
+// ReadlinkAt returns the target of the symbolic link at the given path.
+func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return target, nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return "", err
+ }
+ }
+}
+
+// RenameAt renames the file at oldpop to newpop.
+func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error {
+ if !oldpop.Path.Begin.Ok() {
+ if oldpop.Path.Absolute {
+ return syserror.EBUSY
+ }
+ return syserror.ENOENT
+ }
+ if oldpop.FollowFinalSymlink {
+ ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink")
+ return syserror.EINVAL
+ }
+
+ oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop)
+ if err != nil {
+ return err
+ }
+ if oldName == "." || oldName == ".." {
+ oldParentVD.DecRef()
+ return syserror.EBUSY
+ }
+
+ if !newpop.Path.Begin.Ok() {
+ oldParentVD.DecRef()
+ if newpop.Path.Absolute {
+ return syserror.EBUSY
+ }
+ return syserror.ENOENT
+ }
+ if newpop.FollowFinalSymlink {
+ oldParentVD.DecRef()
+ ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink")
+ return syserror.EINVAL
+ }
+
+ rp := vfs.getResolvingPath(creds, newpop)
+ renameOpts := *opts
+ if oldpop.Path.Dir {
+ renameOpts.MustBeDir = true
+ }
+ for {
+ err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ oldParentVD.DecRef()
+ return nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ oldParentVD.DecRef()
+ return err
+ }
+ }
+}
+
+// RmdirAt removes the directory at the given path.
+func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
+ if !pop.Path.Begin.Ok() {
+ if pop.Path.Absolute {
+ return syserror.EBUSY
+ }
+ return syserror.ENOENT
+ }
+ if pop.FollowFinalSymlink {
+ ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink")
+ return syserror.EINVAL
+ }
+
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.RmdirAt(ctx, rp)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// SetStatAt changes metadata for the file at the given path.
+func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// StatAt returns metadata for the file at the given path.
+func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return stat, nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return linux.Statx{}, err
+ }
+ }
+}
+
+// StatFSAt returns metadata for the filesystem containing the file at the
+// given path.
+func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return statfs, nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return linux.Statfs{}, err
+ }
+ }
+}
+
+// SymlinkAt creates a symbolic link at the given path with the given target.
+func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
+ if !pop.Path.Begin.Ok() {
+ if pop.Path.Absolute {
+ return syserror.EEXIST
+ }
+ return syserror.ENOENT
+ }
+ if pop.FollowFinalSymlink {
+ ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink")
+ return syserror.EINVAL
+ }
+
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// UnlinkAt deletes the non-directory file at the given path.
+func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
+ if !pop.Path.Begin.Ok() {
+ if pop.Path.Absolute {
+ return syserror.EBUSY
+ }
+ return syserror.ENOENT
+ }
+ if pop.FollowFinalSymlink {
+ ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink")
+ return syserror.EINVAL
+ }
+
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.UnlinkAt(ctx, rp)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// ListxattrAt returns all extended attribute names for the file at the given
+// path.
+func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return names, nil
+ }
+ if err == syserror.ENOTSUP {
+ // Linux doesn't actually return ENOTSUP in this case; instead,
+ // fs/xattr.c:vfs_listxattr() falls back to allowing the security
+ // subsystem to return security extended attributes, which by
+ // default don't exist.
+ vfs.putResolvingPath(rp)
+ return nil, nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return nil, err
+ }
+ }
+}
+
+// GetxattrAt returns the value associated with the given extended attribute
+// for the file at the given path.
+func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return val, nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return "", err
+ }
+ }
+}
+
+// SetxattrAt changes the value associated with the given extended attribute
+// for the file at the given path.
+func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// RemovexattrAt removes the given extended attribute from the file at rp.
+func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// SyncAllFilesystems has the semantics of Linux's sync(2).
+func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
+ fss := make(map[*Filesystem]struct{})
+ vfs.filesystemsMu.Lock()
+ for fs := range vfs.filesystems {
+ if !fs.TryIncRef() {
+ continue
+ }
+ fss[fs] = struct{}{}
+ }
+ vfs.filesystemsMu.Unlock()
+ var retErr error
+ for fs := range fss {
+ if err := fs.impl.Sync(ctx); err != nil && retErr == nil {
+ retErr = err
+ }
+ fs.DecRef()
+ }
+ return retErr
}
// A VirtualDentry represents a node in a VFS tree, by combining a Dentry
@@ -97,11 +790,21 @@ func New() *VirtualFilesystem {
// VirtualDentry methods require that a reference is held on the VirtualDentry.
//
// VirtualDentry is analogous to Linux's struct path.
+//
+// +stateify savable
type VirtualDentry struct {
mount *Mount
dentry *Dentry
}
+// MakeVirtualDentry creates a VirtualDentry.
+func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry {
+ return VirtualDentry{
+ mount: mount,
+ dentry: dentry,
+ }
+}
+
// Ok returns true if vd is not empty. It does not require that a reference is
// held.
func (vd VirtualDentry) Ok() bool {
@@ -111,15 +814,15 @@ func (vd VirtualDentry) Ok() bool {
// IncRef increments the reference counts on the Mount and Dentry represented
// by vd.
func (vd VirtualDentry) IncRef() {
- vd.mount.incRef()
- vd.dentry.incRef(vd.mount.fs)
+ vd.mount.IncRef()
+ vd.dentry.IncRef()
}
// DecRef decrements the reference counts on the Mount and Dentry represented
// by vd.
func (vd VirtualDentry) DecRef() {
- vd.dentry.decRef(vd.mount.fs)
- vd.mount.decRef()
+ vd.dentry.DecRef()
+ vd.mount.DecRef()
}
// Mount returns the Mount associated with vd. It does not take a reference on