summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/vfs
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/vfs')
-rw-r--r--pkg/sentry/vfs/BUILD26
-rw-r--r--pkg/sentry/vfs/README.md4
-rw-r--r--pkg/sentry/vfs/anonfs.go66
-rw-r--r--pkg/sentry/vfs/context.go31
-rw-r--r--pkg/sentry/vfs/dentry.go278
-rw-r--r--pkg/sentry/vfs/device.go5
-rw-r--r--pkg/sentry/vfs/epoll.go25
-rw-r--r--pkg/sentry/vfs/file_description.go89
-rw-r--r--pkg/sentry/vfs/file_description_impl_util.go13
-rw-r--r--pkg/sentry/vfs/file_description_impl_util_test.go10
-rw-r--r--pkg/sentry/vfs/filesystem.go67
-rw-r--r--pkg/sentry/vfs/filesystem_impl_util.go26
-rw-r--r--pkg/sentry/vfs/filesystem_type.go4
-rw-r--r--pkg/sentry/vfs/genericfstree/BUILD16
-rw-r--r--pkg/sentry/vfs/genericfstree/genericfstree.go81
-rw-r--r--pkg/sentry/vfs/inotify.go697
-rw-r--r--pkg/sentry/vfs/memxattr/BUILD15
-rw-r--r--pkg/sentry/vfs/memxattr/xattr.go102
-rw-r--r--pkg/sentry/vfs/mount.go354
-rw-r--r--pkg/sentry/vfs/mount_test.go2
-rw-r--r--pkg/sentry/vfs/mount_unsafe.go22
-rw-r--r--pkg/sentry/vfs/options.go78
-rw-r--r--pkg/sentry/vfs/pathname.go49
-rw-r--r--pkg/sentry/vfs/permissions.go54
-rw-r--r--pkg/sentry/vfs/resolving_path.go103
-rw-r--r--pkg/sentry/vfs/vfs.go156
26 files changed, 1899 insertions, 474 deletions
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 14b39eb9d..774cc66cc 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -15,6 +15,18 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "event_list",
+ out = "event_list.go",
+ package = "vfs",
+ prefix = "event",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*Event",
+ "Linker": "*Event",
+ },
+)
+
go_library(
name = "vfs",
srcs = [
@@ -25,11 +37,13 @@ go_library(
"device.go",
"epoll.go",
"epoll_interest_list.go",
+ "event_list.go",
"file_description.go",
"file_description_impl_util.go",
"filesystem.go",
"filesystem_impl_util.go",
"filesystem_type.go",
+ "inotify.go",
"mount.go",
"mount_unsafe.go",
"options.go",
@@ -41,16 +55,28 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
+ "//pkg/binary",
"//pkg/context",
+ "//pkg/fd",
+ "//pkg/fdnotifier",
"//pkg/fspath",
+ "//pkg/gohacks",
+ "//pkg/log",
+ "//pkg/safemem",
"//pkg/sentry/arch",
+ "//pkg/sentry/fs",
"//pkg/sentry/fs/lock",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/time",
+ "//pkg/sentry/limits",
"//pkg/sentry/memmap",
+ "//pkg/sentry/socket/unix/transport",
+ "//pkg/sentry/uniqueid",
"//pkg/sync",
"//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",
+ "@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
index 9aa133bcb..66f3105bd 100644
--- a/pkg/sentry/vfs/README.md
+++ b/pkg/sentry/vfs/README.md
@@ -39,8 +39,8 @@ Mount references are held by:
- Mount: Each referenced Mount holds a reference on its parent, which is the
mount containing its mount point.
-- VirtualFilesystem: A reference is held on each Mount that has not been
- umounted.
+- VirtualFilesystem: A reference is held on each Mount that has been connected
+ to a mount point, but not yet umounted.
MountNamespace and FileDescription references are held by users of VFS. The
expectation is that each `kernel.Task` holds a reference on its corresponding
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index 2db25be49..b7c6b60b8 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -41,7 +42,27 @@ func (vfs *VirtualFilesystem) NewAnonVirtualDentry(name string) VirtualDentry {
}
}
-const anonfsBlockSize = usermem.PageSize // via fs/libfs.c:pseudo_fs_fill_super()
+const (
+ anonfsBlockSize = usermem.PageSize // via fs/libfs.c:pseudo_fs_fill_super()
+
+ // Mode, UID, and GID for a generic anonfs file.
+ anonFileMode = 0600 // no type is correct
+ anonFileUID = auth.RootKUID
+ anonFileGID = auth.RootKGID
+)
+
+// anonFilesystemType implements FilesystemType.
+type anonFilesystemType struct{}
+
+// GetFilesystem implements FilesystemType.GetFilesystem.
+func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *auth.Credentials, string, GetFilesystemOptions) (*Filesystem, *Dentry, error) {
+ panic("cannot instaniate an anon filesystem")
+}
+
+// Name implemenents FilesystemType.Name.
+func (anonFilesystemType) Name() string {
+ return "none"
+}
// anonFilesystem is the implementation of FilesystemImpl that backs
// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
@@ -69,6 +90,14 @@ func (fs *anonFilesystem) Sync(ctx context.Context) error {
return nil
}
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error {
+ if !rp.Done() {
+ return syserror.ENOTDIR
+ }
+ return GenericCheckPermissions(creds, ats, anonFileMode, anonFileUID, anonFileGID)
+}
+
// GetDentryAt implements FilesystemImpl.GetDentryAt.
func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) {
if !rp.Done() {
@@ -167,13 +196,13 @@ func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts St
Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
Blksize: anonfsBlockSize,
Nlink: 1,
- UID: uint32(auth.RootKUID),
- GID: uint32(auth.RootKGID),
- Mode: 0600, // no type is correct
+ UID: uint32(anonFileUID),
+ GID: uint32(anonFileGID),
+ Mode: anonFileMode,
Ino: 1,
Size: 0,
Blocks: 0,
- DevMajor: 0,
+ DevMajor: linux.UNNAMED_MAJOR,
DevMinor: fs.devMinor,
}, nil
}
@@ -205,8 +234,19 @@ func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error
return syserror.EPERM
}
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) {
+ if !rp.Final() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := GenericCheckPermissions(rp.Credentials(), MayWrite, anonFileMode, anonFileUID, anonFileGID); err != nil {
+ return nil, err
+ }
+ return nil, syserror.ECONNREFUSED
+}
+
// ListxattrAt implements FilesystemImpl.ListxattrAt.
-func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
+func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
if !rp.Done() {
return nil, syserror.ENOTDIR
}
@@ -214,7 +254,7 @@ func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([
}
// GetxattrAt implements FilesystemImpl.GetxattrAt.
-func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
+func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) {
if !rp.Done() {
return "", syserror.ENOTDIR
}
@@ -257,3 +297,15 @@ func (d *anonDentry) TryIncRef() bool {
func (d *anonDentry) DecRef() {
// no-op
}
+
+// InotifyWithParent implements DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {}
+
+// Watches implements DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *anonDentry) Watches() *Watches {
+ return nil
+}
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index d97362b9a..c9e724fef 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -29,9 +29,10 @@ const (
CtxRoot
)
-// MountNamespaceFromContext returns the MountNamespace used by ctx. It does
-// not take a reference on the returned MountNamespace. If ctx is not
-// associated with a MountNamespace, MountNamespaceFromContext returns nil.
+// MountNamespaceFromContext returns the MountNamespace used by ctx. If ctx is
+// not associated with a MountNamespace, MountNamespaceFromContext returns nil.
+//
+// A reference is taken on the returned MountNamespace.
func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
if v := ctx.Value(CtxMountNamespace); v != nil {
return v.(*MountNamespace)
@@ -48,3 +49,27 @@ func RootFromContext(ctx context.Context) VirtualDentry {
}
return VirtualDentry{}
}
+
+type rootContext struct {
+ context.Context
+ root VirtualDentry
+}
+
+// WithRoot returns a copy of ctx with the given root.
+func WithRoot(ctx context.Context, root VirtualDentry) context.Context {
+ return &rootContext{
+ Context: ctx,
+ root: root,
+ }
+}
+
+// Value implements Context.Value.
+func (rc rootContext) Value(key interface{}) interface{} {
+ switch key {
+ case CtxRoot:
+ rc.root.IncRef()
+ return rc.root
+ default:
+ return rc.Context.Value(key)
+ }
+}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 486a76475..24af13eb1 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -15,34 +15,17 @@
package vfs
import (
- "fmt"
"sync/atomic"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
-// Dentry represents a node in a Filesystem tree which may represent a file.
+// Dentry represents a node in a Filesystem tree at which a file exists.
//
// Dentries are reference-counted. Unless otherwise specified, all Dentry
// methods require that a reference is held.
//
-// A Dentry transitions through up to 3 different states through its lifetime:
-//
-// - Dentries are initially "independent". Independent Dentries have no parent,
-// and consequently no name.
-//
-// - Dentry.InsertChild() causes an independent Dentry to become a "child" of
-// another Dentry. A child node has a parent node, and a name in that parent,
-// both of which are mutable by DentryMoveChild(). Each child Dentry's name is
-// unique within its parent.
-//
-// - Dentry.RemoveChild() causes a child Dentry to become "disowned". A
-// disowned Dentry can still refer to its former parent and its former name in
-// said parent, but the disowned Dentry is no longer reachable from its parent,
-// and a new Dentry with the same name may become a child of the parent. (This
-// is analogous to a struct dentry being "unhashed" in Linux.)
-//
// Dentry is loosely analogous to Linux's struct dentry, but:
//
// - VFS does not associate Dentries with inodes. gVisor interacts primarily
@@ -57,9 +40,6 @@ import (
// and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do
// support inodes may store appropriate state in implementations of DentryImpl.
//
-// - VFS does not provide synchronization for mutable Dentry fields, other than
-// mount-related ones.
-//
// - VFS does not require that Dentries are instantiated for all paths accessed
// through VFS, only those that are tracked beyond the scope of a single
// Filesystem operation. This includes file descriptions, mount points, mount
@@ -67,41 +47,34 @@ import (
// of Dentries for operations on mutable remote filesystems that can't actually
// cache any state in the Dentry.
//
+// - VFS does not track filesystem structure (i.e. relationships between
+// Dentries), since both the relevant state and synchronization are
+// filesystem-specific.
+//
// - For the reasons above, VFS is not directly responsible for managing Dentry
// lifetime. Dentry reference counts only indicate the extent to which VFS
// requires Dentries to exist; Filesystems may elect to cache or discard
// Dentries with zero references.
+//
+// +stateify savable
type Dentry struct {
- // parent is this Dentry's parent in this Filesystem. If this Dentry is
- // independent, parent is nil.
- parent *Dentry
+ // mu synchronizes deletion/invalidation and mounting over this Dentry.
+ mu sync.Mutex `state:"nosave"`
- // name is this Dentry's name in parent.
- name string
-
- flags uint32
+ // dead is true if the file represented by this Dentry has been deleted (by
+ // CommitDeleteDentry or CommitRenameReplaceDentry) or invalidated (by
+ // InvalidateDentry). dead is protected by mu.
+ dead bool
// mounts is the number of Mounts for which this Dentry is Mount.point.
// mounts is accessed using atomic memory operations.
mounts uint32
- // children are child Dentries.
- children map[string]*Dentry
-
- // mu synchronizes disowning and mounting over this Dentry.
- mu sync.Mutex
-
// impl is the DentryImpl associated with this Dentry. impl is immutable.
// This should be the last field in Dentry.
impl DentryImpl
}
-const (
- // dflagsDisownedMask is set in Dentry.flags if the Dentry has been
- // disowned.
- dflagsDisownedMask = 1 << iota
-)
-
// Init must be called before first use of d.
func (d *Dentry) Init(impl DentryImpl) {
d.impl = impl
@@ -130,20 +103,22 @@ type DentryImpl interface {
// DecRef decrements the Dentry's reference count.
DecRef()
-}
-
-// IsDisowned returns true if d is disowned.
-func (d *Dentry) IsDisowned() bool {
- return atomic.LoadUint32(&d.flags)&dflagsDisownedMask != 0
-}
-
-// Preconditions: !d.IsDisowned().
-func (d *Dentry) setDisowned() {
- atomic.AddUint32(&d.flags, dflagsDisownedMask)
-}
-func (d *Dentry) isMounted() bool {
- return atomic.LoadUint32(&d.mounts) != 0
+ // InotifyWithParent notifies all watches on the targets represented by this
+ // dentry and its parent. The parent's watches are notified first, followed
+ // by this dentry's.
+ //
+ // InotifyWithParent automatically adds the IN_ISDIR flag for dentries
+ // representing directories.
+ //
+ // Note that the events may not actually propagate up to the user, depending
+ // on the event masks.
+ InotifyWithParent(events uint32, cookie uint32, et EventType)
+
+ // Watches returns the set of inotify watches for the file corresponding to
+ // the Dentry. Dentries that are hard links to the same underlying file
+ // share the same watches.
+ Watches() *Watches
}
// IncRef increments d's reference count.
@@ -162,104 +137,37 @@ func (d *Dentry) DecRef() {
d.impl.DecRef()
}
-// These functions are exported so that filesystem implementations can use
-// them. The vfs package, and users of VFS, should not call these functions.
-// Unless otherwise specified, these methods require that there are no
-// concurrent mutators of d.
-
-// Name returns d's name in its parent in its owning Filesystem. If d is
-// independent, Name returns an empty string.
-func (d *Dentry) Name() string {
- return d.name
-}
-
-// Parent returns d's parent in its owning Filesystem. It does not take a
-// reference on the returned Dentry. If d is independent, Parent returns nil.
-func (d *Dentry) Parent() *Dentry {
- return d.parent
+// IsDead returns true if d has been deleted or invalidated by its owning
+// filesystem.
+func (d *Dentry) IsDead() bool {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+ return d.dead
}
-// ParentOrSelf is equivalent to Parent, but returns d if d is independent.
-func (d *Dentry) ParentOrSelf() *Dentry {
- if d.parent == nil {
- return d
- }
- return d.parent
+func (d *Dentry) isMounted() bool {
+ return atomic.LoadUint32(&d.mounts) != 0
}
-// Child returns d's child with the given name in its owning Filesystem. It
-// does not take a reference on the returned Dentry. If no such child exists,
-// Child returns nil.
-func (d *Dentry) Child(name string) *Dentry {
- return d.children[name]
+// InotifyWithParent notifies all watches on the inodes for this dentry and
+// its parent of events.
+func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {
+ d.impl.InotifyWithParent(events, cookie, et)
}
-// HasChildren returns true if d has any children.
-func (d *Dentry) HasChildren() bool {
- return len(d.children) != 0
+// Watches returns the set of inotify watches associated with d.
+func (d *Dentry) Watches() *Watches {
+ return d.impl.Watches()
}
-// Children returns a map containing all of d's children.
-func (d *Dentry) Children() map[string]*Dentry {
- if !d.HasChildren() {
- return nil
- }
- m := make(map[string]*Dentry)
- for name, child := range d.children {
- m[name] = child
- }
- return m
-}
-
-// InsertChild makes child a child of d with the given name.
-//
-// InsertChild is a mutator of d and child.
-//
-// Preconditions: child must be an independent Dentry. d and child must be from
-// the same Filesystem. d must not already have a child with the given name.
-func (d *Dentry) InsertChild(child *Dentry, name string) {
- if checkInvariants {
- if _, ok := d.children[name]; ok {
- panic(fmt.Sprintf("parent already contains a child named %q", name))
- }
- if child.parent != nil || child.name != "" {
- panic(fmt.Sprintf("child is not independent: parent = %v, name = %q", child.parent, child.name))
- }
- }
- if d.children == nil {
- d.children = make(map[string]*Dentry)
- }
- d.children[name] = child
- child.parent = d
- child.name = name
-}
-
-// IsAncestorOf returns true if d is an ancestor of d2; that is, d is either
-// d2's parent or an ancestor of d2's parent.
-func (d *Dentry) IsAncestorOf(d2 *Dentry) bool {
- for d2.parent != nil {
- if d2.parent == d {
- return true
- }
- d2 = d2.parent
- }
- return false
-}
+// The following functions are exported so that filesystem implementations can
+// use them. The vfs package, and users of VFS, should not call these
+// functions.
// PrepareDeleteDentry must be called before attempting to delete the file
// represented by d. If PrepareDeleteDentry succeeds, the caller must call
// AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome.
-//
-// Preconditions: d is a child Dentry.
func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error {
- if checkInvariants {
- if d.parent == nil {
- panic("d is independent")
- }
- if d.IsDisowned() {
- panic("d is already disowned")
- }
- }
vfs.mountMu.Lock()
if mntns.mountpoints[d] != 0 {
vfs.mountMu.Unlock()
@@ -278,42 +186,27 @@ func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) {
d.mu.Unlock()
}
-// CommitDeleteDentry must be called after the file represented by d is
-// deleted, and causes d to become disowned.
-//
-// CommitDeleteDentry is a mutator of d and d.Parent().
-//
-// Preconditions: PrepareDeleteDentry was previously called on d.
+// CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion
+// succeeds.
func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
- if d.parent != nil {
- delete(d.parent.children, d.name)
- }
- d.setDisowned()
+ d.dead = true
d.mu.Unlock()
if d.isMounted() {
- vfs.forgetDisownedMountpoint(d)
+ vfs.forgetDeadMountpoint(d)
}
}
-// ForceDeleteDentry causes d to become disowned. It should only be used in
-// cases where VFS has no ability to stop the deletion (e.g. d represents the
-// local state of a file on a remote filesystem on which the file has already
-// been deleted).
-//
-// ForceDeleteDentry is a mutator of d and d.Parent().
-//
-// Preconditions: d is a child Dentry.
-func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) {
- if checkInvariants {
- if d.parent == nil {
- panic("d is independent")
- }
- if d.IsDisowned() {
- panic("d is already disowned")
- }
- }
+// InvalidateDentry is called when d ceases to represent the file it formerly
+// did for reasons outside of VFS' control (e.g. d represents the local state
+// of a file on a remote filesystem on which the file has already been
+// deleted).
+func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) {
d.mu.Lock()
- vfs.CommitDeleteDentry(d)
+ d.dead = true
+ d.mu.Unlock()
+ if d.isMounted() {
+ vfs.forgetDeadMountpoint(d)
+ }
}
// PrepareRenameDentry must be called before attempting to rename the file
@@ -322,25 +215,9 @@ func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) {
// caller must call AbortRenameDentry, CommitRenameReplaceDentry, or
// CommitRenameExchangeDentry depending on the rename's outcome.
//
-// Preconditions: from is a child Dentry. If to is not nil, it must be a child
-// Dentry from the same Filesystem. from != to.
+// Preconditions: If to is not nil, it must be a child Dentry from the same
+// Filesystem. from != to.
func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error {
- if checkInvariants {
- if from.parent == nil {
- panic("from is independent")
- }
- if from.IsDisowned() {
- panic("from is already disowned")
- }
- if to != nil {
- if to.parent == nil {
- panic("to is independent")
- }
- if to.IsDisowned() {
- panic("to is already disowned")
- }
- }
- }
vfs.mountMu.Lock()
if mntns.mountpoints[from] != 0 {
vfs.mountMu.Unlock()
@@ -374,24 +251,14 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) {
// is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file
// that was replaced by from.
//
-// CommitRenameReplaceDentry is a mutator of from, to, from.Parent(), and
-// to.Parent().
-//
// Preconditions: PrepareRenameDentry was previously called on from and to.
-// newParent.Child(newName) == to.
-func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) {
- if newParent.children == nil {
- newParent.children = make(map[string]*Dentry)
- }
- newParent.children[newName] = from
- from.parent = newParent
- from.name = newName
+func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, to *Dentry) {
from.mu.Unlock()
if to != nil {
- to.setDisowned()
+ to.dead = true
to.mu.Unlock()
if to.isMounted() {
- vfs.forgetDisownedMountpoint(to)
+ vfs.forgetDeadMountpoint(to)
}
}
}
@@ -399,25 +266,18 @@ func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry,
// CommitRenameExchangeDentry must be called after the files represented by
// from and to are exchanged by rename(RENAME_EXCHANGE).
//
-// CommitRenameExchangeDentry is a mutator of from, to, from.Parent(), and
-// to.Parent().
-//
// Preconditions: PrepareRenameDentry was previously called on from and to.
func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) {
- from.parent, to.parent = to.parent, from.parent
- from.name, to.name = to.name, from.name
- from.parent.children[from.name] = from
- to.parent.children[to.name] = to
from.mu.Unlock()
to.mu.Unlock()
}
-// forgetDisownedMountpoint is called when a mount point is deleted to umount
-// all mounts using it in all other mount namespaces.
+// forgetDeadMountpoint is called when a mount point is deleted or invalidated
+// to umount all mounts using it in all other mount namespaces.
//
-// forgetDisownedMountpoint is analogous to Linux's
+// forgetDeadMountpoint is analogous to Linux's
// fs/namespace.c:__detach_mounts().
-func (vfs *VirtualFilesystem) forgetDisownedMountpoint(d *Dentry) {
+func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) {
var (
vdsToDecRef []VirtualDentry
mountsToDecRef []*Mount
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
index 3af2aa58d..1e9dffc8f 100644
--- a/pkg/sentry/vfs/device.go
+++ b/pkg/sentry/vfs/device.go
@@ -56,6 +56,7 @@ type Device interface {
Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error)
}
+// +stateify savable
type registeredDevice struct {
dev Device
opts RegisterDeviceOptions
@@ -63,6 +64,8 @@ type registeredDevice struct {
// RegisterDeviceOptions contains options to
// VirtualFilesystem.RegisterDevice().
+//
+// +stateify savable
type RegisterDeviceOptions struct {
// GroupName is the name shown for this device registration in
// /proc/devices. If GroupName is empty, this registration will not be
@@ -100,7 +103,7 @@ func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mo
}
// GetAnonBlockDevMinor allocates and returns an unused minor device number for
-// an "anonymous" block device with major number 0.
+// an "anonymous" block device with major number UNNAMED_MAJOR.
func (vfs *VirtualFilesystem) GetAnonBlockDevMinor() (uint32, error) {
vfs.anonBlockDevMinorMu.Lock()
defer vfs.anonBlockDevMinorMu.Unlock()
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 7c83f9a5a..8297f964b 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -85,8 +85,8 @@ type epollInterest struct {
ready bool
epollInterestEntry
- // userData is the epoll_data_t associated with this epollInterest.
- // userData is protected by epoll.mu.
+ // userData is the struct epoll_event::data associated with this
+ // epollInterest. userData is protected by epoll.mu.
userData [2]int32
}
@@ -99,6 +99,8 @@ func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) {
interest: make(map[epollInterestKey]*epollInterest),
}
if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+ DenyPRead: true,
+ DenyPWrite: true,
UseDentryMetadata: true,
}); err != nil {
return nil, err
@@ -157,7 +159,7 @@ func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (
// AddInterest implements the semantics of EPOLL_CTL_ADD.
//
// Preconditions: A reference must be held on file.
-func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
// Check for cyclic polling if necessary.
subep, _ := file.impl.(*EpollInstance)
if subep != nil {
@@ -183,13 +185,14 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint
}
// Register interest in file.
- mask |= linux.EPOLLERR | linux.EPOLLRDHUP
+ mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
epi := &epollInterest{
epoll: ep,
key: key,
mask: mask,
- userData: userData,
+ userData: event.Data,
}
+ epi.waiter.Callback = epi
ep.interest[key] = epi
wmask := waiter.EventMaskFromLinux(mask)
file.EventRegister(&epi.waiter, wmask)
@@ -202,6 +205,9 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint
// Add epi to file.epolls so that it is removed when the last
// FileDescription reference is dropped.
file.epollMu.Lock()
+ if file.epolls == nil {
+ file.epolls = make(map[*epollInterest]struct{})
+ }
file.epolls[epi] = struct{}{}
file.epollMu.Unlock()
@@ -236,7 +242,7 @@ func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursi
// ModifyInterest implements the semantics of EPOLL_CTL_MOD.
//
// Preconditions: A reference must be held on file.
-func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
ep.interestMu.Lock()
defer ep.interestMu.Unlock()
@@ -250,13 +256,13 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask u
}
// Update epi for the next call to ep.ReadEvents().
+ mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP
ep.mu.Lock()
epi.mask = mask
- epi.userData = userData
+ epi.userData = event.Data
ep.mu.Unlock()
// Re-register with the new mask.
- mask |= linux.EPOLLERR | linux.EPOLLRDHUP
file.EventUnregister(&epi.waiter)
wmask := waiter.EventMaskFromLinux(mask)
file.EventRegister(&epi.waiter, wmask)
@@ -363,8 +369,7 @@ func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
// Report ievents.
events[i] = linux.EpollEvent{
Events: ievents.ToLinux(),
- Fd: epi.userData[0],
- Data: epi.userData[1],
+ Data: epi.userData,
}
i++
if i == len(events) {
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 5bac660c7..bb294563d 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -84,6 +84,13 @@ type FileDescriptionOptions struct {
// usually only the case if O_DIRECT would actually have an effect.
AllowDirectIO bool
+ // If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE.
+ DenyPRead bool
+
+ // If DenyPWrite is true, calls to FileDescription.PWrite() return
+ // ESPIPE.
+ DenyPWrite bool
+
// If UseDentryMetadata is true, calls to FileDescription methods that
// interact with file and filesystem metadata (Stat, SetStat, StatFS,
// Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
@@ -100,10 +107,10 @@ type FileDescriptionOptions struct {
}
// Init must be called before first use of fd. If it succeeds, it takes
-// references on mnt and d. statusFlags is the initial file description status
-// flags, which is usually the full set of flags passed to open(2).
-func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error {
- writable := MayWriteFileWithOpenFlags(statusFlags)
+// references on mnt and d. flags is the initial file description flags, which
+// is usually the full set of flags passed to open(2).
+func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error {
+ writable := MayWriteFileWithOpenFlags(flags)
if writable {
if err := mnt.CheckBeginWrite(); err != nil {
return err
@@ -111,14 +118,18 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn
}
fd.refs = 1
- fd.statusFlags = statusFlags | linux.O_LARGEFILE
+
+ // Remove "file creation flags" to mirror the behavior from file.f_flags in
+ // fs/open.c:do_dentry_open
+ fd.statusFlags = flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC)
fd.vd = VirtualDentry{
mount: mnt,
dentry: d,
}
- fd.vd.IncRef()
+ mnt.IncRef()
+ d.IncRef()
fd.opts = *opts
- fd.readable = MayReadFileWithOpenFlags(statusFlags)
+ fd.readable = MayReadFileWithOpenFlags(flags)
fd.writable = writable
fd.impl = impl
return nil
@@ -175,6 +186,12 @@ func (fd *FileDescription) DecRef() {
}
}
+// Refs returns the current number of references. The returned count
+// is inherently racy and is unsafe to use without external synchronization.
+func (fd *FileDescription) Refs() int64 {
+ return atomic.LoadInt64(&fd.refs)
+}
+
// Mount returns the mount on which fd was opened. It does not take a reference
// on the returned Mount.
func (fd *FileDescription) Mount() *Mount {
@@ -193,6 +210,11 @@ func (fd *FileDescription) VirtualDentry() VirtualDentry {
return fd.vd
}
+// Options returns the options passed to fd.Init().
+func (fd *FileDescription) Options() FileDescriptionOptions {
+ return fd.opts
+}
+
// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
func (fd *FileDescription) StatusFlags() uint32 {
return atomic.LoadUint32(&fd.statusFlags)
@@ -286,7 +308,8 @@ type FileDescriptionImpl interface {
Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)
// SetStat updates metadata for the file represented by the
- // FileDescription.
+ // FileDescription. Implementations are responsible for checking if the
+ // operation can be performed (see vfs.CheckSetStat() for common checks).
SetStat(ctx context.Context, opts SetStatOptions) error
// StatFS returns metadata for the filesystem containing the file
@@ -305,6 +328,7 @@ type FileDescriptionImpl interface {
// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
//
// Preconditions: The FileDescription was opened for reading.
+ // FileDescriptionOptions.DenyPRead == false.
PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
// Read is similar to PRead, but does not specify an offset.
@@ -336,6 +360,7 @@ type FileDescriptionImpl interface {
// EOPNOTSUPP.
//
// Preconditions: The FileDescription was opened for writing.
+ // FileDescriptionOptions.DenyPWrite == false.
PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
// Write is similar to PWrite, but does not specify an offset, which is
@@ -381,11 +406,11 @@ type FileDescriptionImpl interface {
Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
// Listxattr returns all extended attribute names for the file.
- Listxattr(ctx context.Context) ([]string, error)
+ Listxattr(ctx context.Context, size uint64) ([]string, error)
// Getxattr returns the value associated with the given extended attribute
// for the file.
- Getxattr(ctx context.Context, name string) (string, error)
+ Getxattr(ctx context.Context, opts GetxattrOptions) (string, error)
// Setxattr changes the value associated with the given extended attribute
// for the file.
@@ -435,11 +460,11 @@ type Dirent struct {
// IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents.
type IterDirentsCallback interface {
- // Handle handles the given iterated Dirent. It returns true if iteration
- // should continue, and false if FileDescriptionImpl.IterDirents should
- // terminate now and restart with the same Dirent the next time it is
- // called.
- Handle(dirent Dirent) bool
+ // Handle handles the given iterated Dirent. If Handle returns a non-nil
+ // error, FileDescriptionImpl.IterDirents must stop iteration and return
+ // the error; the next call to FileDescriptionImpl.IterDirents should
+ // restart with the same Dirent.
+ Handle(dirent Dirent) error
}
// OnClose is called when a file descriptor representing the FileDescription is
@@ -514,6 +539,9 @@ func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
// offset, and returns the number of bytes read. PRead is permitted to return
// partial reads with a nil error.
func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+ if fd.opts.DenyPRead {
+ return 0, syserror.ESPIPE
+ }
if !fd.readable {
return 0, syserror.EBADF
}
@@ -532,6 +560,9 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt
// offset, and returns the number of bytes written. PWrite is permitted to
// return partial writes with a nil error.
func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ if fd.opts.DenyPWrite {
+ return 0, syserror.ESPIPE
+ }
if !fd.writable {
return 0, syserror.EBADF
}
@@ -576,18 +607,23 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
// Listxattr returns all extended attribute names for the file represented by
// fd.
-func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
+//
+// If the size of the list (including a NUL terminating byte after every entry)
+// would exceed size, ERANGE may be returned. Note that implementations
+// are free to ignore size entirely and return without error). In all cases,
+// if size is 0, the list should be returned without error, regardless of size.
+func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp)
+ names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size)
vfsObj.putResolvingPath(rp)
return names, err
}
- names, err := fd.impl.Listxattr(ctx)
+ names, err := fd.impl.Listxattr(ctx, size)
if err == syserror.ENOTSUP {
// Linux doesn't actually return ENOTSUP in this case; instead,
// fs/xattr.c:vfs_listxattr() falls back to allowing the security
@@ -600,34 +636,39 @@ func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
// Getxattr returns the value associated with the given extended attribute for
// the file represented by fd.
-func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+//
+// If the size of the return value exceeds opts.Size, ERANGE may be returned
+// (note that implementations are free to ignore opts.Size entirely and return
+// without error). In all cases, if opts.Size is 0, the value should be
+// returned without error, regardless of size.
+func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name)
+ val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
vfsObj.putResolvingPath(rp)
return val, err
}
- return fd.impl.Getxattr(ctx, name)
+ return fd.impl.Getxattr(ctx, *opts)
}
// Setxattr changes the value associated with the given extended attribute for
// the file represented by fd.
-func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts)
+ err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
vfsObj.putResolvingPath(rp)
return err
}
- return fd.impl.Setxattr(ctx, opts)
+ return fd.impl.Setxattr(ctx, *opts)
}
// Removexattr removes the given extended attribute from the file represented
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index c2a52ec1b..f4c111926 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -33,8 +33,8 @@ import (
// implementations to adapt:
// - Have a local fileDescription struct (containing FileDescription) which
// embeds FileDescriptionDefaultImpl and overrides the default methods
-// which are common to all fd implementations for that for that filesystem
-// like StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc.
+// which are common to all fd implementations for that filesystem like
+// StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc.
// - This should be embedded in all file description implementations as the
// first field by value.
// - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl.
@@ -130,14 +130,14 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
// Listxattr implements FileDescriptionImpl.Listxattr analogously to
// inode_operations::listxattr == NULL in Linux.
-func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context) ([]string, error) {
+func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) {
// This isn't exactly accurate; see FileDescription.Listxattr.
return nil, syserror.ENOTSUP
}
// Getxattr implements FileDescriptionImpl.Getxattr analogously to
// inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, name string) (string, error) {
+func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) {
return "", syserror.ENOTSUP
}
@@ -339,6 +339,11 @@ func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src
if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
return 0, syserror.EOPNOTSUPP
}
+ limit, err := CheckLimit(ctx, offset, src.NumBytes())
+ if err != nil {
+ return 0, err
+ }
+ src = src.TakeFirst64(limit)
writable, ok := fd.data.(WritableDynamicBytesSource)
if !ok {
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 8fa26418e..3a75d4d62 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -107,7 +107,10 @@ func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error {
func TestGenCountFD(t *testing.T) {
ctx := contexttest.Context(t)
- vfsObj := New() // vfs.New()
+ vfsObj := &VirtualFilesystem{}
+ if err := vfsObj.Init(); err != nil {
+ t.Fatalf("VFS init: %v", err)
+ }
fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{})
defer fd.DecRef()
@@ -162,7 +165,10 @@ func TestGenCountFD(t *testing.T) {
func TestWritable(t *testing.T) {
ctx := contexttest.Context(t)
- vfsObj := New() // vfs.New()
+ vfsObj := &VirtualFilesystem{}
+ if err := vfsObj.Init(); err != nil {
+ t.Fatalf("VFS init: %v", err)
+ }
fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"})
defer fd.DecRef()
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index a06a6caf3..1edd584c9 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -20,6 +20,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
)
// A Filesystem is a tree of nodes represented by Dentries, which forms part of
@@ -29,6 +31,8 @@ import (
// Filesystem methods require that a reference is held.
//
// Filesystem is analogous to Linux's struct super_block.
+//
+// +stateify savable
type Filesystem struct {
// refs is the reference count. refs is accessed using atomic memory
// operations.
@@ -38,21 +42,30 @@ type Filesystem struct {
// immutable.
vfs *VirtualFilesystem
+ // fsType is the FilesystemType of this Filesystem.
+ fsType FilesystemType
+
// impl is the FilesystemImpl associated with this Filesystem. impl is
// immutable. This should be the last field in Dentry.
impl FilesystemImpl
}
// Init must be called before first use of fs.
-func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, impl FilesystemImpl) {
+func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) {
fs.refs = 1
fs.vfs = vfsObj
+ fs.fsType = fsType
fs.impl = impl
vfsObj.filesystemsMu.Lock()
vfsObj.filesystems[fs] = struct{}{}
vfsObj.filesystemsMu.Unlock()
}
+// FilesystemType returns the FilesystemType for this Filesystem.
+func (fs *Filesystem) FilesystemType() FilesystemType {
+ return fs.fsType
+}
+
// VirtualFilesystem returns the containing VirtualFilesystem.
func (fs *Filesystem) VirtualFilesystem() *VirtualFilesystem {
return fs.vfs
@@ -142,6 +155,9 @@ type FilesystemImpl interface {
// file data to be written to the underlying [filesystem]", as by syncfs(2).
Sync(ctx context.Context) error
+ // AccessAt checks whether a user with creds can access the file at rp.
+ AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error
+
// GetDentryAt returns a Dentry representing the file at rp. A reference is
// taken on the returned Dentry.
//
@@ -330,7 +346,10 @@ type FilesystemImpl interface {
// ENOTEMPTY.
//
// Preconditions: !rp.Done(). For the final path component in rp,
- // !rp.ShouldFollowSymlink(). oldName is not "." or "..".
+ // !rp.ShouldFollowSymlink(). oldParentVD.Dentry() was obtained from a
+ // previous call to
+ // oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). oldName is
+ // not "." or "..".
//
// Postconditions: If RenameAt returns an error returned by
// ResolvingPath.Resolve*(), then !rp.Done().
@@ -360,7 +379,9 @@ type FilesystemImpl interface {
// ResolvingPath.Resolve*(), then !rp.Done().
RmdirAt(ctx context.Context, rp *ResolvingPath) error
- // SetStatAt updates metadata for the file at the given path.
+ // SetStatAt updates metadata for the file at the given path. Implementations
+ // are responsible for checking if the operation can be performed
+ // (see vfs.CheckSetStat() for common checks).
//
// Errors:
//
@@ -422,9 +443,14 @@ type FilesystemImpl interface {
// Errors:
//
// - If extended attributes are not supported by the filesystem,
- // ListxattrAt returns nil. (See FileDescription.Listxattr for an
- // explanation.)
- ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error)
+ // ListxattrAt returns ENOTSUP.
+ //
+ // - If the size of the list (including a NUL terminating byte after every
+ // entry) would exceed size, ERANGE may be returned. Note that
+ // implementations are free to ignore size entirely and return without
+ // error). In all cases, if size is 0, the list should be returned without
+ // error, regardless of size.
+ ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
// GetxattrAt returns the value associated with the given extended
// attribute for the file at rp.
@@ -433,7 +459,15 @@ type FilesystemImpl interface {
//
// - If extended attributes are not supported by the filesystem, GetxattrAt
// returns ENOTSUP.
- GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error)
+ //
+ // - If an extended attribute named opts.Name does not exist, ENODATA is
+ // returned.
+ //
+ // - If the size of the return value exceeds opts.Size, ERANGE may be
+ // returned (note that implementations are free to ignore opts.Size entirely
+ // and return without error). In all cases, if opts.Size is 0, the value
+ // should be returned without error, regardless of size.
+ GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error)
// SetxattrAt changes the value associated with the given extended
// attribute for the file at rp.
@@ -442,6 +476,10 @@ type FilesystemImpl interface {
//
// - If extended attributes are not supported by the filesystem, SetxattrAt
// returns ENOTSUP.
+ //
+ // - If XATTR_CREATE is set in opts.Flag and opts.Name already exists,
+ // EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist,
+ // ENODATA is returned.
SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
// RemovexattrAt removes the given extended attribute from the file at rp.
@@ -450,8 +488,21 @@ type FilesystemImpl interface {
//
// - If extended attributes are not supported by the filesystem,
// RemovexattrAt returns ENOTSUP.
+ //
+ // - If name does not exist, ENODATA is returned.
RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
+ // BoundEndpointAt returns the Unix socket endpoint bound at the path rp.
+ //
+ // Errors:
+ //
+ // - If the file does not have write permissions, then BoundEndpointAt
+ // returns EACCES.
+ //
+ // - If a non-socket file exists at rp, then BoundEndpointAt returns
+ // ECONNREFUSED.
+ BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error)
+
// PrependPath prepends a path from vd to vd.Mount().Root() to b.
//
// If vfsroot.Ok(), it is the contextual VFS root; if it is encountered
@@ -474,7 +525,7 @@ type FilesystemImpl interface {
// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
- // TODO: inotify_add_watch(); bind()
+ // TODO(gvisor.dev/issue/1479): inotify_add_watch()
}
// PrependPathAtVFSRootError is returned by implementations of
diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go
index 7315a588e..465e610e0 100644
--- a/pkg/sentry/vfs/filesystem_impl_util.go
+++ b/pkg/sentry/vfs/filesystem_impl_util.go
@@ -16,8 +16,6 @@ package vfs
import (
"strings"
-
- "gvisor.dev/gvisor/pkg/fspath"
)
// GenericParseMountOptions parses a comma-separated list of options of the
@@ -43,27 +41,3 @@ func GenericParseMountOptions(str string) map[string]string {
}
return m
}
-
-// GenericPrependPath may be used by implementations of
-// FilesystemImpl.PrependPath() for which a single statically-determined lock
-// or set of locks is sufficient to ensure its preconditions (as opposed to
-// e.g. per-Dentry locks).
-//
-// Preconditions: Dentry.Name() and Dentry.Parent() must be held constant for
-// vd.Dentry() and all of its ancestors.
-func GenericPrependPath(vfsroot, vd VirtualDentry, b *fspath.Builder) error {
- mnt, d := vd.mount, vd.dentry
- for {
- if mnt == vfsroot.mount && d == vfsroot.dentry {
- return PrependPathAtVFSRootError{}
- }
- if d == mnt.root {
- return nil
- }
- if d.parent == nil {
- return PrependPathAtNonMountRootError{}
- }
- b.PrependComponent(d.name)
- d = d.parent
- }
-}
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index c58b70728..f2298f7f6 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -30,6 +30,9 @@ type FilesystemType interface {
// along with its mount root. A reference is taken on the returned
// Filesystem and Dentry.
GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error)
+
+ // Name returns the name of this FilesystemType.
+ Name() string
}
// GetFilesystemOptions contains options to FilesystemType.GetFilesystem.
@@ -44,6 +47,7 @@ type GetFilesystemOptions struct {
InternalData interface{}
}
+// +stateify savable
type registeredFilesystemType struct {
fsType FilesystemType
opts RegisterFilesystemTypeOptions
diff --git a/pkg/sentry/vfs/genericfstree/BUILD b/pkg/sentry/vfs/genericfstree/BUILD
new file mode 100644
index 000000000..d8fd92677
--- /dev/null
+++ b/pkg/sentry/vfs/genericfstree/BUILD
@@ -0,0 +1,16 @@
+load("//tools/go_generics:defs.bzl", "go_template")
+
+package(
+ default_visibility = ["//:sandbox"],
+ licenses = ["notice"],
+)
+
+go_template(
+ name = "generic_fstree",
+ srcs = [
+ "genericfstree.go",
+ ],
+ types = [
+ "Dentry",
+ ],
+)
diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go
new file mode 100644
index 000000000..8882fa84a
--- /dev/null
+++ b/pkg/sentry/vfs/genericfstree/genericfstree.go
@@ -0,0 +1,81 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package genericfstree provides tools for implementing vfs.FilesystemImpls
+// where a single statically-determined lock or set of locks is sufficient to
+// ensure that a Dentry's name and parent are contextually immutable.
+//
+// Clients using this package must use the go_template_instance rule in
+// tools/go_generics/defs.bzl to create an instantiation of this template
+// package, providing types to use in place of Dentry.
+package genericfstree
+
+import (
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// Dentry is a required type parameter that is a struct with the given fields.
+type Dentry struct {
+ // vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl.
+ vfsd vfs.Dentry
+
+ // parent is the parent of this Dentry in the filesystem's tree. If this
+ // Dentry is a filesystem root, parent is nil.
+ parent *Dentry
+
+ // name is the name of this Dentry in its parent. If this Dentry is a
+ // filesystem root, name is unspecified.
+ name string
+}
+
+// IsAncestorDentry returns true if d is an ancestor of d2; that is, d is
+// either d2's parent or an ancestor of d2's parent.
+func IsAncestorDentry(d, d2 *Dentry) bool {
+ for d2 != nil { // Stop at root, where d2.parent == nil.
+ if d2.parent == d {
+ return true
+ }
+ if d2.parent == d2 {
+ return false
+ }
+ d2 = d2.parent
+ }
+ return false
+}
+
+// ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d.
+func ParentOrSelf(d *Dentry) *Dentry {
+ if d.parent != nil {
+ return d.parent
+ }
+ return d
+}
+
+// PrependPath is a generic implementation of FilesystemImpl.PrependPath().
+func PrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath.Builder) error {
+ for {
+ if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
+ return vfs.PrependPathAtVFSRootError{}
+ }
+ if &d.vfsd == mnt.Root() {
+ return nil
+ }
+ if d.parent == nil {
+ return vfs.PrependPathAtNonMountRootError{}
+ }
+ b.PrependComponent(d.name)
+ d = d.parent
+ }
+}
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
new file mode 100644
index 000000000..05a3051a4
--- /dev/null
+++ b/pkg/sentry/vfs/inotify.go
@@ -0,0 +1,697 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "bytes"
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/uniqueid"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// inotifyEventBaseSize is the base size of linux's struct inotify_event. This
+// must be a power 2 for rounding below.
+const inotifyEventBaseSize = 16
+
+// EventType defines different kinds of inotfiy events.
+//
+// The way events are labelled appears somewhat arbitrary, but they must match
+// Linux so that IN_EXCL_UNLINK behaves as it does in Linux.
+type EventType uint8
+
+// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and
+// FSNOTIFY_EVENT_INODE in Linux.
+const (
+ PathEvent EventType = iota
+ InodeEvent EventType = iota
+)
+
+// Inotify represents an inotify instance created by inotify_init(2) or
+// inotify_init1(2). Inotify implements FileDescriptionImpl.
+//
+// Lock ordering:
+// Inotify.mu -> Watches.mu -> Inotify.evMu
+//
+// +stateify savable
+type Inotify struct {
+ vfsfd FileDescription
+ FileDescriptionDefaultImpl
+ DentryMetadataFileDescriptionImpl
+
+ // Unique identifier for this inotify instance. We don't just reuse the
+ // inotify fd because fds can be duped. These should not be exposed to the
+ // user, since we may aggressively reuse an id on S/R.
+ id uint64
+
+ // queue is used to notify interested parties when the inotify instance
+ // becomes readable or writable.
+ queue waiter.Queue `state:"nosave"`
+
+ // evMu *only* protects the events list. We need a separate lock while
+ // queuing events: using mu may violate lock ordering, since at that point
+ // the calling goroutine may already hold Watches.mu.
+ evMu sync.Mutex `state:"nosave"`
+
+ // A list of pending events for this inotify instance. Protected by evMu.
+ events eventList
+
+ // A scratch buffer, used to serialize inotify events. Allocate this
+ // ahead of time for the sake of performance. Protected by evMu.
+ scratch []byte
+
+ // mu protects the fields below.
+ mu sync.Mutex `state:"nosave"`
+
+ // nextWatchMinusOne is used to allocate watch descriptors on this Inotify
+ // instance. Note that Linux starts numbering watch descriptors from 1.
+ nextWatchMinusOne int32
+
+ // Map from watch descriptors to watch objects.
+ watches map[int32]*Watch
+}
+
+var _ FileDescriptionImpl = (*Inotify)(nil)
+
+// NewInotifyFD constructs a new Inotify instance.
+func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) {
+ // O_CLOEXEC affects file descriptors, so it must be handled outside of vfs.
+ flags &^= linux.O_CLOEXEC
+ if flags&^linux.O_NONBLOCK != 0 {
+ return nil, syserror.EINVAL
+ }
+
+ id := uniqueid.GlobalFromContext(ctx)
+ vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id))
+ defer vd.DecRef()
+ fd := &Inotify{
+ id: id,
+ scratch: make([]byte, inotifyEventBaseSize),
+ watches: make(map[int32]*Watch),
+ }
+ if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+ UseDentryMetadata: true,
+ DenyPRead: true,
+ DenyPWrite: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// Release implements FileDescriptionImpl.Release. Release removes all
+// watches and frees all resources for an inotify instance.
+func (i *Inotify) Release() {
+ // We need to hold i.mu to avoid a race with concurrent calls to
+ // Inotify.handleDeletion from Watches. There's no risk of Watches
+ // accessing this Inotify after the destructor ends, because we remove all
+ // references to it below.
+ i.mu.Lock()
+ defer i.mu.Unlock()
+ for _, w := range i.watches {
+ // Remove references to the watch from the watches set on the target. We
+ // don't need to worry about the references from i.watches, since this
+ // file description is about to be destroyed.
+ w.set.Remove(i.id)
+ }
+}
+
+// EventRegister implements waiter.Waitable.
+func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ i.queue.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.
+func (i *Inotify) EventUnregister(e *waiter.Entry) {
+ i.queue.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+//
+// Readiness indicates whether there are pending events for an inotify instance.
+func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask {
+ ready := waiter.EventMask(0)
+
+ i.evMu.Lock()
+ defer i.evMu.Unlock()
+
+ if !i.events.Empty() {
+ ready |= waiter.EventIn
+ }
+
+ return mask & ready
+}
+
+// PRead implements FileDescriptionImpl.
+func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+ return 0, syserror.ESPIPE
+}
+
+// PWrite implements FileDescriptionImpl.
+func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ return 0, syserror.ESPIPE
+}
+
+// Write implements FileDescriptionImpl.Write.
+func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+ if dst.NumBytes() < inotifyEventBaseSize {
+ return 0, syserror.EINVAL
+ }
+
+ i.evMu.Lock()
+ defer i.evMu.Unlock()
+
+ if i.events.Empty() {
+ // Nothing to read yet, tell caller to block.
+ return 0, syserror.ErrWouldBlock
+ }
+
+ var writeLen int64
+ for it := i.events.Front(); it != nil; {
+ // Advance `it` before the element is removed from the list, or else
+ // it.Next() will always be nil.
+ event := it
+ it = it.Next()
+
+ // Does the buffer have enough remaining space to hold the event we're
+ // about to write out?
+ if dst.NumBytes() < int64(event.sizeOf()) {
+ if writeLen > 0 {
+ // Buffer wasn't big enough for all pending events, but we did
+ // write some events out.
+ return writeLen, nil
+ }
+ return 0, syserror.EINVAL
+ }
+
+ // Linux always dequeues an available event as long as there's enough
+ // buffer space to copy it out, even if the copy below fails. Emulate
+ // this behaviour.
+ i.events.Remove(event)
+
+ // Buffer has enough space, copy event to the read buffer.
+ n, err := event.CopyTo(ctx, i.scratch, dst)
+ if err != nil {
+ return 0, err
+ }
+
+ writeLen += n
+ dst = dst.DropFirst64(n)
+ }
+ return writeLen, nil
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ switch args[1].Int() {
+ case linux.FIONREAD:
+ i.evMu.Lock()
+ defer i.evMu.Unlock()
+ var n uint32
+ for e := i.events.Front(); e != nil; e = e.Next() {
+ n += uint32(e.sizeOf())
+ }
+ var buf [4]byte
+ usermem.ByteOrder.PutUint32(buf[:], n)
+ _, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{})
+ return 0, err
+
+ default:
+ return 0, syserror.ENOTTY
+ }
+}
+
+func (i *Inotify) queueEvent(ev *Event) {
+ i.evMu.Lock()
+
+ // Check if we should coalesce the event we're about to queue with the last
+ // one currently in the queue. Events are coalesced if they are identical.
+ if last := i.events.Back(); last != nil {
+ if ev.equals(last) {
+ // "Coalesce" the two events by simply not queuing the new one. We
+ // don't need to raise a waiter.EventIn notification because no new
+ // data is available for reading.
+ i.evMu.Unlock()
+ return
+ }
+ }
+
+ i.events.PushBack(ev)
+
+ // Release mutex before notifying waiters because we don't control what they
+ // can do.
+ i.evMu.Unlock()
+
+ i.queue.Notify(waiter.EventIn)
+}
+
+// newWatchLocked creates and adds a new watch to target.
+//
+// Precondition: i.mu must be locked.
+func (i *Inotify) newWatchLocked(target *Dentry, mask uint32) *Watch {
+ targetWatches := target.Watches()
+ w := &Watch{
+ owner: i,
+ wd: i.nextWatchIDLocked(),
+ set: targetWatches,
+ mask: mask,
+ }
+
+ // Hold the watch in this inotify instance as well as the watch set on the
+ // target.
+ i.watches[w.wd] = w
+ targetWatches.Add(w)
+ return w
+}
+
+// newWatchIDLocked allocates and returns a new watch descriptor.
+//
+// Precondition: i.mu must be locked.
+func (i *Inotify) nextWatchIDLocked() int32 {
+ i.nextWatchMinusOne++
+ return i.nextWatchMinusOne
+}
+
+// handleDeletion handles the deletion of the target of watch w. It removes w
+// from i.watches and a watch removal event is generated.
+func (i *Inotify) handleDeletion(w *Watch) {
+ i.mu.Lock()
+ _, found := i.watches[w.wd]
+ delete(i.watches, w.wd)
+ i.mu.Unlock()
+
+ if found {
+ i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0))
+ }
+}
+
+// AddWatch constructs a new inotify watch and adds it to the target. It
+// returns the watch descriptor returned by inotify_add_watch(2).
+func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 {
+ // Note: Locking this inotify instance protects the result returned by
+ // Lookup() below. With the lock held, we know for sure the lookup result
+ // won't become stale because it's impossible for *this* instance to
+ // add/remove watches on target.
+ i.mu.Lock()
+ defer i.mu.Unlock()
+
+ // Does the target already have a watch from this inotify instance?
+ if existing := target.Watches().Lookup(i.id); existing != nil {
+ newmask := mask
+ if mask&linux.IN_MASK_ADD != 0 {
+ // "Add (OR) events to watch mask for this pathname if it already
+ // exists (instead of replacing mask)." -- inotify(7)
+ newmask |= atomic.LoadUint32(&existing.mask)
+ }
+ atomic.StoreUint32(&existing.mask, newmask)
+ return existing.wd
+ }
+
+ // No existing watch, create a new watch.
+ w := i.newWatchLocked(target, mask)
+ return w.wd
+}
+
+// RmWatch looks up an inotify watch for the given 'wd' and configures the
+// target to stop sending events to this inotify instance.
+func (i *Inotify) RmWatch(wd int32) error {
+ i.mu.Lock()
+
+ // Find the watch we were asked to removed.
+ w, ok := i.watches[wd]
+ if !ok {
+ i.mu.Unlock()
+ return syserror.EINVAL
+ }
+
+ // Remove the watch from this instance.
+ delete(i.watches, wd)
+
+ // Remove the watch from the watch target.
+ w.set.Remove(w.OwnerID())
+ i.mu.Unlock()
+
+ // Generate the event for the removal.
+ i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0))
+
+ return nil
+}
+
+// Watches is the collection of all inotify watches on a single file.
+//
+// +stateify savable
+type Watches struct {
+ // mu protects the fields below.
+ mu sync.RWMutex `state:"nosave"`
+
+ // ws is the map of active watches in this collection, keyed by the inotify
+ // instance id of the owner.
+ ws map[uint64]*Watch
+}
+
+// Lookup returns the watch owned by an inotify instance with the given id.
+// Returns nil if no such watch exists.
+//
+// Precondition: the inotify instance with the given id must be locked to
+// prevent the returned watch from being concurrently modified or replaced in
+// Inotify.watches.
+func (w *Watches) Lookup(id uint64) *Watch {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+ return w.ws[id]
+}
+
+// Add adds watch into this set of watches.
+//
+// Precondition: the inotify instance with the given id must be locked.
+func (w *Watches) Add(watch *Watch) {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+
+ owner := watch.OwnerID()
+ // Sanity check, we should never have two watches for one owner on the
+ // same target.
+ if _, exists := w.ws[owner]; exists {
+ panic(fmt.Sprintf("Watch collision with ID %+v", owner))
+ }
+ if w.ws == nil {
+ w.ws = make(map[uint64]*Watch)
+ }
+ w.ws[owner] = watch
+}
+
+// Remove removes a watch with the given id from this set of watches and
+// releases it. The caller is responsible for generating any watch removal
+// event, as appropriate. The provided id must match an existing watch in this
+// collection.
+//
+// Precondition: the inotify instance with the given id must be locked.
+func (w *Watches) Remove(id uint64) {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+
+ if w.ws == nil {
+ // This watch set is being destroyed. The thread executing the
+ // destructor is already in the process of deleting all our watches. We
+ // got here with no references on the target because we raced with the
+ // destructor notifying all the watch owners of destruction. See the
+ // comment in Watches.HandleDeletion for why this race exists.
+ return
+ }
+
+ if _, ok := w.ws[id]; !ok {
+ // While there's technically no problem with silently ignoring a missing
+ // watch, this is almost certainly a bug.
+ panic(fmt.Sprintf("Attempt to remove a watch, but no watch found with provided id %+v.", id))
+ }
+ delete(w.ws, id)
+}
+
+// Notify queues a new event with all watches in this set.
+func (w *Watches) Notify(name string, events, cookie uint32, et EventType) {
+ w.NotifyWithExclusions(name, events, cookie, et, false)
+}
+
+// NotifyWithExclusions queues a new event with watches in this set. Watches
+// with IN_EXCL_UNLINK are skipped if the event is coming from a child that
+// has been unlinked.
+func (w *Watches) NotifyWithExclusions(name string, events, cookie uint32, et EventType, unlinked bool) {
+ // N.B. We don't defer the unlocks because Notify is in the hot path of
+ // all IO operations, and the defer costs too much for small IO
+ // operations.
+ w.mu.RLock()
+ for _, watch := range w.ws {
+ if unlinked && watch.ExcludeUnlinkedChildren() && et == PathEvent {
+ continue
+ }
+ watch.Notify(name, events, cookie)
+ }
+ w.mu.RUnlock()
+}
+
+// HandleDeletion is called when the watch target is destroyed to emit
+// the appropriate events.
+func (w *Watches) HandleDeletion() {
+ w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent)
+
+ // TODO(gvisor.dev/issue/1479): This doesn't work because maps are not copied
+ // by value. Ideally, we wouldn't have this circular locking so we can just
+ // notify of IN_DELETE_SELF in the same loop below.
+ //
+ // We can't hold w.mu while calling watch.handleDeletion to preserve lock
+ // ordering w.r.t to the owner inotify instances. Instead, atomically move
+ // the watches map into a local variable so we can iterate over it safely.
+ //
+ // Because of this however, it is possible for the watches' owners to reach
+ // this inode while the inode has no refs. This is still safe because the
+ // owners can only reach the inode until this function finishes calling
+ // watch.handleDeletion below and the inode is guaranteed to exist in the
+ // meantime. But we still have to be very careful not to rely on inode state
+ // that may have been already destroyed.
+ var ws map[uint64]*Watch
+ w.mu.Lock()
+ ws = w.ws
+ w.ws = nil
+ w.mu.Unlock()
+
+ for _, watch := range ws {
+ // TODO(gvisor.dev/issue/1479): consider refactoring this.
+ watch.handleDeletion()
+ }
+}
+
+// Watch represent a particular inotify watch created by inotify_add_watch.
+//
+// +stateify savable
+type Watch struct {
+ // Inotify instance which owns this watch.
+ owner *Inotify
+
+ // Descriptor for this watch. This is unique across an inotify instance.
+ wd int32
+
+ // set is the watch set containing this watch. It belongs to the target file
+ // of this watch.
+ set *Watches
+
+ // Events being monitored via this watch. Must be accessed with atomic
+ // memory operations.
+ mask uint32
+}
+
+// OwnerID returns the id of the inotify instance that owns this watch.
+func (w *Watch) OwnerID() uint64 {
+ return w.owner.id
+}
+
+// ExcludeUnlinkedChildren indicates whether the watched object should continue
+// to be notified of events of its children after they have been unlinked, e.g.
+// for an open file descriptor.
+//
+// TODO(gvisor.dev/issue/1479): Implement IN_EXCL_UNLINK.
+// We can do this by keeping track of the set of unlinked children in Watches
+// to skip notification.
+func (w *Watch) ExcludeUnlinkedChildren() bool {
+ return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0
+}
+
+// Notify queues a new event on this watch.
+func (w *Watch) Notify(name string, events uint32, cookie uint32) {
+ mask := atomic.LoadUint32(&w.mask)
+ if mask&events == 0 {
+ // We weren't watching for this event.
+ return
+ }
+
+ // Event mask should include bits matched from the watch plus all control
+ // event bits.
+ unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS
+ effectiveMask := unmaskableBits | mask
+ matchedEvents := effectiveMask & events
+ w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie))
+}
+
+// handleDeletion handles the deletion of w's target.
+func (w *Watch) handleDeletion() {
+ w.owner.handleDeletion(w)
+}
+
+// Event represents a struct inotify_event from linux.
+//
+// +stateify savable
+type Event struct {
+ eventEntry
+
+ wd int32
+ mask uint32
+ cookie uint32
+
+ // len is computed based on the name field is set automatically by
+ // Event.setName. It should be 0 when no name is set; otherwise it is the
+ // length of the name slice.
+ len uint32
+
+ // The name field has special padding requirements and should only be set by
+ // calling Event.setName.
+ name []byte
+}
+
+func newEvent(wd int32, name string, events, cookie uint32) *Event {
+ e := &Event{
+ wd: wd,
+ mask: events,
+ cookie: cookie,
+ }
+ if name != "" {
+ e.setName(name)
+ }
+ return e
+}
+
+// paddedBytes converts a go string to a null-terminated c-string, padded with
+// null bytes to a total size of 'l'. 'l' must be large enough for all the bytes
+// in the 's' plus at least one null byte.
+func paddedBytes(s string, l uint32) []byte {
+ if l < uint32(len(s)+1) {
+ panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!")
+ }
+ b := make([]byte, l)
+ copy(b, s)
+
+ // b was zero-value initialized during make(), so the rest of the slice is
+ // already filled with null bytes.
+
+ return b
+}
+
+// setName sets the optional name for this event.
+func (e *Event) setName(name string) {
+ // We need to pad the name such that the entire event length ends up a
+ // multiple of inotifyEventBaseSize.
+ unpaddedLen := len(name) + 1
+ // Round up to nearest multiple of inotifyEventBaseSize.
+ e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1))
+ // Make sure we haven't overflowed and wrapped around when rounding.
+ if unpaddedLen > int(e.len) {
+ panic("Overflow when rounding inotify event size, the 'name' field was too big.")
+ }
+ e.name = paddedBytes(name, e.len)
+}
+
+func (e *Event) sizeOf() int {
+ s := inotifyEventBaseSize + int(e.len)
+ if s < inotifyEventBaseSize {
+ panic("overflow")
+ }
+ return s
+}
+
+// CopyTo serializes this event to dst. buf is used as a scratch buffer to
+// construct the output. We use a buffer allocated ahead of time for
+// performance. buf must be at least inotifyEventBaseSize bytes.
+func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) {
+ usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd))
+ usermem.ByteOrder.PutUint32(buf[4:], e.mask)
+ usermem.ByteOrder.PutUint32(buf[8:], e.cookie)
+ usermem.ByteOrder.PutUint32(buf[12:], e.len)
+
+ writeLen := 0
+
+ n, err := dst.CopyOut(ctx, buf)
+ if err != nil {
+ return 0, err
+ }
+ writeLen += n
+ dst = dst.DropFirst(n)
+
+ if e.len > 0 {
+ n, err = dst.CopyOut(ctx, e.name)
+ if err != nil {
+ return 0, err
+ }
+ writeLen += n
+ }
+
+ // Santiy check.
+ if writeLen != e.sizeOf() {
+ panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen))
+ }
+
+ return int64(writeLen), nil
+}
+
+func (e *Event) equals(other *Event) bool {
+ return e.wd == other.wd &&
+ e.mask == other.mask &&
+ e.cookie == other.cookie &&
+ e.len == other.len &&
+ bytes.Equal(e.name, other.name)
+}
+
+// InotifyEventFromStatMask generates the appropriate events for an operation
+// that set the stats specified in mask.
+func InotifyEventFromStatMask(mask uint32) uint32 {
+ var ev uint32
+ if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 {
+ ev |= linux.IN_ATTRIB
+ }
+ if mask&linux.STATX_SIZE != 0 {
+ ev |= linux.IN_MODIFY
+ }
+
+ if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) {
+ // Both times indicates a utime(s) call.
+ ev |= linux.IN_ATTRIB
+ } else if mask&linux.STATX_ATIME != 0 {
+ ev |= linux.IN_ACCESS
+ } else if mask&linux.STATX_MTIME != 0 {
+ mask |= linux.IN_MODIFY
+ }
+ return ev
+}
+
+// InotifyRemoveChild sends the appriopriate notifications to the watch sets of
+// the child being removed and its parent.
+func InotifyRemoveChild(self, parent *Watches, name string) {
+ self.Notify("", linux.IN_ATTRIB, 0, InodeEvent)
+ parent.Notify(name, linux.IN_DELETE, 0, InodeEvent)
+ // TODO(gvisor.dev/issue/1479): implement IN_EXCL_UNLINK.
+}
+
+// InotifyRename sends the appriopriate notifications to the watch sets of the
+// file being renamed and its old/new parents.
+func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) {
+ var dirEv uint32
+ if isDir {
+ dirEv = linux.IN_ISDIR
+ }
+ cookie := uniqueid.InotifyCookie(ctx)
+ oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent)
+ newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent)
+ // Somewhat surprisingly, self move events do not have a cookie.
+ renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent)
+}
diff --git a/pkg/sentry/vfs/memxattr/BUILD b/pkg/sentry/vfs/memxattr/BUILD
new file mode 100644
index 000000000..d8c4d27b9
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "memxattr",
+ srcs = ["xattr.go"],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/sentry/vfs",
+ "//pkg/sync",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go
new file mode 100644
index 000000000..cc1e7d764
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/xattr.go
@@ -0,0 +1,102 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memxattr provides a default, in-memory extended attribute
+// implementation.
+package memxattr
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// SimpleExtendedAttributes implements extended attributes using a map of
+// names to values.
+//
+// +stateify savable
+type SimpleExtendedAttributes struct {
+ // mu protects the below fields.
+ mu sync.RWMutex `state:"nosave"`
+ xattrs map[string]string
+}
+
+// Getxattr returns the value at 'name'.
+func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) {
+ x.mu.RLock()
+ value, ok := x.xattrs[opts.Name]
+ x.mu.RUnlock()
+ if !ok {
+ return "", syserror.ENODATA
+ }
+ // Check that the size of the buffer provided in getxattr(2) is large enough
+ // to contain the value.
+ if opts.Size != 0 && uint64(len(value)) > opts.Size {
+ return "", syserror.ERANGE
+ }
+ return value, nil
+}
+
+// Setxattr sets 'value' at 'name'.
+func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
+ x.mu.Lock()
+ defer x.mu.Unlock()
+ if x.xattrs == nil {
+ if opts.Flags&linux.XATTR_REPLACE != 0 {
+ return syserror.ENODATA
+ }
+ x.xattrs = make(map[string]string)
+ }
+
+ _, ok := x.xattrs[opts.Name]
+ if ok && opts.Flags&linux.XATTR_CREATE != 0 {
+ return syserror.EEXIST
+ }
+ if !ok && opts.Flags&linux.XATTR_REPLACE != 0 {
+ return syserror.ENODATA
+ }
+
+ x.xattrs[opts.Name] = opts.Value
+ return nil
+}
+
+// Listxattr returns all names in xattrs.
+func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
+ // Keep track of the size of the buffer needed in listxattr(2) for the list.
+ listSize := 0
+ x.mu.RLock()
+ names := make([]string, 0, len(x.xattrs))
+ for n := range x.xattrs {
+ names = append(names, n)
+ // Add one byte per null terminator.
+ listSize += len(n) + 1
+ }
+ x.mu.RUnlock()
+ if size != 0 && uint64(listSize) > size {
+ return nil, syserror.ERANGE
+ }
+ return names, nil
+}
+
+// Removexattr removes the xattr at 'name'.
+func (x *SimpleExtendedAttributes) Removexattr(name string) error {
+ x.mu.Lock()
+ defer x.mu.Unlock()
+ if _, ok := x.xattrs[name]; !ok {
+ return syserror.ENODATA
+ }
+ delete(x.xattrs, name)
+ return nil
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 1fbb420f9..32f901bd8 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -15,7 +15,11 @@
package vfs
import (
+ "bytes"
+ "fmt"
"math"
+ "sort"
+ "strings"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -38,14 +42,23 @@ import (
//
// Mount is analogous to Linux's struct mount. (gVisor does not distinguish
// between struct mount and struct vfsmount.)
+//
+// +stateify savable
type Mount struct {
- // vfs, fs, and root are immutable. References are held on fs and root.
+ // vfs, fs, root are immutable. References are held on fs and root.
//
// Invariant: root belongs to fs.
vfs *VirtualFilesystem
fs *Filesystem
root *Dentry
+ // ID is the immutable mount ID.
+ ID uint64
+
+ // Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
+ // for MS_RDONLY which is tracked in "writers". Immutable.
+ Flags MountFlags
+
// key is protected by VirtualFilesystem.mountMu and
// VirtualFilesystem.mounts.seq, and may be nil. References are held on
// key.parent and key.point if they are not nil.
@@ -79,13 +92,43 @@ type Mount struct {
writers int64
}
-// A MountNamespace is a collection of Mounts.
-//
+func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
+ mnt := &Mount{
+ ID: atomic.AddUint64(&vfs.lastMountID, 1),
+ Flags: opts.Flags,
+ vfs: vfs,
+ fs: fs,
+ root: root,
+ ns: mntns,
+ refs: 1,
+ }
+ if opts.ReadOnly {
+ mnt.setReadOnlyLocked(true)
+ }
+ return mnt
+}
+
+// Options returns a copy of the MountOptions currently applicable to mnt.
+func (mnt *Mount) Options() MountOptions {
+ mnt.vfs.mountMu.Lock()
+ defer mnt.vfs.mountMu.Unlock()
+ return MountOptions{
+ Flags: mnt.Flags,
+ ReadOnly: mnt.readOnly(),
+ }
+}
+
+// A MountNamespace is a collection of Mounts.//
// MountNamespaces are reference-counted. Unless otherwise specified, all
// MountNamespace methods require that a reference is held.
//
// MountNamespace is analogous to Linux's struct mnt_namespace.
+//
+// +stateify savable
type MountNamespace struct {
+ // Owner is the usernamespace that owns this mount namespace.
+ Owner *auth.UserNamespace
+
// root is the MountNamespace's root mount. root is immutable.
root *Mount
@@ -114,6 +157,7 @@ type MountNamespace struct {
func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
rft := vfs.getFilesystemType(fsTypeName)
if rft == nil {
+ ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
return nil, syserror.ENODEV
}
fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
@@ -121,49 +165,63 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
return nil, err
}
mntns := &MountNamespace{
+ Owner: creds.UserNamespace,
refs: 1,
mountpoints: make(map[*Dentry]uint32),
}
- mntns.root = &Mount{
- vfs: vfs,
- fs: fs,
- root: root,
- ns: mntns,
- refs: 1,
- }
+ mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{})
return mntns, nil
}
-// MountAt creates and mounts a Filesystem configured by the given arguments.
-func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+// NewDisconnectedMount returns a Mount representing fs with the given root
+// (which may be nil). The new Mount is not associated with any MountNamespace
+// and is not connected to any other Mounts. References are taken on fs and
+// root.
+func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) (*Mount, error) {
+ fs.IncRef()
+ if root != nil {
+ root.IncRef()
+ }
+ return newMount(vfs, fs, root, nil /* mntns */, opts), nil
+}
+
+// MountDisconnected creates a Filesystem configured by the given arguments,
+// then returns a Mount representing it. The new Mount is not associated with
+// any MountNamespace and is not connected to any other Mounts.
+func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) {
rft := vfs.getFilesystemType(fsTypeName)
if rft == nil {
- return syserror.ENODEV
+ return nil, syserror.ENODEV
}
if !opts.InternalMount && !rft.opts.AllowUserMount {
- return syserror.ENODEV
+ return nil, syserror.ENODEV
}
fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
if err != nil {
- return err
+ return nil, err
}
+ defer root.DecRef()
+ defer fs.DecRef()
+ return vfs.NewDisconnectedMount(fs, root, opts)
+}
+
+// ConnectMountAt connects mnt at the path represented by target.
+//
+// Preconditions: mnt must be disconnected.
+func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error {
// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
// lock ordering.
vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
if err != nil {
- root.DecRef()
- fs.DecRef()
return err
}
vfs.mountMu.Lock()
vd.dentry.mu.Lock()
for {
- if vd.dentry.IsDisowned() {
+ if vd.dentry.dead {
vd.dentry.mu.Unlock()
vfs.mountMu.Unlock()
vd.DecRef()
- root.DecRef()
- fs.DecRef()
return syserror.ENOENT
}
// vd might have been mounted over between vfs.GetDentryAt() and
@@ -192,17 +250,10 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
}
vd.dentry.mu.Lock()
}
- // TODO: Linux requires that either both the mount point and the mount root
- // are directories, or neither are, and returns ENOTDIR if this is not the
- // case.
+ // TODO(gvisor.dev/issue/1035): Linux requires that either both the mount
+ // point and the mount root are directories, or neither are, and returns
+ // ENOTDIR if this is not the case.
mntns := vd.mount.ns
- mnt := &Mount{
- vfs: vfs,
- fs: fs,
- root: root,
- ns: mntns,
- refs: 1,
- }
vfs.mounts.seq.BeginWrite()
vfs.connectLocked(mnt, vd, mntns)
vfs.mounts.seq.EndWrite()
@@ -211,6 +262,19 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
return nil
}
+// MountAt creates and mounts a Filesystem configured by the given arguments.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+ mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
+ if err != nil {
+ return err
+ }
+ defer mnt.DecRef()
+ if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil {
+ return err
+ }
+ return nil
+}
+
// UmountAt removes the Mount at the given path.
func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
@@ -218,6 +282,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
}
// MNT_FORCE is currently unimplemented except for the permission check.
+ // Force unmounting specifically requires CAP_SYS_ADMIN in the root user
+ // namespace, and not in the owner user namespace for the target mount. See
+ // fs/namespace.c:SYSCALL_DEFINE2(umount, ...)
if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
return syserror.EPERM
}
@@ -231,14 +298,17 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
return syserror.EINVAL
}
vfs.mountMu.Lock()
- if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns {
- vfs.mountMu.Unlock()
- return syserror.EINVAL
+ if mntns := MountNamespaceFromContext(ctx); mntns != nil {
+ defer mntns.DecRef()
+ if mntns != vd.mount.ns {
+ vfs.mountMu.Unlock()
+ return syserror.EINVAL
+ }
}
- // TODO(jamieliu): Linux special-cases umount of the caller's root, which
- // we don't implement yet (we'll just fail it since the caller holds a
- // reference on it).
+ // TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's
+ // root, which we don't implement yet (we'll just fail it since the caller
+ // holds a reference on it).
vfs.mounts.seq.BeginWrite()
if opts.Flags&linux.MNT_DETACH == 0 {
@@ -330,14 +400,22 @@ func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecu
// references held by vd.
//
// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
-// writer critical section. d.mu must be locked. mnt.parent() == nil.
+// writer critical section. d.mu must be locked. mnt.parent() == nil, i.e. mnt
+// must not already be connected.
func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
+ if checkInvariants {
+ if mnt.parent() != nil {
+ panic("VFS.connectLocked called on connected mount")
+ }
+ }
+ mnt.IncRef() // dropped by callers of umountRecursiveLocked
mnt.storeKey(vd)
if vd.mount.children == nil {
vd.mount.children = make(map[*Mount]struct{})
}
vd.mount.children[mnt] = struct{}{}
atomic.AddUint32(&vd.dentry.mounts, 1)
+ mnt.ns = mntns
mntns.mountpoints[vd.dentry]++
vfs.mounts.insertSeqed(mnt)
vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
@@ -355,6 +433,11 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns
// writer critical section. mnt.parent() != nil.
func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
vd := mnt.loadKey()
+ if checkInvariants {
+ if vd.mount != nil {
+ panic("VFS.disconnectLocked called on disconnected mount")
+ }
+ }
mnt.storeKey(VirtualDentry{})
delete(vd.mount.children, mnt)
atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
@@ -605,12 +688,34 @@ func (mnt *Mount) setReadOnlyLocked(ro bool) error {
return nil
}
+func (mnt *Mount) readOnly() bool {
+ return atomic.LoadInt64(&mnt.writers) < 0
+}
+
// Filesystem returns the mounted Filesystem. It does not take a reference on
// the returned Filesystem.
func (mnt *Mount) Filesystem() *Filesystem {
return mnt.fs
}
+// submountsLocked returns this Mount and all Mounts that are descendents of
+// it.
+//
+// Precondition: mnt.vfs.mountMu must be held.
+func (mnt *Mount) submountsLocked() []*Mount {
+ mounts := []*Mount{mnt}
+ for m := range mnt.children {
+ mounts = append(mounts, m.submountsLocked()...)
+ }
+ return mounts
+}
+
+// Root returns the mount's root. It does not take a reference on the returned
+// Dentry.
+func (mnt *Mount) Root() *Dentry {
+ return mnt.root
+}
+
// Root returns mntns' root. A reference is taken on the returned
// VirtualDentry.
func (mntns *MountNamespace) Root() VirtualDentry {
@@ -621,3 +726,178 @@ func (mntns *MountNamespace) Root() VirtualDentry {
vd.IncRef()
return vd
}
+
+// GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+ vfs.mountMu.Lock()
+ defer vfs.mountMu.Unlock()
+ rootMnt := taskRootDir.mount
+ mounts := rootMnt.submountsLocked()
+ sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+ for _, mnt := range mounts {
+ // Get the path to this mount relative to task root.
+ mntRootVD := VirtualDentry{
+ mount: mnt,
+ dentry: mnt.root,
+ }
+ path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+ if err != nil {
+ // For some reason we didn't get a path. Log a warning
+ // and run with empty path.
+ ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+ path = ""
+ }
+ if path == "" {
+ // Either an error occurred, or path is not reachable
+ // from root.
+ break
+ }
+
+ opts := "rw"
+ if mnt.readOnly() {
+ opts = "ro"
+ }
+ if mnt.Flags.NoATime {
+ opts = ",noatime"
+ }
+ if mnt.Flags.NoExec {
+ opts += ",noexec"
+ }
+
+ // Format:
+ // <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
+ //
+ // The "needs dump" and "fsck order" flags are always 0, which
+ // is allowed.
+ fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0)
+ }
+}
+
+// GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to
+// buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+ vfs.mountMu.Lock()
+ defer vfs.mountMu.Unlock()
+ rootMnt := taskRootDir.mount
+ mounts := rootMnt.submountsLocked()
+ sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+ for _, mnt := range mounts {
+ // Get the path to this mount relative to task root.
+ mntRootVD := VirtualDentry{
+ mount: mnt,
+ dentry: mnt.root,
+ }
+ path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+ if err != nil {
+ // For some reason we didn't get a path. Log a warning
+ // and run with empty path.
+ ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+ path = ""
+ }
+ if path == "" {
+ // Either an error occurred, or path is not reachable
+ // from root.
+ break
+ }
+ // Stat the mount root to get the major/minor device numbers.
+ pop := &PathOperation{
+ Root: mntRootVD,
+ Start: mntRootVD,
+ }
+ statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{})
+ if err != nil {
+ // Well that's not good. Ignore this mount.
+ break
+ }
+
+ // Format:
+ // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
+ // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11)
+
+ // (1) Mount ID.
+ fmt.Fprintf(buf, "%d ", mnt.ID)
+
+ // (2) Parent ID (or this ID if there is no parent).
+ pID := mnt.ID
+ if p := mnt.parent(); p != nil {
+ pID = p.ID
+ }
+ fmt.Fprintf(buf, "%d ", pID)
+
+ // (3) Major:Minor device ID. We don't have a superblock, so we
+ // just use the root inode device number.
+ fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor)
+
+ // (4) Root: the pathname of the directory in the filesystem
+ // which forms the root of this mount.
+ //
+ // NOTE(b/78135857): This will always be "/" until we implement
+ // bind mounts.
+ fmt.Fprintf(buf, "/ ")
+
+ // (5) Mount point (relative to process root).
+ fmt.Fprintf(buf, "%s ", manglePath(path))
+
+ // (6) Mount options.
+ opts := "rw"
+ if mnt.readOnly() {
+ opts = "ro"
+ }
+ if mnt.Flags.NoATime {
+ opts = ",noatime"
+ }
+ if mnt.Flags.NoExec {
+ opts += ",noexec"
+ }
+ fmt.Fprintf(buf, "%s ", opts)
+
+ // (7) Optional fields: zero or more fields of the form "tag[:value]".
+ // (8) Separator: the end of the optional fields is marked by a single hyphen.
+ fmt.Fprintf(buf, "- ")
+
+ // (9) Filesystem type.
+ fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name())
+
+ // (10) Mount source: filesystem-specific information or "none".
+ fmt.Fprintf(buf, "none ")
+
+ // (11) Superblock options, and final newline.
+ fmt.Fprintf(buf, "%s\n", superBlockOpts(path, mnt))
+ }
+}
+
+// manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents.
+// See Linux fs/seq_file.c:mangle_path.
+func manglePath(p string) string {
+ r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
+ return r.Replace(p)
+}
+
+// superBlockOpts returns the super block options string for the the mount at
+// the given path.
+func superBlockOpts(mountPath string, mnt *Mount) string {
+ // gVisor doesn't (yet) have a concept of super block options, so we
+ // use the ro/rw bit from the mount flag.
+ opts := "rw"
+ if mnt.readOnly() {
+ opts = "ro"
+ }
+
+ // NOTE(b/147673608): If the mount is a cgroup, we also need to include
+ // the cgroup name in the options. For now we just read that from the
+ // path.
+ //
+ // TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we
+ // should get this value from the cgroup itself, and not rely on the
+ // path.
+ if mnt.fs.FilesystemType().Name() == "cgroup" {
+ splitPath := strings.Split(mountPath, "/")
+ cgroupType := splitPath[len(splitPath)-1]
+ opts += "," + cgroupType
+ }
+ return opts
+}
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index 3b933468d..3335e4057 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -55,7 +55,7 @@ func TestMountTableInsertLookup(t *testing.T) {
}
}
-// TODO: concurrent lookup/insertion/removal
+// TODO(gvisor.dev/issue/1035): concurrent lookup/insertion/removal.
// must be powers of 2
var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8}
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index bd90d36c4..70f850ca4 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -13,7 +13,7 @@
// limitations under the License.
// +build go1.12
-// +build !go1.15
+// +build !go1.16
// Check go:linkname function signatures when updating Go version.
@@ -26,6 +26,7 @@ import (
"sync/atomic"
"unsafe"
+ "gvisor.dev/gvisor/pkg/gohacks"
"gvisor.dev/gvisor/pkg/sync"
)
@@ -64,6 +65,8 @@ func (mnt *Mount) storeKey(vd VirtualDentry) {
// (provided mutation is sufficiently uncommon).
//
// mountTable.Init() must be called on new mountTables before use.
+//
+// +stateify savable
type mountTable struct {
// mountTable is implemented as a seqcount-protected hash table that
// resolves collisions with linear probing, featuring Robin Hood insertion
@@ -75,8 +78,8 @@ type mountTable struct {
// intrinsics and inline assembly, limiting the performance of this
// approach.)
- seq sync.SeqCount
- seed uint32 // for hashing keys
+ seq sync.SeqCount `state:"nosave"`
+ seed uint32 // for hashing keys
// size holds both length (number of elements) and capacity (number of
// slots): capacity is stored as its base-2 log (referred to as order) in
@@ -89,7 +92,7 @@ type mountTable struct {
// length and cap in separate uint32s) for ~free.
size uint64
- slots unsafe.Pointer // []mountSlot; never nil after Init
+ slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init
}
type mountSlot struct {
@@ -158,7 +161,7 @@ func newMountTableSlots(cap uintptr) unsafe.Pointer {
// Lookup may be called even if there are concurrent mutators of mt.
func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
- hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
+ hash := memhash(gohacks.Noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
loop:
for {
@@ -359,12 +362,3 @@ func memhash(p unsafe.Pointer, seed, s uintptr) uintptr
//go:linkname rand32 runtime.fastrand
func rand32() uint32
-
-// This is copy/pasted from runtime.noescape(), and is needed because arguments
-// apparently escape from all functions defined by linkname.
-//
-//go:nosplit
-func noescape(p unsafe.Pointer) unsafe.Pointer {
- x := uintptr(p)
- return unsafe.Pointer(x ^ 0)
-}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index fdf8be157..f223aeda8 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -16,6 +16,7 @@ package vfs
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
)
// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and
@@ -32,6 +33,25 @@ type GetDentryOptions struct {
type MkdirOptions struct {
// Mode is the file mode bits for the created directory.
Mode linux.FileMode
+
+ // If ForSyntheticMountpoint is true, FilesystemImpl.MkdirAt() may create
+ // the given directory in memory only (as opposed to persistent storage).
+ // The created directory should be able to support the creation of
+ // subdirectories with ForSyntheticMountpoint == true. It does not need to
+ // support the creation of subdirectories with ForSyntheticMountpoint ==
+ // false, or files of other types.
+ //
+ // FilesystemImpls are permitted to ignore the ForSyntheticMountpoint
+ // option.
+ //
+ // The ForSyntheticMountpoint option exists because, unlike mount(2), the
+ // OCI Runtime Specification permits the specification of mount points that
+ // do not exist, under the expectation that container runtimes will create
+ // them. (More accurately, the OCI Runtime Specification completely fails
+ // to document this feature, but it's implemented by runc.)
+ // ForSyntheticMountpoint allows such mount points to be created even when
+ // the underlying persistent filesystem is immutable.
+ ForSyntheticMountpoint bool
}
// MknodOptions contains options to VirtualFilesystem.MknodAt() and
@@ -44,10 +64,31 @@ type MknodOptions struct {
// DevMinor are the major and minor device numbers for the created device.
DevMajor uint32
DevMinor uint32
+
+ // Endpoint is the endpoint to bind to the created file, if a socket file is
+ // being created for bind(2) on a Unix domain socket.
+ Endpoint transport.BoundEndpoint
+}
+
+// MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC.
+// MS_RDONLY is not part of MountFlags because it's tracked in Mount.writers.
+type MountFlags struct {
+ // NoExec is equivalent to MS_NOEXEC.
+ NoExec bool
+
+ // NoATime is equivalent to MS_NOATIME and indicates that the
+ // filesystem should not update access time in-place.
+ NoATime bool
}
// MountOptions contains options to VirtualFilesystem.MountAt().
type MountOptions struct {
+ // Flags contains flags as specified for mount(2), e.g. MS_NOEXEC.
+ Flags MountFlags
+
+ // ReadOnly is equivalent to MS_RDONLY.
+ ReadOnly bool
+
// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
GetFilesystemOptions GetFilesystemOptions
@@ -61,7 +102,7 @@ type MountOptions struct {
type OpenOptions struct {
// Flags contains access mode and flags as specified for open(2).
//
- // FilesystemImpls is reponsible for implementing the following flags:
+ // FilesystemImpls are responsible for implementing the following flags:
// O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC,
// O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and
// O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and
@@ -75,7 +116,8 @@ type OpenOptions struct {
// FileExec is set when the file is being opened to be executed.
// VirtualFilesystem.OpenAt() checks that the caller has execute permissions
- // on the file, and that the file is a regular file.
+ // on the file, that the file is a regular file, and that the mount doesn't
+ // have MS_NOEXEC set.
FileExec bool
}
@@ -113,6 +155,38 @@ type SetStatOptions struct {
Stat linux.Statx
}
+// BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt()
+// and FilesystemImpl.BoundEndpointAt().
+type BoundEndpointOptions struct {
+ // Addr is the path of the file whose socket endpoint is being retrieved.
+ // It is generally irrelevant: most endpoints are stored at a dentry that
+ // was created through a bind syscall, so the path can be stored on creation.
+ // However, if the endpoint was created in FilesystemImpl.BoundEndpointAt(),
+ // then we may not know what the original bind address was.
+ //
+ // For example, if connect(2) is called with address "foo" which corresponds
+ // a remote named socket in goferfs, we need to generate an endpoint wrapping
+ // that file. In this case, we can use Addr to set the endpoint address to
+ // "foo". Note that Addr is only a best-effort attempt--we still do not know
+ // the exact address that was used on the remote fs to bind the socket (it
+ // may have been "foo", "./foo", etc.).
+ Addr string
+}
+
+// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(),
+// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and
+// FileDescriptionImpl.Getxattr().
+type GetxattrOptions struct {
+ // Name is the name of the extended attribute to retrieve.
+ Name string
+
+ // Size is the maximum value size that the caller will tolerate. If the value
+ // is larger than size, getxattr methods may return ERANGE, but they are also
+ // free to ignore the hint entirely (i.e. the value returned may be larger
+ // than size). All size checking is done independently at the syscall layer.
+ Size uint64
+}
+
// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
// FileDescriptionImpl.Setxattr().
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
index b318c681a..cd78d66bc 100644
--- a/pkg/sentry/vfs/pathname.go
+++ b/pkg/sentry/vfs/pathname.go
@@ -58,7 +58,7 @@ loop:
switch err.(type) {
case nil:
if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
- // GenericPrependPath() will have returned
+ // genericfstree.PrependPath() will have returned
// PrependPathAtVFSRootError in this case since it checks
// against vfsroot before mnt.root, but other implementations
// of FilesystemImpl.PrependPath() may return nil instead.
@@ -84,16 +84,59 @@ loop:
}
}
b.PrependByte('/')
- if origD.IsDisowned() {
+ if origD.IsDead() {
b.AppendString(" (deleted)")
}
return b.String(), nil
}
+// PathnameReachable returns an absolute pathname to vd, consistent with
+// Linux's __d_path() (as used by seq_path_root()). If vfsroot.Ok() and vd is
+// not reachable from vfsroot, such that seq_path_root() would return SEQ_SKIP
+// (causing the entire containing entry to be skipped), PathnameReachable
+// returns ("", nil).
+func (vfs *VirtualFilesystem) PathnameReachable(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+ b := getFSPathBuilder()
+ defer putFSPathBuilder(b)
+ haveRef := false
+ defer func() {
+ if haveRef {
+ vd.DecRef()
+ }
+ }()
+loop:
+ for {
+ err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
+ switch err.(type) {
+ case nil:
+ if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+ break loop
+ }
+ nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+ if !nextVD.Ok() {
+ return "", nil
+ }
+ if haveRef {
+ vd.DecRef()
+ }
+ vd = nextVD
+ haveRef = true
+ case PrependPathAtVFSRootError:
+ break loop
+ case PrependPathAtNonMountRootError, PrependPathSyntheticError:
+ return "", nil
+ default:
+ return "", err
+ }
+ }
+ b.PrependByte('/')
+ return b.String(), nil
+}
+
// PathnameForGetcwd returns an absolute pathname to vd, consistent with
// Linux's sys_getcwd().
func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
- if vd.dentry.IsDisowned() {
+ if vd.dentry.IsDead() {
return "", syserror.ENOENT
}
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index 8e250998a..f9647f90e 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -15,8 +15,12 @@
package vfs
import (
+ "math"
+
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -25,9 +29,9 @@ type AccessTypes uint16
// Bits in AccessTypes.
const (
+ MayExec AccessTypes = 1
+ MayWrite AccessTypes = 2
MayRead AccessTypes = 4
- MayWrite = 2
- MayExec = 1
)
// OnlyRead returns true if access _only_ allows read.
@@ -52,16 +56,17 @@ func (a AccessTypes) MayExec() bool {
// GenericCheckPermissions checks that creds has the given access rights on a
// file with the given permissions, UID, and GID, subject to the rules of
-// fs/namei.c:generic_permission(). isDir is true if the file is a directory.
-func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir bool, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+// fs/namei.c:generic_permission().
+func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
// Check permission bits.
- perms := mode
+ perms := uint16(mode.Permissions())
if creds.EffectiveKUID == kuid {
perms >>= 6
} else if creds.InGroup(kgid) {
perms >>= 3
}
if uint16(ats)&perms == uint16(ats) {
+ // All permission bits match, access granted.
return nil
}
@@ -73,7 +78,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
}
// CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary
// directories, and read arbitrary non-directory files.
- if (isDir && !ats.MayWrite()) || ats.OnlyRead() {
+ if (mode.IsDir() && !ats.MayWrite()) || ats.OnlyRead() {
if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) {
return nil
}
@@ -81,7 +86,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
// CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write
// access to non-directory files, and execute access to non-directory files
// for which at least one execute bit is set.
- if isDir || !ats.MayExec() || (mode&0111 != 0) {
+ if mode.IsDir() || !ats.MayExec() || (mode.Permissions()&0111 != 0) {
if creds.HasCapability(linux.CAP_DAC_OVERRIDE) {
return nil
}
@@ -147,7 +152,16 @@ func MayWriteFileWithOpenFlags(flags uint32) bool {
// CheckSetStat checks that creds has permission to change the metadata of a
// file with the given permissions, UID, and GID as specified by stat, subject
// to the rules of Linux's fs/attr.c:setattr_prepare().
-func CheckSetStat(creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
+ if stat.Mask&linux.STATX_SIZE != 0 {
+ limit, err := CheckLimit(ctx, 0, int64(stat.Size))
+ if err != nil {
+ return err
+ }
+ if limit < int64(stat.Size) {
+ return syserror.ErrExceedsFileSizeLimit
+ }
+ }
if stat.Mask&linux.STATX_MODE != 0 {
if !CanActAsOwner(creds, kuid) {
return syserror.EPERM
@@ -177,11 +191,7 @@ func CheckSetStat(creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid
(stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) {
return syserror.EPERM
}
- // isDir is irrelevant in the following call to
- // GenericCheckPermissions since ats == MayWrite means that
- // CAP_DAC_READ_SEARCH does not apply, and CAP_DAC_OVERRIDE
- // applies, regardless of isDir.
- if err := GenericCheckPermissions(creds, MayWrite, false /* isDir */, mode, kuid, kgid); err != nil {
+ if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil {
return err
}
}
@@ -205,3 +215,21 @@ func CanActAsOwner(creds *auth.Credentials, kuid auth.KUID) bool {
func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth.KUID, kgid auth.KGID) bool {
return creds.HasCapability(cp) && creds.UserNamespace.MapFromKUID(kuid).Ok() && creds.UserNamespace.MapFromKGID(kgid).Ok()
}
+
+// CheckLimit enforces file size rlimits. It returns error if the write
+// operation must not proceed. Otherwise it returns the max length allowed to
+// without violating the limit.
+func CheckLimit(ctx context.Context, offset, size int64) (int64, error) {
+ fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur
+ if fileSizeLimit > math.MaxInt64 {
+ return size, nil
+ }
+ if offset >= int64(fileSizeLimit) {
+ return 0, syserror.ErrExceedsFileSizeLimit
+ }
+ remaining := int64(fileSizeLimit) - offset
+ if remaining < size {
+ return remaining, nil
+ }
+ return size, nil
+}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 8a0b382f6..9d047ff88 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -29,7 +29,9 @@ import (
//
// From the perspective of FilesystemImpl methods, a ResolvingPath represents a
// starting Dentry on the associated Filesystem (on which a reference is
-// already held) and a stream of path components relative to that Dentry.
+// already held), a stream of path components relative to that Dentry, and
+// elements of the invoking Context that are commonly required by
+// FilesystemImpl methods.
//
// ResolvingPath is loosely analogous to Linux's struct nameidata.
type ResolvingPath struct {
@@ -228,7 +230,7 @@ func (rp *ResolvingPath) Advance() {
rp.pit = next
} else { // at end of path segment, continue with next one
rp.curPart--
- rp.pit = rp.parts[rp.curPart-1]
+ rp.pit = rp.parts[rp.curPart]
}
}
@@ -251,18 +253,17 @@ func (rp *ResolvingPath) relpathCommit() {
rp.origParts[rp.curPart] = rp.pit
}
-// ResolveParent returns the VFS parent of d. It does not take a reference on
-// the returned Dentry.
-//
-// Preconditions: There are no concurrent mutators of d.
-//
-// Postconditions: If the returned error is nil, then the returned Dentry is
-// not nil.
-func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) {
- var parent *Dentry
+// CheckRoot is called before resolving the parent of the Dentry d. If the
+// Dentry is contextually a VFS root, such that path resolution should treat
+// d's parent as itself, CheckRoot returns (true, nil). If the Dentry is the
+// root of a non-root mount, such that path resolution should switch to another
+// Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path
+// resolution should resolve d's parent normally, and CheckRoot returns (false,
+// nil).
+func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) {
if d == rp.root.dentry && rp.mount == rp.root.mount {
- // At contextual VFS root.
- parent = d
+ // At contextual VFS root (due to e.g. chroot(2)).
+ return true, nil
} else if d == rp.mount.root {
// At mount root ...
vd := rp.vfs.getMountpointAt(rp.mount, rp.root)
@@ -270,69 +271,49 @@ func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) {
// ... of non-root mount.
rp.nextMount = vd.mount
rp.nextStart = vd.dentry
- return nil, resolveMountRootOrJumpError{}
+ return false, resolveMountRootOrJumpError{}
}
// ... of root mount.
- parent = d
- } else if d.parent == nil {
- // At filesystem root.
- parent = d
- } else {
- parent = d.parent
- }
- if parent.isMounted() {
- if mnt := rp.vfs.getMountAt(rp.mount, parent); mnt != nil {
- rp.nextMount = mnt
- return nil, resolveMountPointError{}
- }
+ return true, nil
}
- return parent, nil
+ return false, nil
}
-// ResolveChild returns the VFS child of d with the given name. It does not
-// take a reference on the returned Dentry. If no such child exists,
-// ResolveChild returns (nil, nil).
-//
-// Preconditions: There are no concurrent mutators of d.
-func (rp *ResolvingPath) ResolveChild(d *Dentry, name string) (*Dentry, error) {
- child := d.children[name]
- if child == nil {
- return nil, nil
+// CheckMount is called after resolving the parent or child of another Dentry
+// to d. If d is a mount point, such that path resolution should switch to
+// another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount
+// returns nil.
+func (rp *ResolvingPath) CheckMount(d *Dentry) error {
+ if !d.isMounted() {
+ return nil
}
- if child.isMounted() {
- if mnt := rp.vfs.getMountAt(rp.mount, child); mnt != nil {
- rp.nextMount = mnt
- return nil, resolveMountPointError{}
- }
- }
- return child, nil
-}
-
-// ResolveComponent returns the Dentry reached by starting at d and resolving
-// the current path component in the stream represented by rp. It does not
-// advance the stream. It does not take a reference on the returned Dentry. If
-// no such Dentry exists, ResolveComponent returns (nil, nil).
-//
-// Preconditions: !rp.Done(). There are no concurrent mutators of d.
-func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) {
- switch pc := rp.Component(); pc {
- case ".":
- return d, nil
- case "..":
- return rp.ResolveParent(d)
- default:
- return rp.ResolveChild(d, pc)
+ if mnt := rp.vfs.getMountAt(rp.mount, d); mnt != nil {
+ rp.nextMount = mnt
+ return resolveMountPointError{}
}
+ return nil
}
// ShouldFollowSymlink returns true if, supposing that the current path
// component in pcs represents a symbolic link, the symbolic link should be
// followed.
//
+// If path is terminated with '/', the '/' is considered the last element and
+// any symlink before that is followed:
+// - For most non-creating walks, the last path component is handled by
+// fs/namei.c:lookup_last(), which sets LOOKUP_FOLLOW if the first byte
+// after the path component is non-NULL (which is only possible if it's '/')
+// and the path component is of type LAST_NORM.
+//
+// - For open/openat/openat2 without O_CREAT, the last path component is
+// handled by fs/namei.c:do_last(), which does the same, though without the
+// LAST_NORM check.
+//
// Preconditions: !rp.Done().
func (rp *ResolvingPath) ShouldFollowSymlink() bool {
- // Non-final symlinks are always followed.
- return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final()
+ // Non-final symlinks are always followed. Paths terminated with '/' are also
+ // always followed.
+ return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() || rp.MustBeDir()
}
// HandleSymlink is called when the current path component is a symbolic link
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 9629afee9..9acca8bc7 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -38,6 +38,7 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -46,11 +47,13 @@ import (
//
// There is no analogue to the VirtualFilesystem type in Linux, as the
// equivalent state in Linux is global.
+//
+// +stateify savable
type VirtualFilesystem struct {
// mountMu serializes mount mutations.
//
// mountMu is analogous to Linux's namespace_sem.
- mountMu sync.Mutex
+ mountMu sync.Mutex `state:"nosave"`
// mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
// are uniquely namespaced, including mount parent in the key correctly
@@ -79,6 +82,10 @@ type VirtualFilesystem struct {
// mountpoints is analogous to Linux's mountpoint_hashtable.
mountpoints map[*Dentry]map[*Mount]struct{}
+ // lastMountID is the last allocated mount ID. lastMountID is accessed
+ // using atomic memory operations.
+ lastMountID uint64
+
// anonMount is a Mount, not included in mounts or mountpoints,
// representing an anonFilesystem. anonMount is used to back
// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
@@ -89,56 +96,59 @@ type VirtualFilesystem struct {
// devices contains all registered Devices. devices is protected by
// devicesMu.
- devicesMu sync.RWMutex
+ devicesMu sync.RWMutex `state:"nosave"`
devices map[devTuple]*registeredDevice
// anonBlockDevMinor contains all allocated anonymous block device minor
// numbers. anonBlockDevMinorNext is a lower bound for the smallest
// unallocated anonymous block device number. anonBlockDevMinorNext and
// anonBlockDevMinor are protected by anonBlockDevMinorMu.
- anonBlockDevMinorMu sync.Mutex
+ anonBlockDevMinorMu sync.Mutex `state:"nosave"`
anonBlockDevMinorNext uint32
anonBlockDevMinor map[uint32]struct{}
// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
// fsTypesMu.
- fsTypesMu sync.RWMutex
+ fsTypesMu sync.RWMutex `state:"nosave"`
fsTypes map[string]*registeredFilesystemType
// filesystems contains all Filesystems. filesystems is protected by
// filesystemsMu.
- filesystemsMu sync.Mutex
+ filesystemsMu sync.Mutex `state:"nosave"`
filesystems map[*Filesystem]struct{}
}
-// New returns a new VirtualFilesystem with no mounts or FilesystemTypes.
-func New() *VirtualFilesystem {
- vfs := &VirtualFilesystem{
- mountpoints: make(map[*Dentry]map[*Mount]struct{}),
- devices: make(map[devTuple]*registeredDevice),
- anonBlockDevMinorNext: 1,
- anonBlockDevMinor: make(map[uint32]struct{}),
- fsTypes: make(map[string]*registeredFilesystemType),
- filesystems: make(map[*Filesystem]struct{}),
- }
+// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
+func (vfs *VirtualFilesystem) Init() error {
+ vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
+ vfs.devices = make(map[devTuple]*registeredDevice)
+ vfs.anonBlockDevMinorNext = 1
+ vfs.anonBlockDevMinor = make(map[uint32]struct{})
+ vfs.fsTypes = make(map[string]*registeredFilesystemType)
+ vfs.filesystems = make(map[*Filesystem]struct{})
vfs.mounts.Init()
// Construct vfs.anonMount.
anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
if err != nil {
- panic(fmt.Sprintf("VirtualFilesystem.GetAnonBlockDevMinor() failed during VirtualFilesystem construction: %v", err))
+ // This shouldn't be possible since anonBlockDevMinorNext was
+ // initialized to 1 above (no device numbers have been allocated yet).
+ panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err))
}
anonfs := anonFilesystem{
devMinor: anonfsDevMinor,
}
- anonfs.vfsfs.Init(vfs, &anonfs)
- vfs.anonMount = &Mount{
- vfs: vfs,
- fs: &anonfs.vfsfs,
- refs: 1,
+ anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs)
+ defer anonfs.vfsfs.DecRef()
+ anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{})
+ if err != nil {
+ // We should not be passing any MountOptions that would cause
+ // construction of this mount to fail.
+ panic(fmt.Sprintf("VirtualFilesystem.Init: anonfs mount failed: %v", err))
}
+ vfs.anonMount = anonMount
- return vfs
+ return nil
}
// PathOperation specifies the path operated on by a VFS method.
@@ -169,6 +179,23 @@ type PathOperation struct {
FollowFinalSymlink bool
}
+// AccessAt checks whether a user with creds has access to the file at
+// the given path.
+func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
// GetDentryAt returns a VirtualDentry representing the given path, at which a
// file must exist. A reference is taken on the returned VirtualDentry.
func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
@@ -208,7 +235,7 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au
}
if checkInvariants {
if rp.canHandleError(err) && rp.Done() {
- panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
if !rp.handleError(err) {
@@ -249,7 +276,7 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
}
if checkInvariants {
if rp.canHandleError(err) && rp.Done() {
- panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
if !rp.handleError(err) {
@@ -285,7 +312,7 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
}
if checkInvariants {
if rp.canHandleError(err) && rp.Done() {
- panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
if !rp.handleError(err) {
@@ -312,13 +339,13 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
rp := vfs.getResolvingPath(creds, pop)
for {
err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
- if err != nil {
+ if err == nil {
vfs.putResolvingPath(rp)
return nil
}
if checkInvariants {
if rp.canHandleError(err) && rp.Done() {
- panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
if !rp.handleError(err) {
@@ -333,14 +360,11 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia
func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
// Remove:
//
- // - O_LARGEFILE, which we always report in FileDescription status flags
- // since only 64-bit architectures are supported at this time.
- //
// - O_CLOEXEC, which affects file descriptors and therefore must be
// handled outside of VFS.
//
// - Unknown flags.
- opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
+ opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_LARGEFILE | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
// Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
if opts.Flags&linux.O_SYNC != 0 {
opts.Flags |= linux.O_DSYNC
@@ -380,24 +404,25 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
if err == nil {
vfs.putResolvingPath(rp)
- // TODO(gvisor.dev/issue/1193): Move inside fsimpl to avoid another call
- // to FileDescription.Stat().
if opts.FileExec {
+ if fd.Mount().Flags.NoExec {
+ fd.DecRef()
+ return nil, syserror.EACCES
+ }
+
// Only a regular file can be executed.
stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
if err != nil {
+ fd.DecRef()
return nil, err
}
- if stat.Mask&linux.STATX_TYPE != 0 {
- // This shouldn't happen, but if type can't be retrieved, file can't
- // be executed.
- return nil, syserror.EACCES
- }
- if linux.FileMode(stat.Mode).FileType() != linux.ModeRegular {
+ if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG {
+ fd.DecRef()
return nil, syserror.EACCES
}
}
+ fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent)
return fd, nil
}
if !rp.handleError(err) {
@@ -472,7 +497,7 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
}
if checkInvariants {
if rp.canHandleError(err) && rp.Done() {
- panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
if !rp.handleError(err) {
@@ -505,7 +530,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia
}
if checkInvariants {
if rp.canHandleError(err) && rp.Done() {
- panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
if !rp.handleError(err) {
@@ -586,7 +611,7 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
}
if checkInvariants {
if rp.canHandleError(err) && rp.Done() {
- panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
if !rp.handleError(err) {
@@ -618,7 +643,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
}
if checkInvariants {
if rp.canHandleError(err) && rp.Done() {
- panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
}
}
if !rp.handleError(err) {
@@ -628,12 +653,39 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
}
}
+// BoundEndpointAt gets the bound endpoint at the given path, if one exists.
+func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) {
+ if !pop.Path.Begin.Ok() {
+ if pop.Path.Absolute {
+ return nil, syserror.ECONNREFUSED
+ }
+ return nil, syserror.ENOENT
+ }
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return bep, nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return nil, err
+ }
+ }
+}
+
// ListxattrAt returns all extended attribute names for the file at the given
// path.
-func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) {
+func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
rp := vfs.getResolvingPath(creds, pop)
for {
- names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp)
+ names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size)
if err == nil {
vfs.putResolvingPath(rp)
return names, nil
@@ -655,10 +707,10 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede
// GetxattrAt returns the value associated with the given extended attribute
// for the file at the given path.
-func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) {
+func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) {
rp := vfs.getResolvingPath(creds, pop)
for {
- val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name)
+ val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
if err == nil {
vfs.putResolvingPath(rp)
return val, nil
@@ -743,11 +795,21 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
// VirtualDentry methods require that a reference is held on the VirtualDentry.
//
// VirtualDentry is analogous to Linux's struct path.
+//
+// +stateify savable
type VirtualDentry struct {
mount *Mount
dentry *Dentry
}
+// MakeVirtualDentry creates a VirtualDentry.
+func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry {
+ return VirtualDentry{
+ mount: mount,
+ dentry: dentry,
+ }
+}
+
// Ok returns true if vd is not empty. It does not require that a reference is
// held.
func (vd VirtualDentry) Ok() bool {