diff options
Diffstat (limited to 'pkg/sentry/vfs')
-rw-r--r-- | pkg/sentry/vfs/BUILD | 38 | ||||
-rw-r--r-- | pkg/sentry/vfs/README.md | 9 | ||||
-rw-r--r-- | pkg/sentry/vfs/anonfs.go | 26 | ||||
-rw-r--r-- | pkg/sentry/vfs/context.go | 24 | ||||
-rw-r--r-- | pkg/sentry/vfs/dentry.go | 7 | ||||
-rw-r--r-- | pkg/sentry/vfs/device.go | 3 | ||||
-rw-r--r-- | pkg/sentry/vfs/epoll.go | 25 | ||||
-rw-r--r-- | pkg/sentry/vfs/file_description.go | 154 | ||||
-rw-r--r-- | pkg/sentry/vfs/file_description_impl_util.go | 52 | ||||
-rw-r--r-- | pkg/sentry/vfs/filesystem.go | 108 | ||||
-rw-r--r-- | pkg/sentry/vfs/filesystem_impl_util.go | 13 | ||||
-rw-r--r-- | pkg/sentry/vfs/filesystem_type.go | 12 | ||||
-rw-r--r-- | pkg/sentry/vfs/g3doc/inotify.md | 18 | ||||
-rw-r--r-- | pkg/sentry/vfs/genericfstree/genericfstree.go | 2 | ||||
-rw-r--r-- | pkg/sentry/vfs/inotify.go | 2 | ||||
-rw-r--r-- | pkg/sentry/vfs/lock.go | 2 | ||||
-rw-r--r-- | pkg/sentry/vfs/memxattr/xattr.go | 16 | ||||
-rw-r--r-- | pkg/sentry/vfs/mount.go | 134 | ||||
-rw-r--r-- | pkg/sentry/vfs/mount_test.go | 26 | ||||
-rw-r--r-- | pkg/sentry/vfs/mount_unsafe.go | 31 | ||||
-rw-r--r-- | pkg/sentry/vfs/options.go | 52 | ||||
-rw-r--r-- | pkg/sentry/vfs/permissions.go | 42 | ||||
-rw-r--r-- | pkg/sentry/vfs/resolving_path.go | 5 | ||||
-rw-r--r-- | pkg/sentry/vfs/vfs.go | 115 |
24 files changed, 621 insertions, 295 deletions
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 642769e7c..c855608db 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -27,6 +27,39 @@ go_template_instance( }, ) +go_template_instance( + name = "file_description_refs", + out = "file_description_refs.go", + package = "vfs", + prefix = "FileDescription", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "FileDescription", + }, +) + +go_template_instance( + name = "mount_namespace_refs", + out = "mount_namespace_refs.go", + package = "vfs", + prefix = "MountNamespace", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "MountNamespace", + }, +) + +go_template_instance( + name = "filesystem_refs", + out = "filesystem_refs.go", + package = "vfs", + prefix = "Filesystem", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "Filesystem", + }, +) + go_library( name = "vfs", srcs = [ @@ -40,12 +73,15 @@ go_library( "event_list.go", "file_description.go", "file_description_impl_util.go", + "file_description_refs.go", "filesystem.go", "filesystem_impl_util.go", + "filesystem_refs.go", "filesystem_type.go", "inotify.go", "lock.go", "mount.go", + "mount_namespace_refs.go", "mount_unsafe.go", "options.go", "pathname.go", @@ -56,13 +92,13 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", "//pkg/fspath", "//pkg/gohacks", "//pkg/log", + "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs", diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md index 4b9faf2ea..5aad31b78 100644 --- a/pkg/sentry/vfs/README.md +++ b/pkg/sentry/vfs/README.md @@ -184,12 +184,3 @@ This construction, which is essentially a type-safe analogue to Linux's - File locking - `O_ASYNC` - -- Reference counts in the `vfs` package do not use the `refs` package since - `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference - count, resulting in considerable cache bloat. 24 bytes of this overhead is - for weak reference support, which have poor performance and will not be used - by VFS2. The remaining 40 bytes is to store a descriptive string and stack - trace for reference leak checking; we can support reference leak checking - without incurring this space overhead by including the applicable - information directly in finalizers for applicable types. diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index 5a0e3e6b5..7ad0eaf86 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -52,6 +52,8 @@ const ( ) // anonFilesystemType implements FilesystemType. +// +// +stateify savable type anonFilesystemType struct{} // GetFilesystem implements FilesystemType.GetFilesystem. @@ -59,22 +61,28 @@ func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *au panic("cannot instaniate an anon filesystem") } -// Name implemenents FilesystemType.Name. +// Name implements FilesystemType.Name. func (anonFilesystemType) Name() string { return "none" } +// Release implemenents FilesystemType.Release. +func (anonFilesystemType) Release(ctx context.Context) {} + // anonFilesystem is the implementation of FilesystemImpl that backs // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry(). // // Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl // methods that would require an anonDentry to be a directory return ENOTDIR. +// +// +stateify savable type anonFilesystem struct { vfsfs Filesystem devMinor uint32 } +// +stateify savable type anonDentry struct { vfsd Dentry @@ -245,32 +253,32 @@ func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath return nil, syserror.ECONNREFUSED } -// ListxattrAt implements FilesystemImpl.ListxattrAt. -func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) { +// ListXattrAt implements FilesystemImpl.ListXattrAt. +func (fs *anonFilesystem) ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) { if !rp.Done() { return nil, syserror.ENOTDIR } return nil, nil } -// GetxattrAt implements FilesystemImpl.GetxattrAt. -func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) { +// GetXattrAt implements FilesystemImpl.GetXattrAt. +func (fs *anonFilesystem) GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) { if !rp.Done() { return "", syserror.ENOTDIR } return "", syserror.ENOTSUP } -// SetxattrAt implements FilesystemImpl.SetxattrAt. -func (fs *anonFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error { +// SetXattrAt implements FilesystemImpl.SetXattrAt. +func (fs *anonFilesystem) SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error { if !rp.Done() { return syserror.ENOTDIR } return syserror.EPERM } -// RemovexattrAt implements FilesystemImpl.RemovexattrAt. -func (fs *anonFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error { +// RemoveXattrAt implements FilesystemImpl.RemoveXattrAt. +func (fs *anonFilesystem) RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error { if !rp.Done() { return syserror.ENOTDIR } diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go index c9e724fef..97018651f 100644 --- a/pkg/sentry/vfs/context.go +++ b/pkg/sentry/vfs/context.go @@ -40,6 +40,30 @@ func MountNamespaceFromContext(ctx context.Context) *MountNamespace { return nil } +type mountNamespaceContext struct { + context.Context + mntns *MountNamespace +} + +// Value implements Context.Value. +func (mc mountNamespaceContext) Value(key interface{}) interface{} { + switch key { + case CtxMountNamespace: + mc.mntns.IncRef() + return mc.mntns + default: + return mc.Context.Value(key) + } +} + +// WithMountNamespace returns a copy of ctx with the given MountNamespace. +func WithMountNamespace(ctx context.Context, mntns *MountNamespace) context.Context { + return &mountNamespaceContext{ + Context: ctx, + mntns: mntns, + } +} + // RootFromContext returns the VFS root used by ctx. It takes a reference on // the returned VirtualDentry. If ctx does not have a specific VFS root, // RootFromContext returns a zero-value VirtualDentry. diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index bc7ea93ea..320ab7ce1 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -89,6 +89,8 @@ func (d *Dentry) Impl() DentryImpl { // DentryImpl contains implementation details for a Dentry. Implementations of // DentryImpl should contain their associated Dentry by value as their first // field. +// +// +stateify savable type DentryImpl interface { // IncRef increments the Dentry's reference count. A Dentry with a non-zero // reference count must remain coherent with the state of the filesystem. @@ -242,8 +244,9 @@ func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) { // caller must call AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry depending on the rename's outcome. // -// Preconditions: If to is not nil, it must be a child Dentry from the same -// Filesystem. from != to. +// Preconditions: +// * If to is not nil, it must be a child Dentry from the same Filesystem. +// * from != to. func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { vfs.mountMu.Lock() if mntns.mountpoints[from] != 0 { diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go index 1e9dffc8f..dde2ad79b 100644 --- a/pkg/sentry/vfs/device.go +++ b/pkg/sentry/vfs/device.go @@ -22,6 +22,8 @@ import ( ) // DeviceKind indicates whether a device is a block or character device. +// +// +stateify savable type DeviceKind uint32 const ( @@ -44,6 +46,7 @@ func (kind DeviceKind) String() string { } } +// +stateify savable type devTuple struct { kind DeviceKind major uint32 diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 1b5af9f73..8f36c3e3b 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -27,6 +27,8 @@ import ( var epollCycleMu sync.Mutex // EpollInstance represents an epoll instance, as described by epoll(7). +// +// +stateify savable type EpollInstance struct { vfsfd FileDescription FileDescriptionDefaultImpl @@ -38,11 +40,11 @@ type EpollInstance struct { // interest is the set of file descriptors that are registered with the // EpollInstance for monitoring. interest is protected by interestMu. - interestMu sync.Mutex + interestMu sync.Mutex `state:"nosave"` interest map[epollInterestKey]*epollInterest // mu protects fields in registered epollInterests. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // ready is the set of file descriptors that may be "ready" for I/O. Note // that this must be an ordered list, not a map: "If more than maxevents @@ -55,6 +57,7 @@ type EpollInstance struct { ready epollInterestList } +// +stateify savable type epollInterestKey struct { // file is the registered FileDescription. No reference is held on file; // instead, when the last reference is dropped, FileDescription.DecRef() @@ -67,6 +70,8 @@ type epollInterestKey struct { } // epollInterest represents an EpollInstance's interest in a file descriptor. +// +// +stateify savable type epollInterest struct { // epoll is the owning EpollInstance. epoll is immutable. epoll *EpollInstance @@ -331,11 +336,9 @@ func (ep *EpollInstance) removeLocked(epi *epollInterest) { ep.mu.Unlock() } -// ReadEvents reads up to len(events) ready events into events and returns the -// number of events read. -// -// Preconditions: len(events) != 0. -func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int { +// ReadEvents appends up to maxReady events to events and returns the updated +// slice of events. +func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent, maxEvents int) []linux.EpollEvent { i := 0 // Hot path: avoid defer. ep.mu.Lock() @@ -368,16 +371,16 @@ func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int { requeue.PushBack(epi) } // Report ievents. - events[i] = linux.EpollEvent{ + events = append(events, linux.EpollEvent{ Events: ievents.ToLinux(), Data: epi.userData, - } + }) i++ - if i == len(events) { + if i == maxEvents { break } } ep.ready.PushBackList(&requeue) ep.mu.Unlock() - return i + return events } diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index dcafffe57..183957ad8 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -37,13 +37,13 @@ import ( // FileDescription methods require that a reference is held. // // FileDescription is analogous to Linux's struct file. +// +// +stateify savable type FileDescription struct { - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 + FileDescriptionRefs // flagsMu protects statusFlags and asyncHandler below. - flagsMu sync.Mutex + flagsMu sync.Mutex `state:"nosave"` // statusFlags contains status flags, "initialized by open(2) and possibly // modified by fcntl()" - fcntl(2). statusFlags can be read using atomic @@ -58,7 +58,7 @@ type FileDescription struct { // epolls is the set of epollInterests registered for this FileDescription. // epolls is protected by epollMu. - epollMu sync.Mutex + epollMu sync.Mutex `state:"nosave"` epolls map[*epollInterest]struct{} // vd is the filesystem location at which this FileDescription was opened. @@ -90,6 +90,8 @@ type FileDescription struct { } // FileDescriptionOptions contains options to FileDescription.Init(). +// +// +stateify savable type FileDescriptionOptions struct { // If AllowDirectIO is true, allow O_DIRECT to be set on the file. AllowDirectIO bool @@ -103,7 +105,7 @@ type FileDescriptionOptions struct { // If UseDentryMetadata is true, calls to FileDescription methods that // interact with file and filesystem metadata (Stat, SetStat, StatFS, - // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling + // ListXattr, GetXattr, SetXattr, RemoveXattr) are implemented by calling // the corresponding FilesystemImpl methods instead of the corresponding // FileDescriptionImpl methods. // @@ -131,7 +133,7 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou } } - fd.refs = 1 + fd.EnableLeakCheck() // Remove "file creation flags" to mirror the behavior from file.f_flags in // fs/open.c:do_dentry_open. @@ -149,30 +151,9 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou return nil } -// IncRef increments fd's reference count. -func (fd *FileDescription) IncRef() { - atomic.AddInt64(&fd.refs, 1) -} - -// TryIncRef increments fd's reference count and returns true. If fd's -// reference count is already zero, TryIncRef does nothing and returns false. -// -// TryIncRef does not require that a reference is held on fd. -func (fd *FileDescription) TryIncRef() bool { - for { - refs := atomic.LoadInt64(&fd.refs) - if refs <= 0 { - return false - } - if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) { - return true - } - } -} - // DecRef decrements fd's reference count. func (fd *FileDescription) DecRef(ctx context.Context) { - if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { + fd.FileDescriptionRefs.DecRef(func() { // Unregister fd from all epoll instances. fd.epollMu.Lock() epolls := fd.epolls @@ -208,15 +189,7 @@ func (fd *FileDescription) DecRef(ctx context.Context) { } fd.asyncHandler = nil fd.flagsMu.Unlock() - } else if refs < 0 { - panic("FileDescription.DecRef() called without holding a reference") - } -} - -// Refs returns the current number of references. The returned count -// is inherently racy and is unsafe to use without external synchronization. -func (fd *FileDescription) Refs() int64 { - return atomic.LoadInt64(&fd.refs) + }) } // Mount returns the mount on which fd was opened. It does not take a reference @@ -357,6 +330,9 @@ type FileDescriptionImpl interface { // Allocate grows the file to offset + length bytes. // Only mode == 0 is supported currently. // + // Allocate should return EISDIR on directories, ESPIPE on pipes, and ENODEV on + // other files where it is not supported. + // // Preconditions: The FileDescription was opened for writing. Allocate(ctx context.Context, mode, offset, length uint64) error @@ -371,8 +347,9 @@ type FileDescriptionImpl interface { // // - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP. // - // Preconditions: The FileDescription was opened for reading. - // FileDescriptionOptions.DenyPRead == false. + // Preconditions: + // * The FileDescription was opened for reading. + // * FileDescriptionOptions.DenyPRead == false. PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) // Read is similar to PRead, but does not specify an offset. @@ -403,8 +380,9 @@ type FileDescriptionImpl interface { // - If opts.Flags specifies unsupported options, PWrite returns // EOPNOTSUPP. // - // Preconditions: The FileDescription was opened for writing. - // FileDescriptionOptions.DenyPWrite == false. + // Preconditions: + // * The FileDescription was opened for writing. + // * FileDescriptionOptions.DenyPWrite == false. PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) // Write is similar to PWrite, but does not specify an offset, which is @@ -449,19 +427,19 @@ type FileDescriptionImpl interface { // Ioctl implements the ioctl(2) syscall. Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) - // Listxattr returns all extended attribute names for the file. - Listxattr(ctx context.Context, size uint64) ([]string, error) + // ListXattr returns all extended attribute names for the file. + ListXattr(ctx context.Context, size uint64) ([]string, error) - // Getxattr returns the value associated with the given extended attribute + // GetXattr returns the value associated with the given extended attribute // for the file. - Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) + GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) - // Setxattr changes the value associated with the given extended attribute + // SetXattr changes the value associated with the given extended attribute // for the file. - Setxattr(ctx context.Context, opts SetxattrOptions) error + SetXattr(ctx context.Context, opts SetXattrOptions) error - // Removexattr removes the given extended attribute from the file. - Removexattr(ctx context.Context, name string) error + // RemoveXattr removes the given extended attribute from the file. + RemoveXattr(ctx context.Context, name string) error // LockBSD tries to acquire a BSD-style advisory file lock. LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error @@ -477,6 +455,8 @@ type FileDescriptionImpl interface { } // Dirent holds the information contained in struct linux_dirent64. +// +// +stateify savable type Dirent struct { // Name is the filename. Name string @@ -664,25 +644,25 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch. return fd.impl.Ioctl(ctx, uio, args) } -// Listxattr returns all extended attribute names for the file represented by +// ListXattr returns all extended attribute names for the file represented by // fd. // // If the size of the list (including a NUL terminating byte after every entry) // would exceed size, ERANGE may be returned. Note that implementations // are free to ignore size entirely and return without error). In all cases, // if size is 0, the list should be returned without error, regardless of size. -func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { +func (fd *FileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size) + names, err := fd.vd.mount.fs.impl.ListXattrAt(ctx, rp, size) vfsObj.putResolvingPath(ctx, rp) return names, err } - names, err := fd.impl.Listxattr(ctx, size) + names, err := fd.impl.ListXattr(ctx, size) if err == syserror.ENOTSUP { // Linux doesn't actually return ENOTSUP in this case; instead, // fs/xattr.c:vfs_listxattr() falls back to allowing the security @@ -693,57 +673,57 @@ func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string return names, err } -// Getxattr returns the value associated with the given extended attribute for +// GetXattr returns the value associated with the given extended attribute for // the file represented by fd. // // If the size of the return value exceeds opts.Size, ERANGE may be returned // (note that implementations are free to ignore opts.Size entirely and return // without error). In all cases, if opts.Size is 0, the value should be // returned without error, regardless of size. -func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) { +func (fd *FileDescription) GetXattr(ctx context.Context, opts *GetXattrOptions) (string, error) { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts) + val, err := fd.vd.mount.fs.impl.GetXattrAt(ctx, rp, *opts) vfsObj.putResolvingPath(ctx, rp) return val, err } - return fd.impl.Getxattr(ctx, *opts) + return fd.impl.GetXattr(ctx, *opts) } -// Setxattr changes the value associated with the given extended attribute for +// SetXattr changes the value associated with the given extended attribute for // the file represented by fd. -func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error { +func (fd *FileDescription) SetXattr(ctx context.Context, opts *SetXattrOptions) error { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts) + err := fd.vd.mount.fs.impl.SetXattrAt(ctx, rp, *opts) vfsObj.putResolvingPath(ctx, rp) return err } - return fd.impl.Setxattr(ctx, *opts) + return fd.impl.SetXattr(ctx, *opts) } -// Removexattr removes the given extended attribute from the file represented +// RemoveXattr removes the given extended attribute from the file represented // by fd. -func (fd *FileDescription) Removexattr(ctx context.Context, name string) error { +func (fd *FileDescription) RemoveXattr(ctx context.Context, name string) error { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name) + err := fd.vd.mount.fs.impl.RemoveXattrAt(ctx, rp, name) vfsObj.putResolvingPath(ctx, rp) return err } - return fd.impl.Removexattr(ctx, name) + return fd.impl.RemoveXattr(ctx, name) } // SyncFS instructs the filesystem containing fd to execute the semantics of @@ -845,3 +825,45 @@ func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsyn } return fd.asyncHandler } + +// FileReadWriteSeeker is a helper struct to pass a FileDescription as +// io.Reader/io.Writer/io.ReadSeeker/io.ReaderAt/io.WriterAt/etc. +type FileReadWriteSeeker struct { + FD *FileDescription + Ctx context.Context + ROpts ReadOptions + WOpts WriteOptions +} + +// ReadAt implements io.ReaderAt.ReadAt. +func (f *FileReadWriteSeeker) ReadAt(p []byte, off int64) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.PRead(f.Ctx, dst, off, f.ROpts) + return int(n), err +} + +// Read implements io.ReadWriteSeeker.Read. +func (f *FileReadWriteSeeker) Read(p []byte) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.Read(f.Ctx, dst, f.ROpts) + return int(n), err +} + +// Seek implements io.ReadWriteSeeker.Seek. +func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) { + return f.FD.Seek(f.Ctx, offset, int32(whence)) +} + +// WriteAt implements io.WriterAt.WriteAt. +func (f *FileReadWriteSeeker) WriteAt(p []byte, off int64) (int, error) { + dst := usermem.BytesIOSequence(p) + n, err := f.FD.PWrite(f.Ctx, dst, off, f.WOpts) + return int(n), err +} + +// Write implements io.ReadWriteSeeker.Write. +func (f *FileReadWriteSeeker) Write(p []byte) (int, error) { + buf := usermem.BytesIOSequence(p) + n, err := f.FD.Write(f.Ctx, buf, f.WOpts) + return int(n), err +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 6b8b4ad49..48ca9de44 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -42,6 +42,8 @@ import ( // FileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl to obtain implementations of many FileDescriptionImpl // methods with default behavior analogous to Linux's. +// +// +stateify savable type FileDescriptionDefaultImpl struct{} // OnClose implements FileDescriptionImpl.OnClose analogously to @@ -57,7 +59,11 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err } // Allocate implements FileDescriptionImpl.Allocate analogously to -// fallocate called on regular file, directory or FIFO in Linux. +// fallocate called on an invalid type of file in Linux. +// +// Note that directories can rely on this implementation even though they +// should technically return EISDIR. Allocate should never be called for a +// directory, because it requires a writable fd. func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { return syserror.ENODEV } @@ -134,34 +140,36 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg return 0, syserror.ENOTTY } -// Listxattr implements FileDescriptionImpl.Listxattr analogously to +// ListXattr implements FileDescriptionImpl.ListXattr analogously to // inode_operations::listxattr == NULL in Linux. -func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) { - // This isn't exactly accurate; see FileDescription.Listxattr. +func (FileDescriptionDefaultImpl) ListXattr(ctx context.Context, size uint64) ([]string, error) { + // This isn't exactly accurate; see FileDescription.ListXattr. return nil, syserror.ENOTSUP } -// Getxattr implements FileDescriptionImpl.Getxattr analogously to +// GetXattr implements FileDescriptionImpl.GetXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. -func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) { +func (FileDescriptionDefaultImpl) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) { return "", syserror.ENOTSUP } -// Setxattr implements FileDescriptionImpl.Setxattr analogously to +// SetXattr implements FileDescriptionImpl.SetXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. -func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error { +func (FileDescriptionDefaultImpl) SetXattr(ctx context.Context, opts SetXattrOptions) error { return syserror.ENOTSUP } -// Removexattr implements FileDescriptionImpl.Removexattr analogously to +// RemoveXattr implements FileDescriptionImpl.RemoveXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. -func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error { +func (FileDescriptionDefaultImpl) RemoveXattr(ctx context.Context, name string) error { return syserror.ENOTSUP } // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl that always represent directories to obtain // implementations of non-directory I/O methods that return EISDIR. +// +// +stateify savable type DirectoryFileDescriptionDefaultImpl struct{} // Allocate implements DirectoryFileDescriptionDefaultImpl.Allocate. @@ -192,6 +200,8 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme // DentryMetadataFileDescriptionImpl may be embedded by implementations of // FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is // true to obtain implementations of Stat and SetStat that panic. +// +// +stateify savable type DentryMetadataFileDescriptionImpl struct{} // Stat implements FileDescriptionImpl.Stat. @@ -206,12 +216,16 @@ func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetSt // DynamicBytesSource represents a data source for a // DynamicBytesFileDescriptionImpl. +// +// +stateify savable type DynamicBytesSource interface { // Generate writes the file's contents to buf. Generate(ctx context.Context, buf *bytes.Buffer) error } // StaticData implements DynamicBytesSource over a static string. +// +// +stateify savable type StaticData struct { Data string } @@ -238,14 +252,24 @@ type WritableDynamicBytesSource interface { // // DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first // use. +// +// +stateify savable type DynamicBytesFileDescriptionImpl struct { data DynamicBytesSource // immutable - mu sync.Mutex // protects the following fields - buf bytes.Buffer + mu sync.Mutex `state:"nosave"` // protects the following fields + buf bytes.Buffer `state:".([]byte)"` off int64 lastRead int64 // offset at which the last Read, PRead, or Seek ended } +func (fd *DynamicBytesFileDescriptionImpl) saveBuf() []byte { + return fd.buf.Bytes() +} + +func (fd *DynamicBytesFileDescriptionImpl) loadBuf(p []byte) { + fd.buf.Write(p) +} + // SetDataSource must be called exactly once on fd before first use. func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) { fd.data = data @@ -378,6 +402,8 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M // LockFD may be used by most implementations of FileDescriptionImpl.Lock* // functions. Caller must call Init(). +// +// +stateify savable type LockFD struct { locks *FileLocks } @@ -405,6 +431,8 @@ func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { // NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface // returning ENOLCK. +// +// +stateify savable type NoLockFD struct{} // LockBSD implements vfs.FileDescriptionImpl.LockBSD. diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index df3758fd1..c93d94634 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -15,8 +15,6 @@ package vfs import ( - "sync/atomic" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" @@ -34,9 +32,7 @@ import ( // // +stateify savable type Filesystem struct { - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 + FilesystemRefs // vfs is the VirtualFilesystem that uses this Filesystem. vfs is // immutable. @@ -52,7 +48,7 @@ type Filesystem struct { // Init must be called before first use of fs. func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) { - fs.refs = 1 + fs.EnableLeakCheck() fs.vfs = vfsObj fs.fsType = fsType fs.impl = impl @@ -76,39 +72,14 @@ func (fs *Filesystem) Impl() FilesystemImpl { return fs.impl } -// IncRef increments fs' reference count. -func (fs *Filesystem) IncRef() { - if atomic.AddInt64(&fs.refs, 1) <= 1 { - panic("Filesystem.IncRef() called without holding a reference") - } -} - -// TryIncRef increments fs' reference count and returns true. If fs' reference -// count is zero, TryIncRef does nothing and returns false. -// -// TryIncRef does not require that a reference is held on fs. -func (fs *Filesystem) TryIncRef() bool { - for { - refs := atomic.LoadInt64(&fs.refs) - if refs <= 0 { - return false - } - if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) { - return true - } - } -} - // DecRef decrements fs' reference count. func (fs *Filesystem) DecRef(ctx context.Context) { - if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { + fs.FilesystemRefs.DecRef(func() { fs.vfs.filesystemsMu.Lock() delete(fs.vfs.filesystems, fs) fs.vfs.filesystemsMu.Unlock() fs.impl.Release(ctx) - } else if refs < 0 { - panic("Filesystem.decRef() called without holding a reference") - } + }) } // FilesystemImpl contains implementation details for a Filesystem. @@ -212,8 +183,9 @@ type FilesystemImpl interface { // ENOENT. Equivalently, if vd represents a file with a link count of 0 not // created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If LinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -231,8 +203,9 @@ type FilesystemImpl interface { // - If the directory in which the new directory would be created has been // removed by RmdirAt or RenameAt, MkdirAt returns ENOENT. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If MkdirAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -253,8 +226,9 @@ type FilesystemImpl interface { // - If the directory in which the file would be created has been removed // by RmdirAt or RenameAt, MknodAt returns ENOENT. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If MknodAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -345,11 +319,12 @@ type FilesystemImpl interface { // - If renaming would replace a non-empty directory, RenameAt returns // ENOTEMPTY. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). oldParentVD.Dentry() was obtained from a - // previous call to - // oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). oldName is - // not "." or "..". + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). + // * oldParentVD.Dentry() was obtained from a previous call to + // oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). + // * oldName is not "." or "..". // // Postconditions: If RenameAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -372,8 +347,9 @@ type FilesystemImpl interface { // - If the file at rp exists but is not a directory, RmdirAt returns // ENOTDIR. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If RmdirAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -410,8 +386,9 @@ type FilesystemImpl interface { // - If the directory in which the symbolic link would be created has been // removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If SymlinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). @@ -431,33 +408,34 @@ type FilesystemImpl interface { // // - If the file at rp exists but is a directory, UnlinkAt returns EISDIR. // - // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). + // Preconditions: + // * !rp.Done(). + // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If UnlinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). UnlinkAt(ctx context.Context, rp *ResolvingPath) error - // ListxattrAt returns all extended attribute names for the file at rp. + // ListXattrAt returns all extended attribute names for the file at rp. // // Errors: // // - If extended attributes are not supported by the filesystem, - // ListxattrAt returns ENOTSUP. + // ListXattrAt returns ENOTSUP. // // - If the size of the list (including a NUL terminating byte after every // entry) would exceed size, ERANGE may be returned. Note that // implementations are free to ignore size entirely and return without // error). In all cases, if size is 0, the list should be returned without // error, regardless of size. - ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) + ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) - // GetxattrAt returns the value associated with the given extended + // GetXattrAt returns the value associated with the given extended // attribute for the file at rp. // // Errors: // - // - If extended attributes are not supported by the filesystem, GetxattrAt + // - If extended attributes are not supported by the filesystem, GetXattrAt // returns ENOTSUP. // // - If an extended attribute named opts.Name does not exist, ENODATA is @@ -467,30 +445,30 @@ type FilesystemImpl interface { // returned (note that implementations are free to ignore opts.Size entirely // and return without error). In all cases, if opts.Size is 0, the value // should be returned without error, regardless of size. - GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) + GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) - // SetxattrAt changes the value associated with the given extended + // SetXattrAt changes the value associated with the given extended // attribute for the file at rp. // // Errors: // - // - If extended attributes are not supported by the filesystem, SetxattrAt + // - If extended attributes are not supported by the filesystem, SetXattrAt // returns ENOTSUP. // // - If XATTR_CREATE is set in opts.Flag and opts.Name already exists, // EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist, // ENODATA is returned. - SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error + SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error - // RemovexattrAt removes the given extended attribute from the file at rp. + // RemoveXattrAt removes the given extended attribute from the file at rp. // // Errors: // // - If extended attributes are not supported by the filesystem, - // RemovexattrAt returns ENOTSUP. + // RemoveXattrAt returns ENOTSUP. // // - If name does not exist, ENODATA is returned. - RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error + RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error // BoundEndpointAt returns the Unix socket endpoint bound at the path rp. // @@ -528,6 +506,8 @@ type FilesystemImpl interface { // PrependPathAtVFSRootError is returned by implementations of // FilesystemImpl.PrependPath() when they encounter the contextual VFS root. +// +// +stateify savable type PrependPathAtVFSRootError struct{} // Error implements error.Error. @@ -538,6 +518,8 @@ func (PrependPathAtVFSRootError) Error() string { // PrependPathAtNonMountRootError is returned by implementations of // FilesystemImpl.PrependPath() when they encounter an independent ancestor // Dentry that is not the Mount root. +// +// +stateify savable type PrependPathAtNonMountRootError struct{} // Error implements error.Error. @@ -548,6 +530,8 @@ func (PrependPathAtNonMountRootError) Error() string { // PrependPathSyntheticError is returned by implementations of // FilesystemImpl.PrependPath() for which prepended names do not represent real // paths. +// +// +stateify savable type PrependPathSyntheticError struct{} // Error implements error.Error. diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go index 465e610e0..2620cf975 100644 --- a/pkg/sentry/vfs/filesystem_impl_util.go +++ b/pkg/sentry/vfs/filesystem_impl_util.go @@ -16,6 +16,9 @@ package vfs import ( "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/usermem" ) // GenericParseMountOptions parses a comma-separated list of options of the @@ -41,3 +44,13 @@ func GenericParseMountOptions(str string) map[string]string { } return m } + +// GenericStatFS returns a statfs struct filled with the common fields for a +// general filesystem. This is analogous to Linux's fs/libfs.cs:simple_statfs(). +func GenericStatFS(fsMagic uint64) linux.Statfs { + return linux.Statfs{ + Type: fsMagic, + BlockSize: usermem.PageSize, + NameLength: linux.NAME_MAX, + } +} diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go index f2298f7f6..9d54cc4ed 100644 --- a/pkg/sentry/vfs/filesystem_type.go +++ b/pkg/sentry/vfs/filesystem_type.go @@ -33,6 +33,9 @@ type FilesystemType interface { // Name returns the name of this FilesystemType. Name() string + + // Release releases all resources held by this FilesystemType. + Release(ctx context.Context) } // GetFilesystemOptions contains options to FilesystemType.GetFilesystem. @@ -55,10 +58,13 @@ type registeredFilesystemType struct { // RegisterFilesystemTypeOptions contains options to // VirtualFilesystem.RegisterFilesystem(). +// +// +stateify savable type RegisterFilesystemTypeOptions struct { - // If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt() - // for which MountOptions.InternalMount == false to use this filesystem - // type. + // AllowUserMount determines whether users are allowed to mount a file system + // of this type, i.e. through mount(2). If AllowUserMount is true, allow calls + // to VirtualFilesystem.MountAt() for which MountOptions.InternalMount == false + // to use this filesystem type. AllowUserMount bool // If AllowUserList is true, make this filesystem type visible in diff --git a/pkg/sentry/vfs/g3doc/inotify.md b/pkg/sentry/vfs/g3doc/inotify.md index e7da49faa..833db213f 100644 --- a/pkg/sentry/vfs/g3doc/inotify.md +++ b/pkg/sentry/vfs/g3doc/inotify.md @@ -28,9 +28,9 @@ The set of all watches held on a single file (i.e., the watch target) is stored in vfs.Watches. Each watch will belong to a different inotify instance (an instance can only have one watch on any watch target). The watches are stored in a map indexed by their vfs.Inotify owner’s id. Hard links and file descriptions -to a single file will all share the same vfs.Watches. Activity on the target -causes its vfs.Watches to generate notifications on its watches’ inotify -instances. +to a single file will all share the same vfs.Watches (with the exception of the +gofer filesystem, described in a later section). Activity on the target causes +its vfs.Watches to generate notifications on its watches’ inotify instances. ### vfs.Watch @@ -103,12 +103,12 @@ inotify: unopened p9 file (and possibly an open FID), through which the Sentry interacts with the gofer. * *Solution:* Because there is no inode structure stored in the sandbox, - inotify watches must be held on the dentry. This would be an issue in - the presence of hard links, where multiple dentries would need to share - the same set of watches, but in VFS2, we do not support the internal - creation of hard links on gofer fs. As a result, we make the assumption - that every dentry corresponds to a unique inode. However, the next point - raises an issue with this assumption: + inotify watches must be held on the dentry. For the purposes of inotify, + we assume that every dentry corresponds to a unique inode, which may + cause unexpected behavior in the presence of hard links, where multiple + dentries should share the same set of watches. Indeed, it is impossible + for us to be absolutely sure whether dentries correspond to the same + file or not, due to the following point: * **The Sentry cannot always be aware of hard links on the remote filesystem.** There is no way for us to confirm whether two files on the remote filesystem are actually links to the same inode. QIDs and inodes are diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go index 8882fa84a..2d27d9d35 100644 --- a/pkg/sentry/vfs/genericfstree/genericfstree.go +++ b/pkg/sentry/vfs/genericfstree/genericfstree.go @@ -27,6 +27,8 @@ import ( ) // Dentry is a required type parameter that is a struct with the given fields. +// +// +stateify savable type Dentry struct { // vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl. vfsd vfs.Dentry diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index aff220a61..3f0b8f45b 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -37,6 +37,8 @@ const inotifyEventBaseSize = 16 // // The way events are labelled appears somewhat arbitrary, but they must match // Linux so that IN_EXCL_UNLINK behaves as it does in Linux. +// +// +stateify savable type EventType uint8 // PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and diff --git a/pkg/sentry/vfs/lock.go b/pkg/sentry/vfs/lock.go index 42666eebf..55783d4eb 100644 --- a/pkg/sentry/vfs/lock.go +++ b/pkg/sentry/vfs/lock.go @@ -33,6 +33,8 @@ import ( // Note that in Linux these two types of locks are _not_ cooperative, because // race and deadlock conditions make merging them prohibitive. We do the same // and keep them oblivious to each other. +// +// +stateify savable type FileLocks struct { // bsd is a set of BSD-style advisory file wide locks, see flock(2). bsd fslock.Locks diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go index cc1e7d764..638b5d830 100644 --- a/pkg/sentry/vfs/memxattr/xattr.go +++ b/pkg/sentry/vfs/memxattr/xattr.go @@ -33,8 +33,8 @@ type SimpleExtendedAttributes struct { xattrs map[string]string } -// Getxattr returns the value at 'name'. -func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) { +// GetXattr returns the value at 'name'. +func (x *SimpleExtendedAttributes) GetXattr(opts *vfs.GetXattrOptions) (string, error) { x.mu.RLock() value, ok := x.xattrs[opts.Name] x.mu.RUnlock() @@ -49,8 +49,8 @@ func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, return value, nil } -// Setxattr sets 'value' at 'name'. -func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error { +// SetXattr sets 'value' at 'name'. +func (x *SimpleExtendedAttributes) SetXattr(opts *vfs.SetXattrOptions) error { x.mu.Lock() defer x.mu.Unlock() if x.xattrs == nil { @@ -72,8 +72,8 @@ func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error { return nil } -// Listxattr returns all names in xattrs. -func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) { +// ListXattr returns all names in xattrs. +func (x *SimpleExtendedAttributes) ListXattr(size uint64) ([]string, error) { // Keep track of the size of the buffer needed in listxattr(2) for the list. listSize := 0 x.mu.RLock() @@ -90,8 +90,8 @@ func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) { return names, nil } -// Removexattr removes the xattr at 'name'. -func (x *SimpleExtendedAttributes) Removexattr(name string) error { +// RemoveXattr removes the xattr at 'name'. +func (x *SimpleExtendedAttributes) RemoveXattr(name string) error { x.mu.Lock() defer x.mu.Unlock() if _, ok := x.xattrs[name]; !ok { diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 67dfba986..78f115bfa 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -46,8 +46,9 @@ import ( // +stateify savable type Mount struct { // vfs, fs, root are immutable. References are held on fs and root. + // Note that for a disconnected mount, root may be nil. // - // Invariant: root belongs to fs. + // Invariant: if not nil, root belongs to fs. vfs *VirtualFilesystem fs *Filesystem root *Dentry @@ -65,7 +66,7 @@ type Mount struct { // // Invariant: key.parent != nil iff key.point != nil. key.point belongs to // key.parent.fs. - key mountKey + key mountKey `state:".(VirtualDentry)"` // ns is the namespace in which this Mount was mounted. ns is protected by // VirtualFilesystem.mountMu. @@ -126,16 +127,14 @@ func (mnt *Mount) Options() MountOptions { // // +stateify savable type MountNamespace struct { + MountNamespaceRefs + // Owner is the usernamespace that owns this mount namespace. Owner *auth.UserNamespace // root is the MountNamespace's root mount. root is immutable. root *Mount - // refs is the reference count. refs is accessed using atomic memory - // operations. - refs int64 - // mountpoints maps all Dentries which are mount points in this namespace // to the number of Mounts for which they are mount points. mountpoints is // protected by VirtualFilesystem.mountMu. @@ -154,22 +153,22 @@ type MountNamespace struct { // NewMountNamespace returns a new mount namespace with a root filesystem // configured by the given arguments. A reference is taken on the returned // MountNamespace. -func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) { +func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*MountNamespace, error) { rft := vfs.getFilesystemType(fsTypeName) if rft == nil { ctx.Warningf("Unknown filesystem type: %s", fsTypeName) return nil, syserror.ENODEV } - fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts) + fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { return nil, err } mntns := &MountNamespace{ Owner: creds.UserNamespace, - refs: 1, mountpoints: make(map[*Dentry]uint32), } - mntns.root = newMount(vfs, fs, root, mntns, &MountOptions{}) + mntns.EnableLeakCheck() + mntns.root = newMount(vfs, fs, root, mntns, opts) return mntns, nil } @@ -263,16 +262,20 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr } // MountAt creates and mounts a Filesystem configured by the given arguments. -func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error { +// The VirtualFilesystem will hold a reference to the Mount until it is unmounted. +// +// This method returns the mounted Mount without a reference, for convenience +// during VFS setup when there is no chance of racing with unmount. +func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) { mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts) if err != nil { - return err + return nil, err } defer mnt.DecRef(ctx) if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil { - return err + return nil, err } - return nil + return mnt, nil } // UmountAt removes the Mount at the given path. @@ -343,6 +346,7 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti return nil } +// +stateify savable type umountRecursiveOptions struct { // If eager is true, ensure that future calls to Mount.tryIncMountedRef() // on umounted mounts fail. @@ -369,8 +373,9 @@ type umountRecursiveOptions struct { // // umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree(). // -// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a -// writer critical section. +// Preconditions: +// * vfs.mountMu must be locked. +// * vfs.mounts.seq must be in a writer critical section. func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) { if !mnt.umounted { mnt.umounted = true @@ -399,9 +404,11 @@ func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecu // connectLocked makes vd the mount parent/point for mnt. It consumes // references held by vd. // -// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a -// writer critical section. d.mu must be locked. mnt.parent() == nil, i.e. mnt -// must not already be connected. +// Preconditions: +// * vfs.mountMu must be locked. +// * vfs.mounts.seq must be in a writer critical section. +// * d.mu must be locked. +// * mnt.parent() == nil, i.e. mnt must not already be connected. func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) { if checkInvariants { if mnt.parent() != nil { @@ -409,7 +416,7 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns } } mnt.IncRef() // dropped by callers of umountRecursiveLocked - mnt.storeKey(vd) + mnt.setKey(vd) if vd.mount.children == nil { vd.mount.children = make(map[*Mount]struct{}) } @@ -429,16 +436,18 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns // disconnectLocked makes vd have no mount parent/point and returns its old // mount parent/point with a reference held. // -// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a -// writer critical section. mnt.parent() != nil. +// Preconditions: +// * vfs.mountMu must be locked. +// * vfs.mounts.seq must be in a writer critical section. +// * mnt.parent() != nil. func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry { - vd := mnt.loadKey() + vd := mnt.getKey() if checkInvariants { if vd.mount != nil { panic("VFS.disconnectLocked called on disconnected mount") } } - mnt.storeKey(VirtualDentry{}) + mnt.loadKey(VirtualDentry{}) delete(vd.mount.children, mnt) atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1 mnt.ns.mountpoints[vd.dentry]-- @@ -490,7 +499,9 @@ func (mnt *Mount) DecRef(ctx context.Context) { mnt.vfs.mounts.seq.EndWrite() mnt.vfs.mountMu.Unlock() } - mnt.root.DecRef(ctx) + if mnt.root != nil { + mnt.root.DecRef(ctx) + } mnt.fs.DecRef(ctx) if vd.Ok() { vd.DecRef(ctx) @@ -498,17 +509,10 @@ func (mnt *Mount) DecRef(ctx context.Context) { } } -// IncRef increments mntns' reference count. -func (mntns *MountNamespace) IncRef() { - if atomic.AddInt64(&mntns.refs, 1) <= 1 { - panic("MountNamespace.IncRef() called without holding a reference") - } -} - // DecRef decrements mntns' reference count. func (mntns *MountNamespace) DecRef(ctx context.Context) { vfs := mntns.root.fs.VirtualFilesystem() - if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { + mntns.MountNamespaceRefs.DecRef(func() { vfs.mountMu.Lock() vfs.mounts.seq.BeginWrite() vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{ @@ -522,9 +526,7 @@ func (mntns *MountNamespace) DecRef(ctx context.Context) { for _, mnt := range mountsToDecRef { mnt.DecRef(ctx) } - } else if refs < 0 { - panic("MountNamespace.DecRef() called without holding a reference") - } + }) } // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes @@ -576,8 +578,9 @@ retryFirst: // mnt. It takes a reference on the returned VirtualDentry. If no such mount // point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil). // -// Preconditions: References are held on mnt and root. vfsroot is not (mnt, -// mnt.root). +// Preconditions: +// * References are held on mnt and root. +// * vfsroot is not (mnt, mnt.root). func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry { // The first mount is special-cased: // @@ -651,6 +654,13 @@ retryFirst: return VirtualDentry{mnt, d} } +// SetMountReadOnly sets the mount as ReadOnly. +func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error { + vfs.mountMu.Lock() + defer vfs.mountMu.Unlock() + return mnt.setReadOnlyLocked(ro) +} + // CheckBeginWrite increments the counter of in-progress write operations on // mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns // EROFS. @@ -717,14 +727,12 @@ func (mnt *Mount) Root() *Dentry { return mnt.root } -// Root returns mntns' root. A reference is taken on the returned -// VirtualDentry. +// Root returns mntns' root. It does not take a reference on the returned Dentry. func (mntns *MountNamespace) Root() VirtualDentry { vd := VirtualDentry{ mount: mntns.root, dentry: mntns.root.root, } - vd.IncRef() return vd } @@ -732,11 +740,23 @@ func (mntns *MountNamespace) Root() VirtualDentry { // // Preconditions: taskRootDir.Ok(). func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { - vfs.mountMu.Lock() - defer vfs.mountMu.Unlock() rootMnt := taskRootDir.mount + + vfs.mountMu.Lock() mounts := rootMnt.submountsLocked() + // Take a reference on mounts since we need to drop vfs.mountMu before + // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()). + for _, mnt := range mounts { + mnt.IncRef() + } + vfs.mountMu.Unlock() + defer func() { + for _, mnt := range mounts { + mnt.DecRef(ctx) + } + }() sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) + for _, mnt := range mounts { // Get the path to this mount relative to task root. mntRootVD := VirtualDentry{ @@ -747,7 +767,7 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi if err != nil { // For some reason we didn't get a path. Log a warning // and run with empty path. - ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) + ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err) path = "" } if path == "" { @@ -781,11 +801,25 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi // // Preconditions: taskRootDir.Ok(). func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { - vfs.mountMu.Lock() - defer vfs.mountMu.Unlock() rootMnt := taskRootDir.mount + + vfs.mountMu.Lock() mounts := rootMnt.submountsLocked() + // Take a reference on mounts since we need to drop vfs.mountMu before + // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or + // vfs.StatAt() (=> FilesystemImpl.StatAt()). + for _, mnt := range mounts { + mnt.IncRef() + } + vfs.mountMu.Unlock() + defer func() { + for _, mnt := range mounts { + mnt.DecRef(ctx) + } + }() sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) + + creds := auth.CredentialsFromContext(ctx) for _, mnt := range mounts { // Get the path to this mount relative to task root. mntRootVD := VirtualDentry{ @@ -796,7 +830,7 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo if err != nil { // For some reason we didn't get a path. Log a warning // and run with empty path. - ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) + ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err) path = "" } if path == "" { @@ -809,9 +843,10 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo Root: mntRootVD, Start: mntRootVD, } - statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{}) + statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{}) if err != nil { // Well that's not good. Ignore this mount. + ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err) break } @@ -823,6 +858,9 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo fmt.Fprintf(buf, "%d ", mnt.ID) // (2) Parent ID (or this ID if there is no parent). + // Note that even if the call to mnt.parent() races with Mount + // destruction (which is possible since we're not holding vfs.mountMu), + // its Mount.ID will still be valid. pID := mnt.ID if p := mnt.parent(); p != nil { pID = p.ID diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go index 3335e4057..cb8c56bd3 100644 --- a/pkg/sentry/vfs/mount_test.go +++ b/pkg/sentry/vfs/mount_test.go @@ -38,7 +38,7 @@ func TestMountTableInsertLookup(t *testing.T) { mt.Init() mount := &Mount{} - mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}}) + mount.setKey(VirtualDentry{&Mount{}, &Dentry{}}) mt.Insert(mount) if m := mt.Lookup(mount.parent(), mount.point()); m != mount { @@ -79,7 +79,7 @@ const enableComparativeBenchmarks = false func newBenchMount() *Mount { mount := &Mount{} - mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}}) + mount.loadKey(VirtualDentry{&Mount{}, &Dentry{}}) return mount } @@ -94,7 +94,7 @@ func BenchmarkMountTableParallelLookup(b *testing.B) { for i := 0; i < numMounts; i++ { mount := newBenchMount() mt.Insert(mount) - keys = append(keys, mount.loadKey()) + keys = append(keys, mount.saveKey()) } var ready sync.WaitGroup @@ -146,7 +146,7 @@ func BenchmarkMountMapParallelLookup(b *testing.B) { keys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { mount := newBenchMount() - key := mount.loadKey() + key := mount.saveKey() ms[key] = mount keys = append(keys, key) } @@ -201,7 +201,7 @@ func BenchmarkMountSyncMapParallelLookup(b *testing.B) { keys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { mount := newBenchMount() - key := mount.loadKey() + key := mount.getKey() ms.Store(key, mount) keys = append(keys, key) } @@ -283,7 +283,7 @@ func BenchmarkMountMapNegativeLookup(b *testing.B) { ms := make(map[VirtualDentry]*Mount) for i := 0; i < numMounts; i++ { mount := newBenchMount() - ms[mount.loadKey()] = mount + ms[mount.getKey()] = mount } negkeys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { @@ -318,7 +318,7 @@ func BenchmarkMountSyncMapNegativeLookup(b *testing.B) { var ms sync.Map for i := 0; i < numMounts; i++ { mount := newBenchMount() - ms.Store(mount.loadKey(), mount) + ms.Store(mount.saveKey(), mount) } negkeys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { @@ -372,7 +372,7 @@ func BenchmarkMountMapInsert(b *testing.B) { b.ResetTimer() for i := range mounts { mount := mounts[i] - ms[mount.loadKey()] = mount + ms[mount.saveKey()] = mount } } @@ -392,7 +392,7 @@ func BenchmarkMountSyncMapInsert(b *testing.B) { b.ResetTimer() for i := range mounts { mount := mounts[i] - ms.Store(mount.loadKey(), mount) + ms.Store(mount.saveKey(), mount) } } @@ -425,13 +425,13 @@ func BenchmarkMountMapRemove(b *testing.B) { ms := make(map[VirtualDentry]*Mount) for i := range mounts { mount := mounts[i] - ms[mount.loadKey()] = mount + ms[mount.saveKey()] = mount } b.ResetTimer() for i := range mounts { mount := mounts[i] - delete(ms, mount.loadKey()) + delete(ms, mount.saveKey()) } } @@ -447,12 +447,12 @@ func BenchmarkMountSyncMapRemove(b *testing.B) { var ms sync.Map for i := range mounts { mount := mounts[i] - ms.Store(mount.loadKey(), mount) + ms.Store(mount.saveKey(), mount) } b.ResetTimer() for i := range mounts { mount := mounts[i] - ms.Delete(mount.loadKey()) + ms.Delete(mount.saveKey()) } } diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index 70f850ca4..b7d122d22 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.16 +// +build !go1.17 // Check go:linkname function signatures when updating Go version. @@ -34,6 +34,8 @@ import ( // structurally identical to VirtualDentry, but stores its fields as // unsafe.Pointer since mutators synchronize with VFS path traversal using // seqcounts. +// +// This is explicitly not savable. type mountKey struct { parent unsafe.Pointer // *Mount point unsafe.Pointer // *Dentry @@ -47,19 +49,23 @@ func (mnt *Mount) point() *Dentry { return (*Dentry)(atomic.LoadPointer(&mnt.key.point)) } -func (mnt *Mount) loadKey() VirtualDentry { +func (mnt *Mount) getKey() VirtualDentry { return VirtualDentry{ mount: mnt.parent(), dentry: mnt.point(), } } +func (mnt *Mount) saveKey() VirtualDentry { return mnt.getKey() } + // Invariant: mnt.key.parent == nil. vd.Ok(). -func (mnt *Mount) storeKey(vd VirtualDentry) { +func (mnt *Mount) setKey(vd VirtualDentry) { atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount)) atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry)) } +func (mnt *Mount) loadKey(vd VirtualDentry) { mnt.setKey(vd) } + // mountTable maps (mount parent, mount point) pairs to mounts. It supports // efficient concurrent lookup, even in the presence of concurrent mutators // (provided mutation is sufficiently uncommon). @@ -92,6 +98,7 @@ type mountTable struct { // length and cap in separate uint32s) for ~free. size uint64 + // FIXME(gvisor.dev/issue/1663): Slots need to be saved. slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init } @@ -217,8 +224,9 @@ func (mt *mountTable) Insert(mount *Mount) { // insertSeqed inserts the given mount into mt. // -// Preconditions: mt.seq must be in a writer critical section. mt must not -// already contain a Mount with the same mount point and parent. +// Preconditions: +// * mt.seq must be in a writer critical section. +// * mt must not already contain a Mount with the same mount point and parent. func (mt *mountTable) insertSeqed(mount *Mount) { hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) @@ -269,9 +277,11 @@ func (mt *mountTable) insertSeqed(mount *Mount) { atomic.StorePointer(&mt.slots, newSlots) } -// Preconditions: There are no concurrent mutators of the table (slots, cap). -// If the table is visible to readers, then mt.seq must be in a writer critical -// section. cap must be a power of 2. +// Preconditions: +// * There are no concurrent mutators of the table (slots, cap). +// * If the table is visible to readers, then mt.seq must be in a writer +// critical section. +// * cap must be a power of 2. func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) { mask := cap - 1 off := (hash & mask) * mountSlotBytes @@ -313,8 +323,9 @@ func (mt *mountTable) Remove(mount *Mount) { // removeSeqed removes the given mount from mt. // -// Preconditions: mt.seq must be in a writer critical section. mt must contain -// mount. +// Preconditions: +// * mt.seq must be in a writer critical section. +// * mt must contain mount. func (mt *mountTable) removeSeqed(mount *Mount) { hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) tcap := uintptr(1) << (mt.size & mtSizeOrderMask) diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index dfc8573fd..bc79e5ecc 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -21,6 +21,8 @@ import ( // GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and // FilesystemImpl.GetDentryAt(). +// +// +stateify savable type GetDentryOptions struct { // If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that // the returned Dentry is a directory for which creds has search @@ -30,6 +32,8 @@ type GetDentryOptions struct { // MkdirOptions contains options to VirtualFilesystem.MkdirAt() and // FilesystemImpl.MkdirAt(). +// +// +stateify savable type MkdirOptions struct { // Mode is the file mode bits for the created directory. Mode linux.FileMode @@ -56,6 +60,8 @@ type MkdirOptions struct { // MknodOptions contains options to VirtualFilesystem.MknodAt() and // FilesystemImpl.MknodAt(). +// +// +stateify savable type MknodOptions struct { // Mode is the file type and mode bits for the created file. Mode linux.FileMode @@ -72,6 +78,8 @@ type MknodOptions struct { // MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC. // MS_RDONLY is not part of MountFlags because it's tracked in Mount.writers. +// +// +stateify savable type MountFlags struct { // NoExec is equivalent to MS_NOEXEC. NoExec bool @@ -93,6 +101,8 @@ type MountFlags struct { } // MountOptions contains options to VirtualFilesystem.MountAt(). +// +// +stateify savable type MountOptions struct { // Flags contains flags as specified for mount(2), e.g. MS_NOEXEC. Flags MountFlags @@ -103,13 +113,17 @@ type MountOptions struct { // GetFilesystemOptions contains options to FilesystemType.GetFilesystem(). GetFilesystemOptions GetFilesystemOptions - // If InternalMount is true, allow the use of filesystem types for which - // RegisterFilesystemTypeOptions.AllowUserMount == false. + // InternalMount indicates whether the mount operation is coming from the + // application, i.e. through mount(2). If InternalMount is true, allow the use + // of filesystem types for which RegisterFilesystemTypeOptions.AllowUserMount + // == false. InternalMount bool } // OpenOptions contains options to VirtualFilesystem.OpenAt() and // FilesystemImpl.OpenAt(). +// +// +stateify savable type OpenOptions struct { // Flags contains access mode and flags as specified for open(2). // @@ -135,6 +149,8 @@ type OpenOptions struct { // ReadOptions contains options to FileDescription.PRead(), // FileDescriptionImpl.PRead(), FileDescription.Read(), and // FileDescriptionImpl.Read(). +// +// +stateify savable type ReadOptions struct { // Flags contains flags as specified for preadv2(2). Flags uint32 @@ -142,6 +158,8 @@ type ReadOptions struct { // RenameOptions contains options to VirtualFilesystem.RenameAt() and // FilesystemImpl.RenameAt(). +// +// +stateify savable type RenameOptions struct { // Flags contains flags as specified for renameat2(2). Flags uint32 @@ -153,6 +171,8 @@ type RenameOptions struct { // SetStatOptions contains options to VirtualFilesystem.SetStatAt(), // FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and // FileDescriptionImpl.SetStat(). +// +// +stateify savable type SetStatOptions struct { // Stat is the metadata that should be set. Only fields indicated by // Stat.Mask should be set. @@ -174,6 +194,8 @@ type SetStatOptions struct { // BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt() // and FilesystemImpl.BoundEndpointAt(). +// +// +stateify savable type BoundEndpointOptions struct { // Addr is the path of the file whose socket endpoint is being retrieved. // It is generally irrelevant: most endpoints are stored at a dentry that @@ -190,10 +212,12 @@ type BoundEndpointOptions struct { Addr string } -// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(), -// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and -// FileDescriptionImpl.Getxattr(). -type GetxattrOptions struct { +// GetXattrOptions contains options to VirtualFilesystem.GetXattrAt(), +// FilesystemImpl.GetXattrAt(), FileDescription.GetXattr(), and +// FileDescriptionImpl.GetXattr(). +// +// +stateify savable +type GetXattrOptions struct { // Name is the name of the extended attribute to retrieve. Name string @@ -204,10 +228,12 @@ type GetxattrOptions struct { Size uint64 } -// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(), -// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and -// FileDescriptionImpl.Setxattr(). -type SetxattrOptions struct { +// SetXattrOptions contains options to VirtualFilesystem.SetXattrAt(), +// FilesystemImpl.SetXattrAt(), FileDescription.SetXattr(), and +// FileDescriptionImpl.SetXattr(). +// +// +stateify savable +type SetXattrOptions struct { // Name is the name of the extended attribute being mutated. Name string @@ -221,6 +247,8 @@ type SetxattrOptions struct { // StatOptions contains options to VirtualFilesystem.StatAt(), // FilesystemImpl.StatAt(), FileDescription.Stat(), and // FileDescriptionImpl.Stat(). +// +// +stateify savable type StatOptions struct { // Mask is the set of fields in the returned Statx that the FilesystemImpl // or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask. @@ -238,6 +266,8 @@ type StatOptions struct { } // UmountOptions contains options to VirtualFilesystem.UmountAt(). +// +// +stateify savable type UmountOptions struct { // Flags contains flags as specified for umount2(2). Flags uint32 @@ -246,6 +276,8 @@ type UmountOptions struct { // WriteOptions contains options to FileDescription.PWrite(), // FileDescriptionImpl.PWrite(), FileDescription.Write(), and // FileDescriptionImpl.Write(). +// +// +stateify savable type WriteOptions struct { // Flags contains flags as specified for pwritev2(2). Flags uint32 diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index 33389c1df..d48520d58 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -16,6 +16,7 @@ package vfs import ( "math" + "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -25,6 +26,8 @@ import ( ) // AccessTypes is a bitmask of Unix file permissions. +// +// +stateify savable type AccessTypes uint16 // Bits in AccessTypes. @@ -271,7 +274,7 @@ func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth // operation must not proceed. Otherwise it returns the max length allowed to // without violating the limit. func CheckLimit(ctx context.Context, offset, size int64) (int64, error) { - fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur + fileSizeLimit := limits.FromContextOrDie(ctx).Get(limits.FileSize).Cur if fileSizeLimit > math.MaxInt64 { return size, nil } @@ -284,3 +287,40 @@ func CheckLimit(ctx context.Context, offset, size int64) (int64, error) { } return size, nil } + +// CheckXattrPermissions checks permissions for extended attribute access. +// This is analogous to fs/xattr.c:xattr_permission(). Some key differences: +// * Does not check for read-only filesystem property. +// * Does not check inode immutability or append only mode. In both cases EPERM +// must be returned by filesystem implementations. +// * Does not do inode permission checks. Filesystem implementations should +// handle inode permission checks as they may differ across implementations. +func CheckXattrPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, name string) error { + switch { + case strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX): + // The trusted.* namespace can only be accessed by privileged + // users. + if creds.HasCapability(linux.CAP_SYS_ADMIN) { + return nil + } + if ats.MayWrite() { + return syserror.EPERM + } + return syserror.ENODATA + case strings.HasPrefix(name, linux.XATTR_USER_PREFIX): + // In the user.* namespace, only regular files and directories can have + // extended attributes. For sticky directories, only the owner and + // privileged users can write attributes. + filetype := mode.FileType() + if filetype != linux.ModeRegular && filetype != linux.ModeDirectory { + if ats.MayWrite() { + return syserror.EPERM + } + return syserror.ENODATA + } + if filetype == linux.ModeDirectory && mode&linux.ModeSticky != 0 && ats.MayWrite() && !CanActAsOwner(creds, kuid) { + return syserror.EPERM + } + } + return nil +} diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 3304372d9..e4fd55012 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -35,6 +35,8 @@ import ( // FilesystemImpl methods. // // ResolvingPath is loosely analogous to Linux's struct nameidata. +// +// +stateify savable type ResolvingPath struct { vfs *VirtualFilesystem root VirtualDentry // refs borrowed from PathOperation @@ -88,6 +90,7 @@ func init() { // so error "constants" are really mutable vars, necessitating somewhat // expensive interface object comparisons. +// +stateify savable type resolveMountRootOrJumpError struct{} // Error implements error.Error. @@ -95,6 +98,7 @@ func (resolveMountRootOrJumpError) Error() string { return "resolving mount root or jump" } +// +stateify savable type resolveMountPointError struct{} // Error implements error.Error. @@ -102,6 +106,7 @@ func (resolveMountPointError) Error() string { return "resolving mount point" } +// +stateify savable type resolveAbsSymlinkError struct{} // Error implements error.Error. diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 9c2420683..38d2701d2 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -24,9 +24,9 @@ // Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry // VirtualFilesystem.filesystemsMu // EpollInstance.mu -// Inotify.mu -// Watches.mu -// Inotify.evMu +// Inotify.mu +// Watches.mu +// Inotify.evMu // VirtualFilesystem.fsTypesMu // // Locking Dentry.mu in multiple Dentries requires holding @@ -36,6 +36,7 @@ package vfs import ( "fmt" + "path" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -157,11 +158,23 @@ func (vfs *VirtualFilesystem) Init(ctx context.Context) error { return nil } +// Release drops references on filesystem objects held by vfs. +// +// Precondition: This must be called after VFS.Init() has succeeded. +func (vfs *VirtualFilesystem) Release(ctx context.Context) { + vfs.anonMount.DecRef(ctx) + for _, fst := range vfs.fsTypes { + fst.fsType.Release(ctx) + } +} + // PathOperation specifies the path operated on by a VFS method. // // PathOperation is passed to VFS methods by pointer to reduce memory copying: // it's somewhat large and should never escape. (Options structs are passed by // pointer to VFS and FileDescription methods for the same reason.) +// +// +stateify savable type PathOperation struct { // Root is the VFS root. References on Root are borrowed from the provider // of the PathOperation. @@ -296,6 +309,8 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential // MkdirAt creates a directory at the given path. func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with mkdirat(dirfd, "", mode). if pop.Path.Absolute { return syserror.EEXIST } @@ -332,6 +347,8 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia // error from the syserror package. func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with mknodat(dirfd, "", mode, dev). if pop.Path.Absolute { return syserror.EEXIST } @@ -517,6 +534,8 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti // RmdirAt removes the directory at the given path. func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR). if pop.Path.Absolute { return syserror.EBUSY } @@ -598,6 +617,8 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti // SymlinkAt creates a symbolic link at the given path with the given target. func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with symlinkat(oldpath, newdirfd, ""). if pop.Path.Absolute { return syserror.EEXIST } @@ -630,6 +651,8 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent // UnlinkAt deletes the non-directory file at the given path. func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { if !pop.Path.Begin.Ok() { + // pop.Path should not be empty in operations that create/delete files. + // This is consistent with unlinkat(dirfd, "", 0). if pop.Path.Absolute { return syserror.EBUSY } @@ -661,12 +684,6 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti // BoundEndpointAt gets the bound endpoint at the given path, if one exists. func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) { - if !pop.Path.Begin.Ok() { - if pop.Path.Absolute { - return nil, syserror.ECONNREFUSED - } - return nil, syserror.ENOENT - } rp := vfs.getResolvingPath(creds, pop) for { bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) @@ -686,12 +703,12 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C } } -// ListxattrAt returns all extended attribute names for the file at the given +// ListXattrAt returns all extended attribute names for the file at the given // path. -func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { +func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { rp := vfs.getResolvingPath(creds, pop) for { - names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size) + names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size) if err == nil { vfs.putResolvingPath(ctx, rp) return names, nil @@ -711,12 +728,12 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede } } -// GetxattrAt returns the value associated with the given extended attribute +// GetXattrAt returns the value associated with the given extended attribute // for the file at the given path. -func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) { +func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) { rp := vfs.getResolvingPath(creds, pop) for { - val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts) + val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts) if err == nil { vfs.putResolvingPath(ctx, rp) return val, nil @@ -728,12 +745,12 @@ func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Creden } } -// SetxattrAt changes the value associated with the given extended attribute +// SetXattrAt changes the value associated with the given extended attribute // for the file at the given path. -func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error { +func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error { rp := vfs.getResolvingPath(creds, pop) for { - err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts) + err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts) if err == nil { vfs.putResolvingPath(ctx, rp) return nil @@ -745,11 +762,11 @@ func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Creden } } -// RemovexattrAt removes the given extended attribute from the file at rp. -func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error { +// RemoveXattrAt removes the given extended attribute from the file at rp. +func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error { rp := vfs.getResolvingPath(creds, pop) for { - err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name) + err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name) if err == nil { vfs.putResolvingPath(ctx, rp) return nil @@ -782,6 +799,62 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { return retErr } +// MkdirAllAt recursively creates non-existent directories on the given path +// (including the last component). +func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string, root VirtualDentry, creds *auth.Credentials, mkdirOpts *MkdirOptions) error { + pop := &PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(currentPath), + } + stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE}) + switch err { + case nil: + if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory { + return syserror.ENOTDIR + } + // Directory already exists. + return nil + case syserror.ENOENT: + // Expected, we will create the dir. + default: + return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err) + } + + // Recurse to ensure parent is created and then create the final directory. + if err := vfs.MkdirAllAt(ctx, path.Dir(currentPath), root, creds, mkdirOpts); err != nil { + return err + } + if err := vfs.MkdirAt(ctx, creds, pop, mkdirOpts); err != nil { + return fmt.Errorf("failed to create directory %q: %w", currentPath, err) + } + return nil +} + +// MakeSyntheticMountpoint creates parent directories of target if they do not +// exist and attempts to create a directory for the mountpoint. If a +// non-directory file already exists there then we allow it. +func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error { + mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} + + // Make sure the parent directory of target exists. + if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts); err != nil { + return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err) + } + + // Attempt to mkdir the final component. If a file (of any type) exists + // then we let allow mounting on top of that because we do not require the + // target to be an existing directory, unlike Linux mount(2). + if err := vfs.MkdirAt(ctx, creds, &PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(target), + }, mkdirOpts); err != nil && err != syserror.EEXIST { + return fmt.Errorf("failed to create mountpoint %q: %w", target, err) + } + return nil +} + // A VirtualDentry represents a node in a VFS tree, by combining a Dentry // (which represents a node in a Filesystem's tree) and a Mount (which // represents the Filesystem's position in a VFS mount tree). |