diff options
Diffstat (limited to 'pkg/sentry/vfs')
-rw-r--r-- | pkg/sentry/vfs/BUILD | 16 | ||||
-rw-r--r-- | pkg/sentry/vfs/README.md | 6 | ||||
-rw-r--r-- | pkg/sentry/vfs/anonfs.go | 15 | ||||
-rw-r--r-- | pkg/sentry/vfs/dentry.go | 53 | ||||
-rw-r--r-- | pkg/sentry/vfs/epoll.go | 5 | ||||
-rw-r--r-- | pkg/sentry/vfs/file_description.go | 151 | ||||
-rw-r--r-- | pkg/sentry/vfs/file_description_impl_util.go | 86 | ||||
-rw-r--r-- | pkg/sentry/vfs/file_description_impl_util_test.go | 9 | ||||
-rw-r--r-- | pkg/sentry/vfs/filesystem.go | 2 | ||||
-rw-r--r-- | pkg/sentry/vfs/g3doc/inotify.md | 210 | ||||
-rw-r--r-- | pkg/sentry/vfs/genericfstree/genericfstree.go | 3 | ||||
-rw-r--r-- | pkg/sentry/vfs/inotify.go | 774 | ||||
-rw-r--r-- | pkg/sentry/vfs/lock.go (renamed from pkg/sentry/vfs/lock/lock.go) | 43 | ||||
-rw-r--r-- | pkg/sentry/vfs/lock/BUILD | 13 | ||||
-rw-r--r-- | pkg/sentry/vfs/mount.go | 102 | ||||
-rw-r--r-- | pkg/sentry/vfs/mount_unsafe.go | 2 | ||||
-rw-r--r-- | pkg/sentry/vfs/options.go | 21 | ||||
-rw-r--r-- | pkg/sentry/vfs/permissions.go | 53 | ||||
-rw-r--r-- | pkg/sentry/vfs/vfs.go | 13 |
19 files changed, 1475 insertions, 102 deletions
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 94d69c1cc..642769e7c 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -15,6 +15,18 @@ go_template_instance( }, ) +go_template_instance( + name = "event_list", + out = "event_list.go", + package = "vfs", + prefix = "event", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Event", + "Linker": "*Event", + }, +) + go_library( name = "vfs", srcs = [ @@ -25,11 +37,14 @@ go_library( "device.go", "epoll.go", "epoll_interest_list.go", + "event_list.go", "file_description.go", "file_description_impl_util.go", "filesystem.go", "filesystem_impl_util.go", "filesystem_type.go", + "inotify.go", + "lock.go", "mount.go", "mount_unsafe.go", "options.go", @@ -57,6 +72,7 @@ go_library( "//pkg/sentry/limits", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/uniqueid", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md index 9aa133bcb..4b9faf2ea 100644 --- a/pkg/sentry/vfs/README.md +++ b/pkg/sentry/vfs/README.md @@ -39,8 +39,8 @@ Mount references are held by: - Mount: Each referenced Mount holds a reference on its parent, which is the mount containing its mount point. -- VirtualFilesystem: A reference is held on each Mount that has not been - umounted. +- VirtualFilesystem: A reference is held on each Mount that has been connected + to a mount point, but not yet umounted. MountNamespace and FileDescription references are held by users of VFS. The expectation is that each `kernel.Task` holds a reference on its corresponding @@ -169,8 +169,6 @@ This construction, which is essentially a type-safe analogue to Linux's - binder, which is similarly far too incomplete to use. - - whitelistfs, which we are already actively attempting to remove. - - Save/restore. For instance, it is unclear if the current implementation of the `state` package supports the inheritance pattern described above. diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index caf770fd5..641e3e502 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -297,3 +297,18 @@ func (d *anonDentry) TryIncRef() bool { func (d *anonDentry) DecRef() { // no-op } + +// InotifyWithParent implements DentryImpl.InotifyWithParent. +// +// Although Linux technically supports inotify on pseudo filesystems (inotify +// is implemented at the vfs layer), it is not particularly useful. It is left +// unimplemented until someone actually needs it. +func (d *anonDentry) InotifyWithParent(events, cookie uint32, et EventType) {} + +// Watches implements DentryImpl.Watches. +func (d *anonDentry) Watches() *Watches { + return nil +} + +// OnZeroWatches implements Dentry.OnZeroWatches. +func (d *anonDentry) OnZeroWatches() {} diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 8624dbd5d..cea3e6955 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -103,6 +103,39 @@ type DentryImpl interface { // DecRef decrements the Dentry's reference count. DecRef() + + // InotifyWithParent notifies all watches on the targets represented by this + // dentry and its parent. The parent's watches are notified first, followed + // by this dentry's. + // + // InotifyWithParent automatically adds the IN_ISDIR flag for dentries + // representing directories. + // + // Note that the events may not actually propagate up to the user, depending + // on the event masks. + InotifyWithParent(events, cookie uint32, et EventType) + + // Watches returns the set of inotify watches for the file corresponding to + // the Dentry. Dentries that are hard links to the same underlying file + // share the same watches. + // + // Watches may return nil if the dentry belongs to a FilesystemImpl that + // does not support inotify. If an implementation returns a non-nil watch + // set, it must always return a non-nil watch set. Likewise, if an + // implementation returns a nil watch set, it must always return a nil watch + // set. + // + // The caller does not need to hold a reference on the dentry. + Watches() *Watches + + // OnZeroWatches is called whenever the number of watches on a dentry drops + // to zero. This is needed by some FilesystemImpls (e.g. gofer) to manage + // dentry lifetime. + // + // The caller does not need to hold a reference on the dentry. OnZeroWatches + // may acquire inotify locks, so to prevent deadlock, no inotify locks should + // be held by the caller. + OnZeroWatches() } // IncRef increments d's reference count. @@ -133,6 +166,26 @@ func (d *Dentry) isMounted() bool { return atomic.LoadUint32(&d.mounts) != 0 } +// InotifyWithParent notifies all watches on the targets represented by d and +// its parent of events. +func (d *Dentry) InotifyWithParent(events, cookie uint32, et EventType) { + d.impl.InotifyWithParent(events, cookie, et) +} + +// Watches returns the set of inotify watches associated with d. +// +// Watches will return nil if d belongs to a FilesystemImpl that does not +// support inotify. +func (d *Dentry) Watches() *Watches { + return d.impl.Watches() +} + +// OnZeroWatches performs cleanup tasks whenever the number of watches on a +// dentry drops to zero. +func (d *Dentry) OnZeroWatches() { + d.impl.OnZeroWatches() +} + // The following functions are exported so that filesystem implementations can // use them. The vfs package, and users of VFS, should not call these // functions. diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 8297f964b..5b009b928 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -31,6 +31,7 @@ type EpollInstance struct { vfsfd FileDescription FileDescriptionDefaultImpl DentryMetadataFileDescriptionImpl + NoLockFD // q holds waiters on this EpollInstance. q waiter.Queue @@ -185,7 +186,7 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin } // Register interest in file. - mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP + mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP epi := &epollInterest{ epoll: ep, key: key, @@ -256,7 +257,7 @@ func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event } // Update epi for the next call to ep.ReadEvents(). - mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP + mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP ep.mu.Lock() epi.mask = mask epi.userData = event.Data diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index cfabd936c..93861fb4a 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -42,11 +42,20 @@ type FileDescription struct { // operations. refs int64 + // flagsMu protects statusFlags and asyncHandler below. + flagsMu sync.Mutex + // statusFlags contains status flags, "initialized by open(2) and possibly - // modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic - // memory operations. + // modified by fcntl()" - fcntl(2). statusFlags can be read using atomic + // memory operations when it does not need to be synchronized with an + // access to asyncHandler. statusFlags uint32 + // asyncHandler handles O_ASYNC signal generation. It is set with the + // F_SETOWN or F_SETOWN_EX fcntls. For asyncHandler to be used, O_ASYNC must + // also be set by fcntl(2). + asyncHandler FileAsync + // epolls is the set of epollInterests registered for this FileDescription. // epolls is protected by epollMu. epollMu sync.Mutex @@ -73,6 +82,8 @@ type FileDescription struct { // writable is analogous to Linux's FMODE_WRITE. writable bool + usedLockBSD uint32 + // impl is the FileDescriptionImpl associated with this Filesystem. impl is // immutable. This should be the last field in FileDescription. impl FileDescriptionImpl @@ -80,8 +91,7 @@ type FileDescription struct { // FileDescriptionOptions contains options to FileDescription.Init(). type FileDescriptionOptions struct { - // If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is - // usually only the case if O_DIRECT would actually have an effect. + // If AllowDirectIO is true, allow O_DIRECT to be set on the file. AllowDirectIO bool // If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE. @@ -106,6 +116,10 @@ type FileDescriptionOptions struct { UseDentryMetadata bool } +// FileCreationFlags are the set of flags passed to FileDescription.Init() but +// omitted from FileDescription.StatusFlags(). +const FileCreationFlags = linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC + // Init must be called before first use of fd. If it succeeds, it takes // references on mnt and d. flags is the initial file description flags, which // is usually the full set of flags passed to open(2). @@ -120,8 +134,8 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou fd.refs = 1 // Remove "file creation flags" to mirror the behavior from file.f_flags in - // fs/open.c:do_dentry_open - fd.statusFlags = flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC) + // fs/open.c:do_dentry_open. + fd.statusFlags = flags &^ FileCreationFlags fd.vd = VirtualDentry{ mount: mnt, dentry: d, @@ -175,12 +189,25 @@ func (fd *FileDescription) DecRef() { } ep.interestMu.Unlock() } + + // If BSD locks were used, release any lock that it may have acquired. + if atomic.LoadUint32(&fd.usedLockBSD) != 0 { + fd.impl.UnlockBSD(context.Background(), fd) + } + // Release implementation resources. fd.impl.Release() if fd.writable { fd.vd.mount.EndWrite() } fd.vd.DecRef() + fd.flagsMu.Lock() + // TODO(gvisor.dev/issue/1663): We may need to unregister during save, as we do in VFS1. + if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil { + fd.asyncHandler.Unregister(fd) + } + fd.asyncHandler = nil + fd.flagsMu.Unlock() } else if refs < 0 { panic("FileDescription.DecRef() called without holding a reference") } @@ -210,6 +237,11 @@ func (fd *FileDescription) VirtualDentry() VirtualDentry { return fd.vd } +// Options returns the options passed to fd.Init(). +func (fd *FileDescription) Options() FileDescriptionOptions { + return fd.opts +} + // StatusFlags returns file description status flags, as for fcntl(F_GETFL). func (fd *FileDescription) StatusFlags() uint32 { return atomic.LoadUint32(&fd.statusFlags) @@ -259,7 +291,18 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede } // TODO(jamieliu): FileDescriptionImpl.SetOAsync()? const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK - atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags)) + fd.flagsMu.Lock() + if fd.asyncHandler != nil { + // Use fd.statusFlags instead of oldFlags, which may have become outdated, + // to avoid double registering/unregistering. + if fd.statusFlags&linux.O_ASYNC == 0 && flags&linux.O_ASYNC != 0 { + fd.asyncHandler.Register(fd) + } else if fd.statusFlags&linux.O_ASYNC != 0 && flags&linux.O_ASYNC == 0 { + fd.asyncHandler.Unregister(fd) + } + } + fd.statusFlags = (oldFlags &^ settableFlags) | (flags & settableFlags) + fd.flagsMu.Unlock() return nil } @@ -311,6 +354,10 @@ type FileDescriptionImpl interface { // represented by the FileDescription. StatFS(ctx context.Context) (linux.Statfs, error) + // Allocate grows the file to offset + length bytes. + // Only mode == 0 is supported currently. + Allocate(ctx context.Context, mode, offset, length uint64) error + // waiter.Waitable methods may be used to poll for I/O events. waiter.Waitable @@ -415,24 +462,16 @@ type FileDescriptionImpl interface { Removexattr(ctx context.Context, name string) error // LockBSD tries to acquire a BSD-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): BSD-style file locking LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error - // LockBSD releases a BSD-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): BSD-style file locking + // UnlockBSD releases a BSD-style advisory file lock. UnlockBSD(ctx context.Context, uid lock.UniqueID) error // LockPOSIX tries to acquire a POSIX-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): POSIX-style file locking - LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error + LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, length uint64, whence int16, block lock.Blocker) error // UnlockPOSIX releases a POSIX-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): POSIX-style file locking - UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error + UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, length uint64, whence int16) error } // Dirent holds the information contained in struct linux_dirent64. @@ -462,6 +501,15 @@ type IterDirentsCallback interface { Handle(dirent Dirent) error } +// IterDirentsCallbackFunc implements IterDirentsCallback for a function with +// the semantics of IterDirentsCallback.Handle. +type IterDirentsCallbackFunc func(dirent Dirent) error + +// Handle implements IterDirentsCallback.Handle. +func (f IterDirentsCallbackFunc) Handle(dirent Dirent) error { + return f(dirent) +} + // OnClose is called when a file descriptor representing the FileDescription is // closed. Returning a non-nil error should not prevent the file descriptor // from being closed. @@ -515,17 +563,28 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { return fd.impl.StatFS(ctx) } -// Readiness returns fd's I/O readiness. +// Allocate grows file represented by FileDescription to offset + length bytes. +func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { + return fd.impl.Allocate(ctx, mode, offset, length) +} + +// Readiness implements waiter.Waitable.Readiness. +// +// It returns fd's I/O readiness. func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return fd.impl.Readiness(mask) } -// EventRegister registers e for I/O readiness events in mask. +// EventRegister implements waiter.Waitable.EventRegister. +// +// It registers e for I/O readiness events in mask. func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { fd.impl.EventRegister(e, mask) } -// EventUnregister unregisters e for I/O readiness events. +// EventUnregister implements waiter.Waitable.EventUnregister. +// +// It unregisters e for I/O readiness events. func (fd *FileDescription) EventUnregister(e *waiter.Entry) { fd.impl.EventUnregister(e) } @@ -731,3 +790,53 @@ func (fd *FileDescription) InodeID() uint64 { func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error { return fd.Sync(ctx) } + +// LockBSD tries to acquire a BSD-style advisory file lock. +func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, blocker lock.Blocker) error { + atomic.StoreUint32(&fd.usedLockBSD, 1) + return fd.impl.LockBSD(ctx, fd, lockType, blocker) +} + +// UnlockBSD releases a BSD-style advisory file lock. +func (fd *FileDescription) UnlockBSD(ctx context.Context) error { + return fd.impl.UnlockBSD(ctx, fd) +} + +// LockPOSIX locks a POSIX-style file range lock. +func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, end uint64, whence int16, block lock.Blocker) error { + return fd.impl.LockPOSIX(ctx, uid, t, start, end, whence, block) +} + +// UnlockPOSIX unlocks a POSIX-style file range lock. +func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, end uint64, whence int16) error { + return fd.impl.UnlockPOSIX(ctx, uid, start, end, whence) +} + +// A FileAsync sends signals to its owner when w is ready for IO. This is only +// implemented by pkg/sentry/fasync:FileAsync, but we unfortunately need this +// interface to avoid circular dependencies. +type FileAsync interface { + Register(w waiter.Waitable) + Unregister(w waiter.Waitable) +} + +// AsyncHandler returns the FileAsync for fd. +func (fd *FileDescription) AsyncHandler() FileAsync { + fd.flagsMu.Lock() + defer fd.flagsMu.Unlock() + return fd.asyncHandler +} + +// SetAsyncHandler sets fd.asyncHandler if it has not been set before and +// returns it. +func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsync { + fd.flagsMu.Lock() + defer fd.flagsMu.Unlock() + if fd.asyncHandler == nil { + fd.asyncHandler = newHandler() + if fd.statusFlags&linux.O_ASYNC != 0 { + fd.asyncHandler.Register(fd) + } + } + return fd.asyncHandler +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index f4c111926..6b8b4ad49 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -21,7 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs/lock" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -56,6 +56,12 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err return linux.Statfs{}, syserror.ENOSYS } +// Allocate implements FileDescriptionImpl.Allocate analogously to +// fallocate called on regular file, directory or FIFO in Linux. +func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ENODEV +} + // Readiness implements waiter.Waitable.Readiness analogously to // file_operations::poll == NULL in Linux. func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask { @@ -153,31 +159,16 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) return syserror.ENOTSUP } -// LockBSD implements FileDescriptionImpl.LockBSD. -func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error { - return syserror.EBADF -} - -// UnlockBSD implements FileDescriptionImpl.UnlockBSD. -func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error { - return syserror.EBADF -} - -// LockPOSIX implements FileDescriptionImpl.LockPOSIX. -func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error { - return syserror.EBADF -} - -// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. -func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error { - return syserror.EBADF -} - // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl that always represent directories to obtain // implementations of non-directory I/O methods that return EISDIR. type DirectoryFileDescriptionDefaultImpl struct{} +// Allocate implements DirectoryFileDescriptionDefaultImpl.Allocate. +func (DirectoryFileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.EISDIR +} + // PRead implements FileDescriptionImpl.PRead. func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, syserror.EISDIR @@ -347,7 +338,7 @@ func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src writable, ok := fd.data.(WritableDynamicBytesSource) if !ok { - return 0, syserror.EINVAL + return 0, syserror.EIO } n, err := writable.Write(ctx, src, offset) if err != nil { @@ -384,3 +375,54 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M fd.IncRef() return nil } + +// LockFD may be used by most implementations of FileDescriptionImpl.Lock* +// functions. Caller must call Init(). +type LockFD struct { + locks *FileLocks +} + +// Init initializes fd with FileLocks to use. +func (fd *LockFD) Init(locks *FileLocks) { + fd.locks = locks +} + +// Locks returns the locks associated with this file. +func (fd *LockFD) Locks() *FileLocks { + return fd.locks +} + +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + return fd.locks.LockBSD(uid, t, block) +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { + fd.locks.UnlockBSD(uid) + return nil +} + +// NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface +// returning ENOLCK. +type NoLockFD struct{} + +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + return syserror.ENOLCK +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { + return syserror.ENOLCK +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return syserror.ENOLCK +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return syserror.ENOLCK +} diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 3a75d4d62..3b7e1c273 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -33,6 +33,7 @@ import ( type fileDescription struct { vfsfd FileDescription FileDescriptionDefaultImpl + NoLockFD } // genCount contains the number of times its DynamicBytesSource.Generate() @@ -154,11 +155,11 @@ func TestGenCountFD(t *testing.T) { } // Write and PWrite fails. - if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EINVAL { - t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL) + if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EIO { + t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO) } - if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EINVAL { - t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL) + if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EIO { + t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO) } } diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 1edd584c9..6bb9ca180 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -524,8 +524,6 @@ type FilesystemImpl interface { // // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl. PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error - - // TODO(gvisor.dev/issue/1479): inotify_add_watch() } // PrependPathAtVFSRootError is returned by implementations of diff --git a/pkg/sentry/vfs/g3doc/inotify.md b/pkg/sentry/vfs/g3doc/inotify.md new file mode 100644 index 000000000..e7da49faa --- /dev/null +++ b/pkg/sentry/vfs/g3doc/inotify.md @@ -0,0 +1,210 @@ +# Inotify + +Inotify is a mechanism for monitoring filesystem events in Linux--see +inotify(7). An inotify instance can be used to monitor files and directories for +modifications, creation/deletion, etc. The inotify API consists of system calls +that create inotify instances (inotify_init/inotify_init1) and add/remove +watches on files to an instance (inotify_add_watch/inotify_rm_watch). Events are +generated from various places in the sentry, including the syscall layer, the +vfs layer, the process fd table, and within each filesystem implementation. This +document outlines the implementation details of inotify in VFS2. + +## Inotify Objects + +Inotify data structures are implemented in the vfs package. + +### vfs.Inotify + +Inotify instances are represented by vfs.Inotify objects, which implement +vfs.FileDescriptionImpl. As in Linux, inotify fds are backed by a +pseudo-filesystem (anonfs). Each inotify instance receives events from a set of +vfs.Watch objects, which can be modified with inotify_add_watch(2) and +inotify_rm_watch(2). An application can retrieve events by reading the inotify +fd. + +### vfs.Watches + +The set of all watches held on a single file (i.e., the watch target) is stored +in vfs.Watches. Each watch will belong to a different inotify instance (an +instance can only have one watch on any watch target). The watches are stored in +a map indexed by their vfs.Inotify owner’s id. Hard links and file descriptions +to a single file will all share the same vfs.Watches. Activity on the target +causes its vfs.Watches to generate notifications on its watches’ inotify +instances. + +### vfs.Watch + +A single watch, owned by one inotify instance and applied to one watch target. +Both the vfs.Inotify owner and vfs.Watches on the target will hold a vfs.Watch, +which leads to some complicated locking behavior (see Lock Ordering). Whenever a +watch is notified of an event on its target, it will queue events to its inotify +instance for delivery to the user. + +### vfs.Event + +vfs.Event is a simple struct encapsulating all the fields for an inotify event. +It is generated by vfs.Watches and forwarded to the watches' owners. It is +serialized to the user during read(2) syscalls on the associated fs.Inotify's +fd. + +## Lock Ordering + +There are three locks related to the inotify implementation: + +Inotify.mu: the inotify instance lock. Inotify.evMu: the inotify event queue +lock. Watches.mu: the watch set lock, used to protect the collection of watches +on a target. + +The correct lock ordering for inotify code is: + +Inotify.mu -> Watches.mu -> Inotify.evMu. + +Note that we use a distinct lock to protect the inotify event queue. If we +simply used Inotify.mu, we could simultaneously have locks being acquired in the +order of Inotify.mu -> Watches.mu and Watches.mu -> Inotify.mu, which would +cause deadlocks. For instance, adding a watch to an inotify instance would +require locking Inotify.mu, and then adding the same watch to the target would +cause Watches.mu to be held. At the same time, generating an event on the target +would require Watches.mu to be held before iterating through each watch, and +then notifying the owner of each watch would cause Inotify.mu to be held. + +See the vfs package comment to understand how inotify locks fit into the overall +ordering of filesystem locks. + +## Watch Targets in Different Filesystem Implementations + +In Linux, watches reside on inodes at the virtual filesystem layer. As a result, +all hard links and file descriptions on a single file will all share the same +watch set. In VFS2, there is no common inode structure across filesystem types +(some may not even have inodes), so we have to plumb inotify support through +each specific filesystem implementation. Some of the technical considerations +are outlined below. + +### Tmpfs + +For filesystems with inodes, like tmpfs, the design is quite similar to that of +Linux, where watches reside on the inode. + +### Pseudo-filesystems + +Technically, because inotify is implemented at the vfs layer in Linux, +pseudo-filesystems on top of kernfs support inotify passively. However, watches +can only track explicit filesystem operations like read/write, open/close, +mknod, etc., so watches on a target like /proc/self/fd will not generate events +every time a new fd is added or removed. As of this writing, we leave inotify +unimplemented in kernfs and anonfs; it does not seem particularly useful. + +### Gofer Filesystem (fsimpl/gofer) + +The gofer filesystem has several traits that make it difficult to support +inotify: + +* **There are no inodes.** A file is represented as a dentry that holds an + unopened p9 file (and possibly an open FID), through which the Sentry + interacts with the gofer. + * *Solution:* Because there is no inode structure stored in the sandbox, + inotify watches must be held on the dentry. This would be an issue in + the presence of hard links, where multiple dentries would need to share + the same set of watches, but in VFS2, we do not support the internal + creation of hard links on gofer fs. As a result, we make the assumption + that every dentry corresponds to a unique inode. However, the next point + raises an issue with this assumption: +* **The Sentry cannot always be aware of hard links on the remote + filesystem.** There is no way for us to confirm whether two files on the + remote filesystem are actually links to the same inode. QIDs and inodes are + not always 1:1. The assumption that dentries and inodes are 1:1 is + inevitably broken if there are remote hard links that we cannot detect. + * *Solution:* this is an issue with gofer fs in general, not only inotify, + and we will have to live with it. +* **Dentries can be cached, and then evicted.** Dentry lifetime does not + correspond to file lifetime. Because gofer fs is not entirely in-memory, the + absence of a dentry does not mean that the corresponding file does not + exist, nor does a dentry reaching zero references mean that the + corresponding file no longer exists. When a dentry reaches zero references, + it will be cached, in case the file at that path is needed again in the + future. However, the dentry may be evicted from the cache, which will cause + a new dentry to be created next time the same file path is used. The + existing watches will be lost. + * *Solution:* When a dentry reaches zero references, do not cache it if it + has any watches, so we can avoid eviction/destruction. Note that if the + dentry was deleted or invalidated (d.vfsd.IsDead()), we should still + destroy it along with its watches. Additionally, when a dentry’s last + watch is removed, we cache it if it also has zero references. This way, + the dentry can eventually be evicted from memory if it is no longer + needed. +* **Dentries can be invalidated.** Another issue with dentry lifetime is that + the remote file at the file path represented may change from underneath the + dentry. In this case, the next time that the dentry is used, it will be + invalidated and a new dentry will replace it. In this case, it is not clear + what should be done with the watches on the old dentry. + * *Solution:* Silently destroy the watches when invalidation occurs. We + have no way of knowing exactly what happened, when it happens. Inotify + instances on NFS files in Linux probably behave in a similar fashion, + since inotify is implemented at the vfs layer and is not aware of the + complexities of remote file systems. + * An alternative would be to issue some kind of event upon invalidation, + e.g. a delete event, but this has several issues: + * We cannot discern whether the remote file was invalidated because it was + moved, deleted, etc. This information is crucial, because these cases + should result in different events. Furthermore, the watches should only + be destroyed if the file has been deleted. + * Moreover, the mechanism for detecting whether the underlying file has + changed is to check whether a new QID is given by the gofer. This may + result in false positives, e.g. suppose that the server closed and + re-opened the same file, which may result in a new QID. + * Finally, the time of the event may be completely different from the time + of the file modification, since a dentry is not immediately notified + when the underlying file has changed. It would be quite unexpected to + receive the notification when invalidation was triggered, i.e. the next + time the file was accessed within the sandbox, because then the + read/write/etc. operation on the file would not result in the expected + event. + * Another point in favor of the first solution: inotify in Linux can + already be lossy on local filesystems (one of the sacrifices made so + that filesystem performance isn’t killed), and it is lossy on NFS for + similar reasons to gofer fs. Therefore, it is better for inotify to be + silent than to emit incorrect notifications. +* **There may be external users of the remote filesystem.** We can only track + operations performed on the file within the sandbox. This is sufficient + under InteropModeExclusive, but whenever there are external users, the set + of actions we are aware of is incomplete. + * *Solution:* We could either return an error or just issue a warning when + inotify is used without InteropModeExclusive. Although faulty, VFS1 + allows it when the filesystem is shared, and Linux does the same for + remote filesystems (as mentioned above, inotify sits at the vfs level). + +## Dentry Interface + +For events that must be generated above the vfs layer, we provide the following +DentryImpl methods to allow interactions with targets on any FilesystemImpl: + +* **InotifyWithParent()** generates events on the dentry’s watches as well as + its parent’s. +* **Watches()** retrieves the watch set of the target represented by the + dentry. This is used to access and modify watches on a target. +* **OnZeroWatches()** performs cleanup tasks after the last watch is removed + from a dentry. This is needed by gofer fs, which must allow a watched dentry + to be cached once it has no more watches. Most implementations can just do + nothing. Note that OnZeroWatches() must be called after all inotify locks + are released to preserve lock ordering, since it may acquire + FilesystemImpl-specific locks. + +## IN_EXCL_UNLINK + +There are several options that can be set for a watch, specified as part of the +mask in inotify_add_watch(2). In particular, IN_EXCL_UNLINK requires some +additional support in each filesystem. + +A watch with IN_EXCL_UNLINK will not generate events for its target if it +corresponds to a path that was unlinked. For instance, if an fd is opened on +“foo/bar” and “foo/bar” is subsequently unlinked, any reads/writes/etc. on the +fd will be ignored by watches on “foo” or “foo/bar” with IN_EXCL_UNLINK. This +requires each DentryImpl to keep track of whether it has been unlinked, in order +to determine whether events should be sent to watches with IN_EXCL_UNLINK. + +## IN_ONESHOT + +One-shot watches expire after generating a single event. When an event occurs, +all one-shot watches on the target that successfully generated an event are +removed. Lock ordering can cause the management of one-shot watches to be quite +expensive; see Watches.Notify() for more information. diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go index 286510195..8882fa84a 100644 --- a/pkg/sentry/vfs/genericfstree/genericfstree.go +++ b/pkg/sentry/vfs/genericfstree/genericfstree.go @@ -43,7 +43,7 @@ type Dentry struct { // IsAncestorDentry returns true if d is an ancestor of d2; that is, d is // either d2's parent or an ancestor of d2's parent. func IsAncestorDentry(d, d2 *Dentry) bool { - for { + for d2 != nil { // Stop at root, where d2.parent == nil. if d2.parent == d { return true } @@ -52,6 +52,7 @@ func IsAncestorDentry(d, d2 *Dentry) bool { } d2 = d2.parent } + return false } // ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d. diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go new file mode 100644 index 000000000..167b731ac --- /dev/null +++ b/pkg/sentry/vfs/inotify.go @@ -0,0 +1,774 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "bytes" + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// inotifyEventBaseSize is the base size of linux's struct inotify_event. This +// must be a power 2 for rounding below. +const inotifyEventBaseSize = 16 + +// EventType defines different kinds of inotfiy events. +// +// The way events are labelled appears somewhat arbitrary, but they must match +// Linux so that IN_EXCL_UNLINK behaves as it does in Linux. +type EventType uint8 + +// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and +// FSNOTIFY_EVENT_INODE in Linux. +const ( + PathEvent EventType = iota + InodeEvent EventType = iota +) + +// Inotify represents an inotify instance created by inotify_init(2) or +// inotify_init1(2). Inotify implements FileDescriptionImpl. +// +// +stateify savable +type Inotify struct { + vfsfd FileDescription + FileDescriptionDefaultImpl + DentryMetadataFileDescriptionImpl + NoLockFD + + // Unique identifier for this inotify instance. We don't just reuse the + // inotify fd because fds can be duped. These should not be exposed to the + // user, since we may aggressively reuse an id on S/R. + id uint64 + + // queue is used to notify interested parties when the inotify instance + // becomes readable or writable. + queue waiter.Queue `state:"nosave"` + + // evMu *only* protects the events list. We need a separate lock while + // queuing events: using mu may violate lock ordering, since at that point + // the calling goroutine may already hold Watches.mu. + evMu sync.Mutex `state:"nosave"` + + // A list of pending events for this inotify instance. Protected by evMu. + events eventList + + // A scratch buffer, used to serialize inotify events. Allocate this + // ahead of time for the sake of performance. Protected by evMu. + scratch []byte + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // nextWatchMinusOne is used to allocate watch descriptors on this Inotify + // instance. Note that Linux starts numbering watch descriptors from 1. + nextWatchMinusOne int32 + + // Map from watch descriptors to watch objects. + watches map[int32]*Watch +} + +var _ FileDescriptionImpl = (*Inotify)(nil) + +// NewInotifyFD constructs a new Inotify instance. +func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) { + // O_CLOEXEC affects file descriptors, so it must be handled outside of vfs. + flags &^= linux.O_CLOEXEC + if flags&^linux.O_NONBLOCK != 0 { + return nil, syserror.EINVAL + } + + id := uniqueid.GlobalFromContext(ctx) + vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id)) + defer vd.DecRef() + fd := &Inotify{ + id: id, + scratch: make([]byte, inotifyEventBaseSize), + watches: make(map[int32]*Watch), + } + if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// Release implements FileDescriptionImpl.Release. Release removes all +// watches and frees all resources for an inotify instance. +func (i *Inotify) Release() { + var ds []*Dentry + + // We need to hold i.mu to avoid a race with concurrent calls to + // Inotify.handleDeletion from Watches. There's no risk of Watches + // accessing this Inotify after the destructor ends, because we remove all + // references to it below. + i.mu.Lock() + for _, w := range i.watches { + // Remove references to the watch from the watches set on the target. We + // don't need to worry about the references from i.watches, since this + // file description is about to be destroyed. + d := w.target + ws := d.Watches() + // Watchable dentries should never return a nil watch set. + if ws == nil { + panic("Cannot remove watch from an unwatchable dentry") + } + ws.Remove(i.id) + if ws.Size() == 0 { + ds = append(ds, d) + } + } + i.mu.Unlock() + + for _, d := range ds { + d.OnZeroWatches() + } +} + +// Allocate implements FileDescription.Allocate. +func (i *Inotify) Allocate(ctx context.Context, mode, offset, length uint64) error { + panic("Allocate should not be called on read-only inotify fds") +} + +// EventRegister implements waiter.Waitable. +func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + i.queue.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable. +func (i *Inotify) EventUnregister(e *waiter.Entry) { + i.queue.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +// +// Readiness indicates whether there are pending events for an inotify instance. +func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + i.evMu.Lock() + defer i.evMu.Unlock() + + if !i.events.Empty() { + ready |= waiter.EventIn + } + + return mask & ready +} + +// PRead implements FileDescriptionImpl.PRead. +func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// PWrite implements FileDescriptionImpl.PWrite. +func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements FileDescriptionImpl.Write. +func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Read implements FileDescriptionImpl.Read. +func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + if dst.NumBytes() < inotifyEventBaseSize { + return 0, syserror.EINVAL + } + + i.evMu.Lock() + defer i.evMu.Unlock() + + if i.events.Empty() { + // Nothing to read yet, tell caller to block. + return 0, syserror.ErrWouldBlock + } + + var writeLen int64 + for it := i.events.Front(); it != nil; { + // Advance `it` before the element is removed from the list, or else + // it.Next() will always be nil. + event := it + it = it.Next() + + // Does the buffer have enough remaining space to hold the event we're + // about to write out? + if dst.NumBytes() < int64(event.sizeOf()) { + if writeLen > 0 { + // Buffer wasn't big enough for all pending events, but we did + // write some events out. + return writeLen, nil + } + return 0, syserror.EINVAL + } + + // Linux always dequeues an available event as long as there's enough + // buffer space to copy it out, even if the copy below fails. Emulate + // this behaviour. + i.events.Remove(event) + + // Buffer has enough space, copy event to the read buffer. + n, err := event.CopyTo(ctx, i.scratch, dst) + if err != nil { + return 0, err + } + + writeLen += n + dst = dst.DropFirst64(n) + } + return writeLen, nil +} + +// Ioctl implements FileDescriptionImpl.Ioctl. +func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch args[1].Int() { + case linux.FIONREAD: + i.evMu.Lock() + defer i.evMu.Unlock() + var n uint32 + for e := i.events.Front(); e != nil; e = e.Next() { + n += uint32(e.sizeOf()) + } + var buf [4]byte + usermem.ByteOrder.PutUint32(buf[:], n) + _, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{}) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} + +func (i *Inotify) queueEvent(ev *Event) { + i.evMu.Lock() + + // Check if we should coalesce the event we're about to queue with the last + // one currently in the queue. Events are coalesced if they are identical. + if last := i.events.Back(); last != nil { + if ev.equals(last) { + // "Coalesce" the two events by simply not queuing the new one. We + // don't need to raise a waiter.EventIn notification because no new + // data is available for reading. + i.evMu.Unlock() + return + } + } + + i.events.PushBack(ev) + + // Release mutex before notifying waiters because we don't control what they + // can do. + i.evMu.Unlock() + + i.queue.Notify(waiter.EventIn) +} + +// newWatchLocked creates and adds a new watch to target. +// +// Precondition: i.mu must be locked. ws must be the watch set for target d. +func (i *Inotify) newWatchLocked(d *Dentry, ws *Watches, mask uint32) *Watch { + w := &Watch{ + owner: i, + wd: i.nextWatchIDLocked(), + target: d, + mask: mask, + } + + // Hold the watch in this inotify instance as well as the watch set on the + // target. + i.watches[w.wd] = w + ws.Add(w) + return w +} + +// newWatchIDLocked allocates and returns a new watch descriptor. +// +// Precondition: i.mu must be locked. +func (i *Inotify) nextWatchIDLocked() int32 { + i.nextWatchMinusOne++ + return i.nextWatchMinusOne +} + +// AddWatch constructs a new inotify watch and adds it to the target. It +// returns the watch descriptor returned by inotify_add_watch(2). +// +// The caller must hold a reference on target. +func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) { + // Note: Locking this inotify instance protects the result returned by + // Lookup() below. With the lock held, we know for sure the lookup result + // won't become stale because it's impossible for *this* instance to + // add/remove watches on target. + i.mu.Lock() + defer i.mu.Unlock() + + ws := target.Watches() + if ws == nil { + // While Linux supports inotify watches on all filesystem types, watches on + // filesystems like kernfs are not generally useful, so we do not. + return 0, syserror.EPERM + } + // Does the target already have a watch from this inotify instance? + if existing := ws.Lookup(i.id); existing != nil { + newmask := mask + if mask&linux.IN_MASK_ADD != 0 { + // "Add (OR) events to watch mask for this pathname if it already + // exists (instead of replacing mask)." -- inotify(7) + newmask |= atomic.LoadUint32(&existing.mask) + } + atomic.StoreUint32(&existing.mask, newmask) + return existing.wd, nil + } + + // No existing watch, create a new watch. + w := i.newWatchLocked(target, ws, mask) + return w.wd, nil +} + +// RmWatch looks up an inotify watch for the given 'wd' and configures the +// target to stop sending events to this inotify instance. +func (i *Inotify) RmWatch(wd int32) error { + i.mu.Lock() + + // Find the watch we were asked to removed. + w, ok := i.watches[wd] + if !ok { + i.mu.Unlock() + return syserror.EINVAL + } + + // Remove the watch from this instance. + delete(i.watches, wd) + + // Remove the watch from the watch target. + ws := w.target.Watches() + // AddWatch ensures that w.target has a non-nil watch set. + if ws == nil { + panic("Watched dentry cannot have nil watch set") + } + ws.Remove(w.OwnerID()) + remaining := ws.Size() + i.mu.Unlock() + + if remaining == 0 { + w.target.OnZeroWatches() + } + + // Generate the event for the removal. + i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0)) + + return nil +} + +// Watches is the collection of all inotify watches on a single file. +// +// +stateify savable +type Watches struct { + // mu protects the fields below. + mu sync.RWMutex `state:"nosave"` + + // ws is the map of active watches in this collection, keyed by the inotify + // instance id of the owner. + ws map[uint64]*Watch +} + +// Size returns the number of watches held by w. +func (w *Watches) Size() int { + w.mu.Lock() + defer w.mu.Unlock() + return len(w.ws) +} + +// Lookup returns the watch owned by an inotify instance with the given id. +// Returns nil if no such watch exists. +// +// Precondition: the inotify instance with the given id must be locked to +// prevent the returned watch from being concurrently modified or replaced in +// Inotify.watches. +func (w *Watches) Lookup(id uint64) *Watch { + w.mu.Lock() + defer w.mu.Unlock() + return w.ws[id] +} + +// Add adds watch into this set of watches. +// +// Precondition: the inotify instance with the given id must be locked. +func (w *Watches) Add(watch *Watch) { + w.mu.Lock() + defer w.mu.Unlock() + + owner := watch.OwnerID() + // Sanity check, we should never have two watches for one owner on the + // same target. + if _, exists := w.ws[owner]; exists { + panic(fmt.Sprintf("Watch collision with ID %+v", owner)) + } + if w.ws == nil { + w.ws = make(map[uint64]*Watch) + } + w.ws[owner] = watch +} + +// Remove removes a watch with the given id from this set of watches and +// releases it. The caller is responsible for generating any watch removal +// event, as appropriate. The provided id must match an existing watch in this +// collection. +// +// Precondition: the inotify instance with the given id must be locked. +func (w *Watches) Remove(id uint64) { + w.mu.Lock() + defer w.mu.Unlock() + + if w.ws == nil { + // This watch set is being destroyed. The thread executing the + // destructor is already in the process of deleting all our watches. We + // got here with no references on the target because we raced with the + // destructor notifying all the watch owners of destruction. See the + // comment in Watches.HandleDeletion for why this race exists. + return + } + + // It is possible for w.Remove() to be called for the same watch multiple + // times. See the treatment of one-shot watches in Watches.Notify(). + if _, ok := w.ws[id]; ok { + delete(w.ws, id) + } +} + +// Notify queues a new event with watches in this set. Watches with +// IN_EXCL_UNLINK are skipped if the event is coming from a child that has been +// unlinked. +func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlinked bool) { + var hasExpired bool + w.mu.RLock() + for _, watch := range w.ws { + if unlinked && watch.ExcludeUnlinked() && et == PathEvent { + continue + } + if watch.Notify(name, events, cookie) { + hasExpired = true + } + } + w.mu.RUnlock() + + if hasExpired { + w.cleanupExpiredWatches() + } +} + +// This function is relatively expensive and should only be called where there +// are expired watches. +func (w *Watches) cleanupExpiredWatches() { + // Because of lock ordering, we cannot acquire Inotify.mu for each watch + // owner while holding w.mu. As a result, store expired watches locally + // before removing. + var toRemove []*Watch + w.mu.RLock() + for _, watch := range w.ws { + if atomic.LoadInt32(&watch.expired) == 1 { + toRemove = append(toRemove, watch) + } + } + w.mu.RUnlock() + for _, watch := range toRemove { + watch.owner.RmWatch(watch.wd) + } +} + +// HandleDeletion is called when the watch target is destroyed. Clear the +// watch set, detach watches from the inotify instances they belong to, and +// generate the appropriate events. +func (w *Watches) HandleDeletion() { + w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */) + + // As in Watches.Notify, we can't hold w.mu while acquiring Inotify.mu for + // the owner of each watch being deleted. Instead, atomically store the + // watches map in a local variable and set it to nil so we can iterate over + // it with the assurance that there will be no concurrent accesses. + var ws map[uint64]*Watch + w.mu.Lock() + ws = w.ws + w.ws = nil + w.mu.Unlock() + + // Remove each watch from its owner's watch set, and generate a corresponding + // watch removal event. + for _, watch := range ws { + i := watch.owner + i.mu.Lock() + _, found := i.watches[watch.wd] + delete(i.watches, watch.wd) + + // Release mutex before notifying waiters because we don't control what + // they can do. + i.mu.Unlock() + + // If watch was not found, it was removed from the inotify instance before + // we could get to it, in which case we should not generate an event. + if found { + i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0)) + } + } +} + +// Watch represent a particular inotify watch created by inotify_add_watch. +// +// +stateify savable +type Watch struct { + // Inotify instance which owns this watch. + // + // This field is immutable after creation. + owner *Inotify + + // Descriptor for this watch. This is unique across an inotify instance. + // + // This field is immutable after creation. + wd int32 + + // target is a dentry representing the watch target. Its watch set contains this watch. + // + // This field is immutable after creation. + target *Dentry + + // Events being monitored via this watch. Must be accessed with atomic + // memory operations. + mask uint32 + + // expired is set to 1 to indicate that this watch is a one-shot that has + // already sent a notification and therefore can be removed. Must be accessed + // with atomic memory operations. + expired int32 +} + +// OwnerID returns the id of the inotify instance that owns this watch. +func (w *Watch) OwnerID() uint64 { + return w.owner.id +} + +// ExcludeUnlinked indicates whether the watched object should continue to be +// notified of events originating from a path that has been unlinked. +// +// For example, if "foo/bar" is opened and then unlinked, operations on the +// open fd may be ignored by watches on "foo" and "foo/bar" with IN_EXCL_UNLINK. +func (w *Watch) ExcludeUnlinked() bool { + return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0 +} + +// Notify queues a new event on this watch. Returns true if this is a one-shot +// watch that should be deleted, after this event was successfully queued. +func (w *Watch) Notify(name string, events uint32, cookie uint32) bool { + if atomic.LoadInt32(&w.expired) == 1 { + // This is a one-shot watch that is already in the process of being + // removed. This may happen if a second event reaches the watch target + // before this watch has been removed. + return false + } + + mask := atomic.LoadUint32(&w.mask) + if mask&events == 0 { + // We weren't watching for this event. + return false + } + + // Event mask should include bits matched from the watch plus all control + // event bits. + unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS + effectiveMask := unmaskableBits | mask + matchedEvents := effectiveMask & events + w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie)) + if mask&linux.IN_ONESHOT != 0 { + atomic.StoreInt32(&w.expired, 1) + return true + } + return false +} + +// Event represents a struct inotify_event from linux. +// +// +stateify savable +type Event struct { + eventEntry + + wd int32 + mask uint32 + cookie uint32 + + // len is computed based on the name field is set automatically by + // Event.setName. It should be 0 when no name is set; otherwise it is the + // length of the name slice. + len uint32 + + // The name field has special padding requirements and should only be set by + // calling Event.setName. + name []byte +} + +func newEvent(wd int32, name string, events, cookie uint32) *Event { + e := &Event{ + wd: wd, + mask: events, + cookie: cookie, + } + if name != "" { + e.setName(name) + } + return e +} + +// paddedBytes converts a go string to a null-terminated c-string, padded with +// null bytes to a total size of 'l'. 'l' must be large enough for all the bytes +// in the 's' plus at least one null byte. +func paddedBytes(s string, l uint32) []byte { + if l < uint32(len(s)+1) { + panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!") + } + b := make([]byte, l) + copy(b, s) + + // b was zero-value initialized during make(), so the rest of the slice is + // already filled with null bytes. + + return b +} + +// setName sets the optional name for this event. +func (e *Event) setName(name string) { + // We need to pad the name such that the entire event length ends up a + // multiple of inotifyEventBaseSize. + unpaddedLen := len(name) + 1 + // Round up to nearest multiple of inotifyEventBaseSize. + e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1)) + // Make sure we haven't overflowed and wrapped around when rounding. + if unpaddedLen > int(e.len) { + panic("Overflow when rounding inotify event size, the 'name' field was too big.") + } + e.name = paddedBytes(name, e.len) +} + +func (e *Event) sizeOf() int { + s := inotifyEventBaseSize + int(e.len) + if s < inotifyEventBaseSize { + panic("Overflowed event size") + } + return s +} + +// CopyTo serializes this event to dst. buf is used as a scratch buffer to +// construct the output. We use a buffer allocated ahead of time for +// performance. buf must be at least inotifyEventBaseSize bytes. +func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) { + usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd)) + usermem.ByteOrder.PutUint32(buf[4:], e.mask) + usermem.ByteOrder.PutUint32(buf[8:], e.cookie) + usermem.ByteOrder.PutUint32(buf[12:], e.len) + + writeLen := 0 + + n, err := dst.CopyOut(ctx, buf) + if err != nil { + return 0, err + } + writeLen += n + dst = dst.DropFirst(n) + + if e.len > 0 { + n, err = dst.CopyOut(ctx, e.name) + if err != nil { + return 0, err + } + writeLen += n + } + + // Santiy check. + if writeLen != e.sizeOf() { + panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen)) + } + + return int64(writeLen), nil +} + +func (e *Event) equals(other *Event) bool { + return e.wd == other.wd && + e.mask == other.mask && + e.cookie == other.cookie && + e.len == other.len && + bytes.Equal(e.name, other.name) +} + +// InotifyEventFromStatMask generates the appropriate events for an operation +// that set the stats specified in mask. +func InotifyEventFromStatMask(mask uint32) uint32 { + var ev uint32 + if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 { + ev |= linux.IN_ATTRIB + } + if mask&linux.STATX_SIZE != 0 { + ev |= linux.IN_MODIFY + } + + if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) { + // Both times indicates a utime(s) call. + ev |= linux.IN_ATTRIB + } else if mask&linux.STATX_ATIME != 0 { + ev |= linux.IN_ACCESS + } else if mask&linux.STATX_MTIME != 0 { + mask |= linux.IN_MODIFY + } + return ev +} + +// InotifyRemoveChild sends the appriopriate notifications to the watch sets of +// the child being removed and its parent. Note that unlike most pairs of +// parent/child notifications, the child is notified first in this case. +func InotifyRemoveChild(self, parent *Watches, name string) { + if self != nil { + self.Notify("", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */) + } + if parent != nil { + parent.Notify(name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */) + } +} + +// InotifyRename sends the appriopriate notifications to the watch sets of the +// file being renamed and its old/new parents. +func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) { + var dirEv uint32 + if isDir { + dirEv = linux.IN_ISDIR + } + cookie := uniqueid.InotifyCookie(ctx) + if oldParent != nil { + oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */) + } + if newParent != nil { + newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */) + } + // Somewhat surprisingly, self move events do not have a cookie. + if renamed != nil { + renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */) + } +} diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock.go index 724dfe743..6c7583a81 100644 --- a/pkg/sentry/vfs/lock/lock.go +++ b/pkg/sentry/vfs/lock.go @@ -17,9 +17,11 @@ // // The actual implementations can be found in the lock package under // sentry/fs/lock. -package lock +package vfs import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -56,7 +58,11 @@ func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) { } // LockPOSIX tries to acquire a POSIX-style lock on a file region. -func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { +func (fl *FileLocks) LockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + rng, err := computeRange(ctx, fd, start, length, whence) + if err != nil { + return err + } if fl.posix.LockRegion(uid, t, rng, block) { return nil } @@ -67,6 +73,37 @@ func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fsloc // // This operation is always successful, even if there did not exist a lock on // the requested region held by uid in the first place. -func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) { +func (fl *FileLocks) UnlockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, start, length uint64, whence int16) error { + rng, err := computeRange(ctx, fd, start, length, whence) + if err != nil { + return err + } fl.posix.UnlockRegion(uid, rng) + return nil +} + +func computeRange(ctx context.Context, fd *FileDescription, start uint64, length uint64, whence int16) (fslock.LockRange, error) { + var off int64 + switch whence { + case linux.SEEK_SET: + off = 0 + case linux.SEEK_CUR: + // Note that Linux does not hold any mutexes while retrieving the file + // offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. + curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR) + if err != nil { + return fslock.LockRange{}, err + } + off = curOff + case linux.SEEK_END: + stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE}) + if err != nil { + return fslock.LockRange{}, err + } + off = int64(stat.Size) + default: + return fslock.LockRange{}, syserror.EINVAL + } + + return fslock.ComputeRange(int64(start), int64(length), off) } diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD deleted file mode 100644 index d9ab063b7..000000000 --- a/pkg/sentry/vfs/lock/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "lock", - srcs = ["lock.go"], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/sentry/fs/lock", - "//pkg/syserror", - ], -) diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 02850b65c..32f901bd8 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -28,9 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -// lastMountID is used to allocate mount ids. Must be accessed atomically. -var lastMountID uint64 - // A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem // (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem // (Mount.fs), which applies to path resolution in the context of a particular @@ -58,6 +55,10 @@ type Mount struct { // ID is the immutable mount ID. ID uint64 + // Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except + // for MS_RDONLY which is tracked in "writers". Immutable. + Flags MountFlags + // key is protected by VirtualFilesystem.mountMu and // VirtualFilesystem.mounts.seq, and may be nil. References are held on // key.parent and key.point if they are not nil. @@ -84,10 +85,6 @@ type Mount struct { // umounted is true. umounted is protected by VirtualFilesystem.mountMu. umounted bool - // flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except - // for MS_RDONLY which is tracked in "writers". - flags MountFlags - // The lower 63 bits of writers is the number of calls to // Mount.CheckBeginWrite() that have not yet been paired with a call to // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. @@ -97,11 +94,11 @@ type Mount struct { func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount { mnt := &Mount{ - ID: atomic.AddUint64(&lastMountID, 1), + ID: atomic.AddUint64(&vfs.lastMountID, 1), + Flags: opts.Flags, vfs: vfs, fs: fs, root: root, - flags: opts.Flags, ns: mntns, refs: 1, } @@ -111,8 +108,17 @@ func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *Mount return mnt } -// A MountNamespace is a collection of Mounts. -// +// Options returns a copy of the MountOptions currently applicable to mnt. +func (mnt *Mount) Options() MountOptions { + mnt.vfs.mountMu.Lock() + defer mnt.vfs.mountMu.Unlock() + return MountOptions{ + Flags: mnt.Flags, + ReadOnly: mnt.readOnly(), + } +} + +// A MountNamespace is a collection of Mounts.// // MountNamespaces are reference-counted. Unless otherwise specified, all // MountNamespace methods require that a reference is held. // @@ -120,6 +126,9 @@ func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *Mount // // +stateify savable type MountNamespace struct { + // Owner is the usernamespace that owns this mount namespace. + Owner *auth.UserNamespace + // root is the MountNamespace's root mount. root is immutable. root *Mount @@ -148,7 +157,7 @@ type MountNamespace struct { func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) { rft := vfs.getFilesystemType(fsTypeName) if rft == nil { - ctx.Warningf("Unknown filesystem: %s", fsTypeName) + ctx.Warningf("Unknown filesystem type: %s", fsTypeName) return nil, syserror.ENODEV } fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts) @@ -156,6 +165,7 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth return nil, err } mntns := &MountNamespace{ + Owner: creds.UserNamespace, refs: 1, mountpoints: make(map[*Dentry]uint32), } @@ -175,26 +185,34 @@ func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, return newMount(vfs, fs, root, nil /* mntns */, opts), nil } -// MountAt creates and mounts a Filesystem configured by the given arguments. -func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error { +// MountDisconnected creates a Filesystem configured by the given arguments, +// then returns a Mount representing it. The new Mount is not associated with +// any MountNamespace and is not connected to any other Mounts. +func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) { rft := vfs.getFilesystemType(fsTypeName) if rft == nil { - return syserror.ENODEV + return nil, syserror.ENODEV } if !opts.InternalMount && !rft.opts.AllowUserMount { - return syserror.ENODEV + return nil, syserror.ENODEV } fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { - return err + return nil, err } + defer root.DecRef() + defer fs.DecRef() + return vfs.NewDisconnectedMount(fs, root, opts) +} +// ConnectMountAt connects mnt at the path represented by target. +// +// Preconditions: mnt must be disconnected. +func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error { // We can't hold vfs.mountMu while calling FilesystemImpl methods due to // lock ordering. vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) if err != nil { - root.DecRef() - fs.DecRef() return err } vfs.mountMu.Lock() @@ -204,8 +222,6 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia vd.dentry.mu.Unlock() vfs.mountMu.Unlock() vd.DecRef() - root.DecRef() - fs.DecRef() return syserror.ENOENT } // vd might have been mounted over between vfs.GetDentryAt() and @@ -238,7 +254,6 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia // point and the mount root are directories, or neither are, and returns // ENOTDIR if this is not the case. mntns := vd.mount.ns - mnt := newMount(vfs, fs, root, mntns, opts) vfs.mounts.seq.BeginWrite() vfs.connectLocked(mnt, vd, mntns) vfs.mounts.seq.EndWrite() @@ -247,6 +262,19 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia return nil } +// MountAt creates and mounts a Filesystem configured by the given arguments. +func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error { + mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts) + if err != nil { + return err + } + defer mnt.DecRef() + if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil { + return err + } + return nil +} + // UmountAt removes the Mount at the given path. func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error { if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 { @@ -254,6 +282,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti } // MNT_FORCE is currently unimplemented except for the permission check. + // Force unmounting specifically requires CAP_SYS_ADMIN in the root user + // namespace, and not in the owner user namespace for the target mount. See + // fs/namespace.c:SYSCALL_DEFINE2(umount, ...) if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) { return syserror.EPERM } @@ -369,14 +400,22 @@ func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecu // references held by vd. // // Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a -// writer critical section. d.mu must be locked. mnt.parent() == nil. +// writer critical section. d.mu must be locked. mnt.parent() == nil, i.e. mnt +// must not already be connected. func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) { + if checkInvariants { + if mnt.parent() != nil { + panic("VFS.connectLocked called on connected mount") + } + } + mnt.IncRef() // dropped by callers of umountRecursiveLocked mnt.storeKey(vd) if vd.mount.children == nil { vd.mount.children = make(map[*Mount]struct{}) } vd.mount.children[mnt] = struct{}{} atomic.AddUint32(&vd.dentry.mounts, 1) + mnt.ns = mntns mntns.mountpoints[vd.dentry]++ vfs.mounts.insertSeqed(mnt) vfsmpmounts, ok := vfs.mountpoints[vd.dentry] @@ -394,6 +433,11 @@ func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns // writer critical section. mnt.parent() != nil. func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry { vd := mnt.loadKey() + if checkInvariants { + if vd.mount != nil { + panic("VFS.disconnectLocked called on disconnected mount") + } + } mnt.storeKey(VirtualDentry{}) delete(vd.mount.children, mnt) atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1 @@ -715,7 +759,10 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi if mnt.readOnly() { opts = "ro" } - if mnt.flags.NoExec { + if mnt.Flags.NoATime { + opts = ",noatime" + } + if mnt.Flags.NoExec { opts += ",noexec" } @@ -800,11 +847,12 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo if mnt.readOnly() { opts = "ro" } - if mnt.flags.NoExec { + if mnt.Flags.NoATime { + opts = ",noatime" + } + if mnt.Flags.NoExec { opts += ",noexec" } - // TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is - // set. fmt.Fprintf(buf, "%s ", opts) // (7) Optional fields: zero or more fields of the form "tag[:value]". diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index bc7581698..70f850ca4 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -13,7 +13,7 @@ // limitations under the License. // +build go1.12 -// +build !go1.15 +// +build !go1.16 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 53d364c5c..dfc8573fd 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -75,6 +75,21 @@ type MknodOptions struct { type MountFlags struct { // NoExec is equivalent to MS_NOEXEC. NoExec bool + + // NoATime is equivalent to MS_NOATIME and indicates that the + // filesystem should not update access time in-place. + NoATime bool + + // NoDev is equivalent to MS_NODEV and indicates that the + // filesystem should not allow access to devices (special files). + // TODO(gVisor.dev/issue/3186): respect this flag in non FUSE + // filesystems. + NoDev bool + + // NoSUID is equivalent to MS_NOSUID and indicates that the + // filesystem should not honor set-user-ID and set-group-ID bits or + // file capabilities when executing programs. + NoSUID bool } // MountOptions contains options to VirtualFilesystem.MountAt(). @@ -149,6 +164,12 @@ type SetStatOptions struct { // == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask // instead). Stat linux.Statx + + // NeedWritePerm indicates that write permission on the file is needed for + // this operation. This is needed for truncate(2) (note that ftruncate(2) + // does not require the same check--instead, it checks that the fd is + // writable). + NeedWritePerm bool } // BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt() diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index f9647f90e..33389c1df 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -94,6 +94,37 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linu return syserror.EACCES } +// MayLink determines whether creating a hard link to a file with the given +// mode, kuid, and kgid is permitted. +// +// This corresponds to Linux's fs/namei.c:may_linkat. +func MayLink(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { + // Source inode owner can hardlink all they like; otherwise, it must be a + // safe source. + if CanActAsOwner(creds, kuid) { + return nil + } + + // Only regular files can be hard linked. + if mode.FileType() != linux.S_IFREG { + return syserror.EPERM + } + + // Setuid files should not get pinned to the filesystem. + if mode&linux.S_ISUID != 0 { + return syserror.EPERM + } + + // Executable setgid files should not get pinned to the filesystem, but we + // don't support S_IXGRP anyway. + + // Hardlinking to unreadable or unwritable sources is dangerous. + if err := GenericCheckPermissions(creds, MayRead|MayWrite, mode, kuid, kgid); err != nil { + return syserror.EPERM + } + return nil +} + // AccessTypesForOpenFlags returns the access types required to open a file // with the given OpenOptions.Flags. Note that this is NOT the same thing as // the set of accesses permitted for the opened file: @@ -152,7 +183,8 @@ func MayWriteFileWithOpenFlags(flags uint32) bool { // CheckSetStat checks that creds has permission to change the metadata of a // file with the given permissions, UID, and GID as specified by stat, subject // to the rules of Linux's fs/attr.c:setattr_prepare(). -func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { +func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOptions, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { + stat := &opts.Stat if stat.Mask&linux.STATX_SIZE != 0 { limit, err := CheckLimit(ctx, 0, int64(stat.Size)) if err != nil { @@ -184,6 +216,11 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat return syserror.EPERM } } + if opts.NeedWritePerm && !creds.HasCapability(linux.CAP_DAC_OVERRIDE) { + if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil { + return err + } + } if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 { if !CanActAsOwner(creds, kuid) { if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) || @@ -199,6 +236,20 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat return nil } +// CheckDeleteSticky checks whether the sticky bit is set on a directory with +// the given file mode, and if so, checks whether creds has permission to +// remove a file owned by childKUID from a directory with the given mode. +// CheckDeleteSticky is consistent with fs/linux.h:check_sticky(). +func CheckDeleteSticky(creds *auth.Credentials, parentMode linux.FileMode, childKUID auth.KUID) error { + if parentMode&linux.ModeSticky == 0 { + return nil + } + if CanActAsOwner(creds, childKUID) { + return nil + } + return syserror.EPERM +} + // CanActAsOwner returns true if creds can act as the owner of a file with the // given owning UID, consistent with Linux's // fs/inode.c:inode_owner_or_capable(). diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 8d7f8f8af..522e27475 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -24,6 +24,9 @@ // Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry // VirtualFilesystem.filesystemsMu // EpollInstance.mu +// Inotify.mu +// Watches.mu +// Inotify.evMu // VirtualFilesystem.fsTypesMu // // Locking Dentry.mu in multiple Dentries requires holding @@ -82,6 +85,10 @@ type VirtualFilesystem struct { // mountpoints is analogous to Linux's mountpoint_hashtable. mountpoints map[*Dentry]map[*Mount]struct{} + // lastMountID is the last allocated mount ID. lastMountID is accessed + // using atomic memory operations. + lastMountID uint64 + // anonMount is a Mount, not included in mounts or mountpoints, // representing an anonFilesystem. anonMount is used to back // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry(). @@ -116,6 +123,9 @@ type VirtualFilesystem struct { // Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes. func (vfs *VirtualFilesystem) Init() error { + if vfs.mountpoints != nil { + panic("VFS already initialized") + } vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{}) vfs.devices = make(map[devTuple]*registeredDevice) vfs.anonBlockDevMinorNext = 1 @@ -401,7 +411,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential vfs.putResolvingPath(rp) if opts.FileExec { - if fd.Mount().flags.NoExec { + if fd.Mount().Flags.NoExec { fd.DecRef() return nil, syserror.EACCES } @@ -418,6 +428,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential } } + fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent) return fd, nil } if !rp.handleError(err) { |