summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorgVisor bot <gvisor-bot@google.com>2020-05-29 15:12:51 +0000
committergVisor bot <gvisor-bot@google.com>2020-05-29 15:12:51 +0000
commit2470312351ab76897d8e151f2008b6acacda706a (patch)
tree6457da4e3a19e2d76d2475183a383b06032ace29
parentd00bd5bdee15b028787faa81288ebc56ad0abf84 (diff)
parentfe464f44b7d3696bafd9a2faf3750e1dc4d56d80 (diff)
Merge release-20200522.0-27-gfe464f44 (automated)
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer.go12
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs.go12
-rw-r--r--pkg/sentry/fsimpl/tmpfs/directory.go1
-rw-r--r--pkg/sentry/fsimpl/tmpfs/filesystem.go40
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs.go60
-rw-r--r--pkg/sentry/kernel/fd_table.go8
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/inotify.go134
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/read_write.go36
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/vfs2.go8
-rw-r--r--pkg/sentry/vfs/anonfs.go12
-rw-r--r--pkg/sentry/vfs/dentry.go27
-rw-r--r--pkg/sentry/vfs/event_list.go193
-rw-r--r--pkg/sentry/vfs/inotify.go675
-rw-r--r--pkg/sentry/vfs/vfs.go1
-rw-r--r--pkg/sentry/vfs/vfs_state_autogen.go106
15 files changed, 1313 insertions, 12 deletions
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 131da332f..850482a19 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -1039,6 +1039,18 @@ func (d *dentry) decRefLocked() {
}
}
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) InotifyWithParent(events uint32, cookie uint32) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) Watches() *vfs.Watches {
+ return nil
+}
+
// checkCachingLocked should be called after d's reference count becomes 0 or it
// becomes disowned.
//
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index a83151ad3..682545994 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -225,6 +225,18 @@ func (d *Dentry) destroy() {
}
}
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *Dentry) InotifyWithParent(events uint32, cookie uint32) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *Dentry) Watches() *vfs.Watches {
+ return nil
+}
+
// InsertChild inserts child into the vfs dentry cache with the given name under
// this dentry. This does not update the directory inode, so calling this on
// it's own isn't sufficient to insert a child into a directory. InsertChild
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index f2399981b..8bc475f88 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -112,6 +112,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
dir.iterMu.Lock()
defer dir.iterMu.Unlock()
+ fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0)
fd.inode().touchAtime(fd.vfsfd.Mount())
if fd.off == 0 {
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 7c04570f1..b4159f904 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -177,6 +177,12 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa
if err := create(parentDir, name); err != nil {
return err
}
+
+ ev := linux.IN_CREATE
+ if dir {
+ ev |= linux.IN_ISDIR
+ }
+ parentDir.inode.watches.Notify(name, uint32(ev), 0)
parentDir.inode.touchCMtime()
return nil
}
@@ -241,6 +247,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return syserror.EMLINK
}
d.inode.incLinksLocked()
+ d.inode.watches.Notify("", linux.IN_ATTRIB, 0)
parentDir.insertChildLocked(fs.newDentry(d.inode), name)
return nil
})
@@ -354,6 +361,7 @@ afterTrailingSymlink:
if err != nil {
return nil, err
}
+ parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0)
parentDir.inode.touchCMtime()
return fd, nil
}
@@ -559,6 +567,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
newParentDir.inode.touchCMtime()
}
renamed.inode.touchCtime()
+
+ vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir())
return nil
}
@@ -603,6 +613,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
return err
}
parentDir.removeChildLocked(child)
+ parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0)
// Remove links for child, child/., and child/..
child.inode.decLinksLocked()
child.inode.decLinksLocked()
@@ -620,7 +631,14 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
if err != nil {
return err
}
- return d.inode.setStat(ctx, rp.Credentials(), &opts.Stat)
+ if err := d.inode.setStat(ctx, rp.Credentials(), &opts.Stat); err != nil {
+ return err
+ }
+
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ d.InotifyWithParent(ev, 0)
+ }
+ return nil
}
// StatAt implements vfs.FilesystemImpl.StatAt.
@@ -700,6 +718,12 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
return err
}
+
+ // Generate inotify events. Note that this must take place before the link
+ // count of the child is decremented, or else the watches may be dropped
+ // before these events are added.
+ vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name)
+
parentDir.removeChildLocked(child)
child.inode.decLinksLocked()
vfsObj.CommitDeleteDentry(&child.vfsd)
@@ -756,7 +780,12 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
if err != nil {
return err
}
- return d.inode.setxattr(rp.Credentials(), &opts)
+ if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil {
+ return err
+ }
+
+ d.InotifyWithParent(linux.IN_ATTRIB, 0)
+ return nil
}
// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
@@ -767,7 +796,12 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
if err != nil {
return err
}
- return d.inode.removexattr(rp.Credentials(), name)
+ if err := d.inode.removexattr(rp.Credentials(), name); err != nil {
+ return err
+ }
+
+ d.InotifyWithParent(linux.IN_ATTRIB, 0)
+ return nil
}
// PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index b739095b7..1d83b6840 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -201,6 +201,26 @@ func (d *dentry) DecRef() {
d.inode.decRef()
}
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(events uint32, cookie uint32) {
+ if d.inode.isDir() {
+ events |= linux.IN_ISDIR
+ }
+
+ // The ordering below is important, Linux always notifies the parent first.
+ if d.parent != nil {
+ // Note that d.parent or d.name may be stale if there is a concurrent
+ // rename operation. Inotify does not provide consistency guarantees.
+ d.parent.inode.watches.Notify(d.name, events, cookie)
+ }
+ d.inode.watches.Notify("", events, cookie)
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+ return &d.inode.watches
+}
+
// inode represents a filesystem object.
type inode struct {
// fs is the owning filesystem. fs is immutable.
@@ -236,6 +256,9 @@ type inode struct {
// Advisory file locks, which lock at the inode level.
locks lock.FileLocks
+ // Inotify watches for this inode.
+ watches vfs.Watches
+
impl interface{} // immutable
}
@@ -257,6 +280,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials,
i.ctime = now
i.mtime = now
// i.nlink initialized by caller
+ i.watches = vfs.Watches{}
i.impl = impl
}
@@ -307,6 +331,7 @@ func (i *inode) tryIncRef() bool {
func (i *inode) decRef() {
if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
+ i.watches.HandleDeletion()
if regFile, ok := i.impl.(*regularFile); ok {
// Release memory used by regFile to store data. Since regFile is
// no longer usable, we don't need to grab any locks or update any
@@ -628,8 +653,12 @@ func (fd *fileDescription) filesystem() *filesystem {
return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
}
+func (fd *fileDescription) dentry() *dentry {
+ return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
func (fd *fileDescription) inode() *inode {
- return fd.vfsfd.Dentry().Impl().(*dentry).inode
+ return fd.dentry().inode
}
// Stat implements vfs.FileDescriptionImpl.Stat.
@@ -642,7 +671,16 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
creds := auth.CredentialsFromContext(ctx)
- return fd.inode().setStat(ctx, creds, &opts.Stat)
+ d := fd.dentry()
+ if err := d.inode.setStat(ctx, creds, &opts.Stat); err != nil {
+ return err
+ }
+
+ // Generate inotify events.
+ if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+ d.InotifyWithParent(ev, 0)
+ }
+ return nil
}
// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
@@ -657,12 +695,26 @@ func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOption
// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
- return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts)
+ d := fd.dentry()
+ if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
+ return err
+ }
+
+ // Generate inotify events.
+ d.InotifyWithParent(linux.IN_ATTRIB, 0)
+ return nil
}
// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
- return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name)
+ d := fd.dentry()
+ if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil {
+ return err
+ }
+
+ // Generate inotify events.
+ d.InotifyWithParent(linux.IN_ATTRIB, 0)
+ return nil
}
// NewMemfd creates a new tmpfs regular file and file description that can back
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index ed40b5303..ef73e1169 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -152,7 +152,13 @@ func (f *FDTable) drop(file *fs.File) {
// dropVFS2 drops the table reference.
func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
// TODO(gvisor.dev/issue/1480): Release locks.
- // TODO(gvisor.dev/issue/1479): Send inotify events.
+
+ // Generate inotify events.
+ ev := uint32(linux.IN_CLOSE_NOWRITE)
+ if file.IsWritable() {
+ ev = linux.IN_CLOSE_WRITE
+ }
+ file.Dentry().InotifyWithParent(ev, 0)
// Drop the table reference.
file.DecRef()
diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go
new file mode 100644
index 000000000..7d50b6a16
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go
@@ -0,0 +1,134 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC
+
+// InotifyInit1 implements the inotify_init1() syscalls.
+func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := args[0].Int()
+ if flags&^allFlags != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags))
+ if err != nil {
+ return 0, nil, err
+ }
+ defer ino.DecRef()
+
+ fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{
+ CloseOnExec: flags&linux.IN_CLOEXEC != 0,
+ })
+
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
+
+// InotifyInit implements the inotify_init() syscalls.
+func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ args[0].Value = 0
+ return InotifyInit1(t, args)
+}
+
+// fdToInotify resolves an fd to an inotify object. If successful, the file will
+// have an extra ref and the caller is responsible for releasing the ref.
+func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, error) {
+ f := t.GetFileVFS2(fd)
+ if f == nil {
+ // Invalid fd.
+ return nil, nil, syserror.EBADF
+ }
+
+ ino, ok := f.Impl().(*vfs.Inotify)
+ if !ok {
+ // Not an inotify fd.
+ f.DecRef()
+ return nil, nil, syserror.EINVAL
+ }
+
+ return ino, f, nil
+}
+
+// InotifyAddWatch implements the inotify_add_watch() syscall.
+func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ addr := args[1].Pointer()
+ mask := args[2].Uint()
+
+ // "EINVAL: The given event mask contains no valid events."
+ // -- inotify_add_watch(2)
+ if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link."
+ // -- inotify(7)
+ follow := followFinalSymlink
+ if mask&linux.IN_DONT_FOLLOW == 0 {
+ follow = nofollowFinalSymlink
+ }
+
+ ino, f, err := fdToInotify(t, fd)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer f.DecRef()
+
+ path, err := copyInPath(t, addr)
+ if err != nil {
+ return 0, nil, err
+ }
+ if mask&linux.IN_ONLYDIR != 0 {
+ path.Dir = true
+ }
+ tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, follow)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer tpop.Release()
+ d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{})
+ if err != nil {
+ return 0, nil, err
+ }
+ defer d.DecRef()
+
+ fd = ino.AddWatch(d.Dentry(), mask)
+ return uintptr(fd), nil, err
+}
+
+// InotifyRmWatch implements the inotify_rm_watch() syscall.
+func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ fd := args[0].Int()
+ wd := args[1].Int()
+
+ ino, f, err := fdToInotify(t, fd)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer f.DecRef()
+ return 0, nil, ino.RmWatch(wd)
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go
index 3a7ef24f5..92b5631a3 100644
--- a/pkg/sentry/syscalls/linux/vfs2/read_write.go
+++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go
@@ -93,11 +93,17 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
n, err := file.Read(t, dst, opts)
if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0)
+ }
return n, err
}
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0)
+ }
return n, err
}
@@ -128,6 +134,9 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt
}
file.EventUnregister(&w)
+ if total > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0)
+ }
return total, err
}
@@ -248,11 +257,17 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
n, err := file.PRead(t, dst, offset, opts)
if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0)
+ }
return n, err
}
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0)
+ }
return n, err
}
@@ -283,6 +298,9 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of
}
file.EventUnregister(&w)
+ if total > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0)
+ }
return total, err
}
@@ -345,11 +363,17 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
n, err := file.Write(t, src, opts)
if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0)
+ }
return n, err
}
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0)
+ }
return n, err
}
@@ -380,6 +404,9 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op
}
file.EventUnregister(&w)
+ if total > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0)
+ }
return total, err
}
@@ -500,11 +527,17 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
n, err := file.PWrite(t, src, offset, opts)
if err != syserror.ErrWouldBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0)
+ }
return n, err
}
allowBlock, deadline, hasDeadline := blockPolicy(t, file)
if !allowBlock {
+ if n > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0)
+ }
return n, err
}
@@ -535,6 +568,9 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o
}
file.EventUnregister(&w)
+ if total > 0 {
+ file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0)
+ }
return total, err
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
index 083fdcf82..ef8358b8a 100644
--- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -116,9 +116,9 @@ func Override() {
s.Table[232] = syscalls.Supported("epoll_wait", EpollWait)
s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl)
s.Table[235] = syscalls.Supported("utimes", Utimes)
- delete(s.Table, 253) // inotify_init
- delete(s.Table, 254) // inotify_add_watch
- delete(s.Table, 255) // inotify_rm_watch
+ s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil)
+ s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil)
+ s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil)
s.Table[257] = syscalls.Supported("openat", Openat)
s.Table[258] = syscalls.Supported("mkdirat", Mkdirat)
s.Table[259] = syscalls.Supported("mknodat", Mknodat)
@@ -151,7 +151,7 @@ func Override() {
s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
s.Table[292] = syscalls.Supported("dup3", Dup3)
s.Table[293] = syscalls.Supported("pipe2", Pipe2)
- delete(s.Table, 294) // inotify_init1
+ s.Table[294] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil)
s.Table[295] = syscalls.Supported("preadv", Preadv)
s.Table[296] = syscalls.Supported("pwritev", Pwritev)
s.Table[299] = syscalls.Supported("recvmmsg", RecvMMsg)
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index caf770fd5..55a3d54cc 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -297,3 +297,15 @@ func (d *anonDentry) TryIncRef() bool {
func (d *anonDentry) DecRef() {
// no-op
}
+
+// InotifyWithParent implements DentryImpl.InotifyWithParent.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32) {}
+
+// Watches implements DentryImpl.Watches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *anonDentry) Watches() *Watches {
+ return nil
+}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 8624dbd5d..d61b9e09b 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -103,6 +103,22 @@ type DentryImpl interface {
// DecRef decrements the Dentry's reference count.
DecRef()
+
+ // InotifyWithParent notifies all watches on the targets represented by this
+ // dentry and its parent. The parent's watches are notified first, followed
+ // by this dentry's.
+ //
+ // InotifyWithParent automatically adds the IN_ISDIR flag for dentries
+ // representing directories.
+ //
+ // Note that the events may not actually propagate up to the user, depending
+ // on the event masks.
+ InotifyWithParent(events uint32, cookie uint32)
+
+ // Watches returns the set of inotify watches for the file corresponding to
+ // the Dentry. Dentries that are hard links to the same underlying file
+ // share the same watches.
+ Watches() *Watches
}
// IncRef increments d's reference count.
@@ -133,6 +149,17 @@ func (d *Dentry) isMounted() bool {
return atomic.LoadUint32(&d.mounts) != 0
}
+// InotifyWithParent notifies all watches on the inodes for this dentry and
+// its parent of events.
+func (d *Dentry) InotifyWithParent(events uint32, cookie uint32) {
+ d.impl.InotifyWithParent(events, cookie)
+}
+
+// Watches returns the set of inotify watches associated with d.
+func (d *Dentry) Watches() *Watches {
+ return d.impl.Watches()
+}
+
// The following functions are exported so that filesystem implementations can
// use them. The vfs package, and users of VFS, should not call these
// functions.
diff --git a/pkg/sentry/vfs/event_list.go b/pkg/sentry/vfs/event_list.go
new file mode 100644
index 000000000..2ae5b6aba
--- /dev/null
+++ b/pkg/sentry/vfs/event_list.go
@@ -0,0 +1,193 @@
+package vfs
+
+// ElementMapper provides an identity mapping by default.
+//
+// This can be replaced to provide a struct that maps elements to linker
+// objects, if they are not the same. An ElementMapper is not typically
+// required if: Linker is left as is, Element is left as is, or Linker and
+// Element are the same type.
+type eventElementMapper struct{}
+
+// linkerFor maps an Element to a Linker.
+//
+// This default implementation should be inlined.
+//
+//go:nosplit
+func (eventElementMapper) linkerFor(elem *Event) *Event { return elem }
+
+// List is an intrusive list. Entries can be added to or removed from the list
+// in O(1) time and with no additional memory allocations.
+//
+// The zero value for List is an empty list ready to use.
+//
+// To iterate over a list (where l is a List):
+// for e := l.Front(); e != nil; e = e.Next() {
+// // do something with e.
+// }
+//
+// +stateify savable
+type eventList struct {
+ head *Event
+ tail *Event
+}
+
+// Reset resets list l to the empty state.
+func (l *eventList) Reset() {
+ l.head = nil
+ l.tail = nil
+}
+
+// Empty returns true iff the list is empty.
+func (l *eventList) Empty() bool {
+ return l.head == nil
+}
+
+// Front returns the first element of list l or nil.
+func (l *eventList) Front() *Event {
+ return l.head
+}
+
+// Back returns the last element of list l or nil.
+func (l *eventList) Back() *Event {
+ return l.tail
+}
+
+// Len returns the number of elements in the list.
+//
+// NOTE: This is an O(n) operation.
+func (l *eventList) Len() (count int) {
+ for e := l.Front(); e != nil; e = e.Next() {
+ count++
+ }
+ return count
+}
+
+// PushFront inserts the element e at the front of list l.
+func (l *eventList) PushFront(e *Event) {
+ linker := eventElementMapper{}.linkerFor(e)
+ linker.SetNext(l.head)
+ linker.SetPrev(nil)
+ if l.head != nil {
+ eventElementMapper{}.linkerFor(l.head).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+
+ l.head = e
+}
+
+// PushBack inserts the element e at the back of list l.
+func (l *eventList) PushBack(e *Event) {
+ linker := eventElementMapper{}.linkerFor(e)
+ linker.SetNext(nil)
+ linker.SetPrev(l.tail)
+ if l.tail != nil {
+ eventElementMapper{}.linkerFor(l.tail).SetNext(e)
+ } else {
+ l.head = e
+ }
+
+ l.tail = e
+}
+
+// PushBackList inserts list m at the end of list l, emptying m.
+func (l *eventList) PushBackList(m *eventList) {
+ if l.head == nil {
+ l.head = m.head
+ l.tail = m.tail
+ } else if m.head != nil {
+ eventElementMapper{}.linkerFor(l.tail).SetNext(m.head)
+ eventElementMapper{}.linkerFor(m.head).SetPrev(l.tail)
+
+ l.tail = m.tail
+ }
+ m.head = nil
+ m.tail = nil
+}
+
+// InsertAfter inserts e after b.
+func (l *eventList) InsertAfter(b, e *Event) {
+ bLinker := eventElementMapper{}.linkerFor(b)
+ eLinker := eventElementMapper{}.linkerFor(e)
+
+ a := bLinker.Next()
+
+ eLinker.SetNext(a)
+ eLinker.SetPrev(b)
+ bLinker.SetNext(e)
+
+ if a != nil {
+ eventElementMapper{}.linkerFor(a).SetPrev(e)
+ } else {
+ l.tail = e
+ }
+}
+
+// InsertBefore inserts e before a.
+func (l *eventList) InsertBefore(a, e *Event) {
+ aLinker := eventElementMapper{}.linkerFor(a)
+ eLinker := eventElementMapper{}.linkerFor(e)
+
+ b := aLinker.Prev()
+ eLinker.SetNext(a)
+ eLinker.SetPrev(b)
+ aLinker.SetPrev(e)
+
+ if b != nil {
+ eventElementMapper{}.linkerFor(b).SetNext(e)
+ } else {
+ l.head = e
+ }
+}
+
+// Remove removes e from l.
+func (l *eventList) Remove(e *Event) {
+ linker := eventElementMapper{}.linkerFor(e)
+ prev := linker.Prev()
+ next := linker.Next()
+
+ if prev != nil {
+ eventElementMapper{}.linkerFor(prev).SetNext(next)
+ } else {
+ l.head = next
+ }
+
+ if next != nil {
+ eventElementMapper{}.linkerFor(next).SetPrev(prev)
+ } else {
+ l.tail = prev
+ }
+
+ linker.SetNext(nil)
+ linker.SetPrev(nil)
+}
+
+// Entry is a default implementation of Linker. Users can add anonymous fields
+// of this type to their structs to make them automatically implement the
+// methods needed by List.
+//
+// +stateify savable
+type eventEntry struct {
+ next *Event
+ prev *Event
+}
+
+// Next returns the entry that follows e in the list.
+func (e *eventEntry) Next() *Event {
+ return e.next
+}
+
+// Prev returns the entry that precedes e in the list.
+func (e *eventEntry) Prev() *Event {
+ return e.prev
+}
+
+// SetNext assigns 'entry' as the entry that follows e in the list.
+func (e *eventEntry) SetNext(elem *Event) {
+ e.next = elem
+}
+
+// SetPrev assigns 'entry' as the entry that precedes e in the list.
+func (e *eventEntry) SetPrev(elem *Event) {
+ e.prev = elem
+}
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
new file mode 100644
index 000000000..1d28ccb46
--- /dev/null
+++ b/pkg/sentry/vfs/inotify.go
@@ -0,0 +1,675 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "bytes"
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/uniqueid"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// inotifyEventBaseSize is the base size of linux's struct inotify_event. This
+// must be a power 2 for rounding below.
+const inotifyEventBaseSize = 16
+
+// Inotify represents an inotify instance created by inotify_init(2) or
+// inotify_init1(2). Inotify implements FileDescriptionImpl.
+//
+// Lock ordering:
+// Inotify.mu -> Watches.mu -> Inotify.evMu
+//
+// +stateify savable
+type Inotify struct {
+ vfsfd FileDescription
+ FileDescriptionDefaultImpl
+ DentryMetadataFileDescriptionImpl
+
+ // Unique identifier for this inotify instance. We don't just reuse the
+ // inotify fd because fds can be duped. These should not be exposed to the
+ // user, since we may aggressively reuse an id on S/R.
+ id uint64
+
+ // queue is used to notify interested parties when the inotify instance
+ // becomes readable or writable.
+ queue waiter.Queue `state:"nosave"`
+
+ // evMu *only* protects the events list. We need a separate lock while
+ // queuing events: using mu may violate lock ordering, since at that point
+ // the calling goroutine may already hold Watches.mu.
+ evMu sync.Mutex `state:"nosave"`
+
+ // A list of pending events for this inotify instance. Protected by evMu.
+ events eventList
+
+ // A scratch buffer, used to serialize inotify events. Allocate this
+ // ahead of time for the sake of performance. Protected by evMu.
+ scratch []byte
+
+ // mu protects the fields below.
+ mu sync.Mutex `state:"nosave"`
+
+ // nextWatchMinusOne is used to allocate watch descriptors on this Inotify
+ // instance. Note that Linux starts numbering watch descriptors from 1.
+ nextWatchMinusOne int32
+
+ // Map from watch descriptors to watch objects.
+ watches map[int32]*Watch
+}
+
+var _ FileDescriptionImpl = (*Inotify)(nil)
+
+// NewInotifyFD constructs a new Inotify instance.
+func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) {
+ // O_CLOEXEC affects file descriptors, so it must be handled outside of vfs.
+ flags &^= linux.O_CLOEXEC
+ if flags&^linux.O_NONBLOCK != 0 {
+ return nil, syserror.EINVAL
+ }
+
+ id := uniqueid.GlobalFromContext(ctx)
+ vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id))
+ defer vd.DecRef()
+ fd := &Inotify{
+ id: id,
+ scratch: make([]byte, inotifyEventBaseSize),
+ watches: make(map[int32]*Watch),
+ }
+ if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+ UseDentryMetadata: true,
+ DenyPRead: true,
+ DenyPWrite: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
+
+// Release implements FileDescriptionImpl.Release. Release removes all
+// watches and frees all resources for an inotify instance.
+func (i *Inotify) Release() {
+ // We need to hold i.mu to avoid a race with concurrent calls to
+ // Inotify.handleDeletion from Watches. There's no risk of Watches
+ // accessing this Inotify after the destructor ends, because we remove all
+ // references to it below.
+ i.mu.Lock()
+ defer i.mu.Unlock()
+ for _, w := range i.watches {
+ // Remove references to the watch from the watches set on the target. We
+ // don't need to worry about the references from i.watches, since this
+ // file description is about to be destroyed.
+ w.set.Remove(i.id)
+ }
+}
+
+// EventRegister implements waiter.Waitable.
+func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ i.queue.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.
+func (i *Inotify) EventUnregister(e *waiter.Entry) {
+ i.queue.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+//
+// Readiness indicates whether there are pending events for an inotify instance.
+func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask {
+ ready := waiter.EventMask(0)
+
+ i.evMu.Lock()
+ defer i.evMu.Unlock()
+
+ if !i.events.Empty() {
+ ready |= waiter.EventIn
+ }
+
+ return mask & ready
+}
+
+// PRead implements FileDescriptionImpl.
+func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+ return 0, syserror.ESPIPE
+}
+
+// PWrite implements FileDescriptionImpl.
+func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ return 0, syserror.ESPIPE
+}
+
+// Write implements FileDescriptionImpl.Write.
+func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+ if dst.NumBytes() < inotifyEventBaseSize {
+ return 0, syserror.EINVAL
+ }
+
+ i.evMu.Lock()
+ defer i.evMu.Unlock()
+
+ if i.events.Empty() {
+ // Nothing to read yet, tell caller to block.
+ return 0, syserror.ErrWouldBlock
+ }
+
+ var writeLen int64
+ for it := i.events.Front(); it != nil; {
+ // Advance `it` before the element is removed from the list, or else
+ // it.Next() will always be nil.
+ event := it
+ it = it.Next()
+
+ // Does the buffer have enough remaining space to hold the event we're
+ // about to write out?
+ if dst.NumBytes() < int64(event.sizeOf()) {
+ if writeLen > 0 {
+ // Buffer wasn't big enough for all pending events, but we did
+ // write some events out.
+ return writeLen, nil
+ }
+ return 0, syserror.EINVAL
+ }
+
+ // Linux always dequeues an available event as long as there's enough
+ // buffer space to copy it out, even if the copy below fails. Emulate
+ // this behaviour.
+ i.events.Remove(event)
+
+ // Buffer has enough space, copy event to the read buffer.
+ n, err := event.CopyTo(ctx, i.scratch, dst)
+ if err != nil {
+ return 0, err
+ }
+
+ writeLen += n
+ dst = dst.DropFirst64(n)
+ }
+ return writeLen, nil
+}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ switch args[1].Int() {
+ case linux.FIONREAD:
+ i.evMu.Lock()
+ defer i.evMu.Unlock()
+ var n uint32
+ for e := i.events.Front(); e != nil; e = e.Next() {
+ n += uint32(e.sizeOf())
+ }
+ var buf [4]byte
+ usermem.ByteOrder.PutUint32(buf[:], n)
+ _, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{})
+ return 0, err
+
+ default:
+ return 0, syserror.ENOTTY
+ }
+}
+
+func (i *Inotify) queueEvent(ev *Event) {
+ i.evMu.Lock()
+
+ // Check if we should coalesce the event we're about to queue with the last
+ // one currently in the queue. Events are coalesced if they are identical.
+ if last := i.events.Back(); last != nil {
+ if ev.equals(last) {
+ // "Coalesce" the two events by simply not queuing the new one. We
+ // don't need to raise a waiter.EventIn notification because no new
+ // data is available for reading.
+ i.evMu.Unlock()
+ return
+ }
+ }
+
+ i.events.PushBack(ev)
+
+ // Release mutex before notifying waiters because we don't control what they
+ // can do.
+ i.evMu.Unlock()
+
+ i.queue.Notify(waiter.EventIn)
+}
+
+// newWatchLocked creates and adds a new watch to target.
+//
+// Precondition: i.mu must be locked.
+func (i *Inotify) newWatchLocked(target *Dentry, mask uint32) *Watch {
+ targetWatches := target.Watches()
+ w := &Watch{
+ owner: i,
+ wd: i.nextWatchIDLocked(),
+ set: targetWatches,
+ mask: mask,
+ }
+
+ // Hold the watch in this inotify instance as well as the watch set on the
+ // target.
+ i.watches[w.wd] = w
+ targetWatches.Add(w)
+ return w
+}
+
+// newWatchIDLocked allocates and returns a new watch descriptor.
+//
+// Precondition: i.mu must be locked.
+func (i *Inotify) nextWatchIDLocked() int32 {
+ i.nextWatchMinusOne++
+ return i.nextWatchMinusOne
+}
+
+// handleDeletion handles the deletion of the target of watch w. It removes w
+// from i.watches and a watch removal event is generated.
+func (i *Inotify) handleDeletion(w *Watch) {
+ i.mu.Lock()
+ _, found := i.watches[w.wd]
+ delete(i.watches, w.wd)
+ i.mu.Unlock()
+
+ if found {
+ i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0))
+ }
+}
+
+// AddWatch constructs a new inotify watch and adds it to the target. It
+// returns the watch descriptor returned by inotify_add_watch(2).
+func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 {
+ // Note: Locking this inotify instance protects the result returned by
+ // Lookup() below. With the lock held, we know for sure the lookup result
+ // won't become stale because it's impossible for *this* instance to
+ // add/remove watches on target.
+ i.mu.Lock()
+ defer i.mu.Unlock()
+
+ // Does the target already have a watch from this inotify instance?
+ if existing := target.Watches().Lookup(i.id); existing != nil {
+ newmask := mask
+ if mask&linux.IN_MASK_ADD != 0 {
+ // "Add (OR) events to watch mask for this pathname if it already
+ // exists (instead of replacing mask)." -- inotify(7)
+ newmask |= atomic.LoadUint32(&existing.mask)
+ }
+ atomic.StoreUint32(&existing.mask, newmask)
+ return existing.wd
+ }
+
+ // No existing watch, create a new watch.
+ w := i.newWatchLocked(target, mask)
+ return w.wd
+}
+
+// RmWatch looks up an inotify watch for the given 'wd' and configures the
+// target to stop sending events to this inotify instance.
+func (i *Inotify) RmWatch(wd int32) error {
+ i.mu.Lock()
+
+ // Find the watch we were asked to removed.
+ w, ok := i.watches[wd]
+ if !ok {
+ i.mu.Unlock()
+ return syserror.EINVAL
+ }
+
+ // Remove the watch from this instance.
+ delete(i.watches, wd)
+
+ // Remove the watch from the watch target.
+ w.set.Remove(w.OwnerID())
+ i.mu.Unlock()
+
+ // Generate the event for the removal.
+ i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0))
+
+ return nil
+}
+
+// Watches is the collection of all inotify watches on a single file.
+//
+// +stateify savable
+type Watches struct {
+ // mu protects the fields below.
+ mu sync.RWMutex `state:"nosave"`
+
+ // ws is the map of active watches in this collection, keyed by the inotify
+ // instance id of the owner.
+ ws map[uint64]*Watch
+}
+
+// Lookup returns the watch owned by an inotify instance with the given id.
+// Returns nil if no such watch exists.
+//
+// Precondition: the inotify instance with the given id must be locked to
+// prevent the returned watch from being concurrently modified or replaced in
+// Inotify.watches.
+func (w *Watches) Lookup(id uint64) *Watch {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+ return w.ws[id]
+}
+
+// Add adds watch into this set of watches.
+//
+// Precondition: the inotify instance with the given id must be locked.
+func (w *Watches) Add(watch *Watch) {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+
+ owner := watch.OwnerID()
+ // Sanity check, we should never have two watches for one owner on the
+ // same target.
+ if _, exists := w.ws[owner]; exists {
+ panic(fmt.Sprintf("Watch collision with ID %+v", owner))
+ }
+ if w.ws == nil {
+ w.ws = make(map[uint64]*Watch)
+ }
+ w.ws[owner] = watch
+}
+
+// Remove removes a watch with the given id from this set of watches and
+// releases it. The caller is responsible for generating any watch removal
+// event, as appropriate. The provided id must match an existing watch in this
+// collection.
+//
+// Precondition: the inotify instance with the given id must be locked.
+func (w *Watches) Remove(id uint64) {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+
+ if w.ws == nil {
+ // This watch set is being destroyed. The thread executing the
+ // destructor is already in the process of deleting all our watches. We
+ // got here with no references on the target because we raced with the
+ // destructor notifying all the watch owners of destruction. See the
+ // comment in Watches.HandleDeletion for why this race exists.
+ return
+ }
+
+ if _, ok := w.ws[id]; !ok {
+ // While there's technically no problem with silently ignoring a missing
+ // watch, this is almost certainly a bug.
+ panic(fmt.Sprintf("Attempt to remove a watch, but no watch found with provided id %+v.", id))
+ }
+ delete(w.ws, id)
+}
+
+// Notify queues a new event with all watches in this set.
+func (w *Watches) Notify(name string, events, cookie uint32) {
+ // N.B. We don't defer the unlocks because Notify is in the hot path of
+ // all IO operations, and the defer costs too much for small IO
+ // operations.
+ w.mu.RLock()
+ for _, watch := range w.ws {
+ // TODO(gvisor.dev/issue/1479): Skip for IN_EXCL_UNLINK cases.
+ watch.Notify(name, events, cookie)
+ }
+ w.mu.RUnlock()
+}
+
+// HandleDeletion is called when the watch target is destroyed to emit
+// the appropriate events.
+func (w *Watches) HandleDeletion() {
+ w.Notify("", linux.IN_DELETE_SELF, 0)
+
+ // TODO(gvisor.dev/issue/1479): This doesn't work because maps are not copied
+ // by value. Ideally, we wouldn't have this circular locking so we can just
+ // notify of IN_DELETE_SELF in the same loop below.
+ //
+ // We can't hold w.mu while calling watch.handleDeletion to preserve lock
+ // ordering w.r.t to the owner inotify instances. Instead, atomically move
+ // the watches map into a local variable so we can iterate over it safely.
+ //
+ // Because of this however, it is possible for the watches' owners to reach
+ // this inode while the inode has no refs. This is still safe because the
+ // owners can only reach the inode until this function finishes calling
+ // watch.handleDeletion below and the inode is guaranteed to exist in the
+ // meantime. But we still have to be very careful not to rely on inode state
+ // that may have been already destroyed.
+ var ws map[uint64]*Watch
+ w.mu.Lock()
+ ws = w.ws
+ w.ws = nil
+ w.mu.Unlock()
+
+ for _, watch := range ws {
+ // TODO(gvisor.dev/issue/1479): consider refactoring this.
+ watch.handleDeletion()
+ }
+}
+
+// Watch represent a particular inotify watch created by inotify_add_watch.
+//
+// +stateify savable
+type Watch struct {
+ // Inotify instance which owns this watch.
+ owner *Inotify
+
+ // Descriptor for this watch. This is unique across an inotify instance.
+ wd int32
+
+ // set is the watch set containing this watch. It belongs to the target file
+ // of this watch.
+ set *Watches
+
+ // Events being monitored via this watch. Must be accessed with atomic
+ // memory operations.
+ mask uint32
+}
+
+// OwnerID returns the id of the inotify instance that owns this watch.
+func (w *Watch) OwnerID() uint64 {
+ return w.owner.id
+}
+
+// ExcludeUnlinkedChildren indicates whether the watched object should continue
+// to be notified of events of its children after they have been unlinked, e.g.
+// for an open file descriptor.
+//
+// TODO(gvisor.dev/issue/1479): Implement IN_EXCL_UNLINK.
+// We can do this by keeping track of the set of unlinked children in Watches
+// to skip notification.
+func (w *Watch) ExcludeUnlinkedChildren() bool {
+ return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0
+}
+
+// Notify queues a new event on this watch.
+func (w *Watch) Notify(name string, events uint32, cookie uint32) {
+ mask := atomic.LoadUint32(&w.mask)
+ if mask&events == 0 {
+ // We weren't watching for this event.
+ return
+ }
+
+ // Event mask should include bits matched from the watch plus all control
+ // event bits.
+ unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS
+ effectiveMask := unmaskableBits | mask
+ matchedEvents := effectiveMask & events
+ w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie))
+}
+
+// handleDeletion handles the deletion of w's target.
+func (w *Watch) handleDeletion() {
+ w.owner.handleDeletion(w)
+}
+
+// Event represents a struct inotify_event from linux.
+//
+// +stateify savable
+type Event struct {
+ eventEntry
+
+ wd int32
+ mask uint32
+ cookie uint32
+
+ // len is computed based on the name field is set automatically by
+ // Event.setName. It should be 0 when no name is set; otherwise it is the
+ // length of the name slice.
+ len uint32
+
+ // The name field has special padding requirements and should only be set by
+ // calling Event.setName.
+ name []byte
+}
+
+func newEvent(wd int32, name string, events, cookie uint32) *Event {
+ e := &Event{
+ wd: wd,
+ mask: events,
+ cookie: cookie,
+ }
+ if name != "" {
+ e.setName(name)
+ }
+ return e
+}
+
+// paddedBytes converts a go string to a null-terminated c-string, padded with
+// null bytes to a total size of 'l'. 'l' must be large enough for all the bytes
+// in the 's' plus at least one null byte.
+func paddedBytes(s string, l uint32) []byte {
+ if l < uint32(len(s)+1) {
+ panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!")
+ }
+ b := make([]byte, l)
+ copy(b, s)
+
+ // b was zero-value initialized during make(), so the rest of the slice is
+ // already filled with null bytes.
+
+ return b
+}
+
+// setName sets the optional name for this event.
+func (e *Event) setName(name string) {
+ // We need to pad the name such that the entire event length ends up a
+ // multiple of inotifyEventBaseSize.
+ unpaddedLen := len(name) + 1
+ // Round up to nearest multiple of inotifyEventBaseSize.
+ e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1))
+ // Make sure we haven't overflowed and wrapped around when rounding.
+ if unpaddedLen > int(e.len) {
+ panic("Overflow when rounding inotify event size, the 'name' field was too big.")
+ }
+ e.name = paddedBytes(name, e.len)
+}
+
+func (e *Event) sizeOf() int {
+ s := inotifyEventBaseSize + int(e.len)
+ if s < inotifyEventBaseSize {
+ panic("overflow")
+ }
+ return s
+}
+
+// CopyTo serializes this event to dst. buf is used as a scratch buffer to
+// construct the output. We use a buffer allocated ahead of time for
+// performance. buf must be at least inotifyEventBaseSize bytes.
+func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) {
+ usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd))
+ usermem.ByteOrder.PutUint32(buf[4:], e.mask)
+ usermem.ByteOrder.PutUint32(buf[8:], e.cookie)
+ usermem.ByteOrder.PutUint32(buf[12:], e.len)
+
+ writeLen := 0
+
+ n, err := dst.CopyOut(ctx, buf)
+ if err != nil {
+ return 0, err
+ }
+ writeLen += n
+ dst = dst.DropFirst(n)
+
+ if e.len > 0 {
+ n, err = dst.CopyOut(ctx, e.name)
+ if err != nil {
+ return 0, err
+ }
+ writeLen += n
+ }
+
+ // Santiy check.
+ if writeLen != e.sizeOf() {
+ panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen))
+ }
+
+ return int64(writeLen), nil
+}
+
+func (e *Event) equals(other *Event) bool {
+ return e.wd == other.wd &&
+ e.mask == other.mask &&
+ e.cookie == other.cookie &&
+ e.len == other.len &&
+ bytes.Equal(e.name, other.name)
+}
+
+// InotifyEventFromStatMask generates the appropriate events for an operation
+// that set the stats specified in mask.
+func InotifyEventFromStatMask(mask uint32) uint32 {
+ var ev uint32
+ if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 {
+ ev |= linux.IN_ATTRIB
+ }
+ if mask&linux.STATX_SIZE != 0 {
+ ev |= linux.IN_MODIFY
+ }
+
+ if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) {
+ // Both times indicates a utime(s) call.
+ ev |= linux.IN_ATTRIB
+ } else if mask&linux.STATX_ATIME != 0 {
+ ev |= linux.IN_ACCESS
+ } else if mask&linux.STATX_MTIME != 0 {
+ mask |= linux.IN_MODIFY
+ }
+ return ev
+}
+
+// InotifyRemoveChild sends the appriopriate notifications to the watch sets of
+// the child being removed and its parent.
+func InotifyRemoveChild(self, parent *Watches, name string) {
+ self.Notify("", linux.IN_ATTRIB, 0)
+ parent.Notify(name, linux.IN_DELETE, 0)
+ // TODO(gvisor.dev/issue/1479): implement IN_EXCL_UNLINK.
+}
+
+// InotifyRename sends the appriopriate notifications to the watch sets of the
+// file being renamed and its old/new parents.
+func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) {
+ var dirEv uint32
+ if isDir {
+ dirEv = linux.IN_ISDIR
+ }
+ cookie := uniqueid.InotifyCookie(ctx)
+ oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie)
+ newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie)
+ // Somewhat surprisingly, self move events do not have a cookie.
+ renamed.Notify("", linux.IN_MOVE_SELF, 0)
+}
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 6d2ba53ea..be6f21dba 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -422,6 +422,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
}
}
+ fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0)
return fd, nil
}
if !rp.handleError(err) {
diff --git a/pkg/sentry/vfs/vfs_state_autogen.go b/pkg/sentry/vfs/vfs_state_autogen.go
index b21238d3f..c39fead64 100644
--- a/pkg/sentry/vfs/vfs_state_autogen.go
+++ b/pkg/sentry/vfs/vfs_state_autogen.go
@@ -71,6 +71,32 @@ func (x *epollInterestEntry) load(m state.Map) {
m.Load("prev", &x.prev)
}
+func (x *eventList) beforeSave() {}
+func (x *eventList) save(m state.Map) {
+ x.beforeSave()
+ m.Save("head", &x.head)
+ m.Save("tail", &x.tail)
+}
+
+func (x *eventList) afterLoad() {}
+func (x *eventList) load(m state.Map) {
+ m.Load("head", &x.head)
+ m.Load("tail", &x.tail)
+}
+
+func (x *eventEntry) beforeSave() {}
+func (x *eventEntry) save(m state.Map) {
+ x.beforeSave()
+ m.Save("next", &x.next)
+ m.Save("prev", &x.prev)
+}
+
+func (x *eventEntry) afterLoad() {}
+func (x *eventEntry) load(m state.Map) {
+ m.Load("next", &x.next)
+ m.Load("prev", &x.prev)
+}
+
func (x *Filesystem) beforeSave() {}
func (x *Filesystem) save(m state.Map) {
x.beforeSave()
@@ -101,6 +127,80 @@ func (x *registeredFilesystemType) load(m state.Map) {
m.Load("opts", &x.opts)
}
+func (x *Inotify) beforeSave() {}
+func (x *Inotify) save(m state.Map) {
+ x.beforeSave()
+ m.Save("vfsfd", &x.vfsfd)
+ m.Save("FileDescriptionDefaultImpl", &x.FileDescriptionDefaultImpl)
+ m.Save("DentryMetadataFileDescriptionImpl", &x.DentryMetadataFileDescriptionImpl)
+ m.Save("id", &x.id)
+ m.Save("events", &x.events)
+ m.Save("scratch", &x.scratch)
+ m.Save("nextWatchMinusOne", &x.nextWatchMinusOne)
+ m.Save("watches", &x.watches)
+}
+
+func (x *Inotify) afterLoad() {}
+func (x *Inotify) load(m state.Map) {
+ m.Load("vfsfd", &x.vfsfd)
+ m.Load("FileDescriptionDefaultImpl", &x.FileDescriptionDefaultImpl)
+ m.Load("DentryMetadataFileDescriptionImpl", &x.DentryMetadataFileDescriptionImpl)
+ m.Load("id", &x.id)
+ m.Load("events", &x.events)
+ m.Load("scratch", &x.scratch)
+ m.Load("nextWatchMinusOne", &x.nextWatchMinusOne)
+ m.Load("watches", &x.watches)
+}
+
+func (x *Watches) beforeSave() {}
+func (x *Watches) save(m state.Map) {
+ x.beforeSave()
+ m.Save("ws", &x.ws)
+}
+
+func (x *Watches) afterLoad() {}
+func (x *Watches) load(m state.Map) {
+ m.Load("ws", &x.ws)
+}
+
+func (x *Watch) beforeSave() {}
+func (x *Watch) save(m state.Map) {
+ x.beforeSave()
+ m.Save("owner", &x.owner)
+ m.Save("wd", &x.wd)
+ m.Save("set", &x.set)
+ m.Save("mask", &x.mask)
+}
+
+func (x *Watch) afterLoad() {}
+func (x *Watch) load(m state.Map) {
+ m.Load("owner", &x.owner)
+ m.Load("wd", &x.wd)
+ m.Load("set", &x.set)
+ m.Load("mask", &x.mask)
+}
+
+func (x *Event) beforeSave() {}
+func (x *Event) save(m state.Map) {
+ x.beforeSave()
+ m.Save("eventEntry", &x.eventEntry)
+ m.Save("wd", &x.wd)
+ m.Save("mask", &x.mask)
+ m.Save("cookie", &x.cookie)
+ m.Save("len", &x.len)
+ m.Save("name", &x.name)
+}
+
+func (x *Event) afterLoad() {}
+func (x *Event) load(m state.Map) {
+ m.Load("eventEntry", &x.eventEntry)
+ m.Load("wd", &x.wd)
+ m.Load("mask", &x.mask)
+ m.Load("cookie", &x.cookie)
+ m.Load("len", &x.len)
+ m.Load("name", &x.name)
+}
+
func (x *Mount) beforeSave() {}
func (x *Mount) save(m state.Map) {
x.beforeSave()
@@ -193,8 +293,14 @@ func init() {
state.Register("pkg/sentry/vfs.RegisterDeviceOptions", (*RegisterDeviceOptions)(nil), state.Fns{Save: (*RegisterDeviceOptions).save, Load: (*RegisterDeviceOptions).load})
state.Register("pkg/sentry/vfs.epollInterestList", (*epollInterestList)(nil), state.Fns{Save: (*epollInterestList).save, Load: (*epollInterestList).load})
state.Register("pkg/sentry/vfs.epollInterestEntry", (*epollInterestEntry)(nil), state.Fns{Save: (*epollInterestEntry).save, Load: (*epollInterestEntry).load})
+ state.Register("pkg/sentry/vfs.eventList", (*eventList)(nil), state.Fns{Save: (*eventList).save, Load: (*eventList).load})
+ state.Register("pkg/sentry/vfs.eventEntry", (*eventEntry)(nil), state.Fns{Save: (*eventEntry).save, Load: (*eventEntry).load})
state.Register("pkg/sentry/vfs.Filesystem", (*Filesystem)(nil), state.Fns{Save: (*Filesystem).save, Load: (*Filesystem).load})
state.Register("pkg/sentry/vfs.registeredFilesystemType", (*registeredFilesystemType)(nil), state.Fns{Save: (*registeredFilesystemType).save, Load: (*registeredFilesystemType).load})
+ state.Register("pkg/sentry/vfs.Inotify", (*Inotify)(nil), state.Fns{Save: (*Inotify).save, Load: (*Inotify).load})
+ state.Register("pkg/sentry/vfs.Watches", (*Watches)(nil), state.Fns{Save: (*Watches).save, Load: (*Watches).load})
+ state.Register("pkg/sentry/vfs.Watch", (*Watch)(nil), state.Fns{Save: (*Watch).save, Load: (*Watch).load})
+ state.Register("pkg/sentry/vfs.Event", (*Event)(nil), state.Fns{Save: (*Event).save, Load: (*Event).load})
state.Register("pkg/sentry/vfs.Mount", (*Mount)(nil), state.Fns{Save: (*Mount).save, Load: (*Mount).load})
state.Register("pkg/sentry/vfs.MountNamespace", (*MountNamespace)(nil), state.Fns{Save: (*MountNamespace).save, Load: (*MountNamespace).load})
state.Register("pkg/sentry/vfs.VirtualFilesystem", (*VirtualFilesystem)(nil), state.Fns{Save: (*VirtualFilesystem).save, Load: (*VirtualFilesystem).load})