Support inotify IN_ONESHOT.

Also, while we're here, make sure that gofer inotify events are generated when
files are created in remote revalidating mode.

Updates #1479.

PiperOrigin-RevId: 318536354
author: Dean Deng <deandeng@google.com> 2020-06-26 13:46:01 -0700
committer: gVisor bot <gvisor-bot@google.com> 2020-06-26 13:47:48 -0700
commit: 54a31e219ca9d6086a367213a92d2a72ce3af07b (patch)
tree: 72fa69f3f94115c56b2c884527d7c3b93b3ba593
parent: cfd049da87f1a8ce5b9b20c65ab5ccd84cdaf3f1 (diff)
4 files changed, 115 insertions, 29 deletions
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index d253c996c..73bac738d 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -389,7 +389,15 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 		// RPC will fail with EEXIST like we would have. If the RPC succeeds, and a
 		// stale dentry exists, the dentry will fail revalidation next time it's
 		// used.
-		return createInRemoteDir(parent, name)
+		if err := createInRemoteDir(parent, name); err != nil {
+			return err
+		}
+		ev := linux.IN_CREATE
+		if dir {
+			ev |= linux.IN_ISDIR
+		}
+		parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
+		return nil
 	}
 	if child := parent.children[name]; child != nil {
 		return syserror.EEXIST
diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go
index 74e5287b7..5d98134a5 100644
--- a/pkg/sentry/syscalls/linux/vfs2/inotify.go
+++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go
@@ -81,7 +81,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern
 
 	// "EINVAL: The given event mask contains no valid events."
 	// -- inotify_add_watch(2)
-	if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 {
+	if mask&linux.ALL_INOTIFY_BITS == 0 {
 		return 0, nil, syserror.EINVAL
 	}
 
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
index 35960394c..509034531 100644
--- a/pkg/sentry/vfs/inotify.go
+++ b/pkg/sentry/vfs/inotify.go
@@ -447,29 +447,51 @@ func (w *Watches) Remove(id uint64) {
 		return
 	}
 
-	if _, ok := w.ws[id]; !ok {
-		// While there's technically no problem with silently ignoring a missing
-		// watch, this is almost certainly a bug.
-		panic(fmt.Sprintf("Attempt to remove a watch, but no watch found with provided id %+v.", id))
+	// It is possible for w.Remove() to be called for the same watch multiple
+	// times. See the treatment of one-shot watches in Watches.Notify().
+	if _, ok := w.ws[id]; ok {
+		delete(w.ws, id)
 	}
-	delete(w.ws, id)
 }
 
 // Notify queues a new event with watches in this set. Watches with
 // IN_EXCL_UNLINK are skipped if the event is coming from a child that has been
 // unlinked.
 func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlinked bool) {
-	// N.B. We don't defer the unlocks because Notify is in the hot path of
-	// all IO operations, and the defer costs too much for small IO
-	// operations.
+	var hasExpired bool
 	w.mu.RLock()
 	for _, watch := range w.ws {
 		if unlinked && watch.ExcludeUnlinked() && et == PathEvent {
 			continue
 		}
-		watch.Notify(name, events, cookie)
+		if watch.Notify(name, events, cookie) {
+			hasExpired = true
+		}
+	}
+	w.mu.RUnlock()
+
+	if hasExpired {
+		w.cleanupExpiredWatches()
+	}
+}
+
+// cleanupExpiredWatches removes the watches in this set that are marked expired.
+// It is relatively expensive; only call it when expired watches actually exist.
+func (w *Watches) cleanupExpiredWatches() {
+	// Because of lock ordering, we cannot acquire Inotify.mu for each watch
+	// owner while holding w.mu. As a result, store expired watches locally
+	// before removing.
+	var toRemove []*Watch
+	w.mu.RLock()
+	for _, watch := range w.ws {
+		if atomic.LoadInt32(&watch.expired) == 1 {
+			toRemove = append(toRemove, watch)
+		}
 	}
 	w.mu.RUnlock()
+	for _, watch := range toRemove {
+		watch.owner.RmWatch(watch.wd)
+	}
 }
 
 // HandleDeletion is called when the watch target is destroyed. Clear the
@@ -478,16 +500,10 @@ func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlin
 func (w *Watches) HandleDeletion() {
 	w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */)
 
-	// We can't hold w.mu while calling watch.handleDeletion to preserve lock
-	// ordering w.r.t to the owner inotify instances. Instead, atomically move
-	// the watches map into a local variable so we can iterate over it safely.
-	//
-	// Because of this however, it is possible for the watches' owners to reach
-	// this inode while the inode has no refs. This is still safe because the
-	// owners can only reach the inode until this function finishes calling
-	// watch.handleDeletion below and the inode is guaranteed to exist in the
-	// meantime. But we still have to be very careful not to rely on inode state
-	// that may have been already destroyed.
+	// As in Watches.Notify, we can't hold w.mu while acquiring Inotify.mu for
+	// the owner of each watch being deleted. Instead, atomically store the
+	// watches map in a local variable and set it to nil so we can iterate over
+	// it with the assurance that there will be no concurrent accesses.
 	var ws map[uint64]*Watch
 	w.mu.Lock()
 	ws = w.ws
@@ -519,17 +535,28 @@ func (w *Watches) HandleDeletion() {
 // +stateify savable
 type Watch struct {
 	// Inotify instance which owns this watch.
+	//
+	// This field is immutable after creation.
 	owner *Inotify
 
 	// Descriptor for this watch. This is unique across an inotify instance.
+	//
+	// This field is immutable after creation.
 	wd int32
 
 	// target is a dentry representing the watch target. Its watch set contains this watch.
+	//
+	// This field is immutable after creation.
 	target *Dentry
 
 	// Events being monitored via this watch. Must be accessed with atomic
 	// memory operations.
 	mask uint32
+
+	// expired is set to 1 to indicate that this watch is a one-shot that has
+	// already sent a notification and therefore can be removed. Must be accessed
+	// with atomic memory operations.
+	expired int32
 }
 
 // OwnerID returns the id of the inotify instance that owns this watch.
@@ -546,12 +573,20 @@ func (w *Watch) ExcludeUnlinked() bool {
 	return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0
 }
 
-// Notify queues a new event on this watch.
-func (w *Watch) Notify(name string, events uint32, cookie uint32) {
+// Notify queues a new event on this watch. Returns true if this is a one-shot
+// watch that should be deleted, after this event was successfully queued.
+func (w *Watch) Notify(name string, events uint32, cookie uint32) bool {
+	if atomic.LoadInt32(&w.expired) == 1 {
+		// This is a one-shot watch that is already in the process of being
+		// removed. This may happen if a second event reaches the watch target
+		// before this watch has been removed.
+		return false
+	}
+
 	mask := atomic.LoadUint32(&w.mask)
 	if mask&events == 0 {
 		// We weren't watching for this event.
-		return
+		return false
 	}
 
 	// Event mask should include bits matched from the watch plus all control
@@ -560,6 +595,11 @@ func (w *Watch) Notify(name string, events uint32, cookie uint32) {
 	effectiveMask := unmaskableBits | mask
 	matchedEvents := effectiveMask & events
 	w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie))
+	if mask&linux.IN_ONESHOT != 0 {
+		atomic.StoreInt32(&w.expired, 1)
+		return true
+	}
+	return false
 }
 
 // Event represents a struct inotify_event from linux.
diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index 731c7046c..bdb645c35 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -1485,20 +1485,26 @@ TEST(Inotify, DuplicateWatchReturnsSameWatchDescriptor) {
 
 TEST(Inotify, UnmatchedEventsAreDiscarded) {
   const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
-  const TempPath file1 =
+  TempPath file1 =
       ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
   const FileDescriptor fd =
       ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
 
-  ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(fd.get(), file1.path(), IN_ACCESS));
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(fd.get(), file1.path(), IN_ACCESS));
 
-  const FileDescriptor file1_fd =
+  FileDescriptor file1_fd =
       ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY));
 
-  const std::vector<Event> events =
-      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+  std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
   // We only asked for access events, the open event should be discarded.
   ASSERT_THAT(events, Are({}));
+
+  // IN_IGNORED events are always generated, regardless of the mask.
+  file1_fd.reset();
+  file1.reset();
+  events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+  ASSERT_THAT(events, Are({Event(IN_IGNORED, wd)}));
 }
 
 TEST(Inotify, AddWatchWithInvalidEventMaskFails) {
@@ -2073,6 +2079,38 @@ TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) {
                       }));
 }
 
+TEST(Inotify, OneShot) {
+  // TODO(gvisor.dev/issue/1624): IN_ONESHOT not supported in VFS1.
+  SKIP_IF(IsRunningWithVFS1());
+
+  const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+  const FileDescriptor inotify_fd =
+      ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+  const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+      InotifyAddWatch(inotify_fd.get(), file.path(), IN_MODIFY | IN_ONESHOT));
+
+  // Open an fd, write to it, and then close it.
+  FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY));
+  ASSERT_THAT(write(fd.get(), "x", 1), SyscallSucceedsWithValue(1));
+  fd.reset();
+
+  // We should get a single event followed by IN_IGNORED indicating removal
+  // of the one-shot watch. Prior activity (i.e. open) that is not in the mask
+  // should not trigger removal, and activity after removal (i.e. close) should
+  // not generate events.
+  std::vector<Event> events =
+      ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+  EXPECT_THAT(events, Are({
+                          Event(IN_MODIFY, wd),
+                          Event(IN_IGNORED, wd),
+                      }));
+
+  // The watch should already have been removed.
+  EXPECT_THAT(inotify_rm_watch(inotify_fd.get(), wd),
+              SyscallFailsWithErrno(EINVAL));
+}
+
 // This test helps verify that the lock order of filesystem and inotify locks
 // is respected when inotify instances and watch targets are concurrently being
 // destroyed.
author	Dean Deng <deandeng@google.com>	2020-06-26 13:46:01 -0700
committer	gVisor bot <gvisor-bot@google.com>	2020-06-26 13:47:48 -0700
commit	54a31e219ca9d6086a367213a92d2a72ce3af07b (patch)
tree	72fa69f3f94115c56b2c884527d7c3b93b3ba593
parent	cfd049da87f1a8ce5b9b20c65ab5ccd84cdaf3f1 (diff)