Read vfs2 epoll events atomically.

Discovered by ayushranjan@: VFS2 was employing the following algorithm for fetching ready events from an epoll instance: - Create a statically sized EpollEvent slice on the stack of size 16. - Pass that to EpollInstance.ReadEvents() to populate. - EpollInstance.ReadEvents() requeues level-triggered events that it returns back into the ready queue. - Write the results to usermem. - If the number of results were = 16 then recall EpollInstance.ReadEvents() in the hopes of getting more. But this will cause duplication of the "requeued" ready level-triggered events. So if the ready queue has >= 16 ready events, the EpollWait for loop will spin until it fills the usermem with `maxEvents` events. Fixes #3521 PiperOrigin-RevId: 331840527
author: Jamie Liu <jamieliu@google.com> 2020-09-15 13:23:14 -0700
committer: gVisor bot <gvisor-bot@google.com> 2020-09-15 13:25:58 -0700
commit: 0d790cbaea5cd40ceabdf8f59978efff1a0626f1 (patch)
tree: 14a0d03184804079a69814bebbda703376c5a24c /pkg/sentry
parent: 86b31a80777dd906b275fb4b21888e93836f8a9b (diff)
2 files changed, 18 insertions, 47 deletions
diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go
index c62f03509..d0cbb77eb 100644
--- a/pkg/sentry/syscalls/linux/vfs2/epoll.go
+++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go
@@ -24,7 +24,6 @@ import (
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -141,50 +140,26 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 		return 0, nil, syserror.EINVAL
 	}
 
-	// Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent,
-	// maxEvents), so that the buffer can be allocated on the stack.
+	// Allocate space for a few events on the stack for the common case in
+	// which we don't have too many events.
 	var (
-		events       [16]linux.EpollEvent
-		total        int
+		eventsArr    [16]linux.EpollEvent
 		ch           chan struct{}
 		haveDeadline bool
 		deadline     ktime.Time
 	)
 	for {
-		batchEvents := len(events)
-		if batchEvents > maxEvents {
-			batchEvents = maxEvents
-		}
-		n := ep.ReadEvents(events[:batchEvents])
-		maxEvents -= n
-		if n != 0 {
-			// Copy what we read out.
-			copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events[:n])
+		events := ep.ReadEvents(eventsArr[:0], maxEvents)
+		if len(events) != 0 {
+			copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events)
 			copiedEvents := copiedBytes / sizeofEpollEvent // rounded down
-			eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent)
-			total += copiedEvents
-			if err != nil {
-				if total != 0 {
-					return uintptr(total), nil, nil
-				}
-				return 0, nil, err
-			}
-			// If we've filled the application's event buffer, we're done.
-			if maxEvents == 0 {
-				return uintptr(total), nil, nil
-			}
-			// Loop if we read a full batch, under the expectation that there
-			// may be more events to read.
-			if n == batchEvents {
-				continue
+			if copiedEvents != 0 {
+				return uintptr(copiedEvents), nil, nil
 			}
+			return 0, nil, err
 		}
-		// We get here if n != batchEvents. If we read any number of events
-		// (just now, or in a previous iteration of this loop), or if timeout
-		// is 0 (such that epoll_wait should be non-blocking), return the
-		// events we've read so far to the application.
-		if total != 0 || timeout == 0 {
-			return uintptr(total), nil, nil
+		if timeout == 0 {
+			return 0, nil, nil
 		}
 		// In the first iteration of this loop, register with the epoll
 		// instance for readability events, but then immediately continue the
@@ -207,8 +182,6 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
 				if err == syserror.ETIMEDOUT {
 					err = nil
 				}
-				// total must be 0 since otherwise we would have returned
-				// above.
 				return 0, nil, err
 			}
 		}
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 1b5af9f73..754e76aec 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -331,11 +331,9 @@ func (ep *EpollInstance) removeLocked(epi *epollInterest) {
 	ep.mu.Unlock()
 }
 
-// ReadEvents reads up to len(events) ready events into events and returns the
-// number of events read.
-//
-// Preconditions: len(events) != 0.
-func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
+// ReadEvents appends up to maxReady events to events and returns the updated
+// slice of events.
+func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent, maxEvents int) []linux.EpollEvent {
 	i := 0
 	// Hot path: avoid defer.
 	ep.mu.Lock()
@@ -368,16 +366,16 @@ func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
 			requeue.PushBack(epi)
 		}
 		// Report ievents.
-		events[i] = linux.EpollEvent{
+		events = append(events, linux.EpollEvent{
 			Events: ievents.ToLinux(),
 			Data:   epi.userData,
-		}
+		})
 		i++
-		if i == len(events) {
+		if i == maxEvents {
 			break
 		}
 	}
 	ep.ready.PushBackList(&requeue)
 	ep.mu.Unlock()
-	return i
+	return events
 }
author	Jamie Liu <jamieliu@google.com>	2020-09-15 13:23:14 -0700
committer	gVisor bot <gvisor-bot@google.com>	2020-09-15 13:25:58 -0700
commit	0d790cbaea5cd40ceabdf8f59978efff1a0626f1 (patch)
tree	14a0d03184804079a69814bebbda703376c5a24c /pkg/sentry
parent	86b31a80777dd906b275fb4b21888e93836f8a9b (diff)