From 267560d159b299a17f626029d7091ab923afeef6 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 16 Nov 2020 18:52:20 -0800
Subject: Reset watchdog timer between sendfile() iterations.

As part of this, change Task.interrupted() to not drain Task.interruptChan, and
do so explicitly using new function Task.unsetInterrupted() instead.

PiperOrigin-RevId: 342768365
---
 pkg/sentry/kernel/task_block.go          | 28 +++++++++++++---------
 pkg/sentry/kernel/task_sched.go          | 12 ++++++++++
 pkg/sentry/kernel/task_signals.go        | 12 ++++------
 pkg/sentry/syscalls/linux/vfs2/splice.go | 40 ++++++++++++++++++++++----------
 test/syscalls/linux/sendfile.cc          | 18 ++++++++++++++
 5 files changed, 80 insertions(+), 30 deletions(-)

diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go
index 147a3d286..9419f2e95 100644
--- a/pkg/sentry/kernel/task_block.go
+++ b/pkg/sentry/kernel/task_block.go
@@ -177,19 +177,23 @@ func (t *Task) SleepStart() <-chan struct{} {
 // SleepFinish implements context.ChannelSleeper.SleepFinish.
 func (t *Task) SleepFinish(success bool) {
 	if !success {
-		// The interrupted notification is consumed only at the top-level
-		// (Run). Therefore we attempt to reset the pending notification.
-		// This will also elide our next entry back into the task, so we
-		// will process signals, state changes, etc.
+		// Our caller received from t.interruptChan; we need to re-send to it
+		// to ensure that t.interrupted() is still true.
 		t.interruptSelf()
 	}
 	t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible)
 	t.Activate()
 }
 
-// Interrupted implements amutex.Sleeper.Interrupted
+// Interrupted implements context.ChannelSleeper.Interrupted.
 func (t *Task) Interrupted() bool {
-	return len(t.interruptChan) != 0
+	if t.interrupted() {
+		return true
+	}
+	// Indicate that t's task goroutine is still responsive (i.e. reset the
+	// watchdog timer).
+	t.accountTaskGoroutineRunning()
+	return false
 }
 
 // UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart.
@@ -210,13 +214,17 @@ func (t *Task) UninterruptibleSleepFinish(activate bool) {
 }
 
 // interrupted returns true if interrupt or interruptSelf has been called at
-// least once since the last call to interrupted.
+// least once since the last call to unsetInterrupted.
 func (t *Task) interrupted() bool {
+	return len(t.interruptChan) != 0
+}
+
+// unsetInterrupted causes interrupted to return false until the next call to
+// interrupt or interruptSelf.
+func (t *Task) unsetInterrupted() {
 	select {
 	case <-t.interruptChan:
-		return true
 	default:
-		return false
 	}
 }
 
@@ -232,9 +240,7 @@ func (t *Task) interrupt() {
 func (t *Task) interruptSelf() {
 	select {
 	case t.interruptChan <- struct{}{}:
-		t.Debugf("Interrupt queued")
 	default:
-		t.Debugf("Dropping duplicate interrupt")
 	}
 	// platform.Context.Interrupt() is unnecessary since a task goroutine
 	// calling interruptSelf() cannot also be blocked in
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 52c55d13d..9ba5f8d78 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -157,6 +157,18 @@ func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
 	t.goschedSeq.EndWrite()
 }
 
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) accountTaskGoroutineRunning() {
+	now := t.k.CPUClockNow()
+	if t.gosched.State != TaskGoroutineRunningSys {
+		panic(fmt.Sprintf("Task goroutine in state %v (expected %v)", t.gosched.State, TaskGoroutineRunningSys))
+	}
+	t.goschedSeq.BeginWrite()
+	t.gosched.SysTicks += now - t.gosched.Timestamp
+	t.gosched.Timestamp = now
+	t.goschedSeq.EndWrite()
+}
+
 // TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info.
 // Most clients should use t.CPUStats() instead.
 func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo {
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index ebdb83061..42dd3e278 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -619,9 +619,6 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) {
 				return
 			}
 		})
-		// We have to re-issue the interrupt consumed by t.interrupted() since
-		// it might have been for a different reason.
-		t.interruptSelf()
 	}
 
 	// Conversely, if the new mask unblocks any signals that were blocked by
@@ -931,10 +928,10 @@ func (t *Task) signalStop(target *Task, code int32, status int32) {
 type runInterrupt struct{}
 
 func (*runInterrupt) execute(t *Task) taskRunState {
-	// Interrupts are de-duplicated (if t is interrupted twice before
-	// t.interrupted() is called, t.interrupted() will only return true once),
-	// so early exits from this function must re-enter the runInterrupt state
-	// to check for more interrupt-signaled conditions.
+	// Interrupts are de-duplicated (t.unsetInterrupted() will undo the effect
+	// of all previous calls to t.interrupted() regardless of how many such
+	// calls there have been), so early exits from this function must re-enter
+	// the runInterrupt state to check for more interrupt-signaled conditions.
 
 	t.tg.signalHandlers.mu.Lock()
 
@@ -1080,6 +1077,7 @@ func (*runInterrupt) execute(t *Task) taskRunState {
 		return t.deliverSignal(info, act)
 	}
 
+	t.unsetInterrupted()
 	t.tg.signalHandlers.mu.Unlock()
 	return (*runApp)(nil)
 }
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
index 9ce4f280a..8bb763a47 100644
--- a/pkg/sentry/syscalls/linux/vfs2/splice.go
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -343,8 +343,8 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 
 	// Copy data.
 	var (
-		n   int64
-		err error
+		total int64
+		err   error
 	)
 	dw := dualWaiter{
 		inFile:  inFile,
@@ -357,13 +357,20 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	// can block.
 	nonBlock := outFile.StatusFlags()&linux.O_NONBLOCK != 0
 	if outIsPipe {
-		for n < count {
-			var spliceN int64
-			spliceN, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count)
+		for {
+			var n int64
+			n, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count-total)
 			if offset != -1 {
-				offset += spliceN
+				offset += n
+			}
+			total += n
+			if total == count {
+				break
+			}
+			if err == nil && t.Interrupted() {
+				err = syserror.ErrInterrupted
+				break
 			}
-			n += spliceN
 			if err == syserror.ErrWouldBlock && !nonBlock {
 				err = dw.waitForBoth(t)
 			}
@@ -374,7 +381,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 	} else {
 		// Read inFile to buffer, then write the contents to outFile.
 		buf := make([]byte, count)
-		for n < count {
+		for {
 			var readN int64
 			if offset != -1 {
 				readN, err = inFile.PRead(t, usermem.BytesIOSequence(buf), offset, vfs.ReadOptions{})
@@ -382,7 +389,6 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 			} else {
 				readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
 			}
-			n += readN
 
 			// Write all of the bytes that we read. This may need
 			// multiple write calls to complete.
@@ -398,7 +404,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 					// We didn't complete the write. Only report the bytes that were actually
 					// written, and rewind offsets as needed.
 					notWritten := int64(len(wbuf))
-					n -= notWritten
+					readN -= notWritten
 					if offset == -1 {
 						// We modified the offset of the input file itself during the read
 						// operation. Rewind it.
@@ -415,6 +421,16 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 					break
 				}
 			}
+
+			total += readN
+			buf = buf[readN:]
+			if total == count {
+				break
+			}
+			if err == nil && t.Interrupted() {
+				err = syserror.ErrInterrupted
+				break
+			}
 			if err == syserror.ErrWouldBlock && !nonBlock {
 				err = dw.waitForBoth(t)
 			}
@@ -432,7 +448,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 		}
 	}
 
-	if n != 0 {
+	if total != 0 {
 		inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent)
 		outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent)
 
@@ -445,7 +461,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
 
 	// We can only pass a single file to handleIOError, so pick inFile arbitrarily.
 	// This is used only for debugging purposes.
-	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "sendfile", inFile)
+	return uintptr(total), nil, slinux.HandleIOErrorVFS2(t, total != 0, err, syserror.ERESTARTSYS, "sendfile", inFile)
 }
 
 // dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not
diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc
index cf0977118..3924e0001 100644
--- a/test/syscalls/linux/sendfile.cc
+++ b/test/syscalls/linux/sendfile.cc
@@ -631,6 +631,24 @@ TEST(SendFileTest, SendFileToPipe) {
               SyscallSucceedsWithValue(kDataSize));
 }
 
+TEST(SendFileTest, SendFileToSelf_NoRandomSave) {
+  int rawfd;
+  ASSERT_THAT(rawfd = memfd_create("memfd", 0), SyscallSucceeds());
+  const FileDescriptor fd(rawfd);
+
+  char c = 0x01;
+  ASSERT_THAT(WriteFd(fd.get(), &c, 1), SyscallSucceedsWithValue(1));
+
+  // Arbitrarily chosen to make sendfile() take long enough that the sentry
+  // watchdog usually fires unless it's reset by sendfile() between iterations
+  // of the buffered copy. See b/172076632.
+  constexpr size_t kSendfileSize = 0xa00000;
+
+  off_t offset = 0;
+  ASSERT_THAT(sendfile(fd.get(), fd.get(), &offset, kSendfileSize),
+              SyscallSucceedsWithValue(kSendfileSize));
+}
+
 static volatile int signaled = 0;
 void SigUsr1Handler(int sig, siginfo_t* info, void* context) { signaled = 1; }
 
-- 
cgit v1.2.3