42 files changed, 1046 insertions, 578 deletions
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index c0de72eef..0ee60569c 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -79,7 +79,7 @@ go_template_instance(
     out = "fd_table_refs.go",
     package = "kernel",
     prefix = "FDTable",
-    template = "//pkg/refs_vfs2:refs_template",
+    template = "//pkg/refsvfs2:refs_template",
     types = {
         "T": "FDTable",
     },
@@ -90,7 +90,7 @@ go_template_instance(
     out = "fs_context_refs.go",
     package = "kernel",
     prefix = "FSContext",
-    template = "//pkg/refs_vfs2:refs_template",
+    template = "//pkg/refsvfs2:refs_template",
     types = {
         "T": "FSContext",
     },
@@ -101,7 +101,7 @@ go_template_instance(
     out = "ipc_namespace_refs.go",
     package = "kernel",
     prefix = "IPCNamespace",
-    template = "//pkg/refs_vfs2:refs_template",
+    template = "//pkg/refsvfs2:refs_template",
     types = {
         "T": "IPCNamespace",
     },
@@ -112,7 +112,7 @@ go_template_instance(
     out = "process_group_refs.go",
     package = "kernel",
     prefix = "ProcessGroup",
-    template = "//pkg/refs_vfs2:refs_template",
+    template = "//pkg/refsvfs2:refs_template",
     types = {
         "T": "ProcessGroup",
     },
@@ -123,7 +123,7 @@ go_template_instance(
     out = "session_refs.go",
     package = "kernel",
     prefix = "Session",
-    template = "//pkg/refs_vfs2:refs_template",
+    template = "//pkg/refsvfs2:refs_template",
     types = {
         "T": "Session",
     },
@@ -184,6 +184,7 @@ go_library(
         "task_exit.go",
         "task_futex.go",
         "task_identity.go",
+        "task_image.go",
         "task_list.go",
         "task_log.go",
         "task_net.go",
@@ -224,12 +225,13 @@ go_library(
         "//pkg/cpuid",
         "//pkg/eventchannel",
         "//pkg/fspath",
+        "//pkg/goid",
         "//pkg/log",
         "//pkg/marshal",
         "//pkg/marshal/primitive",
         "//pkg/metric",
         "//pkg/refs",
-        "//pkg/refs_vfs2",
+        "//pkg/refsvfs2",
         "//pkg/safemem",
         "//pkg/secio",
         "//pkg/sentry/arch",
diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go
index 1b9721534..0ddbe5ff6 100644
--- a/pkg/sentry/kernel/abstract_socket_namespace.go
+++ b/pkg/sentry/kernel/abstract_socket_namespace.go
@@ -19,7 +19,7 @@ import (
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs_vfs2"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sync"
 )
@@ -27,7 +27,7 @@ import (
 // +stateify savable
 type abstractEndpoint struct {
 	ep     transport.BoundEndpoint
-	socket refs_vfs2.RefCounter
+	socket refsvfs2.RefCounter
 	name   string
 	ns     *AbstractSocketNamespace
 }
@@ -57,7 +57,7 @@ func NewAbstractSocketNamespace() *AbstractSocketNamespace {
 // its backing socket.
 type boundEndpoint struct {
 	transport.BoundEndpoint
-	socket refs_vfs2.RefCounter
+	socket refsvfs2.RefCounter
 }
 
 // Release implements transport.BoundEndpoint.Release.
@@ -89,7 +89,7 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp
 //
 // When the last reference managed by socket is dropped, ep may be removed from the
 // namespace.
-func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refs_vfs2.RefCounter) error {
+func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, socket refsvfs2.RefCounter) error {
 	a.mu.Lock()
 	defer a.mu.Unlock()
 
@@ -109,7 +109,7 @@ func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep tran
 
 // Remove removes the specified socket at name from the abstract socket
 // namespace, if it has not yet been replaced.
-func (a *AbstractSocketNamespace) Remove(name string, socket refs_vfs2.RefCounter) {
+func (a *AbstractSocketNamespace) Remove(name string, socket refsvfs2.RefCounter) {
 	a.mu.Lock()
 	defer a.mu.Unlock()
 
diff --git a/pkg/sentry/kernel/aio.go b/pkg/sentry/kernel/aio.go
index 0ac78c0b8..ec36d1a49 100644
--- a/pkg/sentry/kernel/aio.go
+++ b/pkg/sentry/kernel/aio.go
@@ -15,10 +15,7 @@
 package kernel
 
 import (
-	"time"
-
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/log"
 )
 
 // AIOCallback is an function that does asynchronous I/O on behalf of a task.
@@ -26,7 +23,7 @@ type AIOCallback func(context.Context)
 
 // QueueAIO queues an AIOCallback which will be run asynchronously.
 func (t *Task) QueueAIO(cb AIOCallback) {
-	ctx := taskAsyncContext{t: t}
+	ctx := t.AsyncContext()
 	wg := &t.TaskSet().aioGoroutines
 	wg.Add(1)
 	go func() {
@@ -34,48 +31,3 @@ func (t *Task) QueueAIO(cb AIOCallback) {
 		wg.Done()
 	}()
 }
-
-type taskAsyncContext struct {
-	context.NoopSleeper
-	t *Task
-}
-
-// Debugf implements log.Logger.Debugf.
-func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) {
-	ctx.t.Debugf(format, v...)
-}
-
-// Infof implements log.Logger.Infof.
-func (ctx taskAsyncContext) Infof(format string, v ...interface{}) {
-	ctx.t.Infof(format, v...)
-}
-
-// Warningf implements log.Logger.Warningf.
-func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) {
-	ctx.t.Warningf(format, v...)
-}
-
-// IsLogging implements log.Logger.IsLogging.
-func (ctx taskAsyncContext) IsLogging(level log.Level) bool {
-	return ctx.t.IsLogging(level)
-}
-
-// Deadline implements context.Context.Deadline.
-func (ctx taskAsyncContext) Deadline() (time.Time, bool) {
-	return ctx.t.Deadline()
-}
-
-// Done implements context.Context.Done.
-func (ctx taskAsyncContext) Done() <-chan struct{} {
-	return ctx.t.Done()
-}
-
-// Err implements context.Context.Err.
-func (ctx taskAsyncContext) Err() error {
-	return ctx.t.Err()
-}
-
-// Value implements context.Context.Value.
-func (ctx taskAsyncContext) Value(key interface{}) interface{} {
-	return ctx.t.Value(key)
-}
diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go
index bb94769c4..a8596410f 100644
--- a/pkg/sentry/kernel/context.go
+++ b/pkg/sentry/kernel/context.go
@@ -15,8 +15,6 @@
 package kernel
 
 import (
-	"time"
-
 	"gvisor.dev/gvisor/pkg/context"
 )
 
@@ -98,18 +96,3 @@ func TaskFromContext(ctx context.Context) *Task {
 	}
 	return nil
 }
-
-// Deadline implements context.Context.Deadline.
-func (*Task) Deadline() (time.Time, bool) {
-	return time.Time{}, false
-}
-
-// Done implements context.Context.Done.
-func (*Task) Done() <-chan struct{} {
-	return nil
-}
-
-// Err implements context.Context.Err.
-func (*Task) Err() error {
-	return nil
-}
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 15519f0df..61aeca044 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -273,7 +273,7 @@ func (e *EventPoll) ReadEvents(max int) []linux.EpollEvent {
 //
 // Callback is called when one of the files we're polling becomes ready. It
 // moves said file to the readyList if it's currently in the waiting list.
-func (p *pollEntry) Callback(*waiter.Entry) {
+func (p *pollEntry) Callback(*waiter.Entry, waiter.EventMask) {
 	e := p.epoll
 
 	e.listsMu.Lock()
@@ -306,9 +306,8 @@ func (e *EventPoll) initEntryReadiness(entry *pollEntry) {
 	f.EventRegister(&entry.waiter, entry.mask)
 
 	// Check if the file happens to already be in a ready state.
-	ready := f.Readiness(entry.mask) & entry.mask
-	if ready != 0 {
-		entry.Callback(&entry.waiter)
+	if ready := f.Readiness(entry.mask) & entry.mask; ready != 0 {
+		entry.Callback(&entry.waiter, ready)
 	}
 }
 
diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD
index 2b3955598..f855f038b 100644
--- a/pkg/sentry/kernel/fasync/BUILD
+++ b/pkg/sentry/kernel/fasync/BUILD
@@ -8,11 +8,13 @@ go_library(
     visibility = ["//:sandbox"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/sentry/arch",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/vfs",
         "//pkg/sync",
+        "//pkg/syserror",
         "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go
index 153d2cd9b..b66d61c6f 100644
--- a/pkg/sentry/kernel/fasync/fasync.go
+++ b/pkg/sentry/kernel/fasync/fasync.go
@@ -17,22 +17,45 @@ package fasync
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// New creates a new fs.FileAsync.
-func New() fs.FileAsync {
-	return &FileAsync{}
+// Table to convert waiter event masks into si_band siginfo codes.
+// Taken from fs/fcntl.c:band_table.
+var bandTable = map[waiter.EventMask]int64{
+	// POLL_IN
+	waiter.EventIn: linux.EPOLLIN | linux.EPOLLRDNORM,
+	// POLL_OUT
+	waiter.EventOut: linux.EPOLLOUT | linux.EPOLLWRNORM | linux.EPOLLWRBAND,
+	// POLL_ERR
+	waiter.EventErr: linux.EPOLLERR,
+	// POLL_PRI
+	waiter.EventPri: linux.EPOLLPRI | linux.EPOLLRDBAND,
+	// POLL_HUP
+	waiter.EventHUp: linux.EPOLLHUP | linux.EPOLLERR,
 }
 
-// NewVFS2 creates a new vfs.FileAsync.
-func NewVFS2() vfs.FileAsync {
-	return &FileAsync{}
+// New returns a function that creates a new fs.FileAsync with the given file
+// descriptor.
+func New(fd int) func() fs.FileAsync {
+	return func() fs.FileAsync {
+		return &FileAsync{fd: fd}
+	}
+}
+
+// NewVFS2 returns a function that creates a new vfs.FileAsync with the given
+// file descriptor.
+func NewVFS2(fd int) func() vfs.FileAsync {
+	return func() vfs.FileAsync {
+		return &FileAsync{fd: fd}
+	}
 }
 
 // FileAsync sends signals when the registered file is ready for IO.
@@ -42,6 +65,12 @@ type FileAsync struct {
 	// e is immutable after first use (which is protected by mu below).
 	e waiter.Entry
 
+	// fd is the file descriptor to notify about.
+	// It is immutable, set at allocation time. This matches Linux semantics in
+	// fs/fcntl.c:fasync_helper.
+	// The fd value is passed to the signal recipient in siginfo.si_fd.
+	fd int
+
 	// regMu protects registeration and unregistration actions on e.
 	//
 	// regMu must be held while registration decisions are being made
@@ -56,6 +85,10 @@ type FileAsync struct {
 	mu         sync.Mutex `state:"nosave"`
 	requester  *auth.Credentials
 	registered bool
+	// signal is the signal to deliver upon I/O being available.
+	// The default value ("zero signal") means the default SIGIO signal will be
+	// delivered.
+	signal linux.Signal
 
 	// Only one of the following is allowed to be non-nil.
 	recipientPG *kernel.ProcessGroup
@@ -64,10 +97,10 @@ type FileAsync struct {
 }
 
 // Callback sends a signal.
-func (a *FileAsync) Callback(e *waiter.Entry) {
+func (a *FileAsync) Callback(e *waiter.Entry, mask waiter.EventMask) {
 	a.mu.Lock()
+	defer a.mu.Unlock()
 	if !a.registered {
-		a.mu.Unlock()
 		return
 	}
 	t := a.recipientT
@@ -80,19 +113,34 @@ func (a *FileAsync) Callback(e *waiter.Entry) {
 	}
 	if t == nil {
 		// No recipient has been registered.
-		a.mu.Unlock()
 		return
 	}
 	c := t.Credentials()
 	// Logic from sigio_perm in fs/fcntl.c.
-	if a.requester.EffectiveKUID == 0 ||
+	permCheck := (a.requester.EffectiveKUID == 0 ||
 		a.requester.EffectiveKUID == c.SavedKUID ||
 		a.requester.EffectiveKUID == c.RealKUID ||
 		a.requester.RealKUID == c.SavedKUID ||
-		a.requester.RealKUID == c.RealKUID {
-		t.SendSignal(kernel.SignalInfoPriv(linux.SIGIO))
+		a.requester.RealKUID == c.RealKUID)
+	if !permCheck {
+		return
 	}
-	a.mu.Unlock()
+	signalInfo := &arch.SignalInfo{
+		Signo: int32(linux.SIGIO),
+		Code:  arch.SignalInfoKernel,
+	}
+	if a.signal != 0 {
+		signalInfo.Signo = int32(a.signal)
+		signalInfo.SetFD(uint32(a.fd))
+		var band int64
+		for m, bandCode := range bandTable {
+			if m&mask != 0 {
+				band |= bandCode
+			}
+		}
+		signalInfo.SetBand(band)
+	}
+	t.SendSignal(signalInfo)
 }
 
 // Register sets the file which will be monitored for IO events.
@@ -186,3 +234,25 @@ func (a *FileAsync) ClearOwner() {
 	a.recipientTG = nil
 	a.recipientPG = nil
 }
+
+// Signal returns which signal will be sent to the signal recipient.
+// A value of zero means the signal to deliver wasn't customized, which means
+// the default signal (SIGIO) will be delivered.
+func (a *FileAsync) Signal() linux.Signal {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	return a.signal
+}
+
+// SetSignal overrides which signal to send when I/O is available.
+// The default behavior can be reset by specifying signal zero, which means
+// to send SIGIO.
+func (a *FileAsync) SetSignal(signal linux.Signal) error {
+	if signal != 0 && !signal.IsValid() {
+		return syserror.EINVAL
+	}
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	a.signal = signal
+	return nil
+}
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 0ec7344cd..7aba31587 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -110,7 +110,7 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
 
 func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
 	ctx := context.Background()
-	f.init() // Initialize table.
+	f.initNoLeakCheck() // Initialize table.
 	f.used = 0
 	for fd, d := range m {
 		if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil {
@@ -240,6 +240,10 @@ func (f *FDTable) String() string {
 
 		case fileVFS2 != nil:
 			vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem()
+			vd := fileVFS2.VirtualDentry()
+			if vd.Dentry() == nil {
+				panic(fmt.Sprintf("fd %d (type %T) has nil dentry: %#v", fd, fileVFS2.Impl(), fileVFS2))
+			}
 			name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
 			if err != nil {
 				fmt.Fprintf(&buf, "<err: %v>\n", err)
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index da79e6627..f17f9c59c 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -31,14 +31,21 @@ type descriptorTable struct {
 	slice unsafe.Pointer `state:".(map[int32]*descriptor)"`
 }
 
-// init initializes the table.
+// initNoLeakCheck initializes the table without enabling leak checking.
 //
-// TODO(gvisor.dev/1486): Enable leak check for FDTable.
-func (f *FDTable) init() {
+// This is used when loading an FDTable after S/R, during which the ref count
+// object itself will enable leak checking if necessary.
+func (f *FDTable) initNoLeakCheck() {
 	var slice []unsafe.Pointer // Empty slice.
 	atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
 }
 
+// init initializes the table with leak checking.
+func (f *FDTable) init() {
+	f.initNoLeakCheck()
+	f.InitRefs()
+}
+
 // get gets a file entry.
 //
 // The boolean indicates whether this was in range.
@@ -114,18 +121,21 @@ func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2
 		panic("VFS1 and VFS2 files set")
 	}
 
-	slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
+	slicePtr := (*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
 
 	// Grow the table as required.
-	if last := int32(len(slice)); fd >= last {
+	if last := int32(len(*slicePtr)); fd >= last {
 		end := fd + 1
 		if end < 2*last {
 			end = 2 * last
 		}
-		slice = append(slice, make([]unsafe.Pointer, end-last)...)
-		atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
+		newSlice := append(*slicePtr, make([]unsafe.Pointer, end-last)...)
+		slicePtr = &newSlice
+		atomic.StorePointer(&f.slice, unsafe.Pointer(slicePtr))
 	}
 
+	slice := *slicePtr
+
 	var desc *descriptor
 	if file != nil || fileVFS2 != nil {
 		desc = &descriptor{
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index d46d1e1c1..dfde4deee 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -63,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
 		cwd:   cwd,
 		umask: umask,
 	}
-	f.EnableLeakCheck()
+	f.InitRefs()
 	return &f
 }
 
@@ -76,7 +76,7 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
 		cwdVFS2:  cwd,
 		umask:    umask,
 	}
-	f.EnableLeakCheck()
+	f.InitRefs()
 	return &f
 }
 
@@ -130,13 +130,15 @@ func (f *FSContext) Fork() *FSContext {
 		f.root.IncRef()
 	}
 
-	return &FSContext{
+	ctx := &FSContext{
 		cwd:      f.cwd,
 		root:     f.root,
 		cwdVFS2:  f.cwdVFS2,
 		rootVFS2: f.rootVFS2,
 		umask:    f.umask,
 	}
+	ctx.InitRefs()
+	return ctx
 }
 
 // WorkingDirectory returns the current working directory.
@@ -147,19 +149,23 @@ func (f *FSContext) WorkingDirectory() *fs.Dirent {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
-	f.cwd.IncRef()
+	if f.cwd != nil {
+		f.cwd.IncRef()
+	}
 	return f.cwd
 }
 
 // WorkingDirectoryVFS2 returns the current working directory.
 //
-// This will return nil if called after f is destroyed, otherwise it will return
-// a Dirent with a reference taken.
+// This will return an empty vfs.VirtualDentry if called after f is
+// destroyed, otherwise it will return a Dirent with a reference taken.
 func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
-	f.cwdVFS2.IncRef()
+	if f.cwdVFS2.Ok() {
+		f.cwdVFS2.IncRef()
+	}
 	return f.cwdVFS2
 }
 
@@ -218,13 +224,15 @@ func (f *FSContext) RootDirectory() *fs.Dirent {
 
 // RootDirectoryVFS2 returns the current filesystem root.
 //
-// This will return nil if called after f is destroyed, otherwise it will return
-// a Dirent with a reference taken.
+// This will return an empty vfs.VirtualDentry if called after f is
+// destroyed, otherwise it will return a Dirent with a reference taken.
 func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry {
 	f.mu.Lock()
 	defer f.mu.Unlock()
 
-	f.rootVFS2.IncRef()
+	if f.rootVFS2.Ok() {
+		f.rootVFS2.IncRef()
+	}
 	return f.rootVFS2
 }
 
diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go
index 3f34ee0db..9545bb5ef 100644
--- a/pkg/sentry/kernel/ipc_namespace.go
+++ b/pkg/sentry/kernel/ipc_namespace.go
@@ -41,7 +41,7 @@ func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace {
 		semaphores: semaphore.NewRegistry(userNS),
 		shms:       shm.NewRegistry(userNS),
 	}
-	ns.EnableLeakCheck()
+	ns.InitRefs()
 	return ns
 }
 
@@ -55,7 +55,7 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry {
 	return i.shms
 }
 
-// DecRef implements refs_vfs2.RefCounter.DecRef.
+// DecRef implements refsvfs2.RefCounter.DecRef.
 func (i *IPCNamespace) DecRef(ctx context.Context) {
 	i.IPCNamespaceRefs.DecRef(func() {
 		i.shms.Release(ctx)
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 0eb2bf7bd..b8627a54f 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -214,9 +214,11 @@ type Kernel struct {
 	// netlinkPorts manages allocation of netlink socket port IDs.
 	netlinkPorts *port.Manager
 
-	// saveErr is the error causing the sandbox to exit during save, if
-	// any. It is protected by extMu.
-	saveErr error `state:"nosave"`
+	// saveStatus is nil if the sandbox has not been saved, errSaved or
+	// errAutoSaved if it has been saved successfully, or the error causing the
+	// sandbox to exit during save.
+	// It is protected by extMu.
+	saveStatus error `state:"nosave"`
 
 	// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
 	danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`
@@ -430,9 +432,8 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 // SaveTo saves the state of k to w.
 //
 // Preconditions: The kernel must be paused throughout the call to SaveTo.
-func (k *Kernel) SaveTo(w wire.Writer) error {
+func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error {
 	saveStart := time.Now()
-	ctx := k.SupervisorContext()
 
 	// Do not allow other Kernel methods to affect it while it's being saved.
 	k.extMu.Lock()
@@ -446,38 +447,55 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
 	k.mf.StartEvictions()
 	k.mf.WaitForEvictions()
 
-	// Flush write operations on open files so data reaches backing storage.
-	// This must come after MemoryFile eviction since eviction may cause file
-	// writes.
-	if err := k.tasks.flushWritesToFiles(ctx); err != nil {
-		return err
-	}
+	if VFS2Enabled {
+		// Discard unsavable mappings, such as those for host file descriptors.
+		if err := k.invalidateUnsavableMappings(ctx); err != nil {
+			return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+		}
+
+		// Prepare filesystems for saving. This must be done after
+		// invalidateUnsavableMappings(), since dropping memory mappings may
+		// affect filesystem state (e.g. page cache reference counts).
+		if err := k.vfs.PrepareSave(ctx); err != nil {
+			return err
+		}
+	} else {
+		// Flush cached file writes to backing storage. This must come after
+		// MemoryFile eviction since eviction may cause file writes.
+		if err := k.flushWritesToFiles(ctx); err != nil {
+			return err
+		}
 
-	// Remove all epoll waiter objects from underlying wait queues.
-	// NOTE: for programs to resume execution in future snapshot scenarios,
-	// we will need to re-establish these waiter objects after saving.
-	k.tasks.unregisterEpollWaiters(ctx)
+		// Remove all epoll waiter objects from underlying wait queues.
+		// NOTE: for programs to resume execution in future snapshot scenarios,
+		// we will need to re-establish these waiter objects after saving.
+		k.tasks.unregisterEpollWaiters(ctx)
 
-	// Clear the dirent cache before saving because Dirents must be Loaded in a
-	// particular order (parents before children), and Loading dirents from a cache
-	// breaks that order.
-	if err := k.flushMountSourceRefs(ctx); err != nil {
-		return err
-	}
+		// Clear the dirent cache before saving because Dirents must be Loaded in a
+		// particular order (parents before children), and Loading dirents from a cache
+		// breaks that order.
+		if err := k.flushMountSourceRefs(ctx); err != nil {
+			return err
+		}
 
-	// Ensure that all inode and mount release operations have completed.
-	fs.AsyncBarrier()
+		// Ensure that all inode and mount release operations have completed.
+		fs.AsyncBarrier()
 
-	// Once all fs work has completed (flushed references have all been released),
-	// reset mount mappings. This allows individual mounts to save how inodes map
-	// to filesystem resources. Without this, fs.Inodes cannot be restored.
-	fs.SaveInodeMappings()
+		// Once all fs work has completed (flushed references have all been released),
+		// reset mount mappings. This allows individual mounts to save how inodes map
+		// to filesystem resources. Without this, fs.Inodes cannot be restored.
+		fs.SaveInodeMappings()
 
-	// Discard unsavable mappings, such as those for host file descriptors.
-	// This must be done after waiting for "asynchronous fs work", which
-	// includes async I/O that may touch application memory.
-	if err := k.invalidateUnsavableMappings(ctx); err != nil {
-		return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+		// Discard unsavable mappings, such as those for host file descriptors.
+		// This must be done after waiting for "asynchronous fs work", which
+		// includes async I/O that may touch application memory.
+		//
+		// TODO(gvisor.dev/issue/1624): This rationale is believed to be
+		// obsolete since AIO callbacks are now waited-for by Kernel.Pause(),
+		// but this order is conservatively retained for VFS1.
+		if err := k.invalidateUnsavableMappings(ctx); err != nil {
+			return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
+		}
 	}
 
 	// Save the CPUID FeatureSet before the rest of the kernel so we can
@@ -486,14 +504,14 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
 	//
 	// N.B. This will also be saved along with the full kernel save below.
 	cpuidStart := time.Now()
-	if _, err := state.Save(k.SupervisorContext(), w, k.FeatureSet()); err != nil {
+	if _, err := state.Save(ctx, w, k.FeatureSet()); err != nil {
 		return err
 	}
 	log.Infof("CPUID save took [%s].", time.Since(cpuidStart))
 
 	// Save the kernel state.
 	kernelStart := time.Now()
-	stats, err := state.Save(k.SupervisorContext(), w, k)
+	stats, err := state.Save(ctx, w, k)
 	if err != nil {
 		return err
 	}
@@ -502,7 +520,7 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
 
 	// Save the memory file's state.
 	memoryStart := time.Now()
-	if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil {
+	if err := k.mf.SaveTo(ctx, w); err != nil {
 		return err
 	}
 	log.Infof("Memory save took [%s].", time.Since(memoryStart))
@@ -514,11 +532,9 @@ func (k *Kernel) SaveTo(w wire.Writer) error {
 
 // flushMountSourceRefs flushes the MountSources for all mounted filesystems
 // and open FDs.
+//
+// Preconditions: !VFS2Enabled.
 func (k *Kernel) flushMountSourceRefs(ctx context.Context) error {
-	if VFS2Enabled {
-		return nil // Not relevant.
-	}
-
 	// Flush all mount sources for currently mounted filesystems in each task.
 	flushed := make(map[*fs.MountNamespace]struct{})
 	k.tasks.mu.RLock()
@@ -561,13 +577,9 @@ func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.Fi
 	return err
 }
 
-func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
-	// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
-	if VFS2Enabled {
-		return nil
-	}
-
-	return ts.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
+// Preconditions: !VFS2Enabled.
+func (k *Kernel) flushWritesToFiles(ctx context.Context) error {
+	return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error {
 		if flags := file.Flags(); !flags.Write {
 			return nil
 		}
@@ -589,37 +601,8 @@ func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
 	})
 }
 
-// Preconditions: The kernel must be paused.
-func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
-	invalidated := make(map[*mm.MemoryManager]struct{})
-	k.tasks.mu.RLock()
-	defer k.tasks.mu.RUnlock()
-	for t := range k.tasks.Root.tids {
-		// We can skip locking Task.mu here since the kernel is paused.
-		if mm := t.tc.MemoryManager; mm != nil {
-			if _, ok := invalidated[mm]; !ok {
-				if err := mm.InvalidateUnsavable(ctx); err != nil {
-					return err
-				}
-				invalidated[mm] = struct{}{}
-			}
-		}
-		// I really wish we just had a sync.Map of all MMs...
-		if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
-			if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil {
-				return err
-			}
-		}
-	}
-	return nil
-}
-
+// Preconditions: !VFS2Enabled.
 func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) {
-	// TODO(gvisor.dev/issue/1663): Add save support for VFS2.
-	if VFS2Enabled {
-		return
-	}
-
 	ts.mu.RLock()
 	defer ts.mu.RUnlock()
 
@@ -644,8 +627,33 @@ func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) {
 	}
 }
 
+// Preconditions: The kernel must be paused.
+func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
+	invalidated := make(map[*mm.MemoryManager]struct{})
+	k.tasks.mu.RLock()
+	defer k.tasks.mu.RUnlock()
+	for t := range k.tasks.Root.tids {
+		// We can skip locking Task.mu here since the kernel is paused.
+		if mm := t.image.MemoryManager; mm != nil {
+			if _, ok := invalidated[mm]; !ok {
+				if err := mm.InvalidateUnsavable(ctx); err != nil {
+					return err
+				}
+				invalidated[mm] = struct{}{}
+			}
+		}
+		// I really wish we just had a sync.Map of all MMs...
+		if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
+			if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
 // LoadFrom returns a new Kernel loaded from args.
-func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
+func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error {
 	loadStart := time.Now()
 
 	initAppCores := k.applicationCores
@@ -656,7 +664,7 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock
 	// don't need to explicitly install it in the Kernel.
 	cpuidStart := time.Now()
 	var features cpuid.FeatureSet
-	if _, err := state.Load(k.SupervisorContext(), r, &features); err != nil {
+	if _, err := state.Load(ctx, r, &features); err != nil {
 		return err
 	}
 	log.Infof("CPUID load took [%s].", time.Since(cpuidStart))
@@ -671,7 +679,7 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock
 
 	// Load the kernel state.
 	kernelStart := time.Now()
-	stats, err := state.Load(k.SupervisorContext(), r, k)
+	stats, err := state.Load(ctx, r, k)
 	if err != nil {
 		return err
 	}
@@ -684,7 +692,7 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock
 
 	// Load the memory file's state.
 	memoryStart := time.Now()
-	if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil {
+	if err := k.mf.LoadFrom(ctx, r); err != nil {
 		return err
 	}
 	log.Infof("Memory load took [%s].", time.Since(memoryStart))
@@ -696,11 +704,17 @@ func (k *Kernel) LoadFrom(r wire.Reader, net inet.Stack, clocks sentrytime.Clock
 		net.Resume()
 	}
 
-	// Ensure that all pending asynchronous work is complete:
-	//   - namedpipe opening
-	//   - inode file opening
-	if err := fs.AsyncErrorBarrier(); err != nil {
-		return err
+	if VFS2Enabled {
+		if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil {
+			return err
+		}
+	} else {
+		// Ensure that all pending asynchronous work is complete:
+		//   - namedpipe opening
+		//   - inode file opening
+		if err := fs.AsyncErrorBarrier(); err != nil {
+			return err
+		}
 	}
 
 	tcpip.AsyncLoading.Wait()
@@ -1005,7 +1019,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		Features:            k.featureSet,
 	}
 
-	tc, se := k.LoadTaskImage(ctx, loadArgs)
+	image, se := k.LoadTaskImage(ctx, loadArgs)
 	if se != nil {
 		return nil, 0, errors.New(se.String())
 	}
@@ -1018,7 +1032,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	config := &TaskConfig{
 		Kernel:                  k,
 		ThreadGroup:             tg,
-		TaskContext:             tc,
+		TaskImage:               image,
 		FSContext:               fsContext,
 		FDTable:                 args.FDTable,
 		Credentials:             args.Credentials,
@@ -1034,7 +1048,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 	if err != nil {
 		return nil, 0, err
 	}
-	t.traceExecEvent(tc) // Simulate exec for tracing.
+	t.traceExecEvent(image) // Simulate exec for tracing.
 
 	// Success.
 	cu.Release()
@@ -1347,6 +1361,13 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
 // not have meaningful trace data. Rebuilding here ensures that we can do so
 // after tracing has been enabled.
 func (k *Kernel) RebuildTraceContexts() {
+	// We need to pause all task goroutines because Task.rebuildTraceContext()
+	// replaces Task.traceContext and Task.traceTask, which are
+	// task-goroutine-exclusive (i.e. the task goroutine assumes that it can
+	// access them without synchronization) for performance.
+	k.Pause()
+	defer k.Unpause()
+
 	k.extMu.Lock()
 	defer k.extMu.Unlock()
 	k.tasks.mu.RLock()
@@ -1462,12 +1483,42 @@ func (k *Kernel) NetlinkPorts() *port.Manager {
 	return k.netlinkPorts
 }
 
-// SaveError returns the sandbox error that caused the kernel to exit during
-// save.
-func (k *Kernel) SaveError() error {
+var (
+	errSaved     = errors.New("sandbox has been successfully saved")
+	errAutoSaved = errors.New("sandbox has been successfully auto-saved")
+)
+
+// SaveStatus returns the sandbox save status. If it was saved successfully,
+// autosaved indicates whether save was triggered by autosave. If it was not
+// saved successfully, err indicates the sandbox error that caused the kernel to
+// exit during save.
+func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) {
 	k.extMu.Lock()
 	defer k.extMu.Unlock()
-	return k.saveErr
+	switch k.saveStatus {
+	case nil:
+		return false, false, nil
+	case errSaved:
+		return true, false, nil
+	case errAutoSaved:
+		return true, true, nil
+	default:
+		return false, false, k.saveStatus
+	}
+}
+
+// SetSaveSuccess sets the flag indicating that save completed successfully, if
+// no status was already set.
+func (k *Kernel) SetSaveSuccess(autosave bool) {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	if k.saveStatus == nil {
+		if autosave {
+			k.saveStatus = errAutoSaved
+		} else {
+			k.saveStatus = errSaved
+		}
+	}
 }
 
 // SetSaveError sets the sandbox error that caused the kernel to exit during
@@ -1475,8 +1526,8 @@ func (k *Kernel) SaveError() error {
 func (k *Kernel) SetSaveError(err error) {
 	k.extMu.Lock()
 	defer k.extMu.Unlock()
-	if k.saveErr == nil {
-		k.saveErr = err
+	if k.saveStatus == nil {
+		k.saveStatus = err
 	}
 }
 
diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go
index ce0db5583..d6fb0fdb8 100644
--- a/pkg/sentry/kernel/pipe/node_test.go
+++ b/pkg/sentry/kernel/pipe/node_test.go
@@ -22,7 +22,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type sleeper struct {
@@ -66,7 +65,8 @@ func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flag
 	d := fs.NewDirent(ctx, inode, "pipe")
 	file, err := n.GetFile(ctx, d, flags)
 	if err != nil {
-		t.Fatalf("open with flags %+v failed: %v", flags, err)
+		t.Errorf("open with flags %+v failed: %v", flags, err)
+		return nil, err
 	}
 	if doneChan != nil {
 		doneChan <- struct{}{}
@@ -85,11 +85,11 @@ func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.
 }
 
 func newNamedPipe(t *testing.T) *Pipe {
-	return NewPipe(true, DefaultPipeSize, usermem.PageSize)
+	return NewPipe(true, DefaultPipeSize)
 }
 
 func newAnonPipe(t *testing.T) *Pipe {
-	return NewPipe(false, DefaultPipeSize, usermem.PageSize)
+	return NewPipe(false, DefaultPipeSize)
 }
 
 // assertRecvBlocks ensures that a recv attempt on c blocks for at least
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 67beb0ad6..b989e14c7 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -26,18 +26,27 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
 const (
 	// MinimumPipeSize is a hard limit of the minimum size of a pipe.
-	MinimumPipeSize = 64 << 10
+	// It corresponds to fs/pipe.c:pipe_min_size.
+	MinimumPipeSize = usermem.PageSize
+
+	// MaximumPipeSize is a hard limit on the maximum size of a pipe.
+	// It corresponds to fs/pipe.c:pipe_max_size.
+	MaximumPipeSize = 1048576
 
 	// DefaultPipeSize is the system-wide default size of a pipe in bytes.
-	DefaultPipeSize = MinimumPipeSize
+	// It corresponds to pipe_fs_i.h:PIPE_DEF_BUFFERS.
+	DefaultPipeSize = 16 * usermem.PageSize
 
-	// MaximumPipeSize is a hard limit on the maximum size of a pipe.
-	MaximumPipeSize = 8 << 20
+	// atomicIOBytes is the maximum number of bytes that the pipe will
+	// guarantee atomic reads or writes atomically.
+	// It corresponds to limits.h:PIPE_BUF.
+	atomicIOBytes = 4096
 )
 
 // Pipe is an encapsulation of a platform-independent pipe.
@@ -53,12 +62,6 @@ type Pipe struct {
 	// This value is immutable.
 	isNamed bool
 
-	// atomicIOBytes is the maximum number of bytes that the pipe will
-	// guarantee atomic reads or writes atomically.
-	//
-	// This value is immutable.
-	atomicIOBytes int64
-
 	// The number of active readers for this pipe.
 	//
 	// Access atomically.
@@ -94,47 +97,34 @@ type Pipe struct {
 
 // NewPipe initializes and returns a pipe.
 //
-// N.B. The size and atomicIOBytes will be bounded.
-func NewPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *Pipe {
+// N.B. The size will be bounded.
+func NewPipe(isNamed bool, sizeBytes int64) *Pipe {
 	if sizeBytes < MinimumPipeSize {
 		sizeBytes = MinimumPipeSize
 	}
 	if sizeBytes > MaximumPipeSize {
 		sizeBytes = MaximumPipeSize
 	}
-	if atomicIOBytes <= 0 {
-		atomicIOBytes = 1
-	}
-	if atomicIOBytes > sizeBytes {
-		atomicIOBytes = sizeBytes
-	}
 	var p Pipe
-	initPipe(&p, isNamed, sizeBytes, atomicIOBytes)
+	initPipe(&p, isNamed, sizeBytes)
 	return &p
 }
 
-func initPipe(pipe *Pipe, isNamed bool, sizeBytes, atomicIOBytes int64) {
+func initPipe(pipe *Pipe, isNamed bool, sizeBytes int64) {
 	if sizeBytes < MinimumPipeSize {
 		sizeBytes = MinimumPipeSize
 	}
 	if sizeBytes > MaximumPipeSize {
 		sizeBytes = MaximumPipeSize
 	}
-	if atomicIOBytes <= 0 {
-		atomicIOBytes = 1
-	}
-	if atomicIOBytes > sizeBytes {
-		atomicIOBytes = sizeBytes
-	}
 	pipe.isNamed = isNamed
 	pipe.max = sizeBytes
-	pipe.atomicIOBytes = atomicIOBytes
 }
 
 // NewConnectedPipe initializes a pipe and returns a pair of objects
 // representing the read and write ends of the pipe.
-func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) {
-	p := NewPipe(false /* isNamed */, sizeBytes, atomicIOBytes)
+func NewConnectedPipe(ctx context.Context, sizeBytes int64) (*fs.File, *fs.File) {
+	p := NewPipe(false /* isNamed */, sizeBytes)
 
 	// Build an fs.Dirent for the pipe which will be shared by both
 	// returned files.
@@ -264,7 +254,7 @@ func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) {
 	wanted := ops.left()
 	avail := p.max - p.view.Size()
 	if wanted > avail {
-		if wanted <= p.atomicIOBytes {
+		if wanted <= atomicIOBytes {
 			return 0, syserror.ErrWouldBlock
 		}
 		ops.limit(avail)
diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go
index fe97e9800..3dd739080 100644
--- a/pkg/sentry/kernel/pipe/pipe_test.go
+++ b/pkg/sentry/kernel/pipe/pipe_test.go
@@ -26,7 +26,7 @@ import (
 
 func TestPipeRW(t *testing.T) {
 	ctx := contexttest.Context(t)
-	r, w := NewConnectedPipe(ctx, 65536, 4096)
+	r, w := NewConnectedPipe(ctx, 65536)
 	defer r.DecRef(ctx)
 	defer w.DecRef(ctx)
 
@@ -46,7 +46,7 @@ func TestPipeRW(t *testing.T) {
 
 func TestPipeReadBlock(t *testing.T) {
 	ctx := contexttest.Context(t)
-	r, w := NewConnectedPipe(ctx, 65536, 4096)
+	r, w := NewConnectedPipe(ctx, 65536)
 	defer r.DecRef(ctx)
 	defer w.DecRef(ctx)
 
@@ -61,7 +61,7 @@ func TestPipeWriteBlock(t *testing.T) {
 	const capacity = MinimumPipeSize
 
 	ctx := contexttest.Context(t)
-	r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes)
+	r, w := NewConnectedPipe(ctx, capacity)
 	defer r.DecRef(ctx)
 	defer w.DecRef(ctx)
 
@@ -76,7 +76,7 @@ func TestPipeWriteUntilEnd(t *testing.T) {
 	const atomicIOBytes = 2
 
 	ctx := contexttest.Context(t)
-	r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes)
+	r, w := NewConnectedPipe(ctx, atomicIOBytes)
 	defer r.DecRef(ctx)
 	defer w.DecRef(ctx)
 
@@ -116,7 +116,8 @@ func TestPipeWriteUntilEnd(t *testing.T) {
 				}
 			}
 			if err != nil {
-				t.Fatalf("Readv: got unexpected error %v", err)
+				t.Errorf("Readv: got unexpected error %v", err)
+				return
 			}
 		}
 	}()
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index 1a152142b..7b23cbe86 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -33,6 +33,8 @@ import (
 
 // VFSPipe represents the actual pipe, analagous to an inode. VFSPipes should
 // not be copied.
+//
+// +stateify savable
 type VFSPipe struct {
 	// mu protects the fields below.
 	mu sync.Mutex `state:"nosave"`
@@ -52,9 +54,9 @@ type VFSPipe struct {
 }
 
 // NewVFSPipe returns an initialized VFSPipe.
-func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe {
+func NewVFSPipe(isNamed bool, sizeBytes int64) *VFSPipe {
 	var vp VFSPipe
-	initPipe(&vp.pipe, isNamed, sizeBytes, atomicIOBytes)
+	initPipe(&vp.pipe, isNamed, sizeBytes)
 	return &vp
 }
 
@@ -164,6 +166,8 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, l
 // VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements
 // non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to
 // other FileDescriptions for splice(2) and tee(2).
+//
+// +stateify savable
 type VFSPipeFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 1145faf13..cef58a590 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -259,8 +259,8 @@ func (t *Task) ptraceTrapLocked(code int32) {
 		Signo: int32(linux.SIGTRAP),
 		Code:  code,
 	}
-	t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t]))
-	t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+	t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t]))
+	t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
 	if t.beginPtraceStopLocked() {
 		tracer := t.Tracer()
 		tracer.signalStop(t, arch.CLD_TRAPPED, int32(linux.SIGTRAP))
@@ -1000,7 +1000,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
 		// at the address specified by the data parameter, and the return value
 		// is the error flag." - ptrace(2)
 		word := t.Arch().Native(0)
-		if _, err := word.CopyIn(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr); err != nil {
+		if _, err := word.CopyIn(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr); err != nil {
 			return err
 		}
 		_, err := word.CopyOut(t, data)
@@ -1008,7 +1008,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
 
 	case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA:
 		word := t.Arch().Native(uintptr(data))
-		_, err := word.CopyOut(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr)
+		_, err := word.CopyOut(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr)
 		return err
 
 	case linux.PTRACE_GETREGSET:
diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go
index 387edfa91..60917e7d3 100644
--- a/pkg/sentry/kernel/seccomp.go
+++ b/pkg/sentry/kernel/seccomp.go
@@ -106,7 +106,7 @@ func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip u
 func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip usermem.Addr) uint32 {
 	data := linux.SeccompData{
 		Nr:                 sysno,
-		Arch:               t.tc.st.AuditNumber,
+		Arch:               t.image.st.AuditNumber,
 		InstructionPointer: uint64(ip),
 	}
 	// data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index c00fa1138..3dd3953b3 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -29,17 +29,17 @@ import (
 )
 
 const (
-	valueMax = 32767 // SEMVMX
+	// Maximum semaphore value.
+	valueMax = linux.SEMVMX
 
-	// semaphoresMax is "maximum number of semaphores per semaphore ID" (SEMMSL).
-	semaphoresMax = 32000
+	// Maximum number of semaphore sets.
+	setsMax = linux.SEMMNI
 
-	// setMax is "system-wide limit on the number of semaphore sets" (SEMMNI).
-	setsMax = 32000
+	// Maximum number of semaphroes in a semaphore set.
+	semsMax = linux.SEMMSL
 
-	// semaphoresTotalMax is "system-wide limit on the number of semaphores"
-	// (SEMMNS = SEMMNI*SEMMSL).
-	semaphoresTotalMax = 1024000000
+	// Maximum number of semaphores in all semaphroe sets.
+	semsTotalMax = linux.SEMMNS
 )
 
 // Registry maintains a set of semaphores that can be found by key or ID.
@@ -52,6 +52,9 @@ type Registry struct {
 	mu         sync.Mutex `state:"nosave"`
 	semaphores map[int32]*Set
 	lastIDUsed int32
+	// indexes maintains a mapping between a set's index in virtual array and
+	// its identifier.
+	indexes map[int32]int32
 }
 
 // Set represents a set of semaphores that can be operated atomically.
@@ -103,6 +106,7 @@ type waiter struct {
 	waiterEntry
 
 	// value represents how much resource the waiter needs to wake up.
+	// The value is either 0 or negative.
 	value int16
 	ch    chan struct{}
 }
@@ -112,6 +116,7 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry {
 	return &Registry{
 		userNS:     userNS,
 		semaphores: make(map[int32]*Set),
+		indexes:    make(map[int32]int32),
 	}
 }
 
@@ -121,7 +126,7 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry {
 // be found. If exclusive is true, it fails if a set with the same key already
 // exists.
 func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) {
-	if nsems < 0 || nsems > semaphoresMax {
+	if nsems < 0 || nsems > semsMax {
 		return nil, syserror.EINVAL
 	}
 
@@ -162,10 +167,13 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu
 	}
 
 	// Apply system limits.
+	//
+	// Map semaphores and map indexes in a registry are of the same size,
+	// check map semaphores only here for the system limit.
 	if len(r.semaphores) >= setsMax {
 		return nil, syserror.EINVAL
 	}
-	if r.totalSems() > int(semaphoresTotalMax-nsems) {
+	if r.totalSems() > int(semsTotalMax-nsems) {
 		return nil, syserror.EINVAL
 	}
 
@@ -175,6 +183,39 @@ func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linu
 	return r.newSet(ctx, key, owner, owner, perms, nsems)
 }
 
+// IPCInfo returns information about system-wide semaphore limits and parameters.
+func (r *Registry) IPCInfo() *linux.SemInfo {
+	return &linux.SemInfo{
+		SemMap: linux.SEMMAP,
+		SemMni: linux.SEMMNI,
+		SemMns: linux.SEMMNS,
+		SemMnu: linux.SEMMNU,
+		SemMsl: linux.SEMMSL,
+		SemOpm: linux.SEMOPM,
+		SemUme: linux.SEMUME,
+		SemUsz: 0, // SemUsz not supported.
+		SemVmx: linux.SEMVMX,
+		SemAem: linux.SEMAEM,
+	}
+}
+
+// HighestIndex returns the index of the highest used entry in
+// the kernel's array.
+func (r *Registry) HighestIndex() int32 {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	// By default, highest used index is 0 even though
+	// there is no semaphroe set.
+	var highestIndex int32
+	for index := range r.indexes {
+		if index > highestIndex {
+			highestIndex = index
+		}
+	}
+	return highestIndex
+}
+
 // RemoveID removes set with give 'id' from the registry and marks the set as
 // dead. All waiters will be awakened and fail.
 func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error {
@@ -185,6 +226,11 @@ func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error {
 	if set == nil {
 		return syserror.EINVAL
 	}
+	index, found := r.findIndexByID(id)
+	if !found {
+		// Inconsistent state.
+		panic(fmt.Sprintf("unable to find an index for ID: %d", id))
+	}
 
 	set.mu.Lock()
 	defer set.mu.Unlock()
@@ -196,6 +242,7 @@ func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error {
 	}
 
 	delete(r.semaphores, set.ID)
+	delete(r.indexes, index)
 	set.destroy()
 	return nil
 }
@@ -219,6 +266,11 @@ func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.File
 			continue
 		}
 		if r.semaphores[id] == nil {
+			index, found := r.findFirstAvailableIndex()
+			if !found {
+				panic("unable to find an available index")
+			}
+			r.indexes[index] = id
 			r.lastIDUsed = id
 			r.semaphores[id] = set
 			set.ID = id
@@ -246,6 +298,24 @@ func (r *Registry) findByKey(key int32) *Set {
 	return nil
 }
 
+func (r *Registry) findIndexByID(id int32) (int32, bool) {
+	for k, v := range r.indexes {
+		if v == id {
+			return k, true
+		}
+	}
+	return 0, false
+}
+
+func (r *Registry) findFirstAvailableIndex() (int32, bool) {
+	for index := int32(0); index < setsMax; index++ {
+		if _, present := r.indexes[index]; !present {
+			return index, true
+		}
+	}
+	return 0, false
+}
+
 func (r *Registry) totalSems() int {
 	totalSems := 0
 	for _, v := range r.semaphores {
@@ -283,6 +353,33 @@ func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.File
 	return nil
 }
 
+// GetStat extracts semid_ds information from the set.
+func (s *Set) GetStat(creds *auth.Credentials) (*linux.SemidDS, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// "The calling process must have read permission on the semaphore set."
+	if !s.checkPerms(creds, fs.PermMask{Read: true}) {
+		return nil, syserror.EACCES
+	}
+
+	ds := &linux.SemidDS{
+		SemPerm: linux.IPCPerm{
+			Key:  uint32(s.key),
+			UID:  uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)),
+			GID:  uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)),
+			CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)),
+			CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)),
+			Mode: uint16(s.perms.LinuxMode()),
+			Seq:  0, // IPC sequence not supported.
+		},
+		SemOTime: s.opTime.TimeT(),
+		SemCTime: s.changeTime.TimeT(),
+		SemNSems: uint64(s.Size()),
+	}
+	return ds, nil
+}
+
 // SetVal overrides a semaphore value, waking up waiters as needed.
 func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error {
 	if val < 0 || val > valueMax {
@@ -320,7 +417,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti
 	}
 
 	for _, val := range vals {
-		if val < 0 || val > valueMax {
+		if val > valueMax {
 			return syserror.ERANGE
 		}
 	}
@@ -396,6 +493,42 @@ func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) {
 	return sem.pid, nil
 }
 
+func (s *Set) countWaiters(num int32, creds *auth.Credentials, pred func(w *waiter) bool) (uint16, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// The calling process must have read permission on the semaphore set.
+	if !s.checkPerms(creds, fs.PermMask{Read: true}) {
+		return 0, syserror.EACCES
+	}
+
+	sem := s.findSem(num)
+	if sem == nil {
+		return 0, syserror.ERANGE
+	}
+	var cnt uint16
+	for w := sem.waiters.Front(); w != nil; w = w.Next() {
+		if pred(w) {
+			cnt++
+		}
+	}
+	return cnt, nil
+}
+
+// CountZeroWaiters returns number of waiters waiting for the sem's value to increase.
+func (s *Set) CountZeroWaiters(num int32, creds *auth.Credentials) (uint16, error) {
+	return s.countWaiters(num, creds, func(w *waiter) bool {
+		return w.value == 0
+	})
+}
+
+// CountNegativeWaiters returns number of waiters waiting for the sem to go to zero.
+func (s *Set) CountNegativeWaiters(num int32, creds *auth.Credentials) (uint16, error) {
+	return s.countWaiters(num, creds, func(w *waiter) bool {
+		return w.value < 0
+	})
+}
+
 // ExecuteOps attempts to execute a list of operations to the set. It only
 // succeeds when all operations can be applied. No changes are made if it fails.
 //
@@ -548,11 +681,18 @@ func (s *Set) destroy() {
 	}
 }
 
+func abs(val int16) int16 {
+	if val < 0 {
+		return -val
+	}
+	return val
+}
+
 // wakeWaiters goes over all waiters and checks which of them can be notified.
 func (s *sem) wakeWaiters() {
 	// Note that this will release all waiters waiting for 0 too.
 	for w := s.waiters.Front(); w != nil; {
-		if s.value < w.value {
+		if s.value < abs(w.value) {
 			// Still blocked, skip it.
 			w = w.Next()
 			continue
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index df5c8421b..0cd9e2533 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -295,7 +295,7 @@ func (tg *ThreadGroup) createSession() error {
 		id:     SessionID(id),
 		leader: tg,
 	}
-	s.EnableLeakCheck()
+	s.InitRefs()
 
 	// Create a new ProcessGroup, belonging to that Session.
 	// This also has a single reference (assigned below).
@@ -309,7 +309,7 @@ func (tg *ThreadGroup) createSession() error {
 		session:    s,
 		ancestors:  0,
 	}
-	pg.refs.EnableLeakCheck()
+	pg.refs.InitRefs()
 
 	// Tie them and return the result.
 	s.processGroups.PushBack(pg)
@@ -395,7 +395,7 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
 		originator: tg,
 		session:    tg.processGroup.session,
 	}
-	pg.refs.EnableLeakCheck()
+	pg.refs.InitRefs()
 
 	if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session {
 		pg.ancestors++
@@ -477,20 +477,20 @@ func (tg *ThreadGroup) Session() *Session {
 //
 // If this group isn't visible in this namespace, zero will be returned. It is
 // the callers responsibility to check that before using this function.
-func (pidns *PIDNamespace) IDOfSession(s *Session) SessionID {
-	pidns.owner.mu.RLock()
-	defer pidns.owner.mu.RUnlock()
-	return pidns.sids[s]
+func (ns *PIDNamespace) IDOfSession(s *Session) SessionID {
+	ns.owner.mu.RLock()
+	defer ns.owner.mu.RUnlock()
+	return ns.sids[s]
 }
 
 // SessionWithID returns the Session with the given ID in the PID namespace ns,
 // or nil if that given ID is not defined in this namespace.
 //
 // A reference is not taken on the session.
-func (pidns *PIDNamespace) SessionWithID(id SessionID) *Session {
-	pidns.owner.mu.RLock()
-	defer pidns.owner.mu.RUnlock()
-	return pidns.sessions[id]
+func (ns *PIDNamespace) SessionWithID(id SessionID) *Session {
+	ns.owner.mu.RLock()
+	defer ns.owner.mu.RUnlock()
+	return ns.sessions[id]
 }
 
 // ProcessGroup returns the ThreadGroup's ProcessGroup.
@@ -505,18 +505,18 @@ func (tg *ThreadGroup) ProcessGroup() *ProcessGroup {
 // IDOfProcessGroup returns the process group assigned to pg in PID namespace ns.
 //
 // The same constraints apply as IDOfSession.
-func (pidns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID {
-	pidns.owner.mu.RLock()
-	defer pidns.owner.mu.RUnlock()
-	return pidns.pgids[pg]
+func (ns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID {
+	ns.owner.mu.RLock()
+	defer ns.owner.mu.RUnlock()
+	return ns.pgids[pg]
 }
 
 // ProcessGroupWithID returns the ProcessGroup with the given ID in the PID
 // namespace ns, or nil if that given ID is not defined in this namespace.
 //
 // A reference is not taken on the process group.
-func (pidns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup {
-	pidns.owner.mu.RLock()
-	defer pidns.owner.mu.RUnlock()
-	return pidns.processGroups[id]
+func (ns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup {
+	ns.owner.mu.RLock()
+	defer ns.owner.mu.RUnlock()
+	return ns.processGroups[id]
 }
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index f8a382fd8..073e14507 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -6,9 +6,12 @@ package(licenses = ["notice"])
 go_template_instance(
     name = "shm_refs",
     out = "shm_refs.go",
+    consts = {
+        "enableLogging": "true",
+    },
     package = "shm",
     prefix = "Shm",
-    template = "//pkg/refs_vfs2:refs_template",
+    template = "//pkg/refsvfs2:refs_template",
     types = {
         "T": "Shm",
     },
@@ -27,7 +30,7 @@ go_library(
         "//pkg/context",
         "//pkg/log",
         "//pkg/refs",
-        "//pkg/refs_vfs2",
+        "//pkg/refsvfs2",
         "//pkg/sentry/device",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index ebbebf46b..92d60ba78 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -251,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi
 		creatorPID:    pid,
 		changeTime:    ktime.NowFromContext(ctx),
 	}
-	shm.EnableLeakCheck()
+	shm.InitRefs()
 
 	// Find the next available ID.
 	for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
diff --git a/pkg/sentry/kernel/signal.go b/pkg/sentry/kernel/signal.go
index e8cce37d0..2488ae7d5 100644
--- a/pkg/sentry/kernel/signal.go
+++ b/pkg/sentry/kernel/signal.go
@@ -73,7 +73,7 @@ func SignalInfoNoInfo(sig linux.Signal, sender, receiver *Task) *arch.SignalInfo
 		Signo: int32(sig),
 		Code:  arch.SignalInfoUser,
 	}
-	info.SetPid(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg)))
-	info.SetUid(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
+	info.SetPID(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg)))
+	info.SetUID(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
 	return info
 }
diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go
index 78f718cfe..884966120 100644
--- a/pkg/sentry/kernel/signalfd/signalfd.go
+++ b/pkg/sentry/kernel/signalfd/signalfd.go
@@ -106,8 +106,8 @@ func (s *SignalOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
 		Signo:   uint32(info.Signo),
 		Errno:   info.Errno,
 		Code:    info.Code,
-		PID:     uint32(info.Pid()),
-		UID:     uint32(info.Uid()),
+		PID:     uint32(info.PID()),
+		UID:     uint32(info.UID()),
 		Status:  info.Status(),
 		Overrun: uint32(info.Overrun()),
 		Addr:    info.Addr(),
diff --git a/pkg/sentry/kernel/syscalls_state.go b/pkg/sentry/kernel/syscalls_state.go
index 90f890495..0b17a562e 100644
--- a/pkg/sentry/kernel/syscalls_state.go
+++ b/pkg/sentry/kernel/syscalls_state.go
@@ -30,18 +30,18 @@ type syscallTableInfo struct {
 }
 
 // saveSt saves the SyscallTable.
-func (tc *TaskContext) saveSt() syscallTableInfo {
+func (image *TaskImage) saveSt() syscallTableInfo {
 	return syscallTableInfo{
-		OS:   tc.st.OS,
-		Arch: tc.st.Arch,
+		OS:   image.st.OS,
+		Arch: image.st.Arch,
 	}
 }
 
 // loadSt loads the SyscallTable.
-func (tc *TaskContext) loadSt(sti syscallTableInfo) {
+func (image *TaskImage) loadSt(sti syscallTableInfo) {
 	st, ok := LookupSyscallTable(sti.OS, sti.Arch)
 	if !ok {
 		panic(fmt.Sprintf("syscall table not found for OS %v, Arch %v", sti.OS, sti.Arch))
 	}
-	tc.st = st // Save the table reference.
+	image.st = st // Save the table reference.
 }
diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go
index a83ce219c..3fee7aa68 100644
--- a/pkg/sentry/kernel/syslog.go
+++ b/pkg/sentry/kernel/syslog.go
@@ -75,6 +75,12 @@ func (s *syslog) Log() []byte {
 		"Checking naughty and nice process list...", // Check it up to twice.
 		"Granting licence to kill(2)...",            // British spelling for British movie.
 		"Letting the watchdogs out...",
+		"Conjuring /dev/null black hole...",
+		"Adversarially training Redcode AI...",
+		"Singleplexing /dev/ptmx...",
+		"Recruiting cron-ies...",
+		"Verifying that no non-zero bytes made their way into /dev/zero...",
+		"Accelerating teletypewriter to 9600 baud...",
 	}
 
 	selectMessage := func() string {
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 037971393..c0ab53c94 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -21,7 +21,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/bpf"
-	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
@@ -29,11 +28,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
-	"gvisor.dev/gvisor/pkg/sentry/limits"
-	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/platform"
-	"gvisor.dev/gvisor/pkg/sentry/unimpl"
-	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -63,6 +58,12 @@ import (
 type Task struct {
 	taskNode
 
+	// goid is the task goroutine's ID. goid is owned by the task goroutine,
+	// but since it's used to detect cases where non-task goroutines
+	// incorrectly access state owned by, or exclusive to, the task goroutine,
+	// goid is always accessed using atomic memory operations.
+	goid int64 `state:"nosave"`
+
 	// runState is what the task goroutine is executing if it is not stopped.
 	// If runState is nil, the task goroutine should exit or has exited.
 	// runState is exclusive to the task goroutine.
@@ -83,7 +84,7 @@ type Task struct {
 	// taskWork is exclusive to the task goroutine.
 	taskWork []TaskWorker
 
-	// haveSyscallReturn is true if tc.Arch().Return() represents a value
+	// haveSyscallReturn is true if image.Arch().Return() represents a value
 	// returned by a syscall (or set by ptrace after a syscall).
 	//
 	// haveSyscallReturn is exclusive to the task goroutine.
@@ -257,10 +258,10 @@ type Task struct {
 	// mu protects some of the following fields.
 	mu sync.Mutex `state:"nosave"`
 
-	// tc holds task data provided by the ELF loader.
+	// image holds task data provided by the ELF loader.
 	//
-	// tc is protected by mu, and is owned by the task goroutine.
-	tc TaskContext
+	// image is protected by mu, and is owned by the task goroutine.
+	image TaskImage
 
 	// fsContext is the task's filesystem context.
 	//
@@ -274,7 +275,7 @@ type Task struct {
 
 	// If vforkParent is not nil, it is the task that created this task with
 	// vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when
-	// this TaskContext is released.
+	// this TaskImage is released.
 	//
 	// vforkParent is protected by the TaskSet mutex.
 	vforkParent *Task
@@ -641,64 +642,6 @@ func (t *Task) Kernel() *Kernel {
 	return t.k
 }
 
-// Value implements context.Context.Value.
-//
-// Preconditions: The caller must be running on the task goroutine (as implied
-// by the requirements of context.Context).
-func (t *Task) Value(key interface{}) interface{} {
-	switch key {
-	case CtxCanTrace:
-		return t.CanTrace
-	case CtxKernel:
-		return t.k
-	case CtxPIDNamespace:
-		return t.tg.pidns
-	case CtxUTSNamespace:
-		return t.utsns
-	case CtxIPCNamespace:
-		ipcns := t.IPCNamespace()
-		ipcns.IncRef()
-		return ipcns
-	case CtxTask:
-		return t
-	case auth.CtxCredentials:
-		return t.Credentials()
-	case context.CtxThreadGroupID:
-		return int32(t.ThreadGroup().ID())
-	case fs.CtxRoot:
-		return t.fsContext.RootDirectory()
-	case vfs.CtxRoot:
-		return t.fsContext.RootDirectoryVFS2()
-	case vfs.CtxMountNamespace:
-		t.mountNamespaceVFS2.IncRef()
-		return t.mountNamespaceVFS2
-	case fs.CtxDirentCacheLimiter:
-		return t.k.DirentCacheLimiter
-	case inet.CtxStack:
-		return t.NetworkContext()
-	case ktime.CtxRealtimeClock:
-		return t.k.RealtimeClock()
-	case limits.CtxLimits:
-		return t.tg.limits
-	case pgalloc.CtxMemoryFile:
-		return t.k.mf
-	case pgalloc.CtxMemoryFileProvider:
-		return t.k
-	case platform.CtxPlatform:
-		return t.k
-	case uniqueid.CtxGlobalUniqueID:
-		return t.k.UniqueID()
-	case uniqueid.CtxGlobalUniqueIDProvider:
-		return t.k
-	case uniqueid.CtxInotifyCookie:
-		return t.k.GenerateInotifyCookie()
-	case unimpl.CtxEvents:
-		return t.k
-	default:
-		return nil
-	}
-}
-
 // SetClearTID sets t's cleartid.
 //
 // Preconditions: The caller must be running on the task goroutine.
@@ -751,12 +694,12 @@ func (t *Task) IsChrooted() bool {
 	return root != realRoot
 }
 
-// TaskContext returns t's TaskContext.
+// TaskImage returns t's TaskImage.
 //
 // Precondition: The caller must be running on the task goroutine, or t.mu must
 // be locked.
-func (t *Task) TaskContext() *TaskContext {
-	return &t.tc
+func (t *Task) TaskImage() *TaskImage {
+	return &t.image
 }
 
 // FSContext returns t's FSContext. FSContext does not take an additional
diff --git a/pkg/sentry/kernel/task_acct.go b/pkg/sentry/kernel/task_acct.go
index 5f3e60fe8..e574997f7 100644
--- a/pkg/sentry/kernel/task_acct.go
+++ b/pkg/sentry/kernel/task_acct.go
@@ -136,14 +136,14 @@ func (tg *ThreadGroup) IOUsage() *usage.IO {
 func (t *Task) Name() string {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	return t.tc.Name
+	return t.image.Name
 }
 
 // SetName changes t's name.
 func (t *Task) SetName(name string) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
-	t.tc.Name = name
+	t.image.Name = name
 	t.Debugf("Set thread name to %q", name)
 }
 
diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go
index 4a4a69ee2..9419f2e95 100644
--- a/pkg/sentry/kernel/task_block.go
+++ b/pkg/sentry/kernel/task_block.go
@@ -20,6 +20,7 @@ import (
 	"time"
 
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -32,6 +33,8 @@ import (
 //
 // - An error which is nil if an event is received from C, ETIMEDOUT if the timeout
 // expired, and syserror.ErrInterrupted if t is interrupted.
+//
+// Preconditions: The caller must be running on the task goroutine.
 func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) {
 	if !haveTimeout {
 		return timeout, t.block(C, nil)
@@ -112,7 +115,14 @@ func (t *Task) Block(C <-chan struct{}) error {
 
 // block blocks a task on one of many events.
 // N.B. defer is too expensive to be used here.
+//
+// Preconditions: The caller must be running on the task goroutine.
 func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error {
+	// This function is very hot; skip this check outside of +race builds.
+	if sync.RaceEnabled {
+		t.assertTaskGoroutine()
+	}
+
 	// Fast path if the request is already done.
 	select {
 	case <-C:
@@ -156,33 +166,39 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error {
 	}
 }
 
-// SleepStart implements amutex.Sleeper.SleepStart.
+// SleepStart implements context.ChannelSleeper.SleepStart.
 func (t *Task) SleepStart() <-chan struct{} {
+	t.assertTaskGoroutine()
 	t.Deactivate()
 	t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible)
 	return t.interruptChan
 }
 
-// SleepFinish implements amutex.Sleeper.SleepFinish.
+// SleepFinish implements context.ChannelSleeper.SleepFinish.
 func (t *Task) SleepFinish(success bool) {
 	if !success {
-		// The interrupted notification is consumed only at the top-level
-		// (Run). Therefore we attempt to reset the pending notification.
-		// This will also elide our next entry back into the task, so we
-		// will process signals, state changes, etc.
+		// Our caller received from t.interruptChan; we need to re-send to it
+		// to ensure that t.interrupted() is still true.
 		t.interruptSelf()
 	}
 	t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible)
 	t.Activate()
 }
 
-// Interrupted implements amutex.Sleeper.Interrupted
+// Interrupted implements context.ChannelSleeper.Interrupted.
 func (t *Task) Interrupted() bool {
-	return len(t.interruptChan) != 0
+	if t.interrupted() {
+		return true
+	}
+	// Indicate that t's task goroutine is still responsive (i.e. reset the
+	// watchdog timer).
+	t.accountTaskGoroutineRunning()
+	return false
 }
 
 // UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart.
 func (t *Task) UninterruptibleSleepStart(deactivate bool) {
+	t.assertTaskGoroutine()
 	if deactivate {
 		t.Deactivate()
 	}
@@ -198,13 +214,17 @@ func (t *Task) UninterruptibleSleepFinish(activate bool) {
 }
 
 // interrupted returns true if interrupt or interruptSelf has been called at
-// least once since the last call to interrupted.
+// least once since the last call to unsetInterrupted.
 func (t *Task) interrupted() bool {
+	return len(t.interruptChan) != 0
+}
+
+// unsetInterrupted causes interrupted to return false until the next call to
+// interrupt or interruptSelf.
+func (t *Task) unsetInterrupted() {
 	select {
 	case <-t.interruptChan:
-		return true
 	default:
-		return false
 	}
 }
 
@@ -220,9 +240,7 @@ func (t *Task) interrupt() {
 func (t *Task) interruptSelf() {
 	select {
 	case t.interruptChan <- struct{}{}:
-		t.Debugf("Interrupt queued")
 	default:
-		t.Debugf("Dropping duplicate interrupt")
 	}
 	// platform.Context.Interrupt() is unnecessary since a task goroutine
 	// calling interruptSelf() cannot also be blocked in
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 682080c14..f305e69c0 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -115,7 +115,7 @@ type CloneOptions struct {
 	ParentTID    usermem.Addr
 
 	// If Vfork is true, place the parent in vforkStop until the cloned task
-	// releases its TaskContext.
+	// releases its TaskImage.
 	Vfork bool
 
 	// If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for
@@ -226,20 +226,20 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		})
 	}
 
-	tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace)
+	image, err := t.image.Fork(t, t.k, !opts.NewAddressSpace)
 	if err != nil {
 		return 0, nil, err
 	}
 	cu.Add(func() {
-		tc.release()
+		image.release()
 	})
 	// clone() returns 0 in the child.
-	tc.Arch.SetReturn(0)
+	image.Arch.SetReturn(0)
 	if opts.Stack != 0 {
-		tc.Arch.SetStack(uintptr(opts.Stack))
+		image.Arch.SetStack(uintptr(opts.Stack))
 	}
 	if opts.SetTLS {
-		if !tc.Arch.SetTLS(uintptr(opts.TLS)) {
+		if !image.Arch.SetTLS(uintptr(opts.TLS)) {
 			return 0, nil, syserror.EPERM
 		}
 	}
@@ -288,7 +288,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		Kernel:                  t.k,
 		ThreadGroup:             tg,
 		SignalMask:              t.SignalMask(),
-		TaskContext:             tc,
+		TaskImage:               image,
 		FSContext:               fsContext,
 		FDTable:                 fdTable,
 		Credentials:             creds,
@@ -355,7 +355,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	}
 	if opts.ChildSetTID {
 		ctid := nt.ThreadID()
-		ctid.CopyOut(nt.AsCopyContext(usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID)
+		ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID)
 	}
 	ntid := t.tg.pidns.IDOfTask(nt)
 	if opts.ParentSetTID {
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index d1136461a..70b0699dc 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,159 +15,175 @@
 package kernel
 
 import (
-	"fmt"
+	"time"
 
-	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/sentry/arch"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
-	"gvisor.dev/gvisor/pkg/sentry/loader"
-	"gvisor.dev/gvisor/pkg/sentry/mm"
-	"gvisor.dev/gvisor/pkg/syserr"
-	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/unimpl"
+	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
-var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC)
-
-// Auxmap contains miscellaneous data for the task.
-type Auxmap map[string]interface{}
-
-// TaskContext is the subset of a task's data that is provided by the loader.
-//
-// +stateify savable
-type TaskContext struct {
-	// Name is the thread name set by the prctl(PR_SET_NAME) system call.
-	Name string
-
-	// Arch is the architecture-specific context (registers, etc.)
-	Arch arch.Context
-
-	// MemoryManager is the task's address space.
-	MemoryManager *mm.MemoryManager
+// Deadline implements context.Context.Deadline.
+func (t *Task) Deadline() (time.Time, bool) {
+	return time.Time{}, false
+}
 
-	// fu implements futexes in the address space.
-	fu *futex.Manager
+// Done implements context.Context.Done.
+func (t *Task) Done() <-chan struct{} {
+	return nil
+}
 
-	// st is the task's syscall table.
-	st *SyscallTable `state:".(syscallTableInfo)"`
+// Err implements context.Context.Err.
+func (t *Task) Err() error {
+	return nil
 }
 
-// release releases all resources held by the TaskContext. release is called by
-// the task when it execs into a new TaskContext or exits.
-func (tc *TaskContext) release() {
-	// Nil out pointers so that if the task is saved after release, it doesn't
-	// follow the pointers to possibly now-invalid objects.
-	if tc.MemoryManager != nil {
-		tc.MemoryManager.DecUsers(context.Background())
-		tc.MemoryManager = nil
+// Value implements context.Context.Value.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) Value(key interface{}) interface{} {
+	// This function is very hot; skip this check outside of +race builds.
+	if sync.RaceEnabled {
+		t.assertTaskGoroutine()
 	}
-	tc.fu = nil
+	return t.contextValue(key, true /* isTaskGoroutine */)
 }
 
-// Fork returns a duplicate of tc. The copied TaskContext always has an
-// independent arch.Context. If shareAddressSpace is true, the copied
-// TaskContext shares an address space with the original; otherwise, the copied
-// TaskContext has an independent address space that is initially a duplicate
-// of the original's.
-func (tc *TaskContext) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskContext, error) {
-	newTC := &TaskContext{
-		Name: tc.Name,
-		Arch: tc.Arch.Fork(),
-		st:   tc.st,
-	}
-	if shareAddressSpace {
-		newTC.MemoryManager = tc.MemoryManager
-		if newTC.MemoryManager != nil {
-			if !newTC.MemoryManager.IncUsers() {
-				// Shouldn't be possible since tc.MemoryManager should be a
-				// counted user.
-				panic(fmt.Sprintf("TaskContext.Fork called with userless TaskContext.MemoryManager"))
-			}
+func (t *Task) contextValue(key interface{}, isTaskGoroutine bool) interface{} {
+	switch key {
+	case CtxCanTrace:
+		return t.CanTrace
+	case CtxKernel:
+		return t.k
+	case CtxPIDNamespace:
+		return t.tg.pidns
+	case CtxUTSNamespace:
+		if !isTaskGoroutine {
+			t.mu.Lock()
+			defer t.mu.Unlock()
+		}
+		return t.utsns
+	case CtxIPCNamespace:
+		if !isTaskGoroutine {
+			t.mu.Lock()
+			defer t.mu.Unlock()
+		}
+		ipcns := t.ipcns
+		ipcns.IncRef()
+		return ipcns
+	case CtxTask:
+		return t
+	case auth.CtxCredentials:
+		return t.creds.Load()
+	case context.CtxThreadGroupID:
+		return int32(t.tg.ID())
+	case fs.CtxRoot:
+		if !isTaskGoroutine {
+			t.mu.Lock()
+			defer t.mu.Unlock()
+		}
+		return t.fsContext.RootDirectory()
+	case vfs.CtxRoot:
+		if !isTaskGoroutine {
+			t.mu.Lock()
+			defer t.mu.Unlock()
 		}
-		newTC.fu = tc.fu
-	} else {
-		newMM, err := tc.MemoryManager.Fork(ctx)
-		if err != nil {
-			return nil, err
+		return t.fsContext.RootDirectoryVFS2()
+	case vfs.CtxMountNamespace:
+		if !isTaskGoroutine {
+			t.mu.Lock()
+			defer t.mu.Unlock()
 		}
-		newTC.MemoryManager = newMM
-		newTC.fu = k.futexes.Fork()
+		t.mountNamespaceVFS2.IncRef()
+		return t.mountNamespaceVFS2
+	case fs.CtxDirentCacheLimiter:
+		return t.k.DirentCacheLimiter
+	case inet.CtxStack:
+		return t.NetworkContext()
+	case ktime.CtxRealtimeClock:
+		return t.k.RealtimeClock()
+	case limits.CtxLimits:
+		return t.tg.limits
+	case pgalloc.CtxMemoryFile:
+		return t.k.mf
+	case pgalloc.CtxMemoryFileProvider:
+		return t.k
+	case platform.CtxPlatform:
+		return t.k
+	case uniqueid.CtxGlobalUniqueID:
+		return t.k.UniqueID()
+	case uniqueid.CtxGlobalUniqueIDProvider:
+		return t.k
+	case uniqueid.CtxInotifyCookie:
+		return t.k.GenerateInotifyCookie()
+	case unimpl.CtxEvents:
+		return t.k
+	default:
+		return nil
 	}
-	return newTC, nil
 }
 
-// Arch returns t's arch.Context.
-//
-// Preconditions: The caller must be running on the task goroutine, or t.mu
-// must be locked.
-func (t *Task) Arch() arch.Context {
-	return t.tc.Arch
+// taskAsyncContext implements context.Context for a goroutine that performs
+// work on behalf of a Task, but is not the task goroutine.
+type taskAsyncContext struct {
+	context.NoopSleeper
+
+	t *Task
 }
 
-// MemoryManager returns t's MemoryManager. MemoryManager does not take an
-// additional reference on the returned MM.
-//
-// Preconditions: The caller must be running on the task goroutine, or t.mu
-// must be locked.
-func (t *Task) MemoryManager() *mm.MemoryManager {
-	return t.tc.MemoryManager
+// AsyncContext returns a context.Context representing t. The returned
+// context.Context is intended for use by goroutines other than t's task
+// goroutine; for example, signal delivery to t will not interrupt goroutines
+// that are blocking using the returned context.Context.
+func (t *Task) AsyncContext() context.Context {
+	return taskAsyncContext{t: t}
 }
 
-// SyscallTable returns t's syscall table.
-//
-// Preconditions: The caller must be running on the task goroutine, or t.mu
-// must be locked.
-func (t *Task) SyscallTable() *SyscallTable {
-	return t.tc.st
+// Debugf implements log.Logger.Debugf.
+func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) {
+	ctx.t.Debugf(format, v...)
 }
 
-// Stack returns the userspace stack.
-//
-// Preconditions: The caller must be running on the task goroutine, or t.mu
-// must be locked.
-func (t *Task) Stack() *arch.Stack {
-	return &arch.Stack{
-		Arch:   t.Arch(),
-		IO:     t.MemoryManager(),
-		Bottom: usermem.Addr(t.Arch().Stack()),
-	}
+// Infof implements log.Logger.Infof.
+func (ctx taskAsyncContext) Infof(format string, v ...interface{}) {
+	ctx.t.Infof(format, v...)
 }
 
-// LoadTaskImage loads a specified file into a new TaskContext.
-//
-// args.MemoryManager does not need to be set by the caller.
-func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) {
-	// If File is not nil, we should load that instead of resolving Filename.
-	if args.File != nil {
-		args.Filename = args.File.PathnameWithDeleted(ctx)
-	}
+// Warningf implements log.Logger.Warningf.
+func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) {
+	ctx.t.Warningf(format, v...)
+}
+
+// IsLogging implements log.Logger.IsLogging.
+func (ctx taskAsyncContext) IsLogging(level log.Level) bool {
+	return ctx.t.IsLogging(level)
+}
 
-	// Prepare a new user address space to load into.
-	m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation)
-	defer m.DecUsers(ctx)
-	args.MemoryManager = m
+// Deadline implements context.Context.Deadline.
+func (ctx taskAsyncContext) Deadline() (time.Time, bool) {
+	return time.Time{}, false
+}
 
-	os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso)
-	if err != nil {
-		return nil, err
-	}
+// Done implements context.Context.Done.
+func (ctx taskAsyncContext) Done() <-chan struct{} {
+	return nil
+}
 
-	// Lookup our new syscall table.
-	st, ok := LookupSyscallTable(os, ac.Arch())
-	if !ok {
-		// No syscall table found. This means that the ELF binary does not match
-		// the architecture.
-		return nil, errNoSyscalls
-	}
+// Err implements context.Context.Err.
+func (ctx taskAsyncContext) Err() error {
+	return nil
+}
 
-	if !m.IncUsers() {
-		panic("Failed to increment users count on new MM")
-	}
-	return &TaskContext{
-		Name:          name,
-		Arch:          ac,
-		MemoryManager: m,
-		fu:            k.futexes.Fork(),
-		st:            st,
-	}, nil
+// Value implements context.Context.Value.
+func (ctx taskAsyncContext) Value(key interface{}) interface{} {
+	return ctx.t.contextValue(key, false /* isTaskGoroutine */)
 }
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 412d471d3..d9897e802 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -83,11 +83,12 @@ type execStop struct{}
 func (*execStop) Killable() bool { return true }
 
 // Execve implements the execve(2) syscall by killing all other tasks in its
-// thread group and switching to newTC. Execve always takes ownership of newTC.
+// thread group and switching to newImage. Execve always takes ownership of
+// newImage.
 //
 // Preconditions: The caller must be running Task.doSyscallInvoke on the task
 // goroutine.
-func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) {
+func (t *Task) Execve(newImage *TaskImage) (*SyscallControl, error) {
 	t.tg.pidns.owner.mu.Lock()
 	defer t.tg.pidns.owner.mu.Unlock()
 	t.tg.signalHandlers.mu.Lock()
@@ -96,7 +97,7 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) {
 	if t.tg.exiting || t.tg.execing != nil {
 		// We lost to a racing group-exit, kill, or exec from another thread
 		// and should just exit.
-		newTC.release()
+		newImage.release()
 		return nil, syserror.EINTR
 	}
 
@@ -118,7 +119,7 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) {
 		t.beginInternalStopLocked((*execStop)(nil))
 	}
 
-	return &SyscallControl{next: &runSyscallAfterExecStop{newTC}, ignoreReturn: true}, nil
+	return &SyscallControl{next: &runSyscallAfterExecStop{newImage}, ignoreReturn: true}, nil
 }
 
 // The runSyscallAfterExecStop state continues execve(2) after all siblings of
@@ -126,16 +127,16 @@ func (t *Task) Execve(newTC *TaskContext) (*SyscallControl, error) {
 //
 // +stateify savable
 type runSyscallAfterExecStop struct {
-	tc *TaskContext
+	image *TaskImage
 }
 
 func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
-	t.traceExecEvent(r.tc)
+	t.traceExecEvent(r.image)
 	t.tg.pidns.owner.mu.Lock()
 	t.tg.execing = nil
 	if t.killed() {
 		t.tg.pidns.owner.mu.Unlock()
-		r.tc.release()
+		r.image.release()
 		return (*runInterrupt)(nil)
 	}
 	// We are the thread group leader now. Save our old thread ID for
@@ -214,7 +215,7 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	// executables (set-user/group-ID bits and file capabilities). This
 	// allows us to unconditionally enable user dumpability on the new mm.
 	// See fs/exec.c:setup_new_exec.
-	r.tc.MemoryManager.SetDumpability(mm.UserDumpable)
+	r.image.MemoryManager.SetDumpability(mm.UserDumpable)
 
 	// Switch to the new process.
 	t.MemoryManager().Deactivate()
@@ -222,8 +223,8 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	// Update credentials to reflect the execve. This should precede switching
 	// MMs to ensure that dumpability has been reset first, if needed.
 	t.updateCredsForExecLocked()
-	t.tc.release()
-	t.tc = *r.tc
+	t.image.release()
+	t.image = *r.image
 	t.mu.Unlock()
 	t.unstopVforkParent()
 	t.p.FullStateChanged()
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index ce7b9641d..16986244c 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -266,7 +266,7 @@ func (*runExitMain) execute(t *Task) taskRunState {
 	t.updateRSSLocked()
 	t.tg.pidns.owner.mu.Unlock()
 	t.mu.Lock()
-	t.tc.release()
+	t.image.release()
 	t.mu.Unlock()
 
 	// Releasing the MM unblocks a blocked CLONE_VFORK parent.
@@ -368,8 +368,8 @@ func (t *Task) exitChildren() {
 				Signo: int32(sig),
 				Code:  arch.SignalInfoUser,
 			}
-			siginfo.SetPid(int32(c.tg.pidns.tids[t]))
-			siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow()))
+			siginfo.SetPID(int32(c.tg.pidns.tids[t]))
+			siginfo.SetUID(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow()))
 			c.tg.signalHandlers.mu.Lock()
 			c.sendSignalLocked(siginfo, true /* group */)
 			c.tg.signalHandlers.mu.Unlock()
@@ -698,8 +698,8 @@ func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.Si
 	info := &arch.SignalInfo{
 		Signo: int32(sig),
 	}
-	info.SetPid(int32(receiver.tg.pidns.tids[t]))
-	info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
+	info.SetPID(int32(receiver.tg.pidns.tids[t]))
+	info.SetUID(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
 	if t.exitStatus.Signaled() {
 		info.Code = arch.CLD_KILLED
 		info.SetStatus(int32(t.exitStatus.Signo))
diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go
index c80391475..195c7da9b 100644
--- a/pkg/sentry/kernel/task_futex.go
+++ b/pkg/sentry/kernel/task_futex.go
@@ -26,7 +26,7 @@ import (
 // Preconditions: The caller must be running on the task goroutine, or t.mu
 // must be locked.
 func (t *Task) Futex() *futex.Manager {
-	return t.tc.fu
+	return t.image.fu
 }
 
 // SwapUint32 implements futex.Target.SwapUint32.
diff --git a/pkg/sentry/kernel/task_image.go b/pkg/sentry/kernel/task_image.go
new file mode 100644
index 000000000..ce5fbd299
--- /dev/null
+++ b/pkg/sentry/kernel/task_image.go
@@ -0,0 +1,173 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
+	"gvisor.dev/gvisor/pkg/sentry/loader"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC)
+
+// Auxmap contains miscellaneous data for the task.
+type Auxmap map[string]interface{}
+
+// TaskImage is the subset of a task's data that is provided by the loader.
+//
+// +stateify savable
+type TaskImage struct {
+	// Name is the thread name set by the prctl(PR_SET_NAME) system call.
+	Name string
+
+	// Arch is the architecture-specific context (registers, etc.)
+	Arch arch.Context
+
+	// MemoryManager is the task's address space.
+	MemoryManager *mm.MemoryManager
+
+	// fu implements futexes in the address space.
+	fu *futex.Manager
+
+	// st is the task's syscall table.
+	st *SyscallTable `state:".(syscallTableInfo)"`
+}
+
+// release releases all resources held by the TaskImage. release is called by
+// the task when it execs into a new TaskImage or exits.
+func (image *TaskImage) release() {
+	// Nil out pointers so that if the task is saved after release, it doesn't
+	// follow the pointers to possibly now-invalid objects.
+	if image.MemoryManager != nil {
+		image.MemoryManager.DecUsers(context.Background())
+		image.MemoryManager = nil
+	}
+	image.fu = nil
+}
+
+// Fork returns a duplicate of image. The copied TaskImage always has an
+// independent arch.Context. If shareAddressSpace is true, the copied
+// TaskImage shares an address space with the original; otherwise, the copied
+// TaskImage has an independent address space that is initially a duplicate
+// of the original's.
+func (image *TaskImage) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskImage, error) {
+	newImage := &TaskImage{
+		Name: image.Name,
+		Arch: image.Arch.Fork(),
+		st:   image.st,
+	}
+	if shareAddressSpace {
+		newImage.MemoryManager = image.MemoryManager
+		if newImage.MemoryManager != nil {
+			if !newImage.MemoryManager.IncUsers() {
+				// Shouldn't be possible since image.MemoryManager should be a
+				// counted user.
+				panic(fmt.Sprintf("TaskImage.Fork called with userless TaskImage.MemoryManager"))
+			}
+		}
+		newImage.fu = image.fu
+	} else {
+		newMM, err := image.MemoryManager.Fork(ctx)
+		if err != nil {
+			return nil, err
+		}
+		newImage.MemoryManager = newMM
+		newImage.fu = k.futexes.Fork()
+	}
+	return newImage, nil
+}
+
+// Arch returns t's arch.Context.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) Arch() arch.Context {
+	return t.image.Arch
+}
+
+// MemoryManager returns t's MemoryManager. MemoryManager does not take an
+// additional reference on the returned MM.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) MemoryManager() *mm.MemoryManager {
+	return t.image.MemoryManager
+}
+
+// SyscallTable returns t's syscall table.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) SyscallTable() *SyscallTable {
+	return t.image.st
+}
+
+// Stack returns the userspace stack.
+//
+// Preconditions: The caller must be running on the task goroutine, or t.mu
+// must be locked.
+func (t *Task) Stack() *arch.Stack {
+	return &arch.Stack{
+		Arch:   t.Arch(),
+		IO:     t.MemoryManager(),
+		Bottom: usermem.Addr(t.Arch().Stack()),
+	}
+}
+
+// LoadTaskImage loads a specified file into a new TaskImage.
+//
+// args.MemoryManager does not need to be set by the caller.
+func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskImage, *syserr.Error) {
+	// If File is not nil, we should load that instead of resolving Filename.
+	if args.File != nil {
+		args.Filename = args.File.PathnameWithDeleted(ctx)
+	}
+
+	// Prepare a new user address space to load into.
+	m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation)
+	defer m.DecUsers(ctx)
+	args.MemoryManager = m
+
+	os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso)
+	if err != nil {
+		return nil, err
+	}
+
+	// Lookup our new syscall table.
+	st, ok := LookupSyscallTable(os, ac.Arch())
+	if !ok {
+		// No syscall table found. This means that the ELF binary does not match
+		// the architecture.
+		return nil, errNoSyscalls
+	}
+
+	if !m.IncUsers() {
+		panic("Failed to increment users count on new MM")
+	}
+	return &TaskImage{
+		Name:          name,
+		Arch:          ac,
+		MemoryManager: m,
+		fu:            k.futexes.Fork(),
+		st:            st,
+	}, nil
+}
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index d23cea802..c70e5e6ce 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -19,6 +19,7 @@ import (
 	"runtime/trace"
 	"sort"
 
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -215,7 +216,7 @@ func (t *Task) rebuildTraceContext(tid ThreadID) {
 	// arbitrarily large (in general it won't be, especially for cases
 	// where we're collecting a brief profile), so using the TID is a
 	// reasonable compromise in this case.
-	t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid))
+	t.traceContext, t.traceTask = trace.NewTask(context.Background(), fmt.Sprintf("tid:%d", tid))
 }
 
 // traceCloneEvent is called when a new task is spawned.
@@ -237,11 +238,11 @@ func (t *Task) traceExitEvent() {
 }
 
 // traceExecEvent is called when a task calls exec.
-func (t *Task) traceExecEvent(tc *TaskContext) {
+func (t *Task) traceExecEvent(image *TaskImage) {
 	if !trace.IsEnabled() {
 		return
 	}
-	file := tc.MemoryManager.Executable()
+	file := image.MemoryManager.Executable()
 	if file == nil {
 		trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
 		return
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index 8dc3fec90..3ccecf4b6 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -16,11 +16,13 @@ package kernel
 
 import (
 	"bytes"
+	"fmt"
 	"runtime"
 	"runtime/trace"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/goid"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -57,6 +59,8 @@ type taskRunState interface {
 // make it visible in stack dumps. A goroutine for a given task can be identified
 // searching for Task.run()'s argument value.
 func (t *Task) run(threadID uintptr) {
+	atomic.StoreInt64(&t.goid, goid.Get())
+
 	// Construct t.blockingTimer here. We do this here because we can't
 	// reconstruct t.blockingTimer during restore in Task.afterLoad(), because
 	// kernel.timekeeper.SetClocks() hasn't been called yet.
@@ -99,6 +103,9 @@ func (t *Task) run(threadID uintptr) {
 			t.tg.pidns.owner.runningGoroutines.Done()
 			t.p.Release()
 
+			// Deferring this store triggers a false positive in the race
+			// detector (https://github.com/golang/go/issues/42599).
+			atomic.StoreInt64(&t.goid, 0)
 			// Keep argument alive because stack trace for dead variables may not be correct.
 			runtime.KeepAlive(threadID)
 			return
@@ -317,7 +324,7 @@ func (app *runApp) execute(t *Task) taskRunState {
 			// region. We should be able to easily identify
 			// vsyscalls by having a <fault><syscall> pair.
 			if at.Execute {
-				if sysno, ok := t.tc.st.LookupEmulate(addr); ok {
+				if sysno, ok := t.image.st.LookupEmulate(addr); ok {
 					return t.doVsyscall(addr, sysno)
 				}
 			}
@@ -375,6 +382,19 @@ func (app *runApp) execute(t *Task) taskRunState {
 	}
 }
 
+// assertTaskGoroutine panics if the caller is not running on t's task
+// goroutine.
+func (t *Task) assertTaskGoroutine() {
+	if got, want := goid.Get(), atomic.LoadInt64(&t.goid); got != want {
+		panic(fmt.Sprintf("running on goroutine %d (task goroutine for kernel.Task %p is %d)", got, t, want))
+	}
+}
+
+// GoroutineID returns the ID of t's task goroutine.
+func (t *Task) GoroutineID() int64 {
+	return atomic.LoadInt64(&t.goid)
+}
+
 // waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits.
 func (t *Task) waitGoroutineStoppedOrExited() {
 	t.goroutineStopped.Wait()
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 52c55d13d..9ba5f8d78 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -157,6 +157,18 @@ func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
 	t.goschedSeq.EndWrite()
 }
 
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) accountTaskGoroutineRunning() {
+	now := t.k.CPUClockNow()
+	if t.gosched.State != TaskGoroutineRunningSys {
+		panic(fmt.Sprintf("Task goroutine in state %v (expected %v)", t.gosched.State, TaskGoroutineRunningSys))
+	}
+	t.goschedSeq.BeginWrite()
+	t.gosched.SysTicks += now - t.gosched.Timestamp
+	t.gosched.Timestamp = now
+	t.goschedSeq.EndWrite()
+}
+
 // TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info.
 // Most clients should use t.CPUStats() instead.
 func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo {
diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go
index ebdb83061..75af3af79 100644
--- a/pkg/sentry/kernel/task_signals.go
+++ b/pkg/sentry/kernel/task_signals.go
@@ -619,9 +619,6 @@ func (t *Task) setSignalMaskLocked(mask linux.SignalSet) {
 				return
 			}
 		})
-		// We have to re-issue the interrupt consumed by t.interrupted() since
-		// it might have been for a different reason.
-		t.interruptSelf()
 	}
 
 	// Conversely, if the new mask unblocks any signals that were blocked by
@@ -917,8 +914,8 @@ func (t *Task) signalStop(target *Task, code int32, status int32) {
 			Signo: int32(linux.SIGCHLD),
 			Code:  code,
 		}
-		sigchld.SetPid(int32(t.tg.pidns.tids[target]))
-		sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+		sigchld.SetPID(int32(t.tg.pidns.tids[target]))
+		sigchld.SetUID(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
 		sigchld.SetStatus(status)
 		// TODO(b/72102453): Set utime, stime.
 		t.sendSignalLocked(sigchld, true /* group */)
@@ -931,10 +928,10 @@ func (t *Task) signalStop(target *Task, code int32, status int32) {
 type runInterrupt struct{}
 
 func (*runInterrupt) execute(t *Task) taskRunState {
-	// Interrupts are de-duplicated (if t is interrupted twice before
-	// t.interrupted() is called, t.interrupted() will only return true once),
-	// so early exits from this function must re-enter the runInterrupt state
-	// to check for more interrupt-signaled conditions.
+	// Interrupts are de-duplicated (t.unsetInterrupted() will undo the effect
+	// of all previous calls to t.interrupted() regardless of how many such
+	// calls there have been), so early exits from this function must re-enter
+	// the runInterrupt state to check for more interrupt-signaled conditions.
 
 	t.tg.signalHandlers.mu.Lock()
 
@@ -1025,8 +1022,8 @@ func (*runInterrupt) execute(t *Task) taskRunState {
 					Signo: int32(sig),
 					Code:  t.ptraceCode,
 				}
-				t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t]))
-				t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+				t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t]))
+				t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
 			} else {
 				t.ptraceCode = int32(sig)
 				t.ptraceSiginfo = nil
@@ -1080,6 +1077,7 @@ func (*runInterrupt) execute(t *Task) taskRunState {
 		return t.deliverSignal(info, act)
 	}
 
+	t.unsetInterrupted()
 	t.tg.signalHandlers.mu.Unlock()
 	return (*runApp)(nil)
 }
@@ -1116,11 +1114,11 @@ func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState {
 		if parent == nil {
 			// Tracer has detached and t was created by Kernel.CreateProcess().
 			// Pretend the parent is in an ancestor PID + user namespace.
-			info.SetPid(0)
-			info.SetUid(int32(auth.OverflowUID))
+			info.SetPID(0)
+			info.SetUID(int32(auth.OverflowUID))
 		} else {
-			info.SetPid(int32(t.tg.pidns.tids[parent]))
-			info.SetUid(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
+			info.SetPID(int32(t.tg.pidns.tids[parent]))
+			info.SetUID(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
 		}
 	}
 	t.tg.signalHandlers.mu.Lock()
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index 8e28230cc..36e1384f1 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -46,10 +46,10 @@ type TaskConfig struct {
 	// SignalMask is the new task's initial signal mask.
 	SignalMask linux.SignalSet
 
-	// TaskContext is the TaskContext of the new task. Ownership of the
-	// TaskContext is transferred to TaskSet.NewTask, whether or not it
+	// TaskImage is the TaskImage of the new task. Ownership of the
+	// TaskImage is transferred to TaskSet.NewTask, whether or not it
 	// succeeds.
-	TaskContext *TaskContext
+	TaskImage *TaskImage
 
 	// FSContext is the FSContext of the new task. A reference must be held on
 	// FSContext, which is transferred to TaskSet.NewTask whether or not it
@@ -105,7 +105,7 @@ type TaskConfig struct {
 func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
 	t, err := ts.newTask(cfg)
 	if err != nil {
-		cfg.TaskContext.release()
+		cfg.TaskImage.release()
 		cfg.FSContext.DecRef(ctx)
 		cfg.FDTable.DecRef(ctx)
 		cfg.IPCNamespace.DecRef(ctx)
@@ -121,7 +121,7 @@ func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error)
 // of cfg if it succeeds.
 func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 	tg := cfg.ThreadGroup
-	tc := cfg.TaskContext
+	image := cfg.TaskImage
 	t := &Task{
 		taskNode: taskNode{
 			tg:       tg,
@@ -132,7 +132,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		interruptChan:      make(chan struct{}, 1),
 		signalMask:         cfg.SignalMask,
 		signalStack:        arch.SignalStack{Flags: arch.SignalStackFlagDisable},
-		tc:                 *tc,
+		image:              *image,
 		fsContext:          cfg.FSContext,
 		fdTable:            cfg.FDTable,
 		p:                  cfg.Kernel.Platform.NewContext(),
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index ce134bf54..94dabbcd8 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -18,7 +18,8 @@ import (
 	"math"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -281,29 +282,89 @@ func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOp
 	}, nil
 }
 
-// copyContext implements marshal.CopyContext. It wraps a task to allow copying
-// memory to and from the task memory with custom usermem.IOOpts.
-type copyContext struct {
-	*Task
+type taskCopyContext struct {
+	ctx  context.Context
+	t    *Task
 	opts usermem.IOOpts
 }
 
-// AsCopyContext wraps the task and returns it as CopyContext.
-func (t *Task) AsCopyContext(opts usermem.IOOpts) marshal.CopyContext {
-	return &copyContext{t, opts}
+// CopyContext returns a marshal.CopyContext that copies to/from t's address
+// space using opts.
+func (t *Task) CopyContext(ctx context.Context, opts usermem.IOOpts) *taskCopyContext {
+	return &taskCopyContext{
+		ctx:  ctx,
+		t:    t,
+		opts: opts,
+	}
+}
+
+// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer.
+func (cc *taskCopyContext) CopyScratchBuffer(size int) []byte {
+	if ctxTask, ok := cc.ctx.(*Task); ok {
+		return ctxTask.CopyScratchBuffer(size)
+	}
+	return make([]byte, size)
+}
+
+func (cc *taskCopyContext) getMemoryManager() (*mm.MemoryManager, error) {
+	cc.t.mu.Lock()
+	tmm := cc.t.MemoryManager()
+	cc.t.mu.Unlock()
+	if !tmm.IncUsers() {
+		return nil, syserror.EFAULT
+	}
+	return tmm, nil
+}
+
+// CopyInBytes implements marshal.CopyContext.CopyInBytes.
+func (cc *taskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+	tmm, err := cc.getMemoryManager()
+	if err != nil {
+		return 0, err
+	}
+	defer tmm.DecUsers(cc.ctx)
+	return tmm.CopyIn(cc.ctx, addr, dst, cc.opts)
+}
+
+// CopyOutBytes implements marshal.CopyContext.CopyOutBytes.
+func (cc *taskCopyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+	tmm, err := cc.getMemoryManager()
+	if err != nil {
+		return 0, err
+	}
+	defer tmm.DecUsers(cc.ctx)
+	return tmm.CopyOut(cc.ctx, addr, src, cc.opts)
+}
+
+type ownTaskCopyContext struct {
+	t    *Task
+	opts usermem.IOOpts
+}
+
+// OwnCopyContext returns a marshal.CopyContext that copies to/from t's address
+// space using opts. The returned CopyContext may only be used by t's task
+// goroutine.
+//
+// Since t already implements marshal.CopyContext, this is only needed to
+// override the usermem.IOOpts used for the copy.
+func (t *Task) OwnCopyContext(opts usermem.IOOpts) *ownTaskCopyContext {
+	return &ownTaskCopyContext{
+		t:    t,
+		opts: opts,
+	}
 }
 
-// CopyInString copies a string in from the task's memory.
-func (t *copyContext) CopyInString(addr usermem.Addr, maxLen int) (string, error) {
-	return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxLen, t.opts)
+// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer.
+func (cc *ownTaskCopyContext) CopyScratchBuffer(size int) []byte {
+	return cc.t.CopyScratchBuffer(size)
 }
 
-// CopyInBytes copies task memory into dst from an IO context.
-func (t *copyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
-	return t.MemoryManager().CopyIn(t, addr, dst, t.opts)
+// CopyInBytes implements marshal.CopyContext.CopyInBytes.
+func (cc *ownTaskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+	return cc.t.MemoryManager().CopyIn(cc.t, addr, dst, cc.opts)
 }
 
-// CopyOutBytes copies src into task memoryfrom an IO context.
-func (t *copyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
-	return t.MemoryManager().CopyOut(t, addr, src, t.opts)
+// CopyOutBytes implements marshal.CopyContext.CopyOutBytes.
+func (cc *ownTaskCopyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+	return cc.t.MemoryManager().CopyOut(cc.t, addr, src, cc.opts)
 }
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
index 9bc452e67..9e5c2d26f 100644
--- a/pkg/sentry/kernel/vdso.go
+++ b/pkg/sentry/kernel/vdso.go
@@ -115,7 +115,7 @@ func (v *VDSOParamPage) incrementSeq(paramPage safemem.Block) error {
 	}
 
 	if old != v.seq {
-		return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d. Application may hang or get incorrect time from the VDSO.", old, v.seq)
+		return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d; application may hang or get incorrect time from the VDSO", old, v.seq)
 	}
 
 	v.seq = next