16 files changed, 653 insertions, 126 deletions
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 28ba950bd..8653d2f63 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -762,7 +762,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		mounts.IncRef()
 	}
 
-	tg := k.newThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
+	tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
 	ctx := args.NewContext(k)
 
 	// Get the root directory from the MountNamespace.
@@ -841,9 +841,11 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
 		AbstractSocketNamespace: args.AbstractSocketNamespace,
 		ContainerID:             args.ContainerID,
 	}
-	if _, err := k.tasks.NewTask(config); err != nil {
+	t, err := k.tasks.NewTask(config)
+	if err != nil {
 		return nil, 0, err
 	}
+	t.traceExecEvent(tc) // Simulate exec for tracing.
 
 	// Success.
 	tgid := k.tasks.Root.IDOfThreadGroup(tg)
@@ -1118,6 +1120,22 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
 	return lastErr
 }
 
+// RebuildTraceContexts rebuilds the trace context for all tasks.
+//
+// Unfortunately, if these are built while tracing is not enabled, then we will
+// not have meaningful trace data. Rebuilding here ensures that we can do so
+// after tracing has been enabled.
+func (k *Kernel) RebuildTraceContexts() {
+	k.extMu.Lock()
+	defer k.extMu.Unlock()
+	k.tasks.mu.RLock()
+	defer k.tasks.mu.RUnlock()
+
+	for t, tid := range k.tasks.Root.tids {
+		t.rebuildTraceContext(tid)
+	}
+}
+
 // FeatureSet returns the FeatureSet.
 func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
 	return k.featureSet
@@ -1173,6 +1191,11 @@ func (k *Kernel) GlobalInit() *ThreadGroup {
 	return k.globalInit
 }
 
+// TestOnly_SetGlobalInit sets the thread group with ID 1 in the root PID namespace.
+func (k *Kernel) TestOnly_SetGlobalInit(tg *ThreadGroup) {
+	k.globalInit = tg
+}
+
 // ApplicationCores returns the number of CPUs visible to sandboxed
 // applications.
 func (k *Kernel) ApplicationCores() uint {
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index 24ea002ba..b14429854 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -15,17 +15,29 @@
 package kernel
 
 import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// Restartable sequences, as described in https://lwn.net/Articles/650333/.
+// Restartable sequences.
+//
+// We support two different APIs for restartable sequences.
+//
+//  1. The upstream interface added in v4.18.
+//  2. The interface described in https://lwn.net/Articles/650333/.
+//
+// Throughout this file and other parts of the kernel, the latter is referred
+// to as "old rseq". This interface was never merged upstream, but is supported
+// for a limited set of applications that use it regardless.
 
-// RSEQCriticalRegion describes a restartable sequence critical region.
+// OldRSeqCriticalRegion describes an old rseq critical region.
 //
 // +stateify savable
-type RSEQCriticalRegion struct {
+type OldRSeqCriticalRegion struct {
 	// When a task in this thread group has its CPU preempted (as defined by
 	// platform.ErrContextCPUPreempted) or has a signal delivered to an
 	// application handler while its instruction pointer is in CriticalSection,
@@ -35,86 +47,359 @@ type RSEQCriticalRegion struct {
 	Restart         usermem.Addr
 }
 
-// RSEQAvailable returns true if t supports restartable sequences.
-func (t *Task) RSEQAvailable() bool {
+// RSeqAvailable returns true if t supports (old and new) restartable sequences.
+func (t *Task) RSeqAvailable() bool {
 	return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption()
 }
 
-// RSEQCriticalRegion returns a copy of t's thread group's current restartable
-// sequence.
-func (t *Task) RSEQCriticalRegion() RSEQCriticalRegion {
-	return *t.tg.rscr.Load().(*RSEQCriticalRegion)
+// SetRSeq registers addr as this thread's rseq structure.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error {
+	if t.rseqAddr != 0 {
+		if t.rseqAddr != addr {
+			return syserror.EINVAL
+		}
+		if t.rseqSignature != signature {
+			return syserror.EINVAL
+		}
+		return syserror.EBUSY
+	}
+
+	// rseq must be aligned and correctly sized.
+	if addr&(linux.AlignOfRSeq-1) != 0 {
+		return syserror.EINVAL
+	}
+	if length != linux.SizeOfRSeq {
+		return syserror.EINVAL
+	}
+	if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok {
+		return syserror.EFAULT
+	}
+
+	t.rseqAddr = addr
+	t.rseqSignature = signature
+
+	// Initialize the CPUID.
+	//
+	// Linux implicitly does this on return from userspace, where failure
+	// would cause SIGSEGV.
+	if err := t.rseqUpdateCPU(); err != nil {
+		t.rseqAddr = 0
+		t.rseqSignature = 0
+
+		t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return syserror.EFAULT
+	}
+
+	return nil
 }
 
-// SetRSEQCriticalRegion replaces t's thread group's restartable sequence.
+// ClearRSeq unregisters addr as this thread's rseq structure.
 //
-// Preconditions: t.RSEQAvailable() == true.
-func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error {
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) ClearRSeq(addr usermem.Addr, length, signature uint32) error {
+	if t.rseqAddr == 0 {
+		return syserror.EINVAL
+	}
+	if t.rseqAddr != addr {
+		return syserror.EINVAL
+	}
+	if length != linux.SizeOfRSeq {
+		return syserror.EINVAL
+	}
+	if t.rseqSignature != signature {
+		return syserror.EPERM
+	}
+
+	if err := t.rseqClearCPU(); err != nil {
+		return err
+	}
+
+	t.rseqAddr = 0
+	t.rseqSignature = 0
+
+	if t.oldRSeqCPUAddr == 0 {
+		// rseqCPU no longer needed.
+		t.rseqCPU = -1
+	}
+
+	return nil
+}
+
+// OldRSeqCriticalRegion returns a copy of t's thread group's current
+// old restartable sequence.
+func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion {
+	return *t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
+}
+
+// SetOldRSeqCriticalRegion replaces t's thread group's old restartable
+// sequence.
+//
+// Preconditions: t.RSeqAvailable() == true.
+func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error {
 	// These checks are somewhat more lenient than in Linux, which (bizarrely)
-	// requires rscr.CriticalSection to be non-empty and rscr.Restart to be
-	// outside of rscr.CriticalSection, even if rscr.CriticalSection.Start == 0
+	// requires r.CriticalSection to be non-empty and r.Restart to be
+	// outside of r.CriticalSection, even if r.CriticalSection.Start == 0
 	// (which disables the critical region).
-	if rscr.CriticalSection.Start == 0 {
-		rscr.CriticalSection.End = 0
-		rscr.Restart = 0
-		t.tg.rscr.Store(&rscr)
+	if r.CriticalSection.Start == 0 {
+		r.CriticalSection.End = 0
+		r.Restart = 0
+		t.tg.oldRSeqCritical.Store(&r)
 		return nil
 	}
-	if rscr.CriticalSection.Start >= rscr.CriticalSection.End {
+	if r.CriticalSection.Start >= r.CriticalSection.End {
 		return syserror.EINVAL
 	}
-	if rscr.CriticalSection.Contains(rscr.Restart) {
+	if r.CriticalSection.Contains(r.Restart) {
 		return syserror.EINVAL
 	}
-	// TODO(jamieliu): check that rscr.CriticalSection and rscr.Restart are in
-	// the application address range, for consistency with Linux
-	t.tg.rscr.Store(&rscr)
+	// TODO(jamieliu): check that r.CriticalSection and r.Restart are in
+	// the application address range, for consistency with Linux.
+	t.tg.oldRSeqCritical.Store(&r)
 	return nil
 }
 
-// RSEQCPUAddr returns the address that RSEQ will keep updated with t's CPU
-// number.
+// OldRSeqCPUAddr returns the address that old rseq will keep updated with t's
+// CPU number.
 //
 // Preconditions: The caller must be running on the task goroutine.
-func (t *Task) RSEQCPUAddr() usermem.Addr {
-	return t.rseqCPUAddr
+func (t *Task) OldRSeqCPUAddr() usermem.Addr {
+	return t.oldRSeqCPUAddr
 }
 
-// SetRSEQCPUAddr replaces the address that RSEQ will keep updated with t's CPU
-// number.
+// SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with
+// t's CPU number.
 //
-// Preconditions: t.RSEQAvailable() == true. The caller must be running on the
+// Preconditions: t.RSeqAvailable() == true. The caller must be running on the
 // task goroutine. t's AddressSpace must be active.
-func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error {
-	t.rseqCPUAddr = addr
-	if addr != 0 {
-		t.rseqCPU = int32(hostcpu.GetCPU())
-		if err := t.rseqCopyOutCPU(); err != nil {
-			t.rseqCPUAddr = 0
-			t.rseqCPU = -1
-			return syserror.EINVAL // yes, EINVAL, not err or EFAULT
-		}
-	} else {
-		t.rseqCPU = -1
+func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
+	t.oldRSeqCPUAddr = addr
+
+	// Check that addr is writable.
+	//
+	// N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's
+	// unfortunate, but unlikely in a correct program.
+	if err := t.rseqUpdateCPU(); err != nil {
+		t.oldRSeqCPUAddr = 0
+		return syserror.EINVAL // yes, EINVAL, not err or EFAULT
 	}
 	return nil
 }
 
 // Preconditions: The caller must be running on the task goroutine. t's
 // AddressSpace must be active.
-func (t *Task) rseqCopyOutCPU() error {
+func (t *Task) rseqUpdateCPU() error {
+	if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 {
+		t.rseqCPU = -1
+		return nil
+	}
+
+	t.rseqCPU = int32(hostcpu.GetCPU())
+
+	// Update both CPUs, even if one fails.
+	rerr := t.rseqCopyOutCPU()
+	oerr := t.oldRSeqCopyOutCPU()
+
+	if rerr != nil {
+		return rerr
+	}
+	return oerr
+}
+
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) oldRSeqCopyOutCPU() error {
+	if t.oldRSeqCPUAddr == 0 {
+		return nil
+	}
+
 	buf := t.CopyScratchBuffer(4)
 	usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))
-	_, err := t.CopyOutBytes(t.rseqCPUAddr, buf)
+	_, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf)
+	return err
+}
+
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) rseqCopyOutCPU() error {
+	if t.rseqAddr == 0 {
+		return nil
+	}
+
+	buf := t.CopyScratchBuffer(8)
+	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
+	usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))     // CPUIDStart
+	usermem.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID
+	// N.B. This write is not atomic, but since this occurs on the task
+	// goroutine then as long as userspace uses a single-instruction read
+	// it can't see an invalid value.
+	_, err := t.CopyOutBytes(t.rseqAddr, buf)
+	return err
+}
+
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) rseqClearCPU() error {
+	buf := t.CopyScratchBuffer(8)
+	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
+	usermem.ByteOrder.PutUint32(buf, 0)                                   // CPUIDStart
+	usermem.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID
+	// N.B. This write is not atomic, but since this occurs on the task
+	// goroutine then as long as userspace uses a single-instruction read
+	// it can't see an invalid value.
+	_, err := t.CopyOutBytes(t.rseqAddr, buf)
 	return err
 }
 
+// rseqAddrInterrupt checks if IP is in a critical section, and aborts if so.
+//
+// This is a bit complex since both the RSeq and RSeqCriticalSection structs
+// are stored in userspace. So we must:
+//
+// 1. Copy in the address of RSeqCriticalSection from RSeq.
+// 2. Copy in RSeqCriticalSection itself.
+// 3. Validate critical section struct version, address range, abort address.
+// 4. Validate the abort signature (4 bytes preceding abort IP match expected
+//    signature).
+// 5. Clear address of RSeqCriticalSection from RSeq.
+// 6. Finally, conditionally abort.
+//
+// See kernel/rseq.c:rseq_ip_fixup for reference.
+//
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) rseqAddrInterrupt() {
+	if t.rseqAddr == 0 {
+		return
+	}
+
+	critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection)
+	if !ok {
+		// SetRSeq should validate this.
+		panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr))
+	}
+
+	if t.Arch().Width() != 8 {
+		// We only handle 64-bit for now.
+		t.Debugf("Only 64-bit rseq supported.")
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	buf := t.CopyScratchBuffer(8)
+	if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil {
+		t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	critAddr := usermem.Addr(usermem.ByteOrder.Uint64(buf))
+	if critAddr == 0 {
+		return
+	}
+
+	buf = t.CopyScratchBuffer(linux.SizeOfRSeqCriticalSection)
+	if _, err := t.CopyInBytes(critAddr, buf); err != nil {
+		t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	// Manually marshal RSeqCriticalSection as this is in the hot path when
+	// rseq is enabled. It must be as fast as possible.
+	//
+	// TODO(b/130243041): Replace with go_marshal.
+	cs := linux.RSeqCriticalSection{
+		Version:          usermem.ByteOrder.Uint32(buf[0:4]),
+		Flags:            usermem.ByteOrder.Uint32(buf[4:8]),
+		Start:            usermem.ByteOrder.Uint64(buf[8:16]),
+		PostCommitOffset: usermem.ByteOrder.Uint64(buf[16:24]),
+		Abort:            usermem.ByteOrder.Uint64(buf[24:32]),
+	}
+
+	if cs.Version != 0 {
+		t.Debugf("Unknown version in %+v", cs)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	start := usermem.Addr(cs.Start)
+	critRange, ok := start.ToRange(cs.PostCommitOffset)
+	if !ok {
+		t.Debugf("Invalid start and offset in %+v", cs)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	abort := usermem.Addr(cs.Abort)
+	if critRange.Contains(abort) {
+		t.Debugf("Abort in critical section in %+v", cs)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	// Verify signature.
+	sigAddr := abort - linux.SizeOfRSeqSignature
+
+	buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature)
+	if _, err := t.CopyInBytes(sigAddr, buf); err != nil {
+		t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	sig := usermem.ByteOrder.Uint32(buf)
+	if sig != t.rseqSignature {
+		t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	// Clear the critical section address.
+	//
+	// NOTE(b/143949567): We don't support any rseq flags, so we always
+	// restart if we are in the critical section, and thus *always* clear
+	// critAddrAddr.
+	if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{
+		AddressSpaceActive: true,
+	}); err != nil {
+		t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err)
+		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+		return
+	}
+
+	// Finally we can actually decide whether or not to restart.
+	if !critRange.Contains(usermem.Addr(t.Arch().IP())) {
+		return
+	}
+
+	t.Arch().SetIP(uintptr(cs.Abort))
+}
+
 // Preconditions: The caller must be running on the task goroutine.
-func (t *Task) rseqInterrupt() {
-	rscr := t.tg.rscr.Load().(*RSEQCriticalRegion)
-	if ip := t.Arch().IP(); rscr.CriticalSection.Contains(usermem.Addr(ip)) {
-		t.Debugf("Interrupted RSEQ critical section at %#x; restarting at %#x", ip, rscr.Restart)
-		t.Arch().SetIP(uintptr(rscr.Restart))
-		t.Arch().SetRSEQInterruptedIP(ip)
+func (t *Task) oldRSeqInterrupt() {
+	r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
+	if ip := t.Arch().IP(); r.CriticalSection.Contains(usermem.Addr(ip)) {
+		t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart)
+		t.Arch().SetIP(uintptr(r.Restart))
+		t.Arch().SetOldRSeqInterruptedIP(ip)
 	}
 }
+
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) rseqInterrupt() {
+	t.rseqAddrInterrupt()
+	t.oldRSeqInterrupt()
+}
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index 93fe68a3e..de9617e9d 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -302,7 +302,7 @@ func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Cred
 		return syserror.ERANGE
 	}
 
-	// TODO(b/29354920): Clear undo entries in all processes
+	// TODO(gvisor.dev/issue/137): Clear undo entries in all processes.
 	sem.value = val
 	sem.pid = pid
 	s.changeTime = ktime.NowFromContext(ctx)
@@ -336,7 +336,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti
 	for i, val := range vals {
 		sem := &s.sems[i]
 
-		// TODO(b/29354920): Clear undo entries in all processes
+		// TODO(gvisor.dev/issue/137): Clear undo entries in all processes.
 		sem.value = int16(val)
 		sem.pid = pid
 		sem.wakeWaiters()
@@ -481,7 +481,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch
 	}
 
 	// All operations succeeded, apply them.
-	// TODO(b/29354920): handle undo operations.
+	// TODO(gvisor.dev/issue/137): handle undo operations.
 	for i, v := range tmpVals {
 		s.sems[i].value = v
 		s.sems[i].wakeWaiters()
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 5bd610f68..19034a21e 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -71,9 +71,20 @@ type Registry struct {
 	mu sync.Mutex `state:"nosave"`
 
 	// shms maps segment ids to segments.
+	//
+	// shms holds all referenced segments, which are removed on the last
+	// DecRef. Thus, it cannot itself hold a reference on the Shm.
+	//
+	// Since removal only occurs after the last (unlocked) DecRef, there
+	// exists a short window during which a Shm still exists in Shm, but is
+	// unreferenced. Users must use TryIncRef to determine if the Shm is
+	// still valid.
 	shms map[ID]*Shm
 
 	// keysToShms maps segment keys to segments.
+	//
+	// Shms in keysToShms are guaranteed to be referenced, as they are
+	// removed by disassociateKey before the last DecRef.
 	keysToShms map[Key]*Shm
 
 	// Sum of the sizes of all existing segments rounded up to page size, in
@@ -95,10 +106,18 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry {
 }
 
 // FindByID looks up a segment given an ID.
+//
+// FindByID returns a reference on Shm.
 func (r *Registry) FindByID(id ID) *Shm {
 	r.mu.Lock()
 	defer r.mu.Unlock()
-	return r.shms[id]
+	s := r.shms[id]
+	// Take a reference on s. If TryIncRef fails, s has reached the last
+	// DecRef, but hasn't quite been removed from r.shms yet.
+	if s != nil && s.TryIncRef() {
+		return s
+	}
+	return nil
 }
 
 // dissociateKey removes the association between a segment and its key,
@@ -119,6 +138,8 @@ func (r *Registry) dissociateKey(s *Shm) {
 
 // FindOrCreate looks up or creates a segment in the registry. It's functionally
 // analogous to open(2).
+//
+// FindOrCreate returns a reference on Shm.
 func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) {
 	if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) {
 		// "A new segment was to be created and size is less than SHMMIN or
@@ -166,6 +187,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui
 				return nil, syserror.EEXIST
 			}
 
+			shm.IncRef()
 			return shm, nil
 		}
 
@@ -193,7 +215,14 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui
 	// Need to create a new segment.
 	creator := fs.FileOwnerFromContext(ctx)
 	perms := fs.FilePermsFromMode(mode)
-	return r.newShm(ctx, pid, key, creator, perms, size)
+	s, err := r.newShm(ctx, pid, key, creator, perms, size)
+	if err != nil {
+		return nil, err
+	}
+	// The initial reference is held by s itself. Take another to return to
+	// the caller.
+	s.IncRef()
+	return s, nil
 }
 
 // newShm creates a new segment in the registry.
@@ -296,22 +325,26 @@ func (r *Registry) remove(s *Shm) {
 
 // Shm represents a single shared memory segment.
 //
-// Shm segment are backed directly by an allocation from platform
-// memory. Segments are always mapped as a whole, greatly simplifying how
-// mappings are tracked. However note that mremap and munmap calls may cause the
-// vma for a segment to become fragmented; which requires special care when
-// unmapping a segment. See mm/shm.go.
+// Shm segment are backed directly by an allocation from platform memory.
+// Segments are always mapped as a whole, greatly simplifying how mappings are
+// tracked. However note that mremap and munmap calls may cause the vma for a
+// segment to become fragmented; which requires special care when unmapping a
+// segment. See mm/shm.go.
 //
 // Segments persist until they are explicitly marked for destruction via
-// shmctl(SHM_RMID).
+// MarkDestroyed().
 //
 // Shm implements memmap.Mappable and memmap.MappingIdentity.
 //
 // +stateify savable
 type Shm struct {
-	// AtomicRefCount tracks the number of references to this segment from
-	// maps. A segment always holds a reference to itself, until it's marked for
+	// AtomicRefCount tracks the number of references to this segment.
+	//
+	// A segment holds a reference to itself until it is marked for
 	// destruction.
+	//
+	// In addition to direct users, the MemoryManager will hold references
+	// via MappingIdentity.
 	refs.AtomicRefCount
 
 	mfp pgalloc.MemoryFileProvider
@@ -484,9 +517,8 @@ type AttachOpts struct {
 // ConfigureAttach creates an mmap configuration for the segment with the
 // requested attach options.
 //
-// ConfigureAttach returns with a ref on s on success. The caller should drop
-// this once the map is installed. This reference prevents s from being
-// destroyed before the returned configuration is used.
+// Postconditions: The returned MMapOpts are valid only as long as a reference
+// continues to be held on s.
 func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -504,7 +536,6 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts Attac
 		// in the user namespace that governs its IPC namespace." - man shmat(2)
 		return memmap.MMapOpts{}, syserror.EACCES
 	}
-	s.IncRef()
 	return memmap.MMapOpts{
 		Length: s.size,
 		Offset: 0,
@@ -549,10 +580,15 @@ func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) {
 	}
 	creds := auth.CredentialsFromContext(ctx)
 
-	nattach := uint64(s.ReadRefs())
-	// Don't report the self-reference we keep prior to being marked for
-	// destruction. However, also don't report a count of -1 for segments marked
-	// as destroyed, with no mappings.
+	// Use the reference count as a rudimentary count of the number of
+	// attaches. We exclude:
+	//
+	// 1. The reference the caller holds.
+	// 2. The self-reference held by s prior to destruction.
+	//
+	// Note that this may still overcount by including transient references
+	// used in concurrent calls.
+	nattach := uint64(s.ReadRefs()) - 1
 	if !s.pendingDestruction {
 		nattach--
 	}
@@ -620,18 +656,17 @@ func (s *Shm) MarkDestroyed() {
 	s.registry.dissociateKey(s)
 
 	s.mu.Lock()
-	// Only drop the segment's self-reference once, when destruction is
-	// requested. Otherwise, repeated calls to shmctl(IPC_RMID) would force a
-	// segment to be destroyed prematurely, potentially with active maps to the
-	// segment's address range. Remaining references are dropped when the
-	// segment is detached or unmaped.
+	defer s.mu.Unlock()
 	if !s.pendingDestruction {
 		s.pendingDestruction = true
-		s.mu.Unlock() // Must release s.mu before calling s.DecRef.
+		// Drop the self-reference so destruction occurs when all
+		// external references are gone.
+		//
+		// N.B. This cannot be the final DecRef, as the caller also
+		// holds a reference.
 		s.DecRef()
 		return
 	}
-	s.mu.Unlock()
 }
 
 // checkOwnership verifies whether a segment may be accessed by ctx as an
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 220fa73a2..2fdee0282 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -339,6 +339,14 @@ func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn {
 	return nil
 }
 
+// LookupName looks up a syscall name.
+func (s *SyscallTable) LookupName(sysno uintptr) string {
+	if sc, ok := s.Table[sysno]; ok {
+		return sc.Name
+	}
+	return fmt.Sprintf("sys_%d", sysno) // Unlikely.
+}
+
 // LookupEmulate looks up an emulation syscall number.
 func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
 	sysno, ok := s.Emulate[addr]
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 80c8e5464..d25a7903b 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -15,6 +15,8 @@
 package kernel
 
 import (
+	gocontext "context"
+	"runtime/trace"
 	"sync"
 	"sync/atomic"
 
@@ -390,7 +392,14 @@ type Task struct {
 
 	// logPrefix is a string containing the task's thread ID in the root PID
 	// namespace, and is prepended to log messages emitted by Task.Infof etc.
-	logPrefix atomic.Value `state:".(string)"`
+	logPrefix atomic.Value `state:"nosave"`
+
+	// traceContext and traceTask are both used for tracing, and are
+	// updated along with the logPrefix in updateInfoLocked.
+	//
+	// These are exclusive to the task goroutine.
+	traceContext gocontext.Context `state:"nosave"`
+	traceTask    *trace.Task       `state:"nosave"`
 
 	// creds is the task's credentials.
 	//
@@ -480,18 +489,43 @@ type Task struct {
 	// netns is protected by mu. netns is owned by the task goroutine.
 	netns bool
 
-	// If rseqPreempted is true, before the next call to p.Switch(), interrupt
-	// RSEQ critical regions as defined by tg.rseq and write the task
-	// goroutine's CPU number to rseqCPUAddr. rseqCPU is the last CPU number
-	// written to rseqCPUAddr.
+	// If rseqPreempted is true, before the next call to p.Switch(),
+	// interrupt rseq critical regions as defined by rseqAddr and
+	// tg.oldRSeqCritical and write the task goroutine's CPU number to
+	// rseqAddr/oldRSeqCPUAddr.
 	//
-	// If rseqCPUAddr is 0, rseqCPU is -1.
+	// We support two ABIs for restartable sequences:
 	//
-	// rseqCPUAddr, rseqCPU, and rseqPreempted are exclusive to the task
-	// goroutine.
+	//  1. The upstream interface added in v4.18,
+	//  2. An "old" interface never merged upstream. In the implementation,
+	//     this is referred to as "old rseq".
+	//
+	// rseqPreempted is exclusive to the task goroutine.
 	rseqPreempted bool `state:"nosave"`
-	rseqCPUAddr   usermem.Addr
-	rseqCPU       int32
+
+	// rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr.
+	//
+	// If rseq is unused, rseqCPU is -1 for convenient use in
+	// platform.Context.Switch.
+	//
+	// rseqCPU is exclusive to the task goroutine.
+	rseqCPU int32
+
+	// oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable.
+	//
+	// oldRSeqCPUAddr is exclusive to the task goroutine.
+	oldRSeqCPUAddr usermem.Addr
+
+	// rseqAddr is a pointer to the userspace linux.RSeq structure.
+	//
+	// rseqAddr is exclusive to the task goroutine.
+	rseqAddr usermem.Addr
+
+	// rseqSignature is the signature that the rseq abort IP must be signed
+	// with.
+	//
+	// rseqSignature is exclusive to the task goroutine.
+	rseqSignature uint32
 
 	// copyScratchBuffer is a buffer available to CopyIn/CopyOut
 	// implementations that require an intermediate buffer to copy data
@@ -528,14 +562,6 @@ func (t *Task) loadPtraceTracer(tracer *Task) {
 	t.ptraceTracer.Store(tracer)
 }
 
-func (t *Task) saveLogPrefix() string {
-	return t.logPrefix.Load().(string)
-}
-
-func (t *Task) loadLogPrefix(prefix string) {
-	t.logPrefix.Store(prefix)
-}
-
 func (t *Task) saveSyscallFilters() []bpf.Program {
 	if f := t.syscallFilters.Load(); f != nil {
 		return f.([]bpf.Program)
@@ -549,6 +575,7 @@ func (t *Task) loadSyscallFilters(filters []bpf.Program) {
 
 // afterLoad is invoked by stateify.
 func (t *Task) afterLoad() {
+	t.updateInfoLocked()
 	t.interruptChan = make(chan struct{}, 1)
 	t.gosched.State = TaskGoroutineNonexistent
 	if t.stop != nil {
diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go
index dd69939f9..4a4a69ee2 100644
--- a/pkg/sentry/kernel/task_block.go
+++ b/pkg/sentry/kernel/task_block.go
@@ -16,6 +16,7 @@ package kernel
 
 import (
 	"runtime"
+	"runtime/trace"
 	"time"
 
 	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -133,19 +134,24 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error {
 		runtime.Gosched()
 	}
 
+	region := trace.StartRegion(t.traceContext, blockRegion)
 	select {
 	case <-C:
+		region.End()
 		t.SleepFinish(true)
+		// Woken by event.
 		return nil
 
 	case <-interrupt:
+		region.End()
 		t.SleepFinish(false)
 		// Return the indicated error on interrupt.
 		return syserror.ErrInterrupted
 
 	case <-timerChan:
-		// We've timed out.
+		region.End()
 		t.SleepFinish(true)
+		// We've timed out.
 		return syserror.ETIMEDOUT
 	}
 }
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 0916fd658..247bd4aba 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -236,14 +236,19 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	} else if opts.NewPIDNamespace {
 		pidns = pidns.NewChild(userns)
 	}
+
 	tg := t.tg
+	rseqAddr := usermem.Addr(0)
+	rseqSignature := uint32(0)
 	if opts.NewThreadGroup {
 		tg.mounts.IncRef()
 		sh := t.tg.signalHandlers
 		if opts.NewSignalHandlers {
 			sh = sh.Fork()
 		}
-		tg = t.k.newThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock)
+		tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy())
+		rseqAddr = t.rseqAddr
+		rseqSignature = t.rseqSignature
 	}
 
 	cfg := &TaskConfig{
@@ -260,6 +265,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 		UTSNamespace:            utsns,
 		IPCNamespace:            ipcns,
 		AbstractSocketNamespace: t.abstractSockets,
+		RSeqAddr:                rseqAddr,
+		RSeqSignature:           rseqSignature,
 		ContainerID:             t.ContainerID(),
 	}
 	if opts.NewThreadGroup {
@@ -299,6 +306,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
 	// nt that it must receive before its task goroutine starts running.
 	tid := nt.k.tasks.Root.IDOfTask(nt)
 	defer nt.Start(tid)
+	t.traceCloneEvent(tid)
 
 	// "If fork/clone and execve are allowed by @prog, any child processes will
 	// be constrained to the same filters and system call ABI as the parent." -
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 17a089b90..fa6528386 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -129,6 +129,7 @@ type runSyscallAfterExecStop struct {
 }
 
 func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
+	t.traceExecEvent(r.tc)
 	t.tg.pidns.owner.mu.Lock()
 	t.tg.execing = nil
 	if t.killed() {
@@ -189,9 +190,11 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
 	t.updateRSSLocked()
 	// Restartable sequence state is discarded.
 	t.rseqPreempted = false
-	t.rseqCPUAddr = 0
 	t.rseqCPU = -1
-	t.tg.rscr.Store(&RSEQCriticalRegion{})
+	t.rseqAddr = 0
+	t.rseqSignature = 0
+	t.oldRSeqCPUAddr = 0
+	t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
 	t.tg.pidns.owner.mu.Unlock()
 
 	// Remove FDs with the CloseOnExec flag set.
@@ -253,7 +256,7 @@ func (t *Task) promoteLocked() {
 
 	t.tg.leader = t
 	t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t])
-	t.updateLogPrefixLocked()
+	t.updateInfoLocked()
 	// Reap the original leader. If it has a tracer, detach it instead of
 	// waiting for it to acknowledge the original leader's death.
 	oldLeader.exitParentNotified = true
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index 535f03e50..435761e5a 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -236,6 +236,7 @@ func (*runExit) execute(t *Task) taskRunState {
 type runExitMain struct{}
 
 func (*runExitMain) execute(t *Task) taskRunState {
+	t.traceExitEvent()
 	lastExiter := t.exitThreadGroup()
 
 	// If the task has a cleartid, and the thread group wasn't killed by a
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index a29e9b9eb..0fb3661de 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -16,6 +16,7 @@ package kernel
 
 import (
 	"fmt"
+	"runtime/trace"
 	"sort"
 
 	"gvisor.dev/gvisor/pkg/log"
@@ -127,11 +128,88 @@ func (t *Task) debugDumpStack() {
 	}
 }
 
-// updateLogPrefix updates the task's cached log prefix to reflect its
-// current thread ID.
+// trace definitions.
+//
+// Note that all region names are prefixed by ':' in order to ensure that they
+// are lexically ordered before all system calls, which use the naked system
+// call name (e.g. "read") for maximum clarity.
+const (
+	traceCategory = "task"
+	runRegion     = ":run"
+	blockRegion   = ":block"
+	cpuidRegion   = ":cpuid"
+	faultRegion   = ":fault"
+)
+
+// updateInfoLocked updates the task's cached log prefix and tracing
+// information to reflect its current thread ID.
 //
 // Preconditions: The task's owning TaskSet.mu must be locked.
-func (t *Task) updateLogPrefixLocked() {
+func (t *Task) updateInfoLocked() {
 	// Use the task's TID in the root PID namespace for logging.
-	t.logPrefix.Store(fmt.Sprintf("[% 4d] ", t.tg.pidns.owner.Root.tids[t]))
+	tid := t.tg.pidns.owner.Root.tids[t]
+	t.logPrefix.Store(fmt.Sprintf("[% 4d] ", tid))
+	t.rebuildTraceContext(tid)
+}
+
+// rebuildTraceContext rebuilds the trace context.
+//
+// Precondition: the passed tid must be the tid in the root namespace.
+func (t *Task) rebuildTraceContext(tid ThreadID) {
+	// Re-initialize the trace context.
+	if t.traceTask != nil {
+		t.traceTask.End()
+	}
+
+	// Note that we define the "task type" to be the dynamic TID. This does
+	// not align perfectly with the documentation for "tasks" in the
+	// tracing package. Tasks may be assumed to be bounded by analysis
+	// tools. However, if we just use a generic "task" type here, then the
+	// "user-defined tasks" page on the tracing dashboard becomes nearly
+	// unusable, as it loads all traces from all tasks.
+	//
+	// We can assume that the number of tasks in the system is not
+	// arbitrarily large (in general it won't be, especially for cases
+	// where we're collecting a brief profile), so using the TID is a
+	// reasonable compromise in this case.
+	t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid))
+}
+
+// traceCloneEvent is called when a new task is spawned.
+//
+// ntid must be the new task's ThreadID in the root namespace.
+func (t *Task) traceCloneEvent(ntid ThreadID) {
+	if !trace.IsEnabled() {
+		return
+	}
+	trace.Logf(t.traceContext, traceCategory, "spawn: %d", ntid)
+}
+
+// traceExitEvent is called when a task exits.
+func (t *Task) traceExitEvent() {
+	if !trace.IsEnabled() {
+		return
+	}
+	trace.Logf(t.traceContext, traceCategory, "exit status: 0x%x", t.exitStatus.Status())
+}
+
+// traceExecEvent is called when a task calls exec.
+func (t *Task) traceExecEvent(tc *TaskContext) {
+	if !trace.IsEnabled() {
+		return
+	}
+	d := tc.MemoryManager.Executable()
+	if d == nil {
+		trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
+		return
+	}
+	defer d.DecRef()
+	root := t.fsContext.RootDirectory()
+	if root == nil {
+		trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>")
+		return
+	}
+	defer root.DecRef()
+	n, _ := d.FullName(root)
+	trace.Logf(t.traceContext, traceCategory, "exec: %s", n)
 }
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index c92266c59..6357273d3 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"bytes"
 	"runtime"
+	"runtime/trace"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -168,12 +169,22 @@ func (*runApp) execute(t *Task) taskRunState {
 	// Apply restartable sequences.
 	if t.rseqPreempted {
 		t.rseqPreempted = false
-		if t.rseqCPUAddr != 0 {
+		if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 {
+			// Linux writes the CPU on every preemption. We only do
+			// so if it changed. Thus we may delay delivery of
+			// SIGSEGV if rseqAddr/oldRSeqCPUAddr is invalid.
 			cpu := int32(hostcpu.GetCPU())
 			if t.rseqCPU != cpu {
 				t.rseqCPU = cpu
 				if err := t.rseqCopyOutCPU(); err != nil {
-					t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err)
+					t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
+					t.forceSignal(linux.SIGSEGV, false)
+					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+					// Re-enter the task run loop for signal delivery.
+					return (*runApp)(nil)
+				}
+				if err := t.oldRSeqCopyOutCPU(); err != nil {
+					t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err)
 					t.forceSignal(linux.SIGSEGV, false)
 					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
 					// Re-enter the task run loop for signal delivery.
@@ -205,9 +216,11 @@ func (*runApp) execute(t *Task) taskRunState {
 		t.tg.pidns.owner.mu.RUnlock()
 	}
 
+	region := trace.StartRegion(t.traceContext, runRegion)
 	t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
 	info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU)
 	t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
+	region.End()
 
 	if clearSinglestep {
 		t.Arch().ClearSingleStep()
@@ -225,6 +238,7 @@ func (*runApp) execute(t *Task) taskRunState {
 
 	case platform.ErrContextSignalCPUID:
 		// Is this a CPUID instruction?
+		region := trace.StartRegion(t.traceContext, cpuidRegion)
 		expected := arch.CPUIDInstruction[:]
 		found := make([]byte, len(expected))
 		_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
@@ -232,10 +246,12 @@ func (*runApp) execute(t *Task) taskRunState {
 			// Skip the cpuid instruction.
 			t.Arch().CPUIDEmulate(t)
 			t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
+			region.End()
 
 			// Resume execution.
 			return (*runApp)(nil)
 		}
+		region.End() // Not an actual CPUID, but required copy-in.
 
 		// The instruction at the given RIP was not a CPUID, and we
 		// fallthrough to the default signal deliver behavior below.
@@ -251,8 +267,10 @@ func (*runApp) execute(t *Task) taskRunState {
 		// an application-generated signal and we should continue execution
 		// normally.
 		if at.Any() {
+			region := trace.StartRegion(t.traceContext, faultRegion)
 			addr := usermem.Addr(info.Addr())
 			err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack()))
+			region.End()
 			if err == nil {
 				// The fault was handled appropriately.
 				// We can resume running the application.
@@ -260,6 +278,12 @@ func (*runApp) execute(t *Task) taskRunState {
 			}
 
 			// Is this a vsyscall that we need emulate?
+			//
+			// Note that we don't track vsyscalls as part of a
+			// specific trace region. This is because regions don't
+			// stack, and the actual system call will count as a
+			// region. We should be able to easily identify
+			// vsyscalls by having a <fault><syscall> pair.
 			if at.Execute {
 				if sysno, ok := t.tc.st.LookupEmulate(addr); ok {
 					return t.doVsyscall(addr, sysno)
@@ -306,7 +330,7 @@ func (*runApp) execute(t *Task) taskRunState {
 		return (*runApp)(nil)
 
 	case platform.ErrContextCPUPreempted:
-		// Ensure that RSEQ critical sections are interrupted and per-thread
+		// Ensure that rseq critical sections are interrupted and per-thread
 		// CPU values are updated before the next platform.Context.Switch().
 		t.rseqPreempted = true
 		return (*runApp)(nil)
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index ae6fc4025..58af16ee2 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -79,6 +80,13 @@ type TaskConfig struct {
 	// AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
 	AbstractSocketNamespace *AbstractSocketNamespace
 
+	// RSeqAddr is a pointer to the the userspace linux.RSeq structure.
+	RSeqAddr usermem.Addr
+
+	// RSeqSignature is the signature that the rseq abort IP must be signed
+	// with.
+	RSeqSignature uint32
+
 	// ContainerID is the container the new task belongs to.
 	ContainerID string
 }
@@ -126,6 +134,8 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 		ipcns:           cfg.IPCNamespace,
 		abstractSockets: cfg.AbstractSocketNamespace,
 		rseqCPU:         -1,
+		rseqAddr:        cfg.RSeqAddr,
+		rseqSignature:   cfg.RSeqSignature,
 		futexWaiter:     futex.NewWaiter(),
 		containerID:     cfg.ContainerID,
 	}
@@ -154,10 +164,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
 	// Below this point, newTask is expected not to fail (there is no rollback
 	// of assignTIDsLocked or any of the following).
 
-	// Logging on t's behalf will panic if t.logPrefix hasn't been initialized.
-	// This is the earliest point at which we can do so (since t now has thread
-	// IDs).
-	t.updateLogPrefixLocked()
+	// Logging on t's behalf will panic if t.logPrefix hasn't been
+	// initialized. This is the earliest point at which we can do so
+	// (since t now has thread IDs).
+	t.updateInfoLocked()
 
 	if cfg.InheritParent != nil {
 		t.parent = cfg.InheritParent.parent
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index b543d536a..3180f5560 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -17,6 +17,7 @@ package kernel
 import (
 	"fmt"
 	"os"
+	"runtime/trace"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -160,6 +161,10 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u
 		ctrl = ctrlStopAndReinvokeSyscall
 	} else {
 		fn := s.Lookup(sysno)
+		var region *trace.Region // Only non-nil if tracing == true.
+		if trace.IsEnabled() {
+			region = trace.StartRegion(t.traceContext, s.LookupName(sysno))
+		}
 		if fn != nil {
 			// Call our syscall implementation.
 			rval, ctrl, err = fn(t, args)
@@ -167,6 +172,9 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u
 			// Use the missing function if not found.
 			rval, err = t.SyscallTable().Missing(t, sysno, args)
 		}
+		if region != nil {
+			region.End()
+		}
 	}
 
 	if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) {
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 72568d296..c0197a563 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -238,8 +238,8 @@ type ThreadGroup struct {
 	// execed is protected by the TaskSet mutex.
 	execed bool
 
-	// rscr is the thread group's RSEQ critical region.
-	rscr atomic.Value `state:".(*RSEQCriticalRegion)"`
+	// oldRSeqCritical is the thread group's old rseq critical region.
+	oldRSeqCritical atomic.Value `state:".(*OldRSeqCriticalRegion)"`
 
 	// mounts is the thread group's mount namespace. This does not really
 	// correspond to a "mount namespace" in Linux, but is more like a
@@ -256,35 +256,35 @@ type ThreadGroup struct {
 	tty *TTY
 }
 
-// newThreadGroup returns a new, empty thread group in PID namespace ns. The
+// NewThreadGroup returns a new, empty thread group in PID namespace ns. The
 // thread group leader will send its parent terminationSignal when it exits.
 // The new thread group isn't visible to the system until a task has been
 // created inside of it by a successful call to TaskSet.NewTask.
-func (k *Kernel) newThreadGroup(mounts *fs.MountNamespace, ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
+func (k *Kernel) NewThreadGroup(mntns *fs.MountNamespace, pidns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet) *ThreadGroup {
 	tg := &ThreadGroup{
 		threadGroupNode: threadGroupNode{
-			pidns: ns,
+			pidns: pidns,
 		},
 		signalHandlers:    sh,
 		terminationSignal: terminationSignal,
 		ioUsage:           &usage.IO{},
 		limits:            limits,
-		mounts:            mounts,
+		mounts:            mntns,
 	}
 	tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg})
 	tg.timers = make(map[linux.TimerID]*IntervalTimer)
-	tg.rscr.Store(&RSEQCriticalRegion{})
+	tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
 	return tg
 }
 
-// saveRscr is invoked by stateify.
-func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion {
-	return tg.rscr.Load().(*RSEQCriticalRegion)
+// saveOldRSeqCritical is invoked by stateify.
+func (tg *ThreadGroup) saveOldRSeqCritical() *OldRSeqCriticalRegion {
+	return tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
 }
 
-// loadRscr is invoked by stateify.
-func (tg *ThreadGroup) loadRscr(rscr *RSEQCriticalRegion) {
-	tg.rscr.Store(rscr)
+// loadOldRSeqCritical is invoked by stateify.
+func (tg *ThreadGroup) loadOldRSeqCritical(r *OldRSeqCriticalRegion) {
+	tg.oldRSeqCritical.Store(r)
 }
 
 // SignalHandlers returns the signal handlers used by tg.
diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go
index 34f84487a..048de26dc 100644
--- a/pkg/sentry/kernel/tty.go
+++ b/pkg/sentry/kernel/tty.go
@@ -21,8 +21,19 @@ import "sync"
 //
 // +stateify savable
 type TTY struct {
+	// Index is the terminal index. It is immutable.
+	Index uint32
+
 	mu sync.Mutex `state:"nosave"`
 
 	// tg is protected by mu.
 	tg *ThreadGroup
 }
+
+// TTY returns the thread group's controlling terminal. If nil, there is no
+// controlling terminal.
+func (tg *ThreadGroup) TTY() *TTY {
+	tg.signalHandlers.mu.Lock()
+	defer tg.signalHandlers.mu.Unlock()
+	return tg.tty
+}